mirror of https://github.com/krkn-chaos/krkn.git

Compare commits (38 commits)

| SHA1 |
|---|
| 126f4ebb35 |
| 83d99bbb02 |
| 2624102d65 |
| 02587bcbe6 |
| c51bf04f9e |
| 41195b1a60 |
| ab80acbee7 |
| 3573d13ea9 |
| 9c5251d52f |
| a0bba27edc |
| 0d0143d1e0 |
| 0004c05f81 |
| 57a747a34a |
| 22108ae4e7 |
| cecaa1eda3 |
| 5450ecb914 |
| cad6b68f43 |
| 0eba329305 |
| ce8593f2f0 |
| 9061ddbb5b |
| dd4d0d0389 |
| 0cabe5e91d |
| 32fe0223ff |
| a25736ad08 |
| 440890d252 |
| 69bf20fc76 |
| 2a42a2dc31 |
| 21ab8d475d |
| b024cfde19 |
| c7e068a562 |
| 64cfd2ca4d |
| 9cb701a616 |
| 0372013b67 |
| 4fea1a354d |
| 667798d588 |
| 0c30d89a1b |
| 2ba20fa483 |
| 97035a765c |
.github/PULL_REQUEST_TEMPLATE.md (vendored, new file, +10)
@@ -0,0 +1,10 @@
## Description
<!-- Provide a brief description of the changes made in this PR. -->

## Documentation
- [ ] **Is documentation needed for this update?**

If checked, a documentation PR must be created and merged in the [website repository](https://github.com/krkn-chaos/website/).

## Related Documentation PR (if applicable)
<!-- Add the link to the corresponding documentation PR in the website repository -->
.github/workflows/docker-image.yml (vendored, +2)
@@ -13,6 +13,7 @@ jobs:
       - name: Build the Docker images
         if: startsWith(github.ref, 'refs/tags')
         run: |
+          ./containers/compile_dockerfile.sh
           docker build --no-cache -t quay.io/krkn-chaos/krkn containers/ --build-arg TAG=${GITHUB_REF#refs/tags/}
           docker tag quay.io/krkn-chaos/krkn quay.io/redhat-chaos/krkn
           docker tag quay.io/krkn-chaos/krkn quay.io/krkn-chaos/krkn:${GITHUB_REF#refs/tags/}
@@ -21,6 +22,7 @@ jobs:
       - name: Test Build the Docker images
         if: ${{ github.event_name == 'pull_request' }}
         run: |
+          ./containers/compile_dockerfile.sh
           docker build --no-cache -t quay.io/krkn-chaos/krkn containers/ --build-arg PR_NUMBER=${{ github.event.pull_request.number }}
       - name: Login in quay
         if: startsWith(github.ref, 'refs/tags')
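The image tag in the release build comes from Bash parameter expansion: ${GITHUB_REF#refs/tags/} strips the leading refs/tags/ prefix from the ref that triggered the workflow. A quick illustration with a hypothetical tag v1.5.0 (the tag name is an assumption, not taken from this diff):

    GITHUB_REF=refs/tags/v1.5.0
    echo "${GITHUB_REF#refs/tags/}"   # prints v1.5.0

So pushing tag v1.5.0 would produce the image quay.io/krkn-chaos/krkn:v1.5.0.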
.github/workflows/require-docs.yml (vendored, new file, +45)
@@ -0,0 +1,45 @@
name: Require Documentation Update
on:
  pull_request:
    types: [opened, edited, synchronize]
    branches:
      - main
jobs:
  check-docs:
    name: Check Documentation Update
    runs-on: ubuntu-latest
    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Check if Documentation is Required
        id: check_docs
        run: |
          echo "Checking PR body for documentation checkbox..."
          # Read the PR body from the GitHub event payload
          if echo "${{ github.event.pull_request.body }}" | grep -qi '\[x\].*documentation needed'; then
            echo "Documentation required detected."
            echo "docs_required=true" >> $GITHUB_OUTPUT
          else
            echo "Documentation not required."
            echo "docs_required=false" >> $GITHUB_OUTPUT
          fi

      - name: Enforce Documentation Update (if required)
        if: steps.check_docs.outputs.docs_required == 'true'
        env:
          GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
        run: |
          # Retrieve feature branch and repository owner from the GitHub context
          FEATURE_BRANCH="${{ github.head_ref }}"
          REPO_OWNER="${{ github.repository_owner }}"
          WEBSITE_REPO="website"
          echo "Searching for a merged documentation PR for feature branch: $FEATURE_BRANCH in $REPO_OWNER/$WEBSITE_REPO..."
          MERGED_PR=$(gh pr list --repo "$REPO_OWNER/$WEBSITE_REPO" --state merged --json headRefName,title,url | jq -r \
            --arg FEATURE_BRANCH "$FEATURE_BRANCH" '.[] | select(.title | contains($FEATURE_BRANCH)) | .url')
          if [[ -z "$MERGED_PR" ]]; then
            echo ":x: Documentation PR for branch '$FEATURE_BRANCH' is required and has not been merged."
            exit 1
          else
            echo ":white_check_mark: Found merged documentation PR: $MERGED_PR"
          fi
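For reference, the case-insensitive pattern '\[x\].*documentation needed' lines up with the checkbox added in the new PR template above. A minimal sketch of a PR body fragment that would set docs_required=true:

    ## Documentation
    - [x] **Is documentation needed for this update?**

Leaving the box as [ ] keeps docs_required=false and skips the enforcement step. Note that the jq lookup selects merged website PRs whose title contains the feature branch name, so the documentation PR's title must include that branch name to be found.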
.github/workflows/tests.yml (vendored, 12 changed lines)
@@ -84,9 +84,9 @@ jobs:
           echo "test_namespace" >> ./CI/tests/functional_tests
           echo "test_net_chaos" >> ./CI/tests/functional_tests
           echo "test_time" >> ./CI/tests/functional_tests
-          echo "test_arca_cpu_hog" >> ./CI/tests/functional_tests
-          echo "test_arca_memory_hog" >> ./CI/tests/functional_tests
-          echo "test_arca_io_hog" >> ./CI/tests/functional_tests
+          echo "test_cpu_hog" >> ./CI/tests/functional_tests
+          echo "test_memory_hog" >> ./CI/tests/functional_tests
+          echo "test_io_hog" >> ./CI/tests/functional_tests

       # Push on main only steps + all other functional to collect coverage
@@ -113,9 +113,9 @@ jobs:
           echo "test_namespace" >> ./CI/tests/functional_tests
           echo "test_net_chaos" >> ./CI/tests/functional_tests
           echo "test_time" >> ./CI/tests/functional_tests
-          echo "test_arca_cpu_hog" >> ./CI/tests/functional_tests
-          echo "test_arca_memory_hog" >> ./CI/tests/functional_tests
-          echo "test_arca_io_hog" >> ./CI/tests/functional_tests
+          echo "test_cpu_hog" >> ./CI/tests/functional_tests
+          echo "test_memory_hog" >> ./CI/tests/functional_tests
+          echo "test_io_hog" >> ./CI/tests/functional_tests

       # Final common steps
       - name: Run Functional tests
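Each echo appends one test name to ./CI/tests/functional_tests, the list that the "Run Functional tests" step consumes. After this change, the lines appended by this hunk alone are (earlier steps of the job may append more):

    test_namespace
    test_net_chaos
    test_time
    test_cpu_hog
    test_memory_hog
    test_io_hog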
ADOPTERS.md (new file, +7)
@@ -0,0 +1,7 @@
# Krkn Adopters

This is a list of organizations that have publicly acknowledged usage of Krkn and shared details of how they are leveraging it in their environment for chaos engineering use cases. Do you want to add yourself to this list? Please fork the repository and open a PR with the required change.

| Organization | Since | Website | Use-Case |
|:-|:-|:-|:-|
| MarketAxess | 2024 | https://www.marketaxess.com/ | Kraken enables us to achieve our goal of increasing the reliability of our cloud products on Kubernetes. The tool allows us to automatically run various chaos scenarios, identify resilience and performance bottlenecks, and seamlessly restore the system to its original state once scenarios finish. These chaos scenarios include pod disruptions, node (EC2) outages, simulated availability zone (AZ) outages, and filling up storage spaces like EBS and EFS. The community is highly responsive to requests and works on expanding the tool's capabilities. MarketAxess actively contributes to the project, adding features such as the ability to leverage existing network ACLs and proposing several feature improvements to enhance test coverage. |
@@ -62,3 +62,11 @@ elastic:
     metrics_index: "krkn-metrics"
     alerts_index: "krkn-alerts"
     telemetry_index: "krkn-telemetry"
+
+health_checks: # Utilizing health check endpoints to observe application behavior during chaos injection.
+    interval: # Interval in seconds to perform health checks, default value is 2 seconds
+    config: # Provide list of health check configurations for applications
+      - url: # Provide application endpoint
+        bearer_token: # Bearer token for authentication if any
+        auth: # Provide authentication credentials (username, password) in tuple format if any, e.g. ("admin","secretpassword")
+        exit_on_failure: # If True, exit when a health check fails for the application; values can be True/False
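As a minimal sketch, a filled-in health_checks block following these comments might look like the lines below (the endpoint and credentials are hypothetical, not taken from this diff):

    health_checks:
      interval: 2
      config:
        - url: https://my-app.example.com/healthz
          bearer_token: my-secret-token        # or use auth: ("admin","secretpassword") instead
          exit_on_failure: True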
@@ -1,19 +0,0 @@
|
||||
set -xeEo pipefail
|
||||
|
||||
source CI/tests/common.sh
|
||||
|
||||
trap error ERR
|
||||
trap finish EXIT
|
||||
|
||||
|
||||
function functional_test_arca_cpu_hog {
|
||||
yq -i '.input_list[0].node_selector={"kubernetes.io/hostname":"kind-worker2"}' scenarios/kube/cpu-hog/input.yaml
|
||||
export scenario_type="hog_scenarios"
|
||||
export scenario_file="scenarios/kube/cpu-hog/input.yaml"
|
||||
export post_config=""
|
||||
envsubst < CI/config/common_test_config.yaml > CI/config/arca_cpu_hog.yaml
|
||||
python3 -m coverage run -a run_kraken.py -c CI/config/arca_cpu_hog.yaml
|
||||
echo "Arcaflow CPU Hog: Success"
|
||||
}
|
||||
|
||||
functional_test_arca_cpu_hog
|
||||
@@ -1,19 +0,0 @@
|
||||
set -xeEo pipefail
|
||||
|
||||
source CI/tests/common.sh
|
||||
|
||||
trap error ERR
|
||||
trap finish EXIT
|
||||
|
||||
|
||||
function functional_test_arca_io_hog {
|
||||
yq -i '.input_list[0].node_selector={"kubernetes.io/hostname":"kind-worker2"}' scenarios/kube/io-hog/input.yaml
|
||||
export scenario_type="hog_scenarios"
|
||||
export scenario_file="scenarios/kube/io-hog/input.yaml"
|
||||
export post_config=""
|
||||
envsubst < CI/config/common_test_config.yaml > CI/config/arca_io_hog.yaml
|
||||
python3 -m coverage run -a run_kraken.py -c CI/config/arca_io_hog.yaml
|
||||
echo "Arcaflow IO Hog: Success"
|
||||
}
|
||||
|
||||
functional_test_arca_io_hog
|
||||
@@ -1,19 +0,0 @@
|
||||
set -xeEo pipefail
|
||||
|
||||
source CI/tests/common.sh
|
||||
|
||||
trap error ERR
|
||||
trap finish EXIT
|
||||
|
||||
|
||||
function functional_test_arca_memory_hog {
|
||||
yq -i '.input_list[0].node_selector={"kubernetes.io/hostname":"kind-worker2"}' scenarios/kube/memory-hog/input.yaml
|
||||
export scenario_type="hog_scenarios"
|
||||
export scenario_file="scenarios/kube/memory-hog/input.yaml"
|
||||
export post_config=""
|
||||
envsubst < CI/config/common_test_config.yaml > CI/config/arca_memory_hog.yaml
|
||||
python3 -m coverage run -a run_kraken.py -c CI/config/arca_memory_hog.yaml
|
||||
echo "Arcaflow Memory Hog: Success"
|
||||
}
|
||||
|
||||
functional_test_arca_memory_hog
|
||||
CI/tests/test_cpu_hog.sh (new file, +20)
@@ -0,0 +1,20 @@
set -xeEo pipefail

source CI/tests/common.sh

trap error ERR
trap finish EXIT


function functional_test_cpu_hog {
  yq -i '.node_selector="kubernetes.io/hostname=kind-worker2"' scenarios/kube/cpu-hog.yml

  export scenario_type="hog_scenarios"
  export scenario_file="scenarios/kube/cpu-hog.yml"
  export post_config=""
  envsubst < CI/config/common_test_config.yaml > CI/config/cpu_hog.yaml
  python3 -m coverage run -a run_kraken.py -c CI/config/cpu_hog.yaml
  echo "CPU Hog: Success"
}

functional_test_cpu_hog
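The yq -i call rewrites the scenario file in place so the hog pod is pinned to the kind-worker2 node before the run. Assuming the scenario file ships with an empty selector (that starting value is an assumption, not shown in this diff), the effect is:

    # scenarios/kube/cpu-hog.yml, before (assumed)
    node_selector: ""
    # after the yq edit
    node_selector: "kubernetes.io/hostname=kind-worker2"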
CI/tests/test_io_hog.sh (new file, +19)
@@ -0,0 +1,19 @@
set -xeEo pipefail

source CI/tests/common.sh

trap error ERR
trap finish EXIT


function functional_test_io_hog {
  yq -i '.node_selector="kubernetes.io/hostname=kind-worker2"' scenarios/kube/io-hog.yml
  export scenario_type="hog_scenarios"
  export scenario_file="scenarios/kube/io-hog.yml"
  export post_config=""
  envsubst < CI/config/common_test_config.yaml > CI/config/io_hog.yaml
  python3 -m coverage run -a run_kraken.py -c CI/config/io_hog.yaml
  echo "IO Hog: Success"
}

functional_test_io_hog
CI/tests/test_memory_hog.sh (new file, +19)
@@ -0,0 +1,19 @@
set -xeEo pipefail

source CI/tests/common.sh

trap error ERR
trap finish EXIT


function functional_test_memory_hog {
  yq -i '.node_selector="kubernetes.io/hostname=kind-worker2"' scenarios/kube/memory-hog.yml
  export scenario_type="hog_scenarios"
  export scenario_file="scenarios/kube/memory-hog.yml"
  export post_config=""
  envsubst < CI/config/common_test_config.yaml > CI/config/memory_hog.yaml
  python3 -m coverage run -a run_kraken.py -c CI/config/memory_hog.yaml
  echo "Memory Hog: Success"
}

functional_test_memory_hog
@@ -19,7 +19,7 @@ function functional_test_telemetry {
   yq -i '.telemetry.run_tag=env(RUN_TAG)' CI/config/common_test_config.yaml

   export scenario_type="hog_scenarios"
-  export scenario_file="scenarios/kube/cpu-hog/input.yaml"
+  export scenario_file="scenarios/kube/cpu-hog.yml"
   export post_config=""
   envsubst < CI/config/common_test_config.yaml > CI/config/telemetry.yaml
   retval=$(python3 -m coverage run -a run_kraken.py -c CI/config/telemetry.yaml)
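In these test scripts, envsubst renders the shared template by substituting exported variables such as $scenario_type and $scenario_file into CI/config/common_test_config.yaml. A sketch with a hypothetical template fragment (the real template contents are not shown in this diff):

    # hypothetical lines in CI/config/common_test_config.yaml:
    #     - $scenario_type:
    #         - $scenario_file
    export scenario_type="hog_scenarios"
    export scenario_file="scenarios/kube/cpu-hog.yml"
    envsubst < CI/config/common_test_config.yaml > CI/config/telemetry.yaml
    # rendered output:
    #     - hog_scenarios:
    #         - scenarios/kube/cpu-hog.yml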
README.md (36 changed lines)
@@ -9,14 +9,17 @@ Chaos and resiliency testing tool for Kubernetes.
 Kraken injects deliberate failures into Kubernetes clusters to check whether they are resilient to turbulent conditions.

+### Website
+[Kraken Website](https://krkn-chaos.dev) is the one-stop shop for all things Kraken.
+The website contains comprehensive information about the workflow, supported scenarios, and detailed descriptions of each scenario. It also provides the necessary configurations needed to run Kraken, along with insights into performance monitoring and signaling features.
 ### Workflow

-### Demo
-[](https://youtu.be/LN-fZywp_mo "Kraken Demo - Click to Watch!")
+<!-- ### Demo
+[](https://youtu.be/LN-fZywp_mo "Kraken Demo - Click to Watch!") -->


-### Chaos Testing Guide
+<!-- ### Chaos Testing Guide
 [Guide](docs/index.md) encapsulates:
 - Test methodology that needs to be embraced.
 - Best practices that a Kubernetes cluster, platform and applications running on top of it should take into account for best user experience, performance, resilience and reliability.
@@ -25,28 +28,28 @@ Kraken injects deliberate failures into Kubernetes clusters to check if it is re
 - Test environment recommendations as to how and where to run chaos tests.
 - Chaos testing in practice.

-The guide is hosted at https://krkn-chaos.github.io/krkn.
+The guide is hosted at https://krkn-chaos.github.io/krkn. -->


 ### How to Get Started
-Instructions on how to setup, configure and run Kraken can be found at [Installation](docs/installation.md).
+Instructions on how to setup, configure and run Kraken can be found at [Installation](https://krkn-chaos.dev/docs/installation/).

-You may consider utilizing the chaos recommendation tool prior to initiating the chaos runs to profile the application service(s) under test. This tool discovers a list of Krkn scenarios with a high probability of causing failures or disruptions to your application service(s). The tool can be accessed at [Chaos-Recommender](utils/chaos_recommender/README.md).
+You may consider utilizing the chaos recommendation tool prior to initiating the chaos runs to profile the application service(s) under test. This tool discovers a list of Krkn scenarios with a high probability of causing failures or disruptions to your application service(s). The tool can be accessed at [Chaos-Recommender](https://krkn-chaos.dev/docs/chaos-recommender/).

-See the [getting started doc](docs/getting_started.md) for support on how to get started with your own custom scenario or editing current scenarios for your specific usage.
+See the [getting started doc](https://krkn-chaos.dev/docs/getting-started/) for support on how to get started with your own custom scenario or editing current scenarios for your specific usage.

 After installation, refer back to the below sections for supported scenarios and how to tweak the kraken config to load them on your cluster.


-#### Running Kraken with minimal configuration tweaks
-For cases where you want to run Kraken with minimal configuration changes, refer to [krkn-hub](https://github.com/krkn-chaos/krkn-hub). One use case is CI integration where you do not want to carry around different configuration files for the scenarios.
+<!-- #### Running Kraken with minimal configuration tweaks
+For cases where you want to run Kraken with minimal configuration changes, refer to [krkn-hub](https://github.com/krkn-chaos/krkn-hub). One use case is CI integration where you do not want to carry around different configuration files for the scenarios. -->


 ### Config
 Instructions on how to setup the config and the options supported can be found at [Config](docs/config.md).


-### Kubernetes chaos scenarios supported
+<!-- ### Kubernetes chaos scenarios supported

 Scenario type | Kubernetes
 --------------------------- | ------------- |
@@ -55,9 +58,9 @@ Scenario type | Kubernetes
 [Container Scenarios](docs/container_scenarios.md) | :heavy_check_mark: |
 [Node Scenarios](docs/node_scenarios.md) | :heavy_check_mark: |
 [Time Scenarios](docs/time_scenarios.md) | :heavy_check_mark: |
-[Hog Scenarios: CPU, Memory](docs/arcaflow_scenarios.md) | :heavy_check_mark: |
+[Hog Scenarios: CPU, Memory](docs/hog_scenarios.md) | :heavy_check_mark: |
 [Cluster Shut Down Scenarios](docs/cluster_shut_down_scenarios.md) | :heavy_check_mark: |
-[Service Disruption Scenarios](docs/service_disruption_scenarios.md.md) | :heavy_check_mark: |
+[Service Disruption Scenarios](docs/service_disruption_scenarios.md) | :heavy_check_mark: |
 [Zone Outage Scenarios](docs/zone_outage.md) | :heavy_check_mark: |
 [Application_outages](docs/application_outages.md) | :heavy_check_mark: |
 [PVC scenario](docs/pvc_scenario.md) | :heavy_check_mark: |
@@ -72,6 +75,7 @@ It is important to make sure to check if the targeted component recovered from t
 - Having built-in checks for pod and node based scenarios to ensure the expected number of replicas and nodes are up. It also supports running custom scripts with the checks.
 - Leveraging [Cerberus](https://github.com/krkn-chaos/cerberus) to monitor the cluster under test and consuming the aggregated go/no-go signal to determine pass/fail post chaos. It is highly recommended to turn on the Cerberus health check feature available in Kraken. Instructions on installing and setting up Cerberus can be found [here](https://github.com/openshift-scale/cerberus#installation), or it can be installed from Kraken using the [instructions](https://github.com/krkn-chaos/krkn#setting-up-infrastructure-dependencies). Once Cerberus is up and running, set cerberus_enabled to True and cerberus_url to the url where Cerberus publishes the go/no-go signal in the Kraken config file. Cerberus can monitor [application routes](https://github.com/redhat-chaos/cerberus/blob/main/docs/config.md#watch-routes) during the chaos and fails the run if it encounters downtime, as that is a potential downtime in a customer's or user's environment as well. This is especially important during control plane chaos scenarios including the API server, Etcd, Ingress etc. It can be enabled by setting `check_applicaton_routes: True` in the [Kraken config](https://github.com/redhat-chaos/krkn/blob/main/config/config.yaml) provided application routes are being monitored in the [cerberus config](https://github.com/redhat-chaos/krkn/blob/main/config/cerberus.yaml).
 - Leveraging the built-in alert collection feature to fail the runs in case of critical alerts.
+- Utilizing health check endpoints to observe application behavior during chaos injection: [Health checks](docs/health_checks.md)

 ### Signaling
 In CI runs or any external job it is useful to stop Kraken once a certain test or state gets reached. We created a way to signal kraken to pause the chaos or stop it completely using a signal posted to a port of your choice.
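The config shipped in this change set (see the config/config.yaml diff below) sets publish_kraken_status: True with the note that the status can be accessed at http://0.0.0.0:8081, so while a run is active a CI job could poll it, for example:

    curl http://0.0.0.0:8081

Pausing or stopping the run is then done by posting the corresponding signal to the port you configured; the exact payload depends on your signaling setup and is not shown in this diff.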
@@ -94,7 +98,7 @@ Information on enabling and leveraging this feature can be found [here](docs/SLO

 ### OCM / ACM integration

-Kraken supports injecting faults into [Open Cluster Management (OCM)](https://open-cluster-management.io/) and [Red Hat Advanced Cluster Management for Kubernetes (ACM)](https://www.redhat.com/en/technologies/management/advanced-cluster-management) managed clusters through [ManagedCluster Scenarios](docs/managedcluster_scenarios.md).
+Kraken supports injecting faults into [Open Cluster Management (OCM)](https://open-cluster-management.io/) and [Red Hat Advanced Cluster Management for Kubernetes (ACM)](https://www.redhat.com/en/technologies/management/advanced-cluster-management) managed clusters through [ManagedCluster Scenarios](docs/managedcluster_scenarios.md). -->


 ### Blogs and other useful resources
@@ -113,10 +117,10 @@ Enhancements being planned can be found in the [roadmap](ROADMAP.md).
 ### Contributions
 We are always looking for more enhancements and fixes to make it better; any contributions are most welcome. Feel free to report or work on the issues filed on github.

-[More information on how to Contribute](docs/contribute.md)
+[More information on how to Contribute](https://krkn-chaos.dev/docs/contribution-guidelines/contribute/)

 If adding a new scenario or tweaking the main config, be sure to add in updates into the CI to be sure the CI is up to date.
-Please read [this file]((CI/README.md#adding-a-test-case)) for more information on updates.
+Please read [this file](https://krkn-chaos.dev/docs/getting-started/#adding-new-scenarios) for more information on updates.


 ### Scenario Plugin Development
SECURITY.md (new file, +43)
@@ -0,0 +1,43 @@
# Security Policy

We attach great importance to code security, and we are very grateful to the users and security researchers who report security vulnerabilities to the Krkn community. All reported security vulnerabilities will be carefully assessed and addressed in a timely manner.


## Security Checks

Krkn leverages [Snyk](https://snyk.io/) to ensure that any security vulnerabilities found
in the code base and dependencies are fixed and published in the latest release. Security
vulnerability checks are enabled for each pull request to enable developers to get insights
and proactively fix them.


## Reporting a Vulnerability

The Krkn project treats security vulnerabilities seriously, so we
strive to take action quickly when required.

The project requests that security issues be disclosed in a responsible
manner to allow adequate time to respond. If a security issue or
vulnerability has been found, please disclose the details to our
dedicated email address:

cncf-krkn-maintainers@lists.cncf.io

You can also use the [GitHub vulnerability report mechanism](https://docs.github.com/en/code-security/security-advisories/guidance-on-reporting-and-writing-information-about-vulnerabilities/privately-reporting-a-security-vulnerability#privately-reporting-a-security-vulnerability) to report the security vulnerability.

Please include as much information as possible with the report. The
following details assist with analysis efforts:
- Description of the vulnerability
- Affected component (version, commit, branch, etc.)
- Affected code (file path, line numbers)
- Exploit code


## Security Team

The security team currently consists of the [Maintainers of Krkn](https://github.com/krkn-chaos/krkn/blob/main/MAINTAINERS.md).


## Process and Supported Releases

The Krkn security team will investigate and provide a fix in a timely manner, depending on the severity. The fix will be included in a new release of Krkn, and details will be included in the release notes.
config/config.yaml
@@ -1,5 +1,4 @@
 kraken:
     distribution: kubernetes # Distribution can be kubernetes or openshift
     kubeconfig_path: ~/.kube/config # Path to kubeconfig
     exit_on_failure: False # Exit when a post action scenario fails
     publish_kraken_status: True # Can be accessed at http://0.0.0.0:8081
@@ -9,10 +8,9 @@ kraken:
     chaos_scenarios:
         # List of policies/chaos scenarios to load
         - hog_scenarios:
-            - scenarios/kube/cpu-hog/input.yaml
-            - scenarios/kube/memory-hog/input.yaml
-            - scenarios/kube/io-hog/input.yaml
-            - scenarios/kube/io-hog/input.yaml
+            - scenarios/kube/cpu-hog.yml
+            - scenarios/kube/memory-hog.yml
+            - scenarios/kube/io-hog.yml
         - application_outages_scenarios:
             - scenarios/openshift/app_outage.yaml
         - container_scenarios: # List of chaos pod scenarios to load
@@ -26,12 +24,10 @@ kraken:
             - scenarios/openshift/prom_kill.yml
             - scenarios/openshift/openshift-apiserver.yml
             - scenarios/openshift/openshift-kube-apiserver.yml
-        - vmware_node_scenarios:
-            - scenarios/openshift/vmware_node_scenarios.yml
-        - ibmcloud_node_scenarios:
-            - scenarios/openshift/ibmcloud_node_scenarios.yml
         - node_scenarios: # List of chaos node scenarios to load
             - scenarios/openshift/aws_node_scenarios.yml
+            - scenarios/openshift/vmware_node_scenarios.yml
+            - scenarios/openshift/ibmcloud_node_scenarios.yml
         - time_scenarios: # List of chaos time scenarios to load
             - scenarios/openshift/time_scenarios_example.yml
         - cluster_shut_down_scenarios:
@@ -64,12 +60,10 @@ performance_monitoring:
     enable_alerts: False # Runs the queries specified in the alert profile and displays the info or exits 1 when severity=error
     enable_metrics: False
     alert_profile: config/alerts.yaml # Path or URL to alert profile with the prometheus queries
-    metrics_profile: config/metrics.yaml
+    metrics_profile: config/metrics-report.yaml
     check_critical_alerts: False # When enabled will check prometheus for critical alerts firing post chaos
 elastic:
     enable_elastic: False
     collect_metrics: False
     collect_alerts: False
     verify_certs: False
     elastic_url: "" # To track results in elasticsearch, give url to server here; will post telemetry details when url and index not blank
     elastic_port: 32766
@@ -113,7 +107,10 @@ telemetry:
     oc_cli_path: /usr/bin/oc # optional, if not specified will be searched in $PATH
     events_backup: True # enables/disables cluster events collection

+health_checks: # Utilizing health check endpoints to observe application behavior during chaos injection.
+    interval: # Interval in seconds to perform health checks, default value is 2 seconds
+    config: # Provide list of health check configurations for applications
+      - url: # Provide application endpoint
+        bearer_token: # Bearer token for authentication if any
+        auth: # Provide authentication credentials (username, password) in tuple format if any, e.g. ("admin","secretpassword")
+        exit_on_failure: # If True, exit when a health check fails for the application; values can be True/False
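Taken together, chaos_scenarios is an ordered list of scenario-type blocks, each pointing at one or more scenario files. A minimal config that runs only the new CPU hog scenario would look like this sketch, assembled from the lines above:

    kraken:
        distribution: kubernetes
        kubeconfig_path: ~/.kube/config
        chaos_scenarios:
            - hog_scenarios:
                - scenarios/kube/cpu-hog.yml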
config/metrics.yaml
@@ -1,133 +1,126 @@
metrics:
  # API server
  - query: histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{apiserver="kube-apiserver", verb!~"WATCH", subresource!="log"}[2m])) by (verb,resource,subresource,instance,le)) > 0
    metricName: API99thLatency

  - query: sum(irate(apiserver_request_total{apiserver="kube-apiserver",verb!="WATCH",subresource!="log"}[2m])) by (verb,instance,resource,code) > 0
    metricName: APIRequestRate
    instant: True

  - query: sum(apiserver_current_inflight_requests{}) by (request_kind) > 0
    metricName: APIInflightRequests
    instant: True

  - query: histogram_quantile(0.99, rate(apiserver_current_inflight_requests[5m]))
    metricName: APIInflightRequests
    instant: True

  # Container & pod metrics
  - query: (sum(container_memory_rss{name!="",container!="POD",namespace=~"openshift-(etcd|oauth-apiserver|.*apiserver|ovn-kubernetes|sdn|ingress|authentication|.*controller-manager|.*scheduler)"}) by (container, pod, namespace, node) and on (node) kube_node_role{role="master"}) > 0
    metricName: containerMemory-Masters
    instant: true

  - query: (sum(irate(container_cpu_usage_seconds_total{name!="",container!="POD",namespace=~"openshift-(etcd|oauth-apiserver|sdn|ovn-kubernetes|.*apiserver|authentication|.*controller-manager|.*scheduler)"}[2m]) * 100) by (container, pod, namespace, node) and on (node) kube_node_role{role="master"}) > 0
    metricName: containerCPU-Masters
    instant: true

  - query: (sum(irate(container_cpu_usage_seconds_total{pod!="",container="prometheus",namespace="openshift-monitoring"}[2m]) * 100) by (container, pod, namespace, node) and on (node) kube_node_role{role="infra"}) > 0
    metricName: containerCPU-Prometheus
    instant: true

  - query: (avg(irate(container_cpu_usage_seconds_total{name!="",container!="POD",namespace=~"openshift-(sdn|ovn-kubernetes|ingress)"}[2m]) * 100 and on (node) kube_node_role{role="worker"}) by (namespace, container)) > 0
    metricName: containerCPU-AggregatedWorkers
    instant: true

  - query: (avg(irate(container_cpu_usage_seconds_total{name!="",container!="POD",namespace=~"openshift-(sdn|ovn-kubernetes|ingress|monitoring|image-registry|logging)"}[2m]) * 100 and on (node) kube_node_role{role="infra"}) by (namespace, container)) > 0
    metricName: containerCPU-AggregatedInfra

  - query: (sum(container_memory_rss{pod!="",namespace="openshift-monitoring",name!="",container="prometheus"}) by (container, pod, namespace, node) and on (node) kube_node_role{role="infra"}) > 0
    metricName: containerMemory-Prometheus
    instant: True

  - query: avg(container_memory_rss{name!="",container!="POD",namespace=~"openshift-(sdn|ovn-kubernetes|ingress)"} and on (node) kube_node_role{role="worker"}) by (container, namespace)
    metricName: containerMemory-AggregatedWorkers
    instant: True

  - query: avg(container_memory_rss{name!="",container!="POD",namespace=~"openshift-(sdn|ovn-kubernetes|ingress|monitoring|image-registry|logging)"} and on (node) kube_node_role{role="infra"}) by (container, namespace)
    metricName: containerMemory-AggregatedInfra
    instant: True

  # Node metrics
  - query: (sum(irate(node_cpu_seconds_total[2m])) by (mode,instance) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)")) > 0
    metricName: nodeCPU-Masters
    instant: True

  - query: max(max_over_time(sum(irate(node_cpu_seconds_total{mode!="idle", mode!="steal"}[2m]) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)")) by (instance)[.elapsed:]))
    metricName: maxCPU-Masters
    instant: true

  - query: avg(avg_over_time((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)[.elapsed:]) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)"))
    metricName: nodeMemory-Masters
    instant: true

  - query: (avg((sum(irate(node_cpu_seconds_total[2m])) by (mode,instance) and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)"))) by (mode)) > 0
    metricName: nodeCPU-AggregatedWorkers
    instant: True

  - query: (avg((sum(irate(node_cpu_seconds_total[2m])) by (mode,instance) and on (instance) label_replace(kube_node_role{role="infra"}, "instance", "$1", "node", "(.+)"))) by (mode)) > 0
    metricName: nodeCPU-AggregatedInfra
    instant: True

  - query: avg(node_memory_MemAvailable_bytes) by (instance) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)")
    metricName: nodeMemoryAvailable-Masters
  - query: avg(avg_over_time((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)[.elapsed:]) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)"))
    metricName: nodeMemory-Masters
    instant: true

  - query: max(max_over_time((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)[.elapsed:]) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)"))
    metricName: maxMemory-Masters
    instant: true

  - query: avg(node_memory_MemAvailable_bytes and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)"))
    metricName: nodeMemoryAvailable-AggregatedWorkers
    instant: True

  - query: max(max_over_time(sum(irate(node_cpu_seconds_total{mode!="idle", mode!="steal"}[2m]) and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)")) by (instance)[.elapsed:]))
    metricName: maxCPU-Workers
    instant: true

  - query: max(max_over_time((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)[.elapsed:]) and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)"))
    metricName: maxMemory-Workers
    instant: true

  - query: avg(node_memory_MemAvailable_bytes and on (instance) label_replace(kube_node_role{role="infra"}, "instance", "$1", "node", "(.+)"))
    metricName: nodeMemoryAvailable-AggregatedInfra
    instant: True

  - query: avg(node_memory_Active_bytes) by (instance) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)")
    metricName: nodeMemoryActive-Masters
    instant: True

  - query: avg(node_memory_Active_bytes and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)"))
    metricName: nodeMemoryActive-AggregatedWorkers
    instant: True

  - query: avg(avg(node_memory_Active_bytes) by (instance) and on (instance) label_replace(kube_node_role{role="infra"}, "instance", "$1", "node", "(.+)"))
    metricName: nodeMemoryActive-AggregatedInfra

  - query: avg(node_memory_Cached_bytes) by (instance) + avg(node_memory_Buffers_bytes) by (instance) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)")
    metricName: nodeMemoryCached+nodeMemoryBuffers-Masters

  - query: avg(node_memory_Cached_bytes + node_memory_Buffers_bytes and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)"))
    metricName: nodeMemoryCached+nodeMemoryBuffers-AggregatedWorkers

  - query: avg(node_memory_Cached_bytes + node_memory_Buffers_bytes and on (instance) label_replace(kube_node_role{role="infra"}, "instance", "$1", "node", "(.+)"))
    metricName: nodeMemoryCached+nodeMemoryBuffers-AggregatedInfra

  - query: irate(node_network_receive_bytes_total{device=~"^(ens|eth|bond|team).*"}[2m]) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)")
    metricName: rxNetworkBytes-Masters

  - query: avg(irate(node_network_receive_bytes_total{device=~"^(ens|eth|bond|team).*"}[2m]) and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)")) by (device)
    metricName: rxNetworkBytes-AggregatedWorkers

  - query: avg(irate(node_network_receive_bytes_total{device=~"^(ens|eth|bond|team).*"}[2m]) and on (instance) label_replace(kube_node_role{role="infra"}, "instance", "$1", "node", "(.+)")) by (device)
    metricName: rxNetworkBytes-AggregatedInfra

  - query: irate(node_network_transmit_bytes_total{device=~"^(ens|eth|bond|team).*"}[2m]) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)")
    metricName: txNetworkBytes-Masters

  - query: avg(irate(node_network_transmit_bytes_total{device=~"^(ens|eth|bond|team).*"}[2m]) and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)")) by (device)
    metricName: txNetworkBytes-AggregatedWorkers

  - query: avg(irate(node_network_transmit_bytes_total{device=~"^(ens|eth|bond|team).*"}[2m]) and on (instance) label_replace(kube_node_role{role="infra"}, "instance", "$1", "node", "(.+)")) by (device)
    metricName: txNetworkBytes-AggregatedInfra

  - query: rate(node_disk_written_bytes_total{device!~"^(dm|rb).*"}[2m]) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)")
    metricName: nodeDiskWrittenBytes-Masters

  - query: avg(rate(node_disk_written_bytes_total{device!~"^(dm|rb).*"}[2m]) and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)")) by (device)
    metricName: nodeDiskWrittenBytes-AggregatedWorkers

  - query: avg(rate(node_disk_written_bytes_total{device!~"^(dm|rb).*"}[2m]) and on (instance) label_replace(kube_node_role{role="infra"}, "instance", "$1", "node", "(.+)")) by (device)
    metricName: nodeDiskWrittenBytes-AggregatedInfra

  - query: rate(node_disk_read_bytes_total{device!~"^(dm|rb).*"}[2m]) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)")
    metricName: nodeDiskReadBytes-Masters

  - query: avg(rate(node_disk_read_bytes_total{device!~"^(dm|rb).*"}[2m]) and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)")) by (device)
    metricName: nodeDiskReadBytes-AggregatedWorkers

  - query: avg(rate(node_disk_read_bytes_total{device!~"^(dm|rb).*"}[2m]) and on (instance) label_replace(kube_node_role{role="infra"}, "instance", "$1", "node", "(.+)")) by (device)
    metricName: nodeDiskReadBytes-AggregatedInfra
    instant: True

  # Etcd metrics
  - query: sum(rate(etcd_server_leader_changes_seen_total[2m]))
    metricName: etcdLeaderChangesRate
    instant: True

  - query: etcd_server_is_leader > 0
    metricName: etcdServerIsLeader
    instant: True

  - query: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[2m]))
    metricName: 99thEtcdDiskBackendCommitDurationSeconds
    instant: True

  - query: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[2m]))
    metricName: 99thEtcdDiskWalFsyncDurationSeconds
    instant: True

  - query: histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[5m]))
    metricName: 99thEtcdRoundTripTimeSeconds

  - query: etcd_mvcc_db_total_size_in_bytes
    metricName: etcdDBPhysicalSizeBytes

  - query: etcd_mvcc_db_total_size_in_use_in_bytes
    metricName: etcdDBLogicalSizeBytes
    instant: True

  - query: sum by (cluster_version)(etcd_cluster_version)
    metricName: etcdVersion
@@ -135,83 +128,16 @@ metrics:

  - query: sum(rate(etcd_object_counts{}[5m])) by (resource) > 0
    metricName: etcdObjectCount
    instant: True

  - query: histogram_quantile(0.99,sum(rate(etcd_request_duration_seconds_bucket[2m])) by (le,operation,apiserver)) > 0
    metricName: P99APIEtcdRequestLatency

  - query: sum(grpc_server_started_total{namespace="openshift-etcd",grpc_service="etcdserverpb.Watch",grpc_type="bidi_stream"}) - sum(grpc_server_handled_total{namespace="openshift-etcd",grpc_service="etcdserverpb.Watch",grpc_type="bidi_stream"})
    metricName: ActiveWatchStreams

  - query: sum(grpc_server_started_total{namespace="openshift-etcd",grpc_service="etcdserverpb.Lease",grpc_type="bidi_stream"}) - sum(grpc_server_handled_total{namespace="openshift-etcd",grpc_service="etcdserverpb.Lease",grpc_type="bidi_stream"})
    metricName: ActiveLeaseStreams

  - query: sum(rate(etcd_debugging_snap_save_total_duration_seconds_sum{namespace="openshift-etcd"}[2m]))
    metricName: snapshotSaveLatency

  - query: sum(rate(etcd_server_heartbeat_send_failures_total{namespace="openshift-etcd"}[2m]))
    metricName: HeartBeatFailures

  - query: sum(rate(etcd_server_health_failures{namespace="openshift-etcd"}[2m]))
    metricName: HealthFailures

  - query: sum(rate(etcd_server_slow_apply_total{namespace="openshift-etcd"}[2m]))
    metricName: SlowApplies

  - query: sum(rate(etcd_server_slow_read_indexes_total{namespace="openshift-etcd"}[2m]))
    metricName: SlowIndexRead

  - query: sum(etcd_server_proposals_pending)
    metricName: PendingProposals

  - query: histogram_quantile(1.0, sum(rate(etcd_debugging_mvcc_db_compaction_pause_duration_milliseconds_bucket[1m])) by (le, instance))
    metricName: CompactionMaxPause
    instant: True

  - query: sum by (instance) (apiserver_storage_objects)
    metricName: etcdTotalObjectCount
    instant: True

  - query: topk(500, max by(resource) (apiserver_storage_objects))
    metricName: etcdTopObectCount

  # Cluster metrics
  - query: count(kube_namespace_created)
    metricName: namespaceCount

  - query: sum(kube_pod_status_phase{}) by (phase)
    metricName: podStatusCount

  - query: count(kube_secret_info{})
    metricName: secretCount

  - query: count(kube_deployment_labels{})
    metricName: deploymentCount

  - query: count(kube_configmap_info{})
    metricName: configmapCount

  - query: count(kube_service_info{})
    metricName: serviceCount

  - query: kube_node_role
    metricName: nodeRoles
    instant: true

  - query: sum(kube_node_status_condition{status="true"}) by (condition)
    metricName: nodeStatus

  - query: (sum(rate(container_fs_writes_bytes_total{container!="",device!~".+dm.+"}[5m])) by (device, container, node) and on (node) kube_node_role{role="master"}) > 0
    metricName: containerDiskUsage

  - query: cluster_version{type="completed"}
    metricName: clusterVersion
    instant: true

  # Golang metrics

  - query: go_memstats_heap_alloc_bytes{job=~"apiserver|api|etcd"}
    metricName: goHeapAllocBytes

  - query: go_memstats_heap_inuse_bytes{job=~"apiserver|api|etcd"}
    metricName: goHeapInuseBytes

  - query: go_gc_duration_seconds{job=~"apiserver|api|etcd",quantile="1"}
    metricName: goGCDurationSeconds
    instant: True
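A recurring idiom in these queries is joining node_exporter series (keyed by an instance label) with kube_node_role (keyed by a node label): label_replace copies the node value into a new instance label so that the "and on (instance)" match can filter by node role. A minimal illustration:

    # kube_node_role{node="worker-0", role="worker"} gains instance="worker-0":
    label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)")
    # so this keeps only CPU series coming from worker nodes:
    node_cpu_seconds_total and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)")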
config/metrics-report.yaml (new file, +248)
@@ -0,0 +1,248 @@
metrics:

  # API server
  - query: sum(apiserver_current_inflight_requests{}) by (request_kind) > 0
    metricName: APIInflightRequests
    instant: true

  # Kubelet & CRI-O

  # Average and max of the CPU usage from all workers' kubelets
  - query: avg(avg_over_time(irate(process_cpu_seconds_total{service="kubelet",job="kubelet"}[2m])[.elapsed:]) and on (node) kube_node_role{role="worker"})
    metricName: cpu-kubelet
    instant: true

  - query: max(max_over_time(irate(process_cpu_seconds_total{service="kubelet",job="kubelet"}[2m])[.elapsed:]) and on (node) kube_node_role{role="worker"})
    metricName: max-cpu-kubelet
    instant: true

  # Average of the memory usage from all workers' kubelets
  - query: avg(avg_over_time(process_resident_memory_bytes{service="kubelet",job="kubelet"}[.elapsed:]) and on (node) kube_node_role{role="worker"})
    metricName: memory-kubelet
    instant: true

  # Max of the memory usage from all workers' kubelets
  - query: max(max_over_time(process_resident_memory_bytes{service="kubelet",job="kubelet"}[.elapsed:]) and on (node) kube_node_role{role="worker"})
    metricName: max-memory-kubelet
    instant: true

  - query: max_over_time(sum(process_resident_memory_bytes{service="kubelet",job="kubelet"} and on (node) kube_node_role{role="worker"})[.elapsed:])
    metricName: max-memory-sum-kubelet
    instant: true

  # Average and max of the CPU usage from all workers' CRI-O
  - query: avg(avg_over_time(irate(process_cpu_seconds_total{service="kubelet",job="crio"}[2m])[.elapsed:]) and on (node) kube_node_role{role="worker"})
    metricName: cpu-crio
    instant: true

  - query: max(max_over_time(irate(process_cpu_seconds_total{service="kubelet",job="crio"}[2m])[.elapsed:]) and on (node) kube_node_role{role="worker"})
    metricName: max-cpu-crio
    instant: true

  # Average of the memory usage from all workers' CRI-O
  - query: avg(avg_over_time(process_resident_memory_bytes{service="kubelet",job="crio"}[.elapsed:]) and on (node) kube_node_role{role="worker"})
    metricName: memory-crio
    instant: true

  # Max of the memory usage from all workers' CRI-O
  - query: max(max_over_time(process_resident_memory_bytes{service="kubelet",job="crio"}[.elapsed:]) and on (node) kube_node_role{role="worker"})
    metricName: max-memory-crio
    instant: true

  # Etcd

  - query: avg(avg_over_time(histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[2m]))[.elapsed:]))
    metricName: 99thEtcdDiskBackendCommit
    instant: true

  - query: avg(avg_over_time(histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[2m]))[.elapsed:]))
    metricName: 99thEtcdDiskWalFsync
    instant: true

  - query: avg(avg_over_time(histogram_quantile(0.99, irate(etcd_network_peer_round_trip_time_seconds_bucket[2m]))[.elapsed:]))
    metricName: 99thEtcdRoundTripTime
    instant: true

  # Control-plane

  - query: avg(avg_over_time(topk(1, sum(irate(container_cpu_usage_seconds_total{name!="", namespace="openshift-kube-controller-manager"}[2m])) by (pod))[.elapsed:]))
    metricName: cpu-kube-controller-manager
    instant: true

  - query: max(max_over_time(topk(1, sum(irate(container_cpu_usage_seconds_total{name!="", namespace="openshift-kube-controller-manager"}[2m])) by (pod))[.elapsed:]))
    metricName: max-cpu-kube-controller-manager
    instant: true

  - query: avg(avg_over_time(topk(1, sum(container_memory_rss{name!="", namespace="openshift-kube-controller-manager"}) by (pod))[.elapsed:]))
    metricName: memory-kube-controller-manager
    instant: true

  - query: max(max_over_time(topk(1, sum(container_memory_rss{name!="", namespace="openshift-kube-controller-manager"}) by (pod))[.elapsed:]))
    metricName: max-memory-kube-controller-manager
    instant: true

  - query: avg(avg_over_time(topk(3, sum(irate(container_cpu_usage_seconds_total{name!="", namespace="openshift-kube-apiserver"}[2m])) by (pod))[.elapsed:]))
    metricName: cpu-kube-apiserver
    instant: true

  - query: avg(avg_over_time(topk(3, sum(container_memory_rss{name!="", namespace="openshift-kube-apiserver"}) by (pod))[.elapsed:]))
    metricName: memory-kube-apiserver
    instant: true

  - query: avg(avg_over_time(topk(3, sum(irate(container_cpu_usage_seconds_total{name!="", namespace="openshift-apiserver"}[2m])) by (pod))[.elapsed:]))
    metricName: cpu-openshift-apiserver
    instant: true

  - query: avg(avg_over_time(topk(3, sum(container_memory_rss{name!="", namespace="openshift-apiserver"}) by (pod))[.elapsed:]))
    metricName: memory-openshift-apiserver
    instant: true

  - query: avg(avg_over_time(topk(3, sum(irate(container_cpu_usage_seconds_total{name!="", namespace="openshift-etcd"}[2m])) by (pod))[.elapsed:]))
    metricName: cpu-etcd
    instant: true

  - query: avg(avg_over_time(topk(3,sum(container_memory_rss{name!="", namespace="openshift-etcd"}) by (pod))[.elapsed:]))
    metricName: memory-etcd
    instant: true

  - query: avg(avg_over_time(topk(1, sum(irate(container_cpu_usage_seconds_total{name!="", namespace="openshift-controller-manager"}[2m])) by (pod))[.elapsed:]))
    metricName: cpu-openshift-controller-manager
    instant: true

  - query: avg(avg_over_time(topk(1, sum(container_memory_rss{name!="", namespace="openshift-controller-manager"}) by (pod))[.elapsed:]))
    metricName: memory-openshift-controller-manager
    instant: true

  # multus

  - query: avg(avg_over_time(irate(container_cpu_usage_seconds_total{name!="", namespace="openshift-multus", pod=~"(multus).+", container!="POD"}[2m])[.elapsed:])) by (container)
    metricName: cpu-multus
    instant: true

  - query: avg(avg_over_time(container_memory_rss{name!="", namespace="openshift-multus", pod=~"(multus).+", container!="POD"}[.elapsed:])) by (container)
    metricName: memory-multus
    instant: true

  # OVNKubernetes - standard & IC

  - query: avg(avg_over_time(irate(container_cpu_usage_seconds_total{name!="", namespace="openshift-ovn-kubernetes", pod=~"(ovnkube-master|ovnkube-control-plane).+", container!="POD"}[2m])[.elapsed:])) by (container)
    metricName: cpu-ovn-control-plane
    instant: true

  - query: avg(avg_over_time(container_memory_rss{name!="", namespace="openshift-ovn-kubernetes", pod=~"(ovnkube-master|ovnkube-control-plane).+", container!="POD"}[.elapsed:])) by (container)
    metricName: memory-ovn-control-plane
    instant: true

  - query: avg(avg_over_time(irate(container_cpu_usage_seconds_total{name!="", namespace="openshift-ovn-kubernetes", pod=~"ovnkube-node.+", container!="POD"}[2m])[.elapsed:])) by (container)
    metricName: cpu-ovnkube-node
    instant: true

  - query: avg(avg_over_time(container_memory_rss{name!="", namespace="openshift-ovn-kubernetes", pod=~"ovnkube-node.+", container!="POD"}[.elapsed:])) by (container)
    metricName: memory-ovnkube-node
    instant: true

  # Nodes

  - query: avg(avg_over_time(sum(irate(node_cpu_seconds_total{mode!="idle", mode!="steal"}[2m]) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)")) by (instance)[.elapsed:]))
    metricName: cpu-masters
    instant: true

  - query: avg(avg_over_time((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)[.elapsed:]) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)"))
    metricName: memory-masters
    instant: true

  - query: max(max_over_time((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)[.elapsed:]) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)"))
    metricName: max-memory-masters
    instant: true

  - query: avg(avg_over_time(sum(irate(node_cpu_seconds_total{mode!="idle", mode!="steal"}[2m]) and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)")) by (instance)[.elapsed:]))
    metricName: cpu-workers
    instant: true

  - query: max(max_over_time(sum(irate(node_cpu_seconds_total{mode!="idle", mode!="steal"}[2m]) and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)")) by (instance)[.elapsed:]))
    metricName: max-cpu-workers
    instant: true

  - query: avg(avg_over_time((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)[.elapsed:]) and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)"))
    metricName: memory-workers
    instant: true

  - query: max(max_over_time((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)[.elapsed:]) and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)"))
    metricName: max-memory-workers
    instant: true

  - query: sum( (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)") )
    metricName: memory-sum-workers
    instant: true

  - query: avg(avg_over_time(sum(irate(node_cpu_seconds_total{mode!="idle", mode!="steal"}[2m]) and on (instance) label_replace(kube_node_role{role="infra"}, "instance", "$1", "node", "(.+)")) by (instance)[.elapsed:]))
    metricName: cpu-infra
    instant: true

  - query: max(max_over_time(sum(irate(node_cpu_seconds_total{mode!="idle", mode!="steal"}[2m]) and on (instance) label_replace(kube_node_role{role="infra"}, "instance", "$1", "node", "(.+)")) by (instance)[.elapsed:]))
    metricName: max-cpu-infra
    instant: true

  - query: avg(avg_over_time((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)[.elapsed:]) and on (instance) label_replace(kube_node_role{role="infra"}, "instance", "$1", "node", "(.+)"))
    metricName: memory-infra
    instant: true

  - query: max(max_over_time((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)[.elapsed:]) and on (instance) label_replace(kube_node_role{role="infra"}, "instance", "$1", "node", "(.+)"))
    metricName: max-memory-infra
    instant: true

  - query: max_over_time(sum((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) and on (instance) label_replace(kube_node_role{role="infra"}, "instance", "$1", "node", "(.+)"))[.elapsed:])
    metricName: max-memory-sum-infra
    instant: true

  # Monitoring and ingress

  - query: avg(avg_over_time(sum(irate(container_cpu_usage_seconds_total{name!="", namespace="openshift-monitoring", pod=~"prometheus-k8s.+"}[2m])) by (pod)[.elapsed:]))
    metricName: cpu-prometheus
    instant: true

  - query: max(max_over_time(sum(irate(container_cpu_usage_seconds_total{name!="", namespace="openshift-monitoring", pod=~"prometheus-k8s.+"}[2m])) by (pod)[.elapsed:]))
    metricName: max-cpu-prometheus
    instant: true

  - query: avg(avg_over_time(sum(container_memory_rss{name!="", namespace="openshift-monitoring", pod=~"prometheus-k8s.+"}) by (pod)[.elapsed:]))
    metricName: memory-prometheus
    instant: true

  - query: max(max_over_time(sum(container_memory_rss{name!="", namespace="openshift-monitoring", pod=~"prometheus-k8s.+"}) by (pod)[.elapsed:]))
    metricName: max-memory-prometheus
    instant: true

  - query: avg(avg_over_time(sum(irate(container_cpu_usage_seconds_total{name!="", namespace="openshift-ingress", pod=~"router-default.+"}[2m])) by (pod)[.elapsed:]))
    metricName: cpu-router
    instant: true

  - query: avg(avg_over_time(sum(container_memory_rss{name!="", namespace="openshift-ingress", pod=~"router-default.+"}) by (pod)[.elapsed:]))
    metricName: memory-router
    instant: true

  # Cluster

  - query: avg_over_time(cluster:memory_usage:ratio[.elapsed:])
    metricName: memory-cluster-usage-ratio
    instant: true

  - query: avg_over_time(cluster:node_cpu:ratio[.elapsed:])
    metricName: cpu-cluster-usage-ratio
    instant: true

  # Retain the raw CPU seconds totals for comparison
  - query: sum(node_cpu_seconds_total and on (instance) label_replace(kube_node_role{role="worker",role!="infra"}, "instance", "$1", "node", "(.+)")) by (mode)
    metricName: nodeCPUSeconds-Workers
    instant: true

  - query: sum(node_cpu_seconds_total and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)")) by (mode)
    metricName: nodeCPUSeconds-Masters
    instant: true

  - query: sum(node_cpu_seconds_total and on (instance) label_replace(kube_node_role{role="infra"}, "instance", "$1", "node", "(.+)")) by (mode)
    metricName: nodeCPUSeconds-Infra
    instant: true
@@ -1,13 +1,7 @@
metrics:
  # API server
  - query: histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{apiserver="kube-apiserver", verb!~"WATCH", subresource!="log"}[2m])) by (verb,resource,subresource,instance,le)) > 0
    metricName: API99thLatency

  - query: sum(irate(apiserver_request_total{apiserver="kube-apiserver",verb!="WATCH",subresource!="log"}[2m])) by (verb,instance,resource,code) > 0
    metricName: APIRequestRate

  - query: sum(apiserver_current_inflight_requests{}) by (request_kind) > 0
    metricName: APIInflightRequests
  - query: irate(apiserver_request_total{verb="POST", resource="pods", subresource="binding",code="201"}[2m]) > 0
    metricName: schedulingThroughput

  # Containers & pod metrics
  - query: sum(irate(container_cpu_usage_seconds_total{name!="",namespace=~"openshift-(etcd|oauth-apiserver|.*apiserver|ovn-kubernetes|sdn|ingress|authentication|.*controller-manager|.*scheduler|monitoring|logging|image-registry)"}[2m]) * 100) by (pod, namespace, node)
@@ -33,8 +27,17 @@ metrics:
    metricName: crioMemory

  # Node metrics
  - query: sum(irate(node_cpu_seconds_total[2m])) by (mode,instance) > 0
    metricName: nodeCPU
  - query: (sum(irate(node_cpu_seconds_total[2m])) by (mode,instance) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)")) > 0
    metricName: nodeCPU-Masters

  - query: (avg_over_time((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)[.elapsed:]) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)"))
    metricName: nodeMemory-Masters

  - query: (sum(irate(node_cpu_seconds_total[2m])) by (mode,instance) and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)")) > 0
    metricName: nodeCPU-Workers

  - query: (avg_over_time((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)[2m:]) and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)"))
    metricName: nodeMemory-Workers

  - query: avg(node_memory_MemAvailable_bytes) by (instance)
    metricName: nodeMemoryAvailable
@@ -42,6 +45,9 @@ metrics:
  - query: avg(node_memory_Active_bytes) by (instance)
    metricName: nodeMemoryActive

  - query: max(max_over_time((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)[.elapsed:]) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)"))
    metricName: maxMemory-Masters

  - query: avg(node_memory_Cached_bytes) by (instance) + avg(node_memory_Buffers_bytes) by (instance)
    metricName: nodeMemoryCached+nodeMemoryBuffers

@@ -84,34 +90,4 @@ metrics:

  - query: sum by (cluster_version)(etcd_cluster_version)
    metricName: etcdVersion
    instant: true

  # Cluster metrics
  - query: count(kube_namespace_created)
    metricName: namespaceCount

  - query: sum(kube_pod_status_phase{}) by (phase)
    metricName: podStatusCount

  - query: count(kube_secret_info{})
    metricName: secretCount

  - query: count(kube_deployment_labels{})
    metricName: deploymentCount

  - query: count(kube_configmap_info{})
    metricName: configmapCount

  - query: count(kube_service_info{})
    metricName: serviceCount

  - query: kube_node_role
    metricName: nodeRoles
    instant: true

  - query: sum(kube_node_status_condition{status="true"}) by (condition)
    metricName: nodeStatus

  - query: cluster_version{type="completed"}
    metricName: clusterVersion
    instant: true
@@ -1,14 +1,19 @@
# oc build
FROM golang:1.22.5 AS oc-build
FROM golang:1.23.1 AS oc-build
RUN apt-get update && apt-get install -y --no-install-recommends libkrb5-dev
WORKDIR /tmp
RUN git clone --branch release-4.18 https://github.com/openshift/oc.git
WORKDIR /tmp/oc
RUN go mod edit -go 1.22.5 &&\
RUN go mod edit -go 1.23.1 &&\
    go get github.com/moby/buildkit@v0.12.5 &&\
    go get github.com/containerd/containerd@v1.7.11&&\
    go get github.com/docker/docker@v25.0.6&&\
    go get github.com/opencontainers/runc@v1.1.14&&\
    go get github.com/go-git/go-git/v5@v5.13.0&&\
    go get golang.org/x/net@v0.36.0&&\
    go get github.com/containerd/containerd@v1.7.27&&\
    go get golang.org/x/oauth2@v0.27.0&&\
    go get golang.org/x/crypto@v0.35.0&&\
    go mod tidy && go mod vendor
RUN make GO_REQUIRED_MIN_VERSION:= oc

@@ -45,10 +50,16 @@ RUN if [ -n "$PR_NUMBER" ]; then git fetch origin pull/${PR_NUMBER}/head:pr-${PR
# if it is a TAG trigger checkout the tag
RUN if [ -n "$TAG" ]; then git checkout "$TAG";fi

RUN python3.9 -m ensurepip
RUN python3.9 -m ensurepip --upgrade --default-pip
RUN python3.9 -m pip install --upgrade pip setuptools==70.0.0
RUN pip3.9 install -r requirements.txt
RUN pip3.9 install jsonschema

LABEL krknctl.title.global="Krkn Base Image"
LABEL krknctl.description.global="This is the krkn base image."
LABEL krknctl.input_fields.global='$KRKNCTL_INPUT'

RUN chown -R krkn:krkn /home/krkn && chmod 755 /home/krkn
USER krkn
ENTRYPOINT ["python3.9", "run_kraken.py"]
containers/compile_dockerfile.sh (new executable file, 5 lines)
@@ -0,0 +1,5 @@
SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
cd "$SCRIPT_DIR"
export KRKNCTL_INPUT=$(cat krknctl-input.json|tr -d "\n")

envsubst '${KRKNCTL_INPUT}' < Dockerfile.template > Dockerfile
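The substitution the script performs with envsubst can be reproduced in a few lines of Python, which may help when testing the template locally; this is a sketch, with file paths assumed relative to `containers/`:

```python
from pathlib import Path

# Collapse the JSON spec to a single line, mirroring the script's `tr -d "\n"`.
krknctl_input = Path("krknctl-input.json").read_text().replace("\n", "")
template = Path("Dockerfile.template").read_text()
# envsubst with an allow-list substitutes only the listed variable, in both
# $VAR and ${VAR} forms; every other ${...} token in the template is preserved.
for token in ("${KRKNCTL_INPUT}", "$KRKNCTL_INPUT"):
    template = template.replace(token, krknctl_input)
Path("Dockerfile").write_text(template)
```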
containers/krknctl-input.json (new file, 381 lines)
@@ -0,0 +1,381 @@
[
  {
    "name": "cerberus-enabled",
    "short_description": "Enable Cerberus",
    "description": "Enables Cerberus Support",
    "variable": "CERBERUS_ENABLED",
    "type": "enum",
    "default": "False",
    "allowed_values": "True,False",
    "separator": ",",
    "required": "false"
  },
  {
    "name": "cerberus-url",
    "short_description": "Cerberus URL",
    "description": "Cerberus http url",
    "variable": "CERBERUS_URL",
    "type": "string",
    "default": "http://0.0.0.0:8080",
    "validator": "^(http|https):\/\/.*",
    "required": "false"
  },
  {
    "name": "distribution",
    "short_description": "Orchestrator distribution",
    "description": "Selects the orchestrator distribution",
    "variable": "DISTRIBUTION",
    "type": "enum",
    "default": "openshift",
    "allowed_values": "openshift,kubernetes",
    "separator": ",",
    "required": "false"
  },
  {
    "name": "krkn-kubeconfig",
    "short_description": "Krkn kubeconfig path",
    "description": "Sets the path where krkn will search for kubeconfig (in container)",
    "variable": "KRKN_KUBE_CONFIG",
    "type": "string",
    "default": "/home/krkn/.kube/config",
    "required": "false"
  },
  {
    "name": "wait-duration",
    "short_description": "Post chaos wait duration",
    "description": "waits for a certain amount of time after the scenario",
    "variable": "WAIT_DURATION",
    "type": "number",
    "default": "1"
  },
  {
    "name": "iterations",
    "short_description": "Chaos scenario iterations",
    "description": "number of times the same chaos scenario will be executed",
    "variable": "ITERATIONS",
    "type": "number",
    "default": "1"
  },
  {
    "name": "daemon-mode",
    "short_description": "Sets krkn daemon mode",
    "description": "if set the scenario will execute forever",
    "variable": "DAEMON_MODE",
    "type": "enum",
    "allowed_values": "True,False",
    "separator": ",",
    "default": "False",
    "required": "false"
  },
  {
    "name": "uuid",
    "short_description": "Sets krkn run uuid",
    "description": "sets krkn run uuid instead of generating it",
    "variable": "UUID",
    "type": "string",
    "default": "",
    "required": "false"
  },
  {
    "name": "capture-metrics",
    "short_description": "Enables metrics capture",
    "description": "Enables metrics capture",
    "variable": "CAPTURE_METRICS",
    "type": "enum",
    "allowed_values": "True,False",
    "separator": ",",
    "default": "False",
    "required": "false"
  },
  {
    "name": "enable-alerts",
    "short_description": "Enables cluster alerts check",
    "description": "Enables cluster alerts check",
    "variable": "ENABLE_ALERTS",
    "type": "enum",
    "allowed_values": "True,False",
    "separator": ",",
    "default": "False",
    "required": "false"
  },
  {
    "name": "alerts-path",
    "short_description": "Cluster alerts path file (in container)",
    "description": "Allows to specify a different alert file path",
    "variable": "ALERTS_PATH",
    "type": "string",
    "default": "config/alerts.yaml",
    "required": "false"
  },
  {
    "name": "metrics-path",
    "short_description": "Cluster metrics path file (in container)",
    "description": "Allows to specify a different metrics file path",
    "variable": "METRICS_PATH",
    "type": "string",
    "default": "config/metrics-aggregated.yaml",
    "required": "false"
  },
  {
    "name": "enable-es",
    "short_description": "Enables elastic search data collection",
    "description": "Enables elastic search data collection",
    "variable": "ENABLE_ES",
    "type": "enum",
    "allowed_values": "True,False",
    "separator": ",",
    "default": "False",
    "required": "false"
  },
  {
    "name": "es-server",
    "short_description": "Elasticsearch instance URL",
    "description": "Elasticsearch instance URL",
    "variable": "ES_SERVER",
    "type": "string",
    "default": "http://0.0.0.0",
    "required": "false"
  },
  {
    "name": "es-port",
    "short_description": "Elasticsearch instance port",
    "description": "Elasticsearch instance port",
    "variable": "ES_PORT",
    "type": "number",
    "default": "443",
    "required": "false"
  },
  {
    "name": "es-username",
    "short_description": "Elasticsearch instance username",
    "description": "Elasticsearch instance username",
    "variable": "ES_USERNAME",
    "type": "string",
    "default": "elastic",
    "required": "false"
  },
  {
    "name": "es-password",
    "short_description": "Elasticsearch instance password",
    "description": "Elasticsearch instance password",
    "variable": "ES_PASSWORD",
    "type": "string",
    "default": "",
    "required": "false"
  },
  {
    "name": "es-verify-certs",
    "short_description": "Enables elasticsearch TLS certificate verification",
    "description": "Enables elasticsearch TLS certificate verification",
    "variable": "ES_VERIFY_CERTS",
    "type": "enum",
    "allowed_values": "True,False",
    "separator": ",",
    "default": "False",
    "required": "false"
  },
  {
    "name": "es-metrics-index",
    "short_description": "Elasticsearch metrics index",
    "description": "Index name for metrics in Elasticsearch",
    "variable": "ES_METRICS_INDEX",
    "type": "string",
    "default": "krkn-metrics",
    "required": "false"
  },
  {
    "name": "es-alerts-index",
    "short_description": "Elasticsearch alerts index",
    "description": "Index name for alerts in Elasticsearch",
    "variable": "ES_ALERTS_INDEX",
    "type": "string",
    "default": "krkn-alerts",
    "required": "false"
  },
  {
    "name": "es-telemetry-index",
    "short_description": "Elasticsearch telemetry index",
    "description": "Index name for telemetry in Elasticsearch",
    "variable": "ES_TELEMETRY_INDEX",
    "type": "string",
    "default": "krkn-telemetry",
    "required": "false"
  },
  {
    "name": "check-critical-alerts",
    "short_description": "Check critical alerts",
    "description": "Enables checking for critical alerts",
    "variable": "CHECK_CRITICAL_ALERTS",
    "type": "enum",
    "allowed_values": "True,False",
    "separator": ",",
    "default": "False",
    "required": "false"
  },
  {
    "name": "telemetry-enabled",
    "short_description": "Enable telemetry",
    "description": "Enables telemetry support",
    "variable": "TELEMETRY_ENABLED",
    "type": "enum",
    "allowed_values": "True,False",
    "separator": ",",
    "default": "False",
    "required": "false"
  },
  {
    "name": "telemetry-api-url",
    "short_description": "Telemetry API URL",
    "description": "API endpoint for telemetry data",
    "variable": "TELEMETRY_API_URL",
    "type": "string",
    "default": "https://ulnmf9xv7j.execute-api.us-west-2.amazonaws.com/production",
    "validator": "^(http|https):\/\/.*",
    "required": "false"
  },
  {
    "name": "telemetry-username",
    "short_description": "Telemetry username",
    "description": "Username for telemetry authentication",
    "variable": "TELEMETRY_USERNAME",
    "type": "string",
    "default": "redhat-chaos",
    "required": "false"
  },
  {
    "name": "telemetry-password",
    "short_description": "Telemetry password",
    "description": "Password for telemetry authentication",
    "variable": "TELEMETRY_PASSWORD",
    "type": "string",
    "default": "",
    "required": "false"
  },
  {
    "name": "telemetry-prometheus-backup",
    "short_description": "Prometheus backup for telemetry",
    "description": "Enables Prometheus backup for telemetry",
    "variable": "TELEMETRY_PROMETHEUS_BACKUP",
    "type": "enum",
    "allowed_values": "True,False",
    "separator": ",",
    "default": "True",
    "required": "false"
  },
  {
    "name": "telemetry-full-prometheus-backup",
    "short_description": "Full Prometheus backup",
    "description": "Enables full Prometheus backup for telemetry",
    "variable": "TELEMETRY_FULL_PROMETHEUS_BACKUP",
    "type": "enum",
    "allowed_values": "True,False",
    "separator": ",",
    "default": "False",
    "required": "false"
  },
  {
    "name": "telemetry-backup-threads",
    "short_description": "Telemetry backup threads",
    "description": "Number of threads for telemetry backup",
    "variable": "TELEMETRY_BACKUP_THREADS",
    "type": "number",
    "default": "5",
    "required": "false"
  },
  {
    "name": "telemetry-archive-path",
    "short_description": "Telemetry archive path",
    "description": "Path to save telemetry archive",
    "variable": "TELEMETRY_ARCHIVE_PATH",
    "type": "string",
    "default": "/tmp",
    "required": "false"
  },
  {
    "name": "telemetry-max-retries",
    "short_description": "Telemetry max retries",
    "description": "Maximum retries for telemetry operations",
    "variable": "TELEMETRY_MAX_RETRIES",
    "type": "number",
    "default": "0",
    "required": "false"
  },
  {
    "name": "telemetry-run-tag",
    "short_description": "Telemetry run tag",
    "description": "Tag for telemetry run",
    "variable": "TELEMETRY_RUN_TAG",
    "type": "string",
    "default": "chaos",
    "required": "false"
  },
  {
    "name": "telemetry-group",
    "short_description": "Telemetry group",
    "description": "Group name for telemetry data",
    "variable": "TELEMETRY_GROUP",
    "type": "string",
    "default": "default",
    "required": "false"
  },
  {
    "name": "telemetry-archive-size",
    "short_description": "Telemetry archive size",
    "description": "Maximum size for telemetry archives",
    "variable": "TELEMETRY_ARCHIVE_SIZE",
    "type": "number",
    "default": "1000",
    "required": "false"
  },
  {
    "name": "telemetry-logs-backup",
    "short_description": "Telemetry logs backup",
    "description": "Enables logs backup for telemetry",
    "variable": "TELEMETRY_LOGS_BACKUP",
    "type": "enum",
    "allowed_values": "True,False",
    "separator": ",",
    "default": "False",
    "required": "false"
  },
  {
    "name": "telemetry-filter-pattern",
    "short_description": "Telemetry filter pattern",
    "description": "Filter pattern for telemetry logs",
    "variable": "TELEMETRY_FILTER_PATTERN",
    "type": "string",
    "default": "[\"(\\\\w{3}\\\\s\\\\d{1,2}\\\\s\\\\d{2}:\\\\d{2}:\\\\d{2}\\\\.\\\\d+).+\",\"kinit (\\\\d+/\\\\d+/\\\\d+\\\\s\\\\d{2}:\\\\d{2}:\\\\d{2})\\\\s+\",\"(\\\\d{4}-\\\\d{2}-\\\\d{2}T\\\\d{2}:\\\\d{2}:\\\\d{2}\\\\.\\\\d+Z).+\"]",
    "required": "false"
  },
  {
    "name": "telemetry-cli-path",
    "short_description": "Telemetry CLI path (oc)",
    "description": "Path to telemetry CLI tool (oc)",
    "variable": "TELEMETRY_CLI_PATH",
    "type": "string",
    "default": "",
    "required": "false"
  },
  {
    "name": "telemetry-events-backup",
    "short_description": "Telemetry events backup",
    "description": "Enables events backup for telemetry",
    "variable": "TELEMETRY_EVENTS_BACKUP",
    "type": "enum",
    "allowed_values": "True,False",
    "separator": ",",
    "default": "True",
    "required": "false"
  },
  {
    "name": "krkn-debug",
    "short_description": "Krkn debug mode",
    "description": "Enables debug mode for Krkn",
    "variable": "KRKN_DEBUG",
    "type": "enum",
    "allowed_values": "True,False",
    "separator": ",",
    "default": "False",
    "required": "false"
  }
]
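As an aside, a launcher could validate the container environment against this spec along the following lines; this is a sketch of the idea, not krknctl's actual validation logic:

```python
import json
import os
import re

with open("containers/krknctl-input.json") as f:
    spec = json.load(f)

for field in spec:
    # Fall back to the spec default when the variable is not set.
    value = os.environ.get(field["variable"], field.get("default", ""))
    if field["type"] == "enum":
        allowed = field["allowed_values"].split(field["separator"])
        assert value in allowed, f"{field['variable']}={value} not in {allowed}"
    elif field["type"] == "number":
        assert value == "" or value.lstrip("-").isdigit(), field["variable"]
    elif "validator" in field:
        assert re.match(field["validator"], value), field["variable"]
```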
@@ -1,70 +0,0 @@
## Arcaflow Scenarios
Arcaflow is a workflow engine in development which provides the ability to execute workflow steps in sequence, in parallel, repeatedly, etc. The main difference from competitors such as Netflix Conductor is the ability to run ad-hoc workflows without requiring any infrastructure setup.

The engine uses containers to execute plugins and runs them either locally in Docker/Podman or remotely on a Kubernetes cluster. The workflow system is strongly typed and allows for generating JSON schema and OpenAPI documents for all data formats involved.

### Available Scenarios
#### Hog scenarios:
- [CPU Hog](arcaflow_scenarios/cpu_hog.md)
- [Memory Hog](arcaflow_scenarios/memory_hog.md)
- [I/O Hog](arcaflow_scenarios/io_hog.md)

### Prerequisites
Arcaflow supports three deployment technologies:
- Docker
- Podman
- Kubernetes

#### Docker
In order to run Arcaflow Scenarios with the Docker deployer, be sure that:
- Docker is correctly installed in your Operating System (for instructions on how to install Docker please refer to the [Docker Documentation](https://www.docker.com/))
- The Docker daemon is running

#### Podman
The podman deployer is built around the podman CLI and does not necessarily need to run alongside the podman daemon.
To run Arcaflow Scenarios in your Operating System, be sure that:
- podman is correctly installed in your Operating System (for instructions on how to install podman refer to the [Podman Documentation](https://podman.io/))
- the podman CLI is in your shell PATH

#### Kubernetes
The Kubernetes deployer integrates the Kubernetes API client directly and needs only a valid kubeconfig file and a reachable Kubernetes/OpenShift cluster.

### Usage

To enable arcaflow scenarios edit the kraken config file, go to the section `kraken -> chaos_scenarios` of the yaml structure,
add a new element to the list named `arcaflow_scenarios`, then add the desired scenario
pointing to the `input.yaml` file.
```
kraken:
    ...
    chaos_scenarios:
        - arcaflow_scenarios:
            - scenarios/arcaflow/cpu-hog/input.yaml
```

#### input.yaml
The implemented scenarios can be found in the *scenarios/arcaflow/<scenario_name>* folder.
The entrypoint of each scenario is the *input.yaml* file.
This file contains all the options needed to set up the scenario according to the desired target.
### config.yaml
The arcaflow config file. Here you can set the arcaflow deployer and the arcaflow log level.
The supported deployers are:
- Docker
- Podman (podman daemon not needed, suggested option)
- Kubernetes

The supported log levels are:
- debug
- info
- warning
- error
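A minimal sketch of what such a config.yaml could look like, shown as a loadable string: the `deployers.image.deployer_name` key mirrors what the plugin code further down in this diff reads, while the `log` block is an assumption based on the log levels listed above.

```python
import yaml

# Sketch only: deployers.image.deployer_name is the key ArcaflowScenarioPlugin
# inspects; the log section is assumed from the documented log levels.
config = yaml.safe_load("""
deployers:
  image:
    deployer_name: podman
log:
  level: info
""")
assert config["deployers"]["image"]["deployer_name"] == "podman"
```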
### workflow.yaml
This file contains the steps that will be executed to perform the scenario against the target.
Each step is represented by a container that will be executed by the deployer, together with its options.
Note that we provide the scenarios as a template, but they can be manipulated to define more complex workflows.
For more details on the Arcaflow workflow architecture and syntax, refer to the [Arcaflow Documentation](https://arcalot.io/arcaflow/).

This edit is no longer in the quay image.
Working on fix in ticket: https://issues.redhat.com/browse/CHAOS-494
This will affect all versions 4.12 and higher of OpenShift.
@@ -1,19 +0,0 @@
# CPU Hog
This scenario is based on the arcaflow [arcaflow-plugin-stressng](https://github.com/arcalot/arcaflow-plugin-stressng) plugin.
The purpose of this scenario is to create CPU pressure on a particular node of the Kubernetes/OpenShift cluster for a time span.
To enable this plugin add the pointer to the scenario input file `scenarios/arcaflow/cpu-hog/input.yaml` as described in the
Usage section.
This scenario takes a list of objects named `input_list` with the following properties (a sample entry is sketched after this list):

- **kubeconfig :** *string* the kubeconfig needed by the deployer to deploy the sysbench plugin in the target cluster
- **namespace :** *string* the namespace where the scenario container will be deployed
  **Note:** this parameter will be automatically filled by kraken if the `kubeconfig_path` property is correctly set
- **node_selector :** *key-value map* the node label that will be used as `nodeSelector` by the pod to target a specific cluster node
- **duration :** *string* stop the stress test after N seconds. One can also specify the units of time in seconds, minutes, hours, days or years with the suffix s, m, h, d or y.
- **cpu_count :** *int* the number of CPU cores to be used (0 means all)
- **cpu_method :** *string* fine-grained control of which cpu stressors to use (ackermann, cfloat etc.; see the [manpage](https://manpages.org/sysbench) for all the cpu_method options)
- **cpu_load_percentage :** *int* the CPU load by percentage

To perform several load tests simultaneously in the same run (e.g. stress two or more nodes in the same run), add another item
to the `input_list` with the same properties (and possibly different values, e.g. different node_selectors
to schedule the pod on different nodes). To reduce (or increase) the parallelism, change the `parallelism` value in the `workflow.yaml` file.
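For orientation, a single `input_list` entry for this scenario might look like the sketch below; the values are illustrative, and `kubeconfig` is omitted because kraken injects it when `kubeconfig_path` is set:

```python
import yaml

# Illustrative input.yaml content for the CPU hog scenario; values are made up.
entry = yaml.safe_load("""
input_list:
  - namespace: default
    node_selector:
      kubernetes.io/hostname: worker-0
    duration: 30s
    cpu_count: 0
    cpu_method: all
    cpu_load_percentage: 80
""")["input_list"][0]
assert entry["cpu_load_percentage"] == 80
```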
@@ -1,21 +0,0 @@
# I/O Hog
This scenario is based on the arcaflow [arcaflow-plugin-stressng](https://github.com/arcalot/arcaflow-plugin-stressng) plugin.
The purpose of this scenario is to create disk pressure on a particular node of the Kubernetes/OpenShift cluster for a time span.
The scenario allows attaching a node path to the pod as a `hostPath` volume.
To enable this plugin add the pointer to the scenario input file `scenarios/arcaflow/io-hog/input.yaml` as described in the
Usage section.
This scenario takes a list of objects named `input_list` with the following properties:

- **kubeconfig :** *string* the kubeconfig needed by the deployer to deploy the sysbench plugin in the target cluster
- **namespace :** *string* the namespace where the scenario container will be deployed
  **Note:** this parameter will be automatically filled by kraken if the `kubeconfig_path` property is correctly set
- **node_selector :** *key-value map* the node label that will be used as `nodeSelector` by the pod to target a specific cluster node
- **duration :** *string* stop the stress test after N seconds. One can also specify the units of time in seconds, minutes, hours, days or years with the suffix s, m, h, d or y.
- **target_pod_folder :** *string* the path in the pod where the volume is mounted
- **target_pod_volume :** *object* the `hostPath` volume definition in the [Kubernetes/OpenShift](https://docs.openshift.com/container-platform/3.11/install_config/persistent_storage/using_hostpath.html) format, that will be attached to the pod as a volume
- **io_write_bytes :** *string* writes N bytes for each hdd process. The size can be expressed as % of free space on the file system or in units of Bytes, KBytes, MBytes and GBytes using the suffix b, k, m or g
- **io_block_size :** *string* size of each write in bytes. Size can be from 1 byte to 4m.

To perform several load tests simultaneously in the same run (e.g. stress two or more nodes in the same run), add another item
to the `input_list` with the same properties (and possibly different values, e.g. different node_selectors
to schedule the pod on different nodes). To reduce (or increase) the parallelism, change the `parallelism` value in the `workflow.yaml` file.
@@ -1,18 +0,0 @@
# Memory Hog
This scenario is based on the arcaflow [arcaflow-plugin-stressng](https://github.com/arcalot/arcaflow-plugin-stressng) plugin.
The purpose of this scenario is to create Virtual Memory pressure on a particular node of the Kubernetes/OpenShift cluster for a time span.
To enable this plugin add the pointer to the scenario input file `scenarios/arcaflow/memory-hog/input.yaml` as described in the
Usage section.
This scenario takes a list of objects named `input_list` with the following properties:

- **kubeconfig :** *string* the kubeconfig needed by the deployer to deploy the sysbench plugin in the target cluster
- **namespace :** *string* the namespace where the scenario container will be deployed
  **Note:** this parameter will be automatically filled by kraken if the `kubeconfig_path` property is correctly set
- **node_selector :** *key-value map* the node label that will be used as `nodeSelector` by the pod to target a specific cluster node
- **duration :** *string* stop the stress test after N seconds. One can also specify the units of time in seconds, minutes, hours, days or years with the suffix s, m, h, d or y.
- **vm_bytes :** *string* N bytes per vm process or percentage of memory used (using the % symbol). The size can be expressed in units of Bytes, KBytes, MBytes and GBytes using the suffix b, k, m or g.
- **vm_workers :** *int* number of VM stressors to be run (0 means 1 stressor per CPU)

To perform several load tests simultaneously in the same run (e.g. stress two or more nodes in the same run), add another item
to the `input_list` with the same properties (and possibly different values, e.g. different node_selectors
to schedule the pod on different nodes). To reduce (or increase) the parallelism, change the `parallelism` value in the `workflow.yaml` file.
@@ -13,13 +13,26 @@ Supported Cloud Providers:
**NOTE**: For clusters with AWS make sure the [AWS CLI](https://docs.aws.amazon.com/cli/latest/userguide/getting-started-install.html) is installed and properly [configured](https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-quickstart.html) using an AWS account

## GCP
**NOTE**: For clusters with GCP make sure the [GCP CLI](https://cloud.google.com/sdk/docs/install#linux) is installed.

A Google service account is required to give proper authentication to GCP for node actions. See [here](https://cloud.google.com/docs/authentication/getting-started) for how to create a service account.
In order to set up Application Default Credentials (ADC) for use by Cloud Client Libraries, you can provide either service account credentials or the credentials associated with your user account:

**NOTE**: A user with 'resourcemanager.projects.setIamPolicy' permission is required to grant project-level permissions to the service account.
- Using service account credentials:

  After creating the service account you will need to enable the account using the following: ```export GOOGLE_APPLICATION_CREDENTIALS="<serviceaccount.json>"```
A Google service account is required to give proper authentication to GCP for node actions. See [here](https://cloud.google.com/docs/authentication/getting-started) for how to create a service account.

**NOTE**: A user with 'resourcemanager.projects.setIamPolicy' permission is required to grant project-level permissions to the service account.

After creating the service account you will need to enable the account using the following: ```export GOOGLE_APPLICATION_CREDENTIALS="<serviceaccount.json>"```

- Using the credentials associated with your user account:

  1. Make sure that the [GCP CLI](https://cloud.google.com/sdk/docs/install#linux) is installed and [initialized](https://cloud.google.com/sdk/docs/initializing) by running:

     ```gcloud init```

  2. Create local authentication credentials for your user account:

     ```gcloud auth application-default login```

## Openstack

@@ -32,6 +45,7 @@ After creating the service account you will need to enable the account using the
To run properly, the service principal requires the “Azure Active Directory Graph/Application.ReadWrite.OwnedBy” API permission granted and the “User Access Administrator” role.

Before running you will need to set the following:

1. ```export AZURE_SUBSCRIPTION_ID=<subscription_id>```

2. ```export AZURE_TENANT_ID=<tenant_id>```
@@ -66,9 +80,10 @@ Set the following environment variables

These are the credentials that you would normally use to access the vSphere client.


## IBMCloud
If no api key is set up with proper VPC resource permissions, use the following to create:

If no API key is set up with proper VPC resource permissions, use the following to create it:

* Access group
* Service id with the following access
  * With policy **VPC Infrastructure Services**
@@ -12,10 +12,6 @@ Config components:
# Kraken
This section defines the scenarios and data specific to the chaos run.

## Distribution
Either **openshift** or **kubernetes**, depending on the type of cluster you want to run chaos on.
The Prometheus URL/route and bearer token are obtained automatically in the case of OpenShift; please set them when the distribution is Kubernetes.

## Exit on failure
**exit_on_failure**: Exit when a post action check or cerberus run fails
docs/health_checks.md (new file, 59 lines)
@@ -0,0 +1,59 @@
### Health Checks

Health checks provide real-time visibility into the impact of chaos scenarios on application availability and performance. The health check configuration supports application endpoints reachable over HTTP/HTTPS, along with authentication mechanisms such as bearer tokens and username/password credentials.
Health checks are configured in the ```config.yaml```.

The system periodically checks the provided URLs based on the defined interval and records the results in telemetry. The telemetry data includes:

- A success response (```200```) when the application is running normally.
- A failure response (any status other than ```200```) if the application experiences downtime or errors.

This helps users quickly identify application health issues and take the necessary actions.

#### Sample health check config
```
health_checks:
  interval: <time_in_seconds> # Defines the frequency of health checks, default value is 2 seconds
  config: # List of application endpoints to check
    - url: "https://example.com/health"
      bearer_token: "hfjauljl..." # Bearer token for authentication, if any
      auth:
      exit_on_failure: True # If True, krkn exits when the health check for this application fails; values can be True/False
    - url: "https://another-service.com/status"
      bearer_token:
      auth: ("admin","secretpassword") # Authentication credentials (username, password) in tuple format, if any, e.g. ("admin","secretpassword")
      exit_on_failure: False
    - url: http://general-service.com
      bearer_token:
      auth:
      exit_on_failure:
```
#### Sample health check telemetry
```
"health_checks": [
    {
        "url": "https://example.com/health",
        "status": False,
        "status_code": 503,
        "start_timestamp": "2025-02-25 11:51:33",
        "end_timestamp": "2025-02-25 11:51:40",
        "duration": "0:00:07"
    },
    {
        "url": "https://another-service.com/status",
        "status": True,
        "status_code": 200,
        "start_timestamp": "2025-02-25 22:18:19",
        "end_timestamp": "2025-02-25 22:22:46",
        "duration": "0:04:27"
    },
    {
        "url": "http://general-service.com",
        "status": True,
        "status_code": 200,
        "start_timestamp": "2025-02-25 22:18:19",
        "end_timestamp": "2025-02-25 22:22:46",
        "duration": "0:04:27"
    }
],
```
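The polling behavior described above can be pictured with a small sketch; the helper below and its use of `requests` are illustrative, not krkn's actual implementation:

```python
import datetime
import time
import requests  # assumed available; used here only to illustrate the loop

def watch(url, interval=2.0, rounds=3, auth=None, token=None):
    """Poll a URL every `interval` seconds and summarize the result."""
    headers = {"Authorization": f"Bearer {token}"} if token else {}
    start = datetime.datetime.now().replace(microsecond=0)
    ok, last_code = True, None
    for _ in range(rounds):
        resp = requests.get(url, headers=headers, auth=auth, timeout=5)
        last_code = resp.status_code
        ok = ok and last_code == 200  # any non-200 marks the check as failed
        time.sleep(interval)
    end = datetime.datetime.now().replace(microsecond=0)
    return {"url": url, "status": ok, "status_code": last_code,
            "start_timestamp": str(start), "end_timestamp": str(end),
            "duration": str(end - start)}
```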
docs/hog_scenarios.md (new file, 49 lines)
@@ -0,0 +1,49 @@
### Hog Scenarios

Hog Scenarios are designed to push the limits of memory, CPU, or I/O on one or more nodes in your cluster.
They also serve to evaluate whether your cluster can withstand rogue pods that excessively consume resources
without any limits.

These scenarios involve deploying one or more workloads in the cluster. Based on the specific configuration,
these workloads will use a predetermined amount of resources for a specified duration. A sample scenario file is sketched after the option tables below.

#### Common options

| Option | Type | Description |
|---|---|---|
|`duration`| number | the duration of the stress test in seconds |
|`workers`| number (Optional) | the number of threads instantiated by stress-ng; if left empty, the number of workers will match the number of available cores on the node. |
|`hog-type`| string (Enum) | can be cpu, memory or io. |
|`image`| string | the container image of the stress workload |
|`namespace`| string | the namespace where the stress workload will be deployed |
|`node-selector`| string (Optional) | defines the node selector for choosing target nodes. If not specified, one schedulable node in the cluster will be chosen at random. If multiple nodes match the selector, all of them will be subjected to stress. If `number-of-nodes` is specified, that many nodes will be randomly selected from those identified by the selector. |
|`number-of-nodes`| number (Optional) | restricts the number of nodes selected by the selector |

#### `cpu-hog` options

| Option | Type | Description |
|---|---|---|
|`cpu-load-percentage`| number | the amount of CPU that will be consumed by the hog |
|`cpu-method`| string | reflects the CPU load strategy adopted by stress-ng; please refer to the stress-ng documentation for all the available options |

#### `io-hog` options

| Option | Type | Description |
|---|---|---|
| `io-block-size` | string | the block size written by the stressor |
| `io-write-bytes` | string | the total amount of data that will be written by the stressor. The size can be specified as % of free space on the file system or in units of Bytes, KBytes, MBytes and GBytes using the suffix b, k, m or g |
| `io-target-pod-folder` | string | the folder where the volume will be mounted in the pod |
| `io-target-pod-volume` | dictionary | the pod volume definition that will be stressed by the scenario. |

> [!CAUTION]
> Modifying the structure of `io-target-pod-volume` might alter how the hog operates, potentially rendering it ineffective.

#### `memory-hog` options

| Option | Type | Description |
|---|---|---|
|`memory-vm-bytes`| string | the amount of memory that the scenario will try to hog. The size can be specified as % of free space on the file system or in units of Bytes, KBytes, MBytes and GBytes using the suffix b, k, m or g |
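Putting the tables together, a memory-hog scenario file could look like the sketch below; the key names come from the tables above, while the image and values are purely illustrative:

```python
import yaml

# Sketch of a memory-hog scenario assembled from the documented options;
# the image reference and all values are illustrative, not defaults.
scenario = yaml.safe_load("""
duration: 60
workers: 2
hog-type: memory
image: quay.io/example/stress-ng  # illustrative image reference
namespace: default
node-selector: "kubernetes.io/hostname=worker-0"
memory-vm-bytes: 90%
""")
assert scenario["hog-type"] == "memory"
```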
@@ -88,7 +88,8 @@ We want to look at this in terms of CPU, Memory, Disk, Throughput, Network etc.
  - Appropriate caching and a Content Delivery Network should be enabled so the system remains performant and usable when there is latency on the client side.
  - Not every user or machine has access to unlimited bandwidth; there might be a delay on the user (client) side when accessing the APIs due to limited bandwidth, throttling or latency depending on the geographic location. It is important to inject latency between the client and the API calls to understand the behavior and optimize things, including caching wherever possible, using CDNs, or opting for protocols like HTTP/2 or HTTP/3 vs HTTP.

- Ensure Disruption Budgets are enabled for your critical applications
  - Protect your application during disruptions by setting a [pod disruption budget](https://kubernetes.io/docs/tasks/run-application/configure-pdb/) to avoid downtime. For instance, etcd, zookeeper or similar applications need at least 2 replicas to maintain quorum. This can be ensured by setting PDB maxUnavailable to 1.

### Tooling
@@ -18,7 +18,7 @@ network_chaos: # Scenario to create an outage
```

##### Sample scenario config for ingress traffic shaping (using a plugin)
'''
```
- id: network_chaos
  config:
    node_interface_name: # Dictionary with key as node name(s) and value as a list of its interfaces to test
@@ -35,7 +35,7 @@ network_chaos: # Scenario to create an outage
      bandwidth: 10mbit
    wait_duration: 120
    test_duration: 60
'''
```

Note: For ingress traffic shaping, ensure that your node doesn't have any [IFB](https://wiki.linuxfoundation.org/networking/ifb) interfaces already present. The scenario relies on creating IFBs to do the shaping, and they are deleted at the end of the scenario.
@@ -2,9 +2,9 @@

The following node chaos scenarios are supported:

1. **node_start_scenario**: Scenario to stop the node instance.
1. **node_start_scenario**: Scenario to start the node instance.
2. **node_stop_scenario**: Scenario to stop the node instance.
3. **node_stop_start_scenario**: Scenario to stop and then start the node instance. Not supported on VMware.
3. **node_stop_start_scenario**: Scenario to stop the node instance for the specified duration and then start it again. Not supported on VMware.
4. **node_termination_scenario**: Scenario to terminate the node instance.
5. **node_reboot_scenario**: Scenario to reboot the node instance.
6. **stop_kubelet_scenario**: Scenario to stop the kubelet of the node instance.
@@ -12,6 +12,7 @@ The following node chaos scenarios are supported:
8. **restart_kubelet_scenario**: Scenario to restart the kubelet of the node instance.
9. **node_crash_scenario**: Scenario to crash the node instance.
10. **stop_start_helper_node_scenario**: Scenario to stop and start the helper node and check service status.
11. **node_disk_detach_attach_scenario**: Scenario to detach the node disk for a specified duration.

**NOTE**: If the node does not recover from the node_crash_scenario injection, reboot the node to get it back to Ready state.
@@ -20,6 +21,8 @@ The following node chaos scenarios are supported:
, node_reboot_scenario and stop_start_kubelet_scenario are supported on AWS, Azure, OpenStack, BareMetal, GCP
, VMware and Alibaba.

**NOTE**: node_disk_detach_attach_scenario is supported only on AWS and cannot detach the root disk.
#### AWS

@@ -90,12 +93,7 @@ How to set up Alibaba cli to run node scenarios is defined [here](cloud_setup.md
#### VMware
How to set up VMware vSphere to run node scenarios is defined [here](cloud_setup.md#vmware)

This cloud type uses a different configuration style; see the actions below and the [example config file](../scenarios/openshift/vmware_node_scenarios.yml)

- vmware-node-terminate
- vmware-node-reboot
- vmware-node-stop
- vmware-node-start
See [example config file](../scenarios/openshift/vmware_node_scenarios.yml)
@@ -1,6 +1,9 @@
### Service Disruption Scenarios (Previously Delete Namespace Scenario)

Using this type of scenario configuration one is able to delete crucial objects in a specific namespace, or a namespace matching a certain regex string.
Using this type of scenario configuration one is able to delete crucial objects in a specific namespace, or a namespace matching a certain regex string. The goal of this scenario is to ensure that Pod Disruption Budgets with appropriate configurations are set, so that a minimum number of replicas is running at any given time and downtime is avoided.

**NOTE**: Protect your application during disruptions by setting a [pod disruption budget](https://kubernetes.io/docs/tasks/run-application/configure-pdb/) to avoid downtime. For instance, etcd, zookeeper or similar applications need at least 2 replicas to maintain quorum. This can be ensured by setting PDB maxUnavailable to 1.

Configuration Options:
@@ -29,9 +29,9 @@ def calculate_zscores(data):


def identify_outliers(data, threshold):
    outliers_cpu = data[data["CPU"] > threshold]["Service"].tolist()
    outliers_memory = data[data["Memory"] > threshold]["Service"].tolist()
    outliers_network = data[data["Network"] > threshold]["Service"].tolist()
    outliers_cpu = data[data["CPU"] > float(threshold)]["Service"].tolist()
    outliers_memory = data[data["Memory"] > float(threshold)]["Service"].tolist()
    outliers_network = data[data["Network"] > float(threshold)]["Service"].tolist()

    return outliers_cpu, outliers_memory, outliers_network
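For illustration, the patched function behaves as follows on a toy DataFrame; the columns match what `identify_outliers` expects, all values are made up, and the `float()` casts matter because thresholds often arrive as strings from configuration:

```python
import pandas as pd

# Illustrative frame with the columns identify_outliers reads.
df = pd.DataFrame({
    "Service": ["api", "etcd", "router"],
    "CPU": [1.2, 3.8, 0.4],
    "Memory": [0.9, 2.5, 3.1],
    "Network": [0.2, 0.1, 4.0],
})
# A string threshold would previously raise on comparison; float() fixes that.
cpu, mem, net = identify_outliers(df, "2.0")
print(cpu, mem, net)  # ['etcd'] ['etcd', 'router'] ['router']
```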
@@ -39,13 +39,13 @@ def identify_outliers(data, threshold):
def get_services_above_heatmap_threshold(dataframe, cpu_threshold, mem_threshold):
    # Filter the DataFrame based on CPU_HEATMAP and MEM_HEATMAP thresholds
    filtered_df = dataframe[
        ((dataframe["CPU"] / dataframe["CPU_LIMITS"]) > cpu_threshold)
        ((dataframe["CPU"] / dataframe["CPU_LIMITS"]) > float(cpu_threshold))
    ]
    # Get the lists of services
    cpu_services = filtered_df["service"].tolist()

    filtered_df = dataframe[
        ((dataframe["MEM"] / dataframe["MEM_LIMITS"]) > mem_threshold)
        ((dataframe["MEM"] / dataframe["MEM_LIMITS"]) > float(mem_threshold))
    ]
    mem_services = filtered_df["service"].tolist()
@@ -2,10 +2,11 @@ from __future__ import annotations

import datetime
import os.path
import math
from typing import Optional, List, Dict, Any

import urllib3
import logging
import urllib3
import sys

import yaml
@@ -25,8 +26,7 @@ def alerts(
    start_time,
    end_time,
    alert_profile,
    elastic_collect_alerts,
    elastic_alerts_index,
    elastic_alerts_index
):

    if alert_profile is None or os.path.exists(alert_profile) is False:
@@ -46,6 +46,7 @@ def alerts(
    for alert in profile_yaml:
        if list(alert.keys()).sort() != ["expr", "description", "severity"].sort():
            logging.error(f"wrong alert {alert}, skipping")
            continue

        processed_alert = prom_cli.process_alert(
            alert,
@@ -56,7 +57,6 @@ def alerts(
            processed_alert[0]
            and processed_alert[1]
            and elastic
            and elastic_collect_alerts
        ):
            elastic_alert = ElasticAlert(
                run_uuid=run_uuid,
@@ -156,15 +156,15 @@ def metrics(
    start_time,
    end_time,
    metrics_profile,
    elastic_collect_metrics,
    elastic_metrics_index,
    elastic_metrics_index
) -> list[dict[str, list[(int, float)] | str]]:
    metrics_list: list[dict[str, list[(int, float)] | str]] = []

    if metrics_profile is None or os.path.exists(metrics_profile) is False:
        logging.error(f"{metrics_profile} alert profile does not exist")
        sys.exit(1)
    with open(metrics_profile) as profile:
        profile_yaml = yaml.safe_load(profile)

    if not profile_yaml["metrics"] or not isinstance(profile_yaml["metrics"], list):
        logging.error(
            f"{metrics_profile} wrong file format, alert profile must be "
@@ -172,30 +172,58 @@ def metrics(
            f"expr, description, severity"
        )
        sys.exit(1)

    elapsed_ceil = math.ceil((end_time - start_time) / 60)
    elapsed_time = str(elapsed_ceil) + "m"
    metrics_list: list[dict[str, int | float | str]] = []
    for metric_query in profile_yaml["metrics"]:
        if (
        query = metric_query['query']

        # calculate elapsed time
        if ".elapsed" in metric_query["query"]:
            query = metric_query['query'].replace(".elapsed", elapsed_time)
        if "instant" in list(metric_query.keys()) and metric_query['instant']:
            metrics_result = prom_cli.process_query(
                query
            )
        elif (
            list(metric_query.keys()).sort()
            != ["query", "metricName", "instant"].sort()
            == ["query", "metricName"].sort()
        ):
            logging.error(f"wrong alert {metric_query}, skipping")
            metrics_result = prom_cli.process_prom_query_in_range(
                metric_query["query"],
                start_time=datetime.datetime.fromtimestamp(start_time),
                end_time=datetime.datetime.fromtimestamp(end_time),
            )

            metric = {"name": metric_query["metricName"], "values": []}
            metrics_result = prom_cli.process_prom_query_in_range(
                query,
                start_time=datetime.datetime.fromtimestamp(start_time),
                end_time=datetime.datetime.fromtimestamp(end_time), granularity=30
            )
        else:
            logging.info('didnt match keys')
            continue

        for returned_metric in metrics_result:
            if "values" in returned_metric:
            metric = {"query": query, "metricName": metric_query['metricName']}
            for k,v in returned_metric['metric'].items():
                metric[k] = v

            if "values" in returned_metric:
                for value in returned_metric["values"]:
                    try:
                        metric["values"].append((value[0], float(value[1])))
                        metric['timestamp'] = str(datetime.datetime.fromtimestamp(value[0]))
                        metric["value"] = float(value[1])
                        # want double array of the known details and the metrics specific to each call
                        metrics_list.append(metric.copy())
                    except ValueError:
                        pass
                metrics_list.append(metric)
            elif "value" in returned_metric:
                try:
                    value = returned_metric["value"]
                    metric['timestamp'] = str(datetime.datetime.fromtimestamp(value[0]))
                    metric["value"] = float(value[1])

                    if elastic_collect_metrics and elastic:
                    # want double array of the known details and the metrics specific to each call
                        metrics_list.append(metric.copy())
                except ValueError:
                    pass

    if elastic:
        result = elastic.upload_metrics_to_elasticsearch(
            run_uuid=run_uuid, index=elastic_metrics_index, raw_data=metrics_list
        )
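The `.elapsed` handling introduced above boils down to a string substitution that turns the profile's placeholder subquery windows into concrete PromQL ranges. A standalone sketch with illustrative timestamps:

```python
import math

# Illustrative epoch seconds for a run that lasted 605 seconds.
start_time, end_time = 1_700_000_000, 1_700_000_605
elapsed = f"{math.ceil((end_time - start_time) / 60)}m"  # -> "11m"

query = "avg(avg_over_time(cluster:node_cpu:ratio[.elapsed:]))"
print(query.replace(".elapsed", elapsed))
# -> avg(avg_over_time(cluster:node_cpu:ratio[11m:]))
```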
@@ -56,6 +56,7 @@ class AbstractScenarioPlugin(ABC):
        scenario_telemetries: list[ScenarioTelemetry] = []
        failed_scenarios = []
        wait_duration = krkn_config["tunings"]["wait_duration"]
        events_backup = krkn_config["telemetry"]["events_backup"]
        for scenario_config in scenarios_list:
            if isinstance(scenario_config, list):
                logging.error(
@@ -67,6 +68,7 @@ class AbstractScenarioPlugin(ABC):

            scenario_telemetry = ScenarioTelemetry()
            scenario_telemetry.scenario = scenario_config
            scenario_telemetry.scenario_type = self.get_scenario_types()[0]
            scenario_telemetry.start_timestamp = time.time()
            parsed_scenario_config = telemetry.set_parameters_base64(
                scenario_telemetry, scenario_config
@@ -99,13 +101,15 @@ class AbstractScenarioPlugin(ABC):
                int(scenario_telemetry.start_timestamp),
                int(scenario_telemetry.end_timestamp),
            )
            utils.populate_cluster_events(
                scenario_telemetry,
                parsed_scenario_config,
                telemetry.get_lib_kubernetes(),
                int(scenario_telemetry.start_timestamp),
                int(scenario_telemetry.end_timestamp),
            )

            if events_backup:
                utils.populate_cluster_events(
                    krkn_config,
                    parsed_scenario_config,
                    telemetry.get_lib_kubernetes(),
                    int(scenario_telemetry.start_timestamp),
                    int(scenario_telemetry.end_timestamp),
                )

            if scenario_telemetry.exit_status != 0:
                failed_scenarios.append(scenario_config)
@@ -3,7 +3,7 @@ import time
import yaml
from krkn_lib.models.telemetry import ScenarioTelemetry
from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift
from krkn_lib.utils import get_yaml_item_value
from krkn_lib.utils import get_yaml_item_value, get_random_string
from jinja2 import Template
from krkn import cerberus
from krkn.scenario_plugins.abstract_scenario_plugin import AbstractScenarioPlugin
@@ -33,17 +33,22 @@ class ApplicationOutageScenarioPlugin(AbstractScenarioPlugin):
            duration = get_yaml_item_value(scenario_config, "duration", 60)

            start_time = int(time.time())
            policy_name = f"krkn-deny-{get_random_string(5)}"

            network_policy_template = """---
            network_policy_template = (
                """---
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
  name: kraken-deny
  name: """
                + policy_name
                + """
spec:
  podSelector:
    matchLabels: {{ pod_selector }}
  policyTypes: {{ traffic_type }}
"""
            )
            t = Template(network_policy_template)
            rendered_spec = t.render(
                pod_selector=pod_selector, traffic_type=traffic_type
@@ -65,7 +70,7 @@ class ApplicationOutageScenarioPlugin(AbstractScenarioPlugin):
            # unblock the traffic by deleting the network policy
            logging.info("Deleting the network policy")
            lib_telemetry.get_lib_kubernetes().delete_net_policy(
                "kraken-deny", namespace
                policy_name, namespace
            )

            logging.info(
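To see what the plugin ends up applying, the same kind of Jinja template can be rendered standalone; the policy name, selector and traffic type below are illustrative inputs, and the randomized suffix is what keeps concurrent scenario runs from colliding on the policy name:

```python
from jinja2 import Template

template = Template("""---
apiVersion: networking.k8s.io/v1
kind: NetworkPolicy
metadata:
  name: {{ name }}
spec:
  podSelector:
    matchLabels: {{ pod_selector }}
  policyTypes: {{ traffic_type }}
""")
# Jinja renders the dict/list via their Python repr, which is valid YAML flow syntax.
print(template.render(name="krkn-deny-a1b2c",
                      pod_selector={"app": "web"},
                      traffic_type=["Ingress"]))
```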
@@ -1,197 +0,0 @@
import logging
import os
from pathlib import Path

import arcaflow
import yaml
from krkn_lib.models.telemetry import ScenarioTelemetry
from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift

from krkn.scenario_plugins.abstract_scenario_plugin import AbstractScenarioPlugin
from krkn.scenario_plugins.arcaflow.context_auth import ContextAuth


class ArcaflowScenarioPlugin(AbstractScenarioPlugin):

    def run(
        self,
        run_uuid: str,
        scenario: str,
        krkn_config: dict[str, any],
        lib_telemetry: KrknTelemetryOpenshift,
        scenario_telemetry: ScenarioTelemetry,
    ) -> int:
        try:
            engine_args = self.build_args(scenario)
            status_code = self.run_workflow(
                engine_args, lib_telemetry.get_lib_kubernetes().get_kubeconfig_path()
            )
            return status_code
        except Exception as e:
            logging.error("ArcaflowScenarioPlugin exiting due to Exception %s" % e)
            return 1

    def get_scenario_types(self) -> [str]:
        return ["hog_scenarios", "arcaflow_scenario"]

    def run_workflow(
        self, engine_args: arcaflow.EngineArgs, kubeconfig_path: str
    ) -> int:
        self.set_arca_kubeconfig(engine_args, kubeconfig_path)
        exit_status = arcaflow.run(engine_args)
        return exit_status

    def build_args(self, input_file: str) -> arcaflow.EngineArgs:
        """sets the kubeconfig parsed by setArcaKubeConfig as an input to the arcaflow workflow"""
        current_path = Path().resolve()
        context = f"{current_path}/{Path(input_file).parent}"
        workflow = f"{context}/workflow.yaml"
        config = f"{context}/config.yaml"
        if not os.path.exists(context):
            raise Exception(
                "context folder for arcaflow workflow not found: {}".format(context)
            )
        if not os.path.exists(input_file):
            raise Exception(
                "input file for arcaflow workflow not found: {}".format(input_file)
            )
        if not os.path.exists(workflow):
            raise Exception(
                "workflow file for arcaflow workflow not found: {}".format(workflow)
            )
        if not os.path.exists(config):
            raise Exception(
                "configuration file for arcaflow workflow not found: {}".format(config)
            )

        engine_args = arcaflow.EngineArgs()
        engine_args.context = context
        engine_args.config = config
        engine_args.workflow = workflow
        engine_args.input = f"{current_path}/{input_file}"
        return engine_args

    def set_arca_kubeconfig(
        self, engine_args: arcaflow.EngineArgs, kubeconfig_path: str
    ):

        context_auth = ContextAuth()
        if not os.path.exists(kubeconfig_path):
            raise Exception("kubeconfig not found in {}".format(kubeconfig_path))

        with open(kubeconfig_path, "r") as stream:
            try:
                kubeconfig = yaml.safe_load(stream)
                context_auth.fetch_auth_data(kubeconfig)
            except Exception as e:
                logging.error(
                    "impossible to read kubeconfig file in: {}".format(kubeconfig_path)
                )
                raise e

        kubeconfig_str = self.set_kubeconfig_auth(kubeconfig, context_auth)

        with open(engine_args.input, "r") as stream:
            input_file = yaml.safe_load(stream)
            if "input_list" in input_file and isinstance(
                input_file["input_list"], list
            ):
                for index, _ in enumerate(input_file["input_list"]):
                    if isinstance(input_file["input_list"][index], dict):
                        input_file["input_list"][index]["kubeconfig"] = kubeconfig_str
            else:
                input_file["kubeconfig"] = kubeconfig_str
            stream.close()
        with open(engine_args.input, "w") as stream:
            yaml.safe_dump(input_file, stream)

        with open(engine_args.config, "r") as stream:
            config_file = yaml.safe_load(stream)
            if config_file["deployers"]["image"]["deployer_name"] == "kubernetes":
                kube_connection = self.set_kubernetes_deployer_auth(
                    config_file["deployers"]["image"]["connection"], context_auth
                )
                config_file["deployers"]["image"]["connection"] = kube_connection
        with open(engine_args.config, "w") as stream:
            yaml.safe_dump(config_file, stream, explicit_start=True, width=4096)

    def set_kubernetes_deployer_auth(
        self, deployer: any, context_auth: ContextAuth
    ) -> any:
        if context_auth.clusterHost is not None:
            deployer["host"] = context_auth.clusterHost
        if context_auth.clientCertificateData is not None:
            deployer["cert"] = context_auth.clientCertificateData
        if context_auth.clientKeyData is not None:
            deployer["key"] = context_auth.clientKeyData
        if context_auth.clusterCertificateData is not None:
            deployer["cacert"] = context_auth.clusterCertificateData
        if context_auth.username is not None:
            deployer["username"] = context_auth.username
        if context_auth.password is not None:
            deployer["password"] = context_auth.password
        if context_auth.bearerToken is not None:
            deployer["bearerToken"] = context_auth.bearerToken
        return deployer

    def set_kubeconfig_auth(self, kubeconfig: any, context_auth: ContextAuth) -> str:
        """
        Builds an arcaflow-compatible kubeconfig representation and returns it as a string.
        In order to run arcaflow plugins in kubernetes/openshift the kubeconfig must contain client certificate/key
        and server certificate base64 encoded within the kubeconfig file itself in *-data fields. That is not always
        the case; in fact, the kubeconfig may contain filesystem paths to those files. This function builds an
        arcaflow-compatible kubeconfig and returns it as a string that can be safely included in input.yaml
        """

        if "current-context" not in kubeconfig.keys():
            raise Exception(
                "invalid kubeconfig file, impossible to determine current-context"
            )
        user_id = None
        cluster_id = None
        user_name = None
        cluster_name = None
        current_context = kubeconfig["current-context"]
        for context in kubeconfig["contexts"]:
            if context["name"] == current_context:
                user_name = context["context"]["user"]
                cluster_name = context["context"]["cluster"]
        if user_name is None:
            raise Exception(
                "user not set for context {} in kubeconfig file".format(current_context)
            )
        if cluster_name is None:
            raise Exception(
                "cluster not set for context {} in kubeconfig file".format(
                    current_context
                )
            )

        for index, user in enumerate(kubeconfig["users"]):
            if user["name"] == user_name:
                user_id = index
        for index, cluster in enumerate(kubeconfig["clusters"]):
            if cluster["name"] == cluster_name:
                cluster_id = index

        if cluster_id is None:
            raise Exception(
                "no cluster {} found in kubeconfig clusters".format(cluster_name)
            )
        if "client-certificate" in kubeconfig["users"][user_id]["user"]:
            kubeconfig["users"][user_id]["user"][
                "client-certificate-data"
            ] = context_auth.clientCertificateDataBase64
            del kubeconfig["users"][user_id]["user"]["client-certificate"]

        if "client-key" in kubeconfig["users"][user_id]["user"]:
            kubeconfig["users"][user_id]["user"][
                "client-key-data"
            ] = context_auth.clientKeyDataBase64
            del kubeconfig["users"][user_id]["user"]["client-key"]

        if "certificate-authority" in kubeconfig["clusters"][cluster_id]["cluster"]:
            kubeconfig["clusters"][cluster_id]["cluster"][
                "certificate-authority-data"
            ] = context_auth.clusterCertificateDataBase64
            del kubeconfig["clusters"][cluster_id]["cluster"]["certificate-authority"]
        kubeconfig_str = yaml.dump(kubeconfig)
        return kubeconfig_str
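The set_kubeconfig_auth docstring above describes converting file-path credentials into inline *-data fields. The same idea in isolation, as a small sketch with a hypothetical kubeconfig layout:

# Sketch of inlining a client certificate file into the kubeconfig's
# "client-certificate-data" field; the path and layout are hypothetical.
import base64

def inline_client_certificate(kubeconfig: dict, user_index: int) -> None:
    user = kubeconfig["users"][user_index]["user"]
    if "client-certificate" in user:
        with open(user["client-certificate"], "rb") as f:
            pem_bytes = f.read()
        # kubeconfig *-data fields hold the base64 of the raw PEM bytes
        user["client-certificate-data"] = base64.b64encode(pem_bytes).decode("ascii")
        del user["client-certificate"]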
@@ -1,142 +0,0 @@
import os
import base64


class ContextAuth:
    clusterCertificate: str = None
    clusterCertificateData: str = None
    clusterHost: str = None
    clientCertificate: str = None
    clientCertificateData: str = None
    clientKey: str = None
    clientKeyData: str = None
    clusterName: str = None
    username: str = None
    password: str = None
    bearerToken: str = None
    # TODO: integrate in krkn-lib-kubernetes in the next iteration

    @property
    def clusterCertificateDataBase64(self):
        if self.clusterCertificateData is not None:
            return base64.b64encode(bytes(self.clusterCertificateData, "utf8")).decode(
                "ascii"
            )
        return

    @property
    def clientCertificateDataBase64(self):
        if self.clientCertificateData is not None:
            return base64.b64encode(bytes(self.clientCertificateData, "utf8")).decode(
                "ascii"
            )
        return

    @property
    def clientKeyDataBase64(self):
        if self.clientKeyData is not None:
            return base64.b64encode(bytes(self.clientKeyData, "utf-8")).decode("ascii")
        return

    def fetch_auth_data(self, kubeconfig: any):
        context_username = None
        current_context = kubeconfig["current-context"]
        if current_context is None:
            raise Exception("no current-context found in kubeconfig")

        for context in kubeconfig["contexts"]:
            if context["name"] == current_context:
                context_username = context["context"]["user"]
                self.clusterName = context["context"]["cluster"]
        if context_username is None:
            raise Exception("user not found for context {0}".format(current_context))
        if self.clusterName is None:
            raise Exception("cluster not found for context {0}".format(current_context))
        cluster_id = None
        user_id = None
        for index, user in enumerate(kubeconfig["users"]):
            if user["name"] == context_username:
                user_id = index
        if user_id is None:
            raise Exception(
                "user {0} not found in kubeconfig users".format(context_username)
            )

        for index, cluster in enumerate(kubeconfig["clusters"]):
            if cluster["name"] == self.clusterName:
                cluster_id = index

        if cluster_id is None:
            raise Exception(
                "no cluster {} found in kubeconfig clusters".format(self.clusterName)
            )

        user = kubeconfig["users"][user_id]["user"]
        cluster = kubeconfig["clusters"][cluster_id]["cluster"]
        # sets cluster api URL
        self.clusterHost = cluster["server"]
        # client certificates

        if "client-key" in user:
            try:
                self.clientKey = user["client-key"]
                self.clientKeyData = self.read_file(user["client-key"])
            except Exception as e:
                raise e

        if "client-key-data" in user:
            try:
                self.clientKeyData = base64.b64decode(user["client-key-data"]).decode(
                    "utf-8"
                )
            except Exception as e:
                raise Exception("impossible to decode client-key-data")

        if "client-certificate" in user:
            try:
                self.clientCertificate = user["client-certificate"]
                self.clientCertificateData = self.read_file(user["client-certificate"])
            except Exception as e:
                raise e

        if "client-certificate-data" in user:
            try:
                self.clientCertificateData = base64.b64decode(
                    user["client-certificate-data"]
                ).decode("utf-8")
            except Exception as e:
                raise Exception("impossible to decode client-certificate-data")

        # cluster certificate authority

        if "certificate-authority" in cluster:
            try:
                self.clusterCertificate = cluster["certificate-authority"]
                self.clusterCertificateData = self.read_file(
                    cluster["certificate-authority"]
                )
            except Exception as e:
                raise e

        if "certificate-authority-data" in cluster:
            try:
                self.clusterCertificateData = base64.b64decode(
                    cluster["certificate-authority-data"]
                ).decode("utf-8")
            except Exception as e:
                raise Exception("impossible to decode certificate-authority-data")

        if "username" in user:
            self.username = user["username"]

        if "password" in user:
            self.password = user["password"]

        if "token" in user:
            self.bearerToken = user["token"]

    def read_file(self, filename: str) -> str:
        if not os.path.exists(filename):
            raise Exception("file not found {0} ".format(filename))
        with open(filename, "rb") as file_stream:
            return file_stream.read().decode("utf-8")
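A short usage sketch of ContextAuth against an in-memory kubeconfig; the dictionary below is a hypothetical minimal config exercising only the token path:

# Hypothetical minimal kubeconfig exercising ContextAuth's token path.
kubeconfig = {
    "current-context": "default",
    "contexts": [{"name": "default", "context": {"user": "u", "cluster": "c"}}],
    "users": [{"name": "u", "user": {"token": "sha256~example"}}],
    "clusters": [{"name": "c", "cluster": {"server": "https://127.0.0.1:6443"}}],
}
auth = ContextAuth()
auth.fetch_auth_data(kubeconfig)
print(auth.clusterHost)   # https://127.0.0.1:6443
print(auth.bearerToken)   # sha256~example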
@@ -1,19 +0,0 @@
-----BEGIN CERTIFICATE-----
MIIDBjCCAe6gAwIBAgIBATANBgkqhkiG9w0BAQsFADAVMRMwEQYDVQQDEwptaW5p
a3ViZUNBMB4XDTIzMDMxMzE1NDAxM1oXDTMzMDMxMTE1NDAxM1owFTETMBEGA1UE
AxMKbWluaWt1YmVDQTCCASIwDQYJKoZIhvcNAQEBBQADggEPADCCAQoCggEBAMnz
U/gIbJBRGOgNYVKX2fV03ANOwnM4VjquR28QMAdxURqgOFZ6IxYNysHEyxxE9I+I
DAm9hi4vQPbOX7FlxUezuzw+ExEfa6RRJ+n+AGJOV1lezCVph6OaJxB1+L1UqaDZ
eM3B4cUf/iCc5Y4bs927+CBG3MJL/jmCVPCO+MiSn/l73PXSFNJAYMvRj42zkXqD
CVG9CwY2vWgZnnzl01l7jNGtie871AmV2uqKakJrQ2ILhD+8fZk4jE5JBDTCZnqQ
pXIc+vERNKLUS8cvjO6Ux8dMv/Z7+xonpXOU59LlpUdHWP9jgCvMTwiOriwqGjJ+
pQJWpX9Dm+oxJiVOJzsCAwEAAaNhMF8wDgYDVR0PAQH/BAQDAgKkMB0GA1UdJQQW
MBQGCCsGAQUFBwMCBggrBgEFBQcDATAPBgNVHRMBAf8EBTADAQH/MB0GA1UdDgQW
BBQU9pDMtbayJdNM6bp0IG8dcs15qTANBgkqhkiG9w0BAQsFAAOCAQEAtl9TVKPA
hTnPODqv0AGTqreS9kLg4WUUjZRaPUkPWmtCoTh2Yf55nRWdHOHeZnCWDSg24x42
lpt+13IdqKew1RKTpKCTkicMFi090A01bYu/w39Cm6nOAA5h8zkgSkV5czvQotuV
SoN2vB+nbuY28ah5PkdqjMHEZbNwa59cgEke8wB1R1DWFQ/pqflrH2v9ACAuY+5Q
i673tA6CXrb1YfaCQnVBzcfvjGS1MqShPKpOLMF+/GccPczNimaBxMnKvYLvf3pN
qEUrJC00mAcein8HmxR2Xz8wredbMUUyrQxW29pZJwfGE5GU0olnlsA0lZLbTwio
xoolo5y+fsK/dA==
-----END CERTIFICATE-----
@@ -1,19 +0,0 @@
-----BEGIN CERTIFICATE-----
MIIDITCCAgmgAwIBAgIBAjANBgkqhkiG9w0BAQsFADAVMRMwEQYDVQQDEwptaW5p
a3ViZUNBMB4XDTIzMDUwMTA4NTc0N1oXDTI2MDUwMTA4NTc0N1owMTEXMBUGA1UE
ChMOc3lzdGVtOm1hc3RlcnMxFjAUBgNVBAMTDW1pbmlrdWJlLXVzZXIwggEiMA0G
CSqGSIb3DQEBAQUAA4IBDwAwggEKAoIBAQC0b7uy9nQYrh7uC5NODve7dFNLAgo5
pWRS6Kx13ULA55gOpieZiI5/1jwUBjOz0Hhl5QAdHC1HDNu5wf4MmwIEheuq3kMA
mfuvNxW2BnWSDuXyUMlBfqlwg5o6W8ndEWaK33D7wd2WQsSsAnhQPJSjnzWKvWKq
+Kbcygc4hdss/ZWN+SXLTahNpHBw0sw8AcJqddNeXs2WI5GdZmbXL4QZI36EaNUm
m4xKmKRKYIP9wYkmXOV/D2h1meM44y4lul5v2qvo6I+umJ84q4W1/W1vVmAzyVfL
v1TQCUx8cpKMHzw3ma6CTBCtU3Oq9HKHBnf8GyHZicmV7ESzf/phJu4ZAgMBAAGj
YDBeMA4GA1UdDwEB/wQEAwIFoDAdBgNVHSUEFjAUBggrBgEFBQcDAQYIKwYBBQUH
AwIwDAYDVR0TAQH/BAIwADAfBgNVHSMEGDAWgBQU9pDMtbayJdNM6bp0IG8dcs15
qTANBgkqhkiG9w0BAQsFAAOCAQEABNzEQQMYUcLsBASHladEjr46avKn7gREfaDl
Y5PBvgCPP42q/sW/9iCNY3UpT9TJZWM6s01+0p6I96jYbRQER1NX7O4OgQYHmFw2
PF6UOG2vMo54w11OvL7sbr4d+nkE6ItdM9fLDIJ3fEOYJZkSoxhOL/U3jSjIl7Wu
KCIlpM/M/gcZ4w2IvcLrWtvswbFNUd+dwQfBGcQTmSQDOLE7MqSvzYAkeNv73GLB
ieba7gs/PmoTFsf9nW60iXymDDF4MtODn15kqT/y1uD6coujmiEiIomBfxqAkUCU
0ciP/KF5oOEMmMedm7/peQxaRTMdRSk4yu7vbj/BxnTcj039Qg==
-----END CERTIFICATE-----
@@ -1,27 +0,0 @@
-----BEGIN RSA PRIVATE KEY-----
MIIEowIBAAKCAQEAtG+7svZ0GK4e7guTTg73u3RTSwIKOaVkUuisdd1CwOeYDqYn
mYiOf9Y8FAYzs9B4ZeUAHRwtRwzbucH+DJsCBIXrqt5DAJn7rzcVtgZ1kg7l8lDJ
QX6pcIOaOlvJ3RFmit9w+8HdlkLErAJ4UDyUo581ir1iqvim3MoHOIXbLP2Vjfkl
y02oTaRwcNLMPAHCanXTXl7NliORnWZm1y+EGSN+hGjVJpuMSpikSmCD/cGJJlzl
fw9odZnjOOMuJbpeb9qr6OiPrpifOKuFtf1tb1ZgM8lXy79U0AlMfHKSjB88N5mu
gkwQrVNzqvRyhwZ3/Bsh2YnJlexEs3/6YSbuGQIDAQABAoIBAQCdJxPb8zt6o2zc
98f8nJy378D7+3LccmjGrVBH98ZELXIKkDy9RGqYfQcmiaBOZKv4U1OeBwSIdXKK
f6O9ZuSC/AEeeSbyRysmmFuYhlewNrmgKyyelqsNDBIv8fIHUTh2i9Xj8B4G2XBi
QGR5vcnYGLqRdBGTx63Nb0iKuksDCwPAuPA/e0ySz9HdWL1j4bqpVSYsOIXsqTDr
CVnxUeSIL0fFQnRm3IASXQD7zdq9eEFX7vESeleZoz8qNcKb4Na/C3N6crScjgH7
qyNZ2zNLfy1LT84k8uc1TMX2KcEVEmfdDv5cCnUH2ic12CwXMZ0vgId5LJTaHx4x
ytIQIe5hAoGBANB+TsRXP4KzcjZlUUfiAp/pWUM4kVktbsfZa1R2NEuIGJUxPk3P
7WS0WX5W75QKRg+UWTubg5kfd0f9fklLgofmliBnY/HrpgdyugJmUZBgzIxmy0k+
aCe0biD1gULfyyrKtfe8k5wRFstzhfGszlOf2ebR87sSVNBuF2lEwPTvAoGBAN2M
0/XrsodGU4B9Mj86Go2gb2k2WU2izI0cO+tm2S5U5DvKmVEnmjXfPRaOFj2UUQjo
cljnDAinbN+O0+Inc35qsEeYdAIepNAPglzcpfTHagja9mhx2idLYTXGhbZLL+Ei
TRzMyP27NF+GVVfYU/cA86ns6NboG6spohmnqh13AoGAKPc4aNGv0/GIVnHP56zb
0SnbdR7PSFNp+fCZay4Slmi2U9IqKMXbIjdhgjZ4uoDORU9jvReQYuzQ1h9TyfkB
O8yt4M4P0D/6DmqXa9NI4XJznn6wIMMXWf3UybsTW913IQBVgsjVxAuDjBQ11Eec
/sdg3D6SgkZWzeFjzjZJJ5cCgYBSYVg7fE3hERxhjawOaJuRCBQFSklAngVzfwkk
yhR9ruFC/l2uGIy19XFwnprUgP700gIa3qbR3PeV1TUiRcsjOaacqKqSUzSzjODL
iNxIvZHHAyxWv+b/b38REOWNWD3QeAG2cMtX1bFux7OaO31VPkxcZhRaPOp05cE5
yudtlwKBgDBbR7RLYn03OPm3NDBLLjTybhD8Iu8Oj7UeNCiEWAdZpqIKYnwSxMzQ
kdo4aTENA/seEwq+XDV7TwbUIFFJg5gDXIhkcK2c9kiO2bObCAmKpBlQCcrp0a5X
NSBk1N/ZG/Qhqns7z8k01KN4LNcdpRoNiYYPgY+p3xbY8+nWhv+q
-----END RSA PRIVATE KEY-----
@@ -1,98 +0,0 @@
import os
import unittest

import yaml

from .context_auth import ContextAuth


class TestCurrentContext(unittest.TestCase):

    def get_kubeconfig_with_data(self) -> str:
        """
        This function returns a test kubeconfig file as a string.

        :return: a test kubeconfig file in string format (for unit testing purposes)
        """  # NOQA
        return """apiVersion: v1
clusters:
- cluster:
    certificate-authority-data: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUM5ekNDQWQrZ0F3SUJBZ0lVV01PTVBNMVUrRi9uNXN6TSthYzlMcGZISHB3d0RRWUpLb1pJaHZjTkFRRUwKQlFBd0hqRWNNQm9HQTFVRUF3d1RhM1ZpZFc1MGRTNXNiMk5oYkdSdmJXRnBiakFlRncweU1URXlNRFl4T0RBdwpNRFJhRncwek1URXlNRFF4T0RBd01EUmFNQjR4SERBYUJnTlZCQU1NRTJ0MVluVnVkSFV1Ykc5allXeGtiMjFoCmFXNHdnZ0VpTUEwR0NTcUdTSWIzRFFFQkFRVUFBNElCRHdBd2dnRUtBb0lCQVFDNExhcG00SDB0T1NuYTNXVisKdzI4a0tOWWRwaHhYOUtvNjUwVGlOK2c5ZFNQU3VZK0V6T1JVOWVONlgyWUZkMEJmVFNodno4Y25rclAvNysxegpETEoxQ3MwRi9haEV3ZDQxQXN5UGFjbnRiVE80dGRLWm9POUdyODR3YVdBN1hSZmtEc2ZxRGN1YW5UTmVmT1hpCkdGbmdDVzU5Q285M056alB1eEFrakJxdVF6eE5GQkgwRlJPbXJtVFJ4cnVLZXo0aFFuUW1OWEFUNnp0M21udzMKWUtWTzU4b2xlcUxUcjVHNlRtVFQyYTZpVGdtdWY2N0cvaVZlalJGbkw3YkNHWmgzSjlCSTNMcVpqRzE4dWxvbgpaVDdQcGQrQTlnaTJOTm9UZlI2TVB5SndxU1BCL0xZQU5ZNGRoZDVJYlVydDZzbmViTlRZSHV2T0tZTDdNTWRMCmVMSzFBZ01CQUFHakxUQXJNQWtHQTFVZEV3UUNNQUF3SGdZRFZSMFJCQmN3RllJVGEzVmlkVzUwZFM1c2IyTmgKYkdSdmJXRnBiakFOQmdrcWhraUc5dzBCQVFzRkFBT0NBUUVBQTVqUHVpZVlnMExySE1PSkxYY0N4d3EvVzBDNApZeFpncVd3VHF5VHNCZjVKdDlhYTk0SkZTc2dHQWdzUTN3NnA2SlBtL0MyR05MY3U4ZWxjV0E4UXViQWxueXRRCnF1cEh5WnYrZ08wMG83TXdrejZrTUxqQVZ0QllkRzJnZ21FRjViTEk5czBKSEhjUGpHUkl1VHV0Z0tHV1dPWHgKSEg4T0RzaG9wZHRXMktrR2c2aThKaEpYaWVIbzkzTHptM00xRUNGcXAvMEdtNkN1RFphVVA2SGpJMWRrYllLdgpsSHNVZ1U1SmZjSWhNYmJLdUllTzRkc1YvT3FHcm9iNW5vcmRjaExBQmRDTnc1cmU5T1NXZGZ1VVhSK0ViZVhrCjVFM0tFYzA1RGNjcGV2a1NTdlJ4SVQrQzNMOTltWGcxL3B5NEw3VUhvNFFLTXlqWXJXTWlLRlVKV1E9PQotLS0tLUVORCBDRVJUSUZJQ0FURS0tLS0tCg==
    server: https://127.0.0.1:6443
  name: default
contexts:
- context:
    cluster: default
    namespace: default
    user: testuser
  name: default
current-context: default
kind: Config
preferences: {}
users:
- name: testuser
  user:
    client-certificate-data: LS0tLS1CRUdJTiBDRVJUSUZJQ0FURS0tLS0tCk1JSUM5ekNDQWQrZ0F3SUJBZ0lVV01PTVBNMVUrRi9uNXN6TSthYzlMcGZISHB3d0RRWUpLb1pJaHZjTkFRRUwKQlFBd0hqRWNNQm9HQTFVRUF3d1RhM1ZpZFc1MGRTNXNiMk5oYkdSdmJXRnBiakFlRncweU1URXlNRFl4T0RBdwpNRFJhRncwek1URXlNRFF4T0RBd01EUmFNQjR4SERBYUJnTlZCQU1NRTJ0MVluVnVkSFV1Ykc5allXeGtiMjFoCmFXNHdnZ0VpTUEwR0NTcUdTSWIzRFFFQkFRVUFBNElCRHdBd2dnRUtBb0lCQVFDNExhcG00SDB0T1NuYTNXVisKdzI4a0tOWWRwaHhYOUtvNjUwVGlOK2c5ZFNQU3VZK0V6T1JVOWVONlgyWUZkMEJmVFNodno4Y25rclAvNysxegpETEoxQ3MwRi9haEV3ZDQxQXN5UGFjbnRiVE80dGRLWm9POUdyODR3YVdBN1hSZmtEc2ZxRGN1YW5UTmVmT1hpCkdGbmdDVzU5Q285M056alB1eEFrakJxdVF6eE5GQkgwRlJPbXJtVFJ4cnVLZXo0aFFuUW1OWEFUNnp0M21udzMKWUtWTzU4b2xlcUxUcjVHNlRtVFQyYTZpVGdtdWY2N0cvaVZlalJGbkw3YkNHWmgzSjlCSTNMcVpqRzE4dWxvbgpaVDdQcGQrQTlnaTJOTm9UZlI2TVB5SndxU1BCL0xZQU5ZNGRoZDVJYlVydDZzbmViTlRZSHV2T0tZTDdNTWRMCmVMSzFBZ01CQUFHakxUQXJNQWtHQTFVZEV3UUNNQUF3SGdZRFZSMFJCQmN3RllJVGEzVmlkVzUwZFM1c2IyTmgKYkdSdmJXRnBiakFOQmdrcWhraUc5dzBCQVFzRkFBT0NBUUVBQTVqUHVpZVlnMExySE1PSkxYY0N4d3EvVzBDNApZeFpncVd3VHF5VHNCZjVKdDlhYTk0SkZTc2dHQWdzUTN3NnA2SlBtL0MyR05MY3U4ZWxjV0E4UXViQWxueXRRCnF1cEh5WnYrZ08wMG83TXdrejZrTUxqQVZ0QllkRzJnZ21FRjViTEk5czBKSEhjUGpHUkl1VHV0Z0tHV1dPWHgKSEg4T0RzaG9wZHRXMktrR2c2aThKaEpYaWVIbzkzTHptM00xRUNGcXAvMEdtNkN1RFphVVA2SGpJMWRrYllLdgpsSHNVZ1U1SmZjSWhNYmJLdUllTzRkc1YvT3FHcm9iNW5vcmRjaExBQmRDTnc1cmU5T1NXZGZ1VVhSK0ViZVhrCjVFM0tFYzA1RGNjcGV2a1NTdlJ4SVQrQzNMOTltWGcxL3B5NEw3VUhvNFFLTXlqWXJXTWlLRlVKV1E9PQotLS0tLUVORCBDRVJUSUZJQ0FURS0tLS0tCg==
    client-key-data: LS0tLS1CRUdJTiBQUklWQVRFIEtFWS0tLS0tCk1JSUV2QUlCQURBTkJna3Foa2lHOXcwQkFRRUZBQVNDQktZd2dnU2lBZ0VBQW9JQkFRQzRMYXBtNEgwdE9TbmEKM1dWK3cyOGtLTllkcGh4WDlLbzY1MFRpTitnOWRTUFN1WStFek9SVTllTjZYMllGZDBCZlRTaHZ6OGNua3JQLwo3KzF6RExKMUNzMEYvYWhFd2Q0MUFzeVBhY250YlRPNHRkS1pvTzlHcjg0d2FXQTdYUmZrRHNmcURjdWFuVE5lCmZPWGlHRm5nQ1c1OUNvOTNOempQdXhBa2pCcXVRenhORkJIMEZST21ybVRSeHJ1S2V6NGhRblFtTlhBVDZ6dDMKbW53M1lLVk81OG9sZXFMVHI1RzZUbVRUMmE2aVRnbXVmNjdHL2lWZWpSRm5MN2JDR1poM0o5QkkzTHFaakcxOAp1bG9uWlQ3UHBkK0E5Z2kyTk5vVGZSNk1QeUp3cVNQQi9MWUFOWTRkaGQ1SWJVcnQ2c25lYk5UWUh1dk9LWUw3Ck1NZExlTEsxQWdNQkFBRUNnZ0VBQ28rank4NW5ueVk5L2l6ZjJ3cjkzb2J3OERaTVBjYnIxQURhOUZYY1hWblEKT2c4bDZhbU9Ga2tiU0RNY09JZ0VDdkx6dEtXbmQ5OXpydU5sTEVtNEdmb0trNk5kK01OZEtKRUdoZHE5RjM1Qgpqdi91R1owZTIyRE5ZLzFHNVdDTE5DcWMwQkVHY2RFOTF0YzJuMlppRVBTNWZ6WVJ6L1k4cmJ5K1NqbzJkWE9RCmRHYWRlUFplbi9UbmlHTFlqZWhrbXZNQjJvU0FDbVMycTd2OUNrcmdmR1RZbWJzeGVjSU1QK0JONG9KS3BOZ28KOUpnRWJ5SUxkR1pZS2pQb2lLaHNjMVhmSy8zZStXSmxuYjJBaEE5Y1JMUzhMcDdtcEYySWp4SjNSNE93QTg3WQpNeGZvZWFGdnNuVUFHWUdFWFo4Z3BkWmhQMEoxNWRGdERjajIrcngrQVFLQmdRRDFoSE9nVGdFbERrVEc5bm5TCjE1eXYxRzUxYnJMQU1UaWpzNklEMU1qelhzck0xY2ZvazVaaUlxNVJsQ3dReTlYNDdtV1RhY0lZRGR4TGJEcXEKY0IydjR5Wm1YK1VleGJ3cDU1OWY0V05HdzF5YzQrQjdaNFF5aTRFelN4WmFjbldjMnBzcHJMUFVoOUFXRXVNcApOaW1vcXNiVGNnNGs5QWRxeUIrbWhIWmJRUUtCZ1FEQUNzU09qNXZMU1VtaVpxYWcrOVMySUxZOVNOdDZzS1VyCkprcjdCZEVpN3N2YmU5cldRR2RBb0xkQXNzcU94aENydmtPNkpSSHB1YjlRRjlYdlF4Riszc2ZpZm4yYkQ0ZloKMlVsclA1emF3RlNrNDNLbjdMZzRscURpaVUxVGlqTkJBL3dUcFlmbTB4dW5WeFRWNDZpNVViQW1XRk12TWV0bQozWUZYQmJkK2RRS0JnRGl6Q1B6cFpzeEcrazAwbUxlL2dYajl4ekNwaXZCbHJaM29teTdsVWk4YUloMmg5VlBaCjJhMzZNbVcyb1dLVG9HdW5xcCtibWU1eUxRRGlFcjVQdkJ0bGl2V3ppYmRNbFFMY2Nlcnpveml4WDA4QU5WUnEKZUpZdnIzdklDSGFFM25LRjdiVjNJK1NlSk1ra1BYL0QrV1R4WTQ5clZLYm1FRnh4c1JXRW04ekJBb0dBWEZ3UgpZanJoQTZqUW1DRmtYQ0loa0NJMVkwNEorSHpDUXZsY3NGT0EzSnNhUWduVUdwekl5OFUvdlFiLzhpQ0IzZ2RZCmpVck16YXErdnVkbnhYVnRFYVpWWGJIVitPQkVSdHFBdStyUkprZS9yYm1SNS84cUxsVUxOVWd4ZjA4RkRXeTgKTERxOUhKOUZPbnJnRTJvMU9FTjRRMGpSWU81U041dXFXODd0REEwQ2dZQXpXbk1KSFgrbmlyMjhRRXFyVnJKRAo4ZUEwOHIwWTJRMDhMRlcvMjNIVWQ4WU12VnhTUTdwcUwzaE41RXVJQ2dCbEpGVFI3TndBREo3eDY2M002akFMCm1DNlI4dWxSZStwa08xN2Y0UUs3MnVRanJGZEhESnlXQmdDL0RKSkV6d1dwY0Q4VVNPK3A5bVVIbllLTUJTOEsKTVB1ejYrZ3h0VEtsRU5pZUVacXhxZz09Ci0tLS0tRU5EIFBSSVZBVEUgS0VZLS0tLS0K
    username: testuser
    password: testpassword
    token: sha256~fFyEqjf1xxFMO0tbEyGRvWeNOd7QByuEgS4hyEq_A9o
"""  # NOQA

    def get_kubeconfig_with_paths(self) -> str:
        """
        This function returns a test kubeconfig file as a string.

        :return: a test kubeconfig file in string format (for unit testing purposes)
        """  # NOQA
        return """apiVersion: v1
clusters:
- cluster:
    certificate-authority: fixtures/ca.crt
    server: https://127.0.0.1:6443
  name: default
contexts:
- context:
    cluster: default
    namespace: default
    user: testuser
  name: default
current-context: default
kind: Config
preferences: {}
users:
- name: testuser
  user:
    client-certificate: fixtures/client.crt
    client-key: fixtures/client.key
    username: testuser
    password: testpassword
    token: sha256~fFyEqjf1xxFMO0tbEyGRvWeNOd7QByuEgS4hyEq_A9o
"""  # NOQA

    def test_current_context(self):
        cwd = os.getcwd()
        current_context_data = ContextAuth()
        data = yaml.safe_load(self.get_kubeconfig_with_data())
        current_context_data.fetch_auth_data(data)
        self.assertIsNotNone(current_context_data.clusterCertificateData)
        self.assertIsNotNone(current_context_data.clientCertificateData)
        self.assertIsNotNone(current_context_data.clientKeyData)
        self.assertIsNotNone(current_context_data.username)
        self.assertIsNotNone(current_context_data.password)
        self.assertIsNotNone(current_context_data.bearerToken)
        self.assertIsNotNone(current_context_data.clusterHost)

        current_context_no_data = ContextAuth()
        data = yaml.safe_load(self.get_kubeconfig_with_paths())
        current_context_no_data.fetch_auth_data(data)
        self.assertIsNotNone(current_context_no_data.clusterCertificate)
        self.assertIsNotNone(current_context_no_data.clusterCertificateData)
        self.assertIsNotNone(current_context_no_data.clientCertificate)
        self.assertIsNotNone(current_context_no_data.clientCertificateData)
        self.assertIsNotNone(current_context_no_data.clientKey)
        self.assertIsNotNone(current_context_no_data.clientKeyData)
        self.assertIsNotNone(current_context_no_data.username)
        self.assertIsNotNone(current_context_no_data.password)
        self.assertIsNotNone(current_context_no_data.bearerToken)
        self.assertIsNotNone(current_context_no_data.clusterHost)
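To exercise the test case above without a full runner, one option (assuming TestCurrentContext is importable and the fixtures/ files exist for the path-based variant) is:

# Load and run TestCurrentContext programmatically.
import unittest

suite = unittest.defaultTestLoader.loadTestsFromTestCase(TestCurrentContext)
unittest.TextTestRunner(verbosity=2).run(suite)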
@@ -22,9 +22,7 @@ class ContainerScenarioPlugin(AbstractScenarioPlugin):
        lib_telemetry: KrknTelemetryOpenshift,
        scenario_telemetry: ScenarioTelemetry,
    ) -> int:
        start_time = int(time.time())
        pool = PodsMonitorPool(lib_telemetry.get_lib_kubernetes())
        wait_duration = krkn_config["tunings"]["wait_duration"]
        try:
            with open(scenario, "r") as f:
                cont_scenario_config = yaml.full_load(f)
@@ -45,16 +43,10 @@ class ContainerScenarioPlugin(AbstractScenarioPlugin):
                )
                return 1
            scenario_telemetry.affected_pods = result
            logging.info("Waiting for the specified duration: %s" % (wait_duration))
            time.sleep(wait_duration)

            # capture end time
            end_time = int(time.time())

            # publish cerberus status
            cerberus.publish_kraken_status(krkn_config, [], start_time, end_time)
        except (RuntimeError, Exception):
            logging.error("ContainerScenarioPlugin exiting due to Exception %s" % e)
            logging.error("ContainerScenarioPlugin exiting due to Exception %s")
            return 1
        else:
            return 0
142 krkn/scenario_plugins/hogs/hogs_scenario_plugin.py Normal file
@@ -0,0 +1,142 @@
import copy
import logging
import queue
import random
import re
import threading
import time


import yaml
from krkn_lib.models.telemetry import ScenarioTelemetry
from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift
from krkn_lib.models.krkn import HogConfig, HogType
from krkn_lib.models.k8s import NodeResources
from krkn_lib.k8s import KrknKubernetes
from krkn_lib.utils import get_random_string

from krkn.scenario_plugins.abstract_scenario_plugin import AbstractScenarioPlugin


class HogsScenarioPlugin(AbstractScenarioPlugin):
    def run(self, run_uuid: str, scenario: str, krkn_config: dict[str, any], lib_telemetry: KrknTelemetryOpenshift,
            scenario_telemetry: ScenarioTelemetry) -> int:
        try:
            with open(scenario, "r") as f:
                scenario = yaml.full_load(f)
                scenario_config = HogConfig.from_yaml_dict(scenario)
                has_selector = True
                if not scenario_config.node_selector or not re.match("^.+=.*$", scenario_config.node_selector):
                    if scenario_config.node_selector:
                        logging.warning(f"node selector {scenario_config.node_selector} not in the right format (key=value)")
                    # no usable selector: fall back to picking a random schedulable node
                    has_selector = False
                    node_selector = ""
                else:
                    node_selector = scenario_config.node_selector

                available_nodes = lib_telemetry.get_lib_kubernetes().list_schedulable_nodes(node_selector)
                if len(available_nodes) == 0:
                    raise Exception("no available nodes to schedule workload")

                if not has_selector:
                    # if a selector is not specified, picks a random node among the available ones
                    available_nodes = [available_nodes[random.randint(0, len(available_nodes) - 1)]]

                if scenario_config.number_of_nodes and len(available_nodes) > scenario_config.number_of_nodes:
                    available_nodes = random.sample(available_nodes, scenario_config.number_of_nodes)

                exception_queue = queue.Queue()
                self.run_scenario(scenario_config, lib_telemetry.get_lib_kubernetes(), available_nodes, exception_queue)
                return 0
        except Exception as e:
            logging.error(f"scenario exception: {e}")
            return 1

    def get_scenario_types(self) -> list[str]:
        return ["hog_scenarios"]

    def run_scenario_worker(self, config: HogConfig,
                            lib_k8s: KrknKubernetes, node: str,
                            exception_queue: queue.Queue):
        try:
            if not config.workers:
                config.workers = lib_k8s.get_node_cpu_count(node)
                logging.info(f"[{node}] detected {config.workers} cpus for node {node}")

            logging.info(f"[{node}] workers number: {config.workers}")

            # using the kubernetes.io/hostname=<node_name> selector to
            # precisely deploy each workload on each selected node
            config.node_selector = f"kubernetes.io/hostname={node}"
            pod_name = f"{config.type.value}-hog-{get_random_string(5)}"
            node_resources_start = lib_k8s.get_node_resources_info(node)
            lib_k8s.deploy_hog(pod_name, config)
            start = time.time()
            # waiting 3 seconds before starting sample collection
            time.sleep(3)
            node_resources_end = lib_k8s.get_node_resources_info(node)

            samples: list[NodeResources] = []
            avg_node_resources = NodeResources()

            while time.time() - start < config.duration - 1:
                samples.append(lib_k8s.get_node_resources_info(node))

            max_wait = 30
            wait = 0
            logging.info(f"[{node}] waiting up to {max_wait} seconds for pod: {pod_name} namespace: {config.namespace} to finish")
            while lib_k8s.is_pod_running(pod_name, config.namespace):
                if wait >= max_wait:
                    raise Exception(f"[{node}] hog workload pod: {pod_name} namespace: {config.namespace} "
                                    f"didn't finish after {max_wait} seconds")
                time.sleep(1)
                wait += 1
                continue

            logging.info(f"[{node}] deleting pod: {pod_name} namespace: {config.namespace}")
            lib_k8s.delete_pod(pod_name, config.namespace)

            for resource in samples:
                avg_node_resources.cpu += resource.cpu
                avg_node_resources.memory += resource.memory
                avg_node_resources.disk_space += resource.disk_space

            avg_node_resources.cpu = avg_node_resources.cpu / len(samples)
            avg_node_resources.memory = avg_node_resources.memory / len(samples)
            avg_node_resources.disk_space = avg_node_resources.disk_space / len(samples)

            if config.type == HogType.cpu:
                logging.info(f"[{node}] detected cpu consumption: "
                             f"{(avg_node_resources.cpu / (config.workers * 1000000000)) * 100} %")
            if config.type == HogType.memory:
                logging.info(f"[{node}] detected memory increase: "
                             f"{avg_node_resources.memory / node_resources_start.memory * 100} %")
            if config.type == HogType.io:
                logging.info(f"[{node}] detected disk space allocated: "
                             f"{(avg_node_resources.disk_space - node_resources_end.disk_space) / 1024 / 1024} MB")
        except Exception as e:
            exception_queue.put(e)

    def run_scenario(self, config: HogConfig,
                     lib_k8s: KrknKubernetes,
                     available_nodes: list[str],
                     exception_queue: queue.Queue):
        workers = []
        logging.info(f"running {config.type.value} hog scenario")
        logging.info(f"targeting nodes: [{','.join(available_nodes)}]")
        for node in available_nodes:
            config_copy = copy.deepcopy(config)
            worker = threading.Thread(target=self.run_scenario_worker,
                                      args=(config_copy, lib_k8s, node, exception_queue))
            worker.daemon = True
            worker.start()
            workers.append(worker)

        for worker in workers:
            worker.join()

        try:
            while True:
                exception = exception_queue.get_nowait()
                raise exception
        except queue.Empty:
            pass
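run_scenario relies on a common pattern: each worker thread traps its own exceptions into a shared queue, and the coordinator drains the queue after join(). A standalone illustration with a placeholder worker body:

# Standalone illustration of the exception-queue pattern used above;
# the worker body is a placeholder, not krkn code.
import queue
import threading

def worker(n: int, errors: queue.Queue):
    try:
        if n == 2:
            raise RuntimeError(f"worker {n} failed")  # simulated failure
    except Exception as e:
        errors.put(e)  # never let an exception die silently in a thread

errors: queue.Queue = queue.Queue()
threads = [threading.Thread(target=worker, args=(n, errors)) for n in range(4)]
for t in threads:
    t.start()
for t in threads:
    t.join()

try:
    while True:
        print(f"collected: {errors.get_nowait()}")
except queue.Empty:
    pass  # queue drained; any collected errors were reported above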
@@ -49,8 +49,7 @@ class NativeScenarioPlugin(AbstractScenarioPlugin):
        return [
            "pod_disruption_scenarios",
            "pod_network_scenarios",
            "vmware_node_scenarios",
            "ibmcloud_node_scenarios",
            "ingress_node_scenarios"
        ]

    def start_monitoring(self, pool: PodsMonitorPool, scenarios: list[Any]):
@@ -97,15 +97,6 @@ class NetworkScenarioConfig:
        },
    )

    kraken_config: typing.Optional[str] = field(
        default="",
        metadata={
            "name": "Kraken Config",
            "description": "Path to the config file of Kraken. "
            "Set this field if you wish to publish status onto Cerberus",
        },
    )


@dataclass
class NetworkScenarioSuccessOutput:
@@ -710,6 +701,7 @@ def network_chaos(
    pod_module_template = env.get_template("pod_module.j2")
    cli, batch_cli = kube_helper.setup_kubernetes(cfg.kubeconfig_path)

    logging.info("Starting Ingress Network Chaos")
    try:
        node_interface_dict = get_node_interfaces(
            cfg.node_interface_name,
@@ -721,16 +713,6 @@ def network_chaos(
    except Exception:
        return "error", NetworkScenarioErrorOutput(format_exc())
    job_list = []
    publish = False
    if cfg.kraken_config:
        failed_post_scenarios = ""
        try:
            with open(cfg.kraken_config, "r") as f:
                config = yaml.full_load(f)
        except Exception:
            logging.error("Error reading Kraken config from %s" % cfg.kraken_config)
            return "error", NetworkScenarioErrorOutput(format_exc())
        publish = True

    try:
        if cfg.execution_type == "parallel":
@@ -747,13 +729,7 @@ def network_chaos(
                )
            )
            logging.info("Waiting for parallel job to finish")
            start_time = int(time.time())
            wait_for_job(batch_cli, job_list[:], cfg.test_duration + 100)
            end_time = int(time.time())
            if publish:
                cerberus.publish_kraken_status(
                    config, failed_post_scenarios, start_time, end_time
                )

        elif cfg.execution_type == "serial":
            create_interfaces = True
@@ -773,18 +749,12 @@ def network_chaos(
                    )
                )
                logging.info("Waiting for serial job to finish")
                start_time = int(time.time())
                wait_for_job(batch_cli, job_list[:], cfg.test_duration + 100)
                logging.info("Deleting jobs")
                delete_jobs(cli, batch_cli, job_list[:])
                job_list = []
                logging.info("Waiting for wait_duration : %ss" % cfg.wait_duration)
                time.sleep(cfg.wait_duration)
                end_time = int(time.time())
                if publish:
                    cerberus.publish_kraken_status(
                        config, failed_post_scenarios, start_time, end_time
                    )
                create_interfaces = False
        else:

@@ -799,7 +769,7 @@ def network_chaos(
            execution_type=cfg.execution_type,
        )
    except Exception as e:
        logging.error("Network Chaos exiting due to Exception - %s" % e)
        logging.error("Ingress Network Chaos exiting due to Exception - %s" % e)
        return "error", NetworkScenarioErrorOutput(format_exc())
    finally:
        delete_virtual_interfaces(cli, node_interface_dict.keys(), pod_module_template)
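The serial branch above follows a create / wait / delete / sleep cycle per interface. The shape of that loop in isolation, with stub functions standing in for the Kubernetes job helpers (all names below are placeholders, not krkn APIs):

# Generic shape of the serial execution loop; the three helpers are
# stubs standing in for the real kubernetes job functions.
import time

def create_job(name: str) -> None:
    print(f"created job {name}")       # stub: would create a k8s Job

def wait_for_jobs(jobs: list[str], timeout: int) -> None:
    print(f"waiting up to {timeout}s for {jobs}")  # stub: would poll the batch API

def remove_jobs(jobs: list[str]) -> None:
    print(f"deleted {jobs}")           # stub: would delete the Jobs

test_duration, wait_duration = 5, 2
for name in ["chaos-eth0", "chaos-eth1"]:
    create_job(name)
    wait_for_jobs([name], test_duration + 100)
    remove_jobs([name])
    time.sleep(wait_duration)  # settle time between serial injections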
@@ -1,589 +0,0 @@
#!/usr/bin/env python
import time
import typing
from os import environ
from dataclasses import dataclass, field
from traceback import format_exc
import logging
from krkn.scenario_plugins.native.node_scenarios import (
    kubernetes_functions as kube_helper,
)
from arcaflow_plugin_sdk import validation, plugin
from kubernetes import client, watch
from ibm_vpc import VpcV1
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
import sys


class IbmCloud:
    def __init__(self):
        """
        Initialize the ibm cloud client by using the env variables:
        'IBMC_APIKEY' 'IBMC_URL'
        """
        apiKey = environ.get("IBMC_APIKEY")
        service_url = environ.get("IBMC_URL")
        if not apiKey:
            raise Exception("Environmental variable 'IBMC_APIKEY' is not set")
        if not service_url:
            raise Exception("Environmental variable 'IBMC_URL' is not set")
        try:
            authenticator = IAMAuthenticator(apiKey)
            self.service = VpcV1(authenticator=authenticator)

            self.service.set_service_url(service_url)
        except Exception as e:
            logging.error("error authenticating: " + str(e))

    # Get the instance ID of the node
    def get_instance_id(self, node_name):
        node_list = self.list_instances()
        for node in node_list:
            if node_name == node["vpc_name"]:
                return node["vpc_id"]
        logging.error(
            "Couldn't find node with name " + str(node_name) + ", you could try another region"
        )
        sys.exit(1)

    def delete_instance(self, instance_id):
        """
        Deletes the Instance whose name is given by 'instance_id'
        """
        try:
            self.service.delete_instance(instance_id)
            logging.info("Deleted Instance -- '{}'".format(instance_id))
        except Exception as e:
            logging.info("Instance '{}' could not be deleted.".format(instance_id))
            return False

    def reboot_instances(self, instance_id):
        """
        Reboots the Instance whose name is given by 'instance_id'. Returns True if successful, or
        returns False if the Instance is not powered on
        """

        try:
            self.service.create_instance_action(
                instance_id,
                type="reboot",
            )
            logging.info("Reset Instance -- '{}'".format(instance_id))
            return True
        except Exception as e:
            logging.info("Instance '{}' could not be rebooted".format(instance_id))
            return False

    def stop_instances(self, instance_id):
        """
        Stops the Instance whose name is given by 'instance_id'. Returns True if successful, or
        returns False if the Instance is already stopped
        """

        try:
            self.service.create_instance_action(
                instance_id,
                type="stop",
            )
            logging.info("Stopped Instance -- '{}'".format(instance_id))
            return True
        except Exception as e:
            logging.info("Instance '{}' could not be stopped".format(instance_id))
            logging.info("error: " + str(e))
            return False

    def start_instances(self, instance_id):
        """
        Starts the Instance whose name is given by 'instance_id'. Returns True if successful, or
        returns False if the Instance is already running
        """

        try:
            self.service.create_instance_action(
                instance_id,
                type="start",
            )
            logging.info("Started Instance -- '{}'".format(instance_id))
            return True
        except Exception as e:
            logging.info("Instance '{}' could not start running".format(instance_id))
            return False

    def list_instances(self):
        """
        Returns a list of Instances present in the datacenter
        """
        instance_names = []
        try:
            instances_result = self.service.list_instances().get_result()
            instances_list = instances_result["instances"]
            for vpc in instances_list:
                instance_names.append({"vpc_name": vpc["name"], "vpc_id": vpc["id"]})
            starting_count = instances_result["total_count"]
            while instances_result["total_count"] == instances_result["limit"]:
                instances_result = self.service.list_instances(
                    start=starting_count
                ).get_result()
                instances_list = instances_result["instances"]
                starting_count += instances_result["total_count"]
                for vpc in instances_list:
                    instance_names.append({"vpc_name": vpc["name"], "vpc_id": vpc["id"]})
        except Exception as e:
            logging.error("Error listing out instances: " + str(e))
            sys.exit(1)
        return instance_names

    def find_id_in_list(self, name, vpc_list):
        for vpc in vpc_list:
            if vpc["vpc_name"] == name:
                return vpc["vpc_id"]

    def get_instance_status(self, instance_id):
        """
        Returns the status of the Instance whose name is given by 'instance_id'
        """

        try:
            instance = self.service.get_instance(instance_id).get_result()
            state = instance["status"]
            return state
        except Exception as e:
            logging.error(
                "Failed to get node instance status %s. Encountered following "
                "exception: %s." % (instance_id, e)
            )
            return None

    def wait_until_deleted(self, instance_id, timeout):
        """
        Waits until the instance is deleted or until the timeout. Returns True if
        the instance is successfully deleted, else returns False
        """

        time_counter = 0
        vpc = self.get_instance_status(instance_id)
        while vpc is not None:
            vpc = self.get_instance_status(instance_id)
            logging.info(
                "Instance %s is still being deleted, sleeping for 5 seconds"
                % instance_id
            )
            time.sleep(5)
            time_counter += 5
            if time_counter >= timeout:
                logging.info(
                    "Instance %s is still not deleted in allotted time" % instance_id
                )
                return False
        return True

    def wait_until_running(self, instance_id, timeout):
        """
        Waits until the Instance switches to running state or until the timeout.
        Returns True if the Instance switches to running, else returns False
        """

        time_counter = 0
        status = self.get_instance_status(instance_id)
        while status != "running":
            status = self.get_instance_status(instance_id)
            logging.info(
                "Instance %s is still not running, sleeping for 5 seconds" % instance_id
            )
            time.sleep(5)
            time_counter += 5
            if time_counter >= timeout:
                logging.info(
                    "Instance %s is still not ready in allotted time" % instance_id
                )
                return False
        return True

    def wait_until_stopped(self, instance_id, timeout):
        """
        Waits until the Instance switches to stopped state or until the timeout.
        Returns True if the Instance switches to stopped, else returns False
        """

        time_counter = 0
        status = self.get_instance_status(instance_id)
        while status != "stopped":
            status = self.get_instance_status(instance_id)
            logging.info(
                "Instance %s is still not stopped, sleeping for 5 seconds" % instance_id
            )
            time.sleep(5)
            time_counter += 5
            if time_counter >= timeout:
                logging.info(
                    "Instance %s is still not stopped in allotted time" % instance_id
                )
                return False
        return True

    def wait_until_rebooted(self, instance_id, timeout):
        """
        Waits until the Instance switches to restarting state and then running state or until the timeout.
        Returns True if the Instance switches back to running, else returns False
        """

        time_counter = 0
        status = self.get_instance_status(instance_id)
        while status == "starting":
            status = self.get_instance_status(instance_id)
            logging.info(
                "Instance %s is still restarting, sleeping for 5 seconds" % instance_id
            )
            time.sleep(5)
            time_counter += 5
            if time_counter >= timeout:
                logging.info(
                    "Instance %s is still restarting after allotted time" % instance_id
                )
                return False
        self.wait_until_running(instance_id, timeout)
        return True


@dataclass
class Node:
    name: str


@dataclass
class NodeScenarioSuccessOutput:

    nodes: typing.Dict[int, Node] = field(
        metadata={
            "name": "Nodes started/stopped/terminated/rebooted",
            "description": """Map between timestamps and the pods started/stopped/terminated/rebooted.
                    The timestamp is provided in nanoseconds""",
        }
    )
    action: kube_helper.Actions = field(
        metadata={
            "name": "The action performed on the node",
            "description": """The action performed or attempted to be performed on the node. Possible values
                    are : Start, Stop, Terminate, Reboot""",
        }
    )


@dataclass
class NodeScenarioErrorOutput:

    error: str
    action: kube_helper.Actions = field(
        metadata={
            "name": "The action performed on the node",
            "description": """The action attempted to be performed on the node. Possible values are : Start
                    Stop, Terminate, Reboot""",
        }
    )


@dataclass
class NodeScenarioConfig:

    name: typing.Annotated[
        typing.Optional[str],
        validation.required_if_not("label_selector"),
        validation.required_if("skip_openshift_checks"),
    ] = field(
        default=None,
        metadata={
            "name": "Name",
            "description": "Name(s) for target nodes. Required if label_selector is not set.",
        },
    )

    runs: typing.Annotated[typing.Optional[int], validation.min(1)] = field(
        default=1,
        metadata={
            "name": "Number of runs per node",
            "description": "Number of times to inject each scenario under actions (will perform on same node each time)",
        },
    )

    label_selector: typing.Annotated[
        typing.Optional[str], validation.min(1), validation.required_if_not("name")
    ] = field(
        default=None,
        metadata={
            "name": "Label selector",
            "description": "Kubernetes label selector for the target nodes. Required if name is not set.\n"
            "See https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/ for details.",
        },
    )

    timeout: typing.Annotated[typing.Optional[int], validation.min(1)] = field(
        default=180,
        metadata={
            "name": "Timeout",
            "description": "Timeout to wait for the target pod(s) to be removed in seconds.",
        },
    )

    instance_count: typing.Annotated[typing.Optional[int], validation.min(1)] = field(
        default=1,
        metadata={
            "name": "Instance Count",
            "description": "Number of nodes to perform action/select that match the label selector.",
        },
    )

    skip_openshift_checks: typing.Optional[bool] = field(
        default=False,
        metadata={
            "name": "Skip Openshift Checks",
            "description": "Skip checking the status of the openshift nodes.",
        },
    )

    kubeconfig_path: typing.Optional[str] = field(
        default=None,
        metadata={
            "name": "Kubeconfig path",
            "description": "Path to your Kubeconfig file. Defaults to ~/.kube/config.\n"
            "See https://kubernetes.io/docs/concepts/configuration/organize-cluster-access-kubeconfig/ for "
            "details.",
        },
    )


@plugin.step(
    id="ibmcloud-node-start",
    name="Start the node",
    description="Start the node(s) by starting the Ibmcloud Instance on which the node is configured",
    outputs={"success": NodeScenarioSuccessOutput, "error": NodeScenarioErrorOutput},
)
def node_start(
    cfg: NodeScenarioConfig,
) -> typing.Tuple[
    str, typing.Union[NodeScenarioSuccessOutput, NodeScenarioErrorOutput]
]:
    with kube_helper.setup_kubernetes(None) as cli:
        ibmcloud = IbmCloud()
        core_v1 = client.CoreV1Api(cli)
        watch_resource = watch.Watch()
        node_list = kube_helper.get_node_list(cfg, kube_helper.Actions.START, core_v1)
        node_name_id_list = ibmcloud.list_instances()
        nodes_started = {}
        for name in node_list:
            try:
                for _ in range(cfg.runs):
                    logging.info("Starting node_start_scenario injection")
                    logging.info("Starting the node %s " % (name))
                    instance_id = ibmcloud.find_id_in_list(name, node_name_id_list)
                    if instance_id:
                        vm_started = ibmcloud.start_instances(instance_id)
                        if vm_started:
                            ibmcloud.wait_until_running(instance_id, cfg.timeout)
                            if not cfg.skip_openshift_checks:
                                kube_helper.wait_for_ready_status(
                                    name, cfg.timeout, watch_resource, core_v1
                                )
                        nodes_started[int(time.time_ns())] = Node(name=name)
                        logging.info(
                            "Node with instance ID: %s is in running state" % name
                        )
                        logging.info(
                            "node_start_scenario has been successfully injected!"
                        )
                    else:
                        logging.error(
                            "Failed to find node that matched instances on ibm cloud in region"
                        )
                        return "error", NodeScenarioErrorOutput(
                            "No matching vpc with node name " + name,
                            kube_helper.Actions.START,
                        )
            except Exception as e:
                logging.error("Failed to start node instance. Test Failed")
                logging.error("node_start_scenario injection failed!")
                return "error", NodeScenarioErrorOutput(
                    format_exc(), kube_helper.Actions.START
                )

    return "success", NodeScenarioSuccessOutput(
        nodes_started, kube_helper.Actions.START
    )


@plugin.step(
    id="ibmcloud-node-stop",
    name="Stop the node",
    description="Stop the node(s) by stopping the Ibmcloud Instance on which the node is configured",
    outputs={"success": NodeScenarioSuccessOutput, "error": NodeScenarioErrorOutput},
)
def node_stop(
    cfg: NodeScenarioConfig,
) -> typing.Tuple[
    str, typing.Union[NodeScenarioSuccessOutput, NodeScenarioErrorOutput]
]:
    with kube_helper.setup_kubernetes(None) as cli:
        ibmcloud = IbmCloud()
        core_v1 = client.CoreV1Api(cli)
        watch_resource = watch.Watch()
        logging.info("set up done")
        node_list = kube_helper.get_node_list(cfg, kube_helper.Actions.STOP, core_v1)
        logging.info("set node list: " + str(node_list))
        node_name_id_list = ibmcloud.list_instances()
        logging.info("node names: " + str(node_name_id_list))
        nodes_stopped = {}
        for name in node_list:
            try:
                for _ in range(cfg.runs):
                    logging.info("Starting node_stop_scenario injection")
                    logging.info("Stopping the node %s " % (name))
                    instance_id = ibmcloud.find_id_in_list(name, node_name_id_list)
                    if instance_id:
                        vm_stopped = ibmcloud.stop_instances(instance_id)
                        if vm_stopped:
                            ibmcloud.wait_until_stopped(instance_id, cfg.timeout)
                            if not cfg.skip_openshift_checks:
                                kube_helper.wait_for_ready_status(
                                    name, cfg.timeout, watch_resource, core_v1
                                )
                        nodes_stopped[int(time.time_ns())] = Node(name=name)
                        logging.info(
                            "Node with instance ID: %s is in stopped state" % name
                        )
                        logging.info(
                            "node_stop_scenario has been successfully injected!"
                        )
                    else:
                        logging.error(
                            "Failed to find node that matched instances on ibm cloud in region"
                        )
                        return "error", NodeScenarioErrorOutput(
                            "No matching vpc with node name " + name,
                            kube_helper.Actions.STOP,
                        )
            except Exception as e:
                logging.error("Failed to stop node instance. Test Failed")
                logging.error("node_stop_scenario injection failed!")
                return "error", NodeScenarioErrorOutput(
                    format_exc(), kube_helper.Actions.STOP
                )

    return "success", NodeScenarioSuccessOutput(
        nodes_stopped, kube_helper.Actions.STOP
    )


@plugin.step(
    id="ibmcloud-node-reboot",
    name="Reboot Ibmcloud Instance",
    description="Reboot the node(s) by rebooting the Ibmcloud Instance on which the node is configured",
    outputs={"success": NodeScenarioSuccessOutput, "error": NodeScenarioErrorOutput},
)
def node_reboot(
    cfg: NodeScenarioConfig,
) -> typing.Tuple[
    str, typing.Union[NodeScenarioSuccessOutput, NodeScenarioErrorOutput]
]:
    with kube_helper.setup_kubernetes(None) as cli:
        ibmcloud = IbmCloud()
        core_v1 = client.CoreV1Api(cli)
        watch_resource = watch.Watch()
        node_list = kube_helper.get_node_list(cfg, kube_helper.Actions.REBOOT, core_v1)
        node_name_id_list = ibmcloud.list_instances()
        nodes_rebooted = {}
        for name in node_list:
            try:
                for _ in range(cfg.runs):
                    logging.info("Starting node_reboot_scenario injection")
                    logging.info("Rebooting the node %s " % (name))
                    instance_id = ibmcloud.find_id_in_list(name, node_name_id_list)
                    if instance_id:
                        ibmcloud.reboot_instances(instance_id)
                        ibmcloud.wait_until_rebooted(instance_id, cfg.timeout)
                        if not cfg.skip_openshift_checks:
                            kube_helper.wait_for_unknown_status(
                                name, cfg.timeout, watch_resource, core_v1
                            )
                            kube_helper.wait_for_ready_status(
                                name, cfg.timeout, watch_resource, core_v1
                            )
                        nodes_rebooted[int(time.time_ns())] = Node(name=name)
                        logging.info(
                            "Node with instance ID: %s has rebooted successfully" % name
                        )
                        logging.info(
                            "node_reboot_scenario has been successfully injected!"
                        )
                    else:
                        logging.error(
                            "Failed to find node that matched instances on ibm cloud in region"
                        )
                        return "error", NodeScenarioErrorOutput(
                            "No matching vpc with node name " + name,
                            kube_helper.Actions.REBOOT,
                        )
            except Exception as e:
                logging.error("Failed to reboot node instance. Test Failed")
                logging.error("node_reboot_scenario injection failed!")
                return "error", NodeScenarioErrorOutput(
                    format_exc(), kube_helper.Actions.REBOOT
                )

    return "success", NodeScenarioSuccessOutput(
        nodes_rebooted, kube_helper.Actions.REBOOT
    )


@plugin.step(
    id="ibmcloud-node-terminate",
    name="Terminate Ibmcloud Instance",
    description="Wait for node to be deleted",
    outputs={"success": NodeScenarioSuccessOutput, "error": NodeScenarioErrorOutput},
)
def node_terminate(
    cfg: NodeScenarioConfig,
) -> typing.Tuple[
    str, typing.Union[NodeScenarioSuccessOutput, NodeScenarioErrorOutput]
]:
    with kube_helper.setup_kubernetes(None) as cli:
        ibmcloud = IbmCloud()
        core_v1 = client.CoreV1Api(cli)
        node_list = kube_helper.get_node_list(
            cfg, kube_helper.Actions.TERMINATE, core_v1
        )
        node_name_id_list = ibmcloud.list_instances()
        nodes_terminated = {}
        for name in node_list:
            try:
                for _ in range(cfg.runs):
                    logging.info(
                        "Starting node_termination_scenario injection by first stopping the node"
                    )
                    instance_id = ibmcloud.find_id_in_list(name, node_name_id_list)
                    logging.info("Deleting the node with instance ID: %s " % (name))
                    if instance_id:
                        ibmcloud.delete_instance(instance_id)
                        # the class defines wait_until_deleted; the original call to a
                        # nonexistent wait_until_released is corrected here
                        ibmcloud.wait_until_deleted(instance_id, cfg.timeout)
                        nodes_terminated[int(time.time_ns())] = Node(name=name)
                        logging.info(
                            "Node with instance ID: %s has been released" % name
                        )
                        logging.info(
                            "node_terminate_scenario has been successfully injected!"
                        )
                    else:
                        logging.error(
                            "Failed to find instances that matched the node specifications on ibm cloud in the set region"
                        )
                        return "error", NodeScenarioErrorOutput(
                            "No matching vpc with node name " + name,
                            kube_helper.Actions.TERMINATE,
                        )
            except Exception as e:
                logging.error("Failed to terminate node instance. Test Failed")
                logging.error("node_terminate_scenario injection failed!")
                return "error", NodeScenarioErrorOutput(
                    format_exc(), kube_helper.Actions.TERMINATE
                )

    return "success", NodeScenarioSuccessOutput(
        nodes_terminated, kube_helper.Actions.TERMINATE
    )
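list_instances above walks the VPC API pages by re-requesting while total_count equals the page limit. The same page-walk shape in isolation, with a stub standing in for the service call:

# Generic page-walk mirroring list_instances; fetch_page is a stub
# standing in for self.service.list_instances(...).get_result().
def fetch_page(start: int = 0) -> dict:
    data = [f"vpc-{i}" for i in range(7)]
    page = data[start:start + 3]
    return {"instances": page, "limit": 3, "total_count": len(page)}

names = []
result = fetch_page()
names.extend(result["instances"])
start = result["total_count"]
while result["total_count"] == result["limit"]:
    result = fetch_page(start)
    names.extend(result["instances"])
    start += result["total_count"]
print(names)  # all seven instances collected across three pages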
@@ -1,179 +0,0 @@
from kubernetes import config, client
from kubernetes.client.rest import ApiException
import logging
import random
from enum import Enum


class Actions(Enum):
    """
    This enumeration indicates different kinds of node operations
    """

    START = "Start"
    STOP = "Stop"
    TERMINATE = "Terminate"
    REBOOT = "Reboot"


def setup_kubernetes(kubeconfig_path):
    """
    Sets up the Kubernetes client
    """

    if kubeconfig_path is None:
        kubeconfig_path = config.KUBE_CONFIG_DEFAULT_LOCATION
    kubeconfig = config.kube_config.KubeConfigMerger(kubeconfig_path)

    if kubeconfig.config is None:
        raise Exception(
            "Invalid kube-config file: %s. " "No configuration found." % kubeconfig_path
        )
    loader = config.kube_config.KubeConfigLoader(
        config_dict=kubeconfig.config,
    )
    client_config = client.Configuration()
    loader.load_and_set(client_config)
    return client.ApiClient(configuration=client_config)


def list_killable_nodes(core_v1, label_selector=None):
    """
    Returns a list of nodes that can be stopped/reset/released
    """

    nodes = []
    try:
        if label_selector:
            ret = core_v1.list_node(pretty=True, label_selector=label_selector)
        else:
            ret = core_v1.list_node(pretty=True)
    except ApiException as e:
        logging.error("Exception when calling CoreV1Api->list_node: %s\n" % e)
        raise e
    for node in ret.items:
        for cond in node.status.conditions:
            if str(cond.type) == "Ready" and str(cond.status) == "True":
                nodes.append(node.metadata.name)
    return nodes


def list_startable_nodes(core_v1, label_selector=None):
    """
    Returns a list of nodes that can be started
    """

    nodes = []
    try:
        if label_selector:
            ret = core_v1.list_node(pretty=True, label_selector=label_selector)
        else:
            ret = core_v1.list_node(pretty=True)
    except ApiException as e:
        logging.error("Exception when calling CoreV1Api->list_node: %s\n" % e)
        raise e
    for node in ret.items:
        for cond in node.status.conditions:
            if str(cond.type) == "Ready" and str(cond.status) != "True":
                nodes.append(node.metadata.name)
    return nodes


def get_node_list(cfg, action, core_v1):
    """
    Returns a list of nodes to be used in the node scenarios. The list returned is constructed as follows:
    - If the key 'name' is present in the node scenario config, the value is extracted and split into
      a list
    - Each node in the list is fed to the get_node function which checks if the node is killable or
      fetches the node using the label selector
    """

    def get_node(node_name, label_selector, instance_kill_count, action, core_v1):
        list_nodes_func = (
            list_startable_nodes if action == Actions.START else list_killable_nodes
        )
        if node_name in list_nodes_func(core_v1):
            return [node_name]
        elif node_name:
            logging.info(
                "Node with provided node_name does not exist or the node might "
                "be in NotReady state."
            )
        nodes = list_nodes_func(core_v1, label_selector)
        if not nodes:
            raise Exception("Ready nodes with the provided label selector do not exist")
        logging.info(
            "Ready nodes with the label selector %s: %s" % (label_selector, nodes)
        )
        number_of_nodes = len(nodes)
        if instance_kill_count == number_of_nodes:
            return nodes
        nodes_to_return = []
        for i in range(instance_kill_count):
            node_to_add = nodes[random.randint(0, len(nodes) - 1)]
            nodes_to_return.append(node_to_add)
            nodes.remove(node_to_add)
        return nodes_to_return

    if cfg.name:
        input_nodes = cfg.name.split(",")
    else:
        input_nodes = [""]
    scenario_nodes = set()

    if cfg.skip_openshift_checks:
        scenario_nodes = input_nodes
    else:
        for node in input_nodes:
            nodes = get_node(
                node, cfg.label_selector, cfg.instance_count, action, core_v1
            )
            scenario_nodes.update(nodes)

    return list(scenario_nodes)


def watch_node_status(node, status, timeout, watch_resource, core_v1):
    """
    Monitor the status of a node for change
    """
    count = timeout
    for event in watch_resource.stream(
        core_v1.list_node,
        field_selector=f"metadata.name={node}",
        timeout_seconds=timeout,
    ):
        conditions = [
            status
            for status in event["object"].status.conditions
            if status.type == "Ready"
        ]
        if conditions[0].status == status:
            watch_resource.stop()
            break
        else:
            count -= 1
            logging.info("Status of node " + node + ": " + str(conditions[0].status))
        if not count:
            watch_resource.stop()


def wait_for_ready_status(node, timeout, watch_resource, core_v1):
    """
    Wait until the node status becomes Ready
    """
    watch_node_status(node, "True", timeout, watch_resource, core_v1)


def wait_for_not_ready_status(node, timeout, watch_resource, core_v1):
    """
    Wait until the node status becomes Not Ready
    """
    watch_node_status(node, "False", timeout, watch_resource, core_v1)


def wait_for_unknown_status(node, timeout, watch_resource, core_v1):
    """
    Wait until the node status becomes Unknown
    """
    watch_node_status(node, "Unknown", timeout, watch_resource, core_v1)
@@ -12,15 +12,11 @@ from krkn.scenario_plugins.native.pod_network_outage.pod_network_outage_plugin i
from krkn.scenario_plugins.native.pod_network_outage.pod_network_outage_plugin import (
    pod_egress_shaping,
)
import krkn.scenario_plugins.native.node_scenarios.ibmcloud_plugin as ibmcloud_plugin
from krkn.scenario_plugins.native.pod_network_outage.pod_network_outage_plugin import (
    pod_ingress_shaping,
)
from arcaflow_plugin_sdk import schema, serialization, jsonschema

from krkn.scenario_plugins.native.node_scenarios import vmware_plugin


@dataclasses.dataclass
class PluginStep:
    schema: schema.StepSchema
@@ -160,14 +156,6 @@ PLUGINS = Plugins(
    ),
    PluginStep(wait_for_pods, ["error"]),
    PluginStep(run_python_file, ["error"]),
    PluginStep(vmware_plugin.node_start, ["error"]),
    PluginStep(vmware_plugin.node_stop, ["error"]),
    PluginStep(vmware_plugin.node_reboot, ["error"]),
    PluginStep(vmware_plugin.node_terminate, ["error"]),
    PluginStep(ibmcloud_plugin.node_start, ["error"]),
    PluginStep(ibmcloud_plugin.node_stop, ["error"]),
    PluginStep(ibmcloud_plugin.node_reboot, ["error"]),
    PluginStep(ibmcloud_plugin.node_terminate, ["error"]),
    PluginStep(network_chaos, ["error"]),
    PluginStep(pod_outage, ["error"]),
    PluginStep(pod_egress_shaping, ["error"]),
@@ -42,19 +42,13 @@ class NetworkChaosScenarioPlugin(AbstractScenarioPlugin):
            test_egress = get_yaml_item_value(
                test_dict, "egress", {"bandwidth": "100mbit"}
            )

            if test_node:
                node_name_list = test_node.split(",")
                nodelst = common_node_functions.get_node_by_name(node_name_list, lib_telemetry.get_lib_kubernetes())
            else:
                node_name_list = [test_node]
                nodelst = []
                for single_node_name in node_name_list:
                    nodelst.extend(
                        common_node_functions.get_node(
                            single_node_name,
                            test_node_label,
                            test_instance_count,
                            lib_telemetry.get_lib_kubernetes(),
                        )
                nodelst = common_node_functions.get_node(
                    test_node_label, test_instance_count, lib_telemetry.get_lib_kubernetes()
                )
            file_loader = FileSystemLoader(
                os.path.abspath(os.path.dirname(__file__))
@@ -149,7 +143,10 @@ class NetworkChaosScenarioPlugin(AbstractScenarioPlugin):
            finally:
                logging.info("Deleting jobs")
                self.delete_job(joblst[:], lib_telemetry.get_lib_kubernetes())
        except (RuntimeError, Exception):
        except (RuntimeError, Exception) as e:
            logging.error(
                "NetworkChaosScenarioPlugin exiting due to Exception %s" % e
            )
            scenario_telemetry.exit_status = 1
            return 1
        else:
0  krkn/scenario_plugins/network_chaos_ng/__init__.py  Normal file
41  krkn/scenario_plugins/network_chaos_ng/models.py  Normal file
@@ -0,0 +1,41 @@
from dataclasses import dataclass
from enum import Enum


class NetworkChaosScenarioType(Enum):
    Node = 1
    Pod = 2

@dataclass
class BaseNetworkChaosConfig:
    supported_execution = ["serial", "parallel"]
    id: str
    wait_duration: int
    test_duration: int
    label_selector: str
    instance_count: int
    execution: str
    namespace: str

    def validate(self) -> list[str]:
        errors = []
        if self.execution is None:
            errors.append(f"execution cannot be None, supported values are: {','.join(self.supported_execution)}")
        if self.execution not in self.supported_execution:
            errors.append(f"{self.execution} is not in the supported execution modes: {','.join(self.supported_execution)}")
        if self.label_selector is None:
            errors.append("label_selector cannot be None")
        return errors

@dataclass
class NetworkFilterConfig(BaseNetworkChaosConfig):
    ingress: bool
    egress: bool
    interfaces: list[str]
    target: str
    ports: list[int]

    def validate(self) -> list[str]:
        errors = super().validate()
        # further validations go here
        return errors
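As a quick illustration of the validation contract above, a config object can be built and checked before use (a minimal sketch; every field value below is invented for illustration):

# Hypothetical values, not taken from a real scenario file.
cfg = NetworkFilterConfig(
    id="node_network_filter",
    wait_duration=30,
    test_duration=60,
    label_selector="node-role.kubernetes.io/worker",
    instance_count=1,
    execution="parallel",
    namespace="default",
    ingress=True,
    egress=False,
    interfaces=[],
    target="",
    ports=[5443],
)
errors = cfg.validate()  # returns [] when the config is acceptable
if errors:
    raise Exception(f"config validation errors: [{';'.join(errors)}]")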
@@ -0,0 +1,58 @@
import abc
import logging
import queue

from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift
from krkn.scenario_plugins.network_chaos_ng.models import BaseNetworkChaosConfig, NetworkChaosScenarioType


class AbstractNetworkChaosModule(abc.ABC):
    """
    The abstract class that needs to be implemented by each Network Chaos Scenario
    """
    @abc.abstractmethod
    def run(self, target: str, kubecli: KrknTelemetryOpenshift, error_queue: queue.Queue = None):
        """
        the entrypoint method for the Network Chaos Scenario
        :param target: The resource name that will be targeted by the scenario (Node Name, Pod Name etc.)
        :param kubecli: The `KrknTelemetryOpenshift` needed by the scenario to access the krkn-lib methods
        :param error_queue: A queue that will be used by the plugin to push the errors raised during the execution of parallel modules
        """
        pass

    @abc.abstractmethod
    def get_config(self) -> (NetworkChaosScenarioType, BaseNetworkChaosConfig):
        """
        returns the common subset of settings shared by all the scenarios `BaseNetworkChaosConfig` and the type of Network
        Chaos Scenario that is running (Pod Scenario or Node Scenario)
        """
        pass

    def log_info(self, message: str, parallel: bool = False, node_name: str = ""):
        """
        log helper method for INFO severity to be used in the scenarios
        """
        if parallel:
            logging.info(f"[{node_name}]: {message}")
        else:
            logging.info(message)

    def log_warning(self, message: str, parallel: bool = False, node_name: str = ""):
        """
        log helper method for WARNING severity to be used in the scenarios
        """
        if parallel:
            logging.warning(f"[{node_name}]: {message}")
        else:
            logging.warning(message)

    def log_error(self, message: str, parallel: bool = False, node_name: str = ""):
        """
        log helper method for ERROR severity to be used in the scenarios
        """
        if parallel:
            logging.error(f"[{node_name}]: {message}")
        else:
            logging.error(message)
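For orientation, a concrete module only has to provide run() and get_config(); a minimal hypothetical skeleton (not part of this changeset) would look like:

class NoopChaosModule(AbstractNetworkChaosModule):
    # Hypothetical no-op module illustrating the abstract contract above.
    def __init__(self, config: BaseNetworkChaosConfig):
        self.config = config

    def run(self, target: str, kubecli: KrknTelemetryOpenshift, error_queue: queue.Queue = None):
        parallel = error_queue is not None
        try:
            self.log_info("no chaos injected (demo module)", parallel, target)
        except Exception as e:
            # parallel runs report failures through the queue instead of raising
            if error_queue is None:
                raise e
            error_queue.put(str(e))

    def get_config(self) -> (NetworkChaosScenarioType, BaseNetworkChaosConfig):
        return NetworkChaosScenarioType.Node, self.config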
@@ -0,0 +1,136 @@
import os
import queue
import time

import yaml
from jinja2 import Environment, FileSystemLoader

from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift
from krkn_lib.utils import get_random_string
from krkn.scenario_plugins.network_chaos_ng.models import (
    BaseNetworkChaosConfig,
    NetworkFilterConfig,
    NetworkChaosScenarioType,
)
from krkn.scenario_plugins.network_chaos_ng.modules.abstract_network_chaos_module import (
    AbstractNetworkChaosModule,
)


class NodeNetworkFilterModule(AbstractNetworkChaosModule):
    config: NetworkFilterConfig

    def run(
        self,
        target: str,
        kubecli: KrknTelemetryOpenshift,
        error_queue: queue.Queue = None,
    ):
        parallel = False
        if error_queue:
            parallel = True
        try:
            file_loader = FileSystemLoader(os.path.abspath(os.path.dirname(__file__)))
            env = Environment(loader=file_loader, autoescape=True)
            pod_name = f"node-filter-{get_random_string(5)}"
            pod_template = env.get_template("templates/network-chaos.j2")
            pod_body = yaml.safe_load(
                pod_template.render(
                    pod_name=pod_name,
                    namespace=self.config.namespace,
                    host_network=True,
                    target=target,
                )
            )
            self.log_info(
                f"creating pod to filter "
                f"ports {','.join([str(port) for port in self.config.ports])}, "
                f"ingress:{str(self.config.ingress)}, "
                f"egress:{str(self.config.egress)}",
                parallel,
                target,
            )
            kubecli.get_lib_kubernetes().create_pod(
                pod_body, self.config.namespace, 300
            )

            if len(self.config.interfaces) == 0:
                interfaces = [
                    self.get_default_interface(pod_name, self.config.namespace, kubecli)
                ]
                self.log_info(f"detected default interface {interfaces[0]}")
            else:
                interfaces = self.config.interfaces

            input_rules, output_rules = self.generate_rules(interfaces)

            for rule in input_rules:
                self.log_info(f"applying iptables INPUT rule: {rule}", parallel, target)
                kubecli.get_lib_kubernetes().exec_cmd_in_pod(
                    [rule], pod_name, self.config.namespace
                )
            for rule in output_rules:
                self.log_info(
                    f"applying iptables OUTPUT rule: {rule}", parallel, target
                )
                kubecli.get_lib_kubernetes().exec_cmd_in_pod(
                    [rule], pod_name, self.config.namespace
                )
            self.log_info(
                f"waiting {self.config.test_duration} seconds before removing the iptables rules"
            )
            time.sleep(self.config.test_duration)
            self.log_info("removing iptables rules")
            for _ in input_rules:
                # always deleting the first rule since it has been inserted at the top
                kubecli.get_lib_kubernetes().exec_cmd_in_pod(
                    ["iptables -D INPUT 1"], pod_name, self.config.namespace
                )
            for _ in output_rules:
                # always deleting the first rule since it has been inserted at the top
                kubecli.get_lib_kubernetes().exec_cmd_in_pod(
                    ["iptables -D OUTPUT 1"], pod_name, self.config.namespace
                )
            self.log_info(
                f"deleting network chaos pod {pod_name} from {self.config.namespace}"
            )

            kubecli.get_lib_kubernetes().delete_pod(pod_name, self.config.namespace)

        except Exception as e:
            if error_queue is None:
                raise e
            else:
                error_queue.put(str(e))

    def __init__(self, config: NetworkFilterConfig):
        self.config = config

    def get_config(self) -> (NetworkChaosScenarioType, BaseNetworkChaosConfig):
        return NetworkChaosScenarioType.Node, self.config

    def get_default_interface(
        self, pod_name: str, namespace: str, kubecli: KrknTelemetryOpenshift
    ) -> str:
        cmd = "ip r | grep default | awk '/default/ {print $5}'"
        output = kubecli.get_lib_kubernetes().exec_cmd_in_pod(
            [cmd], pod_name, namespace
        )
        return output.replace("\n", "")

    def generate_rules(self, interfaces: list[str]) -> (list[str], list[str]):
        input_rules = []
        output_rules = []
        for interface in interfaces:
            for port in self.config.ports:
                if self.config.egress:
                    output_rules.append(
                        f"iptables -I OUTPUT 1 -p tcp --dport {port} -m state --state NEW,RELATED,ESTABLISHED -j DROP"
                    )

                if self.config.ingress:
                    input_rules.append(
                        f"iptables -I INPUT 1 -i {interface} -p tcp --dport {port} -m state --state NEW,RELATED,ESTABLISHED -j DROP"
                    )
        return input_rules, output_rules
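To make the rule generation concrete: for a config with interfaces=["eth0"], ports=[2049], ingress=True and egress=True (illustrative values), generate_rules above would return:

input_rules = [
    "iptables -I INPUT 1 -i eth0 -p tcp --dport 2049 -m state --state NEW,RELATED,ESTABLISHED -j DROP"
]
output_rules = [
    "iptables -I OUTPUT 1 -p tcp --dport 2049 -m state --state NEW,RELATED,ESTABLISHED -j DROP"
]

Note that only the INPUT rule is bound to an interface; the OUTPUT rule drops the port regardless of interface.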
@@ -0,0 +1,17 @@
apiVersion: v1
kind: Pod
metadata:
  name: {{pod_name}}
  namespace: {{namespace}}
spec:
{% if host_network %}
  hostNetwork: true
{%endif%}
  nodeSelector:
    kubernetes.io/hostname: {{target}}
  containers:
    - name: fedora
      imagePullPolicy: Always
      image: quay.io/krkn-chaos/krkn-network-chaos:latest
      securityContext:
        privileged: true
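A minimal sketch of how the module renders this template into a pod manifest, mirroring the run() method above (the pod, namespace, and node names are illustrative):

import os
import yaml
from jinja2 import Environment, FileSystemLoader

env = Environment(
    loader=FileSystemLoader(os.path.abspath(os.path.dirname(__file__))),
    autoescape=True,
)
pod_body = yaml.safe_load(
    env.get_template("templates/network-chaos.j2").render(
        pod_name="node-filter-abcde",  # illustrative
        namespace="default",           # illustrative
        host_network=True,
        target="worker-0",             # illustrative node name
    )
)

Rendering with host_network=True, together with privileged: true in the template, is what lets the pod manipulate the node's own iptables.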
@@ -0,0 +1,24 @@
from krkn.scenario_plugins.network_chaos_ng.models import NetworkFilterConfig
from krkn.scenario_plugins.network_chaos_ng.modules.abstract_network_chaos_module import AbstractNetworkChaosModule
from krkn.scenario_plugins.network_chaos_ng.modules.node_network_filter import NodeNetworkFilterModule


supported_modules = ["node_network_filter"]

class NetworkChaosFactory:

    @staticmethod
    def get_instance(config: dict[str, str]) -> AbstractNetworkChaosModule:
        if config["id"] is None:
            raise Exception("network chaos id cannot be None")
        if config["id"] not in supported_modules:
            raise Exception(f"{config['id']} is not a supported network chaos module")

        if config["id"] == "node_network_filter":
            config = NetworkFilterConfig(**config)
            errors = config.validate()
            if len(errors) > 0:
                raise Exception(f"config validation errors: [{';'.join(errors)}]")
            return NodeNetworkFilterModule(config)
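Usage sketch for the factory (hypothetical values; since the factory does NetworkFilterConfig(**config), the dict keys must match the NetworkFilterConfig fields):

config = {
    "id": "node_network_filter",
    "wait_duration": 30,
    "test_duration": 60,
    "label_selector": "node-role.kubernetes.io/worker",
    "instance_count": 1,
    "execution": "serial",
    "namespace": "default",
    "ingress": True,
    "egress": False,
    "interfaces": [],
    "target": "",
    "ports": [5443],
}
module = NetworkChaosFactory.get_instance(config)  # raises if validation fails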
@@ -0,0 +1,116 @@
import logging
import queue
import random
import threading
import time

import yaml
from krkn_lib.models.telemetry import ScenarioTelemetry
from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift

from krkn.scenario_plugins.abstract_scenario_plugin import AbstractScenarioPlugin
from krkn.scenario_plugins.network_chaos_ng.models import (
    NetworkChaosScenarioType,
    BaseNetworkChaosConfig,
)
from krkn.scenario_plugins.network_chaos_ng.modules.abstract_network_chaos_module import (
    AbstractNetworkChaosModule,
)
from krkn.scenario_plugins.network_chaos_ng.network_chaos_factory import (
    NetworkChaosFactory,
)


class NetworkChaosNgScenarioPlugin(AbstractScenarioPlugin):
    def run(
        self,
        run_uuid: str,
        scenario: str,
        krkn_config: dict[str, any],
        lib_telemetry: KrknTelemetryOpenshift,
        scenario_telemetry: ScenarioTelemetry,
    ) -> int:
        try:
            with open(scenario, "r") as file:
                scenario_config = yaml.safe_load(file)
                if not isinstance(scenario_config, list):
                    logging.error(
                        "network chaos scenario config must be a list of objects"
                    )
                    return 1
                for config in scenario_config:
                    network_chaos = NetworkChaosFactory.get_instance(config)
                    network_chaos_config = network_chaos.get_config()
                    logging.info(
                        f"running network_chaos scenario: {network_chaos_config[1].id}"
                    )
                    if network_chaos_config[0] == NetworkChaosScenarioType.Node:
                        targets = lib_telemetry.get_lib_kubernetes().list_nodes(
                            network_chaos_config[1].label_selector
                        )
                    else:
                        targets = lib_telemetry.get_lib_kubernetes().list_pods(
                            network_chaos_config[1].namespace,
                            network_chaos_config[1].label_selector,
                        )
                    if len(targets) == 0:
                        logging.warning(
                            f"no targets found for {network_chaos_config[1].id} "
                            f"network chaos scenario with selector {network_chaos_config[1].label_selector} "
                            f"with target type {network_chaos_config[0]}"
                        )

                    if network_chaos_config[1].instance_count != 0 and network_chaos_config[1].instance_count < len(targets):
                        targets = random.sample(targets, network_chaos_config[1].instance_count)

                    if network_chaos_config[1].execution == "parallel":
                        self.run_parallel(targets, network_chaos, lib_telemetry)
                    else:
                        self.run_serial(targets, network_chaos, lib_telemetry)
                    if len(scenario_config) > 1:
                        logging.info(f"waiting {network_chaos_config[1].wait_duration} seconds before running the next "
                                     f"Network Chaos NG Module")
                        time.sleep(network_chaos_config[1].wait_duration)
        except Exception as e:
            logging.error(str(e))
            return 1
        return 0

    def run_parallel(
        self,
        targets: list[str],
        module: AbstractNetworkChaosModule,
        lib_telemetry: KrknTelemetryOpenshift,
    ):
        error_queue = queue.Queue()
        threads = []
        errors = []
        for target in targets:
            thread = threading.Thread(
                target=module.run, args=[target, lib_telemetry, error_queue]
            )
            thread.start()
            threads.append(thread)
        for thread in threads:
            thread.join()
        while True:
            try:
                errors.append(error_queue.get_nowait())
            except queue.Empty:
                break
        if len(errors) > 0:
            raise Exception(
                f"module {module.get_config()[1].id} execution failed: [{';'.join(errors)}]"
            )

    def run_serial(
        self,
        targets: list[str],
        module: AbstractNetworkChaosModule,
        lib_telemetry: KrknTelemetryOpenshift,
    ):
        for target in targets:
            module.run(target, lib_telemetry)

    def get_scenario_types(self) -> list[str]:
        return ["network_chaos_ng_scenarios"]
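For reference, a hedged sketch of the scenario file this plugin consumes: a YAML list of module configs, each dispatched through the factory (all values below are illustrative):

import yaml

scenario_yaml = """
- id: node_network_filter
  wait_duration: 30
  test_duration: 60
  label_selector: node-role.kubernetes.io/worker
  instance_count: 1
  execution: parallel
  namespace: default
  ingress: true
  egress: false
  interfaces: []
  target: ""
  ports:
    - 5443
"""
scenario_config = yaml.safe_load(scenario_yaml)
assert isinstance(scenario_config, list)  # the plugin rejects non-list configs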
@@ -4,14 +4,16 @@ import time
import krkn.invoke.command as runcommand
import krkn.scenario_plugins.node_actions.common_node_functions as nodeaction
from krkn_lib.k8s import KrknKubernetes

from krkn_lib.models.k8s import AffectedNode, AffectedNodeStatus

# krkn_lib
class abstract_node_scenarios:
    kubecli: KrknKubernetes
    affected_nodes_status: AffectedNodeStatus

    def __init__(self, kubecli: KrknKubernetes):
    def __init__(self, kubecli: KrknKubernetes, affected_nodes_status: AffectedNodeStatus):
        self.kubecli = kubecli
        self.affected_nodes_status = affected_nodes_status

    # Node scenario to start the node
    def node_start_scenario(self, instance_kill_count, node, timeout):
@@ -28,6 +30,7 @@ class abstract_node_scenarios:
        logging.info("Waiting for %s seconds before starting the node" % (duration))
        time.sleep(duration)
        self.node_start_scenario(instance_kill_count, node, timeout)
        self.affected_nodes_status.merge_affected_nodes()
        logging.info("node_stop_start_scenario has been successfully injected!")

    def helper_node_stop_start_scenario(self, instance_kill_count, node, timeout):
@@ -36,6 +39,20 @@ class abstract_node_scenarios:
        self.helper_node_start_scenario(instance_kill_count, node, timeout)
        logging.info("helper_node_stop_start_scenario has been successfully injected!")

    # Node scenario to detach and attach the disk
    def node_disk_detach_attach_scenario(self, instance_kill_count, node, timeout, duration):
        logging.info("Starting disk_detach_attach_scenario injection")
        disk_attachment_details = self.get_disk_attachment_info(instance_kill_count, node)
        if disk_attachment_details:
            self.disk_detach_scenario(instance_kill_count, node, timeout)
            logging.info("Waiting for %s seconds before attaching the disk" % (duration))
            time.sleep(duration)
            self.disk_attach_scenario(instance_kill_count, disk_attachment_details, timeout)
            logging.info("node_disk_detach_attach_scenario has been successfully injected!")
        else:
            logging.error("Node %s has only root disk attached" % (node))
            logging.error("node_disk_detach_attach_scenario failed!")

    # Node scenario to terminate the node
    def node_termination_scenario(self, instance_kill_count, node, timeout):
        pass
@@ -47,13 +64,15 @@ class abstract_node_scenarios:
    # Node scenario to stop the kubelet
    def stop_kubelet_scenario(self, instance_kill_count, node, timeout):
        for _ in range(instance_kill_count):
            affected_node = AffectedNode(node)
            try:
                logging.info("Starting stop_kubelet_scenario injection")
                logging.info("Stopping the kubelet of the node %s" % (node))
                runcommand.run(
                    "oc debug node/" + node + " -- chroot /host systemctl stop kubelet"
                )
                nodeaction.wait_for_unknown_status(node, timeout, self.kubecli)
                nodeaction.wait_for_unknown_status(node, timeout, self.kubecli, affected_node)

                logging.info("The kubelet of the node %s has been stopped" % (node))
                logging.info("stop_kubelet_scenario has been successfully injected!")
            except Exception as e:
@@ -63,17 +82,20 @@ class abstract_node_scenarios:
                )
                logging.error("stop_kubelet_scenario injection failed!")
                raise e
            self.add_affected_node(affected_node)

    # Node scenario to stop and start the kubelet
    def stop_start_kubelet_scenario(self, instance_kill_count, node, timeout):
        logging.info("Starting stop_start_kubelet_scenario injection")
        self.stop_kubelet_scenario(instance_kill_count, node, timeout)
        self.node_reboot_scenario(instance_kill_count, node, timeout)
        self.affected_nodes_status.merge_affected_nodes()
        logging.info("stop_start_kubelet_scenario has been successfully injected!")

    # Node scenario to restart the kubelet
    def restart_kubelet_scenario(self, instance_kill_count, node, timeout):
        for _ in range(instance_kill_count):
            affected_node = AffectedNode(node)
            try:
                logging.info("Starting restart_kubelet_scenario injection")
                logging.info("Restarting the kubelet of the node %s" % (node))
@@ -82,8 +104,8 @@ class abstract_node_scenarios:
                    + node
                    + " -- chroot /host systemctl restart kubelet &"
                )
                nodeaction.wait_for_not_ready_status(node, timeout, self.kubecli)
                nodeaction.wait_for_ready_status(node, timeout, self.kubecli)
                nodeaction.wait_for_not_ready_status(node, timeout, self.kubecli, affected_node)
                nodeaction.wait_for_ready_status(node, timeout, self.kubecli, affected_node)
                logging.info("The kubelet of the node %s has been restarted" % (node))
                logging.info("restart_kubelet_scenario has been successfully injected!")
            except Exception as e:
@@ -93,6 +115,7 @@ class abstract_node_scenarios:
                )
                logging.error("restart_kubelet_scenario injection failed!")
                raise e
            self.add_affected_node(affected_node)

    # Node scenario to crash the node
    def node_crash_scenario(self, instance_kill_count, node, timeout):
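The bookkeeping pattern repeated across the provider classes below can be summarized in one hedged sketch (the API names are exactly as they appear in this changeset; the durations are invented):

# One AffectedNode per scenario iteration; the waiters record how long each
# state transition took; the node is appended to the shared status for telemetry.
affected_node = AffectedNode(node)
affected_node.node_id = instance_id
affected_node.set_affected_node_status("stopped", 42.0)  # illustrative duration
affected_node.set_affected_node_status("running", 58.5)  # illustrative duration
affected_nodes_status.affected_nodes.append(affected_node)
affected_nodes_status.merge_affected_nodes()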
@@ -18,7 +18,7 @@ from krkn.scenario_plugins.node_actions.abstract_node_scenarios import (
    abstract_node_scenarios,
)
from krkn_lib.k8s import KrknKubernetes

from krkn_lib.models.k8s import AffectedNode, AffectedNodeStatus

class Alibaba:
    def __init__(self):
@@ -161,8 +161,9 @@ class Alibaba:
        return None

    # Wait until the node instance is running
    def wait_until_running(self, instance_id, timeout):
    def wait_until_running(self, instance_id, timeout, affected_node):
        time_counter = 0
        start_time = time.time()
        status = self.get_vm_status(instance_id)
        while status != "Running":
            status = self.get_vm_status(instance_id)
@@ -174,11 +175,15 @@ class Alibaba:
            if time_counter >= timeout:
                logging.info("ECS %s is still not ready in allotted time" % instance_id)
                return False
        end_time = time.time()
        if affected_node:
            affected_node.set_affected_node_status("running", end_time - start_time)
        return True

    # Wait until the node instance is stopped
    def wait_until_stopped(self, instance_id, timeout):
    def wait_until_stopped(self, instance_id, timeout, affected_node):
        time_counter = 0
        start_time = time.time()
        status = self.get_vm_status(instance_id)
        while status != "Stopped":
            status = self.get_vm_status(instance_id)
@@ -192,10 +197,14 @@ class Alibaba:
                    "Vm %s is still not stopped in allotted time" % instance_id
                )
                return False
        end_time = time.time()
        if affected_node:
            affected_node.set_affected_node_status("stopped", end_time - start_time)
        return True

    # Wait until the node instance is terminated
    def wait_until_released(self, instance_id, timeout):
    def wait_until_released(self, instance_id, timeout, affected_node):
        start_time = time.time()
        statuses = self.get_vm_status(instance_id)
        time_counter = 0
        while statuses and statuses != "Released":
@@ -210,26 +219,33 @@ class Alibaba:
                return False

        logging.info("ECS %s is released" % instance_id)
        end_time = time.time()
        if affected_node:
            affected_node.set_affected_node_status("terminated", end_time - start_time)
        return True


# krkn_lib
class alibaba_node_scenarios(abstract_node_scenarios):
    def __init__(self, kubecli: KrknKubernetes):
    def __init__(self, kubecli: KrknKubernetes, affected_nodes_status: AffectedNodeStatus):
        super().__init__(kubecli, affected_nodes_status)
        self.alibaba = Alibaba()


    # Node scenario to start the node
    def node_start_scenario(self, instance_kill_count, node, timeout):
        for _ in range(instance_kill_count):
            affected_node = AffectedNode(node)
            try:
                logging.info("Starting node_start_scenario injection")
                vm_id = self.alibaba.get_instance_id(node)
                affected_node.node_id = vm_id
                logging.info(
                    "Starting the node %s with instance ID: %s " % (node, vm_id)
                )
                self.alibaba.start_instances(vm_id)
                self.alibaba.wait_until_running(vm_id, timeout)
                nodeaction.wait_for_ready_status(node, timeout, self.kubecli)
                self.alibaba.wait_until_running(vm_id, timeout, affected_node)
                nodeaction.wait_for_ready_status(node, timeout, self.kubecli, affected_node)
                logging.info("Node with instance ID: %s is in running state" % node)
                logging.info("node_start_scenario has been successfully injected!")
            except Exception as e:
@@ -239,20 +255,23 @@ class alibaba_node_scenarios(abstract_node_scenarios):
                )
                logging.error("node_start_scenario injection failed!")
                raise e
            self.affected_nodes_status.affected_nodes.append(affected_node)

    # Node scenario to stop the node
    def node_stop_scenario(self, instance_kill_count, node, timeout):
        for _ in range(instance_kill_count):
            affected_node = AffectedNode(node)
            try:
                logging.info("Starting node_stop_scenario injection")
                vm_id = self.alibaba.get_instance_id(node)
                affected_node.node_id = vm_id
                logging.info(
                    "Stopping the node %s with instance ID: %s " % (node, vm_id)
                )
                self.alibaba.stop_instances(vm_id)
                self.alibaba.wait_until_stopped(vm_id, timeout)
                self.alibaba.wait_until_stopped(vm_id, timeout, affected_node)
                logging.info("Node with instance ID: %s is in stopped state" % vm_id)
                nodeaction.wait_for_unknown_status(node, timeout, self.kubecli)
                nodeaction.wait_for_unknown_status(node, timeout, self.kubecli, affected_node)
            except Exception as e:
                logging.error(
                    "Failed to stop node instance. Encountered following exception: %s. "
@@ -260,23 +279,26 @@ class alibaba_node_scenarios(abstract_node_scenarios):
                )
                logging.error("node_stop_scenario injection failed!")
                raise e
            self.affected_nodes_status.affected_nodes.append(affected_node)

    # Might need to stop and then release the instance
    # Node scenario to terminate the node
    def node_termination_scenario(self, instance_kill_count, node, timeout):
        for _ in range(instance_kill_count):
            affected_node = AffectedNode(node)
            try:
                logging.info(
                    "Starting node_termination_scenario injection by first stopping instance"
                )
                vm_id = self.alibaba.get_instance_id(node)
                affected_node.node_id = vm_id
                self.alibaba.stop_instances(vm_id)
                self.alibaba.wait_until_stopped(vm_id, timeout)
                self.alibaba.wait_until_stopped(vm_id, timeout, affected_node)
                logging.info(
                    "Releasing the node %s with instance ID: %s " % (node, vm_id)
                )
                self.alibaba.release_instance(vm_id)
                self.alibaba.wait_until_released(vm_id, timeout)
                self.alibaba.wait_until_released(vm_id, timeout, affected_node)
                logging.info("Node with instance ID: %s has been released" % node)
                logging.info(
                    "node_termination_scenario has been successfully injected!"
@@ -288,17 +310,20 @@ class alibaba_node_scenarios(abstract_node_scenarios):
                )
                logging.error("node_termination_scenario injection failed!")
                raise e
            self.affected_nodes_status.affected_nodes.append(affected_node)

    # Node scenario to reboot the node
    def node_reboot_scenario(self, instance_kill_count, node, timeout):
        for _ in range(instance_kill_count):
            affected_node = AffectedNode(node)
            try:
                logging.info("Starting node_reboot_scenario injection")
                instance_id = self.alibaba.get_instance_id(node)
                affected_node.node_id = instance_id
                logging.info("Rebooting the node with instance ID: %s " % (instance_id))
                self.alibaba.reboot_instances(instance_id)
                nodeaction.wait_for_unknown_status(node, timeout, self.kubecli)
                nodeaction.wait_for_ready_status(node, timeout, self.kubecli)
                nodeaction.wait_for_unknown_status(node, timeout, self.kubecli, affected_node)
                nodeaction.wait_for_ready_status(node, timeout, self.kubecli, affected_node)
                logging.info(
                    "Node with instance ID: %s has been rebooted" % (instance_id)
                )
@@ -310,3 +335,4 @@ class alibaba_node_scenarios(abstract_node_scenarios):
                )
                logging.error("node_reboot_scenario injection failed!")
                raise e
            self.affected_nodes_status.affected_nodes.append(affected_node)
@@ -7,12 +7,13 @@ from krkn.scenario_plugins.node_actions.abstract_node_scenarios import (
    abstract_node_scenarios,
)
from krkn_lib.k8s import KrknKubernetes

from krkn_lib.models.k8s import AffectedNode, AffectedNodeStatus

class AWS:
    def __init__(self):
        self.boto_client = boto3.client("ec2")
        self.boto_instance = boto3.resource("ec2").Instance("id")
        self.boto_resource = boto3.resource("ec2")
        self.boto_instance = self.boto_resource.Instance("id")

    # Get the instance ID of the node
    def get_instance_id(self, node):
@@ -76,9 +77,13 @@ class AWS:
    # until a successful state is reached. An error is returned after 40 failed checks
    # Setting timeout for consistency with other cloud functions
    # Wait until the node instance is running
    def wait_until_running(self, instance_id, timeout=600):
    def wait_until_running(self, instance_id, timeout=600, affected_node=None):
        try:
            start_time = time.time()
            self.boto_instance.wait_until_running(InstanceIds=[instance_id])
            end_time = time.time()
            if affected_node:
                affected_node.set_affected_node_status("running", end_time - start_time)
            return True
        except Exception as e:
            logging.error(
@@ -88,9 +93,13 @@ class AWS:
            return False

    # Wait until the node instance is stopped
    def wait_until_stopped(self, instance_id, timeout=600):
    def wait_until_stopped(self, instance_id, timeout=600, affected_node=None):
        try:
            start_time = time.time()
            self.boto_instance.wait_until_stopped(InstanceIds=[instance_id])
            end_time = time.time()
            if affected_node:
                affected_node.set_affected_node_status("stopped", end_time - start_time)
            return True
        except Exception as e:
            logging.error(
@@ -100,9 +109,13 @@ class AWS:
            return False

    # Wait until the node instance is terminated
    def wait_until_terminated(self, instance_id, timeout=600):
    def wait_until_terminated(self, instance_id, timeout=600, affected_node=None):
        try:
            start_time = time.time()
            self.boto_instance.wait_until_terminated(InstanceIds=[instance_id])
            end_time = time.time()
            if affected_node:
                affected_node.set_affected_node_status("terminated", end_time - start_time)
            return True
        except Exception as e:
            logging.error(
@@ -179,25 +192,93 @@ class AWS:

        raise RuntimeError()

    # Detach volume
    def detach_volumes(self, volumes_ids: list):
        for volume in volumes_ids:
            try:
                self.boto_client.detach_volume(VolumeId=volume, Force=True)
            except Exception as e:
                logging.error(
                    "Detaching volume %s failed with exception: %s"
                    % (volume, e)
                )

    # Attach volume
    def attach_volume(self, attachment: dict):
        try:
            if self.get_volume_state(attachment["VolumeId"]) == "in-use":
                logging.info(
                    "Volume %s is already in use." % attachment["VolumeId"]
                )
                return
            logging.info(
                "Attaching the %s volumes to instance %s."
                % (attachment["VolumeId"], attachment["InstanceId"])
            )
            self.boto_client.attach_volume(
                InstanceId=attachment["InstanceId"],
                Device=attachment["Device"],
                VolumeId=attachment["VolumeId"]
            )
        except Exception as e:
            logging.error(
                "Failed attaching disk %s to the %s instance. "
                "Encountered following exception: %s"
                % (attachment['VolumeId'], attachment['InstanceId'], e)
            )
            raise RuntimeError()

    # Get IDs of node volumes
    def get_volumes_ids(self, instance_id: list):
        response = self.boto_client.describe_instances(InstanceIds=instance_id)
        instance_attachment_details = response["Reservations"][0]["Instances"][0]["BlockDeviceMappings"]
        root_volume_device_name = self.get_root_volume_id(instance_id)
        volume_ids = []
        for device in instance_attachment_details:
            if device["DeviceName"] != root_volume_device_name:
                volume_id = device["Ebs"]["VolumeId"]
                volume_ids.append(volume_id)
        return volume_ids

    # Get volumes attachment details
    def get_volume_attachment_details(self, volume_ids: list):
        response = self.boto_client.describe_volumes(VolumeIds=volume_ids)
        volumes_details = response["Volumes"]
        return volumes_details

    # Get root volume
    def get_root_volume_id(self, instance_id):
        instance_id = instance_id[0]
        instance = self.boto_resource.Instance(instance_id)
        root_volume_id = instance.root_device_name
        return root_volume_id

    # Get volume state
    def get_volume_state(self, volume_id: str):
        volume = self.boto_resource.Volume(volume_id)
        state = volume.state
        return state

# krkn_lib
class aws_node_scenarios(abstract_node_scenarios):
    def __init__(self, kubecli: KrknKubernetes):
        super().__init__(kubecli)
    def __init__(self, kubecli: KrknKubernetes, affected_nodes_status: AffectedNodeStatus):
        super().__init__(kubecli, affected_nodes_status)
        self.aws = AWS()

    # Node scenario to start the node
    def node_start_scenario(self, instance_kill_count, node, timeout):
        for _ in range(instance_kill_count):
            affected_node = AffectedNode(node)
            try:
                logging.info("Starting node_start_scenario injection")
                instance_id = self.aws.get_instance_id(node)
                affected_node.node_id = instance_id
                logging.info(
                    "Starting the node %s with instance ID: %s " % (node, instance_id)
                )
                self.aws.start_instances(instance_id)
                self.aws.wait_until_running(instance_id)
                nodeaction.wait_for_ready_status(node, timeout, self.kubecli)
                self.aws.wait_until_running(instance_id, affected_node=affected_node)
                nodeaction.wait_for_ready_status(node, timeout, self.kubecli, affected_node)
                logging.info(
                    "Node with instance ID: %s is in running state" % (instance_id)
                )
@@ -210,22 +291,25 @@ class aws_node_scenarios(abstract_node_scenarios):
                logging.error("node_start_scenario injection failed!")

                raise RuntimeError()
            self.affected_nodes_status.affected_nodes.append(affected_node)

    # Node scenario to stop the node
    def node_stop_scenario(self, instance_kill_count, node, timeout):
        for _ in range(instance_kill_count):
            affected_node = AffectedNode(node)
            try:
                logging.info("Starting node_stop_scenario injection")
                instance_id = self.aws.get_instance_id(node)
                affected_node.node_id = instance_id
                logging.info(
                    "Stopping the node %s with instance ID: %s " % (node, instance_id)
                )
                self.aws.stop_instances(instance_id)
                self.aws.wait_until_stopped(instance_id)
                self.aws.wait_until_stopped(instance_id, affected_node=affected_node)
                logging.info(
                    "Node with instance ID: %s is in stopped state" % (instance_id)
                )
                nodeaction.wait_for_unknown_status(node, timeout, self.kubecli)
                nodeaction.wait_for_unknown_status(node, timeout, self.kubecli, affected_node=affected_node)
            except Exception as e:
                logging.error(
                    "Failed to stop node instance. Encountered following exception: %s. "
@@ -234,19 +318,22 @@ class aws_node_scenarios(abstract_node_scenarios):
                logging.error("node_stop_scenario injection failed!")

                raise RuntimeError()
            self.affected_nodes_status.affected_nodes.append(affected_node)

    # Node scenario to terminate the node
    def node_termination_scenario(self, instance_kill_count, node, timeout):
        for _ in range(instance_kill_count):
            affected_node = AffectedNode(node)
            try:
                logging.info("Starting node_termination_scenario injection")
                instance_id = self.aws.get_instance_id(node)
                affected_node.node_id = instance_id
                logging.info(
                    "Terminating the node %s with instance ID: %s "
                    % (node, instance_id)
                )
                self.aws.terminate_instances(instance_id)
                self.aws.wait_until_terminated(instance_id)
                self.aws.wait_until_terminated(instance_id, affected_node=affected_node)
                for _ in range(timeout):
                    if node not in self.kubecli.list_nodes():
                        break
@@ -265,19 +352,22 @@ class aws_node_scenarios(abstract_node_scenarios):
                logging.error("node_termination_scenario injection failed!")

                raise RuntimeError()
            self.affected_nodes_status.affected_nodes.append(affected_node)

    # Node scenario to reboot the node
    def node_reboot_scenario(self, instance_kill_count, node, timeout):
        for _ in range(instance_kill_count):
            affected_node = AffectedNode(node)
            try:
                logging.info("Starting node_reboot_scenario injection" + str(node))
                instance_id = self.aws.get_instance_id(node)
                affected_node.node_id = instance_id
                logging.info(
                    "Rebooting the node %s with instance ID: %s " % (node, instance_id)
                )
                self.aws.reboot_instances(instance_id)
                nodeaction.wait_for_unknown_status(node, timeout, self.kubecli)
                nodeaction.wait_for_ready_status(node, timeout, self.kubecli)
                nodeaction.wait_for_unknown_status(node, timeout, self.kubecli, affected_node)
                nodeaction.wait_for_ready_status(node, timeout, self.kubecli, affected_node)
                logging.info(
                    "Node with instance ID: %s has been rebooted" % (instance_id)
                )
@@ -290,3 +380,50 @@ class aws_node_scenarios(abstract_node_scenarios):
                logging.error("node_reboot_scenario injection failed!")

                raise RuntimeError()
            self.affected_nodes_status.affected_nodes.append(affected_node)

    # Get volume attachment info
    def get_disk_attachment_info(self, instance_kill_count, node):
        for _ in range(instance_kill_count):
            try:
                logging.info("Obtaining disk attachment information")
                instance_id = (self.aws.get_instance_id(node)).split()
                volumes_ids = self.aws.get_volumes_ids(instance_id)
                if volumes_ids:
                    vol_attachment_details = self.aws.get_volume_attachment_details(
                        volumes_ids
                    )
                    return vol_attachment_details
                return
            except Exception as e:
                logging.error(
                    "Failed to obtain disk attachment information of %s node. "
                    "Encountered following exception: %s." % (node, e)
                )
                raise RuntimeError()

    # Node scenario to detach the volume
    def disk_detach_scenario(self, instance_kill_count, node, timeout):
        for _ in range(instance_kill_count):
            try:
                logging.info("Starting disk_detach_scenario injection")
                instance_id = (self.aws.get_instance_id(node)).split()
                volumes_ids = self.aws.get_volumes_ids(instance_id)
                logging.info(
                    "Detaching the %s volumes from instance %s "
                    % (volumes_ids, node)
                )
                self.aws.detach_volumes(volumes_ids)
            except Exception as e:
                logging.error(
                    "Failed to detach disk from %s node. Encountered following "
                    "exception: %s." % (node, e)
                )
                logging.debug("")
                raise RuntimeError()

    # Node scenario to attach the volume
    def disk_attach_scenario(self, instance_kill_count, attachment_details, timeout):
        for _ in range(instance_kill_count):
            for attachment in attachment_details:
                self.aws.attach_volume(attachment["Attachments"][0])
@@ -8,7 +8,7 @@ from krkn.scenario_plugins.node_actions.abstract_node_scenarios import (
from azure.mgmt.compute import ComputeManagementClient
from azure.identity import DefaultAzureCredential
from krkn_lib.k8s import KrknKubernetes

from krkn_lib.models.k8s import AffectedNode, AffectedNodeStatus

class Azure:
    def __init__(self):
@@ -18,8 +18,11 @@ class Azure:
        logging.info("credential " + str(credentials))
        # az_account = runcommand.invoke("az account list -o yaml")
        # az_account_yaml = yaml.safe_load(az_account, Loader=yaml.FullLoader)
        logger = logging.getLogger("azure")
        logger.setLevel(logging.WARNING)
        subscription_id = os.getenv("AZURE_SUBSCRIPTION_ID")
        self.compute_client = ComputeManagementClient(credentials, subscription_id)
        self.compute_client = ComputeManagementClient(credentials, subscription_id, logging=logger)

    # Get the instance ID of the node
    def get_instance_id(self, node_name):
@@ -90,8 +93,9 @@ class Azure:
        return status

    # Wait until the node instance is running
    def wait_until_running(self, resource_group, vm_name, timeout):
    def wait_until_running(self, resource_group, vm_name, timeout, affected_node):
        time_counter = 0
        start_time = time.time()
        status = self.get_vm_status(resource_group, vm_name)
        while status and status.code != "PowerState/running":
            status = self.get_vm_status(resource_group, vm_name)
@@ -101,11 +105,15 @@ class Azure:
            if time_counter >= timeout:
                logging.info("Vm %s is still not ready in allotted time" % vm_name)
                return False
        end_time = time.time()
        if affected_node:
            affected_node.set_affected_node_status("running", end_time - start_time)
        return True

    # Wait until the node instance is stopped
    def wait_until_stopped(self, resource_group, vm_name, timeout):
    def wait_until_stopped(self, resource_group, vm_name, timeout, affected_node):
        time_counter = 0
        start_time = time.time()
        status = self.get_vm_status(resource_group, vm_name)
        while status and status.code != "PowerState/stopped":
            status = self.get_vm_status(resource_group, vm_name)
@@ -115,10 +123,14 @@ class Azure:
            if time_counter >= timeout:
                logging.info("Vm %s is still not stopped in allotted time" % vm_name)
                return False
        end_time = time.time()
        if affected_node:
            affected_node.set_affected_node_status("stopped", end_time - start_time)
        return True

    # Wait until the node instance is terminated
    def wait_until_terminated(self, resource_group, vm_name, timeout):
    def wait_until_terminated(self, resource_group, vm_name, timeout, affected_node):
        start_time = time.time()
        statuses = self.compute_client.virtual_machines.instance_view(
            resource_group, vm_name
        ).statuses[0]
@@ -137,29 +149,35 @@ class Azure:
            return False
        except Exception:
            logging.info("Vm %s is terminated" % vm_name)
            end_time = time.time()
            if affected_node:
                affected_node.set_affected_node_status("terminated", end_time - start_time)
            return True


# krkn_lib
class azure_node_scenarios(abstract_node_scenarios):
    def __init__(self, kubecli: KrknKubernetes):
        super().__init__(kubecli)
    def __init__(self, kubecli: KrknKubernetes, affected_nodes_status: AffectedNodeStatus):
        super().__init__(kubecli, affected_nodes_status)
        logging.info("init in azure")
        self.azure = Azure()

    # Node scenario to start the node
    def node_start_scenario(self, instance_kill_count, node, timeout):
        for _ in range(instance_kill_count):
            affected_node = AffectedNode(node)
            try:
                logging.info("Starting node_start_scenario injection")
                vm_name, resource_group = self.azure.get_instance_id(node)
                affected_node.node_id = vm_name
                logging.info(
                    "Starting the node %s with instance ID: %s "
                    % (vm_name, resource_group)
                )
                self.azure.start_instances(resource_group, vm_name)
                self.azure.wait_until_running(resource_group, vm_name, timeout)
                nodeaction.wait_for_ready_status(vm_name, timeout, self.kubecli)
                self.azure.wait_until_running(resource_group, vm_name, timeout, affected_node=affected_node)
                nodeaction.wait_for_ready_status(vm_name, timeout, self.kubecli, affected_node)
                logging.info("Node with instance ID: %s is in running state" % node)
                logging.info("node_start_scenario has been successfully injected!")
            except Exception as e:
@@ -170,21 +188,24 @@ class azure_node_scenarios(abstract_node_scenarios):
                logging.error("node_start_scenario injection failed!")

                raise RuntimeError()
            self.affected_nodes_status.affected_nodes.append(affected_node)

    # Node scenario to stop the node
    def node_stop_scenario(self, instance_kill_count, node, timeout):
        for _ in range(instance_kill_count):
            affected_node = AffectedNode(node)
            try:
                logging.info("Starting node_stop_scenario injection")
                vm_name, resource_group = self.azure.get_instance_id(node)
                affected_node.node_id = vm_name
                logging.info(
                    "Stopping the node %s with instance ID: %s "
                    % (vm_name, resource_group)
                )
                self.azure.stop_instances(resource_group, vm_name)
                self.azure.wait_until_stopped(resource_group, vm_name, timeout)
                self.azure.wait_until_stopped(resource_group, vm_name, timeout, affected_node=affected_node)
                logging.info("Node with instance ID: %s is in stopped state" % vm_name)
                nodeaction.wait_for_unknown_status(vm_name, timeout, self.kubecli)
                nodeaction.wait_for_unknown_status(vm_name, timeout, self.kubecli, affected_node)
            except Exception as e:
                logging.error(
                    "Failed to stop node instance. Encountered following exception: %s. "
@@ -193,19 +214,22 @@ class azure_node_scenarios(abstract_node_scenarios):
                logging.error("node_stop_scenario injection failed!")

                raise RuntimeError()
            self.affected_nodes_status.affected_nodes.append(affected_node)

    # Node scenario to terminate the node
    def node_termination_scenario(self, instance_kill_count, node, timeout):
        for _ in range(instance_kill_count):
            affected_node = AffectedNode(node)
            try:
                logging.info("Starting node_termination_scenario injection")
                vm_name, resource_group = self.azure.get_instance_id(node)
                affected_node.node_id = vm_name
                logging.info(
                    "Terminating the node %s with instance ID: %s "
                    % (vm_name, resource_group)
                )
                self.azure.terminate_instances(resource_group, vm_name)
                self.azure.wait_until_terminated(resource_group, vm_name, timeout)
                self.azure.wait_until_terminated(resource_group, vm_name, timeout, affected_node)
                for _ in range(timeout):
                    if vm_name not in self.kubecli.list_nodes():
                        break
@@ -224,20 +248,26 @@ class azure_node_scenarios(abstract_node_scenarios):
                logging.error("node_termination_scenario injection failed!")

                raise RuntimeError()
            self.affected_nodes_status.affected_nodes.append(affected_node)


    # Node scenario to reboot the node
    def node_reboot_scenario(self, instance_kill_count, node, timeout):
        for _ in range(instance_kill_count):
            affected_node = AffectedNode(node)
            try:
                logging.info("Starting node_reboot_scenario injection")
                vm_name, resource_group = self.azure.get_instance_id(node)
                affected_node.node_id = vm_name
                logging.info(
                    "Rebooting the node %s with instance ID: %s "
                    % (vm_name, resource_group)
                )

                self.azure.reboot_instances(resource_group, vm_name)
                nodeaction.wait_for_unknown_status(vm_name, timeout, self.kubecli)
                nodeaction.wait_for_ready_status(vm_name, timeout, self.kubecli)

                nodeaction.wait_for_ready_status(vm_name, timeout, self.kubecli, affected_node)

                logging.info("Node with instance ID: %s has been rebooted" % (vm_name))
                logging.info("node_reboot_scenario has been successfully injected!")
            except Exception as e:
@@ -248,3 +278,4 @@ class azure_node_scenarios(abstract_node_scenarios):
                logging.error("node_reboot_scenario injection failed!")

                raise RuntimeError()
            self.affected_nodes_status.affected_nodes.append(affected_node)
@@ -9,7 +9,7 @@ import pyipmi.interfaces
import time
import traceback
from krkn_lib.k8s import KrknKubernetes

from krkn_lib.models.k8s import AffectedNode, AffectedNodeStatus

class BM:
    def __init__(self, bm_info, user, passwd):
@@ -109,40 +109,50 @@ class BM:
        self.get_ipmi_connection(bmc_addr, node_name).chassis_control_power_cycle()

    # Wait until the node instance is running
    def wait_until_running(self, bmc_addr, node_name):
    def wait_until_running(self, bmc_addr, node_name, affected_node):
        start_time = time.time()
        while (
            not self.get_ipmi_connection(bmc_addr, node_name)
            .get_chassis_status()
            .power_on
        ):
            time.sleep(1)
        end_time = time.time()
        if affected_node:
            affected_node.set_affected_node_status("running", end_time - start_time)

    # Wait until the node instance is stopped
    def wait_until_stopped(self, bmc_addr, node_name):
    def wait_until_stopped(self, bmc_addr, node_name, affected_node):
        start_time = time.time()
        while (
            self.get_ipmi_connection(bmc_addr, node_name).get_chassis_status().power_on
        ):
            time.sleep(1)
        end_time = time.time()
        if affected_node:
            affected_node.set_affected_node_status("stopped", end_time - start_time)


# krkn_lib
class bm_node_scenarios(abstract_node_scenarios):
    def __init__(self, bm_info, user, passwd, kubecli: KrknKubernetes):
        super().__init__(kubecli)
    def __init__(self, bm_info, user, passwd, kubecli: KrknKubernetes, affected_nodes_status: AffectedNodeStatus):
        super().__init__(kubecli, affected_nodes_status)
        self.bm = BM(bm_info, user, passwd)

    # Node scenario to start the node
    def node_start_scenario(self, instance_kill_count, node, timeout):
        for _ in range(instance_kill_count):
            affected_node = AffectedNode(node)
            try:
                logging.info("Starting node_start_scenario injection")
                bmc_addr = self.bm.get_bmc_addr(node)
                affected_node.node_id = bmc_addr
                logging.info(
                    "Starting the node %s with bmc address: %s " % (node, bmc_addr)
                )
                self.bm.start_instances(bmc_addr, node)
                self.bm.wait_until_running(bmc_addr, node)
                nodeaction.wait_for_ready_status(node, timeout, self.kubecli)
                self.bm.wait_until_running(bmc_addr, node, affected_node)
                nodeaction.wait_for_ready_status(node, timeout, self.kubecli, affected_node)
                logging.info(
                    "Node with bmc address: %s is in running state" % (bmc_addr)
                )
@@ -155,22 +165,25 @@ class bm_node_scenarios(abstract_node_scenarios):
                )
                logging.error("node_start_scenario injection failed!")
                raise e
            self.affected_nodes_status.affected_nodes.append(affected_node)

    # Node scenario to stop the node
    def node_stop_scenario(self, instance_kill_count, node, timeout):
        for _ in range(instance_kill_count):
            affected_node = AffectedNode(node)
            try:
                logging.info("Starting node_stop_scenario injection")
                bmc_addr = self.bm.get_bmc_addr(node)
                affected_node.node_id = bmc_addr
                logging.info(
                    "Stopping the node %s with bmc address: %s " % (node, bmc_addr)
                )
                self.bm.stop_instances(bmc_addr, node)
                self.bm.wait_until_stopped(bmc_addr, node)
                self.bm.wait_until_stopped(bmc_addr, node, affected_node)
                logging.info(
                    "Node with bmc address: %s is in stopped state" % (bmc_addr)
                )
                nodeaction.wait_for_unknown_status(node, timeout, self.kubecli)
                nodeaction.wait_for_unknown_status(node, timeout, self.kubecli, affected_node)
            except Exception as e:
                logging.error(
                    "Failed to stop node instance. Encountered following exception: %s. "
@@ -179,6 +192,7 @@ class bm_node_scenarios(abstract_node_scenarios):
                )
                logging.error("node_stop_scenario injection failed!")
                raise e
            self.affected_nodes_status.affected_nodes.append(affected_node)

    # Node scenario to terminate the node
    def node_termination_scenario(self, instance_kill_count, node, timeout):
@@ -187,6 +201,7 @@ class bm_node_scenarios(abstract_node_scenarios):
    # Node scenario to reboot the node
    def node_reboot_scenario(self, instance_kill_count, node, timeout):
        for _ in range(instance_kill_count):
            affected_node = AffectedNode(node)
            try:
                logging.info("Starting node_reboot_scenario injection")
                bmc_addr = self.bm.get_bmc_addr(node)
@@ -195,8 +210,8 @@ class bm_node_scenarios(abstract_node_scenarios):
                    "Rebooting the node %s with bmc address: %s " % (node, bmc_addr)
                )
                self.bm.reboot_instances(bmc_addr, node)
                nodeaction.wait_for_unknown_status(node, timeout, self.kubecli)
                nodeaction.wait_for_ready_status(node, timeout, self.kubecli)
                nodeaction.wait_for_unknown_status(node, timeout, self.kubecli, affected_node)
                nodeaction.wait_for_ready_status(node, timeout, self.kubecli, affected_node)
                logging.info("Node with bmc address: %s has been rebooted" % (bmc_addr))
                logging.info("node_reboot_scenario has been successfully injected!")
            except Exception as e:
@@ -208,3 +223,4 @@ class bm_node_scenarios(abstract_node_scenarios):
                traceback.print_exc()
                logging.error("node_reboot_scenario injection failed!")
                raise e
            self.affected_nodes_status.affected_nodes.append(affected_node)
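A hedged usage sketch of the widened constructor (the bm_info shape and credentials below are hypothetical; kubecli is an existing KrknKubernetes client):

from krkn_lib.models.k8s import AffectedNodeStatus

affected_nodes_status = AffectedNodeStatus()
scenarios = bm_node_scenarios(
    bm_info={"node-0": {"bmc_addr": "10.0.0.5"}},  # hypothetical bm_info shape
    user="admin",        # hypothetical BMC user
    passwd="secret",     # hypothetical BMC password
    kubecli=kubecli,
    affected_nodes_status=affected_nodes_status,
)
scenarios.node_reboot_scenario(instance_kill_count=1, node="node-0", timeout=300)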
@@ -1,9 +1,13 @@
import datetime
import time
import random
import logging
import paramiko
from krkn_lib.models.k8s import AffectedNode
import krkn.invoke.command as runcommand
from krkn_lib.k8s import KrknKubernetes
from krkn_lib.models.k8s import AffectedNode, AffectedNodeStatus
from krkn_lib.models.k8s import AffectedNode

node_general = False

@@ -40,23 +44,25 @@ def get_node(label_selector, instance_kill_count, kubecli: KrknKubernetes):
        nodes.remove(node_to_add)
    return nodes_to_return


# krkn_lib
# Wait until the node status becomes Ready
def wait_for_ready_status(node, timeout, kubecli: KrknKubernetes):
    kubecli.watch_node_status(node, "True", timeout)

def wait_for_ready_status(node, timeout, kubecli: KrknKubernetes, affected_node: AffectedNode = None):
    affected_node = kubecli.watch_node_status(node, "True", timeout, affected_node)
    return affected_node


# krkn_lib
# Wait until the node status becomes Not Ready
def wait_for_not_ready_status(node, timeout, kubecli: KrknKubernetes):
    kubecli.watch_node_status(node, "False", timeout)

def wait_for_not_ready_status(node, timeout, kubecli: KrknKubernetes, affected_node: AffectedNode = None):
    affected_node = kubecli.watch_node_status(node, "False", timeout, affected_node)
    return affected_node


# krkn_lib
# Wait until the node status becomes Unknown
def wait_for_unknown_status(node, timeout, kubecli: KrknKubernetes):
    kubecli.watch_node_status(node, "Unknown", timeout)
def wait_for_unknown_status(node, timeout, kubecli: KrknKubernetes, affected_node: AffectedNode = None):
    affected_node = kubecli.watch_node_status(node, "Unknown", timeout, affected_node)
    return affected_node


# Get the ip of the cluster node
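The wait helpers now thread an optional AffectedNode through krkn_lib's watch_node_status and return it, so callers can either ignore the tracking entirely or collect the timed result. A minimal usage sketch (node name hypothetical; kubecli is an existing KrknKubernetes client):

from krkn_lib.models.k8s import AffectedNode

# Untracked call, as before:
wait_for_ready_status("worker-0", 300, kubecli)

# Tracked call: the same AffectedNode comes back with the readiness
# duration recorded on it.
affected_node = wait_for_ready_status("worker-0", 300, kubecli, AffectedNode("worker-0"))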
@@ -5,7 +5,7 @@ from krkn.scenario_plugins.node_actions.abstract_node_scenarios import (
import logging
import docker
from krkn_lib.k8s import KrknKubernetes

from krkn_lib.models.k8s import AffectedNode, AffectedNodeStatus

class Docker:
    def __init__(self):
@@ -38,21 +38,23 @@ class Docker:


class docker_node_scenarios(abstract_node_scenarios):
    def __init__(self, kubecli: KrknKubernetes):
        super().__init__(kubecli)
    def __init__(self, kubecli: KrknKubernetes, affected_nodes_status: AffectedNodeStatus):
        super().__init__(kubecli, affected_nodes_status)
        self.docker = Docker()

    # Node scenario to start the node
    def node_start_scenario(self, instance_kill_count, node, timeout):
        for _ in range(instance_kill_count):
            affected_node = AffectedNode(node)
            try:
                logging.info("Starting node_start_scenario injection")
                container_id = self.docker.get_container_id(node)
                affected_node.node_id = container_id
                logging.info(
                    "Starting the node %s with container ID: %s " % (node, container_id)
                )
                self.docker.start_instances(node)
                nodeaction.wait_for_ready_status(node, timeout, self.kubecli)
                nodeaction.wait_for_ready_status(node, timeout, self.kubecli, affected_node)
                logging.info(
                    "Node with container ID: %s is in running state" % (container_id)
                )
@@ -64,13 +66,16 @@ class docker_node_scenarios(abstract_node_scenarios):
                )
                logging.error("node_start_scenario injection failed!")
                raise e
            self.affected_nodes_status.affected_nodes.append(affected_node)

    # Node scenario to stop the node
    def node_stop_scenario(self, instance_kill_count, node, timeout):
        for _ in range(instance_kill_count):
            affected_node = AffectedNode(node)
            try:
                logging.info("Starting node_stop_scenario injection")
                container_id = self.docker.get_container_id(node)
                affected_node.node_id = container_id
                logging.info(
                    "Stopping the node %s with container ID: %s " % (node, container_id)
                )
@@ -78,7 +83,7 @@ class docker_node_scenarios(abstract_node_scenarios):
                logging.info(
                    "Node with container ID: %s is in stopped state" % (container_id)
                )
                nodeaction.wait_for_unknown_status(node, timeout, self.kubecli)
                nodeaction.wait_for_unknown_status(node, timeout, self.kubecli, affected_node)
            except Exception as e:
                logging.error(
                    "Failed to stop node instance. Encountered following exception: %s. "
@@ -86,6 +91,7 @@ class docker_node_scenarios(abstract_node_scenarios):
                )
                logging.error("node_stop_scenario injection failed!")
                raise e
            self.affected_nodes_status.affected_nodes.append(affected_node)

    # Node scenario to terminate the node
    def node_termination_scenario(self, instance_kill_count, node, timeout):
@@ -113,6 +119,7 @@ class docker_node_scenarios(abstract_node_scenarios):
    # Node scenario to reboot the node
    def node_reboot_scenario(self, instance_kill_count, node, timeout):
        for _ in range(instance_kill_count):
            affected_node = AffectedNode(node)
            try:
                logging.info("Starting node_reboot_scenario injection")
                container_id = self.docker.get_container_id(node)
@@ -121,8 +128,8 @@ class docker_node_scenarios(abstract_node_scenarios):
                    % (node, container_id)
                )
                self.docker.reboot_instances(node)
                nodeaction.wait_for_unknown_status(node, timeout, self.kubecli)
                nodeaction.wait_for_ready_status(node, timeout, self.kubecli)
                nodeaction.wait_for_unknown_status(node, timeout, self.kubecli, affected_node)
                nodeaction.wait_for_ready_status(node, timeout, self.kubecli, affected_node)
                logging.info(
                    "Node with container ID: %s has been rebooted" % (container_id)
                )
@@ -134,3 +141,4 @@ class docker_node_scenarios(abstract_node_scenarios):
                )
                logging.error("node_reboot_scenario injection failed!")
                raise e
            self.affected_nodes_status.affected_nodes.append(affected_node)
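For a docker/kind-style cluster, the Docker wrapper above ultimately drives the docker SDK; a rough sketch of what the reboot path amounts to (container name hypothetical):

import docker

client = docker.from_env()
container = client.containers.get("kind-worker")  # hypothetical node container
container.restart()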
@@ -1,66 +1,78 @@
import os
import sys
import time
import logging
import json
import google.auth
import krkn.scenario_plugins.node_actions.common_node_functions as nodeaction
from krkn.scenario_plugins.node_actions.abstract_node_scenarios import (
    abstract_node_scenarios,
)
from googleapiclient import discovery
from oauth2client.client import GoogleCredentials
from google.cloud import compute_v1
from krkn_lib.k8s import KrknKubernetes

from krkn_lib.models.k8s import AffectedNode, AffectedNodeStatus

class GCP:
    def __init__(self):
        try:
            gapp_creds = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
            with open(gapp_creds, "r") as f:
                f_str = f.read()
                self.project = json.loads(f_str)["project_id"]
            # self.project = runcommand.invoke("gcloud config get-value project").split("/n")[0].strip()
            logging.info("project " + str(self.project) + "!")
            credentials = GoogleCredentials.get_application_default()
            self.client = discovery.build(
                "compute", "v1", credentials=credentials, cache_discovery=False
            )

            _, self.project_id = google.auth.default()
            self.instance_client = compute_v1.InstancesClient()
        except Exception as e:
            logging.error("Error on setting up GCP connection: " + str(e))

            raise e

    # Get the instance ID of the node
    def get_instance_id(self, node):
        zone_request = self.client.zones().list(project=self.project)
        while zone_request is not None:
            zone_response = zone_request.execute()
            for zone in zone_response["items"]:
                instances_request = self.client.instances().list(
                    project=self.project, zone=zone["name"]
                )
                while instances_request is not None:
                    instance_response = instances_request.execute()
                    if "items" in instance_response.keys():
                        for instance in instance_response["items"]:
                            if instance["name"] in node:
                                return instance["name"], zone["name"]
                    instances_request = self.client.zones().list_next(
                        previous_request=instances_request,
                        previous_response=instance_response,
                    )
            zone_request = self.client.zones().list_next(
                previous_request=zone_request, previous_response=zone_response
    # Get the instance of the node
    def get_node_instance(self, node):
        try:
            request = compute_v1.AggregatedListInstancesRequest(
                project=self.project_id
            )
            logging.info("no instances ")
            agg_list = self.instance_client.aggregated_list(request=request)
            for _, response in agg_list:
                if response.instances:
                    for instance in response.instances:
                        if instance.name in node:
                            return instance
            logging.info("no instances ")
        except Exception as e:
            logging.error("Error getting the instance of the node: " + str(e))

            raise e

    # Get the instance name
    def get_instance_name(self, instance):
        if instance.name:
            return instance.name

    # Get the instance zone
    def get_instance_zone(self, instance):
        if instance.zone:
            return instance.zone.split("/")[-1]

    # Get the instance zone of the node
    def get_node_instance_zone(self, node):
        instance = self.get_node_instance(node)
        if instance:
            return self.get_instance_zone(instance)

    # Get the instance name of the node
    def get_node_instance_name(self, node):
        instance = self.get_node_instance(node)
        if instance:
            return self.get_instance_name(instance)

    # Get the instance ID of the node
    def get_instance_id(self, node):
        return self.get_node_instance_name(node)

    # Start the node instance
    def start_instances(self, zone, instance_id):
    def start_instances(self, instance_id):
        try:
            self.client.instances().start(
                project=self.project, zone=zone, instance=instance_id
            ).execute()
            logging.info("vm name " + str(instance_id) + " started")
            request = compute_v1.StartInstanceRequest(
                instance=instance_id,
                project=self.project_id,
                zone=self.get_node_instance_zone(instance_id),
            )
            self.instance_client.start(request=request)
            logging.info("Instance: " + str(instance_id) + " started")
        except Exception as e:
            logging.error(
                "Failed to start node instance %s. Encountered following "
@@ -70,12 +82,15 @@ class GCP:
            raise RuntimeError()

    # Stop the node instance
    def stop_instances(self, zone, instance_id):
    def stop_instances(self, instance_id):
        try:
            self.client.instances().stop(
                project=self.project, zone=zone, instance=instance_id
            ).execute()
            logging.info("vm name " + str(instance_id) + " stopped")
            request = compute_v1.StopInstanceRequest(
                instance=instance_id,
                project=self.project_id,
                zone=self.get_node_instance_zone(instance_id),
            )
            self.instance_client.stop(request=request)
            logging.info("Instance: " + str(instance_id) + " stopped")
        except Exception as e:
            logging.error(
                "Failed to stop node instance %s. Encountered following "
@@ -84,13 +99,16 @@ class GCP:

            raise RuntimeError()

    # Start the node instance
    def suspend_instances(self, zone, instance_id):
    # Suspend the node instance
    def suspend_instances(self, instance_id):
        try:
            self.client.instances().suspend(
                project=self.project, zone=zone, instance=instance_id
            ).execute()
            logging.info("vm name " + str(instance_id) + " suspended")
            request = compute_v1.SuspendInstanceRequest(
                instance=instance_id,
                project=self.project_id,
                zone=self.get_node_instance_zone(instance_id),
            )
            self.instance_client.suspend(request=request)
            logging.info("Instance: " + str(instance_id) + " suspended")
        except Exception as e:
            logging.error(
                "Failed to suspend node instance %s. Encountered following "
@@ -100,49 +118,65 @@ class GCP:
            raise RuntimeError()

    # Terminate the node instance
    def terminate_instances(self, zone, instance_id):
    def terminate_instances(self, instance_id):
        try:
            self.client.instances().delete(
                project=self.project, zone=zone, instance=instance_id
            ).execute()
            logging.info("vm name " + str(instance_id) + " terminated")
            request = compute_v1.DeleteInstanceRequest(
                instance=instance_id,
                project=self.project_id,
                zone=self.get_node_instance_zone(instance_id),
            )
            self.instance_client.delete(request=request)
            logging.info("Instance: " + str(instance_id) + " terminated")
        except Exception as e:
            logging.error(
                "Failed to start node instance %s. Encountered following "
                "Failed to terminate node instance %s. Encountered following "
                "exception: %s." % (instance_id, e)
            )

            raise RuntimeError()

    # Reboot the node instance
    def reboot_instances(self, zone, instance_id):
    def reboot_instances(self, instance_id):
        try:
            self.client.instances().reset(
                project=self.project, zone=zone, instance=instance_id
            ).execute()
            logging.info("vm name " + str(instance_id) + " rebooted")
            request = compute_v1.ResetInstanceRequest(
                instance=instance_id,
                project=self.project_id,
                zone=self.get_node_instance_zone(instance_id),
            )
            self.instance_client.reset(request=request)
            logging.info("Instance: " + str(instance_id) + " rebooted")
        except Exception as e:
            logging.error(
                "Failed to start node instance %s. Encountered following "
                "Failed to reboot node instance %s. Encountered following "
                "exception: %s." % (instance_id, e)
            )

            raise RuntimeError()

    # Get instance status
    def get_instance_status(self, zone, instance_id, expected_status, timeout):
        # statuses: PROVISIONING, STAGING, RUNNING, STOPPING, SUSPENDING, SUSPENDED, REPAIRING,
    def get_instance_status(self, instance_id, expected_status, timeout):
        # states: PROVISIONING, STAGING, RUNNING, STOPPING, SUSPENDING, SUSPENDED, REPAIRING,
        # and TERMINATED.
        i = 0
        sleeper = 5
        while i <= timeout:
            instStatus = (
                self.client.instances()
                .get(project=self.project, zone=zone, instance=instance_id)
                .execute()
            )
            logging.info("Status of vm " + str(instStatus["status"]))
            if instStatus["status"] == expected_status:
            try:
                request = compute_v1.GetInstanceRequest(
                    instance=instance_id,
                    project=self.project_id,
                    zone=self.get_node_instance_zone(instance_id),
                )
                instance_status = self.instance_client.get(request=request).status
                logging.info("Status of instance " + str(instance_id) + ": " + instance_status)
            except Exception as e:
                logging.error(
                    "Failed to get status of instance %s. Encountered following "
                    "exception: %s." % (instance_id, e)
                )
                raise RuntimeError()

            if instance_status == expected_status:
                logging.info("status matches, end " + str(expected_status) + " " + str(instance_status))
                return True
            time.sleep(sleeper)
            i += sleeper
@@ -153,53 +187,59 @@ class GCP:
        return False

    # Wait until the node instance is suspended
    def wait_until_suspended(self, zone, instance_id, timeout):
        return self.get_instance_status(zone, instance_id, "SUSPENDED", timeout)
    def wait_until_suspended(self, instance_id, timeout):
        return self.get_instance_status(instance_id, "SUSPENDED", timeout)

    # Wait until the node instance is running
    def wait_until_running(self, zone, instance_id, timeout):
        return self.get_instance_status(zone, instance_id, "RUNNING", timeout)
    def wait_until_running(self, instance_id, timeout, affected_node):
        start_time = time.time()
        instance_status = self.get_instance_status(instance_id, "RUNNING", timeout)
        end_time = time.time()
        if affected_node:
            affected_node.set_affected_node_status("running", end_time - start_time)
        return instance_status

    # Wait until the node instance is stopped
    def wait_until_stopped(self, zone, instance_id, timeout):
        return self.get_instance_status(zone, instance_id, "TERMINATED", timeout)
    def wait_until_stopped(self, instance_id, timeout, affected_node):
        # In GCP, the next state after STOPPING is TERMINATED
        start_time = time.time()
        instance_status = self.get_instance_status(instance_id, "TERMINATED", timeout)
        end_time = time.time()
        if affected_node:
            affected_node.set_affected_node_status("stopped", end_time - start_time)
        return instance_status

    # Wait until the node instance is terminated
    def wait_until_terminated(self, zone, instance_id, timeout):
        try:
            i = 0
            sleeper = 5
            while i <= timeout:
                instStatus = (
                    self.client.instances()
                    .get(project=self.project, zone=zone, instance=instance_id)
                    .execute()
                )
                logging.info("Status of vm " + str(instStatus["status"]))
                time.sleep(sleeper)
        except Exception as e:
            logging.info("here " + str(e))
            return True
    def wait_until_terminated(self, instance_id, timeout, affected_node):
        start_time = time.time()
        instance_status = self.get_instance_status(instance_id, "TERMINATED", timeout)
        end_time = time.time()
        if affected_node:
            affected_node.set_affected_node_status("terminated", end_time - start_time)
        return instance_status


# krkn_lib
class gcp_node_scenarios(abstract_node_scenarios):
    def __init__(self, kubecli: KrknKubernetes):
        super().__init__(kubecli)
    def __init__(self, kubecli: KrknKubernetes, affected_nodes_status: AffectedNodeStatus):
        super().__init__(kubecli, affected_nodes_status)
        self.gcp = GCP()

    # Node scenario to start the node
    def node_start_scenario(self, instance_kill_count, node, timeout):
        for _ in range(instance_kill_count):
            affected_node = AffectedNode(node)
            try:
                logging.info("Starting node_start_scenario injection")
                instance_id, zone = self.gcp.get_instance_id(node)
                instance = self.gcp.get_node_instance(node)
                instance_id = self.gcp.get_instance_name(instance)
                affected_node.node_id = instance_id
                logging.info(
                    "Starting the node %s with instance ID: %s " % (node, instance_id)
                )
                self.gcp.start_instances(zone, instance_id)
                self.gcp.wait_until_running(zone, instance_id, timeout)
                nodeaction.wait_for_ready_status(node, timeout, self.kubecli)
                self.gcp.start_instances(instance_id)
                self.gcp.wait_until_running(instance_id, timeout, affected_node)
                nodeaction.wait_for_ready_status(node, timeout, self.kubecli, affected_node)
                logging.info(
                    "Node with instance ID: %s is in running state" % instance_id
                )
@@ -212,23 +252,26 @@ class gcp_node_scenarios(abstract_node_scenarios):
                logging.error("node_start_scenario injection failed!")

                raise RuntimeError()
            self.affected_nodes_status.affected_nodes.append(affected_node)

    # Node scenario to stop the node
    def node_stop_scenario(self, instance_kill_count, node, timeout):
        logging.info("stop scenario")
        for _ in range(instance_kill_count):
            affected_node = AffectedNode(node)
            try:
                logging.info("Starting node_stop_scenario injection")
                instance_id, zone = self.gcp.get_instance_id(node)
                instance = self.gcp.get_node_instance(node)
                instance_id = self.gcp.get_instance_name(instance)
                affected_node.node_id = instance_id
                logging.info(
                    "Stopping the node %s with instance ID: %s " % (node, instance_id)
                )
                self.gcp.stop_instances(zone, instance_id)
                self.gcp.wait_until_stopped(zone, instance_id, timeout)
                self.gcp.stop_instances(instance_id)
                self.gcp.wait_until_stopped(instance_id, timeout, affected_node=affected_node)
                logging.info(
                    "Node with instance ID: %s is in stopped state" % instance_id
                )
                nodeaction.wait_for_unknown_status(node, timeout, self.kubecli)
                nodeaction.wait_for_unknown_status(node, timeout, self.kubecli, affected_node)
            except Exception as e:
                logging.error(
                    "Failed to stop node instance. Encountered following exception: %s. "
@@ -237,19 +280,23 @@ class gcp_node_scenarios(abstract_node_scenarios):
                logging.error("node_stop_scenario injection failed!")

                raise RuntimeError()
            self.affected_nodes_status.affected_nodes.append(affected_node)

    # Node scenario to terminate the node
    def node_termination_scenario(self, instance_kill_count, node, timeout):
        for _ in range(instance_kill_count):
            affected_node = AffectedNode(node)
            try:
                logging.info("Starting node_termination_scenario injection")
                instance_id, zone = self.gcp.get_instance_id(node)
                instance = self.gcp.get_node_instance(node)
                instance_id = self.gcp.get_instance_name(instance)
                affected_node.node_id = instance_id
                logging.info(
                    "Terminating the node %s with instance ID: %s "
                    % (node, instance_id)
                )
                self.gcp.terminate_instances(zone, instance_id)
                self.gcp.wait_until_terminated(zone, instance_id, timeout)
                self.gcp.terminate_instances(instance_id)
                self.gcp.wait_until_terminated(instance_id, timeout, affected_node=affected_node)
                for _ in range(timeout):
                    if node not in self.kubecli.list_nodes():
                        break
@@ -267,20 +314,25 @@ class gcp_node_scenarios(abstract_node_scenarios):
                )
                logging.error("node_termination_scenario injection failed!")


                raise e
                raise RuntimeError()
            self.affected_nodes_status.affected_nodes.append(affected_node)

    # Node scenario to reboot the node
    def node_reboot_scenario(self, instance_kill_count, node, timeout):
        for _ in range(instance_kill_count):
            affected_node = AffectedNode(node)
            try:
                logging.info("Starting node_reboot_scenario injection")
                instance_id, zone = self.gcp.get_instance_id(node)
                instance = self.gcp.get_node_instance(node)
                instance_id = self.gcp.get_instance_name(instance)
                affected_node.node_id = instance_id
                logging.info(
                    "Rebooting the node %s with instance ID: %s " % (node, instance_id)
                )
                self.gcp.reboot_instances(zone, instance_id)
                nodeaction.wait_for_ready_status(node, timeout, self.kubecli)
                self.gcp.reboot_instances(instance_id)
                nodeaction.wait_for_unknown_status(node, timeout, self.kubecli, affected_node)
                self.gcp.wait_until_running(instance_id, timeout, affected_node)
                nodeaction.wait_for_ready_status(node, timeout, self.kubecli, affected_node)
                logging.info(
                    "Node with instance ID: %s has been rebooted" % instance_id
                )
@@ -293,3 +345,4 @@ class gcp_node_scenarios(abstract_node_scenarios):
                logging.error("node_reboot_scenario injection failed!")

                raise RuntimeError()
            self.affected_nodes_status.affected_nodes.append(affected_node)
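The GCP refactor above drops the explicit zone parameter: every mutation now resolves the zone on demand via get_node_instance_zone, which walks the aggregated instance list. That simplifies call sites at the cost of an extra aggregated-list lookup per operation. A sketch of the resolution path with the compute_v1 client (project and instance names hypothetical):

from google.cloud import compute_v1

instance_client = compute_v1.InstancesClient()
request = compute_v1.AggregatedListInstancesRequest(project="my-project")
for _, scoped_list in instance_client.aggregated_list(request=request):
    if scoped_list.instances:
        for instance in scoped_list.instances:
            if instance.name == "gke-node-0":
                # instance.zone is a URL; the final segment is the zone name,
                # e.g. "us-central1-a"
                zone = instance.zone.split("/")[-1]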
@@ -3,7 +3,7 @@ from krkn.scenario_plugins.node_actions.abstract_node_scenarios import (
    abstract_node_scenarios,
)
from krkn_lib.k8s import KrknKubernetes

from krkn_lib.models.k8s import AffectedNodeStatus

class GENERAL:
    def __init__(self):
@@ -12,8 +12,8 @@ class GENERAL:

# krkn_lib
class general_node_scenarios(abstract_node_scenarios):
    def __init__(self, kubecli: KrknKubernetes):
        super().__init__(kubecli)
    def __init__(self, kubecli: KrknKubernetes, affected_nodes_status: AffectedNodeStatus):
        super().__init__(kubecli, affected_nodes_status)
        self.general = GENERAL()

    # Node scenario to start the node
krkn/scenario_plugins/node_actions/ibmcloud_node_scenarios.py (new file, 367 lines)
@@ -0,0 +1,367 @@
#!/usr/bin/env python
import time
import typing
from os import environ
from dataclasses import dataclass, field
from traceback import format_exc
import logging

from krkn_lib.k8s import KrknKubernetes
import krkn.scenario_plugins.node_actions.common_node_functions as nodeaction
from krkn.scenario_plugins.node_actions.abstract_node_scenarios import (
    abstract_node_scenarios,
)
from kubernetes import client, watch
from ibm_vpc import VpcV1
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
import sys

from krkn_lib.models.k8s import AffectedNodeStatus, AffectedNode


class IbmCloud:
    def __init__(self):
        """
        Initialize the ibm cloud client by using the env variables:
        'IBMC_APIKEY' 'IBMC_URL'
        """
        apiKey = environ.get("IBMC_APIKEY")
        service_url = environ.get("IBMC_URL")
        if not apiKey:
            raise Exception("Environment variable 'IBMC_APIKEY' is not set")
        if not service_url:
            raise Exception("Environment variable 'IBMC_URL' is not set")
        try:
            authenticator = IAMAuthenticator(apiKey)
            self.service = VpcV1(authenticator=authenticator)

            self.service.set_service_url(service_url)
        except Exception as e:
            logging.error("error authenticating: " + str(e))

    # Get the instance ID of the node
    def get_instance_id(self, node_name):
        node_list = self.list_instances()
        for node in node_list:
            if node_name == node["vpc_name"]:
                return node["vpc_id"]
        logging.error("Couldn't find node with name " + str(node_name) + ", you could try another region")
        sys.exit(1)

    def delete_instance(self, instance_id):
        """
        Deletes the Instance whose name is given by 'instance_id'
        """
        try:
            self.service.delete_instance(instance_id)
            logging.info("Deleted Instance -- '{}'".format(instance_id))
        except Exception as e:
            logging.info("Instance '{}' could not be deleted. ".format(instance_id))
            return False

    def reboot_instances(self, instance_id):
        """
        Reboots the Instance whose name is given by 'instance_id'. Returns True if successful, or
        returns False if the Instance is not powered on
        """
        try:
            self.service.create_instance_action(
                instance_id,
                type="reboot",
            )
            logging.info("Reset Instance -- '{}'".format(instance_id))
            return True
        except Exception as e:
            logging.info("Instance '{}' could not be rebooted".format(instance_id))
            return False

    def stop_instances(self, instance_id):
        """
        Stops the Instance whose name is given by 'instance_id'. Returns True if successful, or
        returns False if the Instance is already stopped
        """
        try:
            self.service.create_instance_action(
                instance_id,
                type="stop",
            )
            logging.info("Stopped Instance -- '{}'".format(instance_id))
            return True
        except Exception as e:
            logging.info("Instance '{}' could not be stopped".format(instance_id))
            logging.info("error: " + str(e))
            return False

    def start_instances(self, instance_id):
        """
        Starts the Instance whose name is given by 'instance_id'. Returns True if successful, or
        returns False if the Instance is already running
        """
        try:
            self.service.create_instance_action(
                instance_id,
                type="start",
            )
            logging.info("Started Instance -- '{}'".format(instance_id))
            return True
        except Exception as e:
            logging.info("Instance '{}' could not start running".format(instance_id))
            return False

    def list_instances(self):
        """
        Returns a list of Instances present in the datacenter
        """
        instance_names = []
        try:
            instances_result = self.service.list_instances().get_result()
            instances_list = instances_result["instances"]
            for vpc in instances_list:
                instance_names.append({"vpc_name": vpc["name"], "vpc_id": vpc["id"]})
            starting_count = instances_result["total_count"]
            while instances_result["total_count"] == instances_result["limit"]:
                instances_result = self.service.list_instances(
                    start=starting_count
                ).get_result()
                instances_list = instances_result["instances"]
                starting_count += instances_result["total_count"]
                for vpc in instances_list:
                    instance_names.append({"vpc_name": vpc.name, "vpc_id": vpc.id})
        except Exception as e:
            logging.error("Error listing out instances: " + str(e))
            sys.exit(1)
        return instance_names

    def find_id_in_list(self, name, vpc_list):
        for vpc in vpc_list:
            if vpc["vpc_name"] == name:
                return vpc["vpc_id"]

    def get_instance_status(self, instance_id):
        """
        Returns the status of the Instance whose name is given by 'instance_id'
        """
        try:
            instance = self.service.get_instance(instance_id).get_result()
            state = instance["status"]
            return state
        except Exception as e:
            logging.error(
                "Failed to get node instance status %s. Encountered following "
                "exception: %s." % (instance_id, e)
            )
            return None

    def wait_until_deleted(self, instance_id, timeout, affected_node=None):
        """
        Waits until the instance is deleted or until the timeout. Returns True if
        the instance is successfully deleted, else returns False
        """
        start_time = time.time()
        time_counter = 0
        vpc = self.get_instance_status(instance_id)
        while vpc is not None:
            vpc = self.get_instance_status(instance_id)
            logging.info(
                "Instance %s is still being deleted, sleeping for 5 seconds"
                % instance_id
            )
            time.sleep(5)
            time_counter += 5
            if time_counter >= timeout:
                logging.info(
                    "Instance %s is still not deleted in allotted time" % instance_id
                )
                return False
        end_time = time.time()
        if affected_node:
            affected_node.set_affected_node_status("terminated", end_time - start_time)
        return True

    def wait_until_running(self, instance_id, timeout, affected_node=None):
        """
        Waits until the Instance switches to running state or until the timeout.
        Returns True if the Instance switches to running, else returns False
        """
        start_time = time.time()
        time_counter = 0
        status = self.get_instance_status(instance_id)
        while status != "running":
            status = self.get_instance_status(instance_id)
            logging.info(
                "Instance %s is still not running, sleeping for 5 seconds" % instance_id
            )
            time.sleep(5)
            time_counter += 5
            if time_counter >= timeout:
                logging.info(
                    "Instance %s is still not ready in allotted time" % instance_id
                )
                return False
        end_time = time.time()
        if affected_node:
            affected_node.set_affected_node_status("running", end_time - start_time)
        return True

    def wait_until_stopped(self, instance_id, timeout, affected_node):
        """
        Waits until the Instance switches to stopped state or until the timeout.
        Returns True if the Instance switches to stopped, else returns False
        """
        start_time = time.time()
        time_counter = 0
        status = self.get_instance_status(instance_id)
        while status != "stopped":
            status = self.get_instance_status(instance_id)
            logging.info(
                "Instance %s is still not stopped, sleeping for 5 seconds" % instance_id
            )
            time.sleep(5)
            time_counter += 5
            if time_counter >= timeout:
                logging.info(
                    "Instance %s is still not stopped in allotted time" % instance_id
                )
                return False
        end_time = time.time()
        if affected_node:
            affected_node.set_affected_node_status("stopped", end_time - start_time)
        return True


    def wait_until_rebooted(self, instance_id, timeout, affected_node):
        """
        Waits until the Instance switches to restarting state and then running state or until the timeout.
        Returns True if the Instance switches back to running, else returns False
        """
        time_counter = 0
        status = self.get_instance_status(instance_id)
        while status == "starting":
            status = self.get_instance_status(instance_id)
            logging.info(
                "Instance %s is still restarting, sleeping for 5 seconds" % instance_id
            )
            time.sleep(5)
            time_counter += 5
            if time_counter >= timeout:
                logging.info(
                    "Instance %s is still restarting after allotted time" % instance_id
                )
                return False
        self.wait_until_running(instance_id, timeout, affected_node)
        return True


@dataclass
class ibm_node_scenarios(abstract_node_scenarios):
    def __init__(self, kubecli: KrknKubernetes, affected_nodes_status: AffectedNodeStatus):
        super().__init__(kubecli, affected_nodes_status)
        self.ibmcloud = IbmCloud()

    def node_start_scenario(self, instance_kill_count, node, timeout):
        try:
            instance_id = self.ibmcloud.get_instance_id(node)
            affected_node = AffectedNode(node, node_id=instance_id)
            for _ in range(instance_kill_count):
                logging.info("Starting node_start_scenario injection")
                logging.info("Starting the node %s " % (node))

                if instance_id:
                    vm_started = self.ibmcloud.start_instances(instance_id)
                    if vm_started:
                        self.ibmcloud.wait_until_running(instance_id, timeout, affected_node)
                        nodeaction.wait_for_ready_status(
                            node, timeout, self.kubecli, affected_node
                        )
                    logging.info(
                        "Node with instance ID: %s is in running state" % node
                    )
                    logging.info(
                        "node_start_scenario has been successfully injected!"
                    )
                else:
                    logging.error(
                        "Failed to find node that matched instances on ibm cloud in region"
                    )

        except Exception as e:
            logging.error("Failed to start node instance. Test Failed")
            logging.error("node_start_scenario injection failed!")
        self.affected_nodes_status.affected_nodes.append(affected_node)


    def node_stop_scenario(self, instance_kill_count, node, timeout):
        try:
            instance_id = self.ibmcloud.get_instance_id(node)
            for _ in range(instance_kill_count):
                affected_node = AffectedNode(node, instance_id)
                logging.info("Starting node_stop_scenario injection")
                logging.info("Stopping the node %s " % (node))
                vm_stopped = self.ibmcloud.stop_instances(instance_id)
                if vm_stopped:
                    self.ibmcloud.wait_until_stopped(instance_id, timeout, affected_node)
                    logging.info(
                        "Node with instance ID: %s is in stopped state" % node
                    )
                    logging.info(
                        "node_stop_scenario has been successfully injected!"
                    )
        except Exception as e:
            logging.error("Failed to stop node instance. Test Failed")
            logging.error("node_stop_scenario injection failed!")


    def node_reboot_scenario(self, instance_kill_count, node, timeout):
        try:
            instance_id = self.ibmcloud.get_instance_id(node)
            for _ in range(instance_kill_count):
                affected_node = AffectedNode(node, node_id=instance_id)
                logging.info("Starting node_reboot_scenario injection")
                logging.info("Rebooting the node %s " % (node))
                self.ibmcloud.reboot_instances(instance_id)
                self.ibmcloud.wait_until_rebooted(instance_id, timeout, affected_node)
                nodeaction.wait_for_unknown_status(
                    node, timeout, self.kubecli, affected_node
                )
                nodeaction.wait_for_ready_status(
                    node, timeout, self.kubecli, affected_node
                )
                logging.info(
                    "Node with instance ID: %s has rebooted successfully" % node
                )
                logging.info(
                    "node_reboot_scenario has been successfully injected!"
                )

        except Exception as e:
            logging.error("Failed to reboot node instance. Test Failed")
            logging.error("node_reboot_scenario injection failed!")


    def node_terminate_scenario(self, instance_kill_count, node, timeout):
        try:
            instance_id = self.ibmcloud.get_instance_id(node)
            for _ in range(instance_kill_count):
                affected_node = AffectedNode(node, node_id=instance_id)
                logging.info(
                    "Starting node_termination_scenario injection by first stopping the node"
                )
                logging.info("Deleting the node with instance ID: %s " % (node))
                self.ibmcloud.delete_instance(instance_id)
                self.ibmcloud.wait_until_deleted(instance_id, timeout, affected_node)
                logging.info(
                    "Node with instance ID: %s has been released" % node
                )
                logging.info(
                    "node_terminate_scenario has been successfully injected!"
                )
        except Exception as e:
            logging.error("Failed to terminate node instance. Test Failed")
            logging.error("node_terminate_scenario injection failed!")
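A hedged setup sketch for the new IBM Cloud scenarios (endpoint and node name hypothetical; the API key must belong to the account that owns the VPC instances):

import os
os.environ["IBMC_APIKEY"] = "<api-key>"
os.environ["IBMC_URL"] = "https://us-south.iaas.cloud.ibm.com/v1"  # hypothetical region endpoint

ibm = IbmCloud()
instance_id = ibm.get_instance_id("worker-0")  # hypothetical node name
if ibm.stop_instances(instance_id):
    # affected_node=None skips recovery tracking; pass an AffectedNode to record it
    ibm.wait_until_stopped(instance_id, timeout=300, affected_node=None)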
@@ -6,6 +6,7 @@ from itertools import repeat
import yaml
from krkn_lib.k8s import KrknKubernetes
from krkn_lib.models.telemetry import ScenarioTelemetry
from krkn_lib.models.k8s import AffectedNodeStatus
from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift
from krkn_lib.utils import get_yaml_item_value, log_exception

@@ -21,7 +22,8 @@ from krkn.scenario_plugins.node_actions.gcp_node_scenarios import gcp_node_scena
from krkn.scenario_plugins.node_actions.general_cloud_node_scenarios import (
    general_node_scenarios,
)

from krkn.scenario_plugins.node_actions.vmware_node_scenarios import vmware_node_scenarios
from krkn.scenario_plugins.node_actions.ibmcloud_node_scenarios import ibm_node_scenarios
node_general = False


@@ -49,6 +51,7 @@ class NodeActionsScenarioPlugin(AbstractScenarioPlugin):
                        node_scenario,
                        node_scenario_object,
                        lib_telemetry.get_lib_kubernetes(),
                        scenario_telemetry,
                    )
                    end_time = int(time.time())
                    cerberus.get_status(krkn_config, start_time, end_time)
@@ -59,37 +62,38 @@ class NodeActionsScenarioPlugin(AbstractScenarioPlugin):
        return 0

    def get_node_scenario_object(self, node_scenario, kubecli: KrknKubernetes):
        affected_nodes_status = AffectedNodeStatus()
        if (
            "cloud_type" not in node_scenario.keys()
            or node_scenario["cloud_type"] == "generic"
        ):
            global node_general
            node_general = True
            return general_node_scenarios(kubecli)
            return general_node_scenarios(kubecli, affected_nodes_status)
        if node_scenario["cloud_type"].lower() == "aws":
            return aws_node_scenarios(kubecli)
            return aws_node_scenarios(kubecli, affected_nodes_status)
        elif node_scenario["cloud_type"].lower() == "gcp":
            return gcp_node_scenarios(kubecli)
            return gcp_node_scenarios(kubecli, affected_nodes_status)
        elif node_scenario["cloud_type"].lower() == "openstack":
            from krkn.scenario_plugins.node_actions.openstack_node_scenarios import (
                openstack_node_scenarios,
            )

            return openstack_node_scenarios(kubecli)
            return openstack_node_scenarios(kubecli, affected_nodes_status)
        elif (
            node_scenario["cloud_type"].lower() == "azure"
            or node_scenario["cloud_type"] == "az"
            or node_scenario["cloud_type"].lower() == "az"
        ):
            return azure_node_scenarios(kubecli)
            return azure_node_scenarios(kubecli, affected_nodes_status)
        elif (
            node_scenario["cloud_type"].lower() == "alibaba"
            or node_scenario["cloud_type"] == "alicloud"
            or node_scenario["cloud_type"].lower() == "alicloud"
        ):
            from krkn.scenario_plugins.node_actions.alibaba_node_scenarios import (
                alibaba_node_scenarios,
            )

            return alibaba_node_scenarios(kubecli)
            return alibaba_node_scenarios(kubecli, affected_nodes_status)
        elif node_scenario["cloud_type"].lower() == "bm":
            from krkn.scenario_plugins.node_actions.bm_node_scenarios import (
                bm_node_scenarios,
@@ -100,9 +104,20 @@ class NodeActionsScenarioPlugin(AbstractScenarioPlugin):
                node_scenario.get("bmc_user", None),
                node_scenario.get("bmc_password", None),
                kubecli,
                affected_nodes_status
            )
        elif node_scenario["cloud_type"].lower() == "docker":
            return docker_node_scenarios(kubecli)
        elif (
            node_scenario["cloud_type"].lower() == "vsphere"
            or node_scenario["cloud_type"].lower() == "vmware"
        ):
            return vmware_node_scenarios(kubecli, affected_nodes_status)
        elif (
            node_scenario["cloud_type"].lower() == "ibm"
            or node_scenario["cloud_type"].lower() == "ibmcloud"
        ):
            return ibm_node_scenarios(kubecli, affected_nodes_status)
        else:
            logging.error(
                "Cloud type "
@@ -120,7 +135,7 @@ class NodeActionsScenarioPlugin(AbstractScenarioPlugin):
            )

    def inject_node_scenario(
        self, action, node_scenario, node_scenario_object, kubecli: KrknKubernetes
        self, action, node_scenario, node_scenario_object, kubecli: KrknKubernetes, scenario_telemetry: ScenarioTelemetry
    ):

        # Get the node scenario configurations for setting nodes
@@ -138,17 +153,18 @@ class NodeActionsScenarioPlugin(AbstractScenarioPlugin):
        nodes = common_node_functions.get_node(
            label_selector, instance_kill_count, kubecli
        )


        # GCP api doesn't support multiprocessing calls, will only actually run 1
        if parallel_nodes and node_scenario['cloud_type'].lower() != "gcp":
        if parallel_nodes:
            self.multiprocess_nodes(nodes, node_scenario_object, action, node_scenario)
        else:
            for single_node in nodes:
                self.run_node(single_node, node_scenario_object, action, node_scenario)
        affected_nodes_status = node_scenario_object.affected_nodes_status
        scenario_telemetry.affected_nodes.extend(affected_nodes_status.affected_nodes)

    def multiprocess_nodes(self, nodes, node_scenario_object, action, node_scenario):
        try:
            logging.info("calling nodes in parallel")
            # pool object with number of element
            pool = ThreadPool(processes=len(nodes))

@@ -160,10 +176,9 @@ class NodeActionsScenarioPlugin(AbstractScenarioPlugin):


    def run_node(self, single_node, node_scenario_object, action, node_scenario):
        logging.info("action: " + str(action))
        # Get the scenario specifics for running action nodes
        run_kill_count = get_yaml_item_value(node_scenario, "runs", 1)
        if action == "node_stop_start_scenario":
        if action in ("node_stop_start_scenario", "node_disk_detach_attach_scenario"):
            duration = get_yaml_item_value(node_scenario, "duration", 120)

        timeout = get_yaml_item_value(node_scenario, "timeout", 120)
@@ -200,6 +215,9 @@ class NodeActionsScenarioPlugin(AbstractScenarioPlugin):
            node_scenario_object.node_reboot_scenario(
                run_kill_count, single_node, timeout
            )
        elif action == "node_disk_detach_attach_scenario":
            node_scenario_object.node_disk_detach_attach_scenario(
                run_kill_count, single_node, timeout, duration)
        elif action == "stop_start_kubelet_scenario":
            node_scenario_object.stop_start_kubelet_scenario(
                run_kill_count, single_node, timeout
@@ -245,5 +263,6 @@ class NodeActionsScenarioPlugin(AbstractScenarioPlugin):
                % action
            )


    def get_scenario_types(self) -> list[str]:
        return ["node_scenarios"]
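A sketch of the dispatch flow above (the scenario dict is trimmed to the key it actually inspects; kubecli and scenario_telemetry are assumed to exist): get_node_scenario_object hands every cloud-specific object the same AffectedNodeStatus, and inject_node_scenario folds the accumulated nodes into the telemetry after the run.

node_scenario = {"cloud_type": "gcp"}
plugin = NodeActionsScenarioPlugin()
scenario_object = plugin.get_node_scenario_object(node_scenario, kubecli)
# ... scenarios run, each appending AffectedNode records ...
scenario_telemetry.affected_nodes.extend(
    scenario_object.affected_nodes_status.affected_nodes
)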
@@ -7,7 +7,7 @@ from krkn.scenario_plugins.node_actions.abstract_node_scenarios import (
    abstract_node_scenarios,
)
from krkn_lib.k8s import KrknKubernetes

from krkn_lib.models.k8s import AffectedNode, AffectedNodeStatus

class OPENSTACKCLOUD:
    def __init__(self):
@@ -56,12 +56,22 @@ class OPENSTACKCLOUD:
            raise RuntimeError()

    # Wait until the node instance is running
    def wait_until_running(self, node, timeout):
        return self.get_instance_status(node, "ACTIVE", timeout)
    def wait_until_running(self, node, timeout, affected_node):
        start_time = time.time()
        instance_status = self.get_instance_status(node, "ACTIVE", timeout)
        end_time = time.time()
        if affected_node:
            affected_node.set_affected_node_status("running", end_time - start_time)
        return instance_status

    # Wait until the node instance is stopped
    def wait_until_stopped(self, node, timeout):
        return self.get_instance_status(node, "SHUTOFF", timeout)
    def wait_until_stopped(self, node, timeout, affected_node):
        start_time = time.time()
        instance_status = self.get_instance_status(node, "SHUTOFF", timeout)
        end_time = time.time()
        if affected_node:
            affected_node.set_affected_node_status("stopped", end_time - start_time)
        return instance_status

    # Get instance status
    def get_instance_status(self, node, expected_status, timeout):
@@ -107,19 +117,21 @@ class OPENSTACKCLOUD:

# krkn_lib
class openstack_node_scenarios(abstract_node_scenarios):
    def __init__(self, kubecli: KrknKubernetes):
    def __init__(self, kubecli: KrknKubernetes, affected_nodes_status: AffectedNodeStatus):
        super().__init__(kubecli, affected_nodes_status)
        self.openstackcloud = OPENSTACKCLOUD()


    # Node scenario to start the node
    def node_start_scenario(self, instance_kill_count, node, timeout):
        for _ in range(instance_kill_count):
            affected_node = AffectedNode(node)
            try:
                logging.info("Starting node_start_scenario injection")
                logging.info("Starting the node %s" % (node))
                openstack_node_name = self.openstackcloud.get_instance_id(node)
                self.openstackcloud.start_instances(openstack_node_name)
                self.openstackcloud.wait_until_running(openstack_node_name, timeout)
                nodeaction.wait_for_ready_status(node, timeout, self.kubecli)
                self.openstackcloud.wait_until_running(openstack_node_name, timeout, affected_node)
                nodeaction.wait_for_ready_status(node, timeout, self.kubecli, affected_node)
                logging.info("Node with instance ID: %s is in running state" % (node))
                logging.info("node_start_scenario has been successfully injected!")
            except Exception as e:
@@ -130,18 +142,20 @@ class openstack_node_scenarios(abstract_node_scenarios):
                logging.error("node_start_scenario injection failed!")

                raise RuntimeError()
            self.affected_nodes_status.affected_nodes.append(affected_node)

    # Node scenario to stop the node
    def node_stop_scenario(self, instance_kill_count, node, timeout):
        for _ in range(instance_kill_count):
            affected_node = AffectedNode(node)
            try:
                logging.info("Starting node_stop_scenario injection")
                logging.info("Stopping the node %s " % (node))
                openstack_node_name = self.openstackcloud.get_instance_id(node)
                self.openstackcloud.stop_instances(openstack_node_name)
                self.openstackcloud.wait_until_stopped(openstack_node_name, timeout)
                self.openstackcloud.wait_until_stopped(openstack_node_name, timeout, affected_node)
                logging.info("Node with instance name: %s is in stopped state" % (node))
                nodeaction.wait_for_ready_status(node, timeout, self.kubecli)
                nodeaction.wait_for_not_ready_status(node, timeout, self.kubecli, affected_node)
            except Exception as e:
                logging.error(
                    "Failed to stop node instance. Encountered following exception: %s. "
@@ -150,17 +164,19 @@ class openstack_node_scenarios(abstract_node_scenarios):
                logging.error("node_stop_scenario injection failed!")

                raise RuntimeError()
            self.affected_nodes_status.affected_nodes.append(affected_node)

    # Node scenario to reboot the node
    def node_reboot_scenario(self, instance_kill_count, node, timeout):
        for _ in range(instance_kill_count):
            affected_node = AffectedNode(node)
            try:
                logging.info("Starting node_reboot_scenario injection")
                logging.info("Rebooting the node %s" % (node))
                openstack_node_name = self.openstackcloud.get_instance_id(node)
                self.openstackcloud.reboot_instances(openstack_node_name)
                nodeaction.wait_for_unknown_status(node, timeout, self.kubecli)
                nodeaction.wait_for_ready_status(node, timeout, self.kubecli)
                nodeaction.wait_for_unknown_status(node, timeout, self.kubecli, affected_node)
                nodeaction.wait_for_ready_status(node, timeout, self.kubecli, affected_node)
                logging.info("Node with instance name: %s has been rebooted" % (node))
                logging.info("node_reboot_scenario has been successfully injected!")
            except Exception as e:
@@ -171,10 +187,12 @@ class openstack_node_scenarios(abstract_node_scenarios):
                logging.error("node_reboot_scenario injection failed!")

                raise RuntimeError()
            self.affected_nodes_status.affected_nodes.append(affected_node)

    # Node scenario to start the node
    def helper_node_start_scenario(self, instance_kill_count, node_ip, timeout):
        for _ in range(instance_kill_count):
            affected_node = AffectedNode(node_ip)
            try:
                logging.info("Starting helper_node_start_scenario injection")
                openstack_node_name = self.openstackcloud.get_openstack_nodename(
@@ -182,7 +200,7 @@ class openstack_node_scenarios(abstract_node_scenarios):
                )
                logging.info("Starting the helper node %s" % (openstack_node_name))
                self.openstackcloud.start_instances(openstack_node_name)
                self.openstackcloud.wait_until_running(openstack_node_name, timeout)
                self.openstackcloud.wait_until_running(openstack_node_name, timeout, affected_node)
                logging.info("Helper node with IP: %s is in running state" % (node_ip))
                logging.info("node_start_scenario has been successfully injected!")
            except Exception as e:
@@ -193,10 +211,12 @@ class openstack_node_scenarios(abstract_node_scenarios):
                logging.error("helper_node_start_scenario injection failed!")

                raise RuntimeError()
            self.affected_nodes_status.affected_nodes.append(affected_node)

    # Node scenario to stop the node
    def helper_node_stop_scenario(self, instance_kill_count, node_ip, timeout):
        for _ in range(instance_kill_count):
            affected_node = AffectedNode(node_ip)
            try:
                logging.info("Starting helper_node_stop_scenario injection")
                openstack_node_name = self.openstackcloud.get_openstack_nodename(
@@ -204,7 +224,7 @@ class openstack_node_scenarios(abstract_node_scenarios):
                )
                logging.info("Stopping the helper node %s " % (openstack_node_name))
                self.openstackcloud.stop_instances(openstack_node_name)
                self.openstackcloud.wait_until_stopped(openstack_node_name, timeout)
                self.openstackcloud.wait_until_stopped(openstack_node_name, timeout, affected_node)
                logging.info("Helper node with IP: %s is in stopped state" % (node_ip))
            except Exception as e:
                logging.error(
@@ -214,6 +234,7 @@ class openstack_node_scenarios(abstract_node_scenarios):
                logging.error("helper_node_stop_scenario injection failed!")

                raise RuntimeError()
            self.affected_nodes_status.affected_nodes.append(affected_node)

    def helper_node_service_status(self, node_ip, service, ssh_private_key, timeout):
        try:
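Note the behavioral fix buried in node_stop_scenario above: after powering the VM off it previously waited for the node to report Ready, which a stopped node cannot reach; it now waits for NotReady. A hedged usage sketch of the tracked stop path (node name hypothetical):

from krkn_lib.models.k8s import AffectedNode

cloud = OPENSTACKCLOUD()
affected_node = AffectedNode("worker-0")
cloud.stop_instances("worker-0")
cloud.wait_until_stopped("worker-0", 300, affected_node)
# The scenario then waits for the Kubernetes side to catch up with
# wait_for_not_ready_status, recording that duration on the same AffectedNode.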
@@ -3,25 +3,25 @@ import logging
|
||||
import random
|
||||
import sys
|
||||
import time
|
||||
import typing
|
||||
from dataclasses import dataclass, field
|
||||
import urllib3
|
||||
|
||||
from krkn_lib.k8s import KrknKubernetes
|
||||
import krkn.scenario_plugins.node_actions.common_node_functions as nodeaction
|
||||
from krkn.scenario_plugins.node_actions.abstract_node_scenarios import (
|
||||
abstract_node_scenarios,
|
||||
)
|
||||
from dataclasses import dataclass
|
||||
from os import environ
|
||||
from traceback import format_exc
|
||||
import requests
|
||||
from arcaflow_plugin_sdk import plugin, validation
|
||||
from com.vmware.vapi.std.errors_client import (
|
||||
AlreadyInDesiredState,
|
||||
NotAllowedInCurrentState,
|
||||
)
|
||||
from com.vmware.vcenter.vm_client import Power
|
||||
from com.vmware.vcenter_client import VM, ResourcePool
|
||||
from kubernetes import client, watch
|
||||
from vmware.vapi.vsphere.client import create_vsphere_client
|
||||
|
||||
from krkn.scenario_plugins.native.node_scenarios import (
|
||||
kubernetes_functions as kube_helper,
|
||||
)
|
||||
|
||||
from krkn_lib.models.k8s import AffectedNode, AffectedNodeStatus
|
||||
|
||||
class vSphere:
|
||||
def __init__(self, verify=True):
|
||||
@@ -32,7 +32,7 @@ class vSphere:
|
||||
self.server = environ.get("VSPHERE_IP")
|
||||
self.username = environ.get("VSPHERE_USERNAME")
|
||||
self.password = environ.get("VSPHERE_PASSWORD")
|
||||
session = self.get_unverified_session() if not verify else None
|
||||
session = self.get_unverified_session()
|
||||
self.credentials_present = (
|
||||
True if self.server and self.username and self.password else False
|
||||
)
|
||||
@@ -42,6 +42,7 @@ class vSphere:
|
||||
"'VSPHERE_IP', 'VSPHERE_USERNAME', "
|
||||
"'VSPHERE_PASSWORD' are not set"
|
||||
)
|
||||
|
||||
self.client = create_vsphere_client(
|
||||
server=self.server,
|
||||
username=self.username,
|
||||
@@ -53,10 +54,13 @@ class vSphere:
|
||||
"""
|
||||
Returns an unverified session object
|
||||
"""
|
||||
|
||||
|
||||
session = requests.session()
|
||||
# Set the proxy settings for the session
|
||||
session.verify = False
|
||||
requests.packages.urllib3.disable_warnings()
|
||||
|
||||
urllib3.disable_warnings()
|
||||
|
||||
return session
|
||||
|
||||
def get_vm(self, instance_id):
|
||||
@@ -297,14 +301,16 @@ class vSphere:
|
||||
)
|
||||
return None
|
||||
|
||||
def wait_until_released(self, instance_id, timeout):
|
||||
def wait_until_released(self, instance_id, timeout, affected_node):
|
||||
"""
|
||||
Waits until the VM is deleted or until the timeout. Returns True if
|
||||
the VM is successfully deleted, else returns False
|
||||
"""
|
||||
|
||||
time_counter = 0
|
||||
start_time = time.time()
|
||||
vm = self.get_vm(instance_id)
|
||||
exit_status = True
|
||||
while vm is not None:
|
||||
vm = self.get_vm(instance_id)
|
||||
logging.info(
|
||||
@@ -314,16 +320,22 @@ class vSphere:
|
||||
time_counter += 5
|
||||
if time_counter >= timeout:
|
||||
logging.info(f"VM {instance_id} is still not deleted in allotted time")
|
||||
return False
|
||||
return True
|
||||
exit_status = False
|
||||
end_time = time.time()
|
||||
if affected_node:
|
||||
affected_node.set_affected_node_status("terminated", end_time - start_time)
|
||||
|
||||
return exit_status
|
||||
|
||||
def wait_until_running(self, instance_id, timeout):
|
||||
def wait_until_running(self, instance_id, timeout, affected_node):
|
||||
"""
|
||||
Waits until the VM switches to POWERED_ON state or until the timeout.
|
||||
Returns True if the VM switches to POWERED_ON, else returns False
|
||||
"""
|
||||
|
||||
time_counter = 0
|
||||
start_time = time.time()
|
||||
exit_status = True
|
||||
status = self.get_vm_status(instance_id)
|
||||
while status != Power.State.POWERED_ON:
|
||||
status = self.get_vm_status(instance_id)
|
||||
@@ -334,16 +346,23 @@ class vSphere:
|
||||
time_counter += 5
|
||||
if time_counter >= timeout:
|
||||
logging.info(f"VM {instance_id} is still not ready in allotted time")
|
||||
return False
|
||||
return True
|
||||
exit_status = False
|
||||
end_time = time.time()
|
||||
if affected_node:
|
||||
affected_node.set_affected_node_status("running", end_time - start_time)
|
||||
|
||||
|
||||
def wait_until_stopped(self, instance_id, timeout):
|
||||
return exit_status
|
||||
|
||||
def wait_until_stopped(self, instance_id, timeout, affected_node):
|
||||
"""
|
||||
Waits until the VM switches to POWERED_OFF state or until the timeout.
|
||||
Returns True if the VM switches to POWERED_OFF, else returns False
|
||||
"""
|
||||
|
||||
time_counter = 0
|
||||
start_time = time.time()
|
||||
exit_status = True
|
||||
status = self.get_vm_status(instance_id)
|
||||
while status != Power.State.POWERED_OFF:
|
||||
status = self.get_vm_status(instance_id)
|
||||
@@ -354,322 +373,106 @@ class vSphere:
|
||||
time_counter += 5
|
||||
if time_counter >= timeout:
|
||||
logging.info(f"VM {instance_id} is still not ready in allotted time")
|
||||
return False
|
||||
return True
|
||||
exit_status = False
|
||||
end_time = time.time()
|
||||
if affected_node:
|
||||
affected_node.set_affected_node_status("stopped", end_time - start_time)
|
||||
|
||||
|
||||
return exit_status
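
The three wait_until_* methods above are identical except for the predicate they poll and the status label they record. A possible consolidation — a sketch only, not the shipped code — that keeps the new affected_node bookkeeping in one place (the time.sleep(5) between polls is assumed from the time_counter += 5 increments, since the sleep line falls outside the hunks shown):

def _wait_until(self, instance_id, predicate, label, timeout, affected_node):
    # predicate() -> bool, True once the VM has reached the desired state
    time_counter = 0
    start_time = time.time()
    exit_status = True
    while not predicate():
        time.sleep(5)
        time_counter += 5
        if time_counter >= timeout:
            logging.info(f"VM {instance_id} did not reach '{label}' in the allotted time")
            exit_status = False
            break
    if affected_node:
        affected_node.set_affected_node_status(label, time.time() - start_time)
    return exit_status

# wait_until_stopped then reduces to:
# return self._wait_until(
#     instance_id,
#     lambda: self.get_vm_status(instance_id) == Power.State.POWERED_OFF,
#     "stopped", timeout, affected_node,
# )
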
|
||||
|
||||
|
||||
@dataclass
|
||||
class Node:
|
||||
name: str
|
||||
class vmware_node_scenarios(abstract_node_scenarios):
|
||||
def __init__(self, kubecli: KrknKubernetes, affected_nodes_status: AffectedNodeStatus):
|
||||
super().__init__(kubecli, affected_nodes_status)
|
||||
self.vsphere = vSphere()
|
||||
|
||||
def node_start_scenario(self, instance_kill_count, node, timeout):
|
||||
try:
|
||||
for _ in range(instance_kill_count):
|
||||
affected_node = AffectedNode(node)
|
||||
logging.info("Starting node_start_scenario injection")
|
||||
logging.info(f"Starting the node {node} ")
|
||||
vm_started = self.vsphere.start_instances(node)
|
||||
if vm_started:
|
||||
self.vsphere.wait_until_running(node, timeout, affected_node)
|
||||
nodeaction.wait_for_ready_status(node, timeout, self.kubecli, affected_node)
|
||||
logging.info(f"Node with instance ID: {node} is in running state")
|
||||
logging.info("node_start_scenario has been successfully injected!")
|
||||
self.affected_nodes_status.affected_nodes.append(affected_node)
|
||||
except Exception as e:
|
||||
logging.error("Failed to start node instance. Test Failed")
|
||||
logging.error(
|
||||
f"node_start_scenario injection failed! " f"Error was: {str(e)}"
|
||||
)
|
||||
|
||||
@dataclass
|
||||
class NodeScenarioSuccessOutput:
|
||||
|
||||
nodes: typing.Dict[int, Node] = field(
|
||||
metadata={
|
||||
"name": "Nodes started/stopped/terminated/rebooted",
|
||||
"description": "Map between timestamps and the pods "
|
||||
"started/stopped/terminated/rebooted. "
|
||||
"The timestamp is provided in nanoseconds",
|
||||
}
|
||||
)
|
||||
action: kube_helper.Actions = field(
|
||||
metadata={
|
||||
"name": "The action performed on the node",
|
||||
"description": "The action performed or attempted to be "
|
||||
"performed on the node. Possible values"
|
||||
"are : Start, Stop, Terminate, Reboot",
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
class NodeScenarioErrorOutput:
|
||||
|
||||
error: str
|
||||
action: kube_helper.Actions = field(
|
||||
metadata={
|
||||
"name": "The action performed on the node",
|
||||
"description": "The action attempted to be performed on the node. "
|
||||
"Possible values are : Start Stop, Terminate, Reboot",
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
class NodeScenarioConfig:
|
||||
|
||||
name: typing.Annotated[
|
||||
typing.Optional[str],
|
||||
validation.required_if_not("label_selector"),
|
||||
validation.required_if("skip_openshift_checks"),
|
||||
] = field(
|
||||
default=None,
|
||||
metadata={
|
||||
"name": "Name",
|
||||
"description": "Name(s) for target nodes. "
|
||||
"Required if label_selector is not set.",
|
||||
},
|
||||
)
|
||||
|
||||
runs: typing.Annotated[typing.Optional[int], validation.min(1)] = field(
|
||||
default=1,
|
||||
metadata={
|
||||
"name": "Number of runs per node",
|
||||
"description": "Number of times to inject each scenario under "
|
||||
"actions (will perform on same node each time)",
|
||||
},
|
||||
)
|
||||
|
||||
label_selector: typing.Annotated[
|
||||
typing.Optional[str], validation.min(1), validation.required_if_not("name")
|
||||
] = field(
|
||||
default=None,
|
||||
metadata={
|
||||
"name": "Label selector",
|
||||
"description": "Kubernetes label selector for the target nodes. "
|
||||
"Required if name is not set.\n"
|
||||
"See https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/ " # noqa
|
||||
"for details.",
|
||||
},
|
||||
)
|
||||
|
||||
timeout: typing.Annotated[typing.Optional[int], validation.min(1)] = field(
|
||||
default=180,
|
||||
metadata={
|
||||
"name": "Timeout",
|
||||
"description": "Timeout to wait for the target pod(s) "
|
||||
"to be removed in seconds.",
|
||||
},
|
||||
)
|
||||
|
||||
instance_count: typing.Annotated[typing.Optional[int], validation.min(1)] = field(
|
||||
default=1,
|
||||
metadata={
|
||||
"name": "Instance Count",
|
||||
"description": "Number of nodes to perform action/select "
|
||||
"that match the label selector.",
|
||||
},
|
||||
)
|
||||
|
||||
skip_openshift_checks: typing.Optional[bool] = field(
|
||||
default=False,
|
||||
metadata={
|
||||
"name": "Skip Openshift Checks",
|
||||
"description": "Skip checking the status of the openshift nodes.",
|
||||
},
|
||||
)
|
||||
|
||||
verify_session: bool = field(
|
||||
default=True,
|
||||
metadata={
|
||||
"name": "Verify API Session",
|
||||
"description": "Verifies the vSphere client session. "
|
||||
"It is enabled by default",
|
||||
},
|
||||
)
|
||||
|
||||
kubeconfig_path: typing.Optional[str] = field(
|
||||
default=None,
|
||||
metadata={
|
||||
"name": "Kubeconfig path",
|
||||
"description": "Path to your Kubeconfig file. "
|
||||
"Defaults to ~/.kube/config.\n"
|
||||
"See https://kubernetes.io/docs/concepts/configuration/organize-cluster-access-kubeconfig/ " # noqa
|
||||
"for details.",
|
||||
},
|
||||
)
|
||||
|
||||
|
||||
@plugin.step(
|
||||
id="vmware-node-start",
|
||||
name="Start the node",
|
||||
description="Start the node(s) by starting the VMware VM "
|
||||
"on which the node is configured",
|
||||
outputs={"success": NodeScenarioSuccessOutput, "error": NodeScenarioErrorOutput},
|
||||
)
|
||||
def node_start(
|
||||
cfg: NodeScenarioConfig,
|
||||
) -> typing.Tuple[
|
||||
str, typing.Union[NodeScenarioSuccessOutput, NodeScenarioErrorOutput]
|
||||
]:
|
||||
with kube_helper.setup_kubernetes(None) as cli:
|
||||
vsphere = vSphere(verify=cfg.verify_session)
|
||||
core_v1 = client.CoreV1Api(cli)
|
||||
watch_resource = watch.Watch()
|
||||
node_list = kube_helper.get_node_list(cfg, kube_helper.Actions.START, core_v1)
|
||||
nodes_started = {}
|
||||
for name in node_list:
|
||||
try:
|
||||
for _ in range(cfg.runs):
|
||||
logging.info("Starting node_start_scenario injection")
|
||||
logging.info(f"Starting the node {name} ")
|
||||
vm_started = vsphere.start_instances(name)
|
||||
if vm_started:
|
||||
vsphere.wait_until_running(name, cfg.timeout)
|
||||
if not cfg.skip_openshift_checks:
|
||||
kube_helper.wait_for_ready_status(
|
||||
name, cfg.timeout, watch_resource, core_v1
|
||||
)
|
||||
nodes_started[int(time.time_ns())] = Node(name=name)
|
||||
logging.info(f"Node with instance ID: {name} is in running state")
|
||||
logging.info("node_start_scenario has been successfully injected!")
|
||||
except Exception as e:
|
||||
logging.error("Failed to start node instance. Test Failed")
|
||||
logging.error(
|
||||
f"node_start_scenario injection failed! " f"Error was: {str(e)}"
|
||||
)
|
||||
return "error", NodeScenarioErrorOutput(
|
||||
format_exc(), kube_helper.Actions.START
|
||||
)
|
||||
|
||||
return "success", NodeScenarioSuccessOutput(
|
||||
nodes_started, kube_helper.Actions.START
|
||||
)
|
||||
|
||||
|
||||
@plugin.step(
|
||||
id="vmware-node-stop",
|
||||
name="Stop the node",
|
||||
description="Stop the node(s) by starting the VMware VM "
|
||||
"on which the node is configured",
|
||||
outputs={"success": NodeScenarioSuccessOutput, "error": NodeScenarioErrorOutput},
|
||||
)
|
||||
def node_stop(
|
||||
cfg: NodeScenarioConfig,
|
||||
) -> typing.Tuple[
|
||||
str, typing.Union[NodeScenarioSuccessOutput, NodeScenarioErrorOutput]
|
||||
]:
|
||||
with kube_helper.setup_kubernetes(None) as cli:
|
||||
vsphere = vSphere(verify=cfg.verify_session)
|
||||
core_v1 = client.CoreV1Api(cli)
|
||||
watch_resource = watch.Watch()
|
||||
node_list = kube_helper.get_node_list(cfg, kube_helper.Actions.STOP, core_v1)
|
||||
nodes_stopped = {}
|
||||
for name in node_list:
|
||||
try:
|
||||
for _ in range(cfg.runs):
|
||||
logging.info("Starting node_stop_scenario injection")
|
||||
logging.info(f"Stopping the node {name} ")
|
||||
vm_stopped = vsphere.stop_instances(name)
|
||||
if vm_stopped:
|
||||
vsphere.wait_until_stopped(name, cfg.timeout)
|
||||
if not cfg.skip_openshift_checks:
|
||||
kube_helper.wait_for_ready_status(
|
||||
name, cfg.timeout, watch_resource, core_v1
|
||||
)
|
||||
nodes_stopped[int(time.time_ns())] = Node(name=name)
|
||||
logging.info(f"Node with instance ID: {name} is in stopped state")
|
||||
logging.info("node_stop_scenario has been successfully injected!")
|
||||
except Exception as e:
|
||||
logging.error("Failed to stop node instance. Test Failed")
|
||||
logging.error(
|
||||
f"node_stop_scenario injection failed! " f"Error was: {str(e)}"
|
||||
)
|
||||
return "error", NodeScenarioErrorOutput(
|
||||
format_exc(), kube_helper.Actions.STOP
|
||||
)
|
||||
|
||||
return "success", NodeScenarioSuccessOutput(
|
||||
nodes_stopped, kube_helper.Actions.STOP
|
||||
)
|
||||
|
||||
|
||||
@plugin.step(
|
||||
id="vmware-node-reboot",
|
||||
name="Reboot VMware VM",
|
||||
description="Reboot the node(s) by starting the VMware VM "
|
||||
"on which the node is configured",
|
||||
outputs={"success": NodeScenarioSuccessOutput, "error": NodeScenarioErrorOutput},
|
||||
)
|
||||
def node_reboot(
|
||||
cfg: NodeScenarioConfig,
|
||||
) -> typing.Tuple[
|
||||
str, typing.Union[NodeScenarioSuccessOutput, NodeScenarioErrorOutput]
|
||||
]:
|
||||
with kube_helper.setup_kubernetes(None) as cli:
|
||||
vsphere = vSphere(verify=cfg.verify_session)
|
||||
core_v1 = client.CoreV1Api(cli)
|
||||
watch_resource = watch.Watch()
|
||||
node_list = kube_helper.get_node_list(cfg, kube_helper.Actions.REBOOT, core_v1)
|
||||
nodes_rebooted = {}
|
||||
for name in node_list:
|
||||
try:
|
||||
for _ in range(cfg.runs):
|
||||
logging.info("Starting node_reboot_scenario injection")
|
||||
logging.info(f"Rebooting the node {name} ")
|
||||
vsphere.reboot_instances(name)
|
||||
if not cfg.skip_openshift_checks:
|
||||
kube_helper.wait_for_unknown_status(
|
||||
name, cfg.timeout, watch_resource, core_v1
|
||||
)
|
||||
kube_helper.wait_for_ready_status(
|
||||
name, cfg.timeout, watch_resource, core_v1
|
||||
)
|
||||
nodes_rebooted[int(time.time_ns())] = Node(name=name)
|
||||
logging.info(
|
||||
f"Node with instance ID: {name} has rebooted " "successfully"
|
||||
def node_stop_scenario(self, instance_kill_count, node, timeout):
|
||||
try:
|
||||
for _ in range(instance_kill_count):
|
||||
affected_node = AffectedNode(node)
|
||||
logging.info("Starting node_stop_scenario injection")
|
||||
logging.info(f"Stopping the node {node} ")
|
||||
vm_stopped = self.vsphere.stop_instances(node)
|
||||
if vm_stopped:
|
||||
self.vsphere.wait_until_stopped(node, timeout, affected_node)
|
||||
nodeaction.wait_for_ready_status(
|
||||
node, timeout, self.kubecli, affected_node
|
||||
)
|
||||
logging.info("node_reboot_scenario has been successfully injected!")
|
||||
except Exception as e:
|
||||
logging.error("Failed to reboot node instance. Test Failed")
|
||||
logging.error(
|
||||
f"node_reboot_scenario injection failed! " f"Error was: {str(e)}"
|
||||
)
|
||||
return "error", NodeScenarioErrorOutput(
|
||||
format_exc(), kube_helper.Actions.REBOOT
|
||||
)
|
||||
logging.info(f"Node with instance ID: {node} is in stopped state")
|
||||
logging.info("node_stop_scenario has been successfully injected!")
|
||||
self.affected_nodes_status.affected_nodes.append(affected_node)
|
||||
except Exception as e:
|
||||
logging.error("Failed to stop node instance. Test Failed")
|
||||
logging.error(
|
||||
f"node_stop_scenario injection failed! " f"Error was: {str(e)}"
|
||||
)
|
||||
|
||||
|
||||
return "success", NodeScenarioSuccessOutput(
|
||||
nodes_rebooted, kube_helper.Actions.REBOOT
|
||||
)
|
||||
def node_reboot_scenario(self, instance_kill_count, node, timeout):
|
||||
try:
|
||||
for _ in range(instance_kill_count):
|
||||
affected_node = AffectedNode(node)
|
||||
logging.info("Starting node_reboot_scenario injection")
|
||||
logging.info(f"Rebooting the node {node} ")
|
||||
self.vsphere.reboot_instances(node)
|
||||
|
||||
nodeaction.wait_for_unknown_status(
|
||||
node, timeout, self.kubecli, affected_node
|
||||
)
|
||||
|
||||
logging.info(
|
||||
f"Node with instance ID: {node} has rebooted " "successfully"
|
||||
)
|
||||
logging.info("node_reboot_scenario has been successfully injected!")
|
||||
self.affected_nodes_status.affected_nodes.append(affected_node)
|
||||
except Exception as e:
|
||||
logging.error("Failed to reboot node instance. Test Failed")
|
||||
logging.error(
|
||||
f"node_reboot_scenario injection failed! " f"Error was: {str(e)}"
|
||||
)
|
||||
|
||||
|
||||
@plugin.step(
|
||||
id="vmware-node-terminate",
|
||||
name="Reboot VMware VM",
|
||||
description="Wait for the node to be terminated",
|
||||
outputs={"success": NodeScenarioSuccessOutput, "error": NodeScenarioErrorOutput},
|
||||
)
|
||||
def node_terminate(
|
||||
cfg: NodeScenarioConfig,
|
||||
) -> typing.Tuple[
|
||||
str, typing.Union[NodeScenarioSuccessOutput, NodeScenarioErrorOutput]
|
||||
]:
|
||||
with kube_helper.setup_kubernetes(None) as cli:
|
||||
vsphere = vSphere(verify=cfg.verify_session)
|
||||
core_v1 = client.CoreV1Api(cli)
|
||||
node_list = kube_helper.get_node_list(
|
||||
cfg, kube_helper.Actions.TERMINATE, core_v1
|
||||
)
|
||||
nodes_terminated = {}
|
||||
for name in node_list:
|
||||
try:
|
||||
for _ in range(cfg.runs):
|
||||
logging.info(
|
||||
"Starting node_termination_scenario injection "
|
||||
"by first stopping the node"
|
||||
)
|
||||
vsphere.stop_instances(name)
|
||||
vsphere.wait_until_stopped(name, cfg.timeout)
|
||||
logging.info(f"Releasing the node with instance ID: {name} ")
|
||||
vsphere.release_instances(name)
|
||||
vsphere.wait_until_released(name, cfg.timeout)
|
||||
nodes_terminated[int(time.time_ns())] = Node(name=name)
|
||||
logging.info(f"Node with instance ID: {name} has been released")
|
||||
logging.info(
|
||||
"node_terminate_scenario has been " "successfully injected!"
|
||||
)
|
||||
except Exception as e:
|
||||
logging.error("Failed to terminate node instance. Test Failed")
|
||||
logging.error(
|
||||
f"node_terminate_scenario injection failed! " f"Error was: {str(e)}"
|
||||
def node_terminate_scenario(self, instance_kill_count, node, timeout):
|
||||
try:
|
||||
for _ in range(instance_kill_count):
|
||||
affected_node = AffectedNode(node)
|
||||
logging.info(
|
||||
"Starting node_termination_scenario injection "
|
||||
"by first stopping the node"
|
||||
)
|
||||
return "error", NodeScenarioErrorOutput(
|
||||
format_exc(), kube_helper.Actions.TERMINATE
|
||||
self.vsphere.stop_instances(node)
|
||||
self.vsphere.wait_until_stopped(node, timeout, affected_node)
|
||||
logging.info(f"Releasing the node with instance ID: {node} ")
|
||||
self.vsphere.release_instances(node)
|
||||
self.vsphere.wait_until_released(node, timeout, affected_node)
|
||||
logging.info(f"Node with instance ID: {node} has been released")
|
||||
logging.info(
|
||||
"node_terminate_scenario has been " "successfully injected!"
|
||||
)
|
||||
|
||||
return "success", NodeScenarioSuccessOutput(
|
||||
nodes_terminated, kube_helper.Actions.TERMINATE
|
||||
)
|
||||
self.affected_nodes_status.affected_nodes.append(affected_node)
|
||||
except Exception as e:
|
||||
logging.error("Failed to terminate node instance. Test Failed")
|
||||
logging.error(
|
||||
f"node_terminate_scenario injection failed! " f"Error was: {str(e)}"
|
||||
)
|
||||
@@ -13,8 +13,11 @@ from krkn.scenario_plugins.node_actions.aws_node_scenarios import AWS
|
||||
from krkn.scenario_plugins.node_actions.az_node_scenarios import Azure
|
||||
from krkn.scenario_plugins.node_actions.gcp_node_scenarios import GCP
|
||||
from krkn.scenario_plugins.node_actions.openstack_node_scenarios import OPENSTACKCLOUD
|
||||
from krkn.scenario_plugins.native.node_scenarios.ibmcloud_plugin import IbmCloud
|
||||
from krkn.scenario_plugins.node_actions.ibmcloud_node_scenarios import IbmCloud
|
||||
|
||||
import krkn.scenario_plugins.node_actions.common_node_functions as nodeaction
|
||||
|
||||
from krkn_lib.models.k8s import AffectedNodeStatus, AffectedNode
|
||||
|
||||
class ShutDownScenarioPlugin(AbstractScenarioPlugin):
|
||||
def run(
|
||||
@@ -32,9 +35,12 @@ class ShutDownScenarioPlugin(AbstractScenarioPlugin):
|
||||
"cluster_shut_down_scenario"
|
||||
]
|
||||
start_time = int(time.time())
|
||||
affected_nodes_status = AffectedNodeStatus()
|
||||
self.cluster_shut_down(
|
||||
shut_down_config_scenario, lib_telemetry.get_lib_kubernetes()
|
||||
shut_down_config_scenario, lib_telemetry.get_lib_kubernetes(), affected_nodes_status
|
||||
)
|
||||
|
||||
scenario_telemetry.affected_nodes = affected_nodes_status.affected_nodes
|
||||
end_time = int(time.time())
|
||||
cerberus.publish_kraken_status(krkn_config, [], start_time, end_time)
|
||||
return 0
|
||||
@@ -52,7 +58,6 @@ class ShutDownScenarioPlugin(AbstractScenarioPlugin):
|
||||
pool = ThreadPool(processes=len(nodes))
|
||||
else:
|
||||
pool = ThreadPool(processes=processes)
|
||||
logging.info("nodes type " + str(type(nodes[0])))
|
||||
if type(nodes[0]) is tuple:
|
||||
node_id = []
|
||||
node_info = []
|
||||
@@ -72,7 +77,7 @@ class ShutDownScenarioPlugin(AbstractScenarioPlugin):
|
||||
|
||||
# Inject the cluster shut down scenario
|
||||
# krkn_lib
|
||||
def cluster_shut_down(self, shut_down_config, kubecli: KrknKubernetes):
|
||||
def cluster_shut_down(self, shut_down_config, kubecli: KrknKubernetes, affected_nodes_status: AffectedNodeStatus):
|
||||
runs = shut_down_config["runs"]
|
||||
shut_down_duration = shut_down_config["shut_down_duration"]
|
||||
cloud_type = shut_down_config["cloud_type"]
|
||||
@@ -101,25 +106,30 @@ class ShutDownScenarioPlugin(AbstractScenarioPlugin):
|
||||
node_id = []
|
||||
for node in nodes:
|
||||
instance_id = cloud_object.get_instance_id(node)
|
||||
affected_nodes_status.affected_nodes.append(AffectedNode(node, node_id=instance_id))
|
||||
node_id.append(instance_id)
|
||||
logging.info("node id list " + str(node_id))
|
||||
for _ in range(runs):
|
||||
logging.info("Starting cluster_shut_down scenario injection")
|
||||
stopping_nodes = set(node_id)
|
||||
self.multiprocess_nodes(cloud_object.stop_instances, node_id, processes)
|
||||
stopped_nodes = stopping_nodes.copy()
|
||||
start_time = time.time()
|
||||
while len(stopping_nodes) > 0:
|
||||
for node in stopping_nodes:
|
||||
affected_node = affected_nodes_status.get_affected_node_index(node)
|
||||
|
||||
if type(node) is tuple:
|
||||
node_status = cloud_object.wait_until_stopped(
|
||||
node[1], node[0], timeout
|
||||
node[1], node[0], timeout, affected_node
|
||||
)
|
||||
else:
|
||||
node_status = cloud_object.wait_until_stopped(node, timeout)
|
||||
node_status = cloud_object.wait_until_stopped(node, timeout, affected_node)
|
||||
|
||||
# Only want to remove node from stopping list
|
||||
# when fully stopped/no error
|
||||
if node_status:
|
||||
# need to add in time that is passing while waiting for other nodes to be stopped
|
||||
affected_node.set_cloud_stopping_time(time.time() - start_time)
|
||||
stopped_nodes.remove(node)
|
||||
|
||||
stopping_nodes = stopped_nodes.copy()
|
||||
@@ -132,19 +142,25 @@ class ShutDownScenarioPlugin(AbstractScenarioPlugin):
|
||||
logging.info("Restarting the nodes")
|
||||
restarted_nodes = set(node_id)
|
||||
self.multiprocess_nodes(cloud_object.start_instances, node_id, processes)
|
||||
start_time = time.time()
|
||||
logging.info("Wait for each node to be running again")
|
||||
not_running_nodes = restarted_nodes.copy()
|
||||
while len(not_running_nodes) > 0:
|
||||
for node in not_running_nodes:
|
||||
affected_node = affected_nodes_status.get_affected_node_index(node)
|
||||
# need to add in time that is passing while waiting for other nodes to be running
|
||||
|
||||
if type(node) is tuple:
|
||||
node_status = cloud_object.wait_until_running(
|
||||
node[1], node[0], timeout
|
||||
node[1], node[0], timeout, affected_node
|
||||
)
|
||||
else:
|
||||
node_status = cloud_object.wait_until_running(node, timeout)
|
||||
node_status = cloud_object.wait_until_running(node, timeout, affected_node)
|
||||
if node_status:
|
||||
affected_node.set_cloud_running_time(time.time() - start_time)
|
||||
restarted_nodes.remove(node)
|
||||
not_running_nodes = restarted_nodes.copy()
|
||||
|
||||
logging.info("Waiting for 150s to allow cluster component initialization")
|
||||
time.sleep(150)
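
The stop and start phases above share the same "fan out, then drain" shape: kick every node in parallel, then loop over the pending set until each node's wait succeeds, charging the full elapsed time (including time spent waiting on other nodes, as the inline comments note) to its AffectedNode. A condensed sketch of that shape, with hypothetical names wait_fn and set_time_fn standing in for the phase-specific cloud call and timing setter:

import time

def drain_nodes(node_ids, wait_fn, set_time_fn, timeout, affected_nodes_status):
    # Block until every node reaches the target state, attributing the
    # total wall-clock wait to each node's AffectedNode record.
    pending = set(node_ids)
    start = time.time()
    while pending:
        done = set()
        for node in pending:
            affected_node = affected_nodes_status.get_affected_node_index(node)
            if wait_fn(node, timeout, affected_node):
                set_time_fn(affected_node, time.time() - start)
                done.add(node)
        pending -= done
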
|
||||
|
||||
|
||||
@@ -2,15 +2,21 @@ import logging
|
||||
import time
|
||||
|
||||
import yaml
|
||||
|
||||
from multiprocessing.pool import ThreadPool
|
||||
from itertools import repeat
|
||||
|
||||
from krkn_lib.k8s import KrknKubernetes
|
||||
from krkn_lib.models.k8s import AffectedNodeStatus
|
||||
from krkn_lib.models.telemetry import ScenarioTelemetry
|
||||
from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift
|
||||
from krkn_lib.utils import log_exception
|
||||
|
||||
from krkn import utils
|
||||
from krkn_lib.utils import get_yaml_item_value
|
||||
from krkn.scenario_plugins.abstract_scenario_plugin import AbstractScenarioPlugin
|
||||
from krkn.scenario_plugins.native.network import cerberus
|
||||
from krkn.scenario_plugins.node_actions.aws_node_scenarios import AWS
|
||||
|
||||
from krkn.scenario_plugins.node_actions.aws_node_scenarios import AWS
|
||||
from krkn.scenario_plugins.node_actions.gcp_node_scenarios import gcp_node_scenarios
|
||||
|
||||
class ZoneOutageScenarioPlugin(AbstractScenarioPlugin):
|
||||
def run(
|
||||
@@ -25,92 +31,138 @@ class ZoneOutageScenarioPlugin(AbstractScenarioPlugin):
|
||||
with open(scenario, "r") as f:
|
||||
zone_outage_config_yaml = yaml.full_load(f)
|
||||
scenario_config = zone_outage_config_yaml["zone_outage"]
|
||||
vpc_id = scenario_config["vpc_id"]
|
||||
subnet_ids = scenario_config["subnet_id"]
|
||||
duration = scenario_config["duration"]
|
||||
cloud_type = scenario_config["cloud_type"]
|
||||
# Add support for user-provided default network ACL
|
||||
default_acl_id = scenario_config.get("default_acl_id")
|
||||
ids = {}
|
||||
acl_ids_created = []
|
||||
|
||||
if cloud_type.lower() == "aws":
|
||||
cloud_object = AWS()
|
||||
else:
|
||||
logging.error(
|
||||
"ZoneOutageScenarioPlugin Cloud type %s is not currently supported for "
|
||||
"zone outage scenarios" % cloud_type
|
||||
)
|
||||
return 1
|
||||
|
||||
start_time = int(time.time())
|
||||
|
||||
for subnet_id in subnet_ids:
|
||||
logging.info("Targeting subnet_id")
|
||||
network_association_ids = []
|
||||
associations, original_acl_id = cloud_object.describe_network_acls(
|
||||
vpc_id, subnet_id
|
||||
)
|
||||
for entry in associations:
|
||||
if entry["SubnetId"] == subnet_id:
|
||||
network_association_ids.append(
|
||||
entry["NetworkAclAssociationId"]
|
||||
)
|
||||
logging.info(
|
||||
"Network association ids associated with "
|
||||
"the subnet %s: %s" % (subnet_id, network_association_ids)
|
||||
)
|
||||
|
||||
# Use provided default ACL if available, otherwise create a new one
|
||||
if default_acl_id:
|
||||
acl_id = default_acl_id
|
||||
logging.info(
|
||||
"Using provided default ACL ID %s - this ACL will not be deleted after the scenario",
|
||||
default_acl_id
|
||||
)
|
||||
# Don't add to acl_ids_created since we don't want to delete user-provided ACLs at cleanup
|
||||
if cloud_type.lower() == "aws":
|
||||
self.cloud_object = AWS()
|
||||
self.network_based_zone(scenario_config)
|
||||
else:
|
||||
kubecli = lib_telemetry.get_lib_kubernetes()
|
||||
if cloud_type.lower() == "gcp":
|
||||
affected_nodes_status = AffectedNodeStatus()
|
||||
self.cloud_object = gcp_node_scenarios(kubecli, affected_nodes_status)
|
||||
self.node_based_zone(scenario_config, kubecli)
|
||||
affected_nodes_status = self.cloud_object.affected_nodes_status
|
||||
scenario_telemetry.affected_nodes.extend(affected_nodes_status.affected_nodes)
|
||||
else:
|
||||
acl_id = cloud_object.create_default_network_acl(vpc_id)
|
||||
logging.info("Created new default ACL %s", acl_id)
|
||||
acl_ids_created.append(acl_id)
|
||||
|
||||
new_association_id = cloud_object.replace_network_acl_association(
|
||||
network_association_ids[0], acl_id
|
||||
)
|
||||
|
||||
# capture the original_acl_id, created_acl_id and
|
||||
# new association_id to use during the recovery
|
||||
ids[new_association_id] = original_acl_id
|
||||
|
||||
# wait for the specified duration
|
||||
logging.info(
|
||||
"Waiting for the specified duration " "in the config: %s" % duration
|
||||
)
|
||||
time.sleep(duration)
|
||||
|
||||
# replace the applied acl with the previous acl in use
|
||||
for new_association_id, original_acl_id in ids.items():
|
||||
cloud_object.replace_network_acl_association(
|
||||
new_association_id, original_acl_id
|
||||
)
|
||||
logging.info(
|
||||
"Wating for 60 seconds to make sure " "the changes are in place"
|
||||
)
|
||||
time.sleep(60)
|
||||
|
||||
# delete the network acl created for the run
|
||||
for acl_id in acl_ids_created:
|
||||
cloud_object.delete_network_acl(acl_id)
|
||||
logging.error(
|
||||
"ZoneOutageScenarioPlugin Cloud type %s is not currently supported for "
|
||||
"zone outage scenarios" % cloud_type
|
||||
)
|
||||
return 1
|
||||
|
||||
end_time = int(time.time())
|
||||
cerberus.publish_kraken_status(krkn_config, [], start_time, end_time)
|
||||
except (RuntimeError, Exception):
|
||||
except (RuntimeError, Exception) as e:
|
||||
logging.error(
|
||||
f"ZoneOutageScenarioPlugin scenario {scenario} failed with exception: {e}"
|
||||
)
|
||||
return 1
|
||||
else:
|
||||
return 0
|
||||
|
||||
def node_based_zone(self, scenario_config: dict[str, any], kubecli: KrknKubernetes):
|
||||
zone = scenario_config["zone"]
|
||||
duration = get_yaml_item_value(scenario_config, "duration", 60)
|
||||
timeout = get_yaml_item_value(scenario_config, "timeout", 180)
|
||||
label_selector = f"topology.kubernetes.io/zone={zone}"
|
||||
try:
|
||||
# get list of nodes in zone/region
|
||||
nodes = kubecli.list_killable_nodes(label_selector)
|
||||
# stop nodes in parallel
|
||||
pool = ThreadPool(processes=len(nodes))
|
||||
|
||||
pool.starmap(
|
||||
self.cloud_object.node_stop_scenario,zip(repeat(1), nodes, repeat(timeout))
|
||||
)
|
||||
|
||||
pool.close()
|
||||
|
||||
logging.info(
|
||||
"Waiting for the specified duration " "in the config: %s" % duration
|
||||
)
|
||||
time.sleep(duration)
|
||||
|
||||
# start nodes in parallel
|
||||
pool = ThreadPool(processes=len(nodes))
|
||||
pool.starmap(
|
||||
self.cloud_object.node_start_scenario,zip(repeat(1), nodes, repeat(timeout))
|
||||
)
|
||||
pool.close()
|
||||
except Exception as e:
|
||||
logging.info(
|
||||
f"Node based zone outage scenario failed with exception: {e}"
|
||||
)
|
||||
return 1
|
||||
else:
|
||||
return 0
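
For readers unfamiliar with the starmap/zip/repeat combination used twice above: zip(repeat(1), nodes, repeat(timeout)) yields one (1, node, timeout) tuple per node, so each worker thread runs a single-iteration stop (or start) scenario against exactly one node. A tiny self-contained illustration:

from itertools import repeat
from multiprocessing.pool import ThreadPool

def fake_scenario(kill_count, node, timeout):
    return f"kill_count={kill_count} node={node} timeout={timeout}"

nodes = ["node-a", "node-b"]
pool = ThreadPool(processes=len(nodes))
results = pool.starmap(fake_scenario, zip(repeat(1), nodes, repeat(180)))
pool.close()
# results == ["kill_count=1 node=node-a timeout=180",
#             "kill_count=1 node=node-b timeout=180"]
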
|
||||
|
||||
def network_based_zone(self, scenario_config: dict[str, any]):
|
||||
|
||||
vpc_id = scenario_config["vpc_id"]
|
||||
subnet_ids = scenario_config["subnet_id"]
|
||||
duration = scenario_config["duration"]
|
||||
# Add support for user-provided default network ACL
|
||||
default_acl_id = scenario_config.get("default_acl_id")
|
||||
ids = {}
|
||||
acl_ids_created = []
|
||||
for subnet_id in subnet_ids:
|
||||
logging.info("Targeting subnet_id")
|
||||
network_association_ids = []
|
||||
associations, original_acl_id = self.cloud_object.describe_network_acls(
|
||||
vpc_id, subnet_id
|
||||
)
|
||||
for entry in associations:
|
||||
if entry["SubnetId"] == subnet_id:
|
||||
network_association_ids.append(
|
||||
entry["NetworkAclAssociationId"]
|
||||
)
|
||||
logging.info(
|
||||
"Network association ids associated with "
|
||||
"the subnet %s: %s" % (subnet_id, network_association_ids)
|
||||
)
|
||||
|
||||
# Use provided default ACL if available, otherwise create a new one
|
||||
if default_acl_id:
|
||||
acl_id = default_acl_id
|
||||
logging.info(
|
||||
"Using provided default ACL ID %s - this ACL will not be deleted after the scenario",
|
||||
default_acl_id
|
||||
)
|
||||
# Don't add to acl_ids_created since we don't want to delete user-provided ACLs at cleanup
|
||||
else:
|
||||
acl_id = self.cloud_object.create_default_network_acl(vpc_id)
|
||||
logging.info("Created new default ACL %s", acl_id)
|
||||
acl_ids_created.append(acl_id)
|
||||
|
||||
new_association_id = self.cloud_object.replace_network_acl_association(
|
||||
network_association_ids[0], acl_id
|
||||
)
|
||||
|
||||
# capture the original_acl_id, created_acl_id and
|
||||
# new association_id to use during the recovery
|
||||
ids[new_association_id] = original_acl_id
|
||||
|
||||
# wait for the specified duration
|
||||
logging.info(
|
||||
"Waiting for the specified duration " "in the config: %s" % duration
|
||||
)
|
||||
time.sleep(duration)
|
||||
|
||||
# replace the applied acl with the previous acl in use
|
||||
for new_association_id, original_acl_id in ids.items():
|
||||
self.cloud_object.replace_network_acl_association(
|
||||
new_association_id, original_acl_id
|
||||
)
|
||||
logging.info(
|
||||
"Wating for 60 seconds to make sure " "the changes are in place"
|
||||
)
|
||||
time.sleep(60)
|
||||
|
||||
# delete the network acl created for the run
|
||||
for acl_id in acl_ids_created:
|
||||
self.cloud_object.delete_network_acl(acl_id)
|
||||
|
||||
|
||||
def get_scenario_types(self) -> list[str]:
|
||||
return ["zone_outages_scenarios"]
|
||||
|
||||
krkn/utils/HealthChecker.py (new file, 83 lines)
@@ -0,0 +1,83 @@
import requests
import time
import logging
import queue
from datetime import datetime
from krkn_lib.models.telemetry.models import HealthCheck


class HealthChecker:
    current_iterations: int = 0
    ret_value = 0

    def __init__(self, iterations):
        self.iterations = iterations

    def make_request(self, url, auth=None, headers=None):
        response_data = {}
        response = requests.get(url, auth=auth, headers=headers)
        response_data["url"] = url
        response_data["status"] = response.status_code == 200
        response_data["status_code"] = response.status_code
        return response_data

    def run_health_check(self, health_check_config, health_check_telemetry_queue: queue.Queue):
        if health_check_config and health_check_config.get("config") and any(
            config.get("url") for config in health_check_config["config"]
        ):
            health_check_start_time_stamp = datetime.now()
            health_check_telemetry = []
            # URLs currently failing, keyed by url -> first-failure details
            health_check_tracker = {}
            interval = health_check_config.get("interval") or 2
            # True while a URL has never failed during the run
            response_tracker = {config["url"]: True for config in health_check_config["config"]}
            while self.current_iterations < self.iterations:
                for config in health_check_config["config"]:
                    auth, headers = None, None
                    url = config.get("url")
                    if not url:
                        continue
                    if config.get("bearer_token"):
                        headers = {"Authorization": "Bearer " + config["bearer_token"]}
                    if config.get("auth"):
                        auth = config["auth"]
                    response = self.make_request(url, auth, headers)

                    if response["status_code"] != 200:
                        # Open a downtime window on the first failure
                        if url not in health_check_tracker:
                            health_check_tracker[url] = {
                                "status_code": response["status_code"],
                                "start_timestamp": datetime.now(),
                            }
                        response_tracker[url] = False
                        if config.get("exit_on_failure") and self.ret_value == 0:
                            self.ret_value = 2
                    elif url in health_check_tracker:
                        # URL recovered: close the downtime window and record it
                        end_timestamp = datetime.now()
                        start_timestamp = health_check_tracker[url]["start_timestamp"]
                        previous_status_code = str(health_check_tracker[url]["status_code"])
                        duration = (end_timestamp - start_timestamp).total_seconds()
                        downtime_record = {
                            "url": url,
                            "status": False,
                            "status_code": previous_status_code,
                            "start_timestamp": start_timestamp.isoformat(),
                            "end_timestamp": end_timestamp.isoformat(),
                            "duration": duration,
                        }
                        health_check_telemetry.append(HealthCheck(downtime_record))
                        del health_check_tracker[url]
                time.sleep(interval)
            health_check_end_time_stamp = datetime.now()
            # URLs that never failed get a single success record spanning the run
            for url, status in response_tracker.items():
                if status:
                    duration = (health_check_end_time_stamp - health_check_start_time_stamp).total_seconds()
                    success_response = {
                        "url": url,
                        "status": True,
                        "status_code": 200,
                        "start_timestamp": health_check_start_time_stamp.isoformat(),
                        "end_timestamp": health_check_end_time_stamp.isoformat(),
                        "duration": duration,
                    }
                    health_check_telemetry.append(HealthCheck(success_response))
            health_check_telemetry_queue.put(health_check_telemetry)
        else:
            logging.info("health checks config is not defined, skipping them")
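
A minimal sketch of how this class is driven, mirroring the wiring added to run_kraken below: the checker runs on its own thread, the chaos loop bumps current_iterations as each iteration completes, and the telemetry comes back through the queue (the endpoint URL is a hypothetical placeholder):

import queue
import threading

health_check_config = {
    "interval": 2,
    "config": [
        {
            "url": "http://my-app.example/health",  # hypothetical endpoint
            "bearer_token": None,
            "auth": None,
            "exit_on_failure": True,
        },
    ],
}

q = queue.Queue()
checker = HealthChecker(iterations=1)
worker = threading.Thread(target=checker.run_health_check, args=(health_check_config, q))
worker.start()

# ... run the chaos iteration(s) ...
checker.current_iterations += 1  # lets the polling loop exit

worker.join()
telemetry = q.get_nowait()  # list of HealthCheck records
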
@@ -3,10 +3,10 @@ from krkn_lib.k8s import KrknKubernetes
|
||||
from krkn_lib.models.telemetry import ScenarioTelemetry
|
||||
from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift
|
||||
from tzlocal.unix import get_localzone
|
||||
|
||||
import logging
|
||||
|
||||
def populate_cluster_events(
|
||||
scenario_telemetry: ScenarioTelemetry,
|
||||
krkn_config: dict,
|
||||
scenario_config: dict,
|
||||
kubecli: KrknKubernetes,
|
||||
start_timestamp: int,
|
||||
@@ -31,8 +31,12 @@ def populate_cluster_events(
|
||||
namespace=namespace,
|
||||
)
|
||||
)
|
||||
|
||||
scenario_telemetry.set_cluster_events(events)
|
||||
archive_path = krkn_config["telemetry"]["archive_path"]
|
||||
file_path = archive_path + "/events.json"
|
||||
with open(file_path, "w+") as f:
|
||||
f.write("\n".join(str(item) for item in events))
|
||||
logging.info(f'Cluster events written to file {file_path}')
|
||||
|
||||
|
||||
|
||||
def collect_and_put_ocp_logs(
|
||||
|
||||
@@ -1,25 +1,24 @@
aliyun-python-sdk-core==2.13.36
aliyun-python-sdk-ecs==4.24.25
arcaflow-plugin-sdk==0.14.0
arcaflow==0.19.1
boto3==1.28.61
azure-identity==1.16.1
azure-keyvault==4.2.0
azure-mgmt-compute==30.5.0
itsdangerous==2.0.1
coverage==7.4.1
coverage==7.6.12
datetime==5.4
docker==7.0.0
gitpython==3.1.41
google-api-python-client==2.116.0
google-auth==2.37.0
google-cloud-compute==1.22.0
ibm_cloud_sdk_core==3.18.0
ibm_vpc==0.20.0
jinja2==3.1.4
krkn-lib==4.0.4
jinja2==3.1.6
krkn-lib==5.0.0
lxml==5.1.0
kubernetes==28.1.0
numpy==1.26.4
oauth2client==4.1.3
pandas==2.2.0
openshift-client==1.0.21
paramiko==3.4.0
@@ -36,6 +35,7 @@ werkzeug==3.0.6
wheel==0.42.0
zope.interface==5.4.0


git+https://github.com/krkn-chaos/arcaflow-plugin-kill-pod.git@v0.1.0
git+https://github.com/vmware/vsphere-automation-sdk-python.git@v8.0.0.0
cryptography>=42.0.4 # not directly required, pinned by Snyk to avoid a vulnerability
|
||||
@@ -9,6 +9,8 @@ import optparse
|
||||
import pyfiglet
|
||||
import uuid
|
||||
import time
|
||||
import queue
|
||||
import threading
|
||||
|
||||
from krkn_lib.elastic.krkn_elastic import KrknElastic
|
||||
from krkn_lib.models.elastic import ElasticChaosRunTelemetry
|
||||
@@ -26,11 +28,16 @@ from krkn_lib.utils import SafeLogger
|
||||
from krkn_lib.utils.functions import get_yaml_item_value, get_junit_test_case
|
||||
|
||||
from krkn.utils import TeeLogHandler
|
||||
from krkn.utils.HealthChecker import HealthChecker
|
||||
from krkn.scenario_plugins.scenario_plugin_factory import (
|
||||
ScenarioPluginFactory,
|
||||
ScenarioPluginNotFound,
|
||||
)
|
||||
|
||||
# removes TripleDES warning
|
||||
import warnings
|
||||
warnings.filterwarnings(action='ignore', module='.*paramiko.*')
|
||||
|
||||
report_file = ""
|
||||
|
||||
|
||||
@@ -45,9 +52,7 @@ def main(cfg) -> int:
|
||||
with open(cfg, "r") as f:
|
||||
config = yaml.full_load(f)
|
||||
global kubeconfig_path, wait_duration, kraken_config
|
||||
distribution = get_yaml_item_value(
|
||||
config["kraken"], "distribution", "openshift"
|
||||
)
|
||||
|
||||
kubeconfig_path = os.path.expanduser(
|
||||
get_yaml_item_value(config["kraken"], "kubeconfig_path", "")
|
||||
)
|
||||
@@ -86,13 +91,6 @@ def main(cfg) -> int:
|
||||
)
|
||||
# elastic search
|
||||
enable_elastic = get_yaml_item_value(config["elastic"], "enable_elastic", False)
|
||||
elastic_collect_metrics = get_yaml_item_value(
|
||||
config["elastic"], "collect_metrics", False
|
||||
)
|
||||
|
||||
elastic_collect_alerts = get_yaml_item_value(
|
||||
config["elastic"], "collect_alerts", False
|
||||
)
|
||||
|
||||
elastic_url = get_yaml_item_value(config["elastic"], "elastic_url", "")
|
||||
|
||||
@@ -123,10 +121,11 @@ def main(cfg) -> int:
|
||||
config["performance_monitoring"], "check_critical_alerts", False
|
||||
)
|
||||
telemetry_api_url = config["telemetry"].get("api_url")
|
||||
health_check_config = config["health_checks"]
|
||||
|
||||
# Initialize clients
|
||||
if not os.path.isfile(kubeconfig_path) and not os.path.isfile(
|
||||
"/var/run/secrets/kubernetes.io/serviceaccount/token"
|
||||
"/var/run/secrets/kubernetes.io/serviceaccount/token"
|
||||
):
|
||||
logging.error(
|
||||
"Cannot read the kubeconfig file at %s, please check" % kubeconfig_path
|
||||
@@ -163,6 +162,11 @@ def main(cfg) -> int:
|
||||
except:
|
||||
kubecli.initialize_clients(None)
|
||||
|
||||
distribution = "kubernetes"
|
||||
if ocpcli.is_openshift():
|
||||
distribution = "openshift"
|
||||
logging.info("Detected distribution %s" % (distribution))
|
||||
|
||||
# find node kraken might be running on
|
||||
kubecli.find_kraken_node()
|
||||
|
||||
@@ -199,7 +203,7 @@ def main(cfg) -> int:
|
||||
else:
|
||||
# If can't make a connection, set alerts to false
|
||||
enable_alerts = False
|
||||
critical_alerts = False
|
||||
check_critical_alerts = False
|
||||
except Exception:
|
||||
logging.error(
|
||||
"invalid distribution selected, running openshift scenarios against kubernetes cluster."
|
||||
@@ -219,6 +223,7 @@ def main(cfg) -> int:
|
||||
safe_logger, ocpcli, telemetry_request_id, config["telemetry"]
|
||||
)
|
||||
if enable_elastic:
|
||||
logging.info(f"Elastic collection enabled at: {elastic_url}:{elastic_port}")
|
||||
elastic_search = KrknElastic(
|
||||
safe_logger,
|
||||
elastic_url,
|
||||
@@ -267,8 +272,8 @@ def main(cfg) -> int:
|
||||
classes_and_types: dict[str, list[str]] = {}
|
||||
for loaded in scenario_plugin_factory.loaded_plugins.keys():
|
||||
if (
|
||||
scenario_plugin_factory.loaded_plugins[loaded].__name__
|
||||
not in classes_and_types.keys()
|
||||
scenario_plugin_factory.loaded_plugins[loaded].__name__
|
||||
not in classes_and_types.keys()
|
||||
):
|
||||
classes_and_types[
|
||||
scenario_plugin_factory.loaded_plugins[loaded].__name__
|
||||
@@ -295,6 +300,12 @@ def main(cfg) -> int:
|
||||
module_name, class_name, error = failed
|
||||
logging.error(f"⛔ Class: {class_name} Module: {module_name}")
|
||||
logging.error(f"⚠️ {error}\n")
|
||||
health_check_telemetry_queue = queue.Queue()
|
||||
health_checker = HealthChecker(iterations)
|
||||
health_check_worker = threading.Thread(target=health_checker.run_health_check,
|
||||
args=(health_check_config, health_check_telemetry_queue))
|
||||
health_check_worker.start()
|
||||
|
||||
# Loop to run the chaos starts here
|
||||
while int(iteration) < iterations and run_signal != "STOP":
|
||||
# Inject chaos scenarios specified in the config
|
||||
@@ -355,12 +366,18 @@ def main(cfg) -> int:
|
||||
break
|
||||
|
||||
iteration += 1
|
||||
health_checker.current_iterations += 1
|
||||
|
||||
# telemetry
|
||||
# in order to print decoded telemetry data even if telemetry collection
|
||||
# is disabled, it's necessary to serialize the ChaosRunTelemetry object
|
||||
# to json, and recreate a new object from it.
|
||||
end_time = int(time.time())
|
||||
health_check_worker.join()
|
||||
try:
|
||||
chaos_telemetry.health_checks = health_check_telemetry_queue.get_nowait()
|
||||
except queue.Empty:
|
||||
chaos_telemetry.health_checks = None
|
||||
|
||||
# if platform is openshift will be collected
|
||||
# Cloud platform and network plugins metadata
|
||||
@@ -415,9 +432,9 @@ def main(cfg) -> int:
|
||||
)
|
||||
else:
|
||||
if (
|
||||
config["telemetry"]["prometheus_namespace"]
|
||||
and config["telemetry"]["prometheus_pod_name"]
|
||||
and config["telemetry"]["prometheus_container_name"]
|
||||
config["telemetry"]["prometheus_namespace"]
|
||||
and config["telemetry"]["prometheus_pod_name"]
|
||||
and config["telemetry"]["prometheus_container_name"]
|
||||
):
|
||||
try:
|
||||
prometheus_archive_files = (
|
||||
@@ -466,8 +483,7 @@ def main(cfg) -> int:
|
||||
start_time,
|
||||
end_time,
|
||||
alert_profile,
|
||||
elastic_collect_alerts,
|
||||
elastic_alerts_index,
|
||||
elastic_alerts_index
|
||||
)
|
||||
|
||||
else:
|
||||
@@ -475,15 +491,15 @@ def main(cfg) -> int:
|
||||
return 1
|
||||
# sys.exit(1)
|
||||
if enable_metrics:
|
||||
logging.info(f'Capturing metrics using file {metrics_profile}')
|
||||
prometheus_plugin.metrics(
|
||||
prometheus,
|
||||
elastic_search,
|
||||
start_time,
|
||||
run_uuid,
|
||||
start_time,
|
||||
end_time,
|
||||
metrics_profile,
|
||||
elastic_collect_metrics,
|
||||
elastic_metrics_index,
|
||||
elastic_metrics_index
|
||||
)
|
||||
|
||||
if post_critical_alerts > 0:
|
||||
@@ -497,6 +513,9 @@ def main(cfg) -> int:
|
||||
)
|
||||
# sys.exit(2)
|
||||
return 2
|
||||
if health_checker.ret_value != 0:
|
||||
logging.error("Health check failed for the applications, Please check; exiting")
|
||||
return health_checker.ret_value
|
||||
|
||||
logging.info(
|
||||
"Successfully finished running Kraken. UUID for the run: "
|
||||
@@ -639,4 +658,4 @@ if __name__ == "__main__":
|
||||
with open(junit_testcase_file_path, "w") as stream:
|
||||
stream.write(junit_testcase_xml)
|
||||
|
||||
sys.exit(retval)
|
||||
sys.exit(retval)
|
||||
scenarios/kube/cpu-hog.yml (new file, 9 lines)
@@ -0,0 +1,9 @@
duration: 60
workers: ''  # leave empty ('') for node CPU auto-detection
hog-type: cpu
image: quay.io/krkn-chaos/krkn-hog
namespace: default
cpu-load-percentage: 90
cpu-method: all
node-selector: "node-role.kubernetes.io/worker="
number-of-nodes: 2
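
To wire one of these files into a run, the main config references it under the hog scenario type. A sketch, assuming the scenario type key is hog_scenarios; check the shipped config/config.yaml for the authoritative key:

kraken:
  chaos_scenarios:
    - hog_scenarios:
        - scenarios/kube/cpu-hog.yml
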
@@ -1,12 +0,0 @@
|
||||
---
|
||||
deployers:
|
||||
image:
|
||||
connection: {}
|
||||
deployer_name: kubernetes
|
||||
log:
|
||||
level: error
|
||||
logged_outputs:
|
||||
error:
|
||||
level: error
|
||||
success:
|
||||
level: debug
|
||||
@@ -1,13 +0,0 @@
|
||||
input_list:
|
||||
- cpu_count: 1
|
||||
cpu_load_percentage: 80
|
||||
cpu_method: all
|
||||
duration: 30
|
||||
kubeconfig: ''
|
||||
namespace: default
|
||||
# set the node selector as a key-value pair eg.
|
||||
# node_selector:
|
||||
# kubernetes.io/hostname: kind-worker2
|
||||
node_selector: {}
|
||||
|
||||
|
||||
@@ -1,98 +0,0 @@
|
||||
version: v0.2.0
|
||||
input:
|
||||
root: SubRootObject
|
||||
objects:
|
||||
SubRootObject:
|
||||
id: SubRootObject
|
||||
properties:
|
||||
kubeconfig:
|
||||
display:
|
||||
description: The complete kubeconfig file as a string
|
||||
name: Kubeconfig file contents
|
||||
type:
|
||||
type_id: string
|
||||
required: true
|
||||
namespace:
|
||||
display:
|
||||
description: The namespace where the container will be deployed
|
||||
name: Namespace
|
||||
type:
|
||||
type_id: string
|
||||
required: true
|
||||
node_selector:
|
||||
display:
|
||||
description: kubernetes node name where the plugin must be deployed
|
||||
type:
|
||||
type_id: map
|
||||
values:
|
||||
type_id: string
|
||||
keys:
|
||||
type_id: string
|
||||
required: true
|
||||
duration:
|
||||
display:
|
||||
name: duration the scenario expressed in seconds
|
||||
description: stop stress test after T seconds. One can also specify the units of time in
|
||||
seconds, minutes, hours, days or years with the suffix s, m, h, d or y
|
||||
type:
|
||||
type_id: integer
|
||||
required: true
|
||||
cpu_count:
|
||||
display:
|
||||
description: Number of CPU cores to be used (0 means all)
|
||||
name: number of CPUs
|
||||
type:
|
||||
type_id: integer
|
||||
required: true
|
||||
cpu_method:
|
||||
display:
|
||||
description: CPU stress method
|
||||
name: fine grained control of which cpu stressors to use (ackermann, cfloat etc.)
|
||||
type:
|
||||
type_id: string
|
||||
required: true
|
||||
cpu_load_percentage:
|
||||
display:
|
||||
description: load CPU by percentage
|
||||
name: CPU load
|
||||
type:
|
||||
type_id: integer
|
||||
required: true
|
||||
|
||||
steps:
|
||||
kubeconfig:
|
||||
plugin:
|
||||
src: quay.io/arcalot/arcaflow-plugin-kubeconfig:0.2.0
|
||||
deployment_type: image
|
||||
input:
|
||||
kubeconfig: !expr $.input.kubeconfig
|
||||
stressng:
|
||||
plugin:
|
||||
src: quay.io/arcalot/arcaflow-plugin-stressng:0.6.0
|
||||
deployment_type: image
|
||||
step: workload
|
||||
input:
|
||||
cleanup: "true"
|
||||
|
||||
timeout: !expr $.input.duration
|
||||
stressors:
|
||||
- stressor: cpu
|
||||
workers: !expr $.input.cpu_count
|
||||
cpu-method: "all"
|
||||
cpu-load: !expr $.input.cpu_load_percentage
|
||||
deploy:
|
||||
deployer_name: kubernetes
|
||||
connection: !expr $.steps.kubeconfig.outputs.success.connection
|
||||
pod:
|
||||
metadata:
|
||||
namespace: !expr $.input.namespace
|
||||
labels:
|
||||
arcaflow: stressng
|
||||
spec:
|
||||
nodeSelector: !expr $.input.node_selector
|
||||
pluginContainer:
|
||||
imagePullPolicy: Always
|
||||
outputs:
|
||||
success:
|
||||
stressng: !expr $.steps.stressng.outputs.success
|
||||
|
||||
@@ -1,25 +0,0 @@
|
||||
version: v0.2.0
|
||||
input:
|
||||
root: RootObject
|
||||
objects:
|
||||
RootObject:
|
||||
id: RootObject
|
||||
properties:
|
||||
input_list:
|
||||
type:
|
||||
type_id: list
|
||||
items:
|
||||
id: SubRootObject
|
||||
type_id: ref
|
||||
namespace: $.steps.workload_loop.execute.inputs.items
|
||||
|
||||
steps:
|
||||
workload_loop:
|
||||
kind: foreach
|
||||
items: !expr $.input.input_list
|
||||
workflow: sub-workflow.yaml
|
||||
parallelism: 1000
|
||||
outputs:
|
||||
success:
|
||||
workloads: !expr $.steps.workload_loop.outputs.success.data
|
||||
|
||||
scenarios/kube/io-hog.yml (new file, 14 lines)
@@ -0,0 +1,14 @@
duration: 30
workers: ''  # leave empty ('') for node CPU auto-detection
hog-type: io
image: quay.io/krkn-chaos/krkn-hog
namespace: default
io-block-size: 1m
io-write-bytes: 1g
io-target-pod-folder: /hog-data
io-target-pod-volume:
  name: node-volume
  hostPath:
    path: /root  # a path writable by kubelet in the root filesystem of the node
node-selector: "node-role.kubernetes.io/worker="
number-of-nodes: ''
@@ -1,11 +0,0 @@
|
||||
deployers:
|
||||
image:
|
||||
connection: {}
|
||||
deployer_name: kubernetes
|
||||
log:
|
||||
level: error
|
||||
logged_outputs:
|
||||
error:
|
||||
level: error
|
||||
success:
|
||||
level: debug
|
||||
@@ -1,16 +0,0 @@
|
||||
input_list:
|
||||
- duration: 30
|
||||
io_block_size: 1m
|
||||
io_workers: 1
|
||||
io_write_bytes: 10m
|
||||
kubeconfig: ''
|
||||
namespace: default
|
||||
# set the node selector as a key-value pair eg.
|
||||
# node_selector:
|
||||
# kubernetes.io/hostname: kind-worker2
|
||||
node_selector: {}
|
||||
target_pod_folder: /hog-data
|
||||
target_pod_volume:
|
||||
hostPath:
|
||||
path: /tmp
|
||||
name: node-volume
|
||||
@@ -1,141 +0,0 @@
|
||||
version: v0.2.0
|
||||
input:
|
||||
root: SubRootObject
|
||||
objects:
|
||||
hostPath:
|
||||
id: HostPathVolumeSource
|
||||
properties:
|
||||
path:
|
||||
type:
|
||||
type_id: string
|
||||
Volume:
|
||||
id: Volume
|
||||
properties:
|
||||
name:
|
||||
type:
|
||||
type_id: string
|
||||
hostPath:
|
||||
type:
|
||||
id: hostPath
|
            type_id: ref
    SubRootObject:
      id: SubRootObject
      properties:
        kubeconfig:
          display:
            description: The complete kubeconfig file as a string
            name: Kubeconfig file contents
          type:
            type_id: string
          required: true
        namespace:
          display:
            description: The namespace where the container will be deployed
            name: Namespace
          type:
            type_id: string
          required: true
        node_selector:
          display:
            description: kubernetes node name where the plugin must be deployed
          type:
            type_id: map
            values:
              type_id: string
            keys:
              type_id: string
          required: true
        duration:
          display:
            name: duration of the scenario expressed in seconds
            description: stop the stress test after T seconds. One can also specify the units of time in seconds, minutes, hours, days or years with the suffix s, m, h, d or y
          type:
            type_id: integer
          required: true
        io_workers:
          display:
            description: number of workers
            name: start N workers continually writing, reading and removing temporary files
          type:
            type_id: integer
          required: true
        io_block_size:
          display:
            description: single write size
            name: specify the size of each write in bytes. Size can be from 1 byte to 4MB.
          type:
            type_id: string
          required: true
        io_write_bytes:
          display:
            description: Total number of bytes written
            name: write N bytes for each hdd process, the default is 1 GB. One can specify the size as % of free space on the file system or in units of Bytes, KBytes, MBytes and GBytes using the suffix b, k, m or g
          type:
            type_id: string
          required: true
        target_pod_folder:
          display:
            description: Target Folder
            name: Folder in the pod where the test will be executed and the test files will be written
          type:
            type_id: string
          required: true
        target_pod_volume:
          display:
            name: kubernetes volume definition
            description: the volume that will be attached to the pod. In order to stress the node storage, only hostPath mode is currently supported
          type:
            type_id: ref
            id: Volume
          required: true

steps:
  kubeconfig:
    plugin:
      src: quay.io/arcalot/arcaflow-plugin-kubeconfig:0.2.0
      deployment_type: image
    input:
      kubeconfig: !expr $.input.kubeconfig
  stressng:
    plugin:
      src: quay.io/arcalot/arcaflow-plugin-stressng:0.6.0
      deployment_type: image
    step: workload
    input:
      cleanup: "true"
      timeout: !expr $.input.duration
      workdir: !expr $.input.target_pod_folder
      stressors:
        - stressor: hdd
          workers: !expr $.input.io_workers
          hdd-bytes: !expr $.input.io_write_bytes
          hdd-write-size: !expr $.input.io_block_size
    deploy:
      deployer_name: kubernetes
      connection: !expr $.steps.kubeconfig.outputs.success.connection
      pod:
        metadata:
          namespace: !expr $.input.namespace
          labels:
            arcaflow: stressng
        spec:
          nodeSelector: !expr $.input.node_selector
          pluginContainer:
            imagePullPolicy: Always
            securityContext:
              privileged: true
            volumeMounts:
              - mountPath: /hog-data
                name: node-volume
          volumes:
            - !expr $.input.target_pod_volume

outputs:
  success:
    stressng: !expr $.steps.stressng.outputs.success
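Taken together, the schema above implies a per-run input shaped like the following; a minimal sketch, assuming a kind worker node and a writable host path (both hypothetical placeholders, not values from this diff):

kubeconfig: ""
namespace: default
node_selector:
  kubernetes.io/hostname: kind-worker   # hypothetical node name
duration: 30
io_workers: 1
io_block_size: 1m
io_write_bytes: 10m
target_pod_folder: /hog-data
target_pod_volume:
  name: node-volume                     # must match the volumeMounts name above
  hostPath:
    path: /root                         # hypothetical host directory to stress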
@@ -1,26 +0,0 @@
version: v0.2.0
input:
  root: RootObject
  objects:
    RootObject:
      id: RootObject
      properties:
        input_list:
          type:
            type_id: list
            items:
              id: SubRootObject
              type_id: ref
              namespace: $.steps.workload_loop.execute.inputs.items
steps:
  workload_loop:
    kind: foreach
    items: !expr $.input.input_list
    workflow: sub-workflow.yaml
    parallelism: 1000
outputs:
  success:
    workloads: !expr $.steps.workload_loop.outputs.success.data
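The foreach step fans each element of input_list out to its own run of sub-workflow.yaml, up to 1000 in parallel. A hedged sketch of a top-level input that would stress two nodes concurrently (both node hostnames are hypothetical):

input_list:
  - kubeconfig: ""
    namespace: default
    node_selector:
      kubernetes.io/hostname: kind-worker
    duration: 30
    io_workers: 1
    io_block_size: 1m
    io_write_bytes: 10m
    target_pod_folder: /hog-data
    target_pod_volume:
      name: node-volume
      hostPath:
        path: /root
  - kubeconfig: ""
    namespace: default
    node_selector:
      kubernetes.io/hostname: kind-worker2
    duration: 30
    io_workers: 1
    io_block_size: 1m
    io_write_bytes: 10m
    target_pod_folder: /hog-data
    target_pod_volume:
      name: node-volume
      hostPath:
        path: /root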
8 scenarios/kube/memory-hog.yml Normal file
@@ -0,0 +1,8 @@
duration: 60
workers: ''  # leave empty '' for node CPU auto-detection
hog-type: memory
image: quay.io/krkn-chaos/krkn-hog
namespace: default
memory-vm-bytes: 90%
node-selector: "node-role.kubernetes.io/worker="
number-of-nodes: ''
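To run this scenario through krkn it would be referenced from the global config; a hypothetical excerpt, assuming the hog scenario family is registered under a hog_scenarios scenario type (the key name is an assumption, not part of this diff):

kraken:
  chaos_scenarios:
    - hog_scenarios:
        - scenarios/kube/memory-hog.yml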
@@ -1,12 +0,0 @@
---
deployers:
  image:
    connection: {}
    deployer_name: kubernetes
log:
  level: error
logged_outputs:
  error:
    level: error
  success:
    level: debug
@@ -1,13 +0,0 @@
input_list:
  - duration: 30
    vm_bytes: 10%
    vm_workers: 2
    # set the node selector as a key-value pair eg.
    # node_selector:
    #   kubernetes.io/hostname: kind-worker2
    node_selector: { }
    kubeconfig: ""
    namespace: default

# duplicate this section to run simultaneous stressors in the same run (see the sketch below)
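Following that comment, a duplicated list entry would look like this hedged sketch; the node hostnames and the second entry's values are hypothetical:

input_list:
  - duration: 30
    vm_bytes: 10%
    vm_workers: 2
    node_selector:
      kubernetes.io/hostname: kind-worker
    kubeconfig: ""
    namespace: default
  - duration: 30
    vm_bytes: 25%
    vm_workers: 2
    node_selector:
      kubernetes.io/hostname: kind-worker2
    kubeconfig: ""
    namespace: default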
@@ -1,89 +0,0 @@
version: v0.2.0
input:
  root: SubRootObject
  objects:
    SubRootObject:
      id: SubRootObject
      properties:
        kubeconfig:
          display:
            description: The complete kubeconfig file as a string
            name: Kubeconfig file contents
          type:
            type_id: string
          required: true
        namespace:
          display:
            description: The namespace where the container will be deployed
            name: Namespace
          type:
            type_id: string
          required: true
        node_selector:
          display:
            description: kubernetes node name where the plugin must be deployed
          type:
            type_id: map
            values:
              type_id: string
            keys:
              type_id: string
          required: true
        duration:
          display:
            name: duration of the scenario expressed in seconds
            description: stop the stress test after T seconds. One can also specify the units of time in seconds, minutes, hours, days or years with the suffix s, m, h, d or y
          type:
            type_id: integer
          required: true
        vm_workers:
          display:
            description: Number of VM stressors to be run (0 means 1 stressor per CPU)
            name: Number of VM stressors
          type:
            type_id: integer
          required: true
        vm_bytes:
          display:
            description: N bytes per vm process, the default is 256MB. The size can be expressed in units of Bytes, KBytes, MBytes and GBytes using the suffix b, k, m or g.
            name: Bytes per VM stressor
          type:
            type_id: string
          required: true

steps:
  kubeconfig:
    plugin:
      src: quay.io/arcalot/arcaflow-plugin-kubeconfig:0.2.0
      deployment_type: image
    input:
      kubeconfig: !expr $.input.kubeconfig
  stressng:
    plugin:
      src: quay.io/arcalot/arcaflow-plugin-stressng:0.6.0
      deployment_type: image
    step: workload
    input:
      cleanup: "true"
      timeout: !expr $.input.duration
      stressors:
        - stressor: vm
          workers: !expr $.input.vm_workers
          vm-bytes: !expr $.input.vm_bytes
    deploy:
      deployer_name: kubernetes
      connection: !expr $.steps.kubeconfig.outputs.success.connection
      pod:
        metadata:
          namespace: !expr $.input.namespace
          labels:
            arcaflow: stressng
        spec:
          nodeSelector: !expr $.input.node_selector
          pluginContainer:
            imagePullPolicy: Always

outputs:
  success:
    stressng: !expr $.steps.stressng.outputs.success
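After expression evaluation against the sample input.yaml above (duration 30, vm_workers 2, vm_bytes 10%), the stressng step receives a concrete input like this sketch:

cleanup: "true"
timeout: 30
stressors:
  - stressor: vm
    workers: 2
    vm-bytes: 10%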
@@ -1,29 +0,0 @@
version: v0.2.0
input:
  root: RootObject
  objects:
    RootObject:
      id: RootObject
      properties:
        input_list:
          type:
            type_id: list
            items:
              id: SubRootObject
              type_id: ref
              namespace: $.steps.workload_loop.execute.inputs.items

steps:
  workload_loop:
    kind: foreach
    items: !expr $.input.input_list
    workflow: sub-workflow.yaml
    parallelism: 1000
outputs:
  success:
    workloads: !expr $.steps.workload_loop.outputs.success.data
13 scenarios/kube/network_filter.yml Normal file
@@ -0,0 +1,13 @@
- id: node_network_filter
  wait_duration: 300
  test_duration: 100
  label_selector: "kubernetes.io/hostname=ip-10-0-39-182.us-east-2.compute.internal"
  namespace: 'default'
  instance_count: 1
  execution: parallel
  ingress: false
  egress: true
  target: node
  interfaces: []
  ports:
    - 2049
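As configured, the scenario drops outbound traffic to port 2049 (NFS) on the selected node for test_duration seconds. A hedged variant of the same schema that filters inbound traffic on two ports instead (the label selector here is a generic placeholder, not from this diff):

- id: node_network_filter
  wait_duration: 300
  test_duration: 100
  label_selector: "node-role.kubernetes.io/worker="
  namespace: 'default'
  instance_count: 1
  execution: parallel
  ingress: true
  egress: false
  target: node
  interfaces: []
  ports:
    - 53
    - 443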
@@ -16,3 +16,10 @@ node_scenarios:
    instance_count: 1
    timeout: 120
    cloud_type: aws
  - actions:
      - node_disk_detach_attach_scenario
    node_name:
    label_selector:
    instance_count: 1
    timeout: 120
    cloud_type: aws
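The new entry leaves node_name and label_selector empty; in practice one of the two must identify the target node, as in this hedged variant that selects one worker by label:

  - actions:
      - node_disk_detach_attach_scenario
    node_name:
    label_selector: node-role.kubernetes.io/worker
    instance_count: 1
    timeout: 120
    cloud_type: aws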
@@ -1,10 +1,16 @@
# yaml-language-server: $schema=../plugin.schema.json
- id: <ibmcloud-node-terminate/ibmcloud-node-reboot/ibmcloud-node-stop/ibmcloud-node-start>
  config:
    name: ""
    label_selector: "node-role.kubernetes.io/worker"  # When node_name is not specified, a node with a matching label_selector is selected for node chaos scenario injection
    runs: 1  # Number of times to inject each scenario under actions (will perform on the same node each time)
    instance_count: 1  # Number of nodes matching the label selector on which to perform the action
    timeout: 360  # Duration to wait for completion of node scenario injection
    duration: 120  # Duration to stop the node before running the start action
    skip_openshift_checks: False  # Set to True if you don't want to wait for the node status to change on OpenShift before passing the scenario
node_scenarios:
  - actions:
      - node_stop_start_scenario
    node_name:
    label_selector: node-role.kubernetes.io/worker
    instance_count: 1
    timeout: 360
    duration: 120
    cloud_type: ibm
  - actions:
      - node_reboot_scenario
    node_name:
    label_selector: node-role.kubernetes.io/worker
    instance_count: 1
    timeout: 120
    cloud_type: ibm
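A hedged sketch of one more entry for this file, assuming the generic node_termination_scenario action is also supported for cloud_type ibm (the action name follows krkn's node scenario naming and is not confirmed by this diff):

  - actions:
      - node_termination_scenario
    node_name:
    label_selector: node-role.kubernetes.io/worker
    instance_count: 1
    timeout: 360
    cloud_type: ibm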
@@ -1,10 +1,17 @@
# yaml-language-server: $schema=../plugin.schema.json
- id: <vmware-node-stop/vmware-node-start/vmware-node-reboot/vmware-node-terminate>
  config:
    name: <node_name>  # Node on which the scenario has to be injected; multiple names can be set, separated by commas
    label_selector: <label_selector>  # When node_name is not specified, a node with a matching label_selector is selected for node chaos scenario injection
    runs: 1  # Number of times to inject each scenario under actions (will perform on the same node each time)
    instance_count: 1  # Number of nodes matching the label selector on which to perform the action
    timeout: 300  # Duration to wait for completion of node scenario injection
    verify_session: True  # Set to True to verify the vSphere client session using certificates; else False
    skip_openshift_checks: False  # Set to True if you don't want to wait for the node status to change on OpenShift before passing the scenario
node_scenarios:
  - actions:
      - node_reboot_scenario
    node_name:
    label_selector: node-role.kubernetes.io/worker
    instance_count: 1
    timeout: 120
    cloud_type: vmware
  - actions:
      - node_stop_start_scenario
    node_name:
    label_selector: node-role.kubernetes.io/worker
    instance_count: 1
    timeout: 360
    duration: 10
    cloud_type: vmware
    parallel: false
4 scenarios/openshift/zone_outage_gcp.yaml Normal file
@@ -0,0 +1,4 @@
zone_outage:      # Scenario to create an outage of a zone by tweaking the network ACL
  cloud_type: gcp # cloud type on which Kubernetes/OpenShift runs
  duration: 600   # duration in seconds after which the zone will be back online
  zone: <zone>    # the zone to take offline for the duration of the scenario
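To invoke this scenario from krkn's global config, it would be listed under the zone outage scenario type; a hypothetical excerpt, assuming the same zone_outages key used for the existing AWS variant:

kraken:
  chaos_scenarios:
    - zone_outages:
        - scenarios/openshift/zone_outage_gcp.yaml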
Some files were not shown because too many files have changed in this diff.