mirror of
https://github.com/krkn-chaos/krkn.git
synced 2026-02-19 20:40:33 +00:00
Compare commits
32 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
edd0159251 | ||
|
|
cf9f7702ed | ||
|
|
cfe624f153 | ||
|
|
62f50db195 | ||
|
|
aee838d3ac | ||
|
|
3b4d8a13f9 | ||
|
|
a86bb6ab95 | ||
|
|
7f0110972b | ||
|
|
126f4ebb35 | ||
|
|
83d99bbb02 | ||
|
|
2624102d65 | ||
|
|
02587bcbe6 | ||
|
|
c51bf04f9e | ||
|
|
41195b1a60 | ||
|
|
ab80acbee7 | ||
|
|
3573d13ea9 | ||
|
|
9c5251d52f | ||
|
|
a0bba27edc | ||
|
|
0d0143d1e0 | ||
|
|
0004c05f81 | ||
|
|
57a747a34a | ||
|
|
22108ae4e7 | ||
|
|
cecaa1eda3 | ||
|
|
5450ecb914 | ||
|
|
cad6b68f43 | ||
|
|
0eba329305 | ||
|
|
ce8593f2f0 | ||
|
|
9061ddbb5b | ||
|
|
dd4d0d0389 | ||
|
|
0cabe5e91d | ||
|
|
32fe0223ff | ||
|
|
a25736ad08 |
10
.github/PULL_REQUEST_TEMPLATE.md
vendored
Normal file
10
.github/PULL_REQUEST_TEMPLATE.md
vendored
Normal file
@@ -0,0 +1,10 @@
|
||||
## Description
|
||||
<!-- Provide a brief description of the changes made in this PR. -->
|
||||
|
||||
## Documentation
|
||||
- [ ] **Is documentation needed for this update?**
|
||||
|
||||
If checked, a documentation PR must be created and merged in the [website repository](https://github.com/krkn-chaos/website/).
|
||||
|
||||
## Related Documentation PR (if applicable)
|
||||
<!-- Add the link to the corresponding documentation PR in the website repository -->
|
||||
45
.github/workflows/require-docs.yml
vendored
Normal file
45
.github/workflows/require-docs.yml
vendored
Normal file
@@ -0,0 +1,45 @@
|
||||
name: Require Documentation Update
|
||||
on:
|
||||
pull_request:
|
||||
types: [opened, edited, synchronize]
|
||||
branches:
|
||||
- main
|
||||
jobs:
|
||||
check-docs:
|
||||
name: Check Documentation Update
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Checkout repository
|
||||
uses: actions/checkout@v4
|
||||
|
||||
- name: Check if Documentation is Required
|
||||
id: check_docs
|
||||
run: |
|
||||
echo "Checking PR body for documentation checkbox..."
|
||||
# Read the PR body from the GitHub event payload
|
||||
if echo "${{ github.event.pull_request.body }}" | grep -qi '\[x\].*documentation needed'; then
|
||||
echo "Documentation required detected."
|
||||
echo "docs_required=true" >> $GITHUB_OUTPUT
|
||||
else
|
||||
echo "Documentation not required."
|
||||
echo "docs_required=false" >> $GITHUB_OUTPUT
|
||||
fi
|
||||
|
||||
- name: Enforce Documentation Update (if required)
|
||||
if: steps.check_docs.outputs.docs_required == 'true'
|
||||
env:
|
||||
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
|
||||
run: |
|
||||
# Retrieve feature branch and repository owner from the GitHub context
|
||||
FEATURE_BRANCH="${{ github.head_ref }}"
|
||||
REPO_OWNER="${{ github.repository_owner }}"
|
||||
WEBSITE_REPO="website"
|
||||
echo "Searching for a merged documentation PR for feature branch: $FEATURE_BRANCH in $REPO_OWNER/$WEBSITE_REPO..."
|
||||
MERGED_PR=$(gh pr list --repo "$REPO_OWNER/$WEBSITE_REPO" --state merged --json headRefName,title,url | jq -r \
|
||||
--arg FEATURE_BRANCH "$FEATURE_BRANCH" '.[] | select(.title | contains($FEATURE_BRANCH)) | .url')
|
||||
if [[ -z "$MERGED_PR" ]]; then
|
||||
echo ":x: Documentation PR for branch '$FEATURE_BRANCH' is required and has not been merged."
|
||||
exit 1
|
||||
else
|
||||
echo ":white_check_mark: Found merged documentation PR: $MERGED_PR"
|
||||
fi
|
||||
7
ADOPTERS.md
Normal file
7
ADOPTERS.md
Normal file
@@ -0,0 +1,7 @@
|
||||
# Krkn Adopters
|
||||
|
||||
This is a list of organizations that have publicly acknowledged usage of Krkn and shared details of how they are leveraging it in their environment for chaos engineering use cases. Do you want to add yourself to this list? Please fork the repository and open a PR with the required change.
|
||||
|
||||
| Organization | Since | Website | Use-Case |
|
||||
|:-|:-|:-|:-|
|
||||
| MarketAxess | 2024 | https://www.marketaxess.com/ | Kraken enables us to achieve our goal of increasing the reliability of our cloud products on Kubernetes. The tool allows us to automatically run various chaos scenarios, identify resilience and performance bottlenecks, and seamlessly restore the system to its original state once scenarios finish. These chaos scenarios include pod disruptions, node (EC2) outages, simulating availability zone (AZ) outages, and filling up storage spaces like EBS and EFS. The community is highly responsive to requests and works on expanding the tool's capabilities. MarketAxess actively contributes to the project, adding features such as the ability to leverage existing network ACLs and proposing several feature improvements to enhance test coverage. |
|
||||
@@ -62,3 +62,11 @@ elastic:
|
||||
metrics_index: "krkn-metrics"
|
||||
alerts_index: "krkn-alerts"
|
||||
telemetry_index: "krkn-telemetry"
|
||||
|
||||
health_checks: # Utilizing health check endpoints to observe application behavior during chaos injection.
|
||||
interval: # Interval in seconds to perform health checks, default value is 2 seconds
|
||||
config: # Provide list of health check configurations for applications
|
||||
- url: # Provide application endpoint
|
||||
bearer_token: # Bearer token for authentication if any
|
||||
auth: # Provide authentication credentials (username , password) in tuple format if any, ex:("admin","secretpassword")
|
||||
exit_on_failure: # If value is True exits when health check failed for application, values can be True/False
|
||||
|
||||
@@ -19,7 +19,7 @@ function functional_test_telemetry {
|
||||
yq -i '.telemetry.run_tag=env(RUN_TAG)' CI/config/common_test_config.yaml
|
||||
|
||||
export scenario_type="hog_scenarios"
|
||||
export scenario_file="scenarios/kube/cpu-hog/input.yaml"
|
||||
export scenario_file="scenarios/kube/cpu–hog.yml"
|
||||
export post_config=""
|
||||
envsubst < CI/config/common_test_config.yaml > CI/config/telemetry.yaml
|
||||
retval=$(python3 -m coverage run -a run_kraken.py -c CI/config/telemetry.yaml)
|
||||
|
||||
96
README.md
96
README.md
@@ -10,91 +10,15 @@ Kraken injects deliberate failures into Kubernetes clusters to check if it is re
|
||||
|
||||
|
||||
### Workflow
|
||||

|
||||
|
||||
### Demo
|
||||
[](https://youtu.be/LN-fZywp_mo "Kraken Demo - Click to Watch!")
|
||||

|
||||
|
||||
|
||||
### Chaos Testing Guide
|
||||
[Guide](docs/index.md) encapsulates:
|
||||
- Test methodology that needs to be embraced.
|
||||
- Best practices that an Kubernetes cluster, platform and applications running on top of it should take into account for best user experience, performance, resilience and reliability.
|
||||
- Tooling.
|
||||
- Scenarios supported.
|
||||
- Test environment recommendations as to how and where to run chaos tests.
|
||||
- Chaos testing in practice.
|
||||
|
||||
The guide is hosted at https://krkn-chaos.github.io/krkn.
|
||||
<!-- ### Demo
|
||||
[](https://youtu.be/LN-fZywp_mo "Kraken Demo - Click to Watch!") -->
|
||||
|
||||
|
||||
### How to Get Started
|
||||
Instructions on how to setup, configure and run Kraken can be found at [Installation](docs/installation.md).
|
||||
|
||||
You may consider utilizing the chaos recommendation tool prior to initiating the chaos runs to profile the application service(s) under test. This tool discovers a list of Krkn scenarios with a high probability of causing failures or disruptions to your application service(s). The tool can be accessed at [Chaos-Recommender](utils/chaos_recommender/README.md).
|
||||
|
||||
See the [getting started doc](docs/getting_started.md) on support on how to get started with your own custom scenario or editing current scenarios for your specific usage.
|
||||
|
||||
After installation, refer back to the below sections for supported scenarios and how to tweak the kraken config to load them on your cluster.
|
||||
|
||||
|
||||
#### Running Kraken with minimal configuration tweaks
|
||||
For cases where you want to run Kraken with minimal configuration changes, refer to [krkn-hub](https://github.com/krkn-chaos/krkn-hub). One use case is CI integration where you do not want to carry around different configuration files for the scenarios.
|
||||
|
||||
|
||||
### Config
|
||||
Instructions on how to setup the config and the options supported can be found at [Config](docs/config.md).
|
||||
|
||||
|
||||
### Kubernetes chaos scenarios supported
|
||||
|
||||
Scenario type | Kubernetes
|
||||
--------------------------- | ------------- |
|
||||
[Pod Scenarios](docs/pod_scenarios.md) | :heavy_check_mark: |
|
||||
[Pod Network Scenarios](docs/pod_network_scenarios.md) | :x: |
|
||||
[Container Scenarios](docs/container_scenarios.md) | :heavy_check_mark: |
|
||||
[Node Scenarios](docs/node_scenarios.md) | :heavy_check_mark: |
|
||||
[Time Scenarios](docs/time_scenarios.md) | :heavy_check_mark: |
|
||||
[Hog Scenarios: CPU, Memory](docs/hog_scenarios.md) | :heavy_check_mark: |
|
||||
[Cluster Shut Down Scenarios](docs/cluster_shut_down_scenarios.md) | :heavy_check_mark: |
|
||||
[Service Disruption Scenarios](docs/service_disruption_scenarios.md.md) | :heavy_check_mark: |
|
||||
[Zone Outage Scenarios](docs/zone_outage.md) | :heavy_check_mark: |
|
||||
[Application_outages](docs/application_outages.md) | :heavy_check_mark: |
|
||||
[PVC scenario](docs/pvc_scenario.md) | :heavy_check_mark: |
|
||||
[Network_Chaos](docs/network_chaos.md) | :heavy_check_mark: |
|
||||
[ManagedCluster Scenarios](docs/managedcluster_scenarios.md) | :heavy_check_mark: |
|
||||
[Service Hijacking Scenarios](docs/service_hijacking_scenarios.md) | :heavy_check_mark: |
|
||||
[SYN Flood Scenarios](docs/syn_flood_scenarios.md) | :heavy_check_mark: |
|
||||
|
||||
|
||||
### Kraken scenario pass/fail criteria and report
|
||||
It is important to make sure to check if the targeted component recovered from the chaos injection and also if the Kubernetes cluster is healthy as failures in one component can have an adverse impact on other components. Kraken does this by:
|
||||
- Having built in checks for pod and node based scenarios to ensure the expected number of replicas and nodes are up. It also supports running custom scripts with the checks.
|
||||
- Leveraging [Cerberus](https://github.com/krkn-chaos/cerberus) to monitor the cluster under test and consuming the aggregated go/no-go signal to determine pass/fail post chaos. It is highly recommended to turn on the Cerberus health check feature available in Kraken. Instructions on installing and setting up Cerberus can be found [here](https://github.com/openshift-scale/cerberus#installation) or can be installed from Kraken using the [instructions](https://github.com/krkn-chaos/krkn#setting-up-infrastructure-dependencies). Once Cerberus is up and running, set cerberus_enabled to True and cerberus_url to the url where Cerberus publishes go/no-go signal in the Kraken config file. Cerberus can monitor [application routes](https://github.com/redhat-chaos/cerberus/blob/main/docs/config.md#watch-routes) during the chaos and fails the run if it encounters downtime as it is a potential downtime in a customers, or users environment as well. It is especially important during the control plane chaos scenarios including the API server, Etcd, Ingress etc. It can be enabled by setting `check_applicaton_routes: True` in the [Kraken config](https://github.com/redhat-chaos/krkn/blob/main/config/config.yaml) provided application routes are being monitored in the [cerberus config](https://github.com/redhat-chaos/krkn/blob/main/config/cerberus.yaml).
|
||||
- Leveraging built-in alert collection feature to fail the runs in case of critical alerts.
|
||||
|
||||
### Signaling
|
||||
In CI runs or any external job it is useful to stop Kraken once a certain test or state gets reached. We created a way to signal to kraken to pause the chaos or stop it completely using a signal posted to a port of your choice.
|
||||
|
||||
For example if we have a test run loading the cluster running and kraken separately running; we want to be able to know when to start/stop the kraken run based on when the test run completes or gets to a certain loaded state.
|
||||
|
||||
More detailed information on enabling and leveraging this feature can be found [here](docs/signal.md).
|
||||
|
||||
|
||||
### Performance monitoring
|
||||
Monitoring the Kubernetes/OpenShift cluster to observe the impact of Kraken chaos scenarios on various components is key to find out the bottlenecks as it is important to make sure the cluster is healthy in terms if both recovery as well as performance during/after the failure has been injected. Instructions on enabling it can be found [here](docs/performance_dashboards.md).
|
||||
|
||||
|
||||
### SLOs validation during and post chaos
|
||||
- In addition to checking the recovery and health of the cluster and components under test, Kraken takes in a profile with the Prometheus expressions to validate and alerts, exits with a non-zero return code depending on the severity set. This feature can be used to determine pass/fail or alert on abnormalities observed in the cluster based on the metrics.
|
||||
- Kraken also provides ability to check if any critical alerts are firing in the cluster post chaos and pass/fail's.
|
||||
|
||||
Information on enabling and leveraging this feature can be found [here](docs/SLOs_validation.md)
|
||||
|
||||
|
||||
### OCM / ACM integration
|
||||
|
||||
Kraken supports injecting faults into [Open Cluster Management (OCM)](https://open-cluster-management.io/) and [Red Hat Advanced Cluster Management for Kubernetes (ACM)](https://www.krkn.com/en/technologies/management/advanced-cluster-management) managed clusters through [ManagedCluster Scenarios](docs/managedcluster_scenarios.md).
|
||||
Instructions on how to setup, configure and run Kraken can be found in the [documentation](https://krkn-chaos.dev/docs/).
|
||||
|
||||
|
||||
### Blogs and other useful resources
|
||||
@@ -106,6 +30,7 @@ Kraken supports injecting faults into [Open Cluster Management (OCM)](https://op
|
||||
- Blog post on supercharging chaos testing using AI integration in Krkn: https://www.redhat.com/en/blog/supercharging-chaos-testing-using-ai
|
||||
- Blog post announcing Krkn joining CNCF Sandbox: https://www.redhat.com/en/blog/krknchaos-joining-cncf-sandbox
|
||||
|
||||
|
||||
### Roadmap
|
||||
Enhancements being planned can be found in the [roadmap](ROADMAP.md).
|
||||
|
||||
@@ -113,17 +38,8 @@ Enhancements being planned can be found in the [roadmap](ROADMAP.md).
|
||||
### Contributions
|
||||
We are always looking for more enhancements, fixes to make it better, any contributions are most welcome. Feel free to report or work on the issues filed on github.
|
||||
|
||||
[More information on how to Contribute](docs/contribute.md)
|
||||
[More information on how to Contribute](https://krkn-chaos.dev/docs/contribution-guidelines/contribute/)
|
||||
|
||||
If adding a new scenario or tweaking the main config, be sure to add in updates into the CI to be sure the CI is up to date.
|
||||
Please read [this file]((CI/README.md#adding-a-test-case)) for more information on updates.
|
||||
|
||||
|
||||
### Scenario Plugin Development
|
||||
|
||||
If you're gearing up to develop new scenarios, take a moment to review our
|
||||
[Scenario Plugin API Documentation](docs/scenario_plugin_api.md).
|
||||
It’s the perfect starting point to tap into your chaotic creativity!
|
||||
|
||||
### Community
|
||||
Key Members(slack_usernames/full name): paigerube14/Paige Rubendall, mffiedler/Mike Fiedler, tsebasti/Tullio Sebastiani, yogi/Yogananth Subramanian, sahil/Sahil Shah, pradeep/Pradeep Surisetty and ravielluri/Naga Ravi Chaitanya Elluri.
|
||||
|
||||
43
SECURITY.md
Normal file
43
SECURITY.md
Normal file
@@ -0,0 +1,43 @@
|
||||
# Security Policy
|
||||
|
||||
We attach great importance to code security. We are very grateful to the users, security vulnerability researchers, etc. for reporting security vulnerabilities to the Krkn community. All reported security vulnerabilities will be carefully assessed and addressed in a timely manner.
|
||||
|
||||
|
||||
## Security Checks
|
||||
|
||||
Krkn leverages [Snyk](https://snyk.io/) to ensure that any security vulnerabilities found
|
||||
in the code base and dependencies are fixed and published in the latest release. Security
|
||||
vulnerability checks are enabled for each pull request to enable developers to get insights
|
||||
and proactively fix them.
|
||||
|
||||
|
||||
## Reporting a Vulnerability
|
||||
|
||||
The Krkn project treats security vulnerabilities seriously, so we
|
||||
strive to take action quickly when required.
|
||||
|
||||
The project requests that security issues be disclosed in a responsible
|
||||
manner to allow adequate time to respond. If a security issue or
|
||||
vulnerability has been found, please disclose the details to our
|
||||
dedicated email address:
|
||||
|
||||
cncf-krkn-maintainers@lists.cncf.io
|
||||
|
||||
You can also use the [GitHub vulnerability report mechanism](https://docs.github.com/en/code-security/security-advisories/guidance-on-reporting-and-writing-information-about-vulnerabilities/privately-reporting-a-security-vulnerability#privately-reporting-a-security-vulnerability) to report the security vulnerability.
|
||||
|
||||
Please include as much information as possible with the report. The
|
||||
following details assist with analysis efforts:
|
||||
- Description of the vulnerability
|
||||
- Affected component (version, commit, branch etc)
|
||||
- Affected code (file path, line numbers)
|
||||
- Exploit code
|
||||
|
||||
|
||||
## Security Team
|
||||
|
||||
The security team currently consists of the [Maintainers of Krkn](https://github.com/krkn-chaos/krkn/blob/main/MAINTAINERS.md)
|
||||
|
||||
|
||||
## Process and Supported Releases
|
||||
|
||||
The Krkn security team will investigate and provide a fix in a timely mannner depending on the severity. The fix will be included in the new release of Krkn and details will be included in the release notes.
|
||||
@@ -1,5 +1,4 @@
|
||||
kraken:
|
||||
distribution: kubernetes # Distribution can be kubernetes or openshift
|
||||
kubeconfig_path: ~/.kube/config # Path to kubeconfig
|
||||
exit_on_failure: False # Exit when a post action scenario fails
|
||||
publish_kraken_status: True # Can be accessed at http://0.0.0.0:8081
|
||||
@@ -25,12 +24,10 @@ kraken:
|
||||
- scenarios/openshift/prom_kill.yml
|
||||
- scenarios/openshift/openshift-apiserver.yml
|
||||
- scenarios/openshift/openshift-kube-apiserver.yml
|
||||
- vmware_node_scenarios:
|
||||
- scenarios/openshift/vmware_node_scenarios.yml
|
||||
- ibmcloud_node_scenarios:
|
||||
- scenarios/openshift/ibmcloud_node_scenarios.yml
|
||||
- node_scenarios: # List of chaos node scenarios to load
|
||||
- scenarios/openshift/aws_node_scenarios.yml
|
||||
- scenarios/openshift/vmware_node_scenarios.yml
|
||||
- scenarios/openshift/ibmcloud_node_scenarios.yml
|
||||
- time_scenarios: # List of chaos time scenarios to load
|
||||
- scenarios/openshift/time_scenarios_example.yml
|
||||
- cluster_shut_down_scenarios:
|
||||
@@ -48,6 +45,8 @@ kraken:
|
||||
- scenarios/kube/service_hijacking.yaml
|
||||
- syn_flood_scenarios:
|
||||
- scenarios/kube/syn_flood.yaml
|
||||
- network_chaos_ng_scenarios:
|
||||
- scenarios/kube/network-filter.yml
|
||||
|
||||
cerberus:
|
||||
cerberus_enabled: False # Enable it when cerberus is previously installed
|
||||
@@ -63,12 +62,10 @@ performance_monitoring:
|
||||
enable_alerts: False # Runs the queries specified in the alert profile and displays the info or exits 1 when severity=error
|
||||
enable_metrics: False
|
||||
alert_profile: config/alerts.yaml # Path or URL to alert profile with the prometheus queries
|
||||
metrics_profile: config/metrics.yaml
|
||||
metrics_profile: config/metrics-report.yaml
|
||||
check_critical_alerts: False # When enabled will check prometheus for critical alerts firing post chaos
|
||||
elastic:
|
||||
enable_elastic: False
|
||||
collect_metrics: False
|
||||
collect_alerts: False
|
||||
verify_certs: False
|
||||
elastic_url: "" # To track results in elasticsearch, give url to server here; will post telemetry details when url and index not blank
|
||||
elastic_port: 32766
|
||||
@@ -112,7 +109,10 @@ telemetry:
|
||||
oc_cli_path: /usr/bin/oc # optional, if not specified will be search in $PATH
|
||||
events_backup: True # enables/disables cluster events collection
|
||||
|
||||
|
||||
|
||||
|
||||
|
||||
health_checks: # Utilizing health check endpoints to observe application behavior during chaos injection.
|
||||
interval: # Interval in seconds to perform health checks, default value is 2 seconds
|
||||
config: # Provide list of health check configurations for applications
|
||||
- url: # Provide application endpoint
|
||||
bearer_token: # Bearer token for authentication if any
|
||||
auth: # Provide authentication credentials (username , password) in tuple format if any, ex:("admin","secretpassword")
|
||||
exit_on_failure: # If value is True exits when health check failed for application, values can be True/False
|
||||
|
||||
@@ -1,133 +1,126 @@
|
||||
metrics:
|
||||
# API server
|
||||
- query: histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{apiserver="kube-apiserver", verb!~"WATCH", subresource!="log"}[2m])) by (verb,resource,subresource,instance,le)) > 0
|
||||
metricName: API99thLatency
|
||||
|
||||
- query: sum(irate(apiserver_request_total{apiserver="kube-apiserver",verb!="WATCH",subresource!="log"}[2m])) by (verb,instance,resource,code) > 0
|
||||
metricName: APIRequestRate
|
||||
instant: True
|
||||
|
||||
- query: sum(apiserver_current_inflight_requests{}) by (request_kind) > 0
|
||||
metricName: APIInflightRequests
|
||||
instant: True
|
||||
|
||||
- query: histogram_quantile(0.99, rate(apiserver_current_inflight_requests[5m]))
|
||||
metricName: APIInflightRequests
|
||||
instant: True
|
||||
|
||||
# Container & pod metrics
|
||||
- query: (sum(container_memory_rss{name!="",container!="POD",namespace=~"openshift-(etcd|oauth-apiserver|.*apiserver|ovn-kubernetes|sdn|ingress|authentication|.*controller-manager|.*scheduler)"}) by (container, pod, namespace, node) and on (node) kube_node_role{role="master"}) > 0
|
||||
metricName: containerMemory-Masters
|
||||
instant: true
|
||||
|
||||
- query: (sum(irate(container_cpu_usage_seconds_total{name!="",container!="POD",namespace=~"openshift-(etcd|oauth-apiserver|sdn|ovn-kubernetes|.*apiserver|authentication|.*controller-manager|.*scheduler)"}[2m]) * 100) by (container, pod, namespace, node) and on (node) kube_node_role{role="master"}) > 0
|
||||
metricName: containerCPU-Masters
|
||||
instant: true
|
||||
|
||||
- query: (sum(irate(container_cpu_usage_seconds_total{pod!="",container="prometheus",namespace="openshift-monitoring"}[2m]) * 100) by (container, pod, namespace, node) and on (node) kube_node_role{role="infra"}) > 0
|
||||
metricName: containerCPU-Prometheus
|
||||
instant: true
|
||||
|
||||
- query: (avg(irate(container_cpu_usage_seconds_total{name!="",container!="POD",namespace=~"openshift-(sdn|ovn-kubernetes|ingress)"}[2m]) * 100 and on (node) kube_node_role{role="worker"}) by (namespace, container)) > 0
|
||||
metricName: containerCPU-AggregatedWorkers
|
||||
instant: true
|
||||
|
||||
- query: (avg(irate(container_cpu_usage_seconds_total{name!="",container!="POD",namespace=~"openshift-(sdn|ovn-kubernetes|ingress|monitoring|image-registry|logging)"}[2m]) * 100 and on (node) kube_node_role{role="infra"}) by (namespace, container)) > 0
|
||||
metricName: containerCPU-AggregatedInfra
|
||||
|
||||
- query: (sum(container_memory_rss{pod!="",namespace="openshift-monitoring",name!="",container="prometheus"}) by (container, pod, namespace, node) and on (node) kube_node_role{role="infra"}) > 0
|
||||
metricName: containerMemory-Prometheus
|
||||
instant: True
|
||||
|
||||
- query: avg(container_memory_rss{name!="",container!="POD",namespace=~"openshift-(sdn|ovn-kubernetes|ingress)"} and on (node) kube_node_role{role="worker"}) by (container, namespace)
|
||||
metricName: containerMemory-AggregatedWorkers
|
||||
instant: True
|
||||
|
||||
- query: avg(container_memory_rss{name!="",container!="POD",namespace=~"openshift-(sdn|ovn-kubernetes|ingress|monitoring|image-registry|logging)"} and on (node) kube_node_role{role="infra"}) by (container, namespace)
|
||||
metricName: containerMemory-AggregatedInfra
|
||||
instant: True
|
||||
|
||||
# Node metrics
|
||||
- query: (sum(irate(node_cpu_seconds_total[2m])) by (mode,instance) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)")) > 0
|
||||
metricName: nodeCPU-Masters
|
||||
instant: True
|
||||
|
||||
- query: max(max_over_time(sum(irate(node_cpu_seconds_total{mode!="idle", mode!="steal"}[2m]) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)")) by (instance)[.elapsed:]))
|
||||
metricName: maxCPU-Masters
|
||||
instant: true
|
||||
|
||||
- query: avg(avg_over_time((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)[.elapsed:]) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)"))
|
||||
metricName: nodeMemory-Masters
|
||||
instant: true
|
||||
|
||||
- query: (avg((sum(irate(node_cpu_seconds_total[2m])) by (mode,instance) and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)"))) by (mode)) > 0
|
||||
metricName: nodeCPU-AggregatedWorkers
|
||||
instant: True
|
||||
|
||||
- query: (avg((sum(irate(node_cpu_seconds_total[2m])) by (mode,instance) and on (instance) label_replace(kube_node_role{role="infra"}, "instance", "$1", "node", "(.+)"))) by (mode)) > 0
|
||||
metricName: nodeCPU-AggregatedInfra
|
||||
instant: True
|
||||
|
||||
- query: avg(node_memory_MemAvailable_bytes) by (instance) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)")
|
||||
metricName: nodeMemoryAvailable-Masters
|
||||
- query: avg(avg_over_time((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)[.elapsed:]) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)"))
|
||||
metricName: nodeMemory-Masters
|
||||
instant: true
|
||||
|
||||
- query: max(max_over_time((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)[.elapsed:]) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)"))
|
||||
metricName: maxMemory-Masters
|
||||
instant: true
|
||||
|
||||
- query: avg(node_memory_MemAvailable_bytes and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)"))
|
||||
metricName: nodeMemoryAvailable-AggregatedWorkers
|
||||
instant: True
|
||||
|
||||
- query: max(max_over_time(sum(irate(node_cpu_seconds_total{mode!="idle", mode!="steal"}[2m]) and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)")) by (instance)[.elapsed:]))
|
||||
metricName: maxCPU-Workers
|
||||
instant: true
|
||||
|
||||
- query: max(max_over_time((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)[.elapsed:]) and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)"))
|
||||
metricName: maxMemory-Workers
|
||||
instant: true
|
||||
|
||||
- query: avg(node_memory_MemAvailable_bytes and on (instance) label_replace(kube_node_role{role="infra"}, "instance", "$1", "node", "(.+)"))
|
||||
metricName: nodeMemoryAvailable-AggregatedInfra
|
||||
instant: True
|
||||
|
||||
- query: avg(node_memory_Active_bytes) by (instance) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)")
|
||||
metricName: nodeMemoryActive-Masters
|
||||
instant: True
|
||||
|
||||
- query: avg(node_memory_Active_bytes and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)"))
|
||||
metricName: nodeMemoryActive-AggregatedWorkers
|
||||
instant: True
|
||||
|
||||
- query: avg(avg(node_memory_Active_bytes) by (instance) and on (instance) label_replace(kube_node_role{role="infra"}, "instance", "$1", "node", "(.+)"))
|
||||
metricName: nodeMemoryActive-AggregatedInfra
|
||||
|
||||
- query: avg(node_memory_Cached_bytes) by (instance) + avg(node_memory_Buffers_bytes) by (instance) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)")
|
||||
metricName: nodeMemoryCached+nodeMemoryBuffers-Masters
|
||||
|
||||
- query: avg(node_memory_Cached_bytes + node_memory_Buffers_bytes and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)"))
|
||||
metricName: nodeMemoryCached+nodeMemoryBuffers-AggregatedWorkers
|
||||
|
||||
- query: avg(node_memory_Cached_bytes + node_memory_Buffers_bytes and on (instance) label_replace(kube_node_role{role="infra"}, "instance", "$1", "node", "(.+)"))
|
||||
metricName: nodeMemoryCached+nodeMemoryBuffers-AggregatedInfra
|
||||
|
||||
- query: irate(node_network_receive_bytes_total{device=~"^(ens|eth|bond|team).*"}[2m]) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)")
|
||||
metricName: rxNetworkBytes-Masters
|
||||
|
||||
- query: avg(irate(node_network_receive_bytes_total{device=~"^(ens|eth|bond|team).*"}[2m]) and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)")) by (device)
|
||||
metricName: rxNetworkBytes-AggregatedWorkers
|
||||
|
||||
- query: avg(irate(node_network_receive_bytes_total{device=~"^(ens|eth|bond|team).*"}[2m]) and on (instance) label_replace(kube_node_role{role="infra"}, "instance", "$1", "node", "(.+)")) by (device)
|
||||
metricName: rxNetworkBytes-AggregatedInfra
|
||||
|
||||
- query: irate(node_network_transmit_bytes_total{device=~"^(ens|eth|bond|team).*"}[2m]) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)")
|
||||
metricName: txNetworkBytes-Masters
|
||||
|
||||
- query: avg(irate(node_network_transmit_bytes_total{device=~"^(ens|eth|bond|team).*"}[2m]) and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)")) by (device)
|
||||
metricName: txNetworkBytes-AggregatedWorkers
|
||||
|
||||
- query: avg(irate(node_network_transmit_bytes_total{device=~"^(ens|eth|bond|team).*"}[2m]) and on (instance) label_replace(kube_node_role{role="infra"}, "instance", "$1", "node", "(.+)")) by (device)
|
||||
metricName: txNetworkBytes-AggregatedInfra
|
||||
|
||||
- query: rate(node_disk_written_bytes_total{device!~"^(dm|rb).*"}[2m]) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)")
|
||||
metricName: nodeDiskWrittenBytes-Masters
|
||||
|
||||
- query: avg(rate(node_disk_written_bytes_total{device!~"^(dm|rb).*"}[2m]) and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)")) by (device)
|
||||
metricName: nodeDiskWrittenBytes-AggregatedWorkers
|
||||
|
||||
- query: avg(rate(node_disk_written_bytes_total{device!~"^(dm|rb).*"}[2m]) and on (instance) label_replace(kube_node_role{role="infra"}, "instance", "$1", "node", "(.+)")) by (device)
|
||||
metricName: nodeDiskWrittenBytes-AggregatedInfra
|
||||
|
||||
- query: rate(node_disk_read_bytes_total{device!~"^(dm|rb).*"}[2m]) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)")
|
||||
metricName: nodeDiskReadBytes-Masters
|
||||
|
||||
- query: avg(rate(node_disk_read_bytes_total{device!~"^(dm|rb).*"}[2m]) and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)")) by (device)
|
||||
metricName: nodeDiskReadBytes-AggregatedWorkers
|
||||
|
||||
- query: avg(rate(node_disk_read_bytes_total{device!~"^(dm|rb).*"}[2m]) and on (instance) label_replace(kube_node_role{role="infra"}, "instance", "$1", "node", "(.+)")) by (device)
|
||||
metricName: nodeDiskReadBytes-AggregatedInfra
|
||||
instant: True
|
||||
|
||||
# Etcd metrics
|
||||
- query: sum(rate(etcd_server_leader_changes_seen_total[2m]))
|
||||
metricName: etcdLeaderChangesRate
|
||||
instant: True
|
||||
|
||||
- query: etcd_server_is_leader > 0
|
||||
metricName: etcdServerIsLeader
|
||||
instant: True
|
||||
|
||||
- query: histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[2m]))
|
||||
metricName: 99thEtcdDiskBackendCommitDurationSeconds
|
||||
instant: True
|
||||
|
||||
- query: histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[2m]))
|
||||
metricName: 99thEtcdDiskWalFsyncDurationSeconds
|
||||
instant: True
|
||||
|
||||
- query: histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket[5m]))
|
||||
metricName: 99thEtcdRoundTripTimeSeconds
|
||||
|
||||
- query: etcd_mvcc_db_total_size_in_bytes
|
||||
metricName: etcdDBPhysicalSizeBytes
|
||||
|
||||
- query: etcd_mvcc_db_total_size_in_use_in_bytes
|
||||
metricName: etcdDBLogicalSizeBytes
|
||||
instant: True
|
||||
|
||||
- query: sum by (cluster_version)(etcd_cluster_version)
|
||||
metricName: etcdVersion
|
||||
@@ -135,83 +128,16 @@ metrics:
|
||||
|
||||
- query: sum(rate(etcd_object_counts{}[5m])) by (resource) > 0
|
||||
metricName: etcdObjectCount
|
||||
instant: True
|
||||
|
||||
- query: histogram_quantile(0.99,sum(rate(etcd_request_duration_seconds_bucket[2m])) by (le,operation,apiserver)) > 0
|
||||
metricName: P99APIEtcdRequestLatency
|
||||
|
||||
- query: sum(grpc_server_started_total{namespace="openshift-etcd",grpc_service="etcdserverpb.Watch",grpc_type="bidi_stream"}) - sum(grpc_server_handled_total{namespace="openshift-etcd",grpc_service="etcdserverpb.Watch",grpc_type="bidi_stream"})
|
||||
metricName: ActiveWatchStreams
|
||||
|
||||
- query: sum(grpc_server_started_total{namespace="openshift-etcd",grpc_service="etcdserverpb.Lease",grpc_type="bidi_stream"}) - sum(grpc_server_handled_total{namespace="openshift-etcd",grpc_service="etcdserverpb.Lease",grpc_type="bidi_stream"})
|
||||
metricName: ActiveLeaseStreams
|
||||
|
||||
- query: sum(rate(etcd_debugging_snap_save_total_duration_seconds_sum{namespace="openshift-etcd"}[2m]))
|
||||
metricName: snapshotSaveLatency
|
||||
|
||||
- query: sum(rate(etcd_server_heartbeat_send_failures_total{namespace="openshift-etcd"}[2m]))
|
||||
metricName: HeartBeatFailures
|
||||
|
||||
- query: sum(rate(etcd_server_health_failures{namespace="openshift-etcd"}[2m]))
|
||||
metricName: HealthFailures
|
||||
|
||||
- query: sum(rate(etcd_server_slow_apply_total{namespace="openshift-etcd"}[2m]))
|
||||
metricName: SlowApplies
|
||||
|
||||
- query: sum(rate(etcd_server_slow_read_indexes_total{namespace="openshift-etcd"}[2m]))
|
||||
metricName: SlowIndexRead
|
||||
|
||||
- query: sum(etcd_server_proposals_pending)
|
||||
metricName: PendingProposals
|
||||
|
||||
- query: histogram_quantile(1.0, sum(rate(etcd_debugging_mvcc_db_compaction_pause_duration_milliseconds_bucket[1m])) by (le, instance))
|
||||
metricName: CompactionMaxPause
|
||||
instant: True
|
||||
|
||||
- query: sum by (instance) (apiserver_storage_objects)
|
||||
metricName: etcdTotalObjectCount
|
||||
instant: True
|
||||
|
||||
- query: topk(500, max by(resource) (apiserver_storage_objects))
|
||||
metricName: etcdTopObectCount
|
||||
|
||||
# Cluster metrics
|
||||
- query: count(kube_namespace_created)
|
||||
metricName: namespaceCount
|
||||
|
||||
- query: sum(kube_pod_status_phase{}) by (phase)
|
||||
metricName: podStatusCount
|
||||
|
||||
- query: count(kube_secret_info{})
|
||||
metricName: secretCount
|
||||
|
||||
- query: count(kube_deployment_labels{})
|
||||
metricName: deploymentCount
|
||||
|
||||
- query: count(kube_configmap_info{})
|
||||
metricName: configmapCount
|
||||
|
||||
- query: count(kube_service_info{})
|
||||
metricName: serviceCount
|
||||
|
||||
- query: kube_node_role
|
||||
metricName: nodeRoles
|
||||
instant: true
|
||||
|
||||
- query: sum(kube_node_status_condition{status="true"}) by (condition)
|
||||
metricName: nodeStatus
|
||||
|
||||
- query: (sum(rate(container_fs_writes_bytes_total{container!="",device!~".+dm.+"}[5m])) by (device, container, node) and on (node) kube_node_role{role="master"}) > 0
|
||||
metricName: containerDiskUsage
|
||||
|
||||
- query: cluster_version{type="completed"}
|
||||
metricName: clusterVersion
|
||||
instant: true
|
||||
|
||||
# Golang metrics
|
||||
|
||||
- query: go_memstats_heap_alloc_bytes{job=~"apiserver|api|etcd"}
|
||||
metricName: goHeapAllocBytes
|
||||
|
||||
- query: go_memstats_heap_inuse_bytes{job=~"apiserver|api|etcd"}
|
||||
metricName: goHeapInuseBytes
|
||||
|
||||
- query: go_gc_duration_seconds{job=~"apiserver|api|etcd",quantile="1"}
|
||||
metricName: goGCDurationSeconds
|
||||
instant: True
|
||||
|
||||
248
config/metrics-report.yaml
Normal file
248
config/metrics-report.yaml
Normal file
@@ -0,0 +1,248 @@
|
||||
metrics:
|
||||
|
||||
# API server
|
||||
- query: sum(apiserver_current_inflight_requests{}) by (request_kind) > 0
|
||||
metricName: APIInflightRequests
|
||||
instant: true
|
||||
|
||||
# Kubelet & CRI-O
|
||||
|
||||
# Average and max of the CPU usage from all worker's kubelet
|
||||
- query: avg(avg_over_time(irate(process_cpu_seconds_total{service="kubelet",job="kubelet"}[2m])[.elapsed:]) and on (node) kube_node_role{role="worker"})
|
||||
metricName: cpu-kubelet
|
||||
instant: true
|
||||
|
||||
- query: max(max_over_time(irate(process_cpu_seconds_total{service="kubelet",job="kubelet"}[2m])[.elapsed:]) and on (node) kube_node_role{role="worker"})
|
||||
metricName: max-cpu-kubelet
|
||||
instant: true
|
||||
|
||||
# Average of the memory usage from all worker's kubelet
|
||||
- query: avg(avg_over_time(process_resident_memory_bytes{service="kubelet",job="kubelet"}[.elapsed:]) and on (node) kube_node_role{role="worker"})
|
||||
metricName: memory-kubelet
|
||||
instant: true
|
||||
|
||||
# Max of the memory usage from all worker's kubelet
|
||||
- query: max(max_over_time(process_resident_memory_bytes{service="kubelet",job="kubelet"}[.elapsed:]) and on (node) kube_node_role{role="worker"})
|
||||
metricName: max-memory-kubelet
|
||||
instant: true
|
||||
|
||||
- query: max_over_time(sum(process_resident_memory_bytes{service="kubelet",job="kubelet"} and on (node) kube_node_role{role="worker"})[.elapsed:])
|
||||
metricName: max-memory-sum-kubelet
|
||||
instant: true
|
||||
|
||||
# Average and max of the CPU usage from all worker's CRI-O
|
||||
- query: avg(avg_over_time(irate(process_cpu_seconds_total{service="kubelet",job="crio"}[2m])[.elapsed:]) and on (node) kube_node_role{role="worker"})
|
||||
metricName: cpu-crio
|
||||
instant: true
|
||||
|
||||
- query: max(max_over_time(irate(process_cpu_seconds_total{service="kubelet",job="crio"}[2m])[.elapsed:]) and on (node) kube_node_role{role="worker"})
|
||||
metricName: max-cpu-crio
|
||||
instant: true
|
||||
|
||||
# Average of the memory usage from all worker's CRI-O
|
||||
- query: avg(avg_over_time(process_resident_memory_bytes{service="kubelet",job="crio"}[.elapsed:]) and on (node) kube_node_role{role="worker"})
|
||||
metricName: memory-crio
|
||||
instant: true
|
||||
|
||||
# Max of the memory usage from all worker's CRI-O
|
||||
- query: max(max_over_time(process_resident_memory_bytes{service="kubelet",job="crio"}[.elapsed:]) and on (node) kube_node_role{role="worker"})
|
||||
metricName: max-memory-crio
|
||||
instant: true
|
||||
|
||||
# Etcd
|
||||
|
||||
- query: avg(avg_over_time(histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[2m]))[.elapsed:]))
|
||||
metricName: 99thEtcdDiskBackendCommit
|
||||
instant: true
|
||||
|
||||
- query: avg(avg_over_time(histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[2m]))[.elapsed:]))
|
||||
metricName: 99thEtcdDiskWalFsync
|
||||
instant: true
|
||||
|
||||
- query: avg(avg_over_time(histogram_quantile(0.99, irate(etcd_network_peer_round_trip_time_seconds_bucket[2m]))[.elapsed:]))
|
||||
metricName: 99thEtcdRoundTripTime
|
||||
instant: true
|
||||
|
||||
# Control-plane
|
||||
|
||||
- query: avg(avg_over_time(topk(1, sum(irate(container_cpu_usage_seconds_total{name!="", namespace="openshift-kube-controller-manager"}[2m])) by (pod))[.elapsed:]))
|
||||
metricName: cpu-kube-controller-manager
|
||||
instant: true
|
||||
|
||||
- query: max(max_over_time(topk(1, sum(irate(container_cpu_usage_seconds_total{name!="", namespace="openshift-kube-controller-manager"}[2m])) by (pod))[.elapsed:]))
|
||||
metricName: max-cpu-kube-controller-manager
|
||||
instant: true
|
||||
|
||||
- query: avg(avg_over_time(topk(1, sum(container_memory_rss{name!="", namespace="openshift-kube-controller-manager"}) by (pod))[.elapsed:]))
|
||||
metricName: memory-kube-controller-manager
|
||||
instant: true
|
||||
|
||||
- query: max(max_over_time(topk(1, sum(container_memory_rss{name!="", namespace="openshift-kube-controller-manager"}) by (pod))[.elapsed:]))
|
||||
metricName: max-memory-kube-controller-manager
|
||||
instant: true
|
||||
|
||||
- query: avg(avg_over_time(topk(3, sum(irate(container_cpu_usage_seconds_total{name!="", namespace="openshift-kube-apiserver"}[2m])) by (pod))[.elapsed:]))
|
||||
metricName: cpu-kube-apiserver
|
||||
instant: true
|
||||
|
||||
- query: avg(avg_over_time(topk(3, sum(container_memory_rss{name!="", namespace="openshift-kube-apiserver"}) by (pod))[.elapsed:]))
|
||||
metricName: memory-kube-apiserver
|
||||
instant: true
|
||||
|
||||
- query: avg(avg_over_time(topk(3, sum(irate(container_cpu_usage_seconds_total{name!="", namespace="openshift-apiserver"}[2m])) by (pod))[.elapsed:]))
|
||||
metricName: cpu-openshift-apiserver
|
||||
instant: true
|
||||
|
||||
- query: avg(avg_over_time(topk(3, sum(container_memory_rss{name!="", namespace="openshift-apiserver"}) by (pod))[.elapsed:]))
|
||||
metricName: memory-openshift-apiserver
|
||||
instant: true
|
||||
|
||||
- query: avg(avg_over_time(topk(3, sum(irate(container_cpu_usage_seconds_total{name!="", namespace="openshift-etcd"}[2m])) by (pod))[.elapsed:]))
|
||||
metricName: cpu-etcd
|
||||
instant: true
|
||||
|
||||
- query: avg(avg_over_time(topk(3,sum(container_memory_rss{name!="", namespace="openshift-etcd"}) by (pod))[.elapsed:]))
|
||||
metricName: memory-etcd
|
||||
instant: true
|
||||
|
||||
- query: avg(avg_over_time(topk(1, sum(irate(container_cpu_usage_seconds_total{name!="", namespace="openshift-controller-manager"}[2m])) by (pod))[.elapsed:]))
|
||||
metricName: cpu-openshift-controller-manager
|
||||
instant: true
|
||||
|
||||
- query: avg(avg_over_time(topk(1, sum(container_memory_rss{name!="", namespace="openshift-controller-manager"}) by (pod))[.elapsed:]))
|
||||
metricName: memory-openshift-controller-manager
|
||||
instant: true
|
||||
|
||||
# multus
|
||||
|
||||
- query: avg(avg_over_time(irate(container_cpu_usage_seconds_total{name!="", namespace="openshift-multus", pod=~"(multus).+", container!="POD"}[2m])[.elapsed:])) by (container)
|
||||
metricName: cpu-multus
|
||||
instant: true
|
||||
|
||||
- query: avg(avg_over_time(container_memory_rss{name!="", namespace="openshift-multus", pod=~"(multus).+", container!="POD"}[.elapsed:])) by (container)
|
||||
metricName: memory-multus
|
||||
instant: true
|
||||
|
||||
# OVNKubernetes - standard & IC
|
||||
|
||||
- query: avg(avg_over_time(irate(container_cpu_usage_seconds_total{name!="", namespace="openshift-ovn-kubernetes", pod=~"(ovnkube-master|ovnkube-control-plane).+", container!="POD"}[2m])[.elapsed:])) by (container)
|
||||
metricName: cpu-ovn-control-plane
|
||||
instant: true
|
||||
|
||||
- query: avg(avg_over_time(container_memory_rss{name!="", namespace="openshift-ovn-kubernetes", pod=~"(ovnkube-master|ovnkube-control-plane).+", container!="POD"}[.elapsed:])) by (container)
|
||||
metricName: memory-ovn-control-plane
|
||||
instant: true
|
||||
|
||||
- query: avg(avg_over_time(irate(container_cpu_usage_seconds_total{name!="", namespace="openshift-ovn-kubernetes", pod=~"ovnkube-node.+", container!="POD"}[2m])[.elapsed:])) by (container)
|
||||
metricName: cpu-ovnkube-node
|
||||
instant: true
|
||||
|
||||
- query: avg(avg_over_time(container_memory_rss{name!="", namespace="openshift-ovn-kubernetes", pod=~"ovnkube-node.+", container!="POD"}[.elapsed:])) by (container)
|
||||
metricName: memory-ovnkube-node
|
||||
instant: true
|
||||
|
||||
# Nodes
|
||||
|
||||
- query: avg(avg_over_time(sum(irate(node_cpu_seconds_total{mode!="idle", mode!="steal"}[2m]) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)")) by (instance)[.elapsed:]))
|
||||
metricName: cpu-masters
|
||||
instant: true
|
||||
|
||||
- query: avg(avg_over_time((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)[.elapsed:]) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)"))
|
||||
metricName: memory-masters
|
||||
instant: true
|
||||
|
||||
- query: max(max_over_time((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)[.elapsed:]) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)"))
|
||||
metricName: max-memory-masters
|
||||
instant: true
|
||||
|
||||
- query: avg(avg_over_time(sum(irate(node_cpu_seconds_total{mode!="idle", mode!="steal"}[2m]) and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)")) by (instance)[.elapsed:]))
|
||||
metricName: cpu-workers
|
||||
instant: true
|
||||
|
||||
- query: max(max_over_time(sum(irate(node_cpu_seconds_total{mode!="idle", mode!="steal"}[2m]) and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)")) by (instance)[.elapsed:]))
|
||||
metricName: max-cpu-workers
|
||||
instant: true
|
||||
|
||||
- query: avg(avg_over_time((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)[.elapsed:]) and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)"))
|
||||
metricName: memory-workers
|
||||
instant: true
|
||||
|
||||
- query: max(max_over_time((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)[.elapsed:]) and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)"))
|
||||
metricName: max-memory-workers
|
||||
instant: true
|
||||
|
||||
- query: sum( (node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)") )
|
||||
metricName: memory-sum-workers
|
||||
instant: true
|
||||
|
||||
|
||||
- query: avg(avg_over_time(sum(irate(node_cpu_seconds_total{mode!="idle", mode!="steal"}[2m]) and on (instance) label_replace(kube_node_role{role="infra"}, "instance", "$1", "node", "(.+)")) by (instance)[.elapsed:]))
|
||||
metricName: cpu-infra
|
||||
instant: true
|
||||
|
||||
- query: max(max_over_time(sum(irate(node_cpu_seconds_total{mode!="idle", mode!="steal"}[2m]) and on (instance) label_replace(kube_node_role{role="infra"}, "instance", "$1", "node", "(.+)")) by (instance)[.elapsed:]))
|
||||
metricName: max-cpu-infra
|
||||
instant: true
|
||||
|
||||
- query: avg(avg_over_time((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)[.elapsed:]) and on (instance) label_replace(kube_node_role{role="infra"}, "instance", "$1", "node", "(.+)"))
|
||||
metricName: memory-infra
|
||||
instant: true
|
||||
|
||||
- query: max(max_over_time((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)[.elapsed:]) and on (instance) label_replace(kube_node_role{role="infra"}, "instance", "$1", "node", "(.+)"))
|
||||
metricName: max-memory-infra
|
||||
instant: true
|
||||
|
||||
- query: max_over_time(sum((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes) and on (instance) label_replace(kube_node_role{role="infra"}, "instance", "$1", "node", "(.+)"))[.elapsed:])
|
||||
metricName: max-memory-sum-infra
|
||||
instant: true
|
||||
|
||||
# Monitoring and ingress
|
||||
|
||||
- query: avg(avg_over_time(sum(irate(container_cpu_usage_seconds_total{name!="", namespace="openshift-monitoring", pod=~"prometheus-k8s.+"}[2m])) by (pod)[.elapsed:]))
|
||||
metricName: cpu-prometheus
|
||||
instant: true
|
||||
|
||||
- query: max(max_over_time(sum(irate(container_cpu_usage_seconds_total{name!="", namespace="openshift-monitoring", pod=~"prometheus-k8s.+"}[2m])) by (pod)[.elapsed:]))
|
||||
metricName: max-cpu-prometheus
|
||||
instant: true
|
||||
|
||||
- query: avg(avg_over_time(sum(container_memory_rss{name!="", namespace="openshift-monitoring", pod=~"prometheus-k8s.+"}) by (pod)[.elapsed:]))
|
||||
metricName: memory-prometheus
|
||||
instant: true
|
||||
|
||||
- query: max(max_over_time(sum(container_memory_rss{name!="", namespace="openshift-monitoring", pod=~"prometheus-k8s.+"}) by (pod)[.elapsed:]))
|
||||
metricName: max-memory-prometheus
|
||||
instant: true
|
||||
|
||||
- query: avg(avg_over_time(sum(irate(container_cpu_usage_seconds_total{name!="", namespace="openshift-ingress", pod=~"router-default.+"}[2m])) by (pod)[.elapsed:]))
|
||||
metricName: cpu-router
|
||||
instant: true
|
||||
|
||||
- query: avg(avg_over_time(sum(container_memory_rss{name!="", namespace="openshift-ingress", pod=~"router-default.+"}) by (pod)[.elapsed:]))
|
||||
metricName: memory-router
|
||||
instant: true
|
||||
|
||||
# Cluster
|
||||
|
||||
- query: avg_over_time(cluster:memory_usage:ratio[.elapsed:])
|
||||
metricName: memory-cluster-usage-ratio
|
||||
instant: true
|
||||
|
||||
- query: avg_over_time(cluster:node_cpu:ratio[.elapsed:])
|
||||
metricName: cpu-cluster-usage-ratio
|
||||
instant: true
|
||||
|
||||
# Retain the raw CPU seconds totals for comparison
|
||||
- query: sum(node_cpu_seconds_total and on (instance) label_replace(kube_node_role{role="worker",role!="infra"}, "instance", "$1", "node", "(.+)")) by (mode)
|
||||
metricName: nodeCPUSeconds-Workers
|
||||
instant: true
|
||||
|
||||
|
||||
- query: sum(node_cpu_seconds_total and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)")) by (mode)
|
||||
metricName: nodeCPUSeconds-Masters
|
||||
instant: true
|
||||
|
||||
|
||||
- query: sum(node_cpu_seconds_total and on (instance) label_replace(kube_node_role{role="infra"}, "instance", "$1", "node", "(.+)")) by (mode)
|
||||
metricName: nodeCPUSeconds-Infra
|
||||
instant: true
|
||||
@@ -1,13 +1,7 @@
|
||||
metrics:
|
||||
# API server
|
||||
- query: histogram_quantile(0.99, sum(rate(apiserver_request_duration_seconds_bucket{apiserver="kube-apiserver", verb!~"WATCH", subresource!="log"}[2m])) by (verb,resource,subresource,instance,le)) > 0
|
||||
metricName: API99thLatency
|
||||
|
||||
- query: sum(irate(apiserver_request_total{apiserver="kube-apiserver",verb!="WATCH",subresource!="log"}[2m])) by (verb,instance,resource,code) > 0
|
||||
metricName: APIRequestRate
|
||||
|
||||
- query: sum(apiserver_current_inflight_requests{}) by (request_kind) > 0
|
||||
metricName: APIInflightRequests
|
||||
- query: irate(apiserver_request_total{verb="POST", resource="pods", subresource="binding",code="201"}[2m]) > 0
|
||||
metricName: schedulingThroughput
|
||||
|
||||
# Containers & pod metrics
|
||||
- query: sum(irate(container_cpu_usage_seconds_total{name!="",namespace=~"openshift-(etcd|oauth-apiserver|.*apiserver|ovn-kubernetes|sdn|ingress|authentication|.*controller-manager|.*scheduler|monitoring|logging|image-registry)"}[2m]) * 100) by (pod, namespace, node)
|
||||
@@ -33,8 +27,17 @@ metrics:
|
||||
metricName: crioMemory
|
||||
|
||||
# Node metrics
|
||||
- query: sum(irate(node_cpu_seconds_total[2m])) by (mode,instance) > 0
|
||||
metricName: nodeCPU
|
||||
- query: (sum(irate(node_cpu_seconds_total[2m])) by (mode,instance) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)")) > 0
|
||||
metricName: nodeCPU-Masters
|
||||
|
||||
- query: (avg_over_time((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)[.elapsed:]) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)"))
|
||||
metricName: nodeMemory-Masters
|
||||
|
||||
- query: (sum(irate(node_cpu_seconds_total[2m])) by (mode,instance) and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)")) > 0
|
||||
metricName: nodeCPU-Workers
|
||||
|
||||
- query: (avg_over_time((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)[2m:]) and on (instance) label_replace(kube_node_role{role="worker"}, "instance", "$1", "node", "(.+)"))
|
||||
metricName: nodeMemory-Workers
|
||||
|
||||
- query: avg(node_memory_MemAvailable_bytes) by (instance)
|
||||
metricName: nodeMemoryAvailable
|
||||
@@ -42,6 +45,9 @@ metrics:
|
||||
- query: avg(node_memory_Active_bytes) by (instance)
|
||||
metricName: nodeMemoryActive
|
||||
|
||||
- query: max(max_over_time((node_memory_MemTotal_bytes - node_memory_MemAvailable_bytes)[.elapsed:]) and on (instance) label_replace(kube_node_role{role="master"}, "instance", "$1", "node", "(.+)"))
|
||||
metricName: maxMemory-Masters
|
||||
|
||||
- query: avg(node_memory_Cached_bytes) by (instance) + avg(node_memory_Buffers_bytes) by (instance)
|
||||
metricName: nodeMemoryCached+nodeMemoryBuffers
|
||||
|
||||
@@ -84,34 +90,4 @@ metrics:
|
||||
|
||||
- query: sum by (cluster_version)(etcd_cluster_version)
|
||||
metricName: etcdVersion
|
||||
instant: true
|
||||
|
||||
# Cluster metrics
|
||||
- query: count(kube_namespace_created)
|
||||
metricName: namespaceCount
|
||||
|
||||
- query: sum(kube_pod_status_phase{}) by (phase)
|
||||
metricName: podStatusCount
|
||||
|
||||
- query: count(kube_secret_info{})
|
||||
metricName: secretCount
|
||||
|
||||
- query: count(kube_deployment_labels{})
|
||||
metricName: deploymentCount
|
||||
|
||||
- query: count(kube_configmap_info{})
|
||||
metricName: configmapCount
|
||||
|
||||
- query: count(kube_service_info{})
|
||||
metricName: serviceCount
|
||||
|
||||
- query: kube_node_role
|
||||
metricName: nodeRoles
|
||||
instant: true
|
||||
|
||||
- query: sum(kube_node_status_condition{status="true"}) by (condition)
|
||||
metricName: nodeStatus
|
||||
|
||||
- query: cluster_version{type="completed"}
|
||||
metricName: clusterVersion
|
||||
instant: true
|
||||
instant: true
|
||||
@@ -1,14 +1,19 @@
|
||||
# oc build
|
||||
FROM golang:1.22.5 AS oc-build
|
||||
FROM golang:1.23.1 AS oc-build
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends libkrb5-dev
|
||||
WORKDIR /tmp
|
||||
RUN git clone --branch release-4.18 https://github.com/openshift/oc.git
|
||||
WORKDIR /tmp/oc
|
||||
RUN go mod edit -go 1.22.5 &&\
|
||||
RUN go mod edit -go 1.23.1 &&\
|
||||
go get github.com/moby/buildkit@v0.12.5 &&\
|
||||
go get github.com/containerd/containerd@v1.7.11&&\
|
||||
go get github.com/docker/docker@v25.0.6&&\
|
||||
go get github.com/opencontainers/runc@v1.1.14&&\
|
||||
go get github.com/go-git/go-git/v5@v5.13.0&&\
|
||||
go get golang.org/x/net@v0.36.0&&\
|
||||
go get github.com/containerd/containerd@v1.7.27&&\
|
||||
go get golang.org/x/oauth2@v0.27.0&&\
|
||||
go get golang.org/x/crypto@v0.35.0&&\
|
||||
go mod tidy && go mod vendor
|
||||
RUN make GO_REQUIRED_MIN_VERSION:= oc
|
||||
|
||||
@@ -20,10 +25,6 @@ RUN dnf update -y
|
||||
|
||||
ENV KUBECONFIG /home/krkn/.kube/config
|
||||
|
||||
# install kubectl
|
||||
RUN curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" &&\
|
||||
cp kubectl /usr/local/bin/kubectl && chmod +x /usr/local/bin/kubectl &&\
|
||||
cp kubectl /usr/bin/kubectl && chmod +x /usr/bin/kubectl
|
||||
|
||||
# This overwrites any existing configuration in /etc/yum.repos.d/kubernetes.repo
|
||||
RUN dnf update && dnf install -y --setopt=install_weak_deps=False \
|
||||
@@ -45,7 +46,8 @@ RUN if [ -n "$PR_NUMBER" ]; then git fetch origin pull/${PR_NUMBER}/head:pr-${PR
|
||||
# if it is a TAG trigger checkout the tag
|
||||
RUN if [ -n "$TAG" ]; then git checkout "$TAG";fi
|
||||
|
||||
RUN python3.9 -m ensurepip
|
||||
RUN python3.9 -m ensurepip --upgrade --default-pip
|
||||
RUN python3.9 -m pip install --upgrade pip setuptools==70.0.0
|
||||
RUN pip3.9 install -r requirements.txt
|
||||
RUN pip3.9 install jsonschema
|
||||
|
||||
|
||||
@@ -101,12 +101,21 @@
|
||||
{
|
||||
"name": "alerts-path",
|
||||
"short_description": "Cluster alerts path file (in container)",
|
||||
"description": "Enables cluster alerts check",
|
||||
"description": "Allows to specify a different alert file path",
|
||||
"variable": "ALERTS_PATH",
|
||||
"type": "string",
|
||||
"default": "config/alerts.yaml",
|
||||
"required": "false"
|
||||
},
|
||||
{
|
||||
"name": "metrics-path",
|
||||
"short_description": "Cluster metrics path file (in container)",
|
||||
"description": "Allows to specify a different metrics file path",
|
||||
"variable": "METRICS_PATH",
|
||||
"type": "string",
|
||||
"default": "config/metrics-aggregated.yaml",
|
||||
"required": "false"
|
||||
},
|
||||
{
|
||||
"name": "enable-es",
|
||||
"short_description": "Enables elastic search data collection",
|
||||
@@ -125,7 +134,6 @@
|
||||
"variable": "ES_SERVER",
|
||||
"type": "string",
|
||||
"default": "http://0.0.0.0",
|
||||
"validator": "^(http|https):\/\/.*",
|
||||
"required": "false"
|
||||
},
|
||||
{
|
||||
@@ -166,28 +174,6 @@
|
||||
"default": "False",
|
||||
"required": "false"
|
||||
},
|
||||
{
|
||||
"name": "es-collect-metrics",
|
||||
"short_description": "Enables metrics collection on elastic search",
|
||||
"description": "Enables metrics collection on elastic search",
|
||||
"variable": "ES_COLLECT_METRICS",
|
||||
"type": "enum",
|
||||
"allowed_values": "True,False",
|
||||
"separator": ",",
|
||||
"default": "False",
|
||||
"required": "false"
|
||||
},
|
||||
{
|
||||
"name": "es-collect-alerts",
|
||||
"short_description": "Enables alerts collection on elastic search",
|
||||
"description": "Enables alerts collection on elastic search",
|
||||
"variable": "ES_COLLECT_ALERTS",
|
||||
"type": "enum",
|
||||
"allowed_values": "True,False",
|
||||
"separator": ",",
|
||||
"default": "False",
|
||||
"required": "false"
|
||||
},
|
||||
{
|
||||
"name": "es-metrics-index",
|
||||
"short_description": "Elasticsearch metrics index",
|
||||
@@ -381,6 +367,60 @@
|
||||
"default": "True",
|
||||
"required": "false"
|
||||
},
|
||||
{
|
||||
"name": "health-check-interval",
|
||||
"short_description": "Heath check interval",
|
||||
"description": "How often to check the health check urls",
|
||||
"variable": "HEALTH_CHECK_INTERVAL",
|
||||
"type": "number",
|
||||
"default": "2",
|
||||
"required": "false"
|
||||
},
|
||||
{
|
||||
"name": "health-check-url",
|
||||
"short_description": "Health check url",
|
||||
"description": "Url to check the health of",
|
||||
"variable": "HEALTH_CHECK_URL",
|
||||
"type": "string",
|
||||
"default": "",
|
||||
"required": "false"
|
||||
},
|
||||
{
|
||||
"name": "health-check-auth",
|
||||
"short_description": "Health check authentication tuple",
|
||||
"description": "Authentication tuple to authenticate into health check URL",
|
||||
"variable": "HEALTH_CHECK_AUTH",
|
||||
"type": "string",
|
||||
"default": "",
|
||||
"required": "false"
|
||||
},
|
||||
{
|
||||
"name": "health-check-bearer-token",
|
||||
"short_description": "Health check bearer token",
|
||||
"description": "Bearer token to authenticate into health check URL",
|
||||
"variable": "HEALTH_CHECK_BEARER_TOKEN",
|
||||
"type": "string",
|
||||
"default": "",
|
||||
"required": "false"
|
||||
},
|
||||
{
|
||||
"name": "health-check-exit",
|
||||
"short_description": "Health check exit on failure",
|
||||
"description": "Exit on failure when health check URL is not able to connect",
|
||||
"variable": "HEALTH_CHECK_EXIT_ON_FAILURE",
|
||||
"type": "string",
|
||||
"default": "",
|
||||
"required": "false"
|
||||
},
|
||||
{
|
||||
"name": "health-check-verify",
|
||||
"short_description": "SSL Verification of health check url",
|
||||
"description": "SSL Verification to authenticate into health check URL",
|
||||
"variable": "HEALTH_CHECK_VERIFY",
|
||||
"type": "string",
|
||||
"default": "false",
|
||||
"required": "false"
|
||||
},
|
||||
{
|
||||
"name": "krkn-debug",
|
||||
"short_description": "Krkn debug mode",
|
||||
@@ -392,5 +432,4 @@
|
||||
"default": "False",
|
||||
"required": "false"
|
||||
}
|
||||
]
|
||||
|
||||
]
|
||||
@@ -12,10 +12,6 @@ Config components:
|
||||
# Kraken
|
||||
This section defines scenarios and specific data to the chaos run
|
||||
|
||||
## Distribution
|
||||
Either **openshift** or **kubernetes** depending on the type of cluster you want to run chaos on.
|
||||
The prometheus url/route and bearer token are automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes.
|
||||
|
||||
## Exit on failure
|
||||
**exit_on_failure**: Exit when a post action check or cerberus run fails
|
||||
|
||||
|
||||
59
docs/health_checks.md
Normal file
59
docs/health_checks.md
Normal file
@@ -0,0 +1,59 @@
|
||||
### Health Checks
|
||||
|
||||
Health checks provide real-time visibility into the impact of chaos scenarios on application availability and performance. Health check configuration supports application endpoints accessible via http / https along with authentication mechanism such as bearer token and authentication credentials.
|
||||
Health checks are configured in the ```config.yaml```
|
||||
|
||||
The system periodically checks the provided URLs based on the defined interval and records the results in Telemetry. The telemetry data includes:
|
||||
|
||||
- Success response ```200``` when the application is running normally.
|
||||
- Failure response other than 200 if the application experiences downtime or errors.
|
||||
|
||||
This helps users quickly identify application health issues and take necessary actions.
|
||||
|
||||
#### Sample health check config
|
||||
```
|
||||
health_checks:
|
||||
interval: <time_in_seconds> # Defines the frequency of health checks, default value is 2 seconds
|
||||
config: # List of application endpoints to check
|
||||
- url: "https://example.com/health"
|
||||
bearer_token: "hfjauljl..." # Bearer token for authentication if any
|
||||
auth:
|
||||
exit_on_failure: True # If value is True exits when health check failed for application, values can be True/False
|
||||
- url: "https://another-service.com/status"
|
||||
bearer_token:
|
||||
auth: ("admin","secretpassword") # Provide authentication credentials (username , password) in tuple format if any, ex:("admin","secretpassword")
|
||||
exit_on_failure: False
|
||||
- url: http://general-service.com
|
||||
bearer_token:
|
||||
auth:
|
||||
exit_on_failure:
|
||||
```
|
||||
#### Sample health check telemetry
|
||||
```
|
||||
"health_checks": [
|
||||
{
|
||||
"url": "https://example.com/health",
|
||||
"status": False,
|
||||
"status_code": "503",
|
||||
"start_timestamp": "2025-02-25 11:51:33",
|
||||
"end_timestamp": "2025-02-25 11:51:40",
|
||||
"duration": "0:00:07"
|
||||
},
|
||||
{
|
||||
"url": "https://another-service.com/status",
|
||||
"status": True,
|
||||
"status_code": 200,
|
||||
"start_timestamp": "2025-02-25 22:18:19",
|
||||
"end_timestamp": "22025-02-25 22:22:46",
|
||||
"duration": "0:04:27"
|
||||
},
|
||||
{
|
||||
"url": "http://general-service.com",
|
||||
"status": True,
|
||||
"status_code": 200,
|
||||
"start_timestamp": "2025-02-25 22:18:19",
|
||||
"end_timestamp": "22025-02-25 22:22:46",
|
||||
"duration": "0:04:27"
|
||||
}
|
||||
],
|
||||
```
|
||||
@@ -18,6 +18,8 @@ these workloads will use a predetermined amount of resources for a specified dur
|
||||
|`namespace`| string | the namespace where the stress workload will be deployed |
|
||||
|`node-selector`| string (Optional) | defines the node selector for choosing target nodes. If not specified, one schedulable node in the cluster will be chosen at random. If multiple nodes match the selector, all of them will be subjected to stress. If number-of-nodes is specified, that many nodes will be randomly selected from those identified by the selector. |
|
||||
|`number-of-nodes`| number (Optional) | restricts the number of selected nodes by the selector|
|
||||
|`taints`| list (Optional) default [] | list of taints for which tolerations need to created. Example: ["node-role.kubernetes.io/master:NoSchedule"]|
|
||||
|
||||
|
||||
|
||||
#### `cpu-hog` options
|
||||
|
||||
@@ -88,7 +88,8 @@ We want to look at this in terms of CPU, Memory, Disk, Throughput, Network etc.
|
||||
- Appropriate caching and Content Delivery Network should be enabled to be performant and usable when there is a latency on the client side.
|
||||
- Not every user or machine has access to unlimited bandwidth, there might be a delay on the user side ( client ) to access the API’s due to limited bandwidth, throttling or latency depending on the geographic location. It is important to inject latency between the client and API calls to understand the behavior and optimize things including caching wherever possible, using CDN’s or opting for different protocols like HTTP/2 or HTTP/3 vs HTTP.
|
||||
|
||||
|
||||
- Ensure Disruption Budgets are enabled for your critical applications
|
||||
- Protect your application during disruptions by setting a [pod disruption budget](https://kubernetes.io/docs/tasks/run-application/configure-pdb/) to avoid downtime. For instance, etcd, zookeeper or similar applications need at least 2 replicas to maintain quorum. This can be ensured by setting PDB maxUnavailable to 1.
|
||||
|
||||
|
||||
### Tooling
|
||||
|
||||
@@ -1,6 +1,9 @@
|
||||
### Service Disruption Scenarios (Previously Delete Namespace Scenario)
|
||||
|
||||
Using this type of scenario configuration one is able to delete crucial objects in a specific namespace, or a namespace matching a certain regex string.
|
||||
Using this type of scenario configuration one is able to delete crucial objects in a specific namespace, or a namespace matching a certain regex string. The goal of this scenario is to ensure Pod Disruption Budgets with appropriate configurations are set to have minimum number of replicas are running at a given time and avoid downtime.
|
||||
|
||||
|
||||
**NOTE**: Protect your application during disruptions by setting a [pod disruption budget](https://kubernetes.io/docs/tasks/run-application/configure-pdb/) to avoid downtime. For instance, etcd, zookeeper or similar applications need at least 2 replicas to maintain quorum. This can be ensured by setting PDB maxUnavailable to 1.
|
||||
|
||||
Configuration Options:
|
||||
|
||||
|
||||
@@ -2,10 +2,11 @@ from __future__ import annotations
|
||||
|
||||
import datetime
|
||||
import os.path
|
||||
import math
|
||||
from typing import Optional, List, Dict, Any
|
||||
|
||||
import urllib3
|
||||
import logging
|
||||
import urllib3
|
||||
import sys
|
||||
|
||||
import yaml
|
||||
@@ -25,8 +26,7 @@ def alerts(
|
||||
start_time,
|
||||
end_time,
|
||||
alert_profile,
|
||||
elastic_collect_alerts,
|
||||
elastic_alerts_index,
|
||||
elastic_alerts_index
|
||||
):
|
||||
|
||||
if alert_profile is None or os.path.exists(alert_profile) is False:
|
||||
@@ -46,6 +46,7 @@ def alerts(
|
||||
for alert in profile_yaml:
|
||||
if list(alert.keys()).sort() != ["expr", "description", "severity"].sort():
|
||||
logging.error(f"wrong alert {alert}, skipping")
|
||||
continue
|
||||
|
||||
processed_alert = prom_cli.process_alert(
|
||||
alert,
|
||||
@@ -56,7 +57,6 @@ def alerts(
|
||||
processed_alert[0]
|
||||
and processed_alert[1]
|
||||
and elastic
|
||||
and elastic_collect_alerts
|
||||
):
|
||||
elastic_alert = ElasticAlert(
|
||||
run_uuid=run_uuid,
|
||||
@@ -156,15 +156,15 @@ def metrics(
|
||||
start_time,
|
||||
end_time,
|
||||
metrics_profile,
|
||||
elastic_collect_metrics,
|
||||
elastic_metrics_index,
|
||||
elastic_metrics_index
|
||||
) -> list[dict[str, list[(int, float)] | str]]:
|
||||
metrics_list: list[dict[str, list[(int, float)] | str]] = []
|
||||
|
||||
if metrics_profile is None or os.path.exists(metrics_profile) is False:
|
||||
logging.error(f"{metrics_profile} alert profile does not exist")
|
||||
sys.exit(1)
|
||||
with open(metrics_profile) as profile:
|
||||
profile_yaml = yaml.safe_load(profile)
|
||||
|
||||
if not profile_yaml["metrics"] or not isinstance(profile_yaml["metrics"], list):
|
||||
logging.error(
|
||||
f"{metrics_profile} wrong file format, alert profile must be "
|
||||
@@ -172,30 +172,58 @@ def metrics(
|
||||
f"expr, description, severity"
|
||||
)
|
||||
sys.exit(1)
|
||||
|
||||
elapsed_ceil = math.ceil((end_time - start_time)/ 60 )
|
||||
elapsed_time = str(elapsed_ceil) + "m"
|
||||
metrics_list: list[dict[str, int | float | str]] = []
|
||||
for metric_query in profile_yaml["metrics"]:
|
||||
if (
|
||||
query = metric_query['query']
|
||||
|
||||
# calculate elapsed time
|
||||
if ".elapsed" in metric_query["query"]:
|
||||
query = metric_query['query'].replace(".elapsed", elapsed_time)
|
||||
if "instant" in list(metric_query.keys()) and metric_query['instant']:
|
||||
metrics_result = prom_cli.process_query(
|
||||
query
|
||||
)
|
||||
elif (
|
||||
list(metric_query.keys()).sort()
|
||||
!= ["query", "metricName", "instant"].sort()
|
||||
== ["query", "metricName"].sort()
|
||||
):
|
||||
logging.error(f"wrong alert {metric_query}, skipping")
|
||||
metrics_result = prom_cli.process_prom_query_in_range(
|
||||
metric_query["query"],
|
||||
start_time=datetime.datetime.fromtimestamp(start_time),
|
||||
end_time=datetime.datetime.fromtimestamp(end_time),
|
||||
)
|
||||
|
||||
metric = {"name": metric_query["metricName"], "values": []}
|
||||
metrics_result = prom_cli.process_prom_query_in_range(
|
||||
query,
|
||||
start_time=datetime.datetime.fromtimestamp(start_time),
|
||||
end_time=datetime.datetime.fromtimestamp(end_time), granularity=30
|
||||
)
|
||||
else:
|
||||
logging.info('didnt match keys')
|
||||
continue
|
||||
|
||||
for returned_metric in metrics_result:
|
||||
if "values" in returned_metric:
|
||||
metric = {"query": query, "metricName": metric_query['metricName']}
|
||||
for k,v in returned_metric['metric'].items():
|
||||
metric[k] = v
|
||||
|
||||
if "values" in returned_metric:
|
||||
for value in returned_metric["values"]:
|
||||
try:
|
||||
metric["values"].append((value[0], float(value[1])))
|
||||
metric['timestamp'] = str(datetime.datetime.fromtimestamp(value[0]))
|
||||
metric["value"] = float(value[1])
|
||||
# want double array of the known details and the metrics specific to each call
|
||||
metrics_list.append(metric.copy())
|
||||
except ValueError:
|
||||
pass
|
||||
metrics_list.append(metric)
|
||||
elif "value" in returned_metric:
|
||||
try:
|
||||
value = returned_metric["value"]
|
||||
metric['timestamp'] = str(datetime.datetime.fromtimestamp(value[0]))
|
||||
metric["value"] = float(value[1])
|
||||
|
||||
if elastic_collect_metrics and elastic:
|
||||
# want double array of the known details and the metrics specific to each call
|
||||
metrics_list.append(metric.copy())
|
||||
except ValueError:
|
||||
pass
|
||||
|
||||
if elastic:
|
||||
result = elastic.upload_metrics_to_elasticsearch(
|
||||
run_uuid=run_uuid, index=elastic_metrics_index, raw_data=metrics_list
|
||||
)
|
||||
|
||||
@@ -68,6 +68,7 @@ class AbstractScenarioPlugin(ABC):
|
||||
|
||||
scenario_telemetry = ScenarioTelemetry()
|
||||
scenario_telemetry.scenario = scenario_config
|
||||
scenario_telemetry.scenario_type = self.get_scenario_types()[0]
|
||||
scenario_telemetry.start_timestamp = time.time()
|
||||
parsed_scenario_config = telemetry.set_parameters_base64(
|
||||
scenario_telemetry, scenario_config
|
||||
@@ -103,7 +104,7 @@ class AbstractScenarioPlugin(ABC):
|
||||
|
||||
if events_backup:
|
||||
utils.populate_cluster_events(
|
||||
scenario_telemetry,
|
||||
krkn_config,
|
||||
parsed_scenario_config,
|
||||
telemetry.get_lib_kubernetes(),
|
||||
int(scenario_telemetry.start_timestamp),
|
||||
|
||||
@@ -3,7 +3,7 @@ import time
|
||||
import yaml
|
||||
from krkn_lib.models.telemetry import ScenarioTelemetry
|
||||
from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift
|
||||
from krkn_lib.utils import get_yaml_item_value
|
||||
from krkn_lib.utils import get_yaml_item_value, get_random_string
|
||||
from jinja2 import Template
|
||||
from krkn import cerberus
|
||||
from krkn.scenario_plugins.abstract_scenario_plugin import AbstractScenarioPlugin
|
||||
@@ -33,17 +33,22 @@ class ApplicationOutageScenarioPlugin(AbstractScenarioPlugin):
|
||||
duration = get_yaml_item_value(scenario_config, "duration", 60)
|
||||
|
||||
start_time = int(time.time())
|
||||
policy_name = f"krkn-deny-{get_random_string(5)}"
|
||||
|
||||
network_policy_template = """---
|
||||
network_policy_template = (
|
||||
"""---
|
||||
apiVersion: networking.k8s.io/v1
|
||||
kind: NetworkPolicy
|
||||
metadata:
|
||||
name: kraken-deny
|
||||
name: """
|
||||
+ policy_name
|
||||
+ """
|
||||
spec:
|
||||
podSelector:
|
||||
matchLabels: {{ pod_selector }}
|
||||
policyTypes: {{ traffic_type }}
|
||||
"""
|
||||
)
|
||||
t = Template(network_policy_template)
|
||||
rendered_spec = t.render(
|
||||
pod_selector=pod_selector, traffic_type=traffic_type
|
||||
@@ -65,7 +70,7 @@ class ApplicationOutageScenarioPlugin(AbstractScenarioPlugin):
|
||||
# unblock the traffic by deleting the network policy
|
||||
logging.info("Deleting the network policy")
|
||||
lib_telemetry.get_lib_kubernetes().delete_net_policy(
|
||||
"kraken-deny", namespace
|
||||
policy_name, namespace
|
||||
)
|
||||
|
||||
logging.info(
|
||||
|
||||
@@ -22,9 +22,7 @@ class ContainerScenarioPlugin(AbstractScenarioPlugin):
|
||||
lib_telemetry: KrknTelemetryOpenshift,
|
||||
scenario_telemetry: ScenarioTelemetry,
|
||||
) -> int:
|
||||
start_time = int(time.time())
|
||||
pool = PodsMonitorPool(lib_telemetry.get_lib_kubernetes())
|
||||
wait_duration = krkn_config["tunings"]["wait_duration"]
|
||||
try:
|
||||
with open(scenario, "r") as f:
|
||||
cont_scenario_config = yaml.full_load(f)
|
||||
@@ -45,16 +43,10 @@ class ContainerScenarioPlugin(AbstractScenarioPlugin):
|
||||
)
|
||||
return 1
|
||||
scenario_telemetry.affected_pods = result
|
||||
logging.info("Waiting for the specified duration: %s" % (wait_duration))
|
||||
time.sleep(wait_duration)
|
||||
|
||||
# capture end time
|
||||
end_time = int(time.time())
|
||||
|
||||
# publish cerberus status
|
||||
cerberus.publish_kraken_status(krkn_config, [], start_time, end_time)
|
||||
except (RuntimeError, Exception):
|
||||
logging.error("ContainerScenarioPlugin exiting due to Exception %s" % e)
|
||||
logging.error("ContainerScenarioPlugin exiting due to Exception %s")
|
||||
return 1
|
||||
else:
|
||||
return 0
|
||||
|
||||
@@ -33,7 +33,7 @@ class HogsScenarioPlugin(AbstractScenarioPlugin):
|
||||
else:
|
||||
node_selector = scenario_config.node_selector
|
||||
|
||||
available_nodes = lib_telemetry.get_lib_kubernetes().list_schedulable_nodes(node_selector)
|
||||
available_nodes = lib_telemetry.get_lib_kubernetes().list_nodes(node_selector)
|
||||
if len(available_nodes) == 0:
|
||||
raise Exception("no available nodes to schedule workload")
|
||||
|
||||
|
||||
@@ -49,7 +49,7 @@ class NativeScenarioPlugin(AbstractScenarioPlugin):
|
||||
return [
|
||||
"pod_disruption_scenarios",
|
||||
"pod_network_scenarios",
|
||||
"ibmcloud_node_scenarios",
|
||||
"ingress_node_scenarios"
|
||||
]
|
||||
|
||||
def start_monitoring(self, pool: PodsMonitorPool, scenarios: list[Any]):
|
||||
|
||||
@@ -97,15 +97,6 @@ class NetworkScenarioConfig:
|
||||
},
|
||||
)
|
||||
|
||||
kraken_config: typing.Optional[str] = field(
|
||||
default="",
|
||||
metadata={
|
||||
"name": "Kraken Config",
|
||||
"description": "Path to the config file of Kraken. "
|
||||
"Set this field if you wish to publish status onto Cerberus",
|
||||
},
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
class NetworkScenarioSuccessOutput:
|
||||
@@ -710,6 +701,7 @@ def network_chaos(
|
||||
pod_module_template = env.get_template("pod_module.j2")
|
||||
cli, batch_cli = kube_helper.setup_kubernetes(cfg.kubeconfig_path)
|
||||
|
||||
logging.info("Starting Ingress Network Chaos")
|
||||
try:
|
||||
node_interface_dict = get_node_interfaces(
|
||||
cfg.node_interface_name,
|
||||
@@ -721,16 +713,6 @@ def network_chaos(
|
||||
except Exception:
|
||||
return "error", NetworkScenarioErrorOutput(format_exc())
|
||||
job_list = []
|
||||
publish = False
|
||||
if cfg.kraken_config:
|
||||
failed_post_scenarios = ""
|
||||
try:
|
||||
with open(cfg.kraken_config, "r") as f:
|
||||
config = yaml.full_load(f)
|
||||
except Exception:
|
||||
logging.error("Error reading Kraken config from %s" % cfg.kraken_config)
|
||||
return "error", NetworkScenarioErrorOutput(format_exc())
|
||||
publish = True
|
||||
|
||||
try:
|
||||
if cfg.execution_type == "parallel":
|
||||
@@ -747,13 +729,7 @@ def network_chaos(
|
||||
)
|
||||
)
|
||||
logging.info("Waiting for parallel job to finish")
|
||||
start_time = int(time.time())
|
||||
wait_for_job(batch_cli, job_list[:], cfg.test_duration + 100)
|
||||
end_time = int(time.time())
|
||||
if publish:
|
||||
cerberus.publish_kraken_status(
|
||||
config, failed_post_scenarios, start_time, end_time
|
||||
)
|
||||
|
||||
elif cfg.execution_type == "serial":
|
||||
create_interfaces = True
|
||||
@@ -773,18 +749,12 @@ def network_chaos(
|
||||
)
|
||||
)
|
||||
logging.info("Waiting for serial job to finish")
|
||||
start_time = int(time.time())
|
||||
wait_for_job(batch_cli, job_list[:], cfg.test_duration + 100)
|
||||
logging.info("Deleting jobs")
|
||||
delete_jobs(cli, batch_cli, job_list[:])
|
||||
job_list = []
|
||||
logging.info("Waiting for wait_duration : %ss" % cfg.wait_duration)
|
||||
time.sleep(cfg.wait_duration)
|
||||
end_time = int(time.time())
|
||||
if publish:
|
||||
cerberus.publish_kraken_status(
|
||||
config, failed_post_scenarios, start_time, end_time
|
||||
)
|
||||
create_interfaces = False
|
||||
else:
|
||||
|
||||
@@ -799,7 +769,7 @@ def network_chaos(
|
||||
execution_type=cfg.execution_type,
|
||||
)
|
||||
except Exception as e:
|
||||
logging.error("Network Chaos exiting due to Exception - %s" % e)
|
||||
logging.error("Ingress Network Chaos exiting due to Exception - %s" % e)
|
||||
return "error", NetworkScenarioErrorOutput(format_exc())
|
||||
finally:
|
||||
delete_virtual_interfaces(cli, node_interface_dict.keys(), pod_module_template)
|
||||
|
||||
@@ -1,589 +0,0 @@
|
||||
#!/usr/bin/env python
|
||||
import time
|
||||
import typing
|
||||
from os import environ
|
||||
from dataclasses import dataclass, field
|
||||
from traceback import format_exc
|
||||
import logging
|
||||
from krkn.scenario_plugins.native.node_scenarios import (
|
||||
kubernetes_functions as kube_helper,
|
||||
)
|
||||
from arcaflow_plugin_sdk import validation, plugin
|
||||
from kubernetes import client, watch
|
||||
from ibm_vpc import VpcV1
|
||||
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
|
||||
import sys
|
||||
|
||||
|
||||
class IbmCloud:
|
||||
def __init__(self):
|
||||
"""
|
||||
Initialize the ibm cloud client by using the the env variables:
|
||||
'IBMC_APIKEY' 'IBMC_URL'
|
||||
"""
|
||||
apiKey = environ.get("IBMC_APIKEY")
|
||||
service_url = environ.get("IBMC_URL")
|
||||
if not apiKey:
|
||||
raise Exception("Environmental variable 'IBMC_APIKEY' is not set")
|
||||
if not service_url:
|
||||
raise Exception("Environmental variable 'IBMC_URL' is not set")
|
||||
try:
|
||||
authenticator = IAMAuthenticator(apiKey)
|
||||
self.service = VpcV1(authenticator=authenticator)
|
||||
|
||||
self.service.set_service_url(service_url)
|
||||
except Exception as e:
|
||||
logging.error("error authenticating" + str(e))
|
||||
|
||||
|
||||
# Get the instance ID of the node
|
||||
def get_instance_id(self, node_name):
|
||||
node_list = self.list_instances()
|
||||
for node in node_list:
|
||||
if node_name == node["vpc_name"]:
|
||||
return node["vpc_id"]
|
||||
logging.error("Couldn't find node with name " + str(node_name) + ", you could try another region")
|
||||
sys.exit(1)
|
||||
|
||||
def delete_instance(self, instance_id):
|
||||
"""
|
||||
Deletes the Instance whose name is given by 'instance_id'
|
||||
"""
|
||||
try:
|
||||
self.service.delete_instance(instance_id)
|
||||
logging.info("Deleted Instance -- '{}'".format(instance_id))
|
||||
except Exception as e:
|
||||
logging.info("Instance '{}' could not be deleted. ".format(instance_id))
|
||||
return False
|
||||
|
||||
def reboot_instances(self, instance_id):
|
||||
"""
|
||||
Reboots the Instance whose name is given by 'instance_id'. Returns True if successful, or
|
||||
returns False if the Instance is not powered on
|
||||
"""
|
||||
|
||||
try:
|
||||
self.service.create_instance_action(
|
||||
instance_id,
|
||||
type="reboot",
|
||||
)
|
||||
logging.info("Reset Instance -- '{}'".format(instance_id))
|
||||
return True
|
||||
except Exception as e:
|
||||
logging.info("Instance '{}' could not be rebooted".format(instance_id))
|
||||
return False
|
||||
|
||||
def stop_instances(self, instance_id):
|
||||
"""
|
||||
Stops the Instance whose name is given by 'instance_id'. Returns True if successful, or
|
||||
returns False if the Instance is already stopped
|
||||
"""
|
||||
|
||||
try:
|
||||
self.service.create_instance_action(
|
||||
instance_id,
|
||||
type="stop",
|
||||
)
|
||||
logging.info("Stopped Instance -- '{}'".format(instance_id))
|
||||
return True
|
||||
except Exception as e:
|
||||
logging.info("Instance '{}' could not be stopped".format(instance_id))
|
||||
logging.info("error" + str(e))
|
||||
return False
|
||||
|
||||
def start_instances(self, instance_id):
|
||||
"""
|
||||
Stops the Instance whose name is given by 'instance_id'. Returns True if successful, or
|
||||
returns False if the Instance is already running
|
||||
"""
|
||||
|
||||
try:
|
||||
self.service.create_instance_action(
|
||||
instance_id,
|
||||
type="start",
|
||||
)
|
||||
logging.info("Started Instance -- '{}'".format(instance_id))
|
||||
return True
|
||||
except Exception as e:
|
||||
logging.info("Instance '{}' could not start running".format(instance_id))
|
||||
return False
|
||||
|
||||
def list_instances(self):
|
||||
"""
|
||||
Returns a list of Instances present in the datacenter
|
||||
"""
|
||||
instance_names = []
|
||||
try:
|
||||
instances_result = self.service.list_instances().get_result()
|
||||
instances_list = instances_result["instances"]
|
||||
for vpc in instances_list:
|
||||
instance_names.append({"vpc_name": vpc["name"], "vpc_id": vpc["id"]})
|
||||
starting_count = instances_result["total_count"]
|
||||
while instances_result["total_count"] == instances_result["limit"]:
|
||||
instances_result = self.service.list_instances(
|
||||
start=starting_count
|
||||
).get_result()
|
||||
instances_list = instances_result["instances"]
|
||||
starting_count += instances_result["total_count"]
|
||||
for vpc in instances_list:
|
||||
instance_names.append({"vpc_name": vpc.name, "vpc_id": vpc.id})
|
||||
except Exception as e:
|
||||
logging.error("Error listing out instances: " + str(e))
|
||||
sys.exit(1)
|
||||
return instance_names
|
||||
|
||||
def find_id_in_list(self, name, vpc_list):
|
||||
for vpc in vpc_list:
|
||||
if vpc["vpc_name"] == name:
|
||||
return vpc["vpc_id"]
|
||||
|
||||
def get_instance_status(self, instance_id):
|
||||
"""
|
||||
Returns the status of the Instance whose name is given by 'instance_id'
|
||||
"""
|
||||
|
||||
try:
|
||||
instance = self.service.get_instance(instance_id).get_result()
|
||||
state = instance["status"]
|
||||
return state
|
||||
except Exception as e:
|
||||
logging.error(
|
||||
"Failed to get node instance status %s. Encountered following "
|
||||
"exception: %s." % (instance_id, e)
|
||||
)
|
||||
return None
|
||||
|
||||
def wait_until_deleted(self, instance_id, timeout):
|
||||
"""
|
||||
Waits until the instance is deleted or until the timeout. Returns True if
|
||||
the instance is successfully deleted, else returns False
|
||||
"""
|
||||
|
||||
time_counter = 0
|
||||
vpc = self.get_instance_status(instance_id)
|
||||
while vpc is not None:
|
||||
vpc = self.get_instance_status(instance_id)
|
||||
logging.info(
|
||||
"Instance %s is still being deleted, sleeping for 5 seconds"
|
||||
% instance_id
|
||||
)
|
||||
time.sleep(5)
|
||||
time_counter += 5
|
||||
if time_counter >= timeout:
|
||||
logging.info(
|
||||
"Instance %s is still not deleted in allotted time" % instance_id
|
||||
)
|
||||
return False
|
||||
return True
|
||||
|
||||
def wait_until_running(self, instance_id, timeout):
|
||||
"""
|
||||
Waits until the Instance switches to running state or until the timeout.
|
||||
Returns True if the Instance switches to running, else returns False
|
||||
"""
|
||||
|
||||
time_counter = 0
|
||||
status = self.get_instance_status(instance_id)
|
||||
while status != "running":
|
||||
status = self.get_instance_status(instance_id)
|
||||
logging.info(
|
||||
"Instance %s is still not running, sleeping for 5 seconds" % instance_id
|
||||
)
|
||||
time.sleep(5)
|
||||
time_counter += 5
|
||||
if time_counter >= timeout:
|
||||
logging.info(
|
||||
"Instance %s is still not ready in allotted time" % instance_id
|
||||
)
|
||||
return False
|
||||
return True
|
||||
|
||||
def wait_until_stopped(self, instance_id, timeout):
|
||||
"""
|
||||
Waits until the Instance switches to stopped state or until the timeout.
|
||||
Returns True if the Instance switches to stopped, else returns False
|
||||
"""
|
||||
|
||||
time_counter = 0
|
||||
status = self.get_instance_status(instance_id)
|
||||
while status != "stopped":
|
||||
status = self.get_instance_status(instance_id)
|
||||
logging.info(
|
||||
"Instance %s is still not stopped, sleeping for 5 seconds" % instance_id
|
||||
)
|
||||
time.sleep(5)
|
||||
time_counter += 5
|
||||
if time_counter >= timeout:
|
||||
logging.info(
|
||||
"Instance %s is still not stopped in allotted time" % instance_id
|
||||
)
|
||||
return False
|
||||
return True
|
||||
|
||||
def wait_until_rebooted(self, instance_id, timeout):
|
||||
"""
|
||||
Waits until the Instance switches to restarting state and then running state or until the timeout.
|
||||
Returns True if the Instance switches back to running, else returns False
|
||||
"""
|
||||
|
||||
time_counter = 0
|
||||
status = self.get_instance_status(instance_id)
|
||||
while status == "starting":
|
||||
status = self.get_instance_status(instance_id)
|
||||
logging.info(
|
||||
"Instance %s is still restarting, sleeping for 5 seconds" % instance_id
|
||||
)
|
||||
time.sleep(5)
|
||||
time_counter += 5
|
||||
if time_counter >= timeout:
|
||||
logging.info(
|
||||
"Instance %s is still restarting after allotted time" % instance_id
|
||||
)
|
||||
return False
|
||||
self.wait_until_running(instance_id, timeout)
|
||||
return True
|
||||
|
||||
|
||||
@dataclass
|
||||
class Node:
|
||||
name: str
|
||||
|
||||
|
||||
@dataclass
|
||||
class NodeScenarioSuccessOutput:
|
||||
|
||||
nodes: typing.Dict[int, Node] = field(
|
||||
metadata={
|
||||
"name": "Nodes started/stopped/terminated/rebooted",
|
||||
"description": """Map between timestamps and the pods started/stopped/terminated/rebooted.
|
||||
The timestamp is provided in nanoseconds""",
|
||||
}
|
||||
)
|
||||
action: kube_helper.Actions = field(
|
||||
metadata={
|
||||
"name": "The action performed on the node",
|
||||
"description": """The action performed or attempted to be performed on the node. Possible values
|
||||
are : Start, Stop, Terminate, Reboot""",
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
class NodeScenarioErrorOutput:
|
||||
|
||||
error: str
|
||||
action: kube_helper.Actions = field(
|
||||
metadata={
|
||||
"name": "The action performed on the node",
|
||||
"description": """The action attempted to be performed on the node. Possible values are : Start
|
||||
Stop, Terminate, Reboot""",
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
class NodeScenarioConfig:
|
||||
|
||||
name: typing.Annotated[
|
||||
typing.Optional[str],
|
||||
validation.required_if_not("label_selector"),
|
||||
validation.required_if("skip_openshift_checks"),
|
||||
] = field(
|
||||
default=None,
|
||||
metadata={
|
||||
"name": "Name",
|
||||
"description": "Name(s) for target nodes. Required if label_selector is not set.",
|
||||
},
|
||||
)
|
||||
|
||||
runs: typing.Annotated[typing.Optional[int], validation.min(1)] = field(
|
||||
default=1,
|
||||
metadata={
|
||||
"name": "Number of runs per node",
|
||||
"description": "Number of times to inject each scenario under actions (will perform on same node each time)",
|
||||
},
|
||||
)
|
||||
|
||||
label_selector: typing.Annotated[
|
||||
typing.Optional[str], validation.min(1), validation.required_if_not("name")
|
||||
] = field(
|
||||
default=None,
|
||||
metadata={
|
||||
"name": "Label selector",
|
||||
"description": "Kubernetes label selector for the target nodes. Required if name is not set.\n"
|
||||
"See https://kubernetes.io/docs/concepts/overview/working-with-objects/labels/ for details.",
|
||||
},
|
||||
)
|
||||
|
||||
timeout: typing.Annotated[typing.Optional[int], validation.min(1)] = field(
|
||||
default=180,
|
||||
metadata={
|
||||
"name": "Timeout",
|
||||
"description": "Timeout to wait for the target pod(s) to be removed in seconds.",
|
||||
},
|
||||
)
|
||||
|
||||
instance_count: typing.Annotated[typing.Optional[int], validation.min(1)] = field(
|
||||
default=1,
|
||||
metadata={
|
||||
"name": "Instance Count",
|
||||
"description": "Number of nodes to perform action/select that match the label selector.",
|
||||
},
|
||||
)
|
||||
|
||||
skip_openshift_checks: typing.Optional[bool] = field(
|
||||
default=False,
|
||||
metadata={
|
||||
"name": "Skip Openshift Checks",
|
||||
"description": "Skip checking the status of the openshift nodes.",
|
||||
},
|
||||
)
|
||||
|
||||
kubeconfig_path: typing.Optional[str] = field(
|
||||
default=None,
|
||||
metadata={
|
||||
"name": "Kubeconfig path",
|
||||
"description": "Path to your Kubeconfig file. Defaults to ~/.kube/config.\n"
|
||||
"See https://kubernetes.io/docs/concepts/configuration/organize-cluster-access-kubeconfig/ for "
|
||||
"details.",
|
||||
},
|
||||
)
|
||||
|
||||
|
||||
@plugin.step(
|
||||
id="ibmcloud-node-start",
|
||||
name="Start the node",
|
||||
description="Start the node(s) by starting the Ibmcloud Instance on which the node is configured",
|
||||
outputs={"success": NodeScenarioSuccessOutput, "error": NodeScenarioErrorOutput},
|
||||
)
|
||||
def node_start(
|
||||
cfg: NodeScenarioConfig,
|
||||
) -> typing.Tuple[
|
||||
str, typing.Union[NodeScenarioSuccessOutput, NodeScenarioErrorOutput]
|
||||
]:
|
||||
with kube_helper.setup_kubernetes(None) as cli:
|
||||
ibmcloud = IbmCloud()
|
||||
core_v1 = client.CoreV1Api(cli)
|
||||
watch_resource = watch.Watch()
|
||||
node_list = kube_helper.get_node_list(cfg, kube_helper.Actions.START, core_v1)
|
||||
node_name_id_list = ibmcloud.list_instances()
|
||||
nodes_started = {}
|
||||
for name in node_list:
|
||||
try:
|
||||
for _ in range(cfg.runs):
|
||||
logging.info("Starting node_start_scenario injection")
|
||||
logging.info("Starting the node %s " % (name))
|
||||
instance_id = ibmcloud.find_id_in_list(name, node_name_id_list)
|
||||
if instance_id:
|
||||
vm_started = ibmcloud.start_instances(instance_id)
|
||||
if vm_started:
|
||||
ibmcloud.wait_until_running(instance_id, cfg.timeout)
|
||||
if not cfg.skip_openshift_checks:
|
||||
kube_helper.wait_for_ready_status(
|
||||
name, cfg.timeout, watch_resource, core_v1
|
||||
)
|
||||
nodes_started[int(time.time_ns())] = Node(name=name)
|
||||
logging.info(
|
||||
"Node with instance ID: %s is in running state" % name
|
||||
)
|
||||
logging.info(
|
||||
"node_start_scenario has been successfully injected!"
|
||||
)
|
||||
else:
|
||||
logging.error(
|
||||
"Failed to find node that matched instances on ibm cloud in region"
|
||||
)
|
||||
return "error", NodeScenarioErrorOutput(
|
||||
"No matching vpc with node name " + name,
|
||||
kube_helper.Actions.START,
|
||||
)
|
||||
except Exception as e:
|
||||
logging.error("Failed to start node instance. Test Failed")
|
||||
logging.error("node_start_scenario injection failed!")
|
||||
return "error", NodeScenarioErrorOutput(
|
||||
format_exc(), kube_helper.Actions.START
|
||||
)
|
||||
|
||||
return "success", NodeScenarioSuccessOutput(
|
||||
nodes_started, kube_helper.Actions.START
|
||||
)
|
||||
|
||||
|
||||
@plugin.step(
|
||||
id="ibmcloud-node-stop",
|
||||
name="Stop the node",
|
||||
description="Stop the node(s) by starting the Ibmcloud Instance on which the node is configured",
|
||||
outputs={"success": NodeScenarioSuccessOutput, "error": NodeScenarioErrorOutput},
|
||||
)
|
||||
def node_stop(
|
||||
cfg: NodeScenarioConfig,
|
||||
) -> typing.Tuple[
|
||||
str, typing.Union[NodeScenarioSuccessOutput, NodeScenarioErrorOutput]
|
||||
]:
|
||||
with kube_helper.setup_kubernetes(None) as cli:
|
||||
ibmcloud = IbmCloud()
|
||||
core_v1 = client.CoreV1Api(cli)
|
||||
watch_resource = watch.Watch()
|
||||
logging.info("set up done")
|
||||
node_list = kube_helper.get_node_list(cfg, kube_helper.Actions.STOP, core_v1)
|
||||
logging.info("set node list" + str(node_list))
|
||||
node_name_id_list = ibmcloud.list_instances()
|
||||
logging.info("node names" + str(node_name_id_list))
|
||||
nodes_stopped = {}
|
||||
for name in node_list:
|
||||
try:
|
||||
for _ in range(cfg.runs):
|
||||
logging.info("Starting node_stop_scenario injection")
|
||||
logging.info("Stopping the node %s " % (name))
|
||||
instance_id = ibmcloud.find_id_in_list(name, node_name_id_list)
|
||||
if instance_id:
|
||||
vm_stopped = ibmcloud.stop_instances(instance_id)
|
||||
if vm_stopped:
|
||||
ibmcloud.wait_until_stopped(instance_id, cfg.timeout)
|
||||
if not cfg.skip_openshift_checks:
|
||||
kube_helper.wait_for_ready_status(
|
||||
name, cfg.timeout, watch_resource, core_v1
|
||||
)
|
||||
nodes_stopped[int(time.time_ns())] = Node(name=name)
|
||||
logging.info(
|
||||
"Node with instance ID: %s is in stopped state" % name
|
||||
)
|
||||
logging.info(
|
||||
"node_stop_scenario has been successfully injected!"
|
||||
)
|
||||
else:
|
||||
logging.error(
|
||||
"Failed to find node that matched instances on ibm cloud in region"
|
||||
)
|
||||
return "error", NodeScenarioErrorOutput(
|
||||
"No matching vpc with node name " + name,
|
||||
kube_helper.Actions.STOP,
|
||||
)
|
||||
except Exception as e:
|
||||
logging.error("Failed to stop node instance. Test Failed")
|
||||
logging.error("node_stop_scenario injection failed!")
|
||||
return "error", NodeScenarioErrorOutput(
|
||||
format_exc(), kube_helper.Actions.STOP
|
||||
)
|
||||
|
||||
return "success", NodeScenarioSuccessOutput(
|
||||
nodes_stopped, kube_helper.Actions.STOP
|
||||
)
|
||||
|
||||
|
||||
@plugin.step(
|
||||
id="ibmcloud-node-reboot",
|
||||
name="Reboot Ibmcloud Instance",
|
||||
description="Reboot the node(s) by starting the Ibmcloud Instance on which the node is configured",
|
||||
outputs={"success": NodeScenarioSuccessOutput, "error": NodeScenarioErrorOutput},
|
||||
)
|
||||
def node_reboot(
|
||||
cfg: NodeScenarioConfig,
|
||||
) -> typing.Tuple[
|
||||
str, typing.Union[NodeScenarioSuccessOutput, NodeScenarioErrorOutput]
|
||||
]:
|
||||
with kube_helper.setup_kubernetes(None) as cli:
|
||||
ibmcloud = IbmCloud()
|
||||
core_v1 = client.CoreV1Api(cli)
|
||||
watch_resource = watch.Watch()
|
||||
node_list = kube_helper.get_node_list(cfg, kube_helper.Actions.REBOOT, core_v1)
|
||||
node_name_id_list = ibmcloud.list_instances()
|
||||
nodes_rebooted = {}
|
||||
for name in node_list:
|
||||
try:
|
||||
for _ in range(cfg.runs):
|
||||
logging.info("Starting node_reboot_scenario injection")
|
||||
logging.info("Rebooting the node %s " % (name))
|
||||
instance_id = ibmcloud.find_id_in_list(name, node_name_id_list)
|
||||
if instance_id:
|
||||
ibmcloud.reboot_instances(instance_id)
|
||||
ibmcloud.wait_until_rebooted(instance_id, cfg.timeout)
|
||||
if not cfg.skip_openshift_checks:
|
||||
kube_helper.wait_for_unknown_status(
|
||||
name, cfg.timeout, watch_resource, core_v1
|
||||
)
|
||||
kube_helper.wait_for_ready_status(
|
||||
name, cfg.timeout, watch_resource, core_v1
|
||||
)
|
||||
nodes_rebooted[int(time.time_ns())] = Node(name=name)
|
||||
logging.info(
|
||||
"Node with instance ID: %s has rebooted successfully" % name
|
||||
)
|
||||
logging.info(
|
||||
"node_reboot_scenario has been successfully injected!"
|
||||
)
|
||||
else:
|
||||
logging.error(
|
||||
"Failed to find node that matched instances on ibm cloud in region"
|
||||
)
|
||||
return "error", NodeScenarioErrorOutput(
|
||||
"No matching vpc with node name " + name,
|
||||
kube_helper.Actions.REBOOT,
|
||||
)
|
||||
except Exception as e:
|
||||
logging.error("Failed to reboot node instance. Test Failed")
|
||||
logging.error("node_reboot_scenario injection failed!")
|
||||
return "error", NodeScenarioErrorOutput(
|
||||
format_exc(), kube_helper.Actions.REBOOT
|
||||
)
|
||||
|
||||
return "success", NodeScenarioSuccessOutput(
|
||||
nodes_rebooted, kube_helper.Actions.REBOOT
|
||||
)
|
||||
|
||||
|
||||
@plugin.step(
|
||||
id="ibmcloud-node-terminate",
|
||||
name="Reboot Ibmcloud Instance",
|
||||
description="Wait for node to be deleted",
|
||||
outputs={"success": NodeScenarioSuccessOutput, "error": NodeScenarioErrorOutput},
|
||||
)
|
||||
def node_terminate(
|
||||
cfg: NodeScenarioConfig,
|
||||
) -> typing.Tuple[
|
||||
str, typing.Union[NodeScenarioSuccessOutput, NodeScenarioErrorOutput]
|
||||
]:
|
||||
with kube_helper.setup_kubernetes(None) as cli:
|
||||
ibmcloud = IbmCloud()
|
||||
core_v1 = client.CoreV1Api(cli)
|
||||
node_list = kube_helper.get_node_list(
|
||||
cfg, kube_helper.Actions.TERMINATE, core_v1
|
||||
)
|
||||
node_name_id_list = ibmcloud.list_instances()
|
||||
nodes_terminated = {}
|
||||
for name in node_list:
|
||||
try:
|
||||
for _ in range(cfg.runs):
|
||||
logging.info(
|
||||
"Starting node_termination_scenario injection by first stopping the node"
|
||||
)
|
||||
instance_id = ibmcloud.find_id_in_list(name, node_name_id_list)
|
||||
logging.info("Deleting the node with instance ID: %s " % (name))
|
||||
if instance_id:
|
||||
ibmcloud.delete_instance(instance_id)
|
||||
ibmcloud.wait_until_released(name, cfg.timeout)
|
||||
nodes_terminated[int(time.time_ns())] = Node(name=name)
|
||||
logging.info(
|
||||
"Node with instance ID: %s has been released" % name
|
||||
)
|
||||
logging.info(
|
||||
"node_terminate_scenario has been successfully injected!"
|
||||
)
|
||||
else:
|
||||
logging.error(
|
||||
"Failed to find instances that matched the node specifications on ibm cloud in the set region"
|
||||
)
|
||||
return "error", NodeScenarioErrorOutput(
|
||||
"No matching vpc with node name " + name,
|
||||
kube_helper.Actions.TERMINATE,
|
||||
)
|
||||
except Exception as e:
|
||||
logging.error("Failed to terminate node instance. Test Failed")
|
||||
logging.error("node_terminate_scenario injection failed!")
|
||||
return "error", NodeScenarioErrorOutput(
|
||||
format_exc(), kube_helper.Actions.TERMINATE
|
||||
)
|
||||
|
||||
return "success", NodeScenarioSuccessOutput(
|
||||
nodes_terminated, kube_helper.Actions.TERMINATE
|
||||
)
|
||||
@@ -1,179 +0,0 @@
|
||||
from kubernetes import config, client
|
||||
from kubernetes.client.rest import ApiException
|
||||
import logging
|
||||
import random
|
||||
from enum import Enum
|
||||
|
||||
|
||||
class Actions(Enum):
|
||||
"""
|
||||
This enumeration indicates different kinds of node operations
|
||||
"""
|
||||
|
||||
START = "Start"
|
||||
STOP = "Stop"
|
||||
TERMINATE = "Terminate"
|
||||
REBOOT = "Reboot"
|
||||
|
||||
|
||||
def setup_kubernetes(kubeconfig_path):
|
||||
"""
|
||||
Sets up the Kubernetes client
|
||||
"""
|
||||
|
||||
if kubeconfig_path is None:
|
||||
kubeconfig_path = config.KUBE_CONFIG_DEFAULT_LOCATION
|
||||
kubeconfig = config.kube_config.KubeConfigMerger(kubeconfig_path)
|
||||
|
||||
if kubeconfig.config is None:
|
||||
raise Exception(
|
||||
"Invalid kube-config file: %s. " "No configuration found." % kubeconfig_path
|
||||
)
|
||||
loader = config.kube_config.KubeConfigLoader(
|
||||
config_dict=kubeconfig.config,
|
||||
)
|
||||
client_config = client.Configuration()
|
||||
loader.load_and_set(client_config)
|
||||
return client.ApiClient(configuration=client_config)
|
||||
|
||||
|
||||
def list_killable_nodes(core_v1, label_selector=None):
|
||||
"""
|
||||
Returns a list of nodes that can be stopped/reset/released
|
||||
"""
|
||||
|
||||
nodes = []
|
||||
try:
|
||||
if label_selector:
|
||||
ret = core_v1.list_node(pretty=True, label_selector=label_selector)
|
||||
else:
|
||||
ret = core_v1.list_node(pretty=True)
|
||||
except ApiException as e:
|
||||
logging.error("Exception when calling CoreV1Api->list_node: %s\n" % e)
|
||||
raise e
|
||||
for node in ret.items:
|
||||
for cond in node.status.conditions:
|
||||
if str(cond.type) == "Ready" and str(cond.status) == "True":
|
||||
nodes.append(node.metadata.name)
|
||||
return nodes
|
||||
|
||||
|
||||
def list_startable_nodes(core_v1, label_selector=None):
|
||||
"""
|
||||
Returns a list of nodes that can be started
|
||||
"""
|
||||
|
||||
nodes = []
|
||||
try:
|
||||
if label_selector:
|
||||
ret = core_v1.list_node(pretty=True, label_selector=label_selector)
|
||||
else:
|
||||
ret = core_v1.list_node(pretty=True)
|
||||
except ApiException as e:
|
||||
logging.error("Exception when calling CoreV1Api->list_node: %s\n" % e)
|
||||
raise e
|
||||
for node in ret.items:
|
||||
for cond in node.status.conditions:
|
||||
if str(cond.type) == "Ready" and str(cond.status) != "True":
|
||||
nodes.append(node.metadata.name)
|
||||
return nodes
|
||||
|
||||
|
||||
def get_node_list(cfg, action, core_v1):
|
||||
"""
|
||||
Returns a list of nodes to be used in the node scenarios. The list returned is constructed as follows:
|
||||
- If the key 'name' is present in the node scenario config, the value is extracted and split into
|
||||
a list
|
||||
- Each node in the list is fed to the get_node function which checks if the node is killable or
|
||||
fetches the node using the label selector
|
||||
"""
|
||||
|
||||
def get_node(node_name, label_selector, instance_kill_count, action, core_v1):
|
||||
list_nodes_func = (
|
||||
list_startable_nodes if action == Actions.START else list_killable_nodes
|
||||
)
|
||||
if node_name in list_nodes_func(core_v1):
|
||||
return [node_name]
|
||||
elif node_name:
|
||||
logging.info(
|
||||
"Node with provided node_name does not exist or the node might "
|
||||
"be in NotReady state."
|
||||
)
|
||||
nodes = list_nodes_func(core_v1, label_selector)
|
||||
if not nodes:
|
||||
raise Exception("Ready nodes with the provided label selector do not exist")
|
||||
logging.info(
|
||||
"Ready nodes with the label selector %s: %s" % (label_selector, nodes)
|
||||
)
|
||||
number_of_nodes = len(nodes)
|
||||
if instance_kill_count == number_of_nodes:
|
||||
return nodes
|
||||
nodes_to_return = []
|
||||
for i in range(instance_kill_count):
|
||||
node_to_add = nodes[random.randint(0, len(nodes) - 1)]
|
||||
nodes_to_return.append(node_to_add)
|
||||
nodes.remove(node_to_add)
|
||||
return nodes_to_return
|
||||
|
||||
if cfg.name:
|
||||
input_nodes = cfg.name.split(",")
|
||||
else:
|
||||
input_nodes = [""]
|
||||
scenario_nodes = set()
|
||||
|
||||
if cfg.skip_openshift_checks:
|
||||
scenario_nodes = input_nodes
|
||||
else:
|
||||
for node in input_nodes:
|
||||
nodes = get_node(
|
||||
node, cfg.label_selector, cfg.instance_count, action, core_v1
|
||||
)
|
||||
scenario_nodes.update(nodes)
|
||||
|
||||
return list(scenario_nodes)
|
||||
|
||||
|
||||
def watch_node_status(node, status, timeout, watch_resource, core_v1):
|
||||
"""
|
||||
Monitor the status of a node for change
|
||||
"""
|
||||
count = timeout
|
||||
for event in watch_resource.stream(
|
||||
core_v1.list_node,
|
||||
field_selector=f"metadata.name={node}",
|
||||
timeout_seconds=timeout,
|
||||
):
|
||||
conditions = [
|
||||
status
|
||||
for status in event["object"].status.conditions
|
||||
if status.type == "Ready"
|
||||
]
|
||||
if conditions[0].status == status:
|
||||
watch_resource.stop()
|
||||
break
|
||||
else:
|
||||
count -= 1
|
||||
logging.info("Status of node " + node + ": " + str(conditions[0].status))
|
||||
if not count:
|
||||
watch_resource.stop()
|
||||
|
||||
|
||||
def wait_for_ready_status(node, timeout, watch_resource, core_v1):
|
||||
"""
|
||||
Wait until the node status becomes Ready
|
||||
"""
|
||||
watch_node_status(node, "True", timeout, watch_resource, core_v1)
|
||||
|
||||
|
||||
def wait_for_not_ready_status(node, timeout, watch_resource, core_v1):
|
||||
"""
|
||||
Wait until the node status becomes Not Ready
|
||||
"""
|
||||
watch_node_status(node, "False", timeout, watch_resource, core_v1)
|
||||
|
||||
|
||||
def wait_for_unknown_status(node, timeout, watch_resource, core_v1):
|
||||
"""
|
||||
Wait until the node status becomes Unknown
|
||||
"""
|
||||
watch_node_status(node, "Unknown", timeout, watch_resource, core_v1)
|
||||
@@ -12,7 +12,6 @@ from krkn.scenario_plugins.native.pod_network_outage.pod_network_outage_plugin i
|
||||
from krkn.scenario_plugins.native.pod_network_outage.pod_network_outage_plugin import (
|
||||
pod_egress_shaping,
|
||||
)
|
||||
import krkn.scenario_plugins.native.node_scenarios.ibmcloud_plugin as ibmcloud_plugin
|
||||
from krkn.scenario_plugins.native.pod_network_outage.pod_network_outage_plugin import (
|
||||
pod_ingress_shaping,
|
||||
)
|
||||
@@ -157,10 +156,6 @@ PLUGINS = Plugins(
|
||||
),
|
||||
PluginStep(wait_for_pods, ["error"]),
|
||||
PluginStep(run_python_file, ["error"]),
|
||||
PluginStep(ibmcloud_plugin.node_start, ["error"]),
|
||||
PluginStep(ibmcloud_plugin.node_stop, ["error"]),
|
||||
PluginStep(ibmcloud_plugin.node_reboot, ["error"]),
|
||||
PluginStep(ibmcloud_plugin.node_terminate, ["error"]),
|
||||
PluginStep(network_chaos, ["error"]),
|
||||
PluginStep(pod_outage, ["error"]),
|
||||
PluginStep(pod_egress_shaping, ["error"]),
|
||||
|
||||
0
krkn/scenario_plugins/network_chaos_ng/__init__.py
Normal file
0
krkn/scenario_plugins/network_chaos_ng/__init__.py
Normal file
41
krkn/scenario_plugins/network_chaos_ng/models.py
Normal file
41
krkn/scenario_plugins/network_chaos_ng/models.py
Normal file
@@ -0,0 +1,41 @@
|
||||
from dataclasses import dataclass
|
||||
from enum import Enum
|
||||
|
||||
|
||||
class NetworkChaosScenarioType(Enum):
|
||||
Node = 1
|
||||
Pod = 2
|
||||
|
||||
@dataclass
|
||||
class BaseNetworkChaosConfig:
|
||||
supported_execution = ["serial", "parallel"]
|
||||
id: str
|
||||
wait_duration: int
|
||||
test_duration: int
|
||||
label_selector: str
|
||||
instance_count: int
|
||||
execution: str
|
||||
namespace: str
|
||||
|
||||
def validate(self) -> list[str]:
|
||||
errors = []
|
||||
if self.execution is None:
|
||||
errors.append(f"execution cannot be None, supported values are: {','.join(self.supported_execution)}")
|
||||
if self.execution not in self.supported_execution:
|
||||
errors.append(f"{self.execution} is not in supported execution mod: {','.join(self.supported_execution)}")
|
||||
if self.label_selector is None:
|
||||
errors.append("label_selector cannot be None")
|
||||
return errors
|
||||
|
||||
@dataclass
|
||||
class NetworkFilterConfig(BaseNetworkChaosConfig):
|
||||
ingress: bool
|
||||
egress: bool
|
||||
interfaces: list[str]
|
||||
target: str
|
||||
ports: list[int]
|
||||
|
||||
def validate(self) -> list[str]:
|
||||
errors = super().validate()
|
||||
# here further validations
|
||||
return errors
|
||||
@@ -0,0 +1,58 @@
|
||||
import abc
|
||||
import logging
|
||||
import queue
|
||||
|
||||
from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift
|
||||
from krkn.scenario_plugins.network_chaos_ng.models import BaseNetworkChaosConfig, NetworkChaosScenarioType
|
||||
|
||||
|
||||
class AbstractNetworkChaosModule(abc.ABC):
|
||||
"""
|
||||
The abstract class that needs to be implemented by each Network Chaos Scenario
|
||||
"""
|
||||
@abc.abstractmethod
|
||||
def run(self, target: str, kubecli: KrknTelemetryOpenshift, error_queue: queue.Queue = None):
|
||||
"""
|
||||
the entrypoint method for the Network Chaos Scenario
|
||||
:param target: The resource name that will be targeted by the scenario (Node Name, Pod Name etc.)
|
||||
:param kubecli: The `KrknTelemetryOpenshift` needed by the scenario to access to the krkn-lib methods
|
||||
:param error_queue: A queue that will be used by the plugin to push the errors raised during the execution of parallel modules
|
||||
"""
|
||||
pass
|
||||
|
||||
@abc.abstractmethod
|
||||
def get_config(self) -> (NetworkChaosScenarioType, BaseNetworkChaosConfig):
|
||||
"""
|
||||
returns the common subset of settings shared by all the scenarios `BaseNetworkChaosConfig` and the type of Network
|
||||
Chaos Scenario that is running (Pod Scenario or Node Scenario)
|
||||
"""
|
||||
pass
|
||||
|
||||
|
||||
def log_info(self, message: str, parallel: bool = False, node_name: str = ""):
|
||||
"""
|
||||
log helper method for INFO severity to be used in the scenarios
|
||||
"""
|
||||
if parallel:
|
||||
logging.info(f"[{node_name}]: {message}")
|
||||
else:
|
||||
logging.info(message)
|
||||
|
||||
def log_warning(self, message: str, parallel: bool = False, node_name: str = ""):
|
||||
"""
|
||||
log helper method for WARNING severity to be used in the scenarios
|
||||
"""
|
||||
if parallel:
|
||||
logging.warning(f"[{node_name}]: {message}")
|
||||
else:
|
||||
logging.warning(message)
|
||||
|
||||
|
||||
def log_error(self, message: str, parallel: bool = False, node_name: str = ""):
|
||||
"""
|
||||
log helper method for ERROR severity to be used in the scenarios
|
||||
"""
|
||||
if parallel:
|
||||
logging.error(f"[{node_name}]: {message}")
|
||||
else:
|
||||
logging.error(message)
|
||||
@@ -0,0 +1,136 @@
|
||||
import os
|
||||
import queue
|
||||
import time
|
||||
|
||||
import yaml
|
||||
from jinja2 import Environment, FileSystemLoader
|
||||
|
||||
|
||||
from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift
|
||||
from krkn_lib.utils import get_random_string
|
||||
from krkn.scenario_plugins.network_chaos_ng.models import (
|
||||
BaseNetworkChaosConfig,
|
||||
NetworkFilterConfig,
|
||||
NetworkChaosScenarioType,
|
||||
)
|
||||
from krkn.scenario_plugins.network_chaos_ng.modules.abstract_network_chaos_module import (
|
||||
AbstractNetworkChaosModule,
|
||||
)
|
||||
|
||||
|
||||
class NodeNetworkFilterModule(AbstractNetworkChaosModule):
|
||||
config: NetworkFilterConfig
|
||||
|
||||
def run(
|
||||
self,
|
||||
target: str,
|
||||
kubecli: KrknTelemetryOpenshift,
|
||||
error_queue: queue.Queue = None,
|
||||
):
|
||||
parallel = False
|
||||
if error_queue:
|
||||
parallel = True
|
||||
try:
|
||||
file_loader = FileSystemLoader(os.path.abspath(os.path.dirname(__file__)))
|
||||
env = Environment(loader=file_loader, autoescape=True)
|
||||
pod_name = f"node-filter-{get_random_string(5)}"
|
||||
pod_template = env.get_template("templates/network-chaos.j2")
|
||||
pod_body = yaml.safe_load(
|
||||
pod_template.render(
|
||||
pod_name=pod_name,
|
||||
namespace=self.config.namespace,
|
||||
host_network=True,
|
||||
target=target,
|
||||
)
|
||||
)
|
||||
self.log_info(
|
||||
f"creating pod to filter "
|
||||
f"ports {','.join([str(port) for port in self.config.ports])}, "
|
||||
f"ingress:{str(self.config.ingress)}, "
|
||||
f"egress:{str(self.config.egress)}",
|
||||
parallel,
|
||||
target,
|
||||
)
|
||||
kubecli.get_lib_kubernetes().create_pod(
|
||||
pod_body, self.config.namespace, 300
|
||||
)
|
||||
|
||||
if len(self.config.interfaces) == 0:
|
||||
interfaces = [
|
||||
self.get_default_interface(pod_name, self.config.namespace, kubecli)
|
||||
]
|
||||
self.log_info(f"detected default interface {interfaces[0]}")
|
||||
else:
|
||||
interfaces = self.config.interfaces
|
||||
|
||||
input_rules, output_rules = self.generate_rules(interfaces)
|
||||
|
||||
for rule in input_rules:
|
||||
self.log_info(f"applying iptables INPUT rule: {rule}", parallel, target)
|
||||
kubecli.get_lib_kubernetes().exec_cmd_in_pod(
|
||||
[rule], pod_name, self.config.namespace
|
||||
)
|
||||
for rule in output_rules:
|
||||
self.log_info(
|
||||
f"applying iptables OUTPUT rule: {rule}", parallel, target
|
||||
)
|
||||
kubecli.get_lib_kubernetes().exec_cmd_in_pod(
|
||||
[rule], pod_name, self.config.namespace
|
||||
)
|
||||
self.log_info(
|
||||
f"waiting {self.config.test_duration} seconds before removing the iptables rules"
|
||||
)
|
||||
time.sleep(self.config.test_duration)
|
||||
self.log_info("removing iptables rules")
|
||||
for _ in input_rules:
|
||||
# always deleting the first rule since has been inserted from the top
|
||||
kubecli.get_lib_kubernetes().exec_cmd_in_pod(
|
||||
[f"iptables -D INPUT 1"], pod_name, self.config.namespace
|
||||
)
|
||||
for _ in output_rules:
|
||||
# always deleting the first rule since has been inserted from the top
|
||||
kubecli.get_lib_kubernetes().exec_cmd_in_pod(
|
||||
[f"iptables -D OUTPUT 1"], pod_name, self.config.namespace
|
||||
)
|
||||
self.log_info(
|
||||
f"deleting network chaos pod {pod_name} from {self.config.namespace}"
|
||||
)
|
||||
|
||||
kubecli.get_lib_kubernetes().delete_pod(pod_name, self.config.namespace)
|
||||
|
||||
except Exception as e:
|
||||
if error_queue is None:
|
||||
raise e
|
||||
else:
|
||||
error_queue.put(str(e))
|
||||
|
||||
def __init__(self, config: NetworkFilterConfig):
|
||||
self.config = config
|
||||
|
||||
def get_config(self) -> (NetworkChaosScenarioType, BaseNetworkChaosConfig):
|
||||
return NetworkChaosScenarioType.Node, self.config
|
||||
|
||||
def get_default_interface(
|
||||
self, pod_name: str, namespace: str, kubecli: KrknTelemetryOpenshift
|
||||
) -> str:
|
||||
cmd = "ip r | grep default | awk '/default/ {print $5}'"
|
||||
output = kubecli.get_lib_kubernetes().exec_cmd_in_pod(
|
||||
[cmd], pod_name, namespace
|
||||
)
|
||||
return output.replace("\n", "")
|
||||
|
||||
def generate_rules(self, interfaces: list[str]) -> (list[str], list[str]):
|
||||
input_rules = []
|
||||
output_rules = []
|
||||
for interface in interfaces:
|
||||
for port in self.config.ports:
|
||||
if self.config.egress:
|
||||
output_rules.append(
|
||||
f"iptables -I OUTPUT 1 -p tcp --dport {port} -m state --state NEW,RELATED,ESTABLISHED -j DROP"
|
||||
)
|
||||
|
||||
if self.config.ingress:
|
||||
input_rules.append(
|
||||
f"iptables -I INPUT 1 -i {interface} -p tcp --dport {port} -m state --state NEW,RELATED,ESTABLISHED -j DROP"
|
||||
)
|
||||
return input_rules, output_rules
|
||||
@@ -0,0 +1,17 @@
|
||||
apiVersion: v1
|
||||
kind: Pod
|
||||
metadata:
|
||||
name: {{pod_name}}
|
||||
namespace: {{namespace}}
|
||||
spec:
|
||||
{% if host_network %}
|
||||
hostNetwork: true
|
||||
{%endif%}
|
||||
nodeSelector:
|
||||
kubernetes.io/hostname: {{target}}
|
||||
containers:
|
||||
- name: fedora
|
||||
imagePullPolicy: Always
|
||||
image: quay.io/krkn-chaos/krkn-network-chaos:latest
|
||||
securityContext:
|
||||
privileged: true
|
||||
@@ -0,0 +1,24 @@
|
||||
from krkn.scenario_plugins.network_chaos_ng.models import NetworkFilterConfig
|
||||
from krkn.scenario_plugins.network_chaos_ng.modules.abstract_network_chaos_module import AbstractNetworkChaosModule
|
||||
from krkn.scenario_plugins.network_chaos_ng.modules.node_network_filter import NodeNetworkFilterModule
|
||||
|
||||
|
||||
supported_modules = ["node_network_filter"]
|
||||
|
||||
class NetworkChaosFactory:
|
||||
|
||||
@staticmethod
|
||||
def get_instance(config: dict[str, str]) -> AbstractNetworkChaosModule:
|
||||
if config["id"] is None:
|
||||
raise Exception("network chaos id cannot be None")
|
||||
if config["id"] not in supported_modules:
|
||||
raise Exception(f"{config['id']} is not a supported network chaos module")
|
||||
|
||||
if config["id"] == "node_network_filter":
|
||||
config = NetworkFilterConfig(**config)
|
||||
errors = config.validate()
|
||||
if len(errors) > 0:
|
||||
raise Exception(f"config validation errors: [{';'.join(errors)}]")
|
||||
return NodeNetworkFilterModule(config)
|
||||
|
||||
|
||||
@@ -0,0 +1,116 @@
|
||||
import logging
|
||||
import queue
|
||||
import random
|
||||
import threading
|
||||
import time
|
||||
|
||||
import yaml
|
||||
from krkn_lib.models.telemetry import ScenarioTelemetry
|
||||
from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift
|
||||
|
||||
from krkn.scenario_plugins.abstract_scenario_plugin import AbstractScenarioPlugin
|
||||
from krkn.scenario_plugins.network_chaos_ng.models import (
|
||||
NetworkChaosScenarioType,
|
||||
BaseNetworkChaosConfig,
|
||||
)
|
||||
from krkn.scenario_plugins.network_chaos_ng.modules.abstract_network_chaos_module import (
|
||||
AbstractNetworkChaosModule,
|
||||
)
|
||||
from krkn.scenario_plugins.network_chaos_ng.network_chaos_factory import (
|
||||
NetworkChaosFactory,
|
||||
)
|
||||
|
||||
|
||||
class NetworkChaosNgScenarioPlugin(AbstractScenarioPlugin):
|
||||
def run(
|
||||
self,
|
||||
run_uuid: str,
|
||||
scenario: str,
|
||||
krkn_config: dict[str, any],
|
||||
lib_telemetry: KrknTelemetryOpenshift,
|
||||
scenario_telemetry: ScenarioTelemetry,
|
||||
) -> int:
|
||||
try:
|
||||
with open(scenario, "r") as file:
|
||||
scenario_config = yaml.safe_load(file)
|
||||
if not isinstance(scenario_config, list):
|
||||
logging.error(
|
||||
"network chaos scenario config must be a list of objects"
|
||||
)
|
||||
return 1
|
||||
for config in scenario_config:
|
||||
network_chaos = NetworkChaosFactory.get_instance(config)
|
||||
network_chaos_config = network_chaos.get_config()
|
||||
logging.info(
|
||||
f"running network_chaos scenario: {network_chaos_config[1].id}"
|
||||
)
|
||||
if network_chaos_config[0] == NetworkChaosScenarioType.Node:
|
||||
targets = lib_telemetry.get_lib_kubernetes().list_nodes(
|
||||
network_chaos_config[1].label_selector
|
||||
)
|
||||
else:
|
||||
targets = lib_telemetry.get_lib_kubernetes().list_pods(
|
||||
network_chaos_config[1].namespace,
|
||||
network_chaos_config[1].label_selector,
|
||||
)
|
||||
if len(targets) == 0:
|
||||
logging.warning(
|
||||
f"no targets found for {network_chaos_config[1].id} "
|
||||
f"network chaos scenario with selector {network_chaos_config[1].label_selector} "
|
||||
f"with target type {network_chaos_config[0]}"
|
||||
)
|
||||
|
||||
if network_chaos_config[1].instance_count != 0 and network_chaos_config[1].instance_count > len(targets):
|
||||
targets = random.sample(targets, network_chaos_config[1].instance_count)
|
||||
|
||||
if network_chaos_config[1].execution == "parallel":
|
||||
self.run_parallel(targets, network_chaos, lib_telemetry)
|
||||
else:
|
||||
self.run_serial(targets, network_chaos, lib_telemetry)
|
||||
if len(config) > 1:
|
||||
logging.info(f"waiting {network_chaos_config[1].wait_duration} seconds before running the next "
|
||||
f"Network Chaos NG Module")
|
||||
time.sleep(network_chaos_config[1].wait_duration)
|
||||
except Exception as e:
|
||||
logging.error(str(e))
|
||||
return 1
|
||||
return 0
|
||||
|
||||
def run_parallel(
|
||||
self,
|
||||
targets: list[str],
|
||||
module: AbstractNetworkChaosModule,
|
||||
lib_telemetry: KrknTelemetryOpenshift,
|
||||
):
|
||||
error_queue = queue.Queue()
|
||||
threads = []
|
||||
errors = []
|
||||
for target in targets:
|
||||
thread = threading.Thread(
|
||||
target=module.run, args=[target, lib_telemetry, error_queue]
|
||||
)
|
||||
thread.start()
|
||||
threads.append(thread)
|
||||
for thread in threads:
|
||||
thread.join()
|
||||
while True:
|
||||
try:
|
||||
errors.append(error_queue.get_nowait())
|
||||
except queue.Empty:
|
||||
break
|
||||
if len(errors) > 0:
|
||||
raise Exception(
|
||||
f"module {module.get_config()[1].id} execution failed: [{';'.join(errors)}]"
|
||||
)
|
||||
|
||||
def run_serial(
|
||||
self,
|
||||
targets: list[str],
|
||||
module: AbstractNetworkChaosModule,
|
||||
lib_telemetry: KrknTelemetryOpenshift,
|
||||
):
|
||||
for target in targets:
|
||||
module.run(target, lib_telemetry)
|
||||
|
||||
def get_scenario_types(self) -> list[str]:
|
||||
return ["network_chaos_ng_scenarios"]
|
||||
@@ -224,7 +224,6 @@ class gcp_node_scenarios(abstract_node_scenarios):
|
||||
def __init__(self, kubecli: KrknKubernetes, affected_nodes_status: AffectedNodeStatus):
|
||||
super().__init__(kubecli, affected_nodes_status)
|
||||
self.gcp = GCP()
|
||||
print("selfkeys" + str(vars(self)))
|
||||
|
||||
# Node scenario to start the node
|
||||
def node_start_scenario(self, instance_kill_count, node, timeout):
|
||||
|
||||
367
krkn/scenario_plugins/node_actions/ibmcloud_node_scenarios.py
Normal file
367
krkn/scenario_plugins/node_actions/ibmcloud_node_scenarios.py
Normal file
@@ -0,0 +1,367 @@
|
||||
#!/usr/bin/env python
|
||||
import time
|
||||
import typing
|
||||
from os import environ
|
||||
from dataclasses import dataclass, field
|
||||
from traceback import format_exc
|
||||
import logging
|
||||
|
||||
from krkn_lib.k8s import KrknKubernetes
|
||||
import krkn.scenario_plugins.node_actions.common_node_functions as nodeaction
|
||||
from krkn.scenario_plugins.node_actions.abstract_node_scenarios import (
|
||||
abstract_node_scenarios,
|
||||
)
|
||||
from kubernetes import client, watch
|
||||
from ibm_vpc import VpcV1
|
||||
from ibm_cloud_sdk_core.authenticators import IAMAuthenticator
|
||||
import sys
|
||||
|
||||
from krkn_lib.models.k8s import AffectedNodeStatus, AffectedNode
|
||||
|
||||
|
||||
class IbmCloud:
|
||||
def __init__(self):
|
||||
"""
|
||||
Initialize the ibm cloud client by using the the env variables:
|
||||
'IBMC_APIKEY' 'IBMC_URL'
|
||||
"""
|
||||
apiKey = environ.get("IBMC_APIKEY")
|
||||
service_url = environ.get("IBMC_URL")
|
||||
if not apiKey:
|
||||
raise Exception("Environmental variable 'IBMC_APIKEY' is not set")
|
||||
if not service_url:
|
||||
raise Exception("Environmental variable 'IBMC_URL' is not set")
|
||||
try:
|
||||
authenticator = IAMAuthenticator(apiKey)
|
||||
self.service = VpcV1(authenticator=authenticator)
|
||||
|
||||
self.service.set_service_url(service_url)
|
||||
except Exception as e:
|
||||
logging.error("error authenticating" + str(e))
|
||||
|
||||
|
||||
# Get the instance ID of the node
|
||||
def get_instance_id(self, node_name):
|
||||
node_list = self.list_instances()
|
||||
for node in node_list:
|
||||
if node_name == node["vpc_name"]:
|
||||
return node["vpc_id"]
|
||||
logging.error("Couldn't find node with name " + str(node_name) + ", you could try another region")
|
||||
sys.exit(1)
|
||||
|
||||
def delete_instance(self, instance_id):
|
||||
"""
|
||||
Deletes the Instance whose name is given by 'instance_id'
|
||||
"""
|
||||
try:
|
||||
self.service.delete_instance(instance_id)
|
||||
logging.info("Deleted Instance -- '{}'".format(instance_id))
|
||||
except Exception as e:
|
||||
logging.info("Instance '{}' could not be deleted. ".format(instance_id))
|
||||
return False
|
||||
|
||||
def reboot_instances(self, instance_id):
|
||||
"""
|
||||
Reboots the Instance whose name is given by 'instance_id'. Returns True if successful, or
|
||||
returns False if the Instance is not powered on
|
||||
"""
|
||||
|
||||
try:
|
||||
self.service.create_instance_action(
|
||||
instance_id,
|
||||
type="reboot",
|
||||
)
|
||||
logging.info("Reset Instance -- '{}'".format(instance_id))
|
||||
return True
|
||||
except Exception as e:
|
||||
logging.info("Instance '{}' could not be rebooted".format(instance_id))
|
||||
return False
|
||||
|
||||
def stop_instances(self, instance_id):
|
||||
"""
|
||||
Stops the Instance whose name is given by 'instance_id'. Returns True if successful, or
|
||||
returns False if the Instance is already stopped
|
||||
"""
|
||||
|
||||
try:
|
||||
self.service.create_instance_action(
|
||||
instance_id,
|
||||
type="stop",
|
||||
)
|
||||
logging.info("Stopped Instance -- '{}'".format(instance_id))
|
||||
return True
|
||||
except Exception as e:
|
||||
logging.info("Instance '{}' could not be stopped".format(instance_id))
|
||||
logging.info("error" + str(e))
|
||||
return False
|
||||
|
||||
def start_instances(self, instance_id):
|
||||
"""
|
||||
Stops the Instance whose name is given by 'instance_id'. Returns True if successful, or
|
||||
returns False if the Instance is already running
|
||||
"""
|
||||
|
||||
try:
|
||||
self.service.create_instance_action(
|
||||
instance_id,
|
||||
type="start",
|
||||
)
|
||||
logging.info("Started Instance -- '{}'".format(instance_id))
|
||||
return True
|
||||
except Exception as e:
|
||||
logging.info("Instance '{}' could not start running".format(instance_id))
|
||||
return False
|
||||
|
||||
def list_instances(self):
|
||||
"""
|
||||
Returns a list of Instances present in the datacenter
|
||||
"""
|
||||
instance_names = []
|
||||
try:
|
||||
instances_result = self.service.list_instances().get_result()
|
||||
instances_list = instances_result["instances"]
|
||||
for vpc in instances_list:
|
||||
instance_names.append({"vpc_name": vpc["name"], "vpc_id": vpc["id"]})
|
||||
starting_count = instances_result["total_count"]
|
||||
while instances_result["total_count"] == instances_result["limit"]:
|
||||
instances_result = self.service.list_instances(
|
||||
start=starting_count
|
||||
).get_result()
|
||||
instances_list = instances_result["instances"]
|
||||
starting_count += instances_result["total_count"]
|
||||
for vpc in instances_list:
|
||||
instance_names.append({"vpc_name": vpc.name, "vpc_id": vpc.id})
|
||||
except Exception as e:
|
||||
logging.error("Error listing out instances: " + str(e))
|
||||
sys.exit(1)
|
||||
return instance_names
|
||||
|
||||
def find_id_in_list(self, name, vpc_list):
|
||||
for vpc in vpc_list:
|
||||
if vpc["vpc_name"] == name:
|
||||
return vpc["vpc_id"]
|
||||
|
||||
def get_instance_status(self, instance_id):
|
||||
"""
|
||||
Returns the status of the Instance whose name is given by 'instance_id'
|
||||
"""
|
||||
|
||||
try:
|
||||
instance = self.service.get_instance(instance_id).get_result()
|
||||
state = instance["status"]
|
||||
return state
|
||||
except Exception as e:
|
||||
logging.error(
|
||||
"Failed to get node instance status %s. Encountered following "
|
||||
"exception: %s." % (instance_id, e)
|
||||
)
|
||||
return None
|
||||
|
||||
def wait_until_deleted(self, instance_id, timeout, affected_node=None):
|
||||
"""
|
||||
Waits until the instance is deleted or until the timeout. Returns True if
|
||||
the instance is successfully deleted, else returns False
|
||||
"""
|
||||
start_time = time.time()
|
||||
time_counter = 0
|
||||
vpc = self.get_instance_status(instance_id)
|
||||
while vpc is not None:
|
||||
vpc = self.get_instance_status(instance_id)
|
||||
logging.info(
|
||||
"Instance %s is still being deleted, sleeping for 5 seconds"
|
||||
% instance_id
|
||||
)
|
||||
time.sleep(5)
|
||||
time_counter += 5
|
||||
if time_counter >= timeout:
|
||||
logging.info(
|
||||
"Instance %s is still not deleted in allotted time" % instance_id
|
||||
)
|
||||
return False
|
||||
end_time = time.time()
|
||||
if affected_node:
|
||||
affected_node.set_affected_node_status("terminated", end_time - start_time)
|
||||
return True
|
||||
|
||||
def wait_until_running(self, instance_id, timeout, affected_node=None):
|
||||
"""
|
||||
Waits until the Instance switches to running state or until the timeout.
|
||||
Returns True if the Instance switches to running, else returns False
|
||||
"""
|
||||
start_time = time.time()
|
||||
time_counter = 0
|
||||
status = self.get_instance_status(instance_id)
|
||||
while status != "running":
|
||||
status = self.get_instance_status(instance_id)
|
||||
logging.info(
|
||||
"Instance %s is still not running, sleeping for 5 seconds" % instance_id
|
||||
)
|
||||
time.sleep(5)
|
||||
time_counter += 5
|
||||
if time_counter >= timeout:
|
||||
logging.info(
|
||||
"Instance %s is still not ready in allotted time" % instance_id
|
||||
)
|
||||
return False
|
||||
end_time = time.time()
|
||||
if affected_node:
|
||||
affected_node.set_affected_node_status("running", end_time - start_time)
|
||||
return True
|
||||
|
||||
def wait_until_stopped(self, instance_id, timeout, affected_node):
|
||||
"""
|
||||
Waits until the Instance switches to stopped state or until the timeout.
|
||||
Returns True if the Instance switches to stopped, else returns False
|
||||
"""
|
||||
start_time = time.time()
|
||||
time_counter = 0
|
||||
status = self.get_instance_status(instance_id)
|
||||
while status != "stopped":
|
||||
status = self.get_instance_status(instance_id)
|
||||
logging.info(
|
||||
"Instance %s is still not stopped, sleeping for 5 seconds" % instance_id
|
||||
)
|
||||
time.sleep(5)
|
||||
time_counter += 5
|
||||
if time_counter >= timeout:
|
||||
logging.info(
|
||||
"Instance %s is still not stopped in allotted time" % instance_id
|
||||
)
|
||||
return False
|
||||
end_time = time.time()
|
||||
if affected_node:
|
||||
affected_node.set_affected_node_status("stopped", end_time - start_time)
|
||||
return True
|
||||
|
||||
|
||||
def wait_until_rebooted(self, instance_id, timeout, affected_node):
|
||||
"""
|
||||
Waits until the Instance switches to restarting state and then running state or until the timeout.
|
||||
Returns True if the Instance switches back to running, else returns False
|
||||
"""
|
||||
|
||||
time_counter = 0
|
||||
status = self.get_instance_status(instance_id)
|
||||
while status == "starting":
|
||||
status = self.get_instance_status(instance_id)
|
||||
logging.info(
|
||||
"Instance %s is still restarting, sleeping for 5 seconds" % instance_id
|
||||
)
|
||||
time.sleep(5)
|
||||
time_counter += 5
|
||||
if time_counter >= timeout:
|
||||
logging.info(
|
||||
"Instance %s is still restarting after allotted time" % instance_id
|
||||
)
|
||||
return False
|
||||
self.wait_until_running(instance_id, timeout, affected_node)
|
||||
return True
|
||||
|
||||
|
||||
@dataclass
|
||||
class ibm_node_scenarios(abstract_node_scenarios):
|
||||
def __init__(self, kubecli: KrknKubernetes, affected_nodes_status: AffectedNodeStatus):
|
||||
super().__init__(kubecli, affected_nodes_status)
|
||||
self.ibmcloud = IbmCloud()
|
||||
|
||||
def node_start_scenario(self, instance_kill_count, node, timeout):
|
||||
try:
|
||||
instance_id = self.ibmcloud.get_instance_id( node)
|
||||
affected_node = AffectedNode(node, node_id=instance_id)
|
||||
for _ in range(instance_kill_count):
|
||||
logging.info("Starting node_start_scenario injection")
|
||||
logging.info("Starting the node %s " % (node))
|
||||
|
||||
if instance_id:
|
||||
vm_started = self.ibmcloud.start_instances(instance_id)
|
||||
if vm_started:
|
||||
self.ibmcloud.wait_until_running(instance_id, timeout, affected_node)
|
||||
nodeaction.wait_for_ready_status(
|
||||
node, timeout, self.kubecli, affected_node
|
||||
)
|
||||
logging.info(
|
||||
"Node with instance ID: %s is in running state" % node
|
||||
)
|
||||
logging.info(
|
||||
"node_start_scenario has been successfully injected!"
|
||||
)
|
||||
else:
|
||||
logging.error(
|
||||
"Failed to find node that matched instances on ibm cloud in region"
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logging.error("Failed to start node instance. Test Failed")
|
||||
logging.error("node_start_scenario injection failed!")
|
||||
self.affected_nodes_status.affected_nodes.append(affected_node)
|
||||
|
||||
|
||||
def node_stop_scenario(self, instance_kill_count, node, timeout):
|
||||
try:
|
||||
instance_id = self.ibmcloud.get_instance_id(node)
|
||||
for _ in range(instance_kill_count):
|
||||
affected_node = AffectedNode(node, instance_id)
|
||||
logging.info("Starting node_stop_scenario injection")
|
||||
logging.info("Stopping the node %s " % (node))
|
||||
vm_stopped = self.ibmcloud.stop_instances(instance_id)
|
||||
if vm_stopped:
|
||||
self.ibmcloud.wait_until_stopped(instance_id, timeout, affected_node)
|
||||
logging.info(
|
||||
"Node with instance ID: %s is in stopped state" % node
|
||||
)
|
||||
logging.info(
|
||||
"node_stop_scenario has been successfully injected!"
|
||||
)
|
||||
except Exception as e:
|
||||
logging.error("Failed to stop node instance. Test Failed")
|
||||
logging.error("node_stop_scenario injection failed!")
|
||||
|
||||
|
||||
def node_reboot_scenario(self, instance_kill_count, node, timeout):
|
||||
try:
|
||||
instance_id = self.ibmcloud.get_instance_id(node)
|
||||
for _ in range(instance_kill_count):
|
||||
affected_node = AffectedNode(node, node_id=instance_id)
|
||||
logging.info("Starting node_reboot_scenario injection")
|
||||
logging.info("Rebooting the node %s " % (node))
|
||||
self.ibmcloud.reboot_instances(instance_id)
|
||||
self.ibmcloud.wait_until_rebooted(instance_id, timeout)
|
||||
nodeaction.wait_for_unknown_status(
|
||||
node, timeout, affected_node
|
||||
)
|
||||
nodeaction.wait_for_ready_status(
|
||||
node, timeout, affected_node
|
||||
)
|
||||
logging.info(
|
||||
"Node with instance ID: %s has rebooted successfully" % node
|
||||
)
|
||||
logging.info(
|
||||
"node_reboot_scenario has been successfully injected!"
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logging.error("Failed to reboot node instance. Test Failed")
|
||||
logging.error("node_reboot_scenario injection failed!")
|
||||
|
||||
|
||||
def node_terminate_scenario(self, instance_kill_count, node, timeout):
|
||||
try:
|
||||
instance_id = self.ibmcloud.get_instance_id(node)
|
||||
for _ in range(instance_kill_count):
|
||||
affected_node = AffectedNode(node, node_id=instance_id)
|
||||
logging.info(
|
||||
"Starting node_termination_scenario injection by first stopping the node"
|
||||
)
|
||||
logging.info("Deleting the node with instance ID: %s " % (node))
|
||||
self.ibmcloud.delete_instance(instance_id)
|
||||
self.ibmcloud.wait_until_deleted(node, timeout, affected_node)
|
||||
logging.info(
|
||||
"Node with instance ID: %s has been released" % node
|
||||
)
|
||||
logging.info(
|
||||
"node_terminate_scenario has been successfully injected!"
|
||||
)
|
||||
except Exception as e:
|
||||
logging.error("Failed to terminate node instance. Test Failed")
|
||||
logging.error("node_terminate_scenario injection failed!")
|
||||
|
||||
@@ -23,7 +23,7 @@ from krkn.scenario_plugins.node_actions.general_cloud_node_scenarios import (
|
||||
general_node_scenarios,
|
||||
)
|
||||
from krkn.scenario_plugins.node_actions.vmware_node_scenarios import vmware_node_scenarios
|
||||
|
||||
from krkn.scenario_plugins.node_actions.ibmcloud_node_scenarios import ibm_node_scenarios
|
||||
node_general = False
|
||||
|
||||
|
||||
@@ -113,6 +113,11 @@ class NodeActionsScenarioPlugin(AbstractScenarioPlugin):
|
||||
or node_scenario["cloud_type"].lower() == "vmware"
|
||||
):
|
||||
return vmware_node_scenarios(kubecli, affected_nodes_status)
|
||||
elif (
|
||||
node_scenario["cloud_type"].lower() == "ibm"
|
||||
or node_scenario["cloud_type"].lower() == "ibmcloud"
|
||||
):
|
||||
return ibm_node_scenarios(kubecli, affected_nodes_status)
|
||||
else:
|
||||
logging.error(
|
||||
"Cloud type "
|
||||
|
||||
@@ -14,8 +14,7 @@ class OPENSTACKCLOUD:
|
||||
self.Wait = 30
|
||||
|
||||
# Get the instance ID of the node
|
||||
def get_instance_id(self, node):
|
||||
openstack_node_ip = nodeaction.get_node_ip(node)
|
||||
def get_instance_id(self, openstack_node_ip):
|
||||
openstack_node_name = self.get_openstack_nodename(openstack_node_ip)
|
||||
return openstack_node_name
|
||||
|
||||
@@ -128,7 +127,8 @@ class openstack_node_scenarios(abstract_node_scenarios):
|
||||
try:
|
||||
logging.info("Starting node_start_scenario injection")
|
||||
logging.info("Starting the node %s" % (node))
|
||||
openstack_node_name = self.openstackcloud.get_instance_id(node)
|
||||
openstack_node_ip = self.kubecli.get_node_ip(node)
|
||||
openstack_node_name = self.openstackcloud.get_instance_id(openstack_node_ip)
|
||||
self.openstackcloud.start_instances(openstack_node_name)
|
||||
self.openstackcloud.wait_until_running(openstack_node_name, timeout, affected_node)
|
||||
nodeaction.wait_for_ready_status(node, timeout, self.kubecli, affected_node)
|
||||
@@ -151,7 +151,8 @@ class openstack_node_scenarios(abstract_node_scenarios):
|
||||
try:
|
||||
logging.info("Starting node_stop_scenario injection")
|
||||
logging.info("Stopping the node %s " % (node))
|
||||
openstack_node_name = self.openstackcloud.get_instance_id(node)
|
||||
openstack_node_ip = self.kubecli.get_node_ip(node)
|
||||
openstack_node_name = self.openstackcloud.get_instance_id(openstack_node_ip)
|
||||
self.openstackcloud.stop_instances(openstack_node_name)
|
||||
self.openstackcloud.wait_until_stopped(openstack_node_name, timeout, affected_node)
|
||||
logging.info("Node with instance name: %s is in stopped state" % (node))
|
||||
@@ -173,7 +174,8 @@ class openstack_node_scenarios(abstract_node_scenarios):
|
||||
try:
|
||||
logging.info("Starting node_reboot_scenario injection")
|
||||
logging.info("Rebooting the node %s" % (node))
|
||||
openstack_node_name = self.openstackcloud.get_instance_id(node)
|
||||
openstack_node_ip = self.kubecli.get_node_ip(node)
|
||||
openstack_node_name = self.openstackcloud.get_instance_id(openstack_node_ip)
|
||||
self.openstackcloud.reboot_instances(openstack_node_name)
|
||||
nodeaction.wait_for_unknown_status(node, timeout, self.kubecli, affected_node)
|
||||
nodeaction.wait_for_ready_status(node, timeout, self.kubecli, affected_node)
|
||||
|
||||
@@ -13,7 +13,7 @@ from krkn.scenario_plugins.node_actions.aws_node_scenarios import AWS
|
||||
from krkn.scenario_plugins.node_actions.az_node_scenarios import Azure
|
||||
from krkn.scenario_plugins.node_actions.gcp_node_scenarios import GCP
|
||||
from krkn.scenario_plugins.node_actions.openstack_node_scenarios import OPENSTACKCLOUD
|
||||
from krkn.scenario_plugins.native.node_scenarios.ibmcloud_plugin import IbmCloud
|
||||
from krkn.scenario_plugins.node_actions.ibmcloud_node_scenarios import IbmCloud
|
||||
|
||||
import krkn.scenario_plugins.node_actions.common_node_functions as nodeaction
|
||||
|
||||
@@ -117,7 +117,7 @@ class ShutDownScenarioPlugin(AbstractScenarioPlugin):
|
||||
while len(stopping_nodes) > 0:
|
||||
for node in stopping_nodes:
|
||||
affected_node = affected_nodes_status.get_affected_node_index(node)
|
||||
|
||||
|
||||
if type(node) is tuple:
|
||||
node_status = cloud_object.wait_until_stopped(
|
||||
node[1], node[0], timeout, affected_node
|
||||
@@ -149,7 +149,7 @@ class ShutDownScenarioPlugin(AbstractScenarioPlugin):
|
||||
for node in not_running_nodes:
|
||||
affected_node = affected_nodes_status.get_affected_node_index(node)
|
||||
# need to add in time that is passing while waiting for other nodes to be running
|
||||
|
||||
|
||||
if type(node) is tuple:
|
||||
node_status = cloud_object.wait_until_running(
|
||||
node[1], node[0], timeout, affected_node
|
||||
|
||||
@@ -2,15 +2,21 @@ import logging
|
||||
import time
|
||||
|
||||
import yaml
|
||||
|
||||
from multiprocessing.pool import ThreadPool
|
||||
from itertools import repeat
|
||||
|
||||
from krkn_lib.k8s import KrknKubernetes
|
||||
from krkn_lib.models.k8s import AffectedNodeStatus
|
||||
from krkn_lib.models.telemetry import ScenarioTelemetry
|
||||
from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift
|
||||
from krkn_lib.utils import log_exception
|
||||
|
||||
from krkn import utils
|
||||
from krkn_lib.utils import get_yaml_item_value
|
||||
from krkn.scenario_plugins.abstract_scenario_plugin import AbstractScenarioPlugin
|
||||
from krkn.scenario_plugins.native.network import cerberus
|
||||
from krkn.scenario_plugins.node_actions.aws_node_scenarios import AWS
|
||||
|
||||
from krkn.scenario_plugins.node_actions.aws_node_scenarios import AWS
|
||||
from krkn.scenario_plugins.node_actions.gcp_node_scenarios import gcp_node_scenarios
|
||||
|
||||
class ZoneOutageScenarioPlugin(AbstractScenarioPlugin):
|
||||
def run(
|
||||
@@ -25,92 +31,138 @@ class ZoneOutageScenarioPlugin(AbstractScenarioPlugin):
|
||||
with open(scenario, "r") as f:
|
||||
zone_outage_config_yaml = yaml.full_load(f)
|
||||
scenario_config = zone_outage_config_yaml["zone_outage"]
|
||||
vpc_id = scenario_config["vpc_id"]
|
||||
subnet_ids = scenario_config["subnet_id"]
|
||||
duration = scenario_config["duration"]
|
||||
cloud_type = scenario_config["cloud_type"]
|
||||
# Add support for user-provided default network ACL
|
||||
default_acl_id = scenario_config.get("default_acl_id")
|
||||
ids = {}
|
||||
acl_ids_created = []
|
||||
|
||||
if cloud_type.lower() == "aws":
|
||||
cloud_object = AWS()
|
||||
else:
|
||||
logging.error(
|
||||
"ZoneOutageScenarioPlugin Cloud type %s is not currently supported for "
|
||||
"zone outage scenarios" % cloud_type
|
||||
)
|
||||
return 1
|
||||
|
||||
start_time = int(time.time())
|
||||
|
||||
for subnet_id in subnet_ids:
|
||||
logging.info("Targeting subnet_id")
|
||||
network_association_ids = []
|
||||
associations, original_acl_id = cloud_object.describe_network_acls(
|
||||
vpc_id, subnet_id
|
||||
)
|
||||
for entry in associations:
|
||||
if entry["SubnetId"] == subnet_id:
|
||||
network_association_ids.append(
|
||||
entry["NetworkAclAssociationId"]
|
||||
)
|
||||
logging.info(
|
||||
"Network association ids associated with "
|
||||
"the subnet %s: %s" % (subnet_id, network_association_ids)
|
||||
)
|
||||
|
||||
# Use provided default ACL if available, otherwise create a new one
|
||||
if default_acl_id:
|
||||
acl_id = default_acl_id
|
||||
logging.info(
|
||||
"Using provided default ACL ID %s - this ACL will not be deleted after the scenario",
|
||||
default_acl_id
|
||||
)
|
||||
# Don't add to acl_ids_created since we don't want to delete user-provided ACLs at cleanup
|
||||
if cloud_type.lower() == "aws":
|
||||
self.cloud_object = AWS()
|
||||
self.network_based_zone(scenario_config)
|
||||
else:
|
||||
kubecli = lib_telemetry.get_lib_kubernetes()
|
||||
if cloud_type.lower() == "gcp":
|
||||
affected_nodes_status = AffectedNodeStatus()
|
||||
self.cloud_object = gcp_node_scenarios(kubecli, affected_nodes_status)
|
||||
self.node_based_zone(scenario_config, kubecli)
|
||||
affected_nodes_status = self.cloud_object.affected_nodes_status
|
||||
scenario_telemetry.affected_nodes.extend(affected_nodes_status.affected_nodes)
|
||||
else:
|
||||
acl_id = cloud_object.create_default_network_acl(vpc_id)
|
||||
logging.info("Created new default ACL %s", acl_id)
|
||||
acl_ids_created.append(acl_id)
|
||||
|
||||
new_association_id = cloud_object.replace_network_acl_association(
|
||||
network_association_ids[0], acl_id
|
||||
)
|
||||
|
||||
# capture the orginal_acl_id, created_acl_id and
|
||||
# new association_id to use during the recovery
|
||||
ids[new_association_id] = original_acl_id
|
||||
|
||||
# wait for the specified duration
|
||||
logging.info(
|
||||
"Waiting for the specified duration " "in the config: %s" % duration
|
||||
)
|
||||
time.sleep(duration)
|
||||
|
||||
# replace the applied acl with the previous acl in use
|
||||
for new_association_id, original_acl_id in ids.items():
|
||||
cloud_object.replace_network_acl_association(
|
||||
new_association_id, original_acl_id
|
||||
)
|
||||
logging.info(
|
||||
"Wating for 60 seconds to make sure " "the changes are in place"
|
||||
)
|
||||
time.sleep(60)
|
||||
|
||||
# delete the network acl created for the run
|
||||
for acl_id in acl_ids_created:
|
||||
cloud_object.delete_network_acl(acl_id)
|
||||
logging.error(
|
||||
"ZoneOutageScenarioPlugin Cloud type %s is not currently supported for "
|
||||
"zone outage scenarios" % cloud_type
|
||||
)
|
||||
return 1
|
||||
|
||||
end_time = int(time.time())
|
||||
cerberus.publish_kraken_status(krkn_config, [], start_time, end_time)
|
||||
except (RuntimeError, Exception):
|
||||
except (RuntimeError, Exception) as e:
|
||||
logging.error(
|
||||
f"ZoneOutageScenarioPlugin scenario {scenario} failed with exception: {e}"
|
||||
)
|
||||
return 1
|
||||
else:
|
||||
return 0
|
||||
|
||||
def node_based_zone(self, scenario_config: dict[str, any], kubecli: KrknKubernetes ):
|
||||
zone = scenario_config["zone"]
|
||||
duration = get_yaml_item_value(scenario_config, "duration", 60)
|
||||
timeout = get_yaml_item_value(scenario_config, "timeout", 180)
|
||||
label_selector = f"topology.kubernetes.io/zone={zone}"
|
||||
try:
|
||||
# get list of nodes in zone/region
|
||||
nodes = kubecli.list_killable_nodes(label_selector)
|
||||
# stop nodes in parallel
|
||||
pool = ThreadPool(processes=len(nodes))
|
||||
|
||||
pool.starmap(
|
||||
self.cloud_object.node_stop_scenario,zip(repeat(1), nodes, repeat(timeout))
|
||||
)
|
||||
|
||||
pool.close()
|
||||
|
||||
logging.info(
|
||||
"Waiting for the specified duration " "in the config: %s" % duration
|
||||
)
|
||||
time.sleep(duration)
|
||||
|
||||
# start nodes in parallel
|
||||
pool = ThreadPool(processes=len(nodes))
|
||||
pool.starmap(
|
||||
self.cloud_object.node_start_scenario,zip(repeat(1), nodes, repeat(timeout))
|
||||
)
|
||||
pool.close()
|
||||
except Exception as e:
|
||||
logging.info(
|
||||
f"Node based zone outage scenario failed with exception: {e}"
|
||||
)
|
||||
return 1
|
||||
else:
|
||||
return 0
|
||||
|
||||
def network_based_zone(self, scenario_config: dict[str, any]):
|
||||
|
||||
vpc_id = scenario_config["vpc_id"]
|
||||
subnet_ids = scenario_config["subnet_id"]
|
||||
duration = scenario_config["duration"]
|
||||
# Add support for user-provided default network ACL
|
||||
default_acl_id = scenario_config.get("default_acl_id")
|
||||
ids = {}
|
||||
acl_ids_created = []
|
||||
for subnet_id in subnet_ids:
|
||||
logging.info("Targeting subnet_id")
|
||||
network_association_ids = []
|
||||
associations, original_acl_id = self.cloud_object.describe_network_acls(
|
||||
vpc_id, subnet_id
|
||||
)
|
||||
for entry in associations:
|
||||
if entry["SubnetId"] == subnet_id:
|
||||
network_association_ids.append(
|
||||
entry["NetworkAclAssociationId"]
|
||||
)
|
||||
logging.info(
|
||||
"Network association ids associated with "
|
||||
"the subnet %s: %s" % (subnet_id, network_association_ids)
|
||||
)
|
||||
|
||||
# Use provided default ACL if available, otherwise create a new one
|
||||
if default_acl_id:
|
||||
acl_id = default_acl_id
|
||||
logging.info(
|
||||
"Using provided default ACL ID %s - this ACL will not be deleted after the scenario",
|
||||
default_acl_id
|
||||
)
|
||||
# Don't add to acl_ids_created since we don't want to delete user-provided ACLs at cleanup
|
||||
else:
|
||||
acl_id = self.cloud_object.create_default_network_acl(vpc_id)
|
||||
logging.info("Created new default ACL %s", acl_id)
|
||||
acl_ids_created.append(acl_id)
|
||||
|
||||
new_association_id = self.cloud_object.replace_network_acl_association(
|
||||
network_association_ids[0], acl_id
|
||||
)
|
||||
|
||||
# capture the orginal_acl_id, created_acl_id and
|
||||
# new association_id to use during the recovery
|
||||
ids[new_association_id] = original_acl_id
|
||||
|
||||
# wait for the specified duration
|
||||
logging.info(
|
||||
"Waiting for the specified duration " "in the config: %s" % duration
|
||||
)
|
||||
time.sleep(duration)
|
||||
|
||||
# replace the applied acl with the previous acl in use
|
||||
for new_association_id, original_acl_id in ids.items():
|
||||
self.cloud_object.replace_network_acl_association(
|
||||
new_association_id, original_acl_id
|
||||
)
|
||||
logging.info(
|
||||
"Wating for 60 seconds to make sure " "the changes are in place"
|
||||
)
|
||||
time.sleep(60)
|
||||
|
||||
# delete the network acl created for the run
|
||||
for acl_id in acl_ids_created:
|
||||
self.cloud_object.delete_network_acl(acl_id)
|
||||
|
||||
|
||||
def get_scenario_types(self) -> list[str]:
|
||||
return ["zone_outages_scenarios"]
|
||||
|
||||
85
krkn/utils/HealthChecker.py
Normal file
85
krkn/utils/HealthChecker.py
Normal file
@@ -0,0 +1,85 @@
|
||||
import requests
|
||||
import time
|
||||
import logging
|
||||
import queue
|
||||
from datetime import datetime
|
||||
from krkn_lib.models.telemetry.models import HealthCheck
|
||||
|
||||
class HealthChecker:
|
||||
current_iterations: int = 0
|
||||
ret_value = 0
|
||||
def __init__(self, iterations):
|
||||
self.iterations = iterations
|
||||
|
||||
def make_request(self, url, auth=None, headers=None, verify=True):
|
||||
response_data = {}
|
||||
response = requests.get(url, auth=auth, headers=headers, verify=verify)
|
||||
response_data["url"] = url
|
||||
response_data["status"] = response.status_code == 200
|
||||
response_data["status_code"] = response.status_code
|
||||
return response_data
|
||||
|
||||
|
||||
def run_health_check(self, health_check_config, health_check_telemetry_queue: queue.Queue):
|
||||
if health_check_config and health_check_config["config"] and any(config.get("url") for config in health_check_config["config"]):
|
||||
health_check_start_time_stamp = datetime.now()
|
||||
health_check_telemetry = []
|
||||
health_check_tracker = {}
|
||||
interval = health_check_config["interval"] if health_check_config["interval"] else 2
|
||||
|
||||
response_tracker = {config["url"]:True for config in health_check_config["config"]}
|
||||
while self.current_iterations < self.iterations:
|
||||
for config in health_check_config.get("config"):
|
||||
auth, headers = None, None
|
||||
verify_url = config["verify_url"] if "verify_url" in config else True
|
||||
if config["url"]: url = config["url"]
|
||||
|
||||
if config["bearer_token"]:
|
||||
bearer_token = "Bearer " + config["bearer_token"]
|
||||
headers = {"Authorization": bearer_token}
|
||||
|
||||
if config["auth"]: auth = tuple(config["auth"].split(','))
|
||||
response = self.make_request(url, auth, headers, verify_url)
|
||||
|
||||
if response["status_code"] != 200:
|
||||
if config["url"] not in health_check_tracker:
|
||||
start_timestamp = datetime.now()
|
||||
health_check_tracker[config["url"]] = {
|
||||
"status_code": response["status_code"],
|
||||
"start_timestamp": start_timestamp
|
||||
}
|
||||
if response_tracker[config["url"]] != False: response_tracker[config["url"]] = False
|
||||
if config["exit_on_failure"] and config["exit_on_failure"] == True and self.ret_value==0: self.ret_value = 2
|
||||
else:
|
||||
if config["url"] in health_check_tracker:
|
||||
end_timestamp = datetime.now()
|
||||
start_timestamp = health_check_tracker[config["url"]]["start_timestamp"]
|
||||
previous_status_code = str(health_check_tracker[config["url"]]["status_code"])
|
||||
duration = (end_timestamp - start_timestamp).total_seconds()
|
||||
downtime_record = {
|
||||
"url": config["url"],
|
||||
"status": False,
|
||||
"status_code": previous_status_code,
|
||||
"start_timestamp": start_timestamp.isoformat(),
|
||||
"end_timestamp": end_timestamp.isoformat(),
|
||||
"duration": duration
|
||||
}
|
||||
health_check_telemetry.append(HealthCheck(downtime_record))
|
||||
del health_check_tracker[config["url"]]
|
||||
time.sleep(interval)
|
||||
health_check_end_time_stamp = datetime.now()
|
||||
for url, status in response_tracker.items():
|
||||
if status == True:
|
||||
duration = (health_check_end_time_stamp - health_check_start_time_stamp).total_seconds()
|
||||
success_response = {
|
||||
"url": url,
|
||||
"status": True,
|
||||
"status_code": 200,
|
||||
"start_timestamp": health_check_start_time_stamp.isoformat(),
|
||||
"end_timestamp": health_check_end_time_stamp.isoformat(),
|
||||
"duration": duration
|
||||
}
|
||||
health_check_telemetry.append(HealthCheck(success_response))
|
||||
health_check_telemetry_queue.put(health_check_telemetry)
|
||||
else:
|
||||
logging.info("health checks config is not defined, skipping them")
|
||||
@@ -3,10 +3,10 @@ from krkn_lib.k8s import KrknKubernetes
|
||||
from krkn_lib.models.telemetry import ScenarioTelemetry
|
||||
from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift
|
||||
from tzlocal.unix import get_localzone
|
||||
|
||||
import logging
|
||||
|
||||
def populate_cluster_events(
|
||||
scenario_telemetry: ScenarioTelemetry,
|
||||
krkn_config: dict,
|
||||
scenario_config: dict,
|
||||
kubecli: KrknKubernetes,
|
||||
start_timestamp: int,
|
||||
@@ -31,8 +31,12 @@ def populate_cluster_events(
|
||||
namespace=namespace,
|
||||
)
|
||||
)
|
||||
|
||||
scenario_telemetry.set_cluster_events(events)
|
||||
archive_path = krkn_config["telemetry"]["archive_path"]
|
||||
file_path = archive_path + "/events.json"
|
||||
with open(file_path, "w+") as f:
|
||||
f.write("\n".join(str(item) for item in events))
|
||||
logging.info(f'Find cluster events in file {file_path}' )
|
||||
|
||||
|
||||
|
||||
def collect_and_put_ocp_logs(
|
||||
|
||||
@@ -6,7 +6,7 @@ azure-identity==1.16.1
|
||||
azure-keyvault==4.2.0
|
||||
azure-mgmt-compute==30.5.0
|
||||
itsdangerous==2.0.1
|
||||
coverage==7.4.1
|
||||
coverage==7.6.12
|
||||
datetime==5.4
|
||||
docker==7.0.0
|
||||
gitpython==3.1.41
|
||||
@@ -14,8 +14,8 @@ google-auth==2.37.0
|
||||
google-cloud-compute==1.22.0
|
||||
ibm_cloud_sdk_core==3.18.0
|
||||
ibm_vpc==0.20.0
|
||||
jinja2==3.1.5
|
||||
krkn-lib==4.0.8
|
||||
jinja2==3.1.6
|
||||
krkn-lib==5.0.1
|
||||
lxml==5.1.0
|
||||
kubernetes==28.1.0
|
||||
numpy==1.26.4
|
||||
@@ -30,11 +30,12 @@ python-openstackclient==6.5.0
|
||||
requests==2.32.2
|
||||
service_identity==24.1.0
|
||||
PyYAML==6.0.1
|
||||
setuptools==70.0.0
|
||||
setuptools==78.1.1
|
||||
werkzeug==3.0.6
|
||||
wheel==0.42.0
|
||||
zope.interface==5.4.0
|
||||
|
||||
|
||||
git+https://github.com/krkn-chaos/arcaflow-plugin-kill-pod.git@v0.1.0
|
||||
git+https://github.com/vmware/vsphere-automation-sdk-python.git@v8.0.0.0
|
||||
cryptography>=42.0.4 # not directly required, pinned by Snyk to avoid a vulnerability
|
||||
|
||||
@@ -9,6 +9,8 @@ import optparse
|
||||
import pyfiglet
|
||||
import uuid
|
||||
import time
|
||||
import queue
|
||||
import threading
|
||||
|
||||
from krkn_lib.elastic.krkn_elastic import KrknElastic
|
||||
from krkn_lib.models.elastic import ElasticChaosRunTelemetry
|
||||
@@ -26,6 +28,7 @@ from krkn_lib.utils import SafeLogger
|
||||
from krkn_lib.utils.functions import get_yaml_item_value, get_junit_test_case
|
||||
|
||||
from krkn.utils import TeeLogHandler
|
||||
from krkn.utils.HealthChecker import HealthChecker
|
||||
from krkn.scenario_plugins.scenario_plugin_factory import (
|
||||
ScenarioPluginFactory,
|
||||
ScenarioPluginNotFound,
|
||||
@@ -49,9 +52,7 @@ def main(cfg) -> int:
|
||||
with open(cfg, "r") as f:
|
||||
config = yaml.full_load(f)
|
||||
global kubeconfig_path, wait_duration, kraken_config
|
||||
distribution = get_yaml_item_value(
|
||||
config["kraken"], "distribution", "openshift"
|
||||
)
|
||||
|
||||
kubeconfig_path = os.path.expanduser(
|
||||
get_yaml_item_value(config["kraken"], "kubeconfig_path", "")
|
||||
)
|
||||
@@ -90,13 +91,6 @@ def main(cfg) -> int:
|
||||
)
|
||||
# elastic search
|
||||
enable_elastic = get_yaml_item_value(config["elastic"], "enable_elastic", False)
|
||||
elastic_collect_metrics = get_yaml_item_value(
|
||||
config["elastic"], "collect_metrics", False
|
||||
)
|
||||
|
||||
elastic_colllect_alerts = get_yaml_item_value(
|
||||
config["elastic"], "collect_alerts", False
|
||||
)
|
||||
|
||||
elastic_url = get_yaml_item_value(config["elastic"], "elastic_url", "")
|
||||
|
||||
@@ -127,10 +121,11 @@ def main(cfg) -> int:
|
||||
config["performance_monitoring"], "check_critical_alerts", False
|
||||
)
|
||||
telemetry_api_url = config["telemetry"].get("api_url")
|
||||
health_check_config = config["health_checks"]
|
||||
|
||||
# Initialize clients
|
||||
if not os.path.isfile(kubeconfig_path) and not os.path.isfile(
|
||||
"/var/run/secrets/kubernetes.io/serviceaccount/token"
|
||||
"/var/run/secrets/kubernetes.io/serviceaccount/token"
|
||||
):
|
||||
logging.error(
|
||||
"Cannot read the kubeconfig file at %s, please check" % kubeconfig_path
|
||||
@@ -167,6 +162,11 @@ def main(cfg) -> int:
|
||||
except:
|
||||
kubecli.initialize_clients(None)
|
||||
|
||||
distribution = "kubernetes"
|
||||
if ocpcli.is_openshift():
|
||||
distribution = "openshift"
|
||||
logging.info("Detected distribution %s" % (distribution))
|
||||
|
||||
# find node kraken might be running on
|
||||
kubecli.find_kraken_node()
|
||||
|
||||
@@ -203,7 +203,7 @@ def main(cfg) -> int:
|
||||
else:
|
||||
# If can't make a connection, set alerts to false
|
||||
enable_alerts = False
|
||||
critical_alerts = False
|
||||
check_critical_alerts = False
|
||||
except Exception:
|
||||
logging.error(
|
||||
"invalid distribution selected, running openshift scenarios against kubernetes cluster."
|
||||
@@ -223,6 +223,7 @@ def main(cfg) -> int:
|
||||
safe_logger, ocpcli, telemetry_request_id, config["telemetry"]
|
||||
)
|
||||
if enable_elastic:
|
||||
logging.info(f"Elastic collection enabled at: {elastic_url}:{elastic_port}")
|
||||
elastic_search = KrknElastic(
|
||||
safe_logger,
|
||||
elastic_url,
|
||||
@@ -271,8 +272,8 @@ def main(cfg) -> int:
|
||||
classes_and_types: dict[str, list[str]] = {}
|
||||
for loaded in scenario_plugin_factory.loaded_plugins.keys():
|
||||
if (
|
||||
scenario_plugin_factory.loaded_plugins[loaded].__name__
|
||||
not in classes_and_types.keys()
|
||||
scenario_plugin_factory.loaded_plugins[loaded].__name__
|
||||
not in classes_and_types.keys()
|
||||
):
|
||||
classes_and_types[
|
||||
scenario_plugin_factory.loaded_plugins[loaded].__name__
|
||||
@@ -299,6 +300,12 @@ def main(cfg) -> int:
|
||||
module_name, class_name, error = failed
|
||||
logging.error(f"⛔ Class: {class_name} Module: {module_name}")
|
||||
logging.error(f"⚠️ {error}\n")
|
||||
health_check_telemetry_queue = queue.Queue()
|
||||
health_checker = HealthChecker(iterations)
|
||||
health_check_worker = threading.Thread(target=health_checker.run_health_check,
|
||||
args=(health_check_config, health_check_telemetry_queue))
|
||||
health_check_worker.start()
|
||||
|
||||
# Loop to run the chaos starts here
|
||||
while int(iteration) < iterations and run_signal != "STOP":
|
||||
# Inject chaos scenarios specified in the config
|
||||
@@ -359,12 +366,18 @@ def main(cfg) -> int:
|
||||
break
|
||||
|
||||
iteration += 1
|
||||
health_checker.current_iterations += 1
|
||||
|
||||
# telemetry
|
||||
# in order to print decoded telemetry data even if telemetry collection
|
||||
# is disabled, it's necessary to serialize the ChaosRunTelemetry object
|
||||
# to json, and recreate a new object from it.
|
||||
end_time = int(time.time())
|
||||
health_check_worker.join()
|
||||
try:
|
||||
chaos_telemetry.health_checks = health_check_telemetry_queue.get_nowait()
|
||||
except queue.Empty:
|
||||
chaos_telemetry.health_checks = None
|
||||
|
||||
# if platform is openshift will be collected
|
||||
# Cloud platform and network plugins metadata
|
||||
@@ -419,9 +432,9 @@ def main(cfg) -> int:
|
||||
)
|
||||
else:
|
||||
if (
|
||||
config["telemetry"]["prometheus_namespace"]
|
||||
and config["telemetry"]["prometheus_pod_name"]
|
||||
and config["telemetry"]["prometheus_container_name"]
|
||||
config["telemetry"]["prometheus_namespace"]
|
||||
and config["telemetry"]["prometheus_pod_name"]
|
||||
and config["telemetry"]["prometheus_container_name"]
|
||||
):
|
||||
try:
|
||||
prometheus_archive_files = (
|
||||
@@ -470,8 +483,7 @@ def main(cfg) -> int:
|
||||
start_time,
|
||||
end_time,
|
||||
alert_profile,
|
||||
elastic_colllect_alerts,
|
||||
elastic_alerts_index,
|
||||
elastic_alerts_index
|
||||
)
|
||||
|
||||
else:
|
||||
@@ -479,15 +491,15 @@ def main(cfg) -> int:
|
||||
return 1
|
||||
# sys.exit(1)
|
||||
if enable_metrics:
|
||||
logging.info(f'Capturing metrics using file {metrics_profile}')
|
||||
prometheus_plugin.metrics(
|
||||
prometheus,
|
||||
elastic_search,
|
||||
start_time,
|
||||
run_uuid,
|
||||
start_time,
|
||||
end_time,
|
||||
metrics_profile,
|
||||
elastic_collect_metrics,
|
||||
elastic_metrics_index,
|
||||
elastic_metrics_index
|
||||
)
|
||||
|
||||
if post_critical_alerts > 0:
|
||||
@@ -501,6 +513,9 @@ def main(cfg) -> int:
|
||||
)
|
||||
# sys.exit(2)
|
||||
return 2
|
||||
if health_checker.ret_value != 0:
|
||||
logging.error("Health check failed for the applications, Please check; exiting")
|
||||
return health_checker.ret_value
|
||||
|
||||
logging.info(
|
||||
"Successfully finished running Kraken. UUID for the run: "
|
||||
@@ -643,4 +658,4 @@ if __name__ == "__main__":
|
||||
with open(junit_testcase_file_path, "w") as stream:
|
||||
stream.write(junit_testcase_xml)
|
||||
|
||||
sys.exit(retval)
|
||||
sys.exit(retval)
|
||||
@@ -7,3 +7,4 @@ cpu-load-percentage: 90
|
||||
cpu-method: all
|
||||
node-selector: "node-role.kubernetes.io/worker="
|
||||
number-of-nodes: 2
|
||||
taints: [] #example ["node-role.kubernetes.io/master:NoSchedule"]
|
||||
|
||||
@@ -11,4 +11,5 @@ io-target-pod-volume:
|
||||
hostPath:
|
||||
path: /root # a path writable by kubelet in the root filesystem of the node
|
||||
node-selector: "node-role.kubernetes.io/worker="
|
||||
number-of-nodes: ''
|
||||
number-of-nodes: ''
|
||||
taints: [] #example ["node-role.kubernetes.io/master:NoSchedule"]
|
||||
@@ -6,3 +6,4 @@ namespace: default
|
||||
memory-vm-bytes: 90%
|
||||
node-selector: "node-role.kubernetes.io/worker="
|
||||
number-of-nodes: ''
|
||||
taints: [] #example ["node-role.kubernetes.io/master:NoSchedule"]
|
||||
|
||||
13
scenarios/kube/network-filter.yml
Normal file
13
scenarios/kube/network-filter.yml
Normal file
@@ -0,0 +1,13 @@
|
||||
- id: node_network_filter
|
||||
wait_duration: 300
|
||||
test_duration: 100
|
||||
label_selector: "kubernetes.io/hostname=ip-10-0-39-182.us-east-2.compute.internal"
|
||||
namespace: 'default'
|
||||
instance_count: 1
|
||||
execution: parallel
|
||||
ingress: false
|
||||
egress: true
|
||||
target: node
|
||||
interfaces: []
|
||||
ports:
|
||||
- 2049
|
||||
@@ -1,10 +1,16 @@
|
||||
# yaml-language-server: $schema=../plugin.schema.json
|
||||
- id: <ibmcloud-node-terminate/ibmcloud-node-reboot/ibmcloud-node-stop/ibmcloud-node-start>
|
||||
config:
|
||||
name: ""
|
||||
label_selector: "node-role.kubernetes.io/worker" # When node_name is not specified, a node with matching label_selector is selected for node chaos scenario injection
|
||||
runs: 1 # Number of times to inject each scenario under actions (will perform on same node each time)
|
||||
instance_count: 1 # Number of nodes to perform action/select that match the label selector
|
||||
timeout: 360 # Duration to wait for completion of node scenario injection
|
||||
duration: 120 # Duration to stop the node before running the start action
|
||||
skip_openshift_checks: False # Set to True if you don't want to wait for the status of the nodes to change on OpenShift before passing the scenario
|
||||
node_scenarios:
|
||||
- actions:
|
||||
- node_stop_start_scenario
|
||||
node_name:
|
||||
label_selector: node-role.kubernetes.io/worker
|
||||
instance_count: 1
|
||||
timeout: 360
|
||||
duration: 120
|
||||
cloud_type: ibm
|
||||
- actions:
|
||||
- node_reboot_scenario
|
||||
node_name:
|
||||
label_selector: node-role.kubernetes.io/worker
|
||||
instance_count: 1
|
||||
timeout: 120
|
||||
cloud_type: ibm
|
||||
@@ -1,49 +0,0 @@
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: ServiceAccount
|
||||
metadata:
|
||||
name: litmus-sa
|
||||
namespace: litmus
|
||||
labels:
|
||||
name: litmus-sa
|
||||
app.kubernetes.io/part-of: litmus
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: ClusterRole
|
||||
metadata:
|
||||
name: litmus-sa
|
||||
labels:
|
||||
name: litmus-sa
|
||||
app.kubernetes.io/part-of: litmus
|
||||
rules:
|
||||
- apiGroups: [""]
|
||||
resources: ["pods","events"]
|
||||
verbs: ["create","list","get","patch","update","delete","deletecollection"]
|
||||
- apiGroups: [""]
|
||||
resources: ["pods/exec","pods/log"]
|
||||
verbs: ["list","get","create"]
|
||||
- apiGroups: ["batch"]
|
||||
resources: ["jobs"]
|
||||
verbs: ["create","list","get","delete","deletecollection"]
|
||||
- apiGroups: ["litmuschaos.io"]
|
||||
resources: ["chaosengines","chaosexperiments","chaosresults"]
|
||||
verbs: ["create","list","get","patch","update"]
|
||||
- apiGroups: [""]
|
||||
resources: ["nodes"]
|
||||
verbs: ["get","list"]
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: ClusterRoleBinding
|
||||
metadata:
|
||||
name: litmus-sa
|
||||
labels:
|
||||
name: litmus-sa
|
||||
app.kubernetes.io/part-of: litmus
|
||||
roleRef:
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
kind: ClusterRole
|
||||
name: litmus-sa
|
||||
subjects:
|
||||
- kind: ServiceAccount
|
||||
name: litmus-sa
|
||||
namespace: litmus
|
||||
4
scenarios/openshift/zone_outage_gcp.yaml
Normal file
4
scenarios/openshift/zone_outage_gcp.yaml
Normal file
@@ -0,0 +1,4 @@
|
||||
zone_outage: # Scenario to create an outage of a zone by tweaking network ACL
|
||||
cloud_type: gcp # cloud type on which Kubernetes/OpenShift runs. aws is only platform supported currently for this scenario.
|
||||
duration: 600 # duration in seconds after which the zone will be back online
|
||||
zone: <zone> # (Optional) ID of an existing network ACL to use instead of creating a new one. If provided, this ACL will not be deleted after the scenario.
|
||||
Reference in New Issue
Block a user