mirror of
https://github.com/krkn-chaos/krkn.git
synced 2026-02-18 20:09:55 +00:00
Compare commits
10 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
39c0152b7b | ||
|
|
491dc17267 | ||
|
|
b2b5002f45 | ||
|
|
fccd701dee | ||
|
|
570631ebfc | ||
|
|
3ab9ca4319 | ||
|
|
4084ffd9c6 | ||
|
|
19cc2c047f | ||
|
|
6197fc6722 | ||
|
|
2a8ac41ebf |
21
.github/workflows/build.yml
vendored
21
.github/workflows/build.yml
vendored
@@ -1,8 +1,5 @@
|
||||
name: Build Krkn
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
pull_request:
|
||||
|
||||
jobs:
|
||||
@@ -51,20 +48,4 @@ jobs:
|
||||
if-no-files-found: error
|
||||
- name: Check CI results
|
||||
run: grep Fail CI/results.markdown && false || true
|
||||
- name: Build the Docker images
|
||||
run: docker build --no-cache -t quay.io/redhat-chaos/krkn containers/
|
||||
- name: Login in quay
|
||||
if: github.ref == 'refs/heads/main' && github.event_name == 'push'
|
||||
run: docker login quay.io -u ${QUAY_USER} -p ${QUAY_TOKEN}
|
||||
env:
|
||||
QUAY_USER: ${{ secrets.QUAY_USER_1 }}
|
||||
QUAY_TOKEN: ${{ secrets.QUAY_TOKEN_1 }}
|
||||
- name: Push the Docker images
|
||||
if: github.ref == 'refs/heads/main' && github.event_name == 'push'
|
||||
run: docker push quay.io/redhat-chaos/krkn
|
||||
- name: Rebuild krkn-hub
|
||||
if: github.ref == 'refs/heads/main' && github.event_name == 'push'
|
||||
uses: redhat-chaos/actions/krkn-hub@main
|
||||
with:
|
||||
QUAY_USER: ${{ secrets.QUAY_USER_1 }}
|
||||
QUAY_TOKEN: ${{ secrets.QUAY_TOKEN_1 }}
|
||||
|
||||
|
||||
30
.github/workflows/docker-image.yml
vendored
Normal file
30
.github/workflows/docker-image.yml
vendored
Normal file
@@ -0,0 +1,30 @@
|
||||
name: Docker Image CI
|
||||
on:
|
||||
push:
|
||||
branches:
|
||||
- main
|
||||
pull_request:
|
||||
|
||||
jobs:
|
||||
build:
|
||||
runs-on: ubuntu-latest
|
||||
steps:
|
||||
- name: Check out code
|
||||
uses: actions/checkout@v3
|
||||
- name: Build the Docker images
|
||||
run: docker build --no-cache -t quay.io/redhat-chaos/krkn containers/
|
||||
- name: Login in quay
|
||||
if: github.ref == 'refs/heads/main' && github.event_name == 'push'
|
||||
run: docker login quay.io -u ${QUAY_USER} -p ${QUAY_TOKEN}
|
||||
env:
|
||||
QUAY_USER: ${{ secrets.QUAY_USER_1 }}
|
||||
QUAY_TOKEN: ${{ secrets.QUAY_TOKEN_1 }}
|
||||
- name: Push the Docker images
|
||||
if: github.ref == 'refs/heads/main' && github.event_name == 'push'
|
||||
run: docker push quay.io/redhat-chaos/krkn
|
||||
- name: Rebuild krkn-hub
|
||||
if: github.ref == 'refs/heads/main' && github.event_name == 'push'
|
||||
uses: redhat-chaos/actions/krkn-hub@main
|
||||
with:
|
||||
QUAY_USER: ${{ secrets.QUAY_USER_1 }}
|
||||
QUAY_TOKEN: ${{ secrets.QUAY_TOKEN_1 }}
|
||||
@@ -29,3 +29,15 @@ tunings:
|
||||
wait_duration: 6 # Duration to wait between each chaos scenario.
|
||||
iterations: 1 # Number of times to execute the scenarios.
|
||||
daemon_mode: False # Iterations are set to infinity which means that the kraken will cause chaos forever.
|
||||
telemetry:
|
||||
enabled: False # enable/disables the telemetry collection feature
|
||||
api_url: https://ulnmf9xv7j.execute-api.us-west-2.amazonaws.com/production #telemetry service endpoint
|
||||
username: username # telemetry service username
|
||||
password: password # telemetry service password
|
||||
prometheus_backup: True # enables/disables prometheus data collection
|
||||
full_prometheus_backup: False # if is set to False only the /prometheus/wal folder will be downloaded.
|
||||
backup_threads: 5 # number of telemetry download/upload threads
|
||||
archive_path: /tmp # local path where the archive files will be temporarly stored
|
||||
max_retries: 0 # maximum number of upload retries (if 0 will retry forever)
|
||||
run_tag: '' # if set, this will be appended to the run folder in the bucket (useful to group the runs)
|
||||
archive_size: 10000 # the size of the prometheus data archive size in KB. The lower the size of archive is
|
||||
|
||||
@@ -51,7 +51,7 @@ spec:
|
||||
claimName: kraken-test-pvc
|
||||
containers:
|
||||
- name: kraken-test-container
|
||||
image: 'image-registry.openshift-image-registry.svc:5000/openshift/httpd:latest'
|
||||
image: 'quay.io/centos7/httpd-24-centos7:latest'
|
||||
volumeMounts:
|
||||
- mountPath: "/home/krake-dir/"
|
||||
name: kraken-test-pv
|
||||
|
||||
@@ -59,6 +59,7 @@ Instructions on how to setup the config and the options supported can be found a
|
||||
Scenario type | Kubernetes | OpenShift
|
||||
--------------------------- | ------------- |--------------------|
|
||||
[Pod Scenarios](docs/pod_scenarios.md) | :heavy_check_mark: | :heavy_check_mark: |
|
||||
[Pod Network Scenarios](docs/pod_network_scenarios.md) | :x: | :heavy_check_mark: |
|
||||
[Container Scenarios](docs/container_scenarios.md) | :heavy_check_mark: | :heavy_check_mark: |
|
||||
[Node Scenarios](docs/node_scenarios.md) | :heavy_check_mark: | :heavy_check_mark: |
|
||||
[Time Scenarios](docs/time_scenarios.md) | :x: | :heavy_check_mark: |
|
||||
|
||||
@@ -7,40 +7,8 @@ kraken:
|
||||
signal_address: 0.0.0.0 # Signal listening address
|
||||
port: 8081 # Signal port
|
||||
chaos_scenarios: # List of policies/chaos scenarios to load
|
||||
- arcaflow_scenarios:
|
||||
- scenarios/arcaflow/cpu-hog/input.yaml
|
||||
- scenarios/arcaflow/memory-hog/input.yaml
|
||||
- container_scenarios: # List of chaos pod scenarios to load
|
||||
- - scenarios/openshift/container_etcd.yml
|
||||
- plugin_scenarios:
|
||||
- scenarios/openshift/etcd.yml
|
||||
- scenarios/openshift/regex_openshift_pod_kill.yml
|
||||
- scenarios/openshift/vmware_node_scenarios.yml
|
||||
- scenarios/openshift/ibmcloud_node_scenarios.yml
|
||||
- scenarios/openshift/network_chaos_ingress.yml
|
||||
- scenarios/openshift/pod_network_outage.yml
|
||||
- node_scenarios: # List of chaos node scenarios to load
|
||||
- scenarios/openshift/node_scenarios_example.yml
|
||||
- plugin_scenarios:
|
||||
- scenarios/openshift/openshift-apiserver.yml
|
||||
- scenarios/openshift/openshift-kube-apiserver.yml
|
||||
- time_scenarios: # List of chaos time scenarios to load
|
||||
- scenarios/openshift/time_scenarios_example.yml
|
||||
- cluster_shut_down_scenarios:
|
||||
- - scenarios/openshift/cluster_shut_down_scenario.yml
|
||||
- scenarios/openshift/post_action_shut_down.py
|
||||
- namespace_scenarios:
|
||||
- - scenarios/openshift/regex_namespace.yaml
|
||||
- - scenarios/openshift/ingress_namespace.yaml
|
||||
- scenarios/openshift/post_action_namespace.py
|
||||
- zone_outages:
|
||||
- scenarios/openshift/zone_outage.yaml
|
||||
- application_outages:
|
||||
- scenarios/openshift/app_outage.yaml
|
||||
- pvc_scenarios:
|
||||
- scenarios/openshift/pvc_scenario.yaml
|
||||
- network_chaos:
|
||||
- scenarios/openshift/network_chaos.yaml
|
||||
- application_outages:
|
||||
- scenarios/openshift/app_outage.yaml
|
||||
|
||||
cerberus:
|
||||
cerberus_enabled: False # Enable it when cerberus is previously installed
|
||||
@@ -58,9 +26,26 @@ performance_monitoring:
|
||||
prometheus_bearer_token: # The bearer token is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. This is needed to authenticate with prometheus.
|
||||
uuid: # uuid for the run is generated by default if not set
|
||||
enable_alerts: False # Runs the queries specified in the alert profile and displays the info or exits 1 when severity=error
|
||||
alert_profile: config/alerts # Path to alert profile with the prometheus queries
|
||||
alert_profile: config/alerts # Path or URL to alert profile with the prometheus queries
|
||||
check_critical_alerts: False # When enabled will check prometheus for critical alerts firing post chaos
|
||||
tunings:
|
||||
wait_duration: 60 # Duration to wait between each chaos scenario
|
||||
iterations: 1 # Number of times to execute the scenarios
|
||||
daemon_mode: False # Iterations are set to infinity which means that the kraken will cause chaos forever
|
||||
telemetry:
|
||||
enabled: False # enable/disables the telemetry collection feature
|
||||
api_url: https://ulnmf9xv7j.execute-api.us-west-2.amazonaws.com/production #telemetry service endpoint
|
||||
username: username # telemetry service username
|
||||
password: password # telemetry service password
|
||||
prometheus_backup: True # enables/disables prometheus data collection
|
||||
full_prometheus_backup: False # if is set to False only the /prometheus/wal folder will be downloaded.
|
||||
backup_threads: 5 # number of telemetry download/upload threads
|
||||
archive_path: /tmp # local path where the archive files will be temporarly stored
|
||||
max_retries: 0 # maximum number of upload retries (if 0 will retry forever)
|
||||
run_tag: '' # if set, this will be appended to the run folder in the bucket (useful to group the runs)
|
||||
archive_size: 10000 # the size of the prometheus data archive size in KB. The lower the size of archive is
|
||||
# the higher the number of archive files will be produced and uploaded (and processed by backup_threads
|
||||
# simultaneously).
|
||||
# For unstable/slow connection is better to keep this value low
|
||||
# increasing the number of backup_threads, in this way, on upload failure, the retry will happen only on the
|
||||
# failed chunk without affecting the whole upload.
|
||||
@@ -14,9 +14,10 @@ COPY --from=azure-cli /usr/local/bin/az /usr/bin/az
|
||||
# Install dependencies
|
||||
RUN yum install -y git python39 python3-pip jq gettext wget && \
|
||||
python3.9 -m pip install -U pip && \
|
||||
git clone https://github.com/redhat-chaos/krkn.git --branch v1.3.4 /root/kraken && \
|
||||
git clone https://github.com/redhat-chaos/krkn.git --branch v1.3.6 /root/kraken && \
|
||||
mkdir -p /root/.kube && cd /root/kraken && \
|
||||
pip3.9 install -r requirements.txt && \
|
||||
pip3.9 install virtualenv && \
|
||||
wget https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64 -O /usr/bin/yq && chmod +x /usr/bin/yq
|
||||
|
||||
# Get Kubernetes and OpenShift clients from stable releases
|
||||
|
||||
@@ -14,9 +14,10 @@ COPY --from=azure-cli /usr/local/bin/az /usr/bin/az
|
||||
# Install dependencies
|
||||
RUN yum install -y git python39 python3-pip jq gettext wget && \
|
||||
python3.9 -m pip install -U pip && \
|
||||
git clone https://github.com/redhat-chaos/krkn.git --branch v1.3.4 /root/kraken && \
|
||||
git clone https://github.com/redhat-chaos/krkn.git --branch v1.3.6 /root/kraken && \
|
||||
mkdir -p /root/.kube && cd /root/kraken && \
|
||||
pip3.9 install -r requirements.txt && \
|
||||
pip3.9 install virtualenv && \
|
||||
wget https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64 -O /usr/bin/yq && chmod +x /usr/bin/yq
|
||||
|
||||
# Get Kubernetes and OpenShift clients from stable releases
|
||||
|
||||
@@ -23,7 +23,7 @@ performance_monitoring:
|
||||
```
|
||||
|
||||
#### Alert profile
|
||||
A couple of [alert profiles](https://github.com/redhat-chaos/krkn/tree/main/config) [alerts](https://github.com/redhat-chaos/krkn/blob/main/config/alerts) are shipped by default and can be tweaked to add more queries to alert on. The following are a few alerts examples:
|
||||
A couple of [alert profiles](https://github.com/redhat-chaos/krkn/tree/main/config) [alerts](https://github.com/redhat-chaos/krkn/blob/main/config/alerts) are shipped by default and can be tweaked to add more queries to alert on. User can provide a URL or path to the file in the [config](https://github.com/redhat-chaos/krkn/blob/main/config/config.yaml). The following are a few alerts examples:
|
||||
|
||||
```
|
||||
- expr: avg_over_time(histogram_quantile(0.99, rate(etcd_disk_wal_fsync_duration_seconds_bucket[2m]))[5m:]) > 0.01
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
## Pod network Scenarios
|
||||
|
||||
### Pod outage
|
||||
Scenario to block the traffic ( Ingress/Egress ) of a pod matching the labels for the specified duration of time to understand the behavior of the service/other services which depend on it during downtime. This helps with planning the requirements accordingly, be it improving the timeouts or tweaking the alerts etc.
|
||||
With the current network policies, it is not possible to explicitly block ports which are enabled by allowed network policy rule. This chaos scenario addresses this issue by using OVS flow rules to block ports related to the pod. It supports OpenShiftSDN and OVNKubernetes based networks.
|
||||
@@ -13,3 +15,23 @@ With the current network policies, it is not possible to explicitly block ports
|
||||
- 8443 # Blocks 8443, Default [], i.e. all ports.
|
||||
label_selector: 'component=ui' # Blocks access to openshift console
|
||||
```
|
||||
### Pod Network shaping
|
||||
Scenario to introduce network latency, packet loss, and bandwidth restriction in the Pod's network interface. The purpose of this scenario is to observe faults caused by random variations in the network.
|
||||
|
||||
##### Sample scenario config for egress traffic shaping (using plugin)
|
||||
```
|
||||
- id: pod_egress_shaping
|
||||
config:
|
||||
namespace: openshift-console # Required - Namespace of the pod to which filter need to be applied.
|
||||
label_selector: 'component=ui' # Applies traffic shaping to access openshift console.
|
||||
network_params:
|
||||
latency: 500ms # Add 500ms latency to egress traffic from the pod.
|
||||
```
|
||||
|
||||
##### Steps
|
||||
- Pick the pods to introduce the network anomaly either from label_selector or pod_name.
|
||||
- Identify the pod interface name on the node.
|
||||
- Set traffic shaping config on pod's interface using tc and netem.
|
||||
- Wait for the duration time.
|
||||
- Remove traffic shaping config on pod's interface.
|
||||
- Remove the job that spawned the pod.
|
||||
@@ -4,25 +4,32 @@ import time
|
||||
import kraken.cerberus.setup as cerberus
|
||||
from jinja2 import Template
|
||||
import kraken.invoke.command as runcommand
|
||||
|
||||
from krkn_lib_kubernetes import ScenarioTelemetry, KrknTelemetry
|
||||
|
||||
# Reads the scenario config, applies and deletes a network policy to
|
||||
# block the traffic for the specified duration
|
||||
def run(scenarios_list, config, wait_duration):
|
||||
def run(scenarios_list, config, wait_duration, telemetry: KrknTelemetry) -> (list[str], list[ScenarioTelemetry]):
|
||||
failed_post_scenarios = ""
|
||||
scenario_telemetries: list[ScenarioTelemetry] = []
|
||||
failed_scenarios = []
|
||||
for app_outage_config in scenarios_list:
|
||||
scenario_telemetry = ScenarioTelemetry()
|
||||
scenario_telemetry.scenario = app_outage_config
|
||||
scenario_telemetry.startTimeStamp = time.time()
|
||||
telemetry.set_parameters_base64(scenario_telemetry, app_outage_config)
|
||||
if len(app_outage_config) > 1:
|
||||
with open(app_outage_config, "r") as f:
|
||||
app_outage_config_yaml = yaml.full_load(f)
|
||||
scenario_config = app_outage_config_yaml["application_outage"]
|
||||
pod_selector = scenario_config.get("pod_selector", "{}")
|
||||
traffic_type = scenario_config.get("block", "[Ingress, Egress]")
|
||||
namespace = scenario_config.get("namespace", "")
|
||||
duration = scenario_config.get("duration", 60)
|
||||
try:
|
||||
with open(app_outage_config, "r") as f:
|
||||
app_outage_config_yaml = yaml.full_load(f)
|
||||
scenario_config = app_outage_config_yaml["application_outage"]
|
||||
pod_selector = scenario_config.get("pod_selector", "{}")
|
||||
traffic_type = scenario_config.get("block", "[Ingress, Egress]")
|
||||
namespace = scenario_config.get("namespace", "")
|
||||
duration = scenario_config.get("duration", 60)
|
||||
|
||||
start_time = int(time.time())
|
||||
start_time = int(time.time())
|
||||
|
||||
network_policy_template = """---
|
||||
network_policy_template = """---
|
||||
apiVersion: networking.k8s.io/v1
|
||||
kind: NetworkPolicy
|
||||
metadata:
|
||||
@@ -31,28 +38,38 @@ spec:
|
||||
podSelector:
|
||||
matchLabels: {{ pod_selector }}
|
||||
policyTypes: {{ traffic_type }}
|
||||
"""
|
||||
t = Template(network_policy_template)
|
||||
rendered_spec = t.render(pod_selector=pod_selector, traffic_type=traffic_type)
|
||||
# Write the rendered template to a file
|
||||
with open("kraken_network_policy.yaml", "w") as f:
|
||||
f.write(rendered_spec)
|
||||
# Block the traffic by creating network policy
|
||||
logging.info("Creating the network policy")
|
||||
runcommand.invoke(
|
||||
"kubectl create -f %s -n %s --validate=false" % ("kraken_network_policy.yaml", namespace)
|
||||
)
|
||||
"""
|
||||
t = Template(network_policy_template)
|
||||
rendered_spec = t.render(pod_selector=pod_selector, traffic_type=traffic_type)
|
||||
# Write the rendered template to a file
|
||||
with open("kraken_network_policy.yaml", "w") as f:
|
||||
f.write(rendered_spec)
|
||||
# Block the traffic by creating network policy
|
||||
logging.info("Creating the network policy")
|
||||
runcommand.invoke(
|
||||
"kubectl create -f %s -n %s --validate=false" % ("kraken_network_policy.yaml", namespace)
|
||||
)
|
||||
|
||||
# wait for the specified duration
|
||||
logging.info("Waiting for the specified duration in the config: %s" % (duration))
|
||||
time.sleep(duration)
|
||||
# wait for the specified duration
|
||||
logging.info("Waiting for the specified duration in the config: %s" % (duration))
|
||||
time.sleep(duration)
|
||||
|
||||
# unblock the traffic by deleting the network policy
|
||||
logging.info("Deleting the network policy")
|
||||
runcommand.invoke("kubectl delete -f %s -n %s" % ("kraken_network_policy.yaml", namespace))
|
||||
# unblock the traffic by deleting the network policy
|
||||
logging.info("Deleting the network policy")
|
||||
runcommand.invoke("kubectl delete -f %s -n %s" % ("kraken_network_policy.yaml", namespace))
|
||||
|
||||
logging.info("End of scenario. Waiting for the specified duration: %s" % (wait_duration))
|
||||
time.sleep(wait_duration)
|
||||
logging.info("End of scenario. Waiting for the specified duration: %s" % (wait_duration))
|
||||
time.sleep(wait_duration)
|
||||
|
||||
end_time = int(time.time())
|
||||
cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time)
|
||||
except Exception as e :
|
||||
scenario_telemetry.exitStatus = 1
|
||||
failed_scenarios.append(app_outage_config)
|
||||
telemetry.log_exception(app_outage_config)
|
||||
else:
|
||||
scenario_telemetry.exitStatus = 0
|
||||
scenario_telemetry.endTimeStamp = time.time()
|
||||
scenario_telemetries.append(scenario_telemetry)
|
||||
return failed_scenarios, scenario_telemetries
|
||||
|
||||
end_time = int(time.time())
|
||||
cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time)
|
||||
|
||||
@@ -1,3 +1,5 @@
|
||||
import time
|
||||
|
||||
import arcaflow
|
||||
import os
|
||||
import yaml
|
||||
@@ -6,22 +8,31 @@ import sys
|
||||
from pathlib import Path
|
||||
from typing import List
|
||||
from .context_auth import ContextAuth
|
||||
from krkn_lib_kubernetes import ScenarioTelemetry, KrknTelemetry
|
||||
|
||||
|
||||
def run(scenarios_list: List[str], kubeconfig_path: str):
|
||||
def run(scenarios_list: List[str], kubeconfig_path: str, telemetry: KrknTelemetry) -> (list[str], list[ScenarioTelemetry]):
|
||||
scenario_telemetries: list[ScenarioTelemetry] = []
|
||||
failed_post_scenarios = []
|
||||
for scenario in scenarios_list:
|
||||
scenario_telemetry = ScenarioTelemetry()
|
||||
scenario_telemetry.scenario = scenario
|
||||
scenario_telemetry.startTimeStamp = time.time()
|
||||
telemetry.set_parameters_base64(scenario_telemetry,scenario)
|
||||
engine_args = build_args(scenario)
|
||||
run_workflow(engine_args, kubeconfig_path)
|
||||
status_code = run_workflow(engine_args, kubeconfig_path)
|
||||
scenario_telemetry.endTimeStamp = time.time()
|
||||
scenario_telemetry.exitStatus = status_code
|
||||
scenario_telemetries.append(scenario_telemetry)
|
||||
if status_code != 0:
|
||||
failed_post_scenarios.append(scenario)
|
||||
return failed_post_scenarios, scenario_telemetries
|
||||
|
||||
|
||||
def run_workflow(engine_args: arcaflow.EngineArgs, kubeconfig_path: str):
|
||||
def run_workflow(engine_args: arcaflow.EngineArgs, kubeconfig_path: str) -> int:
|
||||
set_arca_kubeconfig(engine_args, kubeconfig_path)
|
||||
exit_status = arcaflow.run(engine_args)
|
||||
if exit_status != 0:
|
||||
logging.error(
|
||||
f"failed to run arcaflow scenario {engine_args.input}"
|
||||
)
|
||||
sys.exit(exit_status)
|
||||
return exit_status
|
||||
|
||||
|
||||
def build_args(input_file: str) -> arcaflow.EngineArgs:
|
||||
|
||||
@@ -3,7 +3,10 @@ import logging
|
||||
import urllib.request
|
||||
import shutil
|
||||
import sys
|
||||
import requests
|
||||
import tempfile
|
||||
import kraken.prometheus.client as prometheus
|
||||
from urllib.parse import urlparse
|
||||
|
||||
|
||||
def setup(url):
|
||||
@@ -72,6 +75,14 @@ def alerts(distribution, prometheus_url, prometheus_bearer_token, start_time, en
|
||||
Scrapes metrics defined in the profile from Prometheus and alerts based on the severity defined
|
||||
"""
|
||||
|
||||
is_url = urlparse(alert_profile)
|
||||
if is_url.scheme and is_url.netloc:
|
||||
response = requests.get(alert_profile)
|
||||
temp_alerts = tempfile.NamedTemporaryFile()
|
||||
temp_alerts.write(response.content)
|
||||
temp_alerts.flush()
|
||||
alert_profile = temp_alerts.name
|
||||
|
||||
if not prometheus_url:
|
||||
if distribution == "openshift":
|
||||
logging.info("Looks like prometheus_url is not defined, trying to use the default instance on the cluster")
|
||||
@@ -79,7 +90,7 @@ def alerts(distribution, prometheus_url, prometheus_bearer_token, start_time, en
|
||||
distribution, prometheus_url, prometheus_bearer_token
|
||||
)
|
||||
else:
|
||||
logging.error("Looks like proemtheus url is not defined, exiting")
|
||||
logging.error("Looks like prometheus url is not defined, exiting")
|
||||
sys.exit(1)
|
||||
command = (
|
||||
"./kube-burner check-alerts "
|
||||
|
||||
@@ -6,7 +6,7 @@ import kraken.cerberus.setup as cerberus
|
||||
import kraken.post_actions.actions as post_actions
|
||||
import yaml
|
||||
import sys
|
||||
|
||||
from krkn_lib_kubernetes import ScenarioTelemetry, KrknTelemetry
|
||||
|
||||
# krkn_lib_kubernetes
|
||||
def run(
|
||||
@@ -15,72 +15,96 @@ def run(
|
||||
wait_duration,
|
||||
failed_post_scenarios,
|
||||
kubeconfig_path,
|
||||
kubecli: krkn_lib_kubernetes.KrknLibKubernetes
|
||||
):
|
||||
|
||||
kubecli: krkn_lib_kubernetes.KrknLibKubernetes,
|
||||
telemetry: KrknTelemetry
|
||||
) -> (list[str], list[ScenarioTelemetry]):
|
||||
scenario_telemetries: list[ScenarioTelemetry] = []
|
||||
failed_scenarios = []
|
||||
for scenario_config in scenarios_list:
|
||||
if len(scenario_config) > 1:
|
||||
pre_action_output = post_actions.run(kubeconfig_path, scenario_config[1])
|
||||
else:
|
||||
pre_action_output = ""
|
||||
with open(scenario_config[0], "r") as f:
|
||||
scenario_config_yaml = yaml.full_load(f)
|
||||
for scenario in scenario_config_yaml["scenarios"]:
|
||||
scenario_namespace = scenario.get("namespace", "")
|
||||
scenario_label = scenario.get("label_selector", "")
|
||||
if scenario_namespace is not None and scenario_namespace.strip() != "":
|
||||
if scenario_label is not None and scenario_label.strip() != "":
|
||||
logging.error("You can only have namespace or label set in your namespace scenario")
|
||||
logging.error(
|
||||
"Current scenario config has namespace '%s' and label selector '%s'"
|
||||
% (scenario_namespace, scenario_label)
|
||||
)
|
||||
logging.error(
|
||||
"Please set either namespace to blank ('') or label_selector to blank ('') to continue"
|
||||
)
|
||||
sys.exit(1)
|
||||
delete_count = scenario.get("delete_count", 1)
|
||||
run_count = scenario.get("runs", 1)
|
||||
run_sleep = scenario.get("sleep", 10)
|
||||
wait_time = scenario.get("wait_time", 30)
|
||||
killed_namespaces = []
|
||||
start_time = int(time.time())
|
||||
for i in range(run_count):
|
||||
namespaces = kubecli.check_namespaces([scenario_namespace], scenario_label)
|
||||
for j in range(delete_count):
|
||||
if len(namespaces) == 0:
|
||||
scenario_telemetry = ScenarioTelemetry()
|
||||
scenario_telemetry.scenario = scenario_config[0]
|
||||
scenario_telemetry.startTimeStamp = time.time()
|
||||
telemetry.set_parameters_base64(scenario_telemetry, scenario_config[0])
|
||||
try:
|
||||
if len(scenario_config) > 1:
|
||||
pre_action_output = post_actions.run(kubeconfig_path, scenario_config[1])
|
||||
else:
|
||||
pre_action_output = ""
|
||||
with open(scenario_config[0], "r") as f:
|
||||
scenario_config_yaml = yaml.full_load(f)
|
||||
for scenario in scenario_config_yaml["scenarios"]:
|
||||
scenario_namespace = scenario.get("namespace", "")
|
||||
scenario_label = scenario.get("label_selector", "")
|
||||
if scenario_namespace is not None and scenario_namespace.strip() != "":
|
||||
if scenario_label is not None and scenario_label.strip() != "":
|
||||
logging.error("You can only have namespace or label set in your namespace scenario")
|
||||
logging.error(
|
||||
"Couldn't delete %s namespaces, not enough namespaces matching %s with label %s"
|
||||
% (str(run_count), scenario_namespace, str(scenario_label))
|
||||
"Current scenario config has namespace '%s' and label selector '%s'"
|
||||
% (scenario_namespace, scenario_label)
|
||||
)
|
||||
sys.exit(1)
|
||||
selected_namespace = namespaces[random.randint(0, len(namespaces) - 1)]
|
||||
killed_namespaces.append(selected_namespace)
|
||||
try:
|
||||
kubecli.delete_namespace(selected_namespace)
|
||||
logging.info("Delete on namespace %s was successful" % str(selected_namespace))
|
||||
except Exception as e:
|
||||
logging.info("Delete on namespace %s was unsuccessful" % str(selected_namespace))
|
||||
logging.info("Namespace action error: " + str(e))
|
||||
sys.exit(1)
|
||||
namespaces.remove(selected_namespace)
|
||||
logging.info("Waiting %s seconds between namespace deletions" % str(run_sleep))
|
||||
time.sleep(run_sleep)
|
||||
|
||||
logging.info("Waiting for the specified duration: %s" % wait_duration)
|
||||
time.sleep(wait_duration)
|
||||
if len(scenario_config) > 1:
|
||||
try:
|
||||
failed_post_scenarios = post_actions.check_recovery(
|
||||
kubeconfig_path, scenario_config, failed_post_scenarios, pre_action_output
|
||||
logging.error(
|
||||
"Please set either namespace to blank ('') or label_selector to blank ('') to continue"
|
||||
)
|
||||
# removed_exit
|
||||
# sys.exit(1)
|
||||
raise RuntimeError()
|
||||
delete_count = scenario.get("delete_count", 1)
|
||||
run_count = scenario.get("runs", 1)
|
||||
run_sleep = scenario.get("sleep", 10)
|
||||
wait_time = scenario.get("wait_time", 30)
|
||||
killed_namespaces = []
|
||||
start_time = int(time.time())
|
||||
for i in range(run_count):
|
||||
namespaces = kubecli.check_namespaces([scenario_namespace], scenario_label)
|
||||
for j in range(delete_count):
|
||||
if len(namespaces) == 0:
|
||||
logging.error(
|
||||
"Couldn't delete %s namespaces, not enough namespaces matching %s with label %s"
|
||||
% (str(run_count), scenario_namespace, str(scenario_label))
|
||||
)
|
||||
# removed_exit
|
||||
# sys.exit(1)
|
||||
raise RuntimeError()
|
||||
selected_namespace = namespaces[random.randint(0, len(namespaces) - 1)]
|
||||
killed_namespaces.append(selected_namespace)
|
||||
try:
|
||||
kubecli.delete_namespace(selected_namespace)
|
||||
logging.info("Delete on namespace %s was successful" % str(selected_namespace))
|
||||
except Exception as e:
|
||||
logging.error("Failed to run post action checks: %s" % e)
|
||||
sys.exit(1)
|
||||
else:
|
||||
failed_post_scenarios = check_active_namespace(killed_namespaces, wait_time, kubecli)
|
||||
end_time = int(time.time())
|
||||
cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time)
|
||||
logging.info("Delete on namespace %s was unsuccessful" % str(selected_namespace))
|
||||
logging.info("Namespace action error: " + str(e))
|
||||
# removed_exit
|
||||
# sys.exit(1)
|
||||
raise RuntimeError()
|
||||
namespaces.remove(selected_namespace)
|
||||
logging.info("Waiting %s seconds between namespace deletions" % str(run_sleep))
|
||||
time.sleep(run_sleep)
|
||||
|
||||
logging.info("Waiting for the specified duration: %s" % wait_duration)
|
||||
time.sleep(wait_duration)
|
||||
if len(scenario_config) > 1:
|
||||
try:
|
||||
failed_post_scenarios = post_actions.check_recovery(
|
||||
kubeconfig_path, scenario_config, failed_post_scenarios, pre_action_output
|
||||
)
|
||||
except Exception as e:
|
||||
logging.error("Failed to run post action checks: %s" % e)
|
||||
# removed_exit
|
||||
# sys.exit(1)
|
||||
raise RuntimeError()
|
||||
else:
|
||||
failed_post_scenarios = check_active_namespace(killed_namespaces, wait_time, kubecli)
|
||||
end_time = int(time.time())
|
||||
cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time)
|
||||
except (Exception, RuntimeError):
|
||||
scenario_telemetry.exitStatus = 1
|
||||
failed_scenarios.append(scenario_config[0])
|
||||
telemetry.log_exception(scenario_config[0])
|
||||
else:
|
||||
scenario_telemetry.exitStatus = 0
|
||||
scenario_telemetry.endTimeStamp = time.time()
|
||||
scenario_telemetries.append(scenario_telemetry)
|
||||
return failed_scenarios, scenario_telemetries
|
||||
|
||||
# krkn_lib_kubernetes
|
||||
def check_active_namespace(killed_namespaces, wait_time, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
|
||||
|
||||
@@ -8,88 +8,104 @@ import krkn_lib_kubernetes
|
||||
from jinja2 import Environment, FileSystemLoader
|
||||
import kraken.cerberus.setup as cerberus
|
||||
import kraken.node_actions.common_node_functions as common_node_functions
|
||||
|
||||
from krkn_lib_kubernetes import ScenarioTelemetry, KrknTelemetry
|
||||
|
||||
# krkn_lib_kubernetes
|
||||
# Reads the scenario config and introduces traffic variations in Node's host network interface.
|
||||
def run(scenarios_list, config, wait_duration, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
|
||||
def run(scenarios_list, config, wait_duration, kubecli: krkn_lib_kubernetes.KrknLibKubernetes, telemetry: KrknTelemetry) -> (list[str], list[ScenarioTelemetry]):
|
||||
failed_post_scenarios = ""
|
||||
logging.info("Runing the Network Chaos tests")
|
||||
failed_post_scenarios = ""
|
||||
scenario_telemetries: list[ScenarioTelemetry] = []
|
||||
failed_scenarios = []
|
||||
for net_config in scenarios_list:
|
||||
with open(net_config, "r") as file:
|
||||
param_lst = ["latency", "loss", "bandwidth"]
|
||||
test_config = yaml.safe_load(file)
|
||||
test_dict = test_config["network_chaos"]
|
||||
test_duration = int(test_dict.get("duration", 300))
|
||||
test_interface = test_dict.get("interfaces", [])
|
||||
test_node = test_dict.get("node_name", "")
|
||||
test_node_label = test_dict.get("label_selector", "node-role.kubernetes.io/master")
|
||||
test_execution = test_dict.get("execution", "serial")
|
||||
test_instance_count = test_dict.get("instance_count", 1)
|
||||
test_egress = test_dict.get("egress", {"bandwidth": "100mbit"})
|
||||
if test_node:
|
||||
node_name_list = test_node.split(",")
|
||||
else:
|
||||
node_name_list = [test_node]
|
||||
nodelst = []
|
||||
for single_node_name in node_name_list:
|
||||
nodelst.extend(common_node_functions.get_node(single_node_name, test_node_label, test_instance_count, kubecli))
|
||||
file_loader = FileSystemLoader(os.path.abspath(os.path.dirname(__file__)))
|
||||
env = Environment(loader=file_loader, autoescape=True)
|
||||
pod_template = env.get_template("pod.j2")
|
||||
test_interface = verify_interface(test_interface, nodelst, pod_template, kubecli)
|
||||
joblst = []
|
||||
egress_lst = [i for i in param_lst if i in test_egress]
|
||||
chaos_config = {
|
||||
"network_chaos": {
|
||||
"duration": test_duration,
|
||||
"interfaces": test_interface,
|
||||
"node_name": ",".join(nodelst),
|
||||
"execution": test_execution,
|
||||
"instance_count": test_instance_count,
|
||||
"egress": test_egress,
|
||||
scenario_telemetry = ScenarioTelemetry()
|
||||
scenario_telemetry.scenario = net_config
|
||||
scenario_telemetry.startTimeStamp = time.time()
|
||||
telemetry.set_parameters_base64(scenario_telemetry, net_config)
|
||||
try:
|
||||
with open(net_config, "r") as file:
|
||||
param_lst = ["latency", "loss", "bandwidth"]
|
||||
test_config = yaml.safe_load(file)
|
||||
test_dict = test_config["network_chaos"]
|
||||
test_duration = int(test_dict.get("duration", 300))
|
||||
test_interface = test_dict.get("interfaces", [])
|
||||
test_node = test_dict.get("node_name", "")
|
||||
test_node_label = test_dict.get("label_selector", "node-role.kubernetes.io/master")
|
||||
test_execution = test_dict.get("execution", "serial")
|
||||
test_instance_count = test_dict.get("instance_count", 1)
|
||||
test_egress = test_dict.get("egress", {"bandwidth": "100mbit"})
|
||||
if test_node:
|
||||
node_name_list = test_node.split(",")
|
||||
else:
|
||||
node_name_list = [test_node]
|
||||
nodelst = []
|
||||
for single_node_name in node_name_list:
|
||||
nodelst.extend(common_node_functions.get_node(single_node_name, test_node_label, test_instance_count, kubecli))
|
||||
file_loader = FileSystemLoader(os.path.abspath(os.path.dirname(__file__)))
|
||||
env = Environment(loader=file_loader, autoescape=True)
|
||||
pod_template = env.get_template("pod.j2")
|
||||
test_interface = verify_interface(test_interface, nodelst, pod_template, kubecli)
|
||||
joblst = []
|
||||
egress_lst = [i for i in param_lst if i in test_egress]
|
||||
chaos_config = {
|
||||
"network_chaos": {
|
||||
"duration": test_duration,
|
||||
"interfaces": test_interface,
|
||||
"node_name": ",".join(nodelst),
|
||||
"execution": test_execution,
|
||||
"instance_count": test_instance_count,
|
||||
"egress": test_egress,
|
||||
}
|
||||
}
|
||||
}
|
||||
logging.info("Executing network chaos with config \n %s" % yaml.dump(chaos_config))
|
||||
job_template = env.get_template("job.j2")
|
||||
try:
|
||||
for i in egress_lst:
|
||||
for node in nodelst:
|
||||
exec_cmd = get_egress_cmd(
|
||||
test_execution, test_interface, i, test_dict["egress"], duration=test_duration
|
||||
)
|
||||
logging.info("Executing %s on node %s" % (exec_cmd, node))
|
||||
job_body = yaml.safe_load(
|
||||
job_template.render(jobname=i + str(hash(node))[:5], nodename=node, cmd=exec_cmd)
|
||||
)
|
||||
joblst.append(job_body["metadata"]["name"])
|
||||
api_response = kubecli.create_job(job_body)
|
||||
if api_response is None:
|
||||
raise Exception("Error creating job")
|
||||
if test_execution == "serial":
|
||||
logging.info("Waiting for serial job to finish")
|
||||
logging.info("Executing network chaos with config \n %s" % yaml.dump(chaos_config))
|
||||
job_template = env.get_template("job.j2")
|
||||
try:
|
||||
for i in egress_lst:
|
||||
for node in nodelst:
|
||||
exec_cmd = get_egress_cmd(
|
||||
test_execution, test_interface, i, test_dict["egress"], duration=test_duration
|
||||
)
|
||||
logging.info("Executing %s on node %s" % (exec_cmd, node))
|
||||
job_body = yaml.safe_load(
|
||||
job_template.render(jobname=i + str(hash(node))[:5], nodename=node, cmd=exec_cmd)
|
||||
)
|
||||
joblst.append(job_body["metadata"]["name"])
|
||||
api_response = kubecli.create_job(job_body)
|
||||
if api_response is None:
|
||||
raise Exception("Error creating job")
|
||||
if test_execution == "serial":
|
||||
logging.info("Waiting for serial job to finish")
|
||||
start_time = int(time.time())
|
||||
wait_for_job(joblst[:], kubecli, test_duration + 300)
|
||||
logging.info("Waiting for wait_duration %s" % wait_duration)
|
||||
time.sleep(wait_duration)
|
||||
end_time = int(time.time())
|
||||
cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time)
|
||||
if test_execution == "parallel":
|
||||
break
|
||||
if test_execution == "parallel":
|
||||
logging.info("Waiting for parallel job to finish")
|
||||
start_time = int(time.time())
|
||||
wait_for_job(joblst[:], kubecli, test_duration + 300)
|
||||
logging.info("Waiting for wait_duration %s" % wait_duration)
|
||||
time.sleep(wait_duration)
|
||||
end_time = int(time.time())
|
||||
cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time)
|
||||
if test_execution == "parallel":
|
||||
break
|
||||
if test_execution == "parallel":
|
||||
logging.info("Waiting for parallel job to finish")
|
||||
start_time = int(time.time())
|
||||
wait_for_job(joblst[:], kubecli, test_duration + 300)
|
||||
logging.info("Waiting for wait_duration %s" % wait_duration)
|
||||
time.sleep(wait_duration)
|
||||
end_time = int(time.time())
|
||||
cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time)
|
||||
except Exception as e:
|
||||
logging.error("Network Chaos exiting due to Exception %s" % e)
|
||||
sys.exit(1)
|
||||
finally:
|
||||
logging.info("Deleting jobs")
|
||||
delete_job(joblst[:], kubecli)
|
||||
except Exception as e:
|
||||
logging.error("Network Chaos exiting due to Exception %s" % e)
|
||||
raise RuntimeError()
|
||||
finally:
|
||||
logging.info("Deleting jobs")
|
||||
delete_job(joblst[:], kubecli)
|
||||
except (RuntimeError, Exception):
|
||||
scenario_telemetry.exitStatus = 1
|
||||
failed_scenarios.append(net_config)
|
||||
telemetry.log_exception(net_config)
|
||||
else:
|
||||
scenario_telemetry.exitStatus = 0
|
||||
scenario_telemetries.append(scenario_telemetry)
|
||||
return failed_scenarios, scenario_telemetries
|
||||
|
||||
|
||||
# krkn_lib_kubernetes
|
||||
@@ -110,7 +126,8 @@ def verify_interface(test_interface, nodelst, template, kubecli: krkn_lib_kubern
|
||||
for interface in test_interface:
|
||||
if interface not in interface_lst:
|
||||
logging.error("Interface %s not found in node %s interface list %s" % (interface, nodelst[pod_index], interface_lst))
|
||||
sys.exit(1)
|
||||
#sys.exit(1)
|
||||
raise RuntimeError()
|
||||
return test_interface
|
||||
finally:
|
||||
logging.info("Deleteing pod to query interface on node")
|
||||
@@ -158,7 +175,7 @@ def delete_job(joblst, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
|
||||
logging.error(pod_log)
|
||||
except Exception:
|
||||
logging.warning("Exception in getting job status")
|
||||
api_response = kubecli.delete_job(name=jobname, namespace="default")
|
||||
kubecli.delete_job(name=jobname, namespace="default")
|
||||
|
||||
|
||||
def get_egress_cmd(execution, test_interface, mod, vallst, duration=30):
|
||||
|
||||
@@ -27,7 +27,9 @@ class AWS:
|
||||
logging.error(
|
||||
"Failed to start node instance %s. Encountered following " "exception: %s." % (instance_id, e)
|
||||
)
|
||||
sys.exit(1)
|
||||
# removed_exit
|
||||
# sys.exit(1)
|
||||
raise RuntimeError()
|
||||
|
||||
# Stop the node instance
|
||||
def stop_instances(self, instance_id):
|
||||
@@ -36,7 +38,9 @@ class AWS:
|
||||
logging.info("EC2 instance: " + str(instance_id) + " stopped")
|
||||
except Exception as e:
|
||||
logging.error("Failed to stop node instance %s. Encountered following " "exception: %s." % (instance_id, e))
|
||||
sys.exit(1)
|
||||
# removed_exit
|
||||
# sys.exit(1)
|
||||
raise RuntimeError()
|
||||
|
||||
# Terminate the node instance
|
||||
def terminate_instances(self, instance_id):
|
||||
@@ -47,7 +51,9 @@ class AWS:
|
||||
logging.error(
|
||||
"Failed to terminate node instance %s. Encountered following " "exception: %s." % (instance_id, e)
|
||||
)
|
||||
sys.exit(1)
|
||||
# removed_exit
|
||||
# sys.exit(1)
|
||||
raise RuntimeError()
|
||||
|
||||
# Reboot the node instance
|
||||
def reboot_instances(self, instance_id):
|
||||
@@ -58,7 +64,9 @@ class AWS:
|
||||
logging.error(
|
||||
"Failed to reboot node instance %s. Encountered following " "exception: %s." % (instance_id, e)
|
||||
)
|
||||
sys.exit(1)
|
||||
# removed_exit
|
||||
# sys.exit(1)
|
||||
raise RuntimeError()
|
||||
|
||||
# Below functions poll EC2.Client.describe_instances() every 15 seconds
|
||||
# until a successful state is reached. An error is returned after 40 failed checks
|
||||
@@ -102,7 +110,9 @@ class AWS:
|
||||
"Failed to create the default network_acl: %s"
|
||||
"Make sure you have aws cli configured on the host and set for the region of your vpc/subnet" % (e)
|
||||
)
|
||||
sys.exit(1)
|
||||
# removed_exit
|
||||
# sys.exit(1)
|
||||
raise RuntimeError()
|
||||
return acl_id
|
||||
|
||||
# Replace network acl association
|
||||
@@ -114,7 +124,9 @@ class AWS:
|
||||
new_association_id = status["NewAssociationId"]
|
||||
except Exception as e:
|
||||
logging.error("Failed to replace network acl association: %s" % (e))
|
||||
sys.exit(1)
|
||||
# removed_exit
|
||||
# sys.exit(1)
|
||||
raise RuntimeError()
|
||||
return new_association_id
|
||||
|
||||
# Describe network acl
|
||||
@@ -131,7 +143,9 @@ class AWS:
|
||||
"Failed to describe network acl: %s."
|
||||
"Make sure you have aws cli configured on the host and set for the region of your vpc/subnet" % (e)
|
||||
)
|
||||
sys.exit(1)
|
||||
# removed_exit
|
||||
# sys.exit(1)
|
||||
raise RuntimeError()
|
||||
associations = response["NetworkAcls"][0]["Associations"]
|
||||
# grab the current network_acl in use
|
||||
original_acl_id = response["NetworkAcls"][0]["Associations"][0]["NetworkAclId"]
|
||||
@@ -148,7 +162,9 @@ class AWS:
|
||||
"Make sure you have aws cli configured on the host and set for the region of your vpc/subnet"
|
||||
% (acl_id, e)
|
||||
)
|
||||
sys.exit(1)
|
||||
# removed_exit
|
||||
# sys.exit(1)
|
||||
raise RuntimeError()
|
||||
|
||||
# krkn_lib_kubernetes
|
||||
class aws_node_scenarios(abstract_node_scenarios):
|
||||
@@ -173,7 +189,9 @@ class aws_node_scenarios(abstract_node_scenarios):
|
||||
"Failed to start node instance. Encountered following " "exception: %s. Test Failed" % (e)
|
||||
)
|
||||
logging.error("node_start_scenario injection failed!")
|
||||
sys.exit(1)
|
||||
# removed_exit
|
||||
# sys.exit(1)
|
||||
raise RuntimeError()
|
||||
|
||||
# Node scenario to stop the node
|
||||
def node_stop_scenario(self, instance_kill_count, node, timeout):
|
||||
@@ -189,7 +207,9 @@ class aws_node_scenarios(abstract_node_scenarios):
|
||||
except Exception as e:
|
||||
logging.error("Failed to stop node instance. Encountered following exception: %s. " "Test Failed" % (e))
|
||||
logging.error("node_stop_scenario injection failed!")
|
||||
sys.exit(1)
|
||||
# removed_exit
|
||||
# sys.exit(1)
|
||||
raise RuntimeError()
|
||||
|
||||
# Node scenario to terminate the node
|
||||
def node_termination_scenario(self, instance_kill_count, node, timeout):
|
||||
@@ -213,7 +233,9 @@ class aws_node_scenarios(abstract_node_scenarios):
|
||||
"Failed to terminate node instance. Encountered following exception:" " %s. Test Failed" % (e)
|
||||
)
|
||||
logging.error("node_termination_scenario injection failed!")
|
||||
sys.exit(1)
|
||||
# removed_exit
|
||||
# sys.exit(1)
|
||||
raise RuntimeError()
|
||||
|
||||
# Node scenario to reboot the node
|
||||
def node_reboot_scenario(self, instance_kill_count, node, timeout):
|
||||
@@ -232,4 +254,6 @@ class aws_node_scenarios(abstract_node_scenarios):
|
||||
"Failed to reboot node instance. Encountered following exception:" " %s. Test Failed" % (e)
|
||||
)
|
||||
logging.error("node_reboot_scenario injection failed!")
|
||||
sys.exit(1)
|
||||
# removed_exit
|
||||
# sys.exit(1)
|
||||
raise RuntimeError()
|
||||
|
||||
@@ -39,7 +39,9 @@ class Azure:
|
||||
logging.info("vm name " + str(vm_name) + " started")
|
||||
except Exception as e:
|
||||
logging.error("Failed to start node instance %s. Encountered following " "exception: %s." % (vm_name, e))
|
||||
sys.exit(1)
|
||||
# removed_exit
|
||||
# sys.exit(1)
|
||||
raise RuntimeError()
|
||||
|
||||
# Stop the node instance
|
||||
def stop_instances(self, group_name, vm_name):
|
||||
@@ -48,7 +50,9 @@ class Azure:
|
||||
logging.info("vm name " + str(vm_name) + " stopped")
|
||||
except Exception as e:
|
||||
logging.error("Failed to stop node instance %s. Encountered following " "exception: %s." % (vm_name, e))
|
||||
sys.exit(1)
|
||||
# removed_exit
|
||||
# sys.exit(1)
|
||||
raise RuntimeError()
|
||||
|
||||
# Terminate the node instance
|
||||
def terminate_instances(self, group_name, vm_name):
|
||||
@@ -59,7 +63,9 @@ class Azure:
|
||||
logging.error(
|
||||
"Failed to terminate node instance %s. Encountered following " "exception: %s." % (vm_name, e)
|
||||
)
|
||||
sys.exit(1)
|
||||
# removed_exit
|
||||
# sys.exit(1)
|
||||
raise RuntimeError()
|
||||
|
||||
# Reboot the node instance
|
||||
def reboot_instances(self, group_name, vm_name):
|
||||
@@ -68,7 +74,9 @@ class Azure:
|
||||
logging.info("vm name " + str(vm_name) + " rebooted")
|
||||
except Exception as e:
|
||||
logging.error("Failed to reboot node instance %s. Encountered following " "exception: %s." % (vm_name, e))
|
||||
sys.exit(1)
|
||||
# removed_exit
|
||||
# sys.exit(1)
|
||||
raise RuntimeError()
|
||||
|
||||
def get_vm_status(self, resource_group, vm_name):
|
||||
statuses = self.compute_client.virtual_machines.instance_view(resource_group, vm_name).statuses
|
||||
@@ -145,7 +153,9 @@ class azure_node_scenarios(abstract_node_scenarios):
|
||||
"Failed to start node instance. Encountered following " "exception: %s. Test Failed" % (e)
|
||||
)
|
||||
logging.error("node_start_scenario injection failed!")
|
||||
sys.exit(1)
|
||||
# removed_exit
|
||||
# sys.exit(1)
|
||||
raise RuntimeError()
|
||||
|
||||
# Node scenario to stop the node
|
||||
def node_stop_scenario(self, instance_kill_count, node, timeout):
|
||||
@@ -161,7 +171,9 @@ class azure_node_scenarios(abstract_node_scenarios):
|
||||
except Exception as e:
|
||||
logging.error("Failed to stop node instance. Encountered following exception: %s. " "Test Failed" % e)
|
||||
logging.error("node_stop_scenario injection failed!")
|
||||
sys.exit(1)
|
||||
# removed_exit
|
||||
# sys.exit(1)
|
||||
raise RuntimeError()
|
||||
|
||||
# Node scenario to terminate the node
|
||||
def node_termination_scenario(self, instance_kill_count, node, timeout):
|
||||
@@ -185,7 +197,9 @@ class azure_node_scenarios(abstract_node_scenarios):
|
||||
"Failed to terminate node instance. Encountered following exception:" " %s. Test Failed" % (e)
|
||||
)
|
||||
logging.error("node_termination_scenario injection failed!")
|
||||
sys.exit(1)
|
||||
# removed_exit
|
||||
# sys.exit(1)
|
||||
raise RuntimeError()
|
||||
|
||||
# Node scenario to reboot the node
|
||||
def node_reboot_scenario(self, instance_kill_count, node, timeout):
|
||||
@@ -204,4 +218,6 @@ class azure_node_scenarios(abstract_node_scenarios):
|
||||
"Failed to reboot node instance. Encountered following exception:" " %s. Test Failed" % (e)
|
||||
)
|
||||
logging.error("node_reboot_scenario injection failed!")
|
||||
sys.exit(1)
|
||||
# removed_exit
|
||||
# sys.exit(1)
|
||||
raise RuntimeError()
|
||||
@@ -45,7 +45,9 @@ class GCP:
|
||||
logging.error(
|
||||
"Failed to start node instance %s. Encountered following " "exception: %s." % (instance_id, e)
|
||||
)
|
||||
sys.exit(1)
|
||||
# removed_exit
|
||||
# sys.exit(1)
|
||||
raise RuntimeError()
|
||||
|
||||
# Stop the node instance
|
||||
def stop_instances(self, zone, instance_id):
|
||||
@@ -54,7 +56,9 @@ class GCP:
|
||||
logging.info("vm name " + str(instance_id) + " stopped")
|
||||
except Exception as e:
|
||||
logging.error("Failed to stop node instance %s. Encountered following " "exception: %s." % (instance_id, e))
|
||||
sys.exit(1)
|
||||
# removed_exit
|
||||
# sys.exit(1)
|
||||
raise RuntimeError()
|
||||
|
||||
# Start the node instance
|
||||
def suspend_instances(self, zone, instance_id):
|
||||
@@ -65,7 +69,9 @@ class GCP:
|
||||
logging.error(
|
||||
"Failed to suspend node instance %s. Encountered following " "exception: %s." % (instance_id, e)
|
||||
)
|
||||
sys.exit(1)
|
||||
# removed_exit
|
||||
# sys.exit(1)
|
||||
raise RuntimeError()
|
||||
|
||||
# Terminate the node instance
|
||||
def terminate_instances(self, zone, instance_id):
|
||||
@@ -76,7 +82,9 @@ class GCP:
|
||||
logging.error(
|
||||
"Failed to start node instance %s. Encountered following " "exception: %s." % (instance_id, e)
|
||||
)
|
||||
sys.exit(1)
|
||||
# removed_exit
|
||||
# sys.exit(1)
|
||||
raise RuntimeError()
|
||||
|
||||
# Reboot the node instance
|
||||
def reboot_instances(self, zone, instance_id):
|
||||
@@ -87,7 +95,9 @@ class GCP:
|
||||
logging.error(
|
||||
"Failed to start node instance %s. Encountered following " "exception: %s." % (instance_id, e)
|
||||
)
|
||||
sys.exit(1)
|
||||
# removed_exit
|
||||
# sys.exit(1)
|
||||
raise RuntimeError()
|
||||
|
||||
# Get instance status
|
||||
def get_instance_status(self, zone, instance_id, expected_status, timeout):
|
||||
@@ -156,7 +166,9 @@ class gcp_node_scenarios(abstract_node_scenarios):
|
||||
"Failed to start node instance. Encountered following " "exception: %s. Test Failed" % (e)
|
||||
)
|
||||
logging.error("node_start_scenario injection failed!")
|
||||
sys.exit(1)
|
||||
# removed_exit
|
||||
# sys.exit(1)
|
||||
raise RuntimeError()
|
||||
|
||||
# Node scenario to stop the node
|
||||
def node_stop_scenario(self, instance_kill_count, node, timeout):
|
||||
@@ -173,7 +185,9 @@ class gcp_node_scenarios(abstract_node_scenarios):
|
||||
except Exception as e:
|
||||
logging.error("Failed to stop node instance. Encountered following exception: %s. " "Test Failed" % (e))
|
||||
logging.error("node_stop_scenario injection failed!")
|
||||
sys.exit(1)
|
||||
# removed_exit
|
||||
# sys.exit(1)
|
||||
raise RuntimeError()
|
||||
|
||||
# Node scenario to terminate the node
|
||||
def node_termination_scenario(self, instance_kill_count, node, timeout):
|
||||
@@ -197,7 +211,9 @@ class gcp_node_scenarios(abstract_node_scenarios):
|
||||
"Failed to terminate node instance. Encountered following exception:" " %s. Test Failed" % e
|
||||
)
|
||||
logging.error("node_termination_scenario injection failed!")
|
||||
sys.exit(1)
|
||||
# removed_exit
|
||||
# sys.exit(1)
|
||||
raise RuntimeError()
|
||||
|
||||
# Node scenario to reboot the node
|
||||
def node_reboot_scenario(self, instance_kill_count, node, timeout):
|
||||
@@ -215,4 +231,6 @@ class gcp_node_scenarios(abstract_node_scenarios):
|
||||
"Failed to reboot node instance. Encountered following exception:" " %s. Test Failed" % (e)
|
||||
)
|
||||
logging.error("node_reboot_scenario injection failed!")
|
||||
sys.exit(1)
|
||||
# removed_exit
|
||||
# sys.exit(1)
|
||||
raise RuntimeError()
|
||||
|
||||
@@ -24,7 +24,9 @@ class OPENSTACKCLOUD:
|
||||
logging.info("Instance: " + str(node) + " started")
|
||||
except Exception as e:
|
||||
logging.error("Failed to start node instance %s. Encountered following " "exception: %s." % (node, e))
|
||||
sys.exit(1)
|
||||
# removed_exit
|
||||
# sys.exit(1)
|
||||
raise RuntimeError()
|
||||
|
||||
# Stop the node instance
|
||||
def stop_instances(self, node):
|
||||
@@ -33,7 +35,9 @@ class OPENSTACKCLOUD:
|
||||
logging.info("Instance: " + str(node) + " stopped")
|
||||
except Exception as e:
|
||||
logging.error("Failed to stop node instance %s. Encountered following " "exception: %s." % (node, e))
|
||||
sys.exit(1)
|
||||
# removed_exit
|
||||
# sys.exit(1)
|
||||
raise RuntimeError()
|
||||
|
||||
# Reboot the node instance
|
||||
def reboot_instances(self, node):
|
||||
@@ -42,7 +46,9 @@ class OPENSTACKCLOUD:
|
||||
logging.info("Instance: " + str(node) + " rebooted")
|
||||
except Exception as e:
|
||||
logging.error("Failed to reboot node instance %s. Encountered following " "exception: %s." % (node, e))
|
||||
sys.exit(1)
|
||||
# removed_exit
|
||||
# sys.exit(1)
|
||||
raise RuntimeError()
|
||||
|
||||
# Wait until the node instance is running
|
||||
def wait_until_running(self, node, timeout):
|
||||
@@ -109,7 +115,9 @@ class openstack_node_scenarios(abstract_node_scenarios):
|
||||
"Failed to start node instance. Encountered following " "exception: %s. Test Failed" % (e)
|
||||
)
|
||||
logging.error("node_start_scenario injection failed!")
|
||||
sys.exit(1)
|
||||
# removed_exit
|
||||
# sys.exit(1)
|
||||
raise RuntimeError()
|
||||
|
||||
# Node scenario to stop the node
|
||||
def node_stop_scenario(self, instance_kill_count, node, timeout):
|
||||
@@ -125,7 +133,9 @@ class openstack_node_scenarios(abstract_node_scenarios):
|
||||
except Exception as e:
|
||||
logging.error("Failed to stop node instance. Encountered following exception: %s. " "Test Failed" % (e))
|
||||
logging.error("node_stop_scenario injection failed!")
|
||||
sys.exit(1)
|
||||
# removed_exit
|
||||
# sys.exit(1)
|
||||
raise RuntimeError()
|
||||
|
||||
# Node scenario to reboot the node
|
||||
def node_reboot_scenario(self, instance_kill_count, node, timeout):
|
||||
@@ -144,7 +154,9 @@ class openstack_node_scenarios(abstract_node_scenarios):
|
||||
"Failed to reboot node instance. Encountered following exception:" " %s. Test Failed" % (e)
|
||||
)
|
||||
logging.error("node_reboot_scenario injection failed!")
|
||||
sys.exit(1)
|
||||
# removed_exit
|
||||
# sys.exit(1)
|
||||
raise RuntimeError()
|
||||
|
||||
# Node scenario to start the node
|
||||
def helper_node_start_scenario(self, instance_kill_count, node_ip, timeout):
|
||||
@@ -162,7 +174,9 @@ class openstack_node_scenarios(abstract_node_scenarios):
|
||||
"Failed to start node instance. Encountered following " "exception: %s. Test Failed" % (e)
|
||||
)
|
||||
logging.error("helper_node_start_scenario injection failed!")
|
||||
sys.exit(1)
|
||||
# removed_exit
|
||||
# sys.exit(1)
|
||||
raise RuntimeError()
|
||||
|
||||
# Node scenario to stop the node
|
||||
def helper_node_stop_scenario(self, instance_kill_count, node_ip, timeout):
|
||||
@@ -177,7 +191,9 @@ class openstack_node_scenarios(abstract_node_scenarios):
|
||||
except Exception as e:
|
||||
logging.error("Failed to stop node instance. Encountered following exception: %s. " "Test Failed" % (e))
|
||||
logging.error("helper_node_stop_scenario injection failed!")
|
||||
sys.exit(1)
|
||||
# removed_exit
|
||||
# sys.exit(1)
|
||||
raise RuntimeError()
|
||||
|
||||
def helper_node_service_status(self, node_ip, service, ssh_private_key, timeout):
|
||||
try:
|
||||
@@ -188,4 +204,6 @@ class openstack_node_scenarios(abstract_node_scenarios):
|
||||
except Exception as e:
|
||||
logging.error("Failed to check service status. Encountered following exception:" " %s. Test Failed" % (e))
|
||||
logging.error("helper_node_service_status injection failed!")
|
||||
sys.exit(1)
|
||||
# removed_exit
|
||||
# sys.exit(1)
|
||||
raise RuntimeError()
|
||||
|
||||
@@ -13,7 +13,7 @@ from kraken.node_actions.bm_node_scenarios import bm_node_scenarios
|
||||
from kraken.node_actions.docker_node_scenarios import docker_node_scenarios
|
||||
import kraken.node_actions.common_node_functions as common_node_functions
|
||||
import kraken.cerberus.setup as cerberus
|
||||
|
||||
from krkn_lib_kubernetes import ScenarioTelemetry, KrknTelemetry
|
||||
|
||||
node_general = False
|
||||
|
||||
@@ -53,8 +53,14 @@ def get_node_scenario_object(node_scenario, kubecli: krkn_lib_kubernetes.KrknLib
|
||||
|
||||
# Run defined scenarios
|
||||
# krkn_lib_kubernetes
|
||||
def run(scenarios_list, config, wait_duration, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
|
||||
def run(scenarios_list, config, wait_duration, kubecli: krkn_lib_kubernetes.KrknLibKubernetes, telemetry: KrknTelemetry) -> (list[str], list[ScenarioTelemetry]):
|
||||
scenario_telemetries: list[ScenarioTelemetry] = []
|
||||
failed_scenarios = []
|
||||
for node_scenario_config in scenarios_list:
|
||||
scenario_telemetry = ScenarioTelemetry()
|
||||
scenario_telemetry.scenario = node_scenario_config
|
||||
scenario_telemetry.startTimeStamp = time.time()
|
||||
telemetry.set_parameters_base64(scenario_telemetry, node_scenario_config)
|
||||
with open(node_scenario_config, "r") as f:
|
||||
node_scenario_config = yaml.full_load(f)
|
||||
for node_scenario in node_scenario_config["node_scenarios"]:
|
||||
@@ -62,12 +68,24 @@ def run(scenarios_list, config, wait_duration, kubecli: krkn_lib_kubernetes.Krkn
|
||||
if node_scenario["actions"]:
|
||||
for action in node_scenario["actions"]:
|
||||
start_time = int(time.time())
|
||||
inject_node_scenario(action, node_scenario, node_scenario_object, kubecli)
|
||||
logging.info("Waiting for the specified duration: %s" % (wait_duration))
|
||||
time.sleep(wait_duration)
|
||||
end_time = int(time.time())
|
||||
cerberus.get_status(config, start_time, end_time)
|
||||
logging.info("")
|
||||
try:
|
||||
inject_node_scenario(action, node_scenario, node_scenario_object, kubecli)
|
||||
logging.info("Waiting for the specified duration: %s" % (wait_duration))
|
||||
time.sleep(wait_duration)
|
||||
end_time = int(time.time())
|
||||
cerberus.get_status(config, start_time, end_time)
|
||||
logging.info("")
|
||||
except (RuntimeError, Exception) as e:
|
||||
scenario_telemetry.exitStatus = 1
|
||||
failed_scenarios.append(node_scenario_config)
|
||||
telemetry.log_exception(node_scenario_config)
|
||||
else:
|
||||
scenario_telemetry.exitStatus = 0
|
||||
|
||||
scenario_telemetry.endTimeStamp = time.time()
|
||||
scenario_telemetries.append(scenario_telemetry)
|
||||
|
||||
return failed_scenarios, scenario_telemetries
|
||||
|
||||
|
||||
# Inject the specified node scenario
|
||||
|
||||
@@ -12,6 +12,8 @@ import kraken.plugins.node_scenarios.ibmcloud_plugin as ibmcloud_plugin
|
||||
from kraken.plugins.run_python_plugin import run_python_file
|
||||
from kraken.plugins.network.ingress_shaping import network_chaos
|
||||
from kraken.plugins.pod_network_outage.pod_network_outage_plugin import pod_outage
|
||||
from kraken.plugins.pod_network_outage.pod_network_outage_plugin import pod_egress_shaping
|
||||
from krkn_lib_kubernetes import ScenarioTelemetry, KrknTelemetry
|
||||
|
||||
|
||||
@dataclasses.dataclass
|
||||
@@ -213,21 +215,36 @@ PLUGINS = Plugins(
|
||||
[
|
||||
"error"
|
||||
]
|
||||
)
|
||||
),
|
||||
PluginStep(
|
||||
pod_egress_shaping,
|
||||
[
|
||||
"error"
|
||||
]
|
||||
)
|
||||
]
|
||||
)
|
||||
|
||||
|
||||
def run(scenarios: List[str], kubeconfig_path: str, kraken_config: str, failed_post_scenarios: List[str], wait_duration: int) -> List[str]:
|
||||
def run(scenarios: List[str], kubeconfig_path: str, kraken_config: str, failed_post_scenarios: List[str], wait_duration: int, telemetry: KrknTelemetry) -> (List[str], list[ScenarioTelemetry]):
|
||||
scenario_telemetries: list[ScenarioTelemetry] = []
|
||||
for scenario in scenarios:
|
||||
scenario_telemetry = ScenarioTelemetry()
|
||||
scenario_telemetry.scenario = scenario
|
||||
scenario_telemetry.startTimeStamp = time.time()
|
||||
telemetry.set_parameters_base64(scenario_telemetry, scenario)
|
||||
logging.info('scenario '+ str(scenario))
|
||||
try:
|
||||
PLUGINS.run(scenario, kubeconfig_path, kraken_config)
|
||||
except Exception as e:
|
||||
scenario_telemetry.exitStatus = 1
|
||||
failed_post_scenarios.append(scenario)
|
||||
logging.error("Error while running {}: {}".format(scenario, e))
|
||||
return failed_post_scenarios
|
||||
logging.info("Waiting for the specified duration: %s" % (wait_duration))
|
||||
time.sleep(wait_duration)
|
||||
telemetry.log_exception(scenario)
|
||||
else:
|
||||
scenario_telemetry.exitStatus = 0
|
||||
logging.info("Waiting for the specified duration: %s" % (wait_duration))
|
||||
time.sleep(wait_duration)
|
||||
scenario_telemetries.append(scenario_telemetry)
|
||||
scenario_telemetry.endTimeStamp = time.time()
|
||||
|
||||
return failed_post_scenarios
|
||||
return failed_post_scenarios, scenario_telemetries
|
||||
|
||||
File diff suppressed because it is too large
Load Diff
@@ -10,7 +10,7 @@ import time
|
||||
import yaml
|
||||
import sys
|
||||
import random
|
||||
|
||||
from krkn_lib_kubernetes import ScenarioTelemetry, KrknTelemetry
|
||||
|
||||
# Run pod based scenarios
|
||||
def run(kubeconfig_path, scenarios_list, config, failed_post_scenarios, wait_duration):
|
||||
@@ -67,8 +67,22 @@ def run(kubeconfig_path, scenarios_list, config, failed_post_scenarios, wait_dur
|
||||
return failed_post_scenarios
|
||||
|
||||
# krkn_lib_kubernetes
|
||||
def container_run(kubeconfig_path, scenarios_list, config, failed_post_scenarios, wait_duration, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
|
||||
def container_run(kubeconfig_path,
|
||||
scenarios_list,
|
||||
config,
|
||||
failed_post_scenarios,
|
||||
wait_duration,
|
||||
kubecli: krkn_lib_kubernetes.KrknLibKubernetes,
|
||||
telemetry: KrknTelemetry) -> (list[str], list[ScenarioTelemetry]):
|
||||
|
||||
failed_scenarios = []
|
||||
scenario_telemetries: list[ScenarioTelemetry] = []
|
||||
|
||||
for container_scenario_config in scenarios_list:
|
||||
scenario_telemetry = ScenarioTelemetry()
|
||||
scenario_telemetry.scenario = container_scenario_config[0]
|
||||
scenario_telemetry.startTimeStamp = time.time()
|
||||
telemetry.set_parameters_base64(scenario_telemetry, container_scenario_config[0])
|
||||
if len(container_scenario_config) > 1:
|
||||
pre_action_output = post_actions.run(kubeconfig_path, container_scenario_config[1])
|
||||
else:
|
||||
@@ -78,30 +92,41 @@ def container_run(kubeconfig_path, scenarios_list, config, failed_post_scenarios
|
||||
for cont_scenario in cont_scenario_config["scenarios"]:
|
||||
# capture start time
|
||||
start_time = int(time.time())
|
||||
killed_containers = container_killing_in_pod(cont_scenario, kubecli)
|
||||
|
||||
if len(container_scenario_config) > 1:
|
||||
try:
|
||||
try:
|
||||
killed_containers = container_killing_in_pod(cont_scenario, kubecli)
|
||||
if len(container_scenario_config) > 1:
|
||||
failed_post_scenarios = post_actions.check_recovery(
|
||||
kubeconfig_path, container_scenario_config, failed_post_scenarios, pre_action_output
|
||||
kubeconfig_path,
|
||||
container_scenario_config,
|
||||
failed_post_scenarios,
|
||||
pre_action_output
|
||||
)
|
||||
except Exception as e:
|
||||
logging.error("Failed to run post action checks: %s" % e)
|
||||
sys.exit(1)
|
||||
else:
|
||||
failed_post_scenarios = check_failed_containers(
|
||||
killed_containers, cont_scenario.get("retry_wait", 120), kubecli
|
||||
)
|
||||
|
||||
logging.info("Waiting for the specified duration: %s" % (wait_duration))
|
||||
time.sleep(wait_duration)
|
||||
|
||||
# capture end time
|
||||
end_time = int(time.time())
|
||||
|
||||
# publish cerberus status
|
||||
cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time)
|
||||
except (RuntimeError, Exception):
|
||||
failed_scenarios.append(container_scenario_config[0])
|
||||
telemetry.log_exception(container_scenario_config[0])
|
||||
scenario_telemetry.exitStatus = 1
|
||||
# removed_exit
|
||||
# sys.exit(1)
|
||||
else:
|
||||
failed_post_scenarios = check_failed_containers(
|
||||
killed_containers, cont_scenario.get("retry_wait", 120), kubecli
|
||||
)
|
||||
scenario_telemetry.exitStatus = 0
|
||||
scenario_telemetry.endTimeStamp = time.time()
|
||||
scenario_telemetries.append(scenario_telemetry)
|
||||
|
||||
logging.info("Waiting for the specified duration: %s" % (wait_duration))
|
||||
time.sleep(wait_duration)
|
||||
return failed_scenarios, scenario_telemetries
|
||||
|
||||
# capture end time
|
||||
end_time = int(time.time())
|
||||
|
||||
# publish cerberus status
|
||||
cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time)
|
||||
logging.info("")
|
||||
|
||||
|
||||
def container_killing_in_pod(cont_scenario, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
|
||||
@@ -114,7 +139,9 @@ def container_killing_in_pod(cont_scenario, kubecli: krkn_lib_kubernetes.KrknLib
|
||||
kill_count = cont_scenario.get("count", 1)
|
||||
if type(pod_names) != list:
|
||||
logging.error("Please make sure your pod_names are in a list format")
|
||||
sys.exit(1)
|
||||
# removed_exit
|
||||
# sys.exit(1)
|
||||
raise RuntimeError()
|
||||
if len(pod_names) == 0:
|
||||
if namespace == "*":
|
||||
# returns double array of pod name and namespace
|
||||
@@ -126,7 +153,9 @@ def container_killing_in_pod(cont_scenario, kubecli: krkn_lib_kubernetes.KrknLib
|
||||
if namespace == "*":
|
||||
logging.error("You must specify the namespace to kill a container in a specific pod")
|
||||
logging.error("Scenario " + scenario_name + " failed")
|
||||
sys.exit(1)
|
||||
# removed_exit
|
||||
# sys.exit(1)
|
||||
raise RuntimeError()
|
||||
pods = pod_names
|
||||
# get container and pod name
|
||||
container_pod_list = []
|
||||
@@ -147,7 +176,9 @@ def container_killing_in_pod(cont_scenario, kubecli: krkn_lib_kubernetes.KrknLib
|
||||
if len(container_pod_list) == 0:
|
||||
logging.error("Trying to kill more containers than were found, try lowering kill count")
|
||||
logging.error("Scenario " + scenario_name + " failed")
|
||||
sys.exit(1)
|
||||
# removed_exit
|
||||
# sys.exit(1)
|
||||
raise RuntimeError()
|
||||
selected_container_pod = container_pod_list[random.randint(0, len(container_pod_list) - 1)]
|
||||
for c_name in selected_container_pod[2]:
|
||||
if container_name != "":
|
||||
@@ -178,6 +209,7 @@ def retry_container_killing(kill_action, podname, namespace, container_name, kub
|
||||
time.sleep(2)
|
||||
continue
|
||||
else:
|
||||
logging.warning(response)
|
||||
continue
|
||||
|
||||
|
||||
|
||||
@@ -7,151 +7,233 @@ import krkn_lib_kubernetes
|
||||
import yaml
|
||||
|
||||
from ..cerberus import setup as cerberus
|
||||
|
||||
from krkn_lib_kubernetes import ScenarioTelemetry, KrknTelemetry
|
||||
|
||||
# krkn_lib_kubernetes
|
||||
def run(scenarios_list, config, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
|
||||
def run(scenarios_list, config, kubecli: krkn_lib_kubernetes.KrknLibKubernetes, telemetry: KrknTelemetry) -> (list[str], list[ScenarioTelemetry]):
|
||||
"""
|
||||
Reads the scenario config and creates a temp file to fill up the PVC
|
||||
"""
|
||||
failed_post_scenarios = ""
|
||||
scenario_telemetries: list[ScenarioTelemetry] = []
|
||||
failed_scenarios = []
|
||||
for app_config in scenarios_list:
|
||||
if len(app_config) > 1:
|
||||
with open(app_config, "r") as f:
|
||||
config_yaml = yaml.full_load(f)
|
||||
scenario_config = config_yaml["pvc_scenario"]
|
||||
pvc_name = scenario_config.get("pvc_name", "")
|
||||
pod_name = scenario_config.get("pod_name", "")
|
||||
namespace = scenario_config.get("namespace", "")
|
||||
target_fill_percentage = scenario_config.get(
|
||||
"fill_percentage", "50"
|
||||
)
|
||||
duration = scenario_config.get("duration", 60)
|
||||
scenario_telemetry = ScenarioTelemetry()
|
||||
scenario_telemetry.scenario = app_config
|
||||
scenario_telemetry.startTimeStamp = time.time()
|
||||
telemetry.set_parameters_base64(scenario_telemetry, app_config)
|
||||
try:
|
||||
if len(app_config) > 1:
|
||||
with open(app_config, "r") as f:
|
||||
config_yaml = yaml.full_load(f)
|
||||
scenario_config = config_yaml["pvc_scenario"]
|
||||
pvc_name = scenario_config.get("pvc_name", "")
|
||||
pod_name = scenario_config.get("pod_name", "")
|
||||
namespace = scenario_config.get("namespace", "")
|
||||
target_fill_percentage = scenario_config.get(
|
||||
"fill_percentage", "50"
|
||||
)
|
||||
duration = scenario_config.get("duration", 60)
|
||||
|
||||
logging.info(
|
||||
"Input params:\n"
|
||||
"pvc_name: '%s'\n"
|
||||
"pod_name: '%s'\n"
|
||||
"namespace: '%s'\n"
|
||||
"target_fill_percentage: '%s%%'\nduration: '%ss'"
|
||||
% (
|
||||
str(pvc_name),
|
||||
str(pod_name),
|
||||
str(namespace),
|
||||
str(target_fill_percentage),
|
||||
str(duration)
|
||||
)
|
||||
)
|
||||
|
||||
# Check input params
|
||||
if namespace is None:
|
||||
logging.error(
|
||||
"You must specify the namespace where the PVC is"
|
||||
)
|
||||
sys.exit(1)
|
||||
if pvc_name is None and pod_name is None:
|
||||
logging.error(
|
||||
"You must specify the pvc_name or the pod_name"
|
||||
)
|
||||
sys.exit(1)
|
||||
if pvc_name and pod_name:
|
||||
logging.info(
|
||||
"pod_name will be ignored, pod_name used will be "
|
||||
"a retrieved from the pod used in the pvc_name"
|
||||
"Input params:\n"
|
||||
"pvc_name: '%s'\n"
|
||||
"pod_name: '%s'\n"
|
||||
"namespace: '%s'\n"
|
||||
"target_fill_percentage: '%s%%'\nduration: '%ss'"
|
||||
% (
|
||||
str(pvc_name),
|
||||
str(pod_name),
|
||||
str(namespace),
|
||||
str(target_fill_percentage),
|
||||
str(duration)
|
||||
)
|
||||
)
|
||||
|
||||
# Get pod name
|
||||
if pvc_name:
|
||||
if pod_name:
|
||||
logging.info(
|
||||
"pod_name '%s' will be overridden with one of "
|
||||
"the pods mounted in the PVC" % (str(pod_name))
|
||||
)
|
||||
pvc = kubecli.get_pvc_info(pvc_name, namespace)
|
||||
try:
|
||||
# random generator not used for
|
||||
# security/cryptographic purposes.
|
||||
pod_name = random.choice(pvc.podNames) # nosec
|
||||
logging.info("Pod name: %s" % pod_name)
|
||||
except Exception:
|
||||
# Check input params
|
||||
if namespace is None:
|
||||
logging.error(
|
||||
"Pod associated with %s PVC, on namespace %s, "
|
||||
"not found" % (str(pvc_name), str(namespace))
|
||||
"You must specify the namespace where the PVC is"
|
||||
)
|
||||
sys.exit(1)
|
||||
|
||||
# Get volume name
|
||||
pod = kubecli.get_pod_info(name=pod_name, namespace=namespace)
|
||||
|
||||
if pod is None:
|
||||
logging.error(
|
||||
"Exiting as pod '%s' doesn't exist "
|
||||
"in namespace '%s'" % (
|
||||
str(pod_name),
|
||||
str(namespace)
|
||||
#sys.exit(1)
|
||||
raise RuntimeError()
|
||||
if pvc_name is None and pod_name is None:
|
||||
logging.error(
|
||||
"You must specify the pvc_name or the pod_name"
|
||||
)
|
||||
# sys.exit(1)
|
||||
raise RuntimeError()
|
||||
if pvc_name and pod_name:
|
||||
logging.info(
|
||||
"pod_name will be ignored, pod_name used will be "
|
||||
"a retrieved from the pod used in the pvc_name"
|
||||
)
|
||||
)
|
||||
sys.exit(1)
|
||||
|
||||
for volume in pod.volumes:
|
||||
if volume.pvcName is not None:
|
||||
volume_name = volume.name
|
||||
pvc_name = volume.pvcName
|
||||
# Get pod name
|
||||
if pvc_name:
|
||||
if pod_name:
|
||||
logging.info(
|
||||
"pod_name '%s' will be overridden with one of "
|
||||
"the pods mounted in the PVC" % (str(pod_name))
|
||||
)
|
||||
pvc = kubecli.get_pvc_info(pvc_name, namespace)
|
||||
break
|
||||
if 'pvc' not in locals():
|
||||
logging.error(
|
||||
"Pod '%s' in namespace '%s' does not use a pvc" % (
|
||||
try:
|
||||
# random generator not used for
|
||||
# security/cryptographic purposes.
|
||||
pod_name = random.choice(pvc.podNames) # nosec
|
||||
logging.info("Pod name: %s" % pod_name)
|
||||
except Exception:
|
||||
logging.error(
|
||||
"Pod associated with %s PVC, on namespace %s, "
|
||||
"not found" % (str(pvc_name), str(namespace))
|
||||
)
|
||||
# sys.exit(1)
|
||||
raise RuntimeError()
|
||||
|
||||
# Get volume name
|
||||
pod = kubecli.get_pod_info(name=pod_name, namespace=namespace)
|
||||
|
||||
if pod is None:
|
||||
logging.error(
|
||||
"Exiting as pod '%s' doesn't exist "
|
||||
"in namespace '%s'" % (
|
||||
str(pod_name),
|
||||
str(namespace)
|
||||
)
|
||||
)
|
||||
# sys.exit(1)
|
||||
raise RuntimeError()
|
||||
|
||||
for volume in pod.volumes:
|
||||
if volume.pvcName is not None:
|
||||
volume_name = volume.name
|
||||
pvc_name = volume.pvcName
|
||||
pvc = kubecli.get_pvc_info(pvc_name, namespace)
|
||||
break
|
||||
if 'pvc' not in locals():
|
||||
logging.error(
|
||||
"Pod '%s' in namespace '%s' does not use a pvc" % (
|
||||
str(pod_name),
|
||||
str(namespace)
|
||||
)
|
||||
)
|
||||
# sys.exit(1)
|
||||
raise RuntimeError()
|
||||
logging.info("Volume name: %s" % volume_name)
|
||||
logging.info("PVC name: %s" % pvc_name)
|
||||
|
||||
# Get container name and mount path
|
||||
for container in pod.containers:
|
||||
for vol in container.volumeMounts:
|
||||
if vol.name == volume_name:
|
||||
mount_path = vol.mountPath
|
||||
container_name = container.name
|
||||
break
|
||||
logging.info("Container path: %s" % container_name)
|
||||
logging.info("Mount path: %s" % mount_path)
|
||||
|
||||
# Get PVC capacity and used bytes
|
||||
command = "df %s -B 1024 | sed 1d" % (str(mount_path))
|
||||
command_output = (
|
||||
kubecli.exec_cmd_in_pod(
|
||||
command,
|
||||
pod_name,
|
||||
namespace,
|
||||
container_name,
|
||||
"sh"
|
||||
)
|
||||
).split()
|
||||
pvc_used_kb = int(command_output[2])
|
||||
pvc_capacity_kb = pvc_used_kb + int(command_output[3])
|
||||
logging.info("PVC used: %s KB" % pvc_used_kb)
|
||||
logging.info("PVC capacity: %s KB" % pvc_capacity_kb)
|
||||
|
||||
# Check valid fill percentage
|
||||
current_fill_percentage = pvc_used_kb / pvc_capacity_kb
|
||||
if not (
|
||||
current_fill_percentage * 100
|
||||
< float(target_fill_percentage)
|
||||
<= 99
|
||||
):
|
||||
logging.error(
|
||||
"Target fill percentage (%.2f%%) is lower than "
|
||||
"current fill percentage (%.2f%%) "
|
||||
"or higher than 99%%" % (
|
||||
target_fill_percentage,
|
||||
current_fill_percentage * 100
|
||||
)
|
||||
)
|
||||
# sys.exit(1)
|
||||
raise RuntimeError()
|
||||
|
||||
# Calculate file size
|
||||
file_size_kb = int(
|
||||
(
|
||||
float(
|
||||
target_fill_percentage / 100
|
||||
) * float(pvc_capacity_kb)
|
||||
) - float(pvc_used_kb)
|
||||
)
|
||||
logging.debug("File size: %s KB" % file_size_kb)
|
||||
|
||||
file_name = "kraken.tmp"
|
||||
logging.info(
|
||||
"Creating %s file, %s KB size, in pod %s at %s (ns %s)"
|
||||
% (
|
||||
str(file_name),
|
||||
str(file_size_kb),
|
||||
str(pod_name),
|
||||
str(mount_path),
|
||||
str(namespace)
|
||||
)
|
||||
)
|
||||
sys.exit(1)
|
||||
logging.info("Volume name: %s" % volume_name)
|
||||
logging.info("PVC name: %s" % pvc_name)
|
||||
|
||||
# Get container name and mount path
|
||||
for container in pod.containers:
|
||||
for vol in container.volumeMounts:
|
||||
if vol.name == volume_name:
|
||||
mount_path = vol.mountPath
|
||||
container_name = container.name
|
||||
break
|
||||
logging.info("Container path: %s" % container_name)
|
||||
logging.info("Mount path: %s" % mount_path)
|
||||
|
||||
# Get PVC capacity and used bytes
|
||||
command = "df %s -B 1024 | sed 1d" % (str(mount_path))
|
||||
command_output = (
|
||||
start_time = int(time.time())
|
||||
# Create temp file in the PVC
|
||||
full_path = "%s/%s" % (str(mount_path), str(file_name))
|
||||
command = "fallocate -l $((%s*1024)) %s" % (
|
||||
str(file_size_kb),
|
||||
str(full_path)
|
||||
)
|
||||
logging.debug(
|
||||
"Create temp file in the PVC command:\n %s" % command
|
||||
)
|
||||
kubecli.exec_cmd_in_pod(
|
||||
command,
|
||||
pod_name,
|
||||
namespace,
|
||||
container_name,
|
||||
"sh"
|
||||
)
|
||||
).split()
|
||||
pvc_used_kb = int(command_output[2])
|
||||
pvc_capacity_kb = pvc_used_kb + int(command_output[3])
|
||||
logging.info("PVC used: %s KB" % pvc_used_kb)
|
||||
logging.info("PVC capacity: %s KB" % pvc_capacity_kb)
|
||||
|
||||
# Check valid fill percentage
|
||||
current_fill_percentage = pvc_used_kb / pvc_capacity_kb
|
||||
if not (
|
||||
current_fill_percentage * 100
|
||||
< float(target_fill_percentage)
|
||||
<= 99
|
||||
):
|
||||
logging.error(
|
||||
"Target fill percentage (%.2f%%) is lower than "
|
||||
"current fill percentage (%.2f%%) "
|
||||
"or higher than 99%%" % (
|
||||
target_fill_percentage,
|
||||
current_fill_percentage * 100
|
||||
)
|
||||
# Check if file is created
|
||||
command = "ls -lh %s" % (str(mount_path))
|
||||
logging.debug("Check file is created command:\n %s" % command)
|
||||
response = kubecli.exec_cmd_in_pod(
|
||||
command, pod_name, namespace, container_name, "sh"
|
||||
)
|
||||
sys.exit(1)
|
||||
logging.info("\n" + str(response))
|
||||
if str(file_name).lower() in str(response).lower():
|
||||
logging.info(
|
||||
"%s file successfully created" % (str(full_path))
|
||||
)
|
||||
else:
|
||||
logging.error(
|
||||
"Failed to create tmp file with %s size" % (
|
||||
str(file_size_kb)
|
||||
)
|
||||
)
|
||||
remove_temp_file(
|
||||
file_name,
|
||||
full_path,
|
||||
pod_name,
|
||||
namespace,
|
||||
container_name,
|
||||
mount_path,
|
||||
file_size_kb,
|
||||
kubecli
|
||||
)
|
||||
# sys.exit(1)
|
||||
raise RuntimeError()
|
||||
|
||||
# Calculate file size
|
||||
file_size_kb = int(
|
||||
@@ -186,26 +268,25 @@ def run(scenarios_list, config, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
|
||||
"Create temp file in the PVC command:\n %s" % command
|
||||
)
|
||||
kubecli.exec_cmd_in_pod(
|
||||
command, pod_name, namespace, container_name, "sh"
|
||||
command, pod_name, namespace, container_name
|
||||
)
|
||||
|
||||
# Check if file is created
|
||||
command = "ls -lh %s" % (str(mount_path))
|
||||
logging.debug("Check file is created command:\n %s" % command)
|
||||
response = kubecli.exec_cmd_in_pod(
|
||||
command, pod_name, namespace, container_name, "sh"
|
||||
command, pod_name, namespace, container_name
|
||||
)
|
||||
logging.info("\n" + str(response))
|
||||
if str(file_name).lower() in str(response).lower():
|
||||
logging.info(
|
||||
"%s file successfully created" % (str(full_path))
|
||||
)
|
||||
else:
|
||||
logging.error(
|
||||
"Failed to create tmp file with %s size" % (
|
||||
str(file_size_kb)
|
||||
"Waiting for the specified duration in the config: %ss" % (
|
||||
duration
|
||||
)
|
||||
)
|
||||
time.sleep(duration)
|
||||
logging.info("Finish waiting")
|
||||
|
||||
remove_temp_file(
|
||||
file_name,
|
||||
full_path,
|
||||
@@ -216,35 +297,25 @@ def run(scenarios_list, config, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
|
||||
file_size_kb,
|
||||
kubecli
|
||||
)
|
||||
sys.exit(1)
|
||||
|
||||
# Wait for the specified duration
|
||||
logging.info(
|
||||
"Waiting for the specified duration in the config: %ss" % (
|
||||
duration
|
||||
end_time = int(time.time())
|
||||
cerberus.publish_kraken_status(
|
||||
config,
|
||||
failed_post_scenarios,
|
||||
start_time,
|
||||
end_time
|
||||
)
|
||||
)
|
||||
time.sleep(duration)
|
||||
logging.info("Finish waiting")
|
||||
except (RuntimeError, Exception):
|
||||
scenario_telemetry.exitStatus = 1
|
||||
failed_scenarios.append(app_config)
|
||||
telemetry.log_exception(app_config)
|
||||
else:
|
||||
scenario_telemetry.exitStatus = 0
|
||||
scenario_telemetries.append(scenario_telemetry)
|
||||
|
||||
return failed_scenarios, scenario_telemetries
|
||||
|
||||
remove_temp_file(
|
||||
file_name,
|
||||
full_path,
|
||||
pod_name,
|
||||
namespace,
|
||||
container_name,
|
||||
mount_path,
|
||||
file_size_kb,
|
||||
kubecli
|
||||
)
|
||||
|
||||
end_time = int(time.time())
|
||||
cerberus.publish_kraken_status(
|
||||
config,
|
||||
failed_post_scenarios,
|
||||
start_time,
|
||||
end_time
|
||||
)
|
||||
|
||||
|
||||
# krkn_lib_kubernetes
|
||||
@@ -260,15 +331,14 @@ def remove_temp_file(
|
||||
):
|
||||
command = "rm -f %s" % (str(full_path))
|
||||
logging.debug("Remove temp file from the PVC command:\n %s" % command)
|
||||
kubecli.exec_cmd_in_pod(command, pod_name, namespace, container_name, "sh")
|
||||
kubecli.exec_cmd_in_pod(command, pod_name, namespace, container_name)
|
||||
command = "ls -lh %s" % (str(mount_path))
|
||||
logging.debug("Check temp file is removed command:\n %s" % command)
|
||||
response = kubecli.exec_cmd_in_pod(
|
||||
command,
|
||||
pod_name,
|
||||
namespace,
|
||||
container_name,
|
||||
"sh"
|
||||
container_name
|
||||
)
|
||||
logging.info("\n" + str(response))
|
||||
if not (str(file_name).lower() in str(response).lower()):
|
||||
@@ -277,7 +347,7 @@ def remove_temp_file(
|
||||
logging.error(
|
||||
"Failed to delete tmp file with %s size" % (str(file_size_kb))
|
||||
)
|
||||
sys.exit(1)
|
||||
raise RuntimeError()
|
||||
|
||||
|
||||
def toKbytes(value):
|
||||
@@ -286,7 +356,7 @@ def toKbytes(value):
|
||||
"PVC capacity %s does not match expression "
|
||||
"regexp '^[0-9]+[K|M|G|T]i$'"
|
||||
)
|
||||
sys.exit(1)
|
||||
raise RuntimeError()
|
||||
unit = {"K": 0, "M": 1, "G": 2, "T": 3}
|
||||
base = 1024 if ("i" in value) else 1000
|
||||
exp = unit[value[-2:-1]]
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
#!/usr/bin/env python
|
||||
|
||||
import os
|
||||
import sys
|
||||
import yaml
|
||||
import logging
|
||||
@@ -13,7 +13,7 @@ from ..node_actions.aws_node_scenarios import AWS
|
||||
from ..node_actions.openstack_node_scenarios import OPENSTACKCLOUD
|
||||
from ..node_actions.az_node_scenarios import Azure
|
||||
from ..node_actions.gcp_node_scenarios import GCP
|
||||
|
||||
from krkn_lib_kubernetes import ScenarioTelemetry, KrknTelemetry
|
||||
|
||||
def multiprocess_nodes(cloud_object_function, nodes):
|
||||
try:
|
||||
@@ -59,7 +59,9 @@ def cluster_shut_down(shut_down_config, kubecli: krkn_lib_kubernetes.KrknLibKube
|
||||
"Cloud type %s is not currently supported for cluster shut down" %
|
||||
cloud_type
|
||||
)
|
||||
sys.exit(1)
|
||||
# removed_exit
|
||||
# sys.exit(1)
|
||||
raise RuntimeError()
|
||||
|
||||
nodes = kubecli.list_nodes()
|
||||
node_id = []
|
||||
@@ -128,9 +130,16 @@ def cluster_shut_down(shut_down_config, kubecli: krkn_lib_kubernetes.KrknLibKube
|
||||
|
||||
# krkn_lib_kubernetes
|
||||
|
||||
def run(scenarios_list, config, wait_duration, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
|
||||
def run(scenarios_list, config, wait_duration, kubecli: krkn_lib_kubernetes.KrknLibKubernetes, telemetry: KrknTelemetry) -> (list[str], list[ScenarioTelemetry]):
|
||||
failed_post_scenarios = []
|
||||
failed_scenarios = []
|
||||
scenario_telemetries: list[ScenarioTelemetry] = []
|
||||
|
||||
for shut_down_config in scenarios_list:
|
||||
scenario_telemetry = ScenarioTelemetry()
|
||||
scenario_telemetry.scenario = shut_down_config
|
||||
scenario_telemetry.startTimeStamp = time.time()
|
||||
telemetry.set_parameters_base64(scenario_telemetry, shut_down_config[0])
|
||||
if len(shut_down_config) > 1:
|
||||
pre_action_output = post_actions.run("", shut_down_config[1])
|
||||
else:
|
||||
@@ -140,18 +149,32 @@ def run(scenarios_list, config, wait_duration, kubecli: krkn_lib_kubernetes.Krkn
|
||||
shut_down_config_scenario = \
|
||||
shut_down_config_yaml["cluster_shut_down_scenario"]
|
||||
start_time = int(time.time())
|
||||
cluster_shut_down(shut_down_config_scenario, kubecli)
|
||||
logging.info(
|
||||
"Waiting for the specified duration: %s" % (wait_duration)
|
||||
)
|
||||
time.sleep(wait_duration)
|
||||
failed_post_scenarios = post_actions.check_recovery(
|
||||
"", shut_down_config, failed_post_scenarios, pre_action_output
|
||||
)
|
||||
end_time = int(time.time())
|
||||
cerberus.publish_kraken_status(
|
||||
config,
|
||||
failed_post_scenarios,
|
||||
start_time,
|
||||
end_time
|
||||
)
|
||||
try:
|
||||
cluster_shut_down(shut_down_config_scenario, kubecli)
|
||||
logging.info(
|
||||
"Waiting for the specified duration: %s" % (wait_duration)
|
||||
)
|
||||
time.sleep(wait_duration)
|
||||
failed_post_scenarios = post_actions.check_recovery(
|
||||
"", shut_down_config, failed_post_scenarios, pre_action_output
|
||||
)
|
||||
end_time = int(time.time())
|
||||
cerberus.publish_kraken_status(
|
||||
config,
|
||||
failed_post_scenarios,
|
||||
start_time,
|
||||
end_time
|
||||
)
|
||||
|
||||
except (RuntimeError, Exception):
|
||||
telemetry.log_exception(shut_down_config[0])
|
||||
failed_scenarios.append(shut_down_config[0])
|
||||
scenario_telemetry.exitStatus = 1
|
||||
else:
|
||||
scenario_telemetry.exitStatus = 0
|
||||
|
||||
scenario_telemetry.endTimeStamp = time.time()
|
||||
scenario_telemetries.append(scenario_telemetry)
|
||||
|
||||
return failed_scenarios, scenario_telemetries
|
||||
|
||||
|
||||
@@ -8,6 +8,7 @@ import random
|
||||
import krkn_lib_kubernetes
|
||||
from ..cerberus import setup as cerberus
|
||||
from ..invoke import command as runcommand
|
||||
from krkn_lib_kubernetes import ScenarioTelemetry, KrknTelemetry
|
||||
|
||||
# krkn_lib_kubernetes
|
||||
def pod_exec(pod_name, command, namespace, container_name, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
|
||||
@@ -93,7 +94,9 @@ def skew_time(scenario, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
|
||||
for name in scenario["object_name"]:
|
||||
if "namespace" not in scenario.keys():
|
||||
logging.error("Need to set namespace when using pod name")
|
||||
sys.exit(1)
|
||||
# removed_exit
|
||||
# sys.exit(1)
|
||||
raise RuntimeError()
|
||||
pod_names.append([name, scenario["namespace"]])
|
||||
elif "namespace" in scenario.keys() and scenario["namespace"]:
|
||||
if "label_selector" not in scenario.keys():
|
||||
@@ -127,7 +130,9 @@ def skew_time(scenario, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
|
||||
"Cannot find pods matching the namespace/label_selector, "
|
||||
"please check"
|
||||
)
|
||||
sys.exit(1)
|
||||
# removed_exit
|
||||
# sys.exit(1)
|
||||
raise RuntimeError()
|
||||
pod_counter = 0
|
||||
for pod in pod_names:
|
||||
if len(pod) > 1:
|
||||
@@ -152,7 +157,9 @@ def skew_time(scenario, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
|
||||
"in pod %s in namespace %s"
|
||||
% (selected_container_name, pod[0], pod[1])
|
||||
)
|
||||
sys.exit(1)
|
||||
# removed_exit
|
||||
# sys.exit(1)
|
||||
raise RuntimeError()
|
||||
pod_names[pod_counter].append(selected_container_name)
|
||||
else:
|
||||
selected_container_name = get_container_name(
|
||||
@@ -178,7 +185,9 @@ def skew_time(scenario, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
|
||||
scenario["namespace"]
|
||||
)
|
||||
)
|
||||
sys.exit(1)
|
||||
# removed_exit
|
||||
# sys.exit(1)
|
||||
raise RuntimeError()
|
||||
pod_names[pod_counter].append(selected_container_name)
|
||||
logging.info("Reset date/time on pod " + str(pod[0]))
|
||||
pod_counter += 1
|
||||
@@ -299,24 +308,41 @@ def check_date_time(object_type, names, kubecli: krkn_lib_kubernetes.KrknLibKube
|
||||
|
||||
|
||||
# krkn_lib_kubernetes
|
||||
def run(scenarios_list, config, wait_duration, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
|
||||
def run(scenarios_list, config, wait_duration, kubecli: krkn_lib_kubernetes.KrknLibKubernetes, telemetry: KrknTelemetry) -> (list[str], list[ScenarioTelemetry]):
|
||||
failed_scenarios = []
|
||||
scenario_telemetries: list[ScenarioTelemetry] = []
|
||||
for time_scenario_config in scenarios_list:
|
||||
with open(time_scenario_config, "r") as f:
|
||||
scenario_config = yaml.full_load(f)
|
||||
for time_scenario in scenario_config["time_scenarios"]:
|
||||
start_time = int(time.time())
|
||||
object_type, object_names = skew_time(time_scenario, kubecli)
|
||||
not_reset = check_date_time(object_type, object_names, kubecli)
|
||||
if len(not_reset) > 0:
|
||||
logging.info("Object times were not reset")
|
||||
logging.info(
|
||||
"Waiting for the specified duration: %s" % (wait_duration)
|
||||
)
|
||||
time.sleep(wait_duration)
|
||||
end_time = int(time.time())
|
||||
cerberus.publish_kraken_status(
|
||||
config,
|
||||
not_reset,
|
||||
start_time,
|
||||
end_time
|
||||
)
|
||||
scenario_telemetry = ScenarioTelemetry()
|
||||
scenario_telemetry.scenario = time_scenario_config
|
||||
scenario_telemetry.startTimeStamp = time.time()
|
||||
telemetry.set_parameters_base64(scenario_telemetry, time_scenario_config)
|
||||
try:
|
||||
with open(time_scenario_config, "r") as f:
|
||||
scenario_config = yaml.full_load(f)
|
||||
for time_scenario in scenario_config["time_scenarios"]:
|
||||
start_time = int(time.time())
|
||||
object_type, object_names = skew_time(time_scenario, kubecli)
|
||||
not_reset = check_date_time(object_type, object_names, kubecli)
|
||||
if len(not_reset) > 0:
|
||||
logging.info("Object times were not reset")
|
||||
logging.info(
|
||||
"Waiting for the specified duration: %s" % (wait_duration)
|
||||
)
|
||||
time.sleep(wait_duration)
|
||||
end_time = int(time.time())
|
||||
cerberus.publish_kraken_status(
|
||||
config,
|
||||
not_reset,
|
||||
start_time,
|
||||
end_time
|
||||
)
|
||||
except (RuntimeError, Exception):
|
||||
scenario_telemetry.exitStatus = 1
|
||||
telemetry.log_exception(time_scenario_config)
|
||||
failed_scenarios.append(time_scenario_config)
|
||||
else:
|
||||
scenario_telemetry.exitStatus = 0
|
||||
scenario_telemetry.endTimeStamp = time.time()
|
||||
scenario_telemetries.append(scenario_telemetry)
|
||||
|
||||
return failed_scenarios, scenario_telemetries
|
||||
|
||||
@@ -1,100 +1,119 @@
|
||||
import yaml
|
||||
import sys
|
||||
import logging
|
||||
import time
|
||||
from ..node_actions.aws_node_scenarios import AWS
|
||||
from ..cerberus import setup as cerberus
|
||||
from krkn_lib_kubernetes import ScenarioTelemetry, KrknTelemetry
|
||||
|
||||
|
||||
def run(scenarios_list, config, wait_duration):
|
||||
def run(scenarios_list, config, wait_duration, telemetry: KrknTelemetry) -> (list[str], list[ScenarioTelemetry]) :
|
||||
"""
|
||||
filters the subnet of interest and applies the network acl
|
||||
to create zone outage
|
||||
"""
|
||||
failed_post_scenarios = ""
|
||||
scenario_telemetries: list[ScenarioTelemetry] = []
|
||||
failed_scenarios = []
|
||||
|
||||
for zone_outage_config in scenarios_list:
|
||||
if len(zone_outage_config) > 1:
|
||||
with open(zone_outage_config, "r") as f:
|
||||
zone_outage_config_yaml = yaml.full_load(f)
|
||||
scenario_config = zone_outage_config_yaml["zone_outage"]
|
||||
vpc_id = scenario_config["vpc_id"]
|
||||
subnet_ids = scenario_config["subnet_id"]
|
||||
duration = scenario_config["duration"]
|
||||
cloud_type = scenario_config["cloud_type"]
|
||||
ids = {}
|
||||
acl_ids_created = []
|
||||
scenario_telemetry = ScenarioTelemetry()
|
||||
scenario_telemetry.scenario = zone_outage_config
|
||||
scenario_telemetry.startTimeStamp = time.time()
|
||||
telemetry.set_parameters_base64(scenario_telemetry, zone_outage_config)
|
||||
try:
|
||||
if len(zone_outage_config) > 1:
|
||||
with open(zone_outage_config, "r") as f:
|
||||
zone_outage_config_yaml = yaml.full_load(f)
|
||||
scenario_config = zone_outage_config_yaml["zone_outage"]
|
||||
vpc_id = scenario_config["vpc_id"]
|
||||
subnet_ids = scenario_config["subnet_id"]
|
||||
duration = scenario_config["duration"]
|
||||
cloud_type = scenario_config["cloud_type"]
|
||||
ids = {}
|
||||
acl_ids_created = []
|
||||
|
||||
if cloud_type.lower() == "aws":
|
||||
cloud_object = AWS()
|
||||
else:
|
||||
logging.error(
|
||||
"Cloud type %s is not currently supported for "
|
||||
"zone outage scenarios"
|
||||
% cloud_type
|
||||
)
|
||||
sys.exit(1)
|
||||
|
||||
start_time = int(time.time())
|
||||
|
||||
for subnet_id in subnet_ids:
|
||||
logging.info("Targeting subnet_id")
|
||||
network_association_ids = []
|
||||
associations, original_acl_id = \
|
||||
cloud_object.describe_network_acls(vpc_id, subnet_id)
|
||||
for entry in associations:
|
||||
if entry["SubnetId"] == subnet_id:
|
||||
network_association_ids.append(
|
||||
entry["NetworkAclAssociationId"]
|
||||
)
|
||||
logging.info(
|
||||
"Network association ids associated with "
|
||||
"the subnet %s: %s"
|
||||
% (subnet_id, network_association_ids)
|
||||
)
|
||||
acl_id = cloud_object.create_default_network_acl(vpc_id)
|
||||
new_association_id = \
|
||||
cloud_object.replace_network_acl_association(
|
||||
network_association_ids[0], acl_id
|
||||
if cloud_type.lower() == "aws":
|
||||
cloud_object = AWS()
|
||||
else:
|
||||
logging.error(
|
||||
"Cloud type %s is not currently supported for "
|
||||
"zone outage scenarios"
|
||||
% cloud_type
|
||||
)
|
||||
# removed_exit
|
||||
# sys.exit(1)
|
||||
raise RuntimeError()
|
||||
|
||||
# capture the orginal_acl_id, created_acl_id and
|
||||
# new association_id to use during the recovery
|
||||
ids[new_association_id] = original_acl_id
|
||||
acl_ids_created.append(acl_id)
|
||||
start_time = int(time.time())
|
||||
|
||||
# wait for the specified duration
|
||||
logging.info(
|
||||
"Waiting for the specified duration "
|
||||
"in the config: %s" % (duration)
|
||||
)
|
||||
time.sleep(duration)
|
||||
for subnet_id in subnet_ids:
|
||||
logging.info("Targeting subnet_id")
|
||||
network_association_ids = []
|
||||
associations, original_acl_id = \
|
||||
cloud_object.describe_network_acls(vpc_id, subnet_id)
|
||||
for entry in associations:
|
||||
if entry["SubnetId"] == subnet_id:
|
||||
network_association_ids.append(
|
||||
entry["NetworkAclAssociationId"]
|
||||
)
|
||||
logging.info(
|
||||
"Network association ids associated with "
|
||||
"the subnet %s: %s"
|
||||
% (subnet_id, network_association_ids)
|
||||
)
|
||||
acl_id = cloud_object.create_default_network_acl(vpc_id)
|
||||
new_association_id = \
|
||||
cloud_object.replace_network_acl_association(
|
||||
network_association_ids[0], acl_id
|
||||
)
|
||||
|
||||
# replace the applied acl with the previous acl in use
|
||||
for new_association_id, original_acl_id in ids.items():
|
||||
cloud_object.replace_network_acl_association(
|
||||
new_association_id,
|
||||
original_acl_id
|
||||
# capture the orginal_acl_id, created_acl_id and
|
||||
# new association_id to use during the recovery
|
||||
ids[new_association_id] = original_acl_id
|
||||
acl_ids_created.append(acl_id)
|
||||
|
||||
# wait for the specified duration
|
||||
logging.info(
|
||||
"Waiting for the specified duration "
|
||||
"in the config: %s" % (duration)
|
||||
)
|
||||
logging.info(
|
||||
"Wating for 60 seconds to make sure "
|
||||
"the changes are in place"
|
||||
)
|
||||
time.sleep(60)
|
||||
time.sleep(duration)
|
||||
|
||||
# delete the network acl created for the run
|
||||
for acl_id in acl_ids_created:
|
||||
cloud_object.delete_network_acl(acl_id)
|
||||
# replace the applied acl with the previous acl in use
|
||||
for new_association_id, original_acl_id in ids.items():
|
||||
cloud_object.replace_network_acl_association(
|
||||
new_association_id,
|
||||
original_acl_id
|
||||
)
|
||||
logging.info(
|
||||
"Wating for 60 seconds to make sure "
|
||||
"the changes are in place"
|
||||
)
|
||||
time.sleep(60)
|
||||
|
||||
logging.info(
|
||||
"End of scenario. "
|
||||
"Waiting for the specified duration: %s" % (wait_duration)
|
||||
)
|
||||
time.sleep(wait_duration)
|
||||
# delete the network acl created for the run
|
||||
for acl_id in acl_ids_created:
|
||||
cloud_object.delete_network_acl(acl_id)
|
||||
|
||||
logging.info(
|
||||
"End of scenario. "
|
||||
"Waiting for the specified duration: %s" % (wait_duration)
|
||||
)
|
||||
time.sleep(wait_duration)
|
||||
|
||||
end_time = int(time.time())
|
||||
cerberus.publish_kraken_status(
|
||||
config,
|
||||
failed_post_scenarios,
|
||||
start_time,
|
||||
end_time
|
||||
)
|
||||
except (RuntimeError, Exception):
|
||||
scenario_telemetry.exitStatus = 1
|
||||
failed_scenarios.append(zone_outage_config)
|
||||
telemetry.log_exception(zone_outage_config)
|
||||
else:
|
||||
scenario_telemetry.exitStatus = 0
|
||||
scenario_telemetry.endTimeStamp = time.time()
|
||||
scenario_telemetries.append(scenario_telemetry)
|
||||
return failed_scenarios, scenario_telemetries
|
||||
|
||||
end_time = int(time.time())
|
||||
cerberus.publish_kraken_status(
|
||||
config,
|
||||
failed_post_scenarios,
|
||||
start_time,
|
||||
end_time
|
||||
)
|
||||
|
||||
@@ -37,4 +37,4 @@ prometheus_api_client
|
||||
ibm_cloud_sdk_core
|
||||
ibm_vpc
|
||||
pytest
|
||||
krkn-lib-kubernetes > 0.1.1
|
||||
krkn-lib-kubernetes >= 0.1.3
|
||||
@@ -25,7 +25,7 @@ import kraken.arcaflow_plugin as arcaflow_plugin
|
||||
import server as server
|
||||
import kraken.prometheus.client as promcli
|
||||
from kraken import plugins
|
||||
from krkn_lib_kubernetes import KrknLibKubernetes
|
||||
from krkn_lib_kubernetes import KrknLibKubernetes, KrknTelemetry, ChaosRunTelemetry, SafeLogger
|
||||
|
||||
KUBE_BURNER_URL = (
|
||||
"https://github.com/cloud-bulldozer/kube-burner/"
|
||||
@@ -98,13 +98,34 @@ def main(cfg):
|
||||
)
|
||||
sys.exit(1)
|
||||
logging.info("Initializing client to talk to the Kubernetes cluster")
|
||||
|
||||
# Generate uuid for the run
|
||||
if run_uuid:
|
||||
logging.info(
|
||||
"Using the uuid defined by the user for the run: %s" % run_uuid
|
||||
)
|
||||
else:
|
||||
run_uuid = str(uuid.uuid4())
|
||||
logging.info("Generated a uuid for the run: %s" % run_uuid)
|
||||
|
||||
# request_id for telemetry is generated once here and used everywhere
|
||||
telemetry_request_id = f"{int(time.time())}-{run_uuid}"
|
||||
if config["telemetry"].get("run_tag"):
|
||||
telemetry_request_id = f"{telemetry_request_id}-{config['telemetry']['run_tag']}"
|
||||
telemetry_log_file = f'{config["telemetry"]["archive_path"]}/{telemetry_request_id}.log'
|
||||
safe_logger = SafeLogger(filename=telemetry_log_file)
|
||||
|
||||
try:
|
||||
kubeconfig_path
|
||||
os.environ["KUBECONFIG"] = str(kubeconfig_path)
|
||||
# krkn-lib-kubernetes init
|
||||
kubecli = KrknLibKubernetes(kubeconfig_path=kubeconfig_path)
|
||||
except NameError:
|
||||
except:
|
||||
kubecli.initialize_clients(None)
|
||||
|
||||
# KrknTelemetry init
|
||||
telemetry = KrknTelemetry(safe_logger, kubecli)
|
||||
|
||||
# find node kraken might be running on
|
||||
kubecli.find_kraken_node()
|
||||
|
||||
@@ -141,14 +162,7 @@ def main(cfg):
|
||||
if deploy_performance_dashboards:
|
||||
performance_dashboards.setup(dashboard_repo, distribution)
|
||||
|
||||
# Generate uuid for the run
|
||||
if run_uuid:
|
||||
logging.info(
|
||||
"Using the uuid defined by the user for the run: %s" % run_uuid
|
||||
)
|
||||
else:
|
||||
run_uuid = str(uuid.uuid4())
|
||||
logging.info("Generated a uuid for the run: %s" % run_uuid)
|
||||
|
||||
|
||||
# Initialize the start iteration to 0
|
||||
iteration = 0
|
||||
@@ -171,7 +185,8 @@ def main(cfg):
|
||||
# Capture the start time
|
||||
start_time = int(time.time())
|
||||
litmus_installed = False
|
||||
|
||||
chaos_telemetry = ChaosRunTelemetry()
|
||||
chaos_telemetry.run_uuid = run_uuid
|
||||
# Loop to run the chaos starts here
|
||||
while int(iteration) < iterations and run_signal != "STOP":
|
||||
# Inject chaos scenarios specified in the config
|
||||
@@ -203,36 +218,41 @@ def main(cfg):
|
||||
)
|
||||
sys.exit(1)
|
||||
elif scenario_type == "arcaflow_scenarios":
|
||||
failed_post_scenarios = arcaflow_plugin.run(
|
||||
scenarios_list, kubeconfig_path
|
||||
failed_post_scenarios, scenario_telemetries = arcaflow_plugin.run(
|
||||
scenarios_list, kubeconfig_path, telemetry
|
||||
)
|
||||
chaos_telemetry.scenarios.extend(scenario_telemetries)
|
||||
|
||||
elif scenario_type == "plugin_scenarios":
|
||||
failed_post_scenarios = plugins.run(
|
||||
failed_post_scenarios, scenario_telemetries = plugins.run(
|
||||
scenarios_list,
|
||||
kubeconfig_path,
|
||||
kraken_config,
|
||||
failed_post_scenarios,
|
||||
wait_duration,
|
||||
telemetry
|
||||
)
|
||||
chaos_telemetry.scenarios.extend(scenario_telemetries)
|
||||
# krkn_lib_kubernetes
|
||||
elif scenario_type == "container_scenarios":
|
||||
logging.info("Running container scenarios")
|
||||
failed_post_scenarios = pod_scenarios.container_run(
|
||||
failed_post_scenarios, scenario_telemetries = pod_scenarios.container_run(
|
||||
kubeconfig_path,
|
||||
scenarios_list,
|
||||
config,
|
||||
failed_post_scenarios,
|
||||
wait_duration,
|
||||
kubecli
|
||||
kubecli,
|
||||
telemetry
|
||||
)
|
||||
chaos_telemetry.scenarios.extend(scenario_telemetries)
|
||||
|
||||
# Inject node chaos scenarios specified in the config
|
||||
# krkn_lib_kubernetes
|
||||
elif scenario_type == "node_scenarios":
|
||||
logging.info("Running node scenarios")
|
||||
nodeaction.run(scenarios_list, config, wait_duration, kubecli)
|
||||
|
||||
failed_post_scenarios, scenario_telemetries = nodeaction.run(scenarios_list, config, wait_duration, kubecli, telemetry)
|
||||
chaos_telemetry.scenarios.extend(scenario_telemetries)
|
||||
# Inject managedcluster chaos scenarios specified in the config
|
||||
# krkn_lib_kubernetes
|
||||
elif scenario_type == "managedcluster_scenarios":
|
||||
@@ -247,7 +267,8 @@ def main(cfg):
|
||||
elif scenario_type == "time_scenarios":
|
||||
if distribution == "openshift":
|
||||
logging.info("Running time skew scenarios")
|
||||
time_actions.run(scenarios_list, config, wait_duration, kubecli)
|
||||
failed_post_scenarios, scenario_telemetries = time_actions.run(scenarios_list, config, wait_duration, kubecli, telemetry)
|
||||
chaos_telemetry.scenarios.extend(scenario_telemetries)
|
||||
else:
|
||||
logging.error(
|
||||
"Litmus scenarios are currently "
|
||||
@@ -297,44 +318,48 @@ def main(cfg):
|
||||
# Inject cluster shutdown scenarios
|
||||
# krkn_lib_kubernetes
|
||||
elif scenario_type == "cluster_shut_down_scenarios":
|
||||
shut_down.run(scenarios_list, config, wait_duration, kubecli)
|
||||
failed_post_scenarios, scenario_telemetries = shut_down.run(scenarios_list, config, wait_duration, kubecli, telemetry)
|
||||
chaos_telemetry.scenarios.extend(scenario_telemetries)
|
||||
|
||||
# Inject namespace chaos scenarios
|
||||
# krkn_lib_kubernetes
|
||||
elif scenario_type == "namespace_scenarios":
|
||||
logging.info("Running namespace scenarios")
|
||||
namespace_actions.run(
|
||||
failed_post_scenarios, scenario_telemetries = namespace_actions.run(
|
||||
scenarios_list,
|
||||
config,
|
||||
wait_duration,
|
||||
failed_post_scenarios,
|
||||
kubeconfig_path,
|
||||
kubecli
|
||||
kubecli,
|
||||
telemetry
|
||||
)
|
||||
chaos_telemetry.scenarios.extend(scenario_telemetries)
|
||||
|
||||
# Inject zone failures
|
||||
elif scenario_type == "zone_outages":
|
||||
logging.info("Inject zone outages")
|
||||
zone_outages.run(scenarios_list, config, wait_duration)
|
||||
|
||||
failed_post_scenarios, scenario_telemetries = zone_outages.run(scenarios_list, config, wait_duration, telemetry)
|
||||
chaos_telemetry.scenarios.extend(scenario_telemetries)
|
||||
# Application outages
|
||||
elif scenario_type == "application_outages":
|
||||
logging.info("Injecting application outage")
|
||||
application_outage.run(
|
||||
scenarios_list, config, wait_duration
|
||||
)
|
||||
failed_post_scenarios, scenario_telemetries = application_outage.run(
|
||||
scenarios_list, config, wait_duration, telemetry)
|
||||
chaos_telemetry.scenarios.extend(scenario_telemetries)
|
||||
|
||||
# PVC scenarios
|
||||
# krkn_lib_kubernetes
|
||||
elif scenario_type == "pvc_scenarios":
|
||||
logging.info("Running PVC scenario")
|
||||
pvc_scenario.run(scenarios_list, config, kubecli)
|
||||
failed_post_scenarios, scenario_telemetries = pvc_scenario.run(scenarios_list, config, kubecli, telemetry)
|
||||
chaos_telemetry.scenarios.extend(scenario_telemetries)
|
||||
|
||||
# Network scenarios
|
||||
# krkn_lib_kubernetes
|
||||
elif scenario_type == "network_chaos":
|
||||
logging.info("Running Network Chaos")
|
||||
network_chaos.run(scenarios_list, config, wait_duration, kubecli)
|
||||
failed_post_scenarios, scenario_telemetries = network_chaos.run(scenarios_list, config, wait_duration, kubecli, telemetry)
|
||||
|
||||
# Check for critical alerts when enabled
|
||||
if check_critical_alerts:
|
||||
@@ -353,6 +378,21 @@ def main(cfg):
|
||||
iteration += 1
|
||||
logging.info("")
|
||||
|
||||
# telemetry
|
||||
if config["telemetry"]["enabled"]:
|
||||
logging.info(f"telemetry data will be stored on s3 bucket folder: {telemetry_request_id}")
|
||||
logging.info(f"telemetry upload log: {safe_logger.log_file_name}")
|
||||
try:
|
||||
telemetry.send_telemetry(config["telemetry"], telemetry_request_id, chaos_telemetry)
|
||||
safe_logger.info("archives download started:")
|
||||
prometheus_archive_files = telemetry.get_ocp_prometheus_data(config["telemetry"], telemetry_request_id)
|
||||
safe_logger.info("archives upload started:")
|
||||
telemetry.put_ocp_prometheus_data(config["telemetry"], prometheus_archive_files, telemetry_request_id)
|
||||
except Exception as e:
|
||||
logging.error(f"failed to send telemetry data: {str(e)}")
|
||||
else:
|
||||
logging.info("telemetry collection disabled, skipping.")
|
||||
|
||||
# Capture the end time
|
||||
end_time = int(time.time())
|
||||
|
||||
|
||||
14
scenarios/openshift/pod_network_shaping.yml
Normal file
14
scenarios/openshift/pod_network_shaping.yml
Normal file
@@ -0,0 +1,14 @@
|
||||
# yaml-language-server: $schema=../plugin.schema.json
|
||||
- id: pod_egress_shaping
|
||||
config:
|
||||
namespace: <namespace> # Required - Namespace of the pod to which traffic shaping need to be applied
|
||||
label_selector: <label_selector> # When pod_name is not specified, pod with matching label_selector is selected for chaos scenario
|
||||
pod_name: <pod name> # When label_selector is not specified, pod matching the name will be selected for the chaos scenario
|
||||
network_params: # latency, loss and bandwidth are the three supported network parameters to alter for the chaos test
|
||||
latency: <time> # Value is a string. For example : 50ms
|
||||
loss: <fraction> # Loss is a fraction between 0 and 1. It has to be enclosed in quotes to treat it as a string. For example, '0.02%' (not 0.02%)
|
||||
bandwidth: <rate> # Value is a string. For example: 100mbit
|
||||
execution_type: <serial/parallel> # Used to specify whether you want to apply filters on interfaces one at a time or all at once. Default is 'parallel'
|
||||
instance_count: <number> # Number of pods to perform action/select that match the label selector
|
||||
wait_duration: <time_duration> # Default is 300. Ensure that it is at least about twice of test_duration
|
||||
test_duration: <time_duration> # Default is 120
|
||||
@@ -2253,6 +2253,166 @@
|
||||
"id",
|
||||
"config"
|
||||
]
|
||||
},
|
||||
{
|
||||
"type": "object",
|
||||
"title": "pod_egress_shaping Arcaflow scenarios",
|
||||
"properties": {
|
||||
"id": {
|
||||
"type": "string",
|
||||
"const": "pod_egress_shaping"
|
||||
},
|
||||
"config": {
|
||||
"$defs": {
|
||||
"EgressParams": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"namespace": {
|
||||
"type": "string",
|
||||
"minLength": 1,
|
||||
"title": "Namespace",
|
||||
"description": "Namespace of the pod to which filter need to be appliedfor details."
|
||||
},
|
||||
"kubeconfig_path": {
|
||||
"type": "string",
|
||||
"title": "Kubeconfig path",
|
||||
"description": "Kubeconfig file as string\nSee https://kubernetes.io/docs/concepts/configuration/organize-cluster-access-kubeconfig/ for details."
|
||||
},
|
||||
"pod_name": {
|
||||
"type": "string",
|
||||
"title": "Pod name",
|
||||
"description": "When label_selector is not specified, pod matching the name will beselected for the chaos scenario"
|
||||
},
|
||||
"label_selector": {
|
||||
"type": "string",
|
||||
"title": "Label selector",
|
||||
"description": "Kubernetes label selector for the target pod. When pod_name is not specified, pod with matching label_selector is selected for chaos scenario"
|
||||
},
|
||||
"kraken_config": {
|
||||
"type": "string",
|
||||
"title": "Kraken Config",
|
||||
"description": "Path to the config file of Kraken. Set this field if you wish to publish status onto Cerberus"
|
||||
},
|
||||
"test_duration": {
|
||||
"type": "integer",
|
||||
"minimum": 1,
|
||||
"default": 90,
|
||||
"title": "Test duration",
|
||||
"description": "Duration for which each step of the ingress chaos testing is to be performed."
|
||||
},
|
||||
"wait_duration": {
|
||||
"type": "integer",
|
||||
"minimum": 1,
|
||||
"default": 300,
|
||||
"title": "Wait Duration",
|
||||
"description": "Wait duration for finishing a test and its cleanup.Ensure that it is significantly greater than wait_duration"
|
||||
},
|
||||
"instance_count": {
|
||||
"type": "integer",
|
||||
"minimum": 1,
|
||||
"default": 1,
|
||||
"title": "Instance Count",
|
||||
"description": "Number of pods to perform action/select that match the label selector."
|
||||
},
|
||||
"execution_type": {
|
||||
"type": "string",
|
||||
"default": "parallel",
|
||||
"title": "Execution Type",
|
||||
"description": "The order in which the ingress filters are applied. Execution type can be 'serial' or 'parallel'"
|
||||
},
|
||||
"network_params": {
|
||||
"type": "object",
|
||||
"propertyNames": {},
|
||||
"additionalProperties": {
|
||||
"type": "string"
|
||||
},
|
||||
"title": "Network Parameters",
|
||||
"description": "The network filters that are applied on the interface. The currently supported filters are latency, loss and bandwidth"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"namespace"
|
||||
],
|
||||
"additionalProperties": false,
|
||||
"dependentRequired": {}
|
||||
}
|
||||
},
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"namespace": {
|
||||
"type": "string",
|
||||
"minLength": 1,
|
||||
"title": "Namespace",
|
||||
"description": "Namespace of the pod to which filter need to be appliedfor details."
|
||||
},
|
||||
"kubeconfig_path": {
|
||||
"type": "string",
|
||||
"title": "Kubeconfig path",
|
||||
"description": "Kubeconfig file as string\nSee https://kubernetes.io/docs/concepts/configuration/organize-cluster-access-kubeconfig/ for details."
|
||||
},
|
||||
"pod_name": {
|
||||
"type": "string",
|
||||
"title": "Pod name",
|
||||
"description": "When label_selector is not specified, pod matching the name will beselected for the chaos scenario"
|
||||
},
|
||||
"label_selector": {
|
||||
"type": "string",
|
||||
"title": "Label selector",
|
||||
"description": "Kubernetes label selector for the target pod. When pod_name is not specified, pod with matching label_selector is selected for chaos scenario"
|
||||
},
|
||||
"kraken_config": {
|
||||
"type": "string",
|
||||
"title": "Kraken Config",
|
||||
"description": "Path to the config file of Kraken. Set this field if you wish to publish status onto Cerberus"
|
||||
},
|
||||
"test_duration": {
|
||||
"type": "integer",
|
||||
"minimum": 1,
|
||||
"default": 90,
|
||||
"title": "Test duration",
|
||||
"description": "Duration for which each step of the ingress chaos testing is to be performed."
|
||||
},
|
||||
"wait_duration": {
|
||||
"type": "integer",
|
||||
"minimum": 1,
|
||||
"default": 300,
|
||||
"title": "Wait Duration",
|
||||
"description": "Wait duration for finishing a test and its cleanup.Ensure that it is significantly greater than wait_duration"
|
||||
},
|
||||
"instance_count": {
|
||||
"type": "integer",
|
||||
"minimum": 1,
|
||||
"default": 1,
|
||||
"title": "Instance Count",
|
||||
"description": "Number of pods to perform action/select that match the label selector."
|
||||
},
|
||||
"execution_type": {
|
||||
"type": "string",
|
||||
"default": "parallel",
|
||||
"title": "Execution Type",
|
||||
"description": "The order in which the ingress filters are applied. Execution type can be 'serial' or 'parallel'"
|
||||
},
|
||||
"network_params": {
|
||||
"type": "object",
|
||||
"propertyNames": {},
|
||||
"additionalProperties": {
|
||||
"type": "string"
|
||||
},
|
||||
"title": "Network Parameters",
|
||||
"description": "The network filters that are applied on the interface. The currently supported filters are latency, loss and bandwidth"
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"namespace"
|
||||
],
|
||||
"additionalProperties": false,
|
||||
"dependentRequired": {}
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
"id",
|
||||
"config"
|
||||
]
|
||||
}
|
||||
]
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user