Compare commits

...

22 Commits

Author SHA1 Message Date
Tullio Sebastiani
5712721410 bumped docker version (#493)
Co-authored-by: Naga Ravi Chaitanya Elluri <nelluri@redhat.com>
2023-09-19 17:38:44 +02:00
Tullio Sebastiani
5567c06cd0 reinstated io-hog documentation (#492) 2023-09-19 17:27:59 +02:00
Sahil Shah
0ad4c11356 Fix for time scenario (#490) 2023-09-14 12:36:08 -04:00
Tullio Sebastiani
f6f686e8fe fixed io-hog scenario 2023-09-13 09:57:00 -04:00
Pratyusha Thammineni
3a66f8a5a3 Added Docker image build workflow status badge
This allows users to track the docker-build action in README.md
without navigating to the Actions tab on GitHub
2023-09-11 15:16:28 -04:00
Sahil Shah
585d519687 Adding Prometheus Disruption Scenario (#484) 2023-09-11 11:18:29 -04:00
yogananth-subramanian
e40fedcd44 Update etcd metrics 2023-09-08 11:11:42 -04:00
Paige Rubendall
1bb5b8ad04 adding comment 2023-08-29 21:54:17 -04:00
Paige Rubendall
725d58c8ce adding docs update again 2023-08-25 14:37:07 -04:00
Paige Rubendall
c6058da7a7 adding comment 2023-08-25 12:19:03 -04:00
Naga Ravi Chaitanya Elluri
06a8ed220c Bump release version to v1.4.4 2023-08-24 13:28:39 -04:00
Dustin Black
2c6b50bcdc bump arcaflow stressng plugin to 0.3.1 for bug fix 2023-08-24 12:50:28 -04:00
Naga Ravi Chaitanya Elluri
ed97c8df2b Bump release version to v1.4.3 2023-08-23 11:56:39 -04:00
Tullio Sebastiani
1baa68bcee engine bump to v0.6.1 2023-08-23 11:38:23 -04:00
Naga Ravi Chaitanya Elluri
ab84f09448 Use release tags vs latest for kubeconfig arca plugins (#473) 2023-08-23 09:59:33 -04:00
Dustin Black
6ace3c952b update to plugin release stressng:0.3.0 (#472) 2023-08-23 09:15:30 -04:00
Tullio Sebastiani
cee5259fd3 arcaflow scenarios removed from config.yaml 2023-08-23 08:50:19 -04:00
Tullio Sebastiani
f868000ebd Switched from krkn_lib_kubernetes to krkn_lib v1.0.0 (#469)
* changed all the references to krkn_lib_kubernetes to the new krkn_lib


changed all the references

* added krkn-lib pointer in documentation
2023-08-22 12:41:40 -04:00
pratyusha
d2d80be241 Updated config.yaml file with more scenarios (#468) 2023-08-21 11:26:33 -04:00
Naga Ravi Chaitanya Elluri
da464859c4 Bump release version to v1.4.2 2023-08-21 09:06:28 -04:00
Naga Ravi Chaitanya Elluri
ef88005985 Use images tagged with a release for hog scenarios
This commit switches from the latest images to a specific release so that
changes can be reviewed and configs updated before moving to the latest bits.
2023-08-18 01:47:17 -04:00
Sahil Shah
102bdfdc96 Bump the release version to v1.4.1 (#465) 2023-08-17 10:18:11 -04:00
45 changed files with 633 additions and 213 deletions

View File

@@ -1,5 +1,6 @@
# Krkn aka Kraken
[![Docker Repository on Quay](https://quay.io/repository/redhat-chaos/krkn/status "Docker Repository on Quay")](https://quay.io/repository/redhat-chaos/krkn?tab=tags&tag=latest)
![Workflow-Status](https://github.com/redhat-chaos/krkn/actions/workflows/docker-image.yml/badge.svg)
![Krkn logo](media/logo.png)

View File

@@ -16,6 +16,42 @@
description: etcd leader changes observed
severity: warning
- expr: (last_over_time(etcd_mvcc_db_total_size_in_bytes[5m]) / last_over_time(etcd_server_quota_backend_bytes[5m]))*100 > 95
description: etcd cluster database is running full.
severity: critical
- expr: (last_over_time(etcd_mvcc_db_total_size_in_use_in_bytes[5m]) / last_over_time(etcd_mvcc_db_total_size_in_bytes[5m])) < 0.5
description: etcd database size in use is less than 50% of the actual allocated storage.
severity: warning
- expr: rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5
description: etcd cluster has high number of proposal failures.
severity: warning
- expr: histogram_quantile(0.99, rate(etcd_network_peer_round_trip_time_seconds_bucket{job=~".*etcd.*"}[5m])) > 0.15
description: etcd cluster member communication is slow.
severity: warning
- expr: histogram_quantile(0.99, sum(rate(grpc_server_handling_seconds_bucket{job=~".*etcd.*", grpc_method!="Defragment", grpc_type="unary"}[5m])) without(grpc_type)) > 0.15
description: etcd grpc requests are slow.
severity: critical
- expr: 100 * sum(rate(grpc_server_handled_total{job=~".*etcd.*", grpc_code=~"Unknown|FailedPrecondition|ResourceExhausted|Internal|Unavailable|DataLoss|DeadlineExceeded"}[5m])) without (grpc_type, grpc_code) / sum(rate(grpc_server_handled_total{job=~".*etcd.*"}[5m])) without (grpc_type, grpc_code) > 5
description: etcd cluster has high number of failed grpc requests.
severity: critical
- expr: etcd_server_has_leader{job=~".*etcd.*"} == 0
description: etcd cluster has no leader.
severity: warning
- expr: sum(up{job=~".*etcd.*"} == bool 1) without (instance) < ((count(up{job=~".*etcd.*"}) without (instance) + 1) / 2)
description: etcd cluster has insufficient number of members.
severity: warning
- expr: max without (endpoint) ( sum without (instance) (up{job=~".*etcd.*"} == bool 0) or count without (To) ( sum without (instance) (rate(etcd_network_peer_sent_failures_total{job=~".*etcd.*"}[120s])) > 0.01 )) > 0
description: etcd cluster members are down.
severity: warning
# API server
- expr: avg_over_time(histogram_quantile(0.99, sum(irate(apiserver_request_duration_seconds_bucket{apiserver="kube-apiserver", verb=~"POST|PUT|DELETE|PATCH", subresource!~"log|exec|portforward|attach|proxy"}[2m])) by (le, resource, verb))[10m:]) > 1
description: 10 minutes avg. 99th mutating API call latency for {{$labels.verb}}/{{$labels.resource}} higher than 1 second. {{$value}}s
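These expressions are plain PromQL, so they can also be checked outside of Krkn. A minimal sketch using the prometheus_api_client package (already listed in requirements.txt), assuming a reachable Prometheus URL and bearer token — both placeholders here:

from prometheus_api_client import PrometheusConnect

# Placeholder endpoint and token: substitute your cluster's values.
prom = PrometheusConnect(
    url="https://prometheus-k8s-openshift-monitoring.apps.example.com",
    headers={"Authorization": "Bearer <token>"},
    disable_ssl=True,
)

# One of the etcd alert expressions added above.
expr = 'rate(etcd_server_proposals_failed_total{job=~".*etcd.*"}[15m]) > 5'
result = prom.custom_query(query=expr)

# An empty result means no series currently satisfies the alert condition.
print(result or "no series matched: alert would not fire")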

View File

@@ -6,9 +6,49 @@ kraken:
signal_state: RUN # Will wait for the RUN signal when set to PAUSE before running the scenarios, refer docs/signal.md for more details
signal_address: 0.0.0.0 # Signal listening address
port: 8081 # Signal port
chaos_scenarios: # List of policies/chaos scenarios to load
chaos_scenarios:
# List of policies/chaos scenarios to load
- arcaflow_scenarios:
- scenarios/arcaflow/cpu-hog/input.yaml
- scenarios/arcaflow/memory-hog/input.yaml
- scenarios/arcaflow/io-hog/input.yaml
- application_outages:
- scenarios/openshift/app_outage.yaml
- scenarios/openshift/app_outage.yaml
- container_scenarios: # List of chaos pod scenarios to load
- - scenarios/openshift/container_etcd.yml
- plugin_scenarios:
- scenarios/openshift/etcd.yml
- scenarios/openshift/regex_openshift_pod_kill.yml
- scenarios/openshift/vmware_node_scenarios.yml
- scenarios/openshift/network_chaos_ingress.yml
- scenarios/openshift/prom_kill.yml
- node_scenarios: # List of chaos node scenarios to load
- scenarios/openshift/node_scenarios_example.yml
- plugin_scenarios:
- scenarios/openshift/openshift-apiserver.yml
- scenarios/openshift/openshift-kube-apiserver.yml
- time_scenarios: # List of chaos time scenarios to load
- scenarios/openshift/time_scenarios_example.yml
- litmus_scenarios: # List of litmus scenarios to load
- - scenarios/openshift/templates/litmus-rbac.yaml
- scenarios/openshift/node_cpu_hog_engine.yaml
- - scenarios/openshift/templates/litmus-rbac.yaml
- scenarios/openshift/node_mem_engine.yaml
- - scenarios/openshift/templates/litmus-rbac.yaml
- scenarios/openshift/node_io_engine.yaml
- cluster_shut_down_scenarios:
- - scenarios/openshift/cluster_shut_down_scenario.yml
- scenarios/openshift/post_action_shut_down.py
- namespace_scenarios:
- - scenarios/openshift/regex_namespace.yaml
- - scenarios/openshift/ingress_namespace.yaml
- scenarios/openshift/post_action_namespace.py
- zone_outages:
- scenarios/openshift/zone_outage.yaml
- pvc_scenarios:
- scenarios/openshift/pvc_scenario.yaml
- network_chaos:
- scenarios/openshift/network_chaos.yaml
cerberus:
cerberus_enabled: False # Enable it when cerberus is previously installed
@@ -48,4 +88,4 @@ telemetry:
# simultaneously).
# For unstable/slow connections it is better to keep this value low
# increasing the number of backup_threads, in this way, on upload failure, the retry will happen only on the
# failed chunk without affecting the whole upload.
# failed chunk without affecting the whole upload.

View File

@@ -13,6 +13,7 @@ kraken:
- plugin_scenarios: # List of chaos pod scenarios to load
- scenarios/openshift/etcd.yml
- scenarios/openshift/regex_openshift_pod_kill.yml
- scenarios/openshift/prom_kill.yml
- node_scenarios: # List of chaos node scenarios to load
- scenarios/openshift/node_scenarios_example.yml
- plugin_scenarios:

View File

@@ -139,6 +139,39 @@ metrics:
- query: histogram_quantile(0.99,sum(rate(etcd_request_duration_seconds_bucket[2m])) by (le,operation,apiserver)) > 0
metricName: P99APIEtcdRequestLatency
- query: sum(grpc_server_started_total{namespace="openshift-etcd",grpc_service="etcdserverpb.Watch",grpc_type="bidi_stream"}) - sum(grpc_server_handled_total{namespace="openshift-etcd",grpc_service="etcdserverpb.Watch",grpc_type="bidi_stream"})
metricName: ActiveWatchStreams
- query: sum(grpc_server_started_total{namespace="openshift-etcd",grpc_service="etcdserverpb.Lease",grpc_type="bidi_stream"}) - sum(grpc_server_handled_total{namespace="openshift-etcd",grpc_service="etcdserverpb.Lease",grpc_type="bidi_stream"})
metricName: ActiveLeaseStreams
- query: sum(rate(etcd_debugging_snap_save_total_duration_seconds_sum{namespace="openshift-etcd"}[2m]))
metricName: snapshotSaveLatency
- query: sum(rate(etcd_server_heartbeat_send_failures_total{namespace="openshift-etcd"}[2m]))
metricName: HeartBeatFailures
- query: sum(rate(etcd_server_health_failures{namespace="openshift-etcd"}[2m]))
metricName: HealthFailures
- query: sum(rate(etcd_server_slow_apply_total{namespace="openshift-etcd"}[2m]))
metricName: SlowApplies
- query: sum(rate(etcd_server_slow_read_indexes_total{namespace="openshift-etcd"}[2m]))
metricName: SlowIndexRead
- query: sum(etcd_server_proposals_pending)
metricName: PendingProposals
- query: histogram_quantile(1.0, sum(rate(etcd_debugging_mvcc_db_compaction_pause_duration_milliseconds_bucket[1m])) by (le, instance))
metricName: CompactionMaxPause
- query: sum by (instance) (apiserver_storage_objects)
metricName: etcdTotalObjectCount
- query: topk(500, max by(resource) (apiserver_storage_objects))
metricName: etcdTopObjectCount
# Cluster metrics
- query: count(kube_namespace_created)
metricName: namespaceCount

View File

@@ -14,7 +14,7 @@ COPY --from=azure-cli /usr/local/bin/az /usr/bin/az
# Install dependencies
RUN yum install -y git python39 python3-pip jq gettext wget && \
python3.9 -m pip install -U pip && \
git clone https://github.com/redhat-chaos/krkn.git --branch v1.4.0 /root/kraken && \
git clone https://github.com/redhat-chaos/krkn.git --branch v1.4.5 /root/kraken && \
mkdir -p /root/.kube && cd /root/kraken && \
pip3.9 install -r requirements.txt && \
pip3.9 install virtualenv && \

View File

@@ -14,7 +14,7 @@ COPY --from=azure-cli /usr/local/bin/az /usr/bin/az
# Install dependencies
RUN yum install -y git python39 python3-pip jq gettext wget && \
python3.9 -m pip install -U pip && \
git clone https://github.com/redhat-chaos/krkn.git --branch v1.4.0 /root/kraken && \
git clone https://github.com/redhat-chaos/krkn.git --branch v1.4.5 /root/kraken && \
mkdir -p /root/.kube && cd /root/kraken && \
pip3.9 install -r requirements.txt && \
pip3.9 install virtualenv && \

View File

@@ -7,6 +7,7 @@ The engine uses containers to execute plugins and runs them either locally in Do
#### Hog scenarios:
- [CPU Hog](arcaflow_scenarios/cpu_hog.md)
- [Memory Hog](arcaflow_scenarios/memory_hog.md)
- [I/O Hog](arcaflow_scenarios/io_hog.md)
### Prerequisites
@@ -64,4 +65,6 @@ Each step is represented by a container that will be executed from the deployer
Note that we provide the scenarios as a template, but they can be manipulated to define more complex workflows.
To have more details regarding the arcaflow workflows architecture and syntax it is suggested to refer to the [Arcaflow Documentation](https://arcalot.io/arcaflow/).
This edit is no longer in the quay image.
A fix is being worked on in ticket: https://issues.redhat.com/browse/CHAOS-494
This will affect all versions 4.12 and higher of OpenShift.
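For reference, Krkn dispatches these workflows through kraken.arcaflow_plugin.run(), whose signature appears later in this diff. A minimal sketch of invoking the new io-hog scenario directly — the kubeconfig path is a placeholder, and the KrknTelemetry constructor is assumed to take a SafeLogger and a KrknKubernetes instance, as run_kraken.py suggests (verify against your installed krkn-lib):

import kraken.arcaflow_plugin as arcaflow_plugin
from krkn_lib.k8s import KrknKubernetes
from krkn_lib.telemetry import KrknTelemetry
from krkn_lib.utils import SafeLogger

kubeconfig_path = "/root/.kube/config"  # placeholder path
kubecli = KrknKubernetes(kubeconfig_path=kubeconfig_path)
telemetry = KrknTelemetry(SafeLogger(), kubecli)  # assumed constructor

# Signature per kraken/arcaflow_plugin.py in this diff:
# run(scenarios_list: List[str], kubeconfig_path: str, telemetry: KrknTelemetry)
failed, scenario_telemetries = arcaflow_plugin.run(
    ["scenarios/arcaflow/io-hog/input.yaml"], kubeconfig_path, telemetry
)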

View File

@@ -0,0 +1,21 @@
# I/O Hog
This scenario is based on the arcaflow [arcaflow-plugin-stressng](https://github.com/arcalot/arcaflow-plugin-stressng) plugin.
The purpose of this scenario is to create disk pressure on a particular node of the Kubernetes/OpenShift cluster for a given time span.
The scenario allows attaching a node path to the pod as a `hostPath` volume.
To enable this plugin, add a pointer to the scenario input file `scenarios/arcaflow/io-hog/input.yaml` as described in the
Usage section.
This scenario takes a list of objects named `input_list` with the following properties:
- **kubeconfig :** *string* the kubeconfig needed by the deployer to deploy the stress-ng plugin in the target cluster
- **namespace :** *string* the namespace where the scenario container will be deployed
**Note:** this parameter will be automatically filled by kraken if the `kubeconfig_path` property is correctly set
- **node_selector :** *key-value map* the node label that will be used as `nodeSelector` by the pod to target a specific cluster node
- **duration :** *string* stop stress test after N seconds. One can also specify the units of time in seconds, minutes, hours, days or years with the suffix s, m, h, d or y.
- **target_pod_folder :** *string* the path in the pod where the volume is mounted
- **target_pod_volume :** *object* the `hostPath` volume definition in the [Kubernetes/OpenShift](https://docs.openshift.com/container-platform/3.11/install_config/persistent_storage/using_hostpath.html) format, that will be attached to the pod as a volume
- **io_write_bytes :** *string* writes N bytes for each hdd process. The size can be expressed as % of free space on the file system or in units of Bytes, KBytes, MBytes and GBytes using the suffix b, k, m or g
- **io_block_size :** *string* size of each write in bytes. Size can be from 1 byte to 4m.
To run several load tests simultaneously in the same run (e.g. stress two or more nodes), add another item
to the `input_list` with the same properties (and possibly different values, e.g. different `node_selector`s
to schedule the pods on different nodes). To reduce (or increase) the parallelism, change the `parallelism` value in the `workload.yaml` file.
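To wire the scenario into Krkn itself, the pointer added to config.yaml in this release looks like the following trimmed excerpt (mirroring the config.yaml hunk above):

kraken:
  chaos_scenarios:
    - arcaflow_scenarios:
        - scenarios/arcaflow/io-hog/input.yaml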

View File

@@ -11,6 +11,11 @@ The following ways are supported to run Kraken:
**NOTE**: To run Kraken on Power (ppc64le) architecture, build and run a containerized version by following the
instructions given [here](https://github.com/redhat-chaos/krkn/blob/main/containers/build_own_image-README.md).
**NOTE**: Helper functions for interactions in Krkn are part of [krkn-lib](https://github.com/redhat-chaos/krkn-lib).
Please feel free to reuse and expand them as you see fit when adding a new scenario or extending
the capabilities of the currently supported scenarios.
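A minimal sketch of reusing those helpers in a new scenario, limited to calls that appear elsewhere in this diff (the kubeconfig path is a placeholder):

from krkn_lib.k8s import KrknKubernetes

# Instantiate the client the same way run_kraken.py does after this change.
kubecli = KrknKubernetes(kubeconfig_path="/root/.kube/config")  # placeholder path

# Helpers used by the node and namespace scenarios in this diff:
nodes = kubecli.list_killable_nodes()
namespace_exists = kubecli.check_if_namespace_exists("default")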
### Git
#### Clone the repository

View File

@@ -4,7 +4,8 @@ import time
import kraken.cerberus.setup as cerberus
from jinja2 import Template
import kraken.invoke.command as runcommand
from krkn_lib_kubernetes import ScenarioTelemetry, KrknTelemetry
from krkn_lib.telemetry import KrknTelemetry
from krkn_lib.models.telemetry import ScenarioTelemetry
# Reads the scenario config, applies and deletes a network policy to
# block the traffic for the specified duration

View File

@@ -1,14 +1,13 @@
import time
import arcaflow
import os
import yaml
import logging
import sys
from pathlib import Path
from typing import List
from .context_auth import ContextAuth
from krkn_lib_kubernetes import ScenarioTelemetry, KrknTelemetry
from krkn_lib.telemetry import KrknTelemetry
from krkn_lib.models.telemetry import ScenarioTelemetry
def run(scenarios_list: List[str], kubeconfig_path: str, telemetry: KrknTelemetry) -> (list[str], list[ScenarioTelemetry]):

View File

@@ -1,14 +1,13 @@
import kraken.invoke.command as runcommand
import krkn_lib_kubernetes
import logging
import time
import sys
import requests
import yaml
import kraken.cerberus.setup as cerberus
from krkn_lib.k8s import KrknKubernetes
# krkn_lib_kubernetes
# krkn_lib
# Inject litmus scenarios defined in the config
def run(
scenarios_list,
@@ -16,7 +15,7 @@ def run(
litmus_uninstall,
wait_duration,
litmus_namespace,
kubecli: krkn_lib_kubernetes.KrknLibKubernetes
kubecli: KrknKubernetes
):
# Loop to run the scenarios starts here
for l_scenario in scenarios_list:
@@ -94,8 +93,8 @@ def deploy_all_experiments(version_string, namespace):
)
# krkn_lib_kubernetes
def wait_for_initialized(engine_name, experiment_name, namespace, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
# krkn_lib
def wait_for_initialized(engine_name, experiment_name, namespace, kubecli: KrknKubernetes):
chaos_engine = kubecli.get_litmus_chaos_object(kind='chaosengine', name=engine_name,
namespace=namespace).engineStatus
@@ -119,13 +118,13 @@ def wait_for_initialized(engine_name, experiment_name, namespace, kubecli: krkn_
return True
# krkn_lib_kubernetes
# krkn_lib
def wait_for_status(
engine_name,
expected_status,
experiment_name,
namespace,
kubecli: krkn_lib_kubernetes.KrknLibKubernetes
kubecli: KrknKubernetes
):
if expected_status == "running":
@@ -156,8 +155,8 @@ def wait_for_status(
# Check status of experiment
# krkn_lib_kubernetes
def check_experiment(engine_name, experiment_name, namespace, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
# krkn_lib
def check_experiment(engine_name, experiment_name, namespace, kubecli: KrknKubernetes):
wait_response = wait_for_status(engine_name, "running", experiment_name, namespace, kubecli)
@@ -183,8 +182,8 @@ def check_experiment(engine_name, experiment_name, namespace, kubecli: krkn_lib_
# Delete all chaos engines in a given namespace
# krkn_lib_kubernetes
def delete_chaos_experiments(namespace, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
# krkn_lib
def delete_chaos_experiments(namespace, kubecli: KrknKubernetes):
if kubecli.check_if_namespace_exists(namespace):
chaos_exp_exists = runcommand.invoke_no_exit("kubectl get chaosexperiment")
@@ -194,8 +193,8 @@ def delete_chaos_experiments(namespace, kubecli: krkn_lib_kubernetes.KrknLibKube
# Delete all chaos engines in a given namespace
# krkn_lib_kubernetes
def delete_chaos(namespace, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
# krkn_lib
def delete_chaos(namespace, kubecli: KrknKubernetes):
if kubecli.check_if_namespace_exists(namespace):
logging.info("Deleting all litmus run objects")
@@ -209,8 +208,8 @@ def delete_chaos(namespace, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
logging.info(namespace + " namespace doesn't exist")
# krkn_lib_kubernetes
def uninstall_litmus(version, litmus_namespace, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
# krkn_lib
def uninstall_litmus(version, litmus_namespace, kubecli: KrknKubernetes):
if kubecli.check_if_namespace_exists(litmus_namespace):
logging.info("Uninstalling Litmus operator")

View File

@@ -1,14 +1,14 @@
import random
import logging
import krkn_lib_kubernetes
from krkn_lib.k8s import KrknKubernetes
# krkn_lib_kubernetes
# krkn_lib
# Pick a random managedcluster with specified label selector
def get_managedcluster(
managedcluster_name,
label_selector,
instance_kill_count,
kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
kubecli: KrknKubernetes):
if managedcluster_name in kubecli.list_killable_managedclusters():
return [managedcluster_name]
@@ -30,12 +30,12 @@ def get_managedcluster(
# Wait until the managedcluster status becomes Available
# krkn_lib_kubernetes
def wait_for_available_status(managedcluster, timeout, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
# krkn_lib
def wait_for_available_status(managedcluster, timeout, kubecli: KrknKubernetes):
kubecli.watch_managedcluster_status(managedcluster, "True", timeout)
# Wait until the managedcluster status becomes Not Available
# krkn_lib_kubernetes
def wait_for_unavailable_status(managedcluster, timeout, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
# krkn_lib
def wait_for_unavailable_status(managedcluster, timeout, kubecli: KrknKubernetes):
kubecli.watch_managedcluster_status(managedcluster, "Unknown", timeout)

View File

@@ -4,19 +4,17 @@ import time
import logging
import sys
import yaml
import html
import krkn_lib_kubernetes
import kraken.managedcluster_scenarios.common_managedcluster_functions as common_managedcluster_functions
from krkn_lib.k8s import KrknKubernetes
class GENERAL:
def __init__(self):
pass
# krkn_lib_kubernetes
# krkn_lib
class managedcluster_scenarios():
kubecli: krkn_lib_kubernetes.KrknLibKubernetes
def __init__(self, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
kubecli: KrknKubernetes
def __init__(self, kubecli: KrknKubernetes):
self.kubecli = kubecli
self.general = GENERAL()

View File

@@ -1,20 +1,19 @@
import yaml
import logging
import time
import krkn_lib_kubernetes
from kraken.managedcluster_scenarios.managedcluster_scenarios import managedcluster_scenarios
import kraken.managedcluster_scenarios.common_managedcluster_functions as common_managedcluster_functions
import kraken.cerberus.setup as cerberus
from krkn_lib.k8s import KrknKubernetes
# Get the managedcluster scenarios object of specified cloud type
# krkn_lib_kubernetes
def get_managedcluster_scenario_object(managedcluster_scenario, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
# krkn_lib
def get_managedcluster_scenario_object(managedcluster_scenario, kubecli: KrknKubernetes):
return managedcluster_scenarios(kubecli)
# Run defined scenarios
# krkn_lib_kubernetes
def run(scenarios_list, config, wait_duration, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
# krkn_lib
def run(scenarios_list, config, wait_duration, kubecli: KrknKubernetes):
for managedcluster_scenario_config in scenarios_list:
with open(managedcluster_scenario_config, "r") as f:
managedcluster_scenario_config = yaml.full_load(f)
@@ -32,8 +31,8 @@ def run(scenarios_list, config, wait_duration, kubecli: krkn_lib_kubernetes.Krkn
# Inject the specified managedcluster scenario
# krkn_lib_kubernetes
def inject_managedcluster_scenario(action, managedcluster_scenario, managedcluster_scenario_object, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
# krkn_lib
def inject_managedcluster_scenario(action, managedcluster_scenario, managedcluster_scenario_object, kubecli: KrknKubernetes):
# Get the managedcluster scenario configurations
run_kill_count = managedcluster_scenario.get("runs", 1)
instance_kill_count = managedcluster_scenario.get("instance_count", 1)

View File

@@ -1,21 +1,22 @@
import time
import random
import logging
import krkn_lib_kubernetes
import kraken.cerberus.setup as cerberus
import kraken.post_actions.actions as post_actions
import yaml
import sys
from krkn_lib_kubernetes import ScenarioTelemetry, KrknTelemetry
from krkn_lib.k8s import KrknKubernetes
from krkn_lib.telemetry import KrknTelemetry
from krkn_lib.models.telemetry import ScenarioTelemetry
# krkn_lib_kubernetes
# krkn_lib
def run(
scenarios_list,
config,
wait_duration,
failed_post_scenarios,
kubeconfig_path,
kubecli: krkn_lib_kubernetes.KrknLibKubernetes,
kubecli: KrknKubernetes,
telemetry: KrknTelemetry
) -> (list[str], list[ScenarioTelemetry]):
scenario_telemetries: list[ScenarioTelemetry] = []
@@ -106,8 +107,8 @@ def run(
scenario_telemetries.append(scenario_telemetry)
return failed_scenarios, scenario_telemetries
# krkn_lib_kubernetes
def check_active_namespace(killed_namespaces, wait_time, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
# krkn_lib
def check_active_namespace(killed_namespaces, wait_time, kubecli: KrknKubernetes):
active_namespace = []
timer = 0
while timer < wait_time and killed_namespaces:

View File

@@ -1,18 +1,19 @@
import yaml
import logging
import time
import sys
import os
import random
import krkn_lib_kubernetes
from jinja2 import Environment, FileSystemLoader
import kraken.cerberus.setup as cerberus
import kraken.node_actions.common_node_functions as common_node_functions
from krkn_lib_kubernetes import ScenarioTelemetry, KrknTelemetry
from jinja2 import Environment, FileSystemLoader
from krkn_lib.k8s import KrknKubernetes
from krkn_lib.telemetry import KrknTelemetry
from krkn_lib.models.telemetry import ScenarioTelemetry
# krkn_lib_kubernetes
# krkn_lib
# Reads the scenario config and introduces traffic variations in Node's host network interface.
def run(scenarios_list, config, wait_duration, kubecli: krkn_lib_kubernetes.KrknLibKubernetes, telemetry: KrknTelemetry) -> (list[str], list[ScenarioTelemetry]):
def run(scenarios_list, config, wait_duration, kubecli: KrknKubernetes, telemetry: KrknTelemetry) -> (list[str], list[ScenarioTelemetry]):
failed_post_scenarios = ""
logging.info("Runing the Network Chaos tests")
failed_post_scenarios = ""
@@ -108,8 +109,8 @@ def run(scenarios_list, config, wait_duration, kubecli: krkn_lib_kubernetes.Krkn
return failed_scenarios, scenario_telemetries
# krkn_lib_kubernetes
def verify_interface(test_interface, nodelst, template, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
# krkn_lib
def verify_interface(test_interface, nodelst, template, kubecli: KrknKubernetes):
pod_index = random.randint(0, len(nodelst) - 1)
pod_body = yaml.safe_load(template.render(nodename=nodelst[pod_index]))
logging.info("Creating pod to query interface on node %s" % nodelst[pod_index])
@@ -134,16 +135,16 @@ def verify_interface(test_interface, nodelst, template, kubecli: krkn_lib_kubern
kubecli.delete_pod("fedtools", "default")
# krkn_lib_kubernetes
def get_job_pods(api_response, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
# krkn_lib
def get_job_pods(api_response, kubecli: KrknKubernetes):
controllerUid = api_response.metadata.labels["controller-uid"]
pod_label_selector = "controller-uid=" + controllerUid
pods_list = kubecli.list_pods(label_selector=pod_label_selector, namespace="default")
return pods_list[0]
# krkn_lib_kubernetes
def wait_for_job(joblst, kubecli: krkn_lib_kubernetes.KrknLibKubernetes, timeout=300):
# krkn_lib
def wait_for_job(joblst, kubecli: KrknKubernetes, timeout=300):
waittime = time.time() + timeout
count = 0
joblen = len(joblst)
@@ -161,8 +162,8 @@ def wait_for_job(joblst, kubecli: krkn_lib_kubernetes.KrknLibKubernetes, timeout
time.sleep(5)
# krkn_lib_kubernetes
def delete_job(joblst, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
# krkn_lib
def delete_job(joblst, kubecli: KrknKubernetes):
for jobname in joblst:
try:
api_response = kubecli.get_job_status(jobname, namespace="default")

View File

@@ -2,12 +2,12 @@ import sys
import logging
import kraken.invoke.command as runcommand
import kraken.node_actions.common_node_functions as nodeaction
import krkn_lib_kubernetes
from krkn_lib.k8s import KrknKubernetes
# krkn_lib_kubernetes
# krkn_lib
class abstract_node_scenarios:
kubecli: krkn_lib_kubernetes.KrknLibKubernetes
def __init__(self, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
kubecli: KrknKubernetes
def __init__(self, kubecli: KrknKubernetes):
self.kubecli = kubecli
# Node scenario to start the node
def node_start_scenario(self, instance_kill_count, node, timeout):

View File

@@ -1,14 +1,14 @@
import sys
import time
import krkn_lib_kubernetes
import logging
import kraken.node_actions.common_node_functions as nodeaction
import os
import json
from aliyunsdkcore.client import AcsClient
from aliyunsdkecs.request.v20140526 import DescribeInstancesRequest, DeleteInstanceRequest
from aliyunsdkecs.request.v20140526 import StopInstanceRequest, StartInstanceRequest, RebootInstanceRequest
import logging
import kraken.node_actions.common_node_functions as nodeaction
from kraken.node_actions.abstract_node_scenarios import abstract_node_scenarios
import os
import json
from krkn_lib.k8s import KrknKubernetes
class Alibaba:
@@ -180,9 +180,9 @@ class Alibaba:
logging.info("ECS %s is released" % instance_id)
return True
# krkn_lib_kubernetes
# krkn_lib
class alibaba_node_scenarios(abstract_node_scenarios):
def __init__(self,kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
def __init__(self,kubecli: KrknKubernetes):
self.alibaba = Alibaba()
# Node scenario to start the node

View File

@@ -2,10 +2,9 @@ import sys
import time
import boto3
import logging
import krkn_lib_kubernetes
import kraken.node_actions.common_node_functions as nodeaction
from kraken.node_actions.abstract_node_scenarios import abstract_node_scenarios
from krkn_lib.k8s import KrknKubernetes
class AWS:
def __init__(self):
@@ -166,9 +165,9 @@ class AWS:
# sys.exit(1)
raise RuntimeError()
# krkn_lib_kubernetes
# krkn_lib
class aws_node_scenarios(abstract_node_scenarios):
def __init__(self, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
def __init__(self, kubecli: KrknKubernetes):
super().__init__(kubecli)
self.aws = AWS()

View File

@@ -1,13 +1,14 @@
import sys
import time
from azure.mgmt.compute import ComputeManagementClient
from azure.identity import DefaultAzureCredential
import yaml
import kraken.invoke.command as runcommand
import logging
import krkn_lib_kubernetes
import kraken.node_actions.common_node_functions as nodeaction
from kraken.node_actions.abstract_node_scenarios import abstract_node_scenarios
import kraken.invoke.command as runcommand
import yaml
from azure.mgmt.compute import ComputeManagementClient
from azure.identity import DefaultAzureCredential
from krkn_lib.k8s import KrknKubernetes
class Azure:
@@ -129,9 +130,9 @@ class Azure:
logging.info("Vm %s is terminated" % vm_name)
return True
# krkn_lib_kubernetes
# krkn_lib
class azure_node_scenarios(abstract_node_scenarios):
def __init__(self, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
def __init__(self, kubecli: KrknKubernetes):
super().__init__(kubecli)
logging.info("init in azure")
self.azure = Azure()

View File

@@ -1,6 +1,5 @@
import kraken.node_actions.common_node_functions as nodeaction
from kraken.node_actions.abstract_node_scenarios import abstract_node_scenarios
import krkn_lib_kubernetes
import logging
import openshift as oc
import pyipmi
@@ -8,7 +7,7 @@ import pyipmi.interfaces
import sys
import time
import traceback
from krkn_lib.k8s import KrknKubernetes
class BM:
def __init__(self, bm_info, user, passwd):
@@ -105,9 +104,9 @@ class BM:
while self.get_ipmi_connection(bmc_addr, node_name).get_chassis_status().power_on:
time.sleep(1)
# krkn_lib_kubernetes
# krkn_lib
class bm_node_scenarios(abstract_node_scenarios):
def __init__(self, bm_info, user, passwd, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
def __init__(self, bm_info, user, passwd, kubecli: KrknKubernetes):
super().__init__(kubecli)
self.bm = BM(bm_info, user, passwd)

View File

@@ -2,14 +2,13 @@ import time
import random
import logging
import paramiko
import krkn_lib_kubernetes
import kraken.invoke.command as runcommand
from krkn_lib.k8s import KrknKubernetes
node_general = False
# Pick a random node with specified label selector
def get_node(node_name, label_selector, instance_kill_count, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
def get_node(node_name, label_selector, instance_kill_count, kubecli: KrknKubernetes):
if node_name in kubecli.list_killable_nodes():
return [node_name]
elif node_name:
@@ -29,21 +28,21 @@ def get_node(node_name, label_selector, instance_kill_count, kubecli: krkn_lib_k
return nodes_to_return
# krkn_lib_kubernetes
# krkn_lib
# Wait until the node status becomes Ready
def wait_for_ready_status(node, timeout, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
def wait_for_ready_status(node, timeout, kubecli: KrknKubernetes):
resource_version = kubecli.get_node_resource_version(node)
kubecli.watch_node_status(node, "True", timeout, resource_version)
# krkn_lib_kubernetes
# krkn_lib
# Wait until the node status becomes Not Ready
def wait_for_not_ready_status(node, timeout, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
def wait_for_not_ready_status(node, timeout, kubecli: KrknKubernetes):
resource_version = kubecli.get_node_resource_version(node)
kubecli.watch_node_status(node, "False", timeout, resource_version)
# krkn_lib_kubernetes
# krkn_lib
# Wait until the node status becomes Unknown
def wait_for_unknown_status(node, timeout, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
def wait_for_unknown_status(node, timeout, kubecli: KrknKubernetes):
resource_version = kubecli.get_node_resource_version(node)
kubecli.watch_node_status(node, "Unknown", timeout, resource_version)

View File

@@ -1,10 +1,9 @@
import kraken.node_actions.common_node_functions as nodeaction
from kraken.node_actions.abstract_node_scenarios import abstract_node_scenarios
import krkn_lib_kubernetes
import logging
import sys
import docker
from krkn_lib.k8s import KrknKubernetes
class Docker:
def __init__(self):
@@ -37,7 +36,7 @@ class Docker:
class docker_node_scenarios(abstract_node_scenarios):
def __init__(self, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
def __init__(self, kubecli: KrknKubernetes):
super().__init__(kubecli)
self.docker = Docker()

View File

@@ -1,13 +1,12 @@
import sys
import time
import logging
import krkn_lib_kubernetes
import kraken.node_actions.common_node_functions as nodeaction
from kraken.node_actions.abstract_node_scenarios import abstract_node_scenarios
from googleapiclient import discovery
from oauth2client.client import GoogleCredentials
import kraken.invoke.command as runcommand
from krkn_lib.k8s import KrknKubernetes
class GCP:
def __init__(self):
@@ -143,9 +142,9 @@ class GCP:
return True
# krkn_lib_kubernetes
# krkn_lib
class gcp_node_scenarios(abstract_node_scenarios):
def __init__(self, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
def __init__(self, kubecli: KrknKubernetes):
super().__init__(kubecli)
self.gcp = GCP()

View File

@@ -1,15 +1,14 @@
import logging
import krkn_lib_kubernetes
from kraken.node_actions.abstract_node_scenarios import abstract_node_scenarios
from krkn_lib.k8s import KrknKubernetes
class GENERAL:
def __init__(self):
pass
# krkn_lib_kubernetes
# krkn_lib
class general_node_scenarios(abstract_node_scenarios):
def __init__(self, kubecli: krkn_lib_kubernetes.KrknLibKubernetes ):
def __init__(self, kubecli: KrknKubernetes):
super().__init__(kubecli)
self.general = GENERAL()

View File

@@ -1,11 +1,10 @@
import sys
import time
import logging
import krkn_lib_kubernetes
import kraken.invoke.command as runcommand
import kraken.node_actions.common_node_functions as nodeaction
from kraken.node_actions.abstract_node_scenarios import abstract_node_scenarios
from krkn_lib.k8s import KrknKubernetes
class OPENSTACKCLOUD:
def __init__(self):
@@ -93,9 +92,9 @@ class OPENSTACKCLOUD:
return node_name
counter += 1
# krkn_lib_kubernetes
# krkn_lib
class openstack_node_scenarios(abstract_node_scenarios):
def __init__(self, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
def __init__(self, kubecli: KrknKubernetes):
self.openstackcloud = OPENSTACKCLOUD()
# Node scenario to start the node

View File

@@ -2,7 +2,6 @@ import yaml
import logging
import sys
import time
import krkn_lib_kubernetes
from kraken.node_actions.aws_node_scenarios import aws_node_scenarios
from kraken.node_actions.general_cloud_node_scenarios import general_node_scenarios
from kraken.node_actions.az_node_scenarios import azure_node_scenarios
@@ -13,14 +12,15 @@ from kraken.node_actions.bm_node_scenarios import bm_node_scenarios
from kraken.node_actions.docker_node_scenarios import docker_node_scenarios
import kraken.node_actions.common_node_functions as common_node_functions
import kraken.cerberus.setup as cerberus
from krkn_lib_kubernetes import ScenarioTelemetry, KrknTelemetry
from krkn_lib.k8s import KrknKubernetes
from krkn_lib.telemetry import KrknTelemetry, ScenarioTelemetry
node_general = False
# Get the node scenarios object of specified cloud type
# krkn_lib_kubernetes
def get_node_scenario_object(node_scenario, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
# krkn_lib
def get_node_scenario_object(node_scenario, kubecli: KrknKubernetes):
if "cloud_type" not in node_scenario.keys() or node_scenario["cloud_type"] == "generic":
global node_general
node_general = True
@@ -52,8 +52,8 @@ def get_node_scenario_object(node_scenario, kubecli: krkn_lib_kubernetes.KrknLib
# Run defined scenarios
# krkn_lib_kubernetes
def run(scenarios_list, config, wait_duration, kubecli: krkn_lib_kubernetes.KrknLibKubernetes, telemetry: KrknTelemetry) -> (list[str], list[ScenarioTelemetry]):
# krkn_lib
def run(scenarios_list, config, wait_duration, kubecli: KrknKubernetes, telemetry: KrknTelemetry) -> (list[str], list[ScenarioTelemetry]):
scenario_telemetries: list[ScenarioTelemetry] = []
failed_scenarios = []
for node_scenario_config in scenarios_list:
@@ -89,7 +89,7 @@ def run(scenarios_list, config, wait_duration, kubecli: krkn_lib_kubernetes.Krkn
# Inject the specified node scenario
def inject_node_scenario(action, node_scenario, node_scenario_object, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
def inject_node_scenario(action, node_scenario, node_scenario_object, kubecli: KrknKubernetes):
generic_cloud_scenarios = ("stop_kubelet_scenario", "node_crash_scenario")
# Get the node scenario configurations
run_kill_count = node_scenario.get("runs", 1)

View File

@@ -13,7 +13,9 @@ from kraken.plugins.run_python_plugin import run_python_file
from kraken.plugins.network.ingress_shaping import network_chaos
from kraken.plugins.pod_network_outage.pod_network_outage_plugin import pod_outage
from kraken.plugins.pod_network_outage.pod_network_outage_plugin import pod_egress_shaping
from krkn_lib_kubernetes import ScenarioTelemetry, KrknTelemetry
from krkn_lib.telemetry import KrknTelemetry
from krkn_lib.models.telemetry import ScenarioTelemetry
@dataclasses.dataclass

View File

@@ -10,7 +10,7 @@ import re
from dataclasses import dataclass, field
from traceback import format_exc
from jinja2 import Environment, FileSystemLoader
from krkn_lib_kubernetes import KrknLibKubernetes
from krkn_lib.k8s import KrknKubernetes
from arcaflow_plugin_sdk import plugin, validation
from kubernetes import client
from kubernetes.client.api.apiextensions_v1_api import ApiextensionsV1Api
@@ -19,7 +19,7 @@ from . import cerberus
def get_test_pods(
pod_name: str, pod_label: str, namespace: str, kubecli: KrknLibKubernetes
pod_name: str, pod_label: str, namespace: str, kubecli: KrknKubernetes
) -> typing.List[str]:
"""
Function that returns a list of pods to apply network policy
@@ -35,7 +35,7 @@ def get_test_pods(
namespace (string)
- namespace in which the pod is present
kubecli (KrknLibKubernetes)
kubecli (KrknKubernetes)
- Object to interact with Kubernetes Python client
Returns:
@@ -54,12 +54,12 @@ def get_test_pods(
return pods_list
def get_job_pods(kubecli: KrknLibKubernetes, api_response):
def get_job_pods(kubecli: KrknKubernetes, api_response):
"""
Function that gets the pod corresponding to the job
Args:
kubecli (KrknLibKubernetes)
kubecli (KrknKubernetes)
- Object to interact with Kubernetes Python client
api_response
@@ -78,12 +78,12 @@ def get_job_pods(kubecli: KrknLibKubernetes, api_response):
return pods_list[0]
def delete_jobs(kubecli: KrknLibKubernetes, job_list: typing.List[str]):
def delete_jobs(kubecli: KrknKubernetes, job_list: typing.List[str]):
"""
Function that deletes jobs
Args:
kubecli (KrknLibKubernetes)
kubecli (KrknKubernetes)
- Object to interact with Kubernetes Python client
job_list (List of strings)
@@ -109,7 +109,7 @@ def delete_jobs(kubecli: KrknLibKubernetes, job_list: typing.List[str]):
def wait_for_job(
job_list: typing.List[str], kubecli: KrknLibKubernetes, timeout: int = 300
job_list: typing.List[str], kubecli: KrknKubernetes, timeout: int = 300
) -> None:
"""
Function that waits for a list of jobs to finish within a time period
@@ -118,7 +118,7 @@ def wait_for_job(
job_list (List of strings)
- The list of jobs to check for completion
kubecli (KrknLibKubernetes)
kubecli (KrknKubernetes)
- Object to interact with Kubernetes Python client
timeout (int)
@@ -195,7 +195,7 @@ def apply_outage_policy(
direction: str,
duration: str,
bridge_name: str,
kubecli: KrknLibKubernetes,
kubecli: KrknKubernetes,
) -> typing.List[str]:
"""
Function that applies filters(ingress or egress) to block traffic.
@@ -278,7 +278,7 @@ def apply_net_policy(
network_params: typing.Dict[str, str],
duration: str,
bridge_name: str,
kubecli: KrknLibKubernetes,
kubecli: KrknKubernetes,
test_execution: str,
) -> typing.List[str]:
"""
@@ -312,7 +312,7 @@ def apply_net_policy(
bridge_name (string):
- bridge to which filter rules need to be applied
kubecli (KrknLibKubernetes)
kubecli (KrknKubernetes)
- Object to interact with Kubernetes Python client
test_execution (String)
@@ -393,7 +393,7 @@ def get_egress_cmd(
def list_bridges(
node: str, pod_template, kubecli: KrknLibKubernetes
node: str, pod_template, kubecli: KrknKubernetes
) -> typing.List[str]:
"""
Function that returns a list of bridges on the node
@@ -406,7 +406,7 @@ def list_bridges(
- The YAML template used to instantiate a pod to query
the node's interface
kubecli (KrknLibKubernetes)
kubecli (KrknKubernetes)
- Object to interact with Kubernetes Python client
Returns:
@@ -437,7 +437,7 @@ def list_bridges(
def check_cookie(
node: str, pod_template, br_name, cookie, kubecli: KrknLibKubernetes
node: str, pod_template, br_name, cookie, kubecli: KrknKubernetes
) -> str:
"""
Function to check for matching flow rules
@@ -496,7 +496,7 @@ def check_cookie(
def get_pod_interface(
node: str, ip: str, pod_template, br_name, kubecli: KrknLibKubernetes
node: str, ip: str, pod_template, br_name, kubecli: KrknKubernetes
) -> str:
"""
Function to query the pod interface on a node
@@ -515,7 +515,7 @@ def get_pod_interface(
br_name (string):
- bridge against which the flows rules need to be checked
kubecli (KrknLibKubernetes)
kubecli (KrknKubernetes)
- Object to interact with Kubernetes Python client
Returns
@@ -576,7 +576,7 @@ def get_pod_interface(
def check_bridge_interface(
node_name: str, pod_template, bridge_name: str, kubecli: KrknLibKubernetes
node_name: str, pod_template, bridge_name: str, kubecli: KrknKubernetes
) -> bool:
"""
Function is used to check if the required OVS or OVN bridge is found in
@@ -593,7 +593,7 @@ def check_bridge_interface(
bridge_name (string):
- bridge name to check for in the node.
kubecli (KrknLibKubernetes)
kubecli (KrknKubernetes)
- Object to interact with Kubernetes Python client
Returns:
@@ -815,7 +815,7 @@ def pod_outage(
node_dict = {}
label_set = set()
kubecli = KrknLibKubernetes(kubeconfig_path=params.kubeconfig_path)
kubecli = KrknKubernetes(kubeconfig_path=params.kubeconfig_path)
api_ext = client.ApiextensionsV1Api(kubecli.api_client)
custom_obj = client.CustomObjectsApi(kubecli.api_client)
@@ -1073,7 +1073,7 @@ def pod_egress_shaping(
param_lst = ["latency", "loss", "bandwidth"]
mod_lst = [i for i in param_lst if i in params.network_params]
kubecli = KrknLibKubernetes(kubeconfig_path=params.kubeconfig_path)
kubecli = KrknKubernetes(kubeconfig_path=params.kubeconfig_path)
api_ext = client.ApiextensionsV1Api(kubecli.api_client)
custom_obj = client.CustomObjectsApi(kubecli.api_client)

View File

@@ -1,16 +1,15 @@
import logging
from arcaflow_plugin_sdk import serialization
import arcaflow_plugin_kill_pod
import kraken.cerberus.setup as cerberus
import kraken.post_actions.actions as post_actions
import krkn_lib_kubernetes
import time
import yaml
import sys
import random
from krkn_lib_kubernetes import ScenarioTelemetry, KrknTelemetry
import arcaflow_plugin_kill_pod
import kraken.cerberus.setup as cerberus
import kraken.post_actions.actions as post_actions
from krkn_lib.k8s import KrknKubernetes
from krkn_lib.telemetry import KrknTelemetry
from krkn_lib.models.telemetry import ScenarioTelemetry
from arcaflow_plugin_sdk import serialization
# Run pod based scenarios
def run(kubeconfig_path, scenarios_list, config, failed_post_scenarios, wait_duration):
@@ -66,13 +65,13 @@ def run(kubeconfig_path, scenarios_list, config, failed_post_scenarios, wait_dur
cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time)
return failed_post_scenarios
# krkn_lib_kubernetes
# krkn_lib
def container_run(kubeconfig_path,
scenarios_list,
config,
failed_post_scenarios,
wait_duration,
kubecli: krkn_lib_kubernetes.KrknLibKubernetes,
kubecli: KrknKubernetes,
telemetry: KrknTelemetry) -> (list[str], list[ScenarioTelemetry]):
failed_scenarios = []
@@ -129,7 +128,7 @@ def container_run(kubeconfig_path,
def container_killing_in_pod(cont_scenario, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
def container_killing_in_pod(cont_scenario, kubecli: KrknKubernetes):
scenario_name = cont_scenario.get("name", "")
namespace = cont_scenario.get("namespace", "*")
label_selector = cont_scenario.get("label_selector", None)
@@ -196,7 +195,7 @@ def container_killing_in_pod(cont_scenario, kubecli: krkn_lib_kubernetes.KrknLib
return killed_container_list
def retry_container_killing(kill_action, podname, namespace, container_name, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
def retry_container_killing(kill_action, podname, namespace, container_name, kubecli: KrknKubernetes):
i = 0
while i < 5:
logging.info("Killing container %s in pod %s (ns %s)" % (str(container_name), str(podname), str(namespace)))
@@ -213,7 +212,7 @@ def retry_container_killing(kill_action, podname, namespace, container_name, kub
continue
def check_failed_containers(killed_container_list, wait_time, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
def check_failed_containers(killed_container_list, wait_time, kubecli: KrknKubernetes):
container_ready = []
timer = 0

View File

@@ -1,16 +1,16 @@
import logging
import random
import re
import sys
import time
import krkn_lib_kubernetes
import yaml
from ..cerberus import setup as cerberus
from krkn_lib_kubernetes import ScenarioTelemetry, KrknTelemetry
from krkn_lib.k8s import KrknKubernetes
from krkn_lib.telemetry import KrknTelemetry
from krkn_lib.models.telemetry import ScenarioTelemetry
# krkn_lib_kubernetes
def run(scenarios_list, config, kubecli: krkn_lib_kubernetes.KrknLibKubernetes, telemetry: KrknTelemetry) -> (list[str], list[ScenarioTelemetry]):
# krkn_lib
def run(scenarios_list, config, kubecli: KrknKubernetes, telemetry: KrknTelemetry) -> (list[str], list[ScenarioTelemetry]):
"""
Reads the scenario config and creates a temp file to fill up the PVC
"""
@@ -317,7 +317,7 @@ def run(scenarios_list, config, kubecli: krkn_lib_kubernetes.KrknLibKubernetes,
# krkn_lib_kubernetes
# krkn_lib
def remove_temp_file(
file_name,
full_path,
@@ -326,7 +326,7 @@ def remove_temp_file(
container_name,
mount_path,
file_size_kb,
kubecli: krkn_lib_kubernetes.KrknLibKubernetes
kubecli: KrknKubernetes
):
command = "rm -f %s" % (str(full_path))
logging.debug("Remove temp file from the PVC command:\n %s" % command)

View File

@@ -1,19 +1,17 @@
#!/usr/bin/env python
import os
import sys
import yaml
import logging
import time
import krkn_lib_kubernetes
from multiprocessing.pool import ThreadPool
from ..cerberus import setup as cerberus
from ..post_actions import actions as post_actions
from ..node_actions.aws_node_scenarios import AWS
from ..node_actions.openstack_node_scenarios import OPENSTACKCLOUD
from ..node_actions.az_node_scenarios import Azure
from ..node_actions.gcp_node_scenarios import GCP
from krkn_lib_kubernetes import ScenarioTelemetry, KrknTelemetry
from krkn_lib.k8s import KrknKubernetes
from krkn_lib.telemetry import KrknTelemetry
from krkn_lib.models.telemetry import ScenarioTelemetry
def multiprocess_nodes(cloud_object_function, nodes):
try:
@@ -40,8 +38,8 @@ def multiprocess_nodes(cloud_object_function, nodes):
# Inject the cluster shut down scenario
# krkn_lib_kubernetes
def cluster_shut_down(shut_down_config, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
# krkn_lib
def cluster_shut_down(shut_down_config, kubecli: KrknKubernetes):
runs = shut_down_config["runs"]
shut_down_duration = shut_down_config["shut_down_duration"]
cloud_type = shut_down_config["cloud_type"]
@@ -128,9 +126,9 @@ def cluster_shut_down(shut_down_config, kubecli: krkn_lib_kubernetes.KrknLibKube
logging.info("Successfully injected cluster_shut_down scenario!")
# krkn_lib_kubernetes
# krkn_lib
def run(scenarios_list, config, wait_duration, kubecli: krkn_lib_kubernetes.KrknLibKubernetes, telemetry: KrknTelemetry) -> (list[str], list[ScenarioTelemetry]):
def run(scenarios_list, config, wait_duration, kubecli: KrknKubernetes, telemetry: KrknTelemetry) -> (list[str], list[ScenarioTelemetry]):
failed_post_scenarios = []
failed_scenarios = []
scenario_telemetries: list[ScenarioTelemetry] = []

View File

@@ -2,16 +2,16 @@ import datetime
import time
import logging
import re
import sys
import yaml
import random
import krkn_lib_kubernetes
from ..cerberus import setup as cerberus
from ..invoke import command as runcommand
from krkn_lib_kubernetes import ScenarioTelemetry, KrknTelemetry
from krkn_lib.k8s import KrknKubernetes
from krkn_lib.telemetry import KrknTelemetry
from krkn_lib.models.telemetry import ScenarioTelemetry
# krkn_lib_kubernetes
def pod_exec(pod_name, command, namespace, container_name, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
# krkn_lib
def pod_exec(pod_name, command, namespace, container_name, kubecli: KrknKubernetes):
for i in range(5):
response = kubecli.exec_cmd_in_pod(
command,
@@ -40,8 +40,8 @@ def node_debug(node_name, command):
return response
# krkn_lib_kubernetes
def get_container_name(pod_name, namespace, kubecli: krkn_lib_kubernetes.KrknLibKubernetes, container_name=""):
# krkn_lib
def get_container_name(pod_name, namespace, kubecli: KrknKubernetes, container_name=""):
container_names = kubecli.get_containers_in_pod(pod_name, namespace)
if container_name != "":
@@ -63,9 +63,9 @@ def get_container_name(pod_name, namespace, kubecli: krkn_lib_kubernetes.KrknLib
return container_name
# krkn_lib_kubernetes
def skew_time(scenario, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
skew_command = "date --set "
# krkn_lib
def skew_time(scenario, kubecli: KrknKubernetes):
skew_command = "date --date "
if scenario["action"] == "skew_date":
skewed_date = "00-01-01"
skew_command += skewed_date
@@ -231,8 +231,8 @@ def string_to_date(obj_datetime):
return datetime.datetime(datetime.MINYEAR, 1, 1)
# krkn_lib_kubernetes
def check_date_time(object_type, names, kubecli: krkn_lib_kubernetes.KrknLibKubernetes):
# krkn_lib
def check_date_time(object_type, names, kubecli: KrknKubernetes):
skew_command = "date"
not_reset = []
max_retries = 30
@@ -307,8 +307,8 @@ def check_date_time(object_type, names, kubecli: krkn_lib_kubernetes.KrknLibKube
return not_reset
# krkn_lib_kubernetes
def run(scenarios_list, config, wait_duration, kubecli: krkn_lib_kubernetes.KrknLibKubernetes, telemetry: KrknTelemetry) -> (list[str], list[ScenarioTelemetry]):
# krkn_lib
def run(scenarios_list, config, wait_duration, kubecli: KrknKubernetes, telemetry: KrknTelemetry) -> (list[str], list[ScenarioTelemetry]):
failed_scenarios = []
scenario_telemetries: list[ScenarioTelemetry] = []
for time_scenario_config in scenarios_list:

View File

@@ -3,7 +3,9 @@ import logging
import time
from ..node_actions.aws_node_scenarios import AWS
from ..cerberus import setup as cerberus
from krkn_lib_kubernetes import ScenarioTelemetry, KrknTelemetry
from krkn_lib.telemetry import KrknTelemetry
from krkn_lib.models.telemetry import ScenarioTelemetry
def run(scenarios_list, config, wait_duration, telemetry: KrknTelemetry) -> (list[str], list[ScenarioTelemetry]) :
"""

View File

@@ -32,9 +32,9 @@ wheel
service_identity
git+https://github.com/vmware/vsphere-automation-sdk-python.git@v8.0.0.0
git+https://github.com/redhat-chaos/arcaflow-plugin-kill-pod.git
arcaflow >= 0.4.1
arcaflow >= 0.6.1
prometheus_api_client
ibm_cloud_sdk_core
ibm_vpc
pytest
krkn-lib-kubernetes >= 0.1.3
krkn-lib >= 1.0.0

View File

@@ -25,7 +25,11 @@ import kraken.arcaflow_plugin as arcaflow_plugin
import server as server
import kraken.prometheus.client as promcli
from kraken import plugins
from krkn_lib_kubernetes import KrknLibKubernetes, KrknTelemetry, ChaosRunTelemetry, SafeLogger
from krkn_lib.k8s import KrknKubernetes
from krkn_lib.telemetry import KrknTelemetry
from krkn_lib.models.telemetry import ChaosRunTelemetry
from krkn_lib.utils import SafeLogger
KUBE_BURNER_URL = (
"https://github.com/cloud-bulldozer/kube-burner/"
@@ -119,7 +123,7 @@ def main(cfg):
kubeconfig_path
os.environ["KUBECONFIG"] = str(kubeconfig_path)
# krkn-lib-kubernetes init
kubecli = KrknLibKubernetes(kubeconfig_path=kubeconfig_path)
kubecli = KrknKubernetes(kubeconfig_path=kubeconfig_path)
except:
kubecli.initialize_clients(None)
@@ -233,7 +237,7 @@ def main(cfg):
telemetry
)
chaos_telemetry.scenarios.extend(scenario_telemetries)
# krkn_lib_kubernetes
# krkn_lib
elif scenario_type == "container_scenarios":
logging.info("Running container scenarios")
failed_post_scenarios, scenario_telemetries = pod_scenarios.container_run(
@@ -248,13 +252,13 @@ def main(cfg):
chaos_telemetry.scenarios.extend(scenario_telemetries)
# Inject node chaos scenarios specified in the config
# krkn_lib_kubernetes
# krkn_lib
elif scenario_type == "node_scenarios":
logging.info("Running node scenarios")
failed_post_scenarios, scenario_telemetries = nodeaction.run(scenarios_list, config, wait_duration, kubecli, telemetry)
chaos_telemetry.scenarios.extend(scenario_telemetries)
# Inject managedcluster chaos scenarios specified in the config
# krkn_lib_kubernetes
# krkn_lib
elif scenario_type == "managedcluster_scenarios":
logging.info("Running managedcluster scenarios")
managedcluster_scenarios.run(
@@ -263,7 +267,7 @@ def main(cfg):
# Inject time skew chaos scenarios specified
# in the config
# krkn_lib_kubernetes
# krkn_lib
elif scenario_type == "time_scenarios":
if distribution == "openshift":
logging.info("Running time skew scenarios")
@@ -316,13 +320,13 @@ def main(cfg):
sys.exit(1)
# Inject cluster shutdown scenarios
# krkn_lib_kubernetes
# krkn_lib
elif scenario_type == "cluster_shut_down_scenarios":
failed_post_scenarios, scenario_telemetries = shut_down.run(scenarios_list, config, wait_duration, kubecli, telemetry)
chaos_telemetry.scenarios.extend(scenario_telemetries)
# Inject namespace chaos scenarios
# krkn_lib_kubernetes
# krkn_lib
elif scenario_type == "namespace_scenarios":
logging.info("Running namespace scenarios")
failed_post_scenarios, scenario_telemetries = namespace_actions.run(
@@ -349,14 +353,14 @@ def main(cfg):
chaos_telemetry.scenarios.extend(scenario_telemetries)
# PVC scenarios
# krkn_lib_kubernetes
# krkn_lib
elif scenario_type == "pvc_scenarios":
logging.info("Running PVC scenario")
failed_post_scenarios, scenario_telemetries = pvc_scenario.run(scenarios_list, config, kubecli, telemetry)
chaos_telemetry.scenarios.extend(scenario_telemetries)
# Network scenarios
# krkn_lib_kubernetes
# krkn_lib
elif scenario_type == "network_chaos":
logging.info("Running Network Chaos")
failed_post_scenarios, scenario_telemetries = network_chaos.run(scenarios_list, config, wait_duration, kubecli, telemetry)

View File

@@ -60,17 +60,17 @@ input:
steps:
kubeconfig:
plugin: quay.io/arcalot/arcaflow-plugin-kubeconfig:latest
plugin: quay.io/arcalot/arcaflow-plugin-kubeconfig:0.2.0
input:
kubeconfig: !expr $.input.kubeconfig
stressng:
plugin: quay.io/arcalot/arcaflow-plugin-stressng:latest
plugin: quay.io/arcalot/arcaflow-plugin-stressng:0.3.1
step: workload
input:
cleanup: "true"
StressNGParams:
timeout: !expr $.input.duration
cleanup: "true"
items:
stressors:
- stressor: cpu
cpu_count: !expr $.input.cpu_count
cpu_method: !expr $.input.cpu_method

View File

@@ -0,0 +1,10 @@
deployer:
connection: {}
type: kubernetes
log:
level: debug
logged_outputs:
error:
level: error
success:
level: debug

View File

@@ -0,0 +1,13 @@
input_list:
- duration: 30s
io_block_size: 1m
io_workers: 1
io_write_bytes: 10m
kubeconfig: ''
namespace: default
node_selector: {}
target_pod_folder: /hog-data
target_pod_volume:
hostPath:
path: /tmp
name: node-volume

View File

@@ -0,0 +1,138 @@
input:
root: RootObject
objects:
RootObject:
id: RootObject
properties:
kubeconfig:
display:
description: The complete kubeconfig file as a string
name: Kubeconfig file contents
type:
type_id: string
required: true
namespace:
display:
description: The namespace where the container will be deployed
name: Namespace
type:
type_id: string
required: true
node_selector:
display:
description: kubernetes node name where the plugin must be deployed
type:
type_id: map
values:
type_id: string
keys:
type_id: string
required: true
duration:
display:
name: duration of the scenario expressed in seconds
description: stop stress test after T seconds. One can also specify the units of time in
seconds, minutes, hours, days or years with the suffix s, m, h, d or y
type:
type_id: string
required: true
io_workers:
display:
description: number of workers
name: start N workers continually writing, reading and removing temporary files
type:
type_id: integer
required: true
io_block_size:
display:
description: single write size
name: specify size of each write in bytes. Size can be from 1 byte to 4MB.
type:
type_id: string
required: true
io_write_bytes:
display:
description: Total number of bytes written
name: write N bytes for each hdd process, the default is 1 GB. One can specify the size
as % of free space on the file system or in units of Bytes, KBytes, MBytes and
GBytes using the suffix b, k, m or g
type:
type_id: string
required: true
target_pod_folder:
display:
description: Target Folder
name: Folder in the pod where the test will be executed and the test files will be written
type:
type_id: string
required: true
target_pod_volume:
display:
name: kubernetes volume definition
description: the volume that will be attached to the pod. In order to stress
the node storage, only hostPath mode is currently supported
type:
type_id: object
id: k8s_volume
properties:
name:
display:
description: name of the volume (must match the name in pod definition)
type:
type_id: string
required: true
hostPath:
display:
description: hostPath options expressed as string map (key-value)
type:
type_id: map
values:
type_id: string
keys:
type_id: string
required: true
required: true
steps:
kubeconfig:
plugin: quay.io/arcalot/arcaflow-plugin-kubeconfig:0.2.0
input:
kubeconfig: !expr $.input.kubeconfig
stressng:
plugin: quay.io/arcalot/arcaflow-plugin-stressng:0.3.1
step: workload
input:
cleanup: "true"
StressNGParams:
timeout: !expr $.input.duration
workdir: !expr $.input.target_pod_folder
stressors:
- stressor: hdd
hdd: !expr $.input.io_workers
hdd_bytes: !expr $.input.io_write_bytes
hdd_write_size: !expr $.input.io_block_size
deploy:
type: kubernetes
connection: !expr $.steps.kubeconfig.outputs.success.connection
pod:
metadata:
namespace: !expr $.input.namespace
labels:
arcaflow: stressng
spec:
nodeSelector: !expr $.input.node_selector
pluginContainer:
imagePullPolicy: Always
securityContext:
privileged: true
volumeMounts:
- mountPath: /hog-data
name: node-volume
volumes:
- !expr $.input.target_pod_volume
outputs:
success:
stressng: !expr $.steps.stressng.outputs.success

View File

@@ -0,0 +1,113 @@
input:
root: RootObject
objects:
RootObject:
id: RootObject
properties:
input_list:
type:
type_id: list
items:
id: input_item
type_id: object
properties:
kubeconfig:
display:
description: The complete kubeconfig file as a string
name: Kubeconfig file contents
type:
type_id: string
required: true
namespace:
display:
description: The namespace where the container will be deployed
name: Namespace
type:
type_id: string
required: true
node_selector:
display:
description: kubernetes node name where the plugin must be deployed
type:
type_id: map
values:
type_id: string
keys:
type_id: string
required: true
duration:
display:
name: duration of the scenario expressed in seconds
description: stop stress test after T seconds. One can also specify the units of time in
seconds, minutes, hours, days or years with the suffix s, m, h, d or y
type:
type_id: string
required: true
io_workers:
display:
description: number of workers
name: start N workers continually writing, reading and removing temporary files
type:
type_id: integer
required: true
io_block_size:
display:
description: single write size
name: specify size of each write in bytes. Size can be from 1 byte to 4MB.
type:
type_id: string
required: true
io_write_bytes:
display:
description: Total number of bytes written
name: write N bytes for each hdd process, the default is 1 GB. One can specify the size
as % of free space on the file system or in units of Bytes, KBytes, MBytes and
GBytes using the suffix b, k, m or g
type:
type_id: string
required: true
target_pod_folder:
display:
description: Target Folder
name: Folder in the pod where the test will be executed and the test files will be written
type:
type_id: string
required: true
target_pod_volume:
display:
name: kubernetes volume definition
description: the volume that will be attached to the pod. In order to stress
the node storage, only hostPath mode is currently supported
type:
type_id: object
id: k8s_volume
properties:
name:
display:
description: name of the volume (must match the name in pod definition)
type:
type_id: string
required: true
hostPath:
display:
description: hostPath options expressed as string map (key-value)
type:
type_id: map
values:
type_id: string
keys:
type_id: string
required: true
required: true
steps:
workload_loop:
kind: foreach
items: !expr $.input.input_list
workflow: sub-workflow.yaml
parallelism: 1000
outputs:
success:
workloads: !expr $.steps.workload_loop.outputs.success.data

View File

@@ -52,17 +52,17 @@ input:
steps:
kubeconfig:
plugin: quay.io/arcalot/arcaflow-plugin-kubeconfig:latest
plugin: quay.io/arcalot/arcaflow-plugin-kubeconfig:0.2.0
input:
kubeconfig: !expr $.input.kubeconfig
stressng:
plugin: quay.io/arcalot/arcaflow-plugin-stressng:latest
plugin: quay.io/arcalot/arcaflow-plugin-stressng:0.3.1
step: workload
input:
cleanup: "true"
StressNGParams:
timeout: !expr $.input.duration
cleanup: "true"
items:
stressors:
- stressor: vm
vm: !expr $.input.vm_workers
vm_bytes: !expr $.input.vm_bytes

View File

@@ -0,0 +1,9 @@
- id: kill-pods
config:
namespace_pattern: ^openshift-monitoring$
label_selector: statefulset.kubernetes.io/pod-name=prometheus-k8s-0
- id: wait-for-pods
config:
namespace_pattern: ^openshift-monitoring$
label_selector: statefulset.kubernetes.io/pod-name=prometheus-k8s-0
count: 1