Compare commits

..

5 Commits

Author SHA1 Message Date
Paige Rubendall
f154bcb692 adding krkn report location
Signed-off-by: Paige Rubendall <prubenda@redhat.com>
2024-01-25 10:45:01 -05:00
Naga Ravi Chaitanya Elluri
60ece4b1b8 Use 0.38.0 wheel version to fix security vulnerability
Reported by https://snyk.io/

Signed-off-by: Naga Ravi Chaitanya Elluri <nelluri@redhat.com>
2024-01-25 09:51:19 -05:00
Naga Ravi Chaitanya Elluri
d660542a40 Add CNCF trademark guidelines and update community members (#560)
Signed-off-by: Naga Ravi Chaitanya Elluri <nelluri@redhat.com>
2024-01-24 14:13:53 -05:00
Naga Ravi Chaitanya Elluri
2e651798fa Update redhat-chaos references with krkn-chaos
The tools are now hosted under https://github.com/krkn-chaos

Signed-off-by: Naga Ravi Chaitanya Elluri <nelluri@redhat.com>
2024-01-24 13:40:39 -05:00
Tullio Sebastiani
f801dfce54 functional tests pointing to real scenario config files
Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>

typo

Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>

app_outage fix

Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>

typo

Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>

typo

Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>
2024-01-18 12:54:39 -05:00
52 changed files with 111 additions and 782 deletions

View File

@@ -68,14 +68,14 @@ jobs:
yq -i '.kraken.port="8081"' CI/config/common_test_config.yaml
yq -i '.kraken.signal_address="0.0.0.0"' CI/config/common_test_config.yaml
yq -i '.kraken.performance_monitoring="localhost:9090"' CI/config/common_test_config.yaml
echo "test_app_outages" > ./CI/tests/my_tests
echo "test_container" >> ./CI/tests/my_tests
echo "test_namespace" >> ./CI/tests/my_tests
echo "test_net_chaos" >> ./CI/tests/my_tests
echo "test_time" >> ./CI/tests/my_tests
echo "test_arca_cpu_hog" >> ./CI/tests/my_tests
echo "test_arca_memory_hog" >> ./CI/tests/my_tests
echo "test_arca_io_hog" >> ./CI/tests/my_tests
echo "test_app_outages" > ./CI/tests/functional_tests
echo "test_container" >> ./CI/tests/functional_tests
echo "test_namespace" >> ./CI/tests/functional_tests
echo "test_net_chaos" >> ./CI/tests/functional_tests
echo "test_time" >> ./CI/tests/functional_tests
echo "test_arca_cpu_hog" >> ./CI/tests/functional_tests
echo "test_arca_memory_hog" >> ./CI/tests/functional_tests
echo "test_arca_io_hog" >> ./CI/tests/functional_tests
- name: Run Functional tests
run: |
./CI/run.sh

2
.gitignore vendored
View File

@@ -61,7 +61,7 @@ inspect.local.*
!CI/config/common_test_config.yaml
CI/out/*
CI/ci_results
CI/scenarios/*node.yaml
CI/legacy/*node.yaml
CI/results.markdown
#env

View File

@@ -1,7 +1,7 @@
## CI Tests
### First steps
Edit [my_tests](tests/my_tests) with tests you want to run
Edit [functional_tests](tests/functional_tests) with tests you want to run
### How to run
```./CI/run.sh```
@@ -11,7 +11,7 @@ This will run kraken using python, make sure python3 is set up and configured pr
### Adding a test case
1. Add in simple scenario yaml file to execute under [../CI/scenarios/](scenarios)
1. Add a simple scenario yaml file to execute under [../CI/legacy/](legacy)
2. Copy [test_app_outages.sh](tests/test_app_outages.sh) as an example of how to get started
@@ -27,7 +27,7 @@ This will run kraken using python, make sure python3 is set up and configured pr
e. 15: Make sure name of config in line 14 matches what you pass on this line
4. Add test name to [my_tests](../CI/tests/my_tests) file
4. Add test name to [functional_tests](../CI/tests/functional_tests) file
a. This will be the name of the file without ".sh"

View File

@@ -17,7 +17,7 @@ wait_cluster_become_ready() {
ci_tests_loc="CI/tests/my_tests"
ci_tests_loc="CI/tests/functional_tests"
echo -e "********* Running Functional Tests Suite *********\n\n"
@@ -37,7 +37,7 @@ echo '-----------------------|--------|---------' >> $results
# Run each test
failed_tests=()
for test_name in `cat CI/tests/my_tests`
for test_name in `cat CI/tests/functional_tests`
do
wait_cluster_become_ready
return_value=`./CI/run_test.sh $test_name $results`

View File

@@ -1,5 +0,0 @@
application_outage: # Scenario to create an outage of an application by blocking traffic
duration: 10 # Duration in seconds after which the routes will be accessible
namespace: openshift-monitoring # Namespace to target - all application routes will go inaccessible if pod selector is empty
pod_selector: {} # Pods to target
block: [Ingress, Egress] # It can be Ingress or Egress or Ingress, Egress

View File

@@ -1,12 +0,0 @@
---
deployers:
image:
connection: {}
deployer_name: kubernetes
log:
level: debug
logged_outputs:
error:
level: error
success:
level: debug

View File

@@ -1,9 +0,0 @@
input_list:
- cpu_count: 1
cpu_load_percentage: 80
cpu_method: all
duration: 1s
kubeconfig: ''
namespace: default
node_selector:
kubernetes.io/hostname: kind-worker2

View File

@@ -1,98 +0,0 @@
version: v0.2.0
input:
root: RootObject
objects:
RootObject:
id: input_item
properties:
kubeconfig:
display:
description: The complete kubeconfig file as a string
name: Kubeconfig file contents
type:
type_id: string
required: true
namespace:
display:
description: The namespace where the container will be deployed
name: Namespace
type:
type_id: string
required: true
node_selector:
display:
description: kubernetes node name where the plugin must be deployed
type:
type_id: map
values:
type_id: string
keys:
type_id: string
required: true
duration:
display:
name: duration the scenario expressed in seconds
description: stop stress test after T seconds. One can also specify the units of time in
seconds, minutes, hours, days or years with the suffix s, m, h, d or y
type:
type_id: string
required: true
cpu_count:
display:
description: Number of CPU cores to be used (0 means all)
name: number of CPUs
type:
type_id: integer
required: true
cpu_method:
display:
description: CPU stress method
name: fine grained control of which cpu stressors to use (ackermann, cfloat etc.)
type:
type_id: string
required: true
cpu_load_percentage:
display:
description: load CPU by percentage
name: CPU load
type:
type_id: integer
required: true
steps:
kubeconfig:
plugin:
src: quay.io/arcalot/arcaflow-plugin-kubeconfig:0.2.0
deployment_type: image
input:
kubeconfig: !expr $.input.kubeconfig
stressng:
plugin:
src: quay.io/arcalot/arcaflow-plugin-stressng:0.5.0
deployment_type: image
step: workload
input:
cleanup: "true"
StressNGParams:
timeout: !expr $.input.duration
stressors:
- stressor: cpu
cpu_count: !expr $.input.cpu_count
cpu_method: !expr $.input.cpu_method
cpu_load: !expr $.input.cpu_load_percentage
deploy:
deployer_name: kubernetes
connection: !expr $.steps.kubeconfig.outputs.success.connection
pod:
metadata:
namespace: !expr $.input.namespace
labels:
arcaflow: stressng
spec:
nodeSelector: !expr $.input.node_selector
pluginContainer:
imagePullPolicy: Always
outputs:
success:
stressng: !expr $.steps.stressng.outputs.success

View File

@@ -1,77 +0,0 @@
version: v0.2.0
input:
root: RootObject
objects:
RootObject:
id: RootObject
properties:
input_list:
type:
type_id: list
items:
id: input_item
type_id: object
properties:
kubeconfig:
display:
description: The complete kubeconfig file as a string
name: Kubeconfig file contents
type:
type_id: string
required: true
namespace:
display:
description: The namespace where the container will be deployed
name: Namespace
type:
type_id: string
required: true
node_selector:
display:
description: kubernetes node name where the plugin must be deployed
type:
type_id: map
values:
type_id: string
keys:
type_id: string
required: true
duration:
display:
name: duration the scenario expressed in seconds
description: stop stress test after T seconds. One can also specify the units of time in
seconds, minutes, hours, days or years with the suffix s, m, h, d or y
type:
type_id: string
required: true
cpu_count:
display:
description: Number of CPU cores to be used (0 means all)
name: number of CPUs
type:
type_id: integer
required: true
cpu_method:
display:
description: CPU stress method
name: fine grained control of which cpu stressors to use (ackermann, cfloat etc.)
type:
type_id: string
required: true
cpu_load_percentage:
display:
description: load CPU by percentage
name: CPU load
type:
type_id: integer
required: true
steps:
workload_loop:
kind: foreach
items: !expr $.input.input_list
workflow: sub-workflow.yaml
parallelism: 1000
outputs:
success:
workloads: !expr $.steps.workload_loop.outputs.success.data

View File

@@ -1,11 +0,0 @@
deployers:
image:
connection: {}
deployer_name: kubernetes
log:
level: debug
logged_outputs:
error:
level: error
success:
level: debug

View File

@@ -1,14 +0,0 @@
input_list:
- duration: 30s
io_block_size: 1m
io_workers: 1
io_write_bytes: 10m
kubeconfig: ''
namespace: default
node_selector:
kubernetes.io/hostname: kind-worker2
target_pod_folder: /hog-data
target_pod_volume:
hostPath:
path: /tmp
name: node-volume

View File

@@ -1,142 +0,0 @@
version: v0.2.0
input:
root: RootObject
objects:
hostPath:
id: HostPathVolumeSource
properties:
path:
type:
type_id: string
Volume:
id: Volume
properties:
name:
type:
type_id: string
hostPath:
type:
id: hostPath
type_id: ref
RootObject:
id: input_item
properties:
kubeconfig:
display:
description: The complete kubeconfig file as a string
name: Kubeconfig file contents
type:
type_id: string
required: true
namespace:
display:
description: The namespace where the container will be deployed
name: Namespace
type:
type_id: string
required: true
node_selector:
display:
description: kubernetes node name where the plugin must be deployed
type:
type_id: map
values:
type_id: string
keys:
type_id: string
required: true
duration:
display:
name: duration the scenario expressed in seconds
description: stop stress test after T seconds. One can also specify the units of time in
seconds, minutes, hours, days or years with the suffix s, m, h, d or y
type:
type_id: string
required: true
io_workers:
display:
description: number of workers
name: start N workers continually writing, reading and removing temporary files
type:
type_id: integer
required: true
io_block_size:
display:
description: single write size
name: specify size of each write in bytes. Size can be from 1 byte to 4MB.
type:
type_id: string
required: true
io_write_bytes:
display:
description: Total number of bytes written
name: write N bytes for each hdd process, the default is 1 GB. One can specify the size
as % of free space on the file system or in units of Bytes, KBytes, MBytes and
GBytes using the suffix b, k, m or g
type:
type_id: string
required: true
target_pod_folder:
display:
description: Target Folder
name: Folder in the pod where the test will be executed and the test files will be written
type:
type_id: string
required: true
target_pod_volume:
display:
name: kubernetes volume definition
description: the volume that will be attached to the pod. In order to stress
the node storage only hosPath mode is currently supported
type:
type_id: ref
id: Volume
required: true
steps:
kubeconfig:
plugin:
src: quay.io/arcalot/arcaflow-plugin-kubeconfig:0.2.0
deployment_type: image
input:
kubeconfig: !expr $.input.kubeconfig
stressng:
plugin:
src: quay.io/arcalot/arcaflow-plugin-stressng:0.5.0
deployment_type: image
step: workload
input:
cleanup: "true"
StressNGParams:
timeout: !expr $.input.duration
workdir: !expr $.input.target_pod_folder
stressors:
- stressor: hdd
hdd: !expr $.input.io_workers
hdd_bytes: !expr $.input.io_write_bytes
hdd_write_size: !expr $.input.io_block_size
deploy:
deployer_name: kubernetes
connection: !expr $.steps.kubeconfig.outputs.success.connection
pod:
metadata:
namespace: !expr $.input.namespace
labels:
arcaflow: stressng
spec:
nodeSelector: !expr $.input.node_selector
pluginContainer:
imagePullPolicy: Always
securityContext:
privileged: true
volumeMounts:
- mountPath: /hog-data
name: node-volume
volumes:
- !expr $.input.target_pod_volume
outputs:
success:
stressng: !expr $.steps.stressng.outputs.success

View File

@@ -1,113 +0,0 @@
version: v0.2.0
input:
root: RootObject
objects:
hostPath:
id: HostPathVolumeSource
properties:
path:
type:
type_id: string
Volume:
id: Volume
properties:
name:
type:
type_id: string
hostPath:
type:
id: hostPath
type_id: ref
RootObject:
id: RootObject
properties:
input_list:
type:
type_id: list
items:
id: input_item
type_id: object
properties:
kubeconfig:
display:
description: The complete kubeconfig file as a string
name: Kubeconfig file contents
type:
type_id: string
required: true
namespace:
display:
description: The namespace where the container will be deployed
name: Namespace
type:
type_id: string
required: true
node_selector:
display:
description: kubernetes node name where the plugin must be deployed
type:
type_id: map
values:
type_id: string
keys:
type_id: string
required: true
duration:
display:
name: duration the scenario expressed in seconds
description: stop stress test after T seconds. One can also specify the units of time in
seconds, minutes, hours, days or years with the suffix s, m, h, d or y
type:
type_id: string
required: true
io_workers:
display:
description: number of workers
name: start N workers continually writing, reading and removing temporary files
type:
type_id: integer
required: true
io_block_size:
display:
description: single write size
name: specify size of each write in bytes. Size can be from 1 byte to 4MB.
type:
type_id: string
required: true
io_write_bytes:
display:
description: Total number of bytes written
name: write N bytes for each hdd process, the default is 1 GB. One can specify the size
as % of free space on the file system or in units of Bytes, KBytes, MBytes and
GBytes using the suffix b, k, m or g
type:
type_id: string
required: true
target_pod_folder:
display:
description: Target Folder
name: Folder in the pod where the test will be executed and the test files will be written
type:
type_id: string
required: true
target_pod_volume:
display:
name: kubernetes volume definition
description: the volume that will be attached to the pod. In order to stress
the node storage only hosPath mode is currently supported
type:
type_id: ref
id: Volume
required: true
steps:
workload_loop:
kind: foreach
items: !expr $.input.input_list
workflow: sub-workflow.yaml
parallelism: 1000
outputs:
success:
workloads: !expr $.steps.workload_loop.outputs.success.data

View File

@@ -1,12 +0,0 @@
---
deployers:
image:
connection: {}
deployer_name: kubernetes
log:
level: debug
logged_outputs:
error:
level: error
success:
level: debug

View File

@@ -1,14 +0,0 @@
input_list:
- duration: 30s
vm_bytes: 10%
vm_workers: 2
node_selector:
kubernetes.io/hostname: kind-worker2
# node selector example
# node_selector:
# kubernetes.io/hostname: master
kubeconfig: ""
namespace: default
# duplicate this section to run simultaneous stressors in the same run

View File

@@ -1,90 +0,0 @@
version: v0.2.0
input:
root: RootObject
objects:
RootObject:
id: input_item
properties:
kubeconfig:
display:
description: The complete kubeconfig file as a string
name: Kubeconfig file contents
type:
type_id: string
required: true
namespace:
display:
description: The namespace where the container will be deployed
name: Namespace
type:
type_id: string
required: true
node_selector:
display:
description: kubernetes node name where the plugin must be deployed
type:
type_id: map
values:
type_id: string
keys:
type_id: string
required: true
duration:
display:
name: duration the scenario expressed in seconds
description: stop stress test after T seconds. One can also specify the units of time in seconds, minutes, hours, days or years with the suffix s, m, h, d or y
type:
type_id: string
required: true
vm_workers:
display:
description: Number of VM stressors to be run (0 means 1 stressor per CPU)
name: Number of VM stressors
type:
type_id: integer
required: true
vm_bytes:
display:
description: N bytes per vm process, the default is 256MB. The size can be expressed in units of Bytes, KBytes, MBytes and GBytes using the suffix b, k, m or g.
name: Kubeconfig file contents
type:
type_id: string
required: true
steps:
kubeconfig:
plugin:
src: quay.io/arcalot/arcaflow-plugin-kubeconfig:0.2.0
deployment_type: image
input:
kubeconfig: !expr $.input.kubeconfig
stressng:
plugin:
src: quay.io/arcalot/arcaflow-plugin-stressng:0.5.0
deployment_type: image
step: workload
input:
cleanup: "true"
StressNGParams:
timeout: !expr $.input.duration
stressors:
- stressor: vm
vm: !expr $.input.vm_workers
vm_bytes: !expr $.input.vm_bytes
deploy:
deployer_name: kubernetes
connection: !expr $.steps.kubeconfig.outputs.success.connection
pod:
metadata:
namespace: !expr $.input.namespace
labels:
arcaflow: stressng
spec:
nodeSelector: !expr $.input.node_selector
pluginContainer:
imagePullPolicy: Always
outputs:
success:
stressng: !expr $.steps.stressng.outputs.success

View File

@@ -1,73 +0,0 @@
version: v0.2.0
input:
root: RootObject
objects:
RootObject:
id: RootObject
properties:
input_list:
type:
type_id: list
items:
id: input_item
type_id: object
properties:
kubeconfig:
display:
description: The complete kubeconfig file as a string
name: Kubeconfig file contents
type:
type_id: string
required: true
namespace:
display:
description: The namespace where the container will be deployed
name: Namespace
type:
type_id: string
required: true
node_selector:
display:
description: kubernetes node name where the plugin must be deployed
type:
type_id: map
values:
type_id: string
keys:
type_id: string
required: true
duration:
display:
name: duration the scenario expressed in seconds
description: stop stress test after T seconds. One can also specify the units of time in seconds, minutes, hours, days or years with the suffix s, m, h, d or y
type:
type_id: string
required: true
vm_workers:
display:
description: Number of VM stressors to be run (0 means 1 stressor per CPU)
name: Number of VM stressors
type:
type_id: integer
required: true
vm_bytes:
display:
description: N bytes per vm process, the default is 256MB. The size can be expressed in units of Bytes, KBytes, MBytes and GBytes using the suffix b, k, m or g.
name: Kubeconfig file contents
type:
type_id: string
required: true
steps:
workload_loop:
kind: foreach
items: !expr $.input.input_list
workflow: sub-workflow.yaml
parallelism: 1000
outputs:
success:
workloads: !expr $.steps.workload_loop.outputs.success.data

View File

@@ -1,8 +0,0 @@
scenarios:
- name: "kill test container"
namespace: "default"
label_selector: "scenario=container"
container_name: "fedtools"
action: 1
count: 1
retry_wait: 60

View File

@@ -1,8 +0,0 @@
network_chaos: # Scenario to create an outage by simulating random variations in the network.
duration: 10 # seconds
instance_count: 1
node_name: kind-worker2
execution: serial
egress:
bandwidth: 100mbit

View File

@@ -1,7 +0,0 @@
scenarios:
- action: delete
namespace: "^namespace-scenario$"
label_selector:
runs: 1
sleep: 15
wait_time: 30

View File

@@ -1,5 +0,0 @@
time_scenarios:
- action: skew_time
object_type: pod
label_selector: scenario=time-skew
container_name: ""

View File

@@ -7,10 +7,11 @@ trap finish EXIT
function functional_test_app_outage {
yq -i '.application_outage.pod_selector={"scenario":"outage"}' CI/scenarios/app_outage.yaml
yq -i '.application_outage.namespace="default"' CI/scenarios/app_outage.yaml
yq -i '.application_outage.duration=10' scenarios/openshift/app_outage.yaml
yq -i '.application_outage.pod_selector={"scenario":"outage"}' scenarios/openshift/app_outage.yaml
yq -i '.application_outage.namespace="default"' scenarios/openshift/app_outage.yaml
export scenario_type="application_outages"
export scenario_file="CI/scenarios/app_outage.yaml"
export scenario_file="scenarios/openshift/app_outage.yaml"
export post_config=""
envsubst < CI/config/common_test_config.yaml > CI/config/app_outage.yaml
python3 -m coverage run -a run_kraken.py -c CI/config/app_outage.yaml

View File

@@ -7,8 +7,9 @@ trap finish EXIT
function functional_test_arca_cpu_hog {
yq -i '.input_list[0].node_selector={"kubernetes.io/hostname":"kind-worker2"}' scenarios/arcaflow/cpu-hog/input.yaml
export scenario_type="arcaflow_scenarios"
export scenario_file="CI/scenarios/arcaflow/cpu-hog/input.yaml"
export scenario_file="scenarios/arcaflow/cpu-hog/input.yaml"
export post_config=""
envsubst < CI/config/common_test_config.yaml > CI/config/arca_cpu_hog.yaml
python3 -m coverage run -a run_kraken.py -c CI/config/arca_cpu_hog.yaml

View File

@@ -7,8 +7,9 @@ trap finish EXIT
function functional_test_arca_io_hog {
yq -i '.input_list[0].node_selector={"kubernetes.io/hostname":"kind-worker2"}' scenarios/arcaflow/io-hog/input.yaml
export scenario_type="arcaflow_scenarios"
export scenario_file="CI/scenarios/arcaflow/io-hog/input.yaml"
export scenario_file="scenarios/arcaflow/io-hog/input.yaml"
export post_config=""
envsubst < CI/config/common_test_config.yaml > CI/config/arca_io_hog.yaml
python3 -m coverage run -a run_kraken.py -c CI/config/arca_io_hog.yaml

View File

@@ -7,8 +7,9 @@ trap finish EXIT
function functional_test_arca_memory_hog {
yq -i '.input_list[0].node_selector={"kubernetes.io/hostname":"kind-worker2"}' scenarios/arcaflow/memory-hog/input.yaml
export scenario_type="arcaflow_scenarios"
export scenario_file="CI/scenarios/arcaflow/memory-hog/input.yaml"
export scenario_file="scenarios/arcaflow/memory-hog/input.yaml"
export post_config=""
envsubst < CI/config/common_test_config.yaml > CI/config/arca_memory_hog.yaml
python3 -m coverage run -a run_kraken.py -c CI/config/arca_memory_hog.yaml

View File

@@ -8,9 +8,11 @@ trap finish EXIT
pod_file="CI/scenarios/hello_pod.yaml"
function functional_test_container_crash {
yq -i '.scenarios[0].namespace="default"' scenarios/openshift/app_outage.yaml
yq -i '.scenarios[0].label_selector="scenario=container"' scenarios/openshift/app_outage.yaml
yq -i '.scenarios[0].container_name="fedtools"' scenarios/openshift/app_outage.yaml
export scenario_type="container_scenarios"
export scenario_file="- CI/scenarios/container_scenario.yml"
export scenario_file="- scenarios/openshift/app_outage.yaml"
export post_config=""
envsubst < CI/config/common_test_config.yaml > CI/config/container_config.yaml

View File

@@ -7,12 +7,13 @@ trap finish EXIT
function funtional_test_namespace_deletion {
export scenario_type="namespace_scenarios"
export scenario_file="- CI/scenarios/network_diagnostics_namespace.yaml"
export scenario_file="- scenarios/openshift/ingress_namespace.yaml"
export post_config=""
yq '.scenarios.[0].namespace="^openshift-network-diagnostics$"' -i CI/scenarios/network_diagnostics_namespace.yaml
yq '.scenarios[0].namespace="^namespace-scenario$"' -i scenarios/openshift/ingress_namespace.yaml
yq '.scenarios[0].wait_time=30' -i scenarios/openshift/ingress_namespace.yaml
yq '.scenarios[0].action="delete"' -i scenarios/openshift/ingress_namespace.yaml
envsubst < CI/config/common_test_config.yaml > CI/config/namespace_config.yaml
python3 -m coverage run -a run_kraken.py -c CI/config/namespace_config.yaml
echo $?
echo "Namespace scenario test: Success"
}

View File

@@ -7,9 +7,16 @@ trap finish EXIT
function functional_test_network_chaos {
yq -i '.network_chaos.duration=10' scenarios/openshift/network_chaos.yaml
yq -i '.network_chaos.node_name="kind-worker2"' scenarios/openshift/network_chaos.yaml
yq -i '.network_chaos.egress.bandwidth="100mbit"' scenarios/openshift/network_chaos.yaml
yq -i 'del(.network_chaos.interfaces)' scenarios/openshift/network_chaos.yaml
yq -i 'del(.network_chaos.label_selector)' scenarios/openshift/network_chaos.yaml
yq -i 'del(.network_chaos.egress.latency)' scenarios/openshift/network_chaos.yaml
yq -i 'del(.network_chaos.egress.loss)' scenarios/openshift/network_chaos.yaml
export scenario_type="network_chaos"
export scenario_file="CI/scenarios/network_chaos.yaml"
export scenario_file="scenarios/openshift/network_chaos.yaml"
export post_config=""
envsubst < CI/config/common_test_config.yaml > CI/config/network_chaos.yaml
python3 -m coverage run -a run_kraken.py -c CI/config/network_chaos.yaml

View File

@@ -7,8 +7,12 @@ trap finish EXIT
function functional_test_time_scenario {
yq -i '.time_scenarios[0].label_selector="scenario=time-skew"' scenarios/openshift/time_scenarios_example.yml
yq -i '.time_scenarios[0].container_name=""' scenarios/openshift/time_scenarios_example.yml
yq -i '.time_scenarios[0].namespace="default"' scenarios/openshift/time_scenarios_example.yml
yq -i '.time_scenarios[1].label_selector="kubernetes.io/hostname=kind-worker2"' scenarios/openshift/time_scenarios_example.yml
export scenario_type="time_scenarios"
export scenario_file="CI/scenarios/time_scenarios.yml"
export scenario_file="scenarios/openshift/time_scenarios_example.yml"
export post_config=""
envsubst < CI/config/common_test_config.yaml > CI/config/time_config.yaml

View File

@@ -1,6 +1,6 @@
# Krkn aka Kraken
[![Docker Repository on Quay](https://quay.io/repository/redhat-chaos/krkn/status "Docker Repository on Quay")](https://quay.io/repository/redhat-chaos/krkn?tab=tags&tag=latest)
![Workflow-Status](https://github.com/redhat-chaos/krkn/actions/workflows/docker-image.yml/badge.svg)
[![Docker Repository on Quay](https://quay.io/repository/krkn-chaos/krkn/status "Docker Repository on Quay")](https://quay.io/repository/krkn-chaos/krkn?tab=tags&tag=latest)
![Workflow-Status](https://github.com/krkn-chaos/krkn/actions/workflows/docker-image.yml/badge.svg)
![Krkn logo](media/logo.png)
@@ -79,7 +79,7 @@ Scenario type | Kubernetes
### Kraken scenario pass/fail criteria and report
It is important to make sure to check if the targeted component recovered from the chaos injection and also if the Kubernetes cluster is healthy as failures in one component can have an adverse impact on other components. Kraken does this by:
- Having built in checks for pod and node based scenarios to ensure the expected number of replicas and nodes are up. It also supports running custom scripts with the checks.
- Leveraging [Cerberus](https://github.com/redhat-chaos/cerberus) to monitor the cluster under test and consuming the aggregated go/no-go signal to determine pass/fail post chaos. It is highly recommended to turn on the Cerberus health check feature available in Kraken. Instructions on installing and setting up Cerberus can be found [here](https://github.com/openshift-scale/cerberus#installation) or can be installed from Kraken using the [instructions](https://github.com/redhat-chaos/krkn#setting-up-infrastructure-dependencies). Once Cerberus is up and running, set cerberus_enabled to True and cerberus_url to the url where Cerberus publishes go/no-go signal in the Kraken config file. Cerberus can monitor [application routes](https://github.com/redhat-chaos/cerberus/blob/main/docs/config.md#watch-routes) during the chaos and fails the run if it encounters downtime as it is a potential downtime in a customers, or users environment as well. It is especially important during the control plane chaos scenarios including the API server, Etcd, Ingress etc. It can be enabled by setting `check_applicaton_routes: True` in the [Kraken config](https://github.com/redhat-chaos/krkn/blob/main/config/config.yaml) provided application routes are being monitored in the [cerberus config](https://github.com/redhat-chaos/krkn/blob/main/config/cerberus.yaml).
- Leveraging [Cerberus](https://github.com/krkn-chaos/cerberus) to monitor the cluster under test and consuming the aggregated go/no-go signal to determine pass/fail post chaos. It is highly recommended to turn on the Cerberus health check feature available in Kraken. Instructions on installing and setting up Cerberus can be found [here](https://github.com/openshift-scale/cerberus#installation) or can be installed from Kraken using the [instructions](https://github.com/krkn-chaos/krkn#setting-up-infrastructure-dependencies). Once Cerberus is up and running, set cerberus_enabled to True and cerberus_url to the url where Cerberus publishes go/no-go signal in the Kraken config file. Cerberus can monitor [application routes](https://github.com/krkn-chaos/cerberus/blob/main/docs/config.md#watch-routes) during the chaos and fails the run if it encounters downtime, as it is a potential downtime in a customer's or user's environment as well. It is especially important during the control plane chaos scenarios including the API server, Etcd, Ingress etc. It can be enabled by setting `check_applicaton_routes: True` in the [Kraken config](https://github.com/krkn-chaos/krkn/blob/main/config/config.yaml) provided application routes are being monitored in the [cerberus config](https://github.com/krkn-chaos/krkn/blob/main/config/cerberus.yaml).
- Leveraging built-in alert collection feature to fail the runs in case of critical alerts.
### Signaling
@@ -103,7 +103,7 @@ Information on enabling and leveraging this feature can be found [here](docs/SLO
### OCM / ACM integration
Kraken supports injecting faults into [Open Cluster Management (OCM)](https://open-cluster-management.io/) and [Red Hat Advanced Cluster Management for Kubernetes (ACM)](https://www.redhat.com/en/technologies/management/advanced-cluster-management) managed clusters through [ManagedCluster Scenarios](docs/managedcluster_scenarios.md).
Kraken supports injecting faults into [Open Cluster Management (OCM)](https://open-cluster-management.io/) and [Red Hat Advanced Cluster Management for Kubernetes (ACM)](https://www.redhat.com/en/technologies/management/advanced-cluster-management) managed clusters through [ManagedCluster Scenarios](docs/managedcluster_scenarios.md).
### Blogs and other useful resources
@@ -129,6 +129,7 @@ Please read [this file]((CI/README.md#adding-a-test-case)) for more information
### Community
Key Members(slack_usernames/full name): paigerube14/Paige Rubendall, mffiedler/Mike Fiedler, ravielluri/Naga Ravi Chaitanya Elluri.
* [**#krkn on Kubernetes Slack**](https://kubernetes.slack.com)
* [**#forum-chaos on CoreOS Slack internal to Red Hat**](https://coreos.slack.com)
Key Members(slack_usernames/full name): paigerube14/Paige Rubendall, mffiedler/Mike Fiedler, tsebasti/Tullio Sebastiani, yogi/Yogananth Subramanian, sahil/Sahil Shah, pradeep/Pradeep Surisetty and ravielluri/Naga Ravi Chaitanya Elluri.
* [**#krkn on Kubernetes Slack**](https://kubernetes.slack.com/messages/C05SFMHRWK1)
The Linux Foundation® (TLF) has registered trademarks and uses trademarks. For a list of TLF trademarks, see [Trademark Usage](https://www.linuxfoundation.org/legal/trademark-usage).

View File

@@ -2,14 +2,14 @@
The following is a list of enhancements that we are planning to add support for in Krkn. Of course, any help/contributions are greatly appreciated.
- [ ] [Ability to run multiple chaos scenarios in parallel under load to mimic real world outages](https://github.com/redhat-chaos/krkn/issues/424)
- [x] [Centralized storage for chaos experiments artifacts](https://github.com/redhat-chaos/krkn/issues/423)
- [ ] [Support for causing DNS outages](https://github.com/redhat-chaos/krkn/issues/394)
- [x] [Chaos recommender](https://github.com/redhat-chaos/krkn/tree/main/utils/chaos-recommender) to suggest scenarios having probability of impacting the service under test using profiling results
- [ ] [Ability to run multiple chaos scenarios in parallel under load to mimic real world outages](https://github.com/krkn-chaos/krkn/issues/424)
- [x] [Centralized storage for chaos experiments artifacts](https://github.com/krkn-chaos/krkn/issues/423)
- [ ] [Support for causing DNS outages](https://github.com/krkn-chaos/krkn/issues/394)
- [x] [Chaos recommender](https://github.com/krkn-chaos/krkn/tree/main/utils/chaos-recommender) to suggest scenarios having probability of impacting the service under test using profiling results
- [ ] Chaos AI integration to improve and automate test coverage
- [x] [Support for pod level network traffic shaping](https://github.com/redhat-chaos/krkn/issues/393)
- [ ] [Ability to visualize the metrics that are being captured by Kraken and stored in Elasticsearch](https://github.com/redhat-chaos/krkn/issues/124)
- [ ] Support for running all the scenarios of Kraken on Kubernetes distribution - see https://github.com/redhat-chaos/krkn/issues/185, https://github.com/redhat-chaos/krkn/issues/186
- [ ] Continue to improve [Chaos Testing Guide](https://redhat-chaos.github.io/krkn) in terms of adding best practices, test environment recommendations and scenarios to make sure the OpenShift platform, as well the applications running on top it, are resilient and performant under chaotic conditions.
- [ ] [Switch documentation references to Kubernetes](https://github.com/redhat-chaos/krkn/issues/495)
- [ ] [OCP and Kubernetes functionalities segregation](https://github.com/redhat-chaos/krkn/issues/497)
- [x] [Support for pod level network traffic shaping](https://github.com/krkn-chaos/krkn/issues/393)
- [ ] [Ability to visualize the metrics that are being captured by Kraken and stored in Elasticsearch](https://github.com/krkn-chaos/krkn/issues/124)
- [ ] Support for running all the scenarios of Kraken on Kubernetes distribution - see https://github.com/krkn-chaos/krkn/issues/185, https://github.com/krkn-chaos/krkn/issues/186
- [ ] Continue to improve [Chaos Testing Guide](https://krkn-chaos.github.io/krkn) in terms of adding best practices, test environment recommendations and scenarios to make sure the OpenShift platform, as well as the applications running on top of it, are resilient and performant under chaotic conditions.
- [ ] [Switch documentation references to Kubernetes](https://github.com/krkn-chaos/krkn/issues/495)
- [ ] [OCP and Kubernetes functionalities segregation](https://github.com/krkn-chaos/krkn/issues/497)

View File

@@ -1,5 +1,5 @@
#### Kubernetes/OpenShift cluster shut down scenario
Scenario to shut down all the nodes including the masters and restart them after specified duration. Cluster shut down scenario can be injected by placing the shut_down config file under cluster_shut_down_scenario option in the kraken config. Refer to [cluster_shut_down_scenario](https://github.com/redhat-chaos/krkn/blob/main/scenarios/cluster_shut_down_scenario.yml) config file.
#### Kubernetes cluster shut down scenario
Scenario to shut down all the nodes including the masters and restart them after specified duration. Cluster shut down scenario can be injected by placing the shut_down config file under cluster_shut_down_scenario option in the kraken config. Refer to [cluster_shut_down_scenario](https://github.com/krkn-chaos/krkn/blob/main/scenarios/cluster_shut_down_scenario.yml) config file.
Refer to [cloud setup](cloud_setup.md) to configure your cli properly for the cloud provider of the cluster you want to shut down.

View File

@@ -4,7 +4,7 @@ This can be based on the pods namespace or labels. If you know the exact object
These scenarios are in a simple yaml format that you can manipulate to run your specific tests or use the pre-existing scenarios to see how it works.
#### Example Config
The following are the components of Kubernetes/OpenShift for which a basic chaos scenario config exists today.
The following are the components of Kubernetes for which a basic chaos scenario config exists today.
```
scenarios:
@@ -25,7 +25,7 @@ In all scenarios we do a post chaos check to wait and verify the specific compon
Here there are two options:
1. Pass a custom script in the main config scenario list that will run before the chaos and verify the output matches post chaos scenario.
See [scenarios/post_action_etcd_container.py](https://github.com/redhat-chaos/krkn/blob/main/scenarios/post_action_etcd_container.py) for an example.
See [scenarios/post_action_etcd_container.py](https://github.com/krkn-chaos/krkn/blob/main/scenarios/post_action_etcd_container.py) for an example.
```
- container_scenarios: # List of chaos pod scenarios to load.
- - scenarios/container_etcd.yml

View File

@@ -62,7 +62,7 @@ If changes go into the main repository while you're working on your code it is b
If not already configured, set the upstream url for kraken.
```
git remote add upstream https://github.com/redhat-chaos/krkn.git
git remote add upstream https://github.com/krkn-chaos/krkn.git
```
Rebase to upstream master branch.

View File

@@ -3,13 +3,13 @@
The following ways are supported to run Kraken:
- Standalone python program through Git.
- Containerized version using either Podman or Docker as the runtime via [Krkn-hub](https://github.com/redhat-chaos/krkn-hub)
- Containerized version using either Podman or Docker as the runtime via [Krkn-hub](https://github.com/krkn-chaos/krkn-hub)
- Kubernetes or OpenShift deployment ( unsupported )
**NOTE**: It is recommended to run Kraken external to the cluster ( Standalone or Containerized ) hitting the Kubernetes/OpenShift API as running it internal to the cluster might be disruptive to itself and also might not report back the results if the chaos leads to cluster's API server instability.
**NOTE**: To run Kraken on Power (ppc64le) architecture, build and run a containerized version by following the
instructions given [here](https://github.com/redhat-chaos/krkn/blob/main/containers/build_own_image-README.md).
instructions given [here](https://github.com/krkn-chaos/krkn/blob/main/containers/build_own_image-README.md).
**NOTE**: Helper functions for interactions in Krkn are part of [krkn-lib](https://github.com/krkn-chaos/krkn-lib).
Please feel free to reuse and expand them as you see fit when adding a new scenario or expanding
@@ -19,9 +19,9 @@ the capabilities of the current supported scenarios.
### Git
#### Clone the repository
Pick the latest stable release to install [here](https://github.com/redhat-chaos/krkn/releases).
Pick the latest stable release to install [here](https://github.com/krkn-chaos/krkn/releases).
```
$ git clone https://github.com/redhat-chaos/krkn.git --branch <release version>
$ git clone https://github.com/krkn-chaos/krkn.git --branch <release version>
$ cd krkn
```
@@ -40,13 +40,13 @@ $ python3.9 run_kraken.py --config <config_file_location>
```
### Run containerized version
[Krkn-hub](https://github.com/redhat-chaos/krkn-hub) is a wrapper that allows running Krkn chaos scenarios via podman or docker runtime with scenario parameters/configuration defined as environment variables.
[Krkn-hub](https://github.com/krkn-chaos/krkn-hub) is a wrapper that allows running Krkn chaos scenarios via podman or docker runtime with scenario parameters/configuration defined as environment variables.
Refer [instructions](https://github.com/redhat-chaos/krkn-hub#supported-chaos-scenarios) to get started.
Refer [instructions](https://github.com/krkn-chaos/krkn-hub#supported-chaos-scenarios) to get started.
### Run Kraken as a Kubernetes deployment ( unsupported option - standalone or containerized deployers are recommended )
Refer [Instructions](https://github.com/redhat-chaos/krkn/blob/main/containers/README.md) on how to deploy and run Kraken as a Kubernetes/OpenShift deployment.
Refer [Instructions](https://github.com/krkn-chaos/krkn/blob/main/containers/README.md) on how to deploy and run Kraken as a Kubernetes/OpenShift deployment.
Refer to the [chaos-kraken chart manpage](https://artifacthub.io/packages/helm/startx/chaos-kraken)

View File

@@ -16,7 +16,7 @@ Set to '^.*$' and label_selector to "" to randomly select any namespace in your
**sleep:** Number of seconds to wait between each iteration/count of killing namespaces. Defaults to 10 seconds if not set
Refer to [namespace_scenarios_example](https://github.com/redhat-chaos/krkn/blob/main/scenarios/regex_namespace.yaml) config file.
Refer to [namespace_scenarios_example](https://github.com/krkn-chaos/krkn/blob/main/scenarios/regex_namespace.yaml) config file.
```
scenarios:

View File

@@ -16,7 +16,7 @@ Configuration Options:
**object_name:** List of the names of pods or nodes you want to skew.
Refer to [time_scenarios_example](https://github.com/redhat-chaos/krkn/blob/main/scenarios/time_scenarios_example.yml) config file.
Refer to [time_scenarios_example](https://github.com/krkn-chaos/krkn/blob/main/scenarios/time_scenarios_example.yml) config file.
```
time_scenarios:

View File

@@ -11,7 +11,7 @@ coverage
datetime
docker
docker-compose
git+https://github.com/redhat-chaos/arcaflow-plugin-kill-pod.git
git+https://github.com/krkn-chaos/arcaflow-plugin-kill-pod.git
git+https://github.com/vmware/vsphere-automation-sdk-python.git@v8.0.0.0
gitpython
google-api-python-client
@@ -35,6 +35,6 @@ requests
service_identity
setuptools==65.5.1
werkzeug==3.0.1
wheel
wheel>=0.38.0
zope.interface==5.4.0
pandas<2.0.0
pandas>=2.2.0

View File

@@ -34,7 +34,7 @@ from krkn_lib.utils import SafeLogger
from krkn_lib.utils.functions import get_yaml_item_value
report_file = ""
# Main function
def main(cfg):
@@ -414,10 +414,9 @@ def main(cfg):
)
sys.exit(1)
run_dir = os.getcwd() + "/kraken.report"
logging.info(
"Successfully finished running Kraken. UUID for the run: "
"%s. Report generated at %s. Exiting" % (run_uuid, run_dir)
"%s. Report generated at %s. Exiting" % (run_uuid, report_file)
)
else:
logging.error("Cannot find a config at %s, please check" % (cfg))
@@ -434,12 +433,21 @@ if __name__ == "__main__":
help="config location",
default="config/config.yaml",
)
parser.add_option(
"-o",
"--output",
dest="output",
help="output report location",
default="kraken.report",
)
(options, args) = parser.parse_args()
report_file = options.output
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s [%(levelname)s] %(message)s",
handlers=[
logging.FileHandler("kraken.report", mode="w"),
logging.FileHandler(report_file, mode="w"),
logging.StreamHandler(),
],
)

View File

@@ -1,8 +1,9 @@
input_list:
- cpu_count: 1
cpu_load_percentage: 80
cpu_method: all
duration: 1s
kubeconfig: ''
namespace: default
node_selector: {}
- cpu_count: 1
cpu_load_percentage: 80
cpu_method: all
duration: 1s
kubeconfig: ''
namespace: default
node_selector:
kubernetes.io/hostname: kind-worker2

View File

@@ -3,6 +3,6 @@ scenarios:
namespace: "kube-system"
label_selector: "k8s-app=kube-dns"
container_name: ""
action: "kill 1"
action: 1
count: 1
retry_wait: 60

View File

@@ -1,12 +1,11 @@
network_chaos: # Scenario to create an outage by simulating random variations in the network.
duration: 300 # seconds
node_name: # node on which scenario has to be injected;
label_selector: <label_selector> # when node_name is not specified, a node with matching label_selector is selected for running the scenario.
network_chaos: # Scenario to create an outage by simulating random variations in the network.
duration: 300 # seconds
node_name: # node on which scenario has to be injected;
label_selector: <label_selector> # when node_name is not specified, a node with matching label_selector is selected for running the scenario.
instance_count: 1
interfaces: # Interface name would be the Kernel host network interface name.
- "<interface_name>"
interfaces: # Interface name would be the Kernel host network interface name.
- "<interface_name>"
execution: serial
egress:
latency: 50ms # 50ms
loss: 0.02 # percentage
bandwidth: 100mbit
latency: 50ms # 50ms
loss: 0.02 # percentage

View File

@@ -17,7 +17,7 @@ This tool profiles an application and gathers telemetry data such as CPU, Memory
```
$ python3.9 -m venv chaos
$ source chaos/bin/activate
$ git clone https://github.com/redhat-chaos/krkn.git
$ git clone https://github.com/krkn-chaos/krkn.git
$ cd krkn
$ pip3 install -r requirements.txt
$ python3.9 utils/chaos_recommender/chaos_recommender.py
@@ -89,7 +89,7 @@ If you provide the input values through command-line arguments, the correspondin
## Podman & Docker image
To run the recommender image please visit the [krkn-hub](https://github.com/redhat-chaos/krkn-hub for further infos.
To run the recommender image please visit the [krkn-hub](https://github.com/krkn-chaos/krkn-hub) for further information.
## How it works