mirror of
https://github.com/krkn-chaos/krkn.git
synced 2026-04-15 06:57:28 +00:00
Compare commits
41 Commits
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
2327531e46 | ||
|
|
2c14c48a63 | ||
|
|
ab98e416a6 | ||
|
|
19ad2d1a3d | ||
|
|
804d7cbf58 | ||
|
|
54af2fc6ff | ||
|
|
b79e526cfd | ||
|
|
a5efd7d06c | ||
|
|
a1b81bd382 | ||
|
|
782440c8c4 | ||
|
|
7e2755cbb7 | ||
|
|
2babb53d6e | ||
|
|
85f76e9193 | ||
|
|
8bf21392f1 | ||
|
|
606fb60811 | ||
|
|
fac7c3c6fb | ||
|
|
8dd9b30030 | ||
|
|
2d99f17aaf | ||
|
|
50742a793c | ||
|
|
ba6a844544 | ||
|
|
7e7a917dba | ||
|
|
b9c0bb39c7 | ||
|
|
706a886151 | ||
|
|
a1cf9e2c00 | ||
|
|
0f5dfcb823 | ||
|
|
1e1015e6e7 | ||
|
|
c71ce31779 | ||
|
|
1298f220a6 | ||
|
|
24059fb731 | ||
|
|
ab951adb78 | ||
|
|
a9a7fb7e51 | ||
|
|
5a8d5b0fe1 | ||
|
|
c440dc4b51 | ||
|
|
b174c51ee0 | ||
|
|
fec0434ce1 | ||
|
|
1067d5ec8d | ||
|
|
85ea1ef7e1 | ||
|
|
2e38b8b033 | ||
|
|
c7ea366756 | ||
|
|
67d4ee9fa2 | ||
|
|
fa59834bae |
4
.github/workflows/docker-image.yml
vendored
4
.github/workflows/docker-image.yml
vendored
@@ -37,5 +37,5 @@ jobs:
|
|||||||
if: github.ref == 'refs/heads/main' && github.event_name == 'push'
|
if: github.ref == 'refs/heads/main' && github.event_name == 'push'
|
||||||
uses: redhat-chaos/actions/krkn-hub@main
|
uses: redhat-chaos/actions/krkn-hub@main
|
||||||
with:
|
with:
|
||||||
QUAY_USER: ${{ secrets.QUAY_USER_1 }}
|
QUAY_USER: ${{ secrets.QUAY_USERNAME }}
|
||||||
QUAY_TOKEN: ${{ secrets.QUAY_TOKEN_1 }}
|
QUAY_TOKEN: ${{ secrets.QUAY_PASSWORD }}
|
||||||
|
|||||||
39
.github/workflows/tests.yml
vendored
39
.github/workflows/tests.yml
vendored
@@ -1,8 +1,12 @@
|
|||||||
name: Functional & Unit Tests
|
name: Functional & Unit Tests
|
||||||
on:
|
on:
|
||||||
pull_request:
|
pull_request:
|
||||||
|
push:
|
||||||
|
branches:
|
||||||
|
- main
|
||||||
jobs:
|
jobs:
|
||||||
tests:
|
tests:
|
||||||
|
# Common steps
|
||||||
name: Functional & Unit Tests
|
name: Functional & Unit Tests
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
@@ -47,8 +51,7 @@ jobs:
|
|||||||
sudo apt-get install build-essential python3-dev
|
sudo apt-get install build-essential python3-dev
|
||||||
pip install --upgrade pip
|
pip install --upgrade pip
|
||||||
pip install -r requirements.txt
|
pip install -r requirements.txt
|
||||||
# - name: Run unit tests
|
|
||||||
# run: python -m coverage run -a -m unittest discover -s tests -v
|
|
||||||
- name: Deploy test workloads
|
- name: Deploy test workloads
|
||||||
run: |
|
run: |
|
||||||
kubectl apply -f CI/templates/outage_pod.yaml
|
kubectl apply -f CI/templates/outage_pod.yaml
|
||||||
@@ -61,10 +64,14 @@ jobs:
|
|||||||
- name: Get Kind nodes
|
- name: Get Kind nodes
|
||||||
run: |
|
run: |
|
||||||
kubectl get nodes --show-labels=true
|
kubectl get nodes --show-labels=true
|
||||||
|
# Pull request only steps
|
||||||
|
- name: Run unit tests
|
||||||
|
if: github.event_name == 'pull_request'
|
||||||
|
run: python -m coverage run -a -m unittest discover -s tests -v
|
||||||
|
|
||||||
- name: Setup Functional Tests
|
- name: Setup Pull Request Functional Tests
|
||||||
|
if: github.event_name == 'pull_request'
|
||||||
run: |
|
run: |
|
||||||
yq -i '.kraken.distribution="kubernetes"' CI/config/common_test_config.yaml
|
|
||||||
yq -i '.kraken.port="8081"' CI/config/common_test_config.yaml
|
yq -i '.kraken.port="8081"' CI/config/common_test_config.yaml
|
||||||
yq -i '.kraken.signal_address="0.0.0.0"' CI/config/common_test_config.yaml
|
yq -i '.kraken.signal_address="0.0.0.0"' CI/config/common_test_config.yaml
|
||||||
yq -i '.kraken.performance_monitoring="localhost:9090"' CI/config/common_test_config.yaml
|
yq -i '.kraken.performance_monitoring="localhost:9090"' CI/config/common_test_config.yaml
|
||||||
@@ -76,13 +83,33 @@ jobs:
|
|||||||
echo "test_arca_cpu_hog" >> ./CI/tests/functional_tests
|
echo "test_arca_cpu_hog" >> ./CI/tests/functional_tests
|
||||||
echo "test_arca_memory_hog" >> ./CI/tests/functional_tests
|
echo "test_arca_memory_hog" >> ./CI/tests/functional_tests
|
||||||
echo "test_arca_io_hog" >> ./CI/tests/functional_tests
|
echo "test_arca_io_hog" >> ./CI/tests/functional_tests
|
||||||
|
|
||||||
|
# Push on main only steps
|
||||||
|
- name: Configure AWS Credentials
|
||||||
|
if: github.ref == 'refs/heads/main' && github.event_name == 'push'
|
||||||
|
uses: aws-actions/configure-aws-credentials@v4
|
||||||
|
with:
|
||||||
|
aws-access-key-id: ${{ secrets.AWS_ACCESS_KEY_ID }}
|
||||||
|
aws-secret-access-key: ${{ secrets.AWS_SECRET_ACCESS_KEY }}
|
||||||
|
aws-region : ${{ secrets.AWS_REGION }}
|
||||||
|
- name: Setup Post Merge Request Functional Tests
|
||||||
|
if: github.ref == 'refs/heads/main' && github.event_name == 'push'
|
||||||
|
run: |
|
||||||
|
yq -i '.kraken.port="8081"' CI/config/common_test_config.yaml
|
||||||
|
yq -i '.kraken.signal_address="0.0.0.0"' CI/config/common_test_config.yaml
|
||||||
|
yq -i '.kraken.performance_monitoring="localhost:9090"' CI/config/common_test_config.yaml
|
||||||
|
yq -i '.telemetry.username="${{secrets.TELEMETRY_USERNAME}}"' CI/config/common_test_config.yaml
|
||||||
|
yq -i '.telemetry.password="${{secrets.TELEMETRY_PASSWORD}}"' CI/config/common_test_config.yaml
|
||||||
|
echo "test_telemetry" > ./CI/tests/functional_tests
|
||||||
|
|
||||||
|
# Final common steps
|
||||||
- name: Run Functional tests
|
- name: Run Functional tests
|
||||||
|
env:
|
||||||
|
AWS_BUCKET: ${{ secrets.AWS_BUCKET }}
|
||||||
run: |
|
run: |
|
||||||
./CI/run.sh
|
./CI/run.sh
|
||||||
cat ./CI/results.markdown >> $GITHUB_STEP_SUMMARY
|
cat ./CI/results.markdown >> $GITHUB_STEP_SUMMARY
|
||||||
echo >> $GITHUB_STEP_SUMMARY
|
echo >> $GITHUB_STEP_SUMMARY
|
||||||
- name: Run Unit tests
|
|
||||||
run: python -m coverage run -a -m unittest discover -s tests -v
|
|
||||||
- name: Upload CI logs
|
- name: Upload CI logs
|
||||||
uses: actions/upload-artifact@v3
|
uses: actions/upload-artifact@v3
|
||||||
with:
|
with:
|
||||||
|
|||||||
1
.gitignore
vendored
1
.gitignore
vendored
@@ -16,6 +16,7 @@ __pycache__/*
|
|||||||
*.out
|
*.out
|
||||||
kube-burner*
|
kube-burner*
|
||||||
kube_burner*
|
kube_burner*
|
||||||
|
recommender_*.json
|
||||||
|
|
||||||
# Project files
|
# Project files
|
||||||
.ropeproject
|
.ropeproject
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
kraken:
|
kraken:
|
||||||
distribution: openshift # Distribution can be kubernetes or openshift.
|
distribution: kubernetes # Distribution can be kubernetes or openshift.
|
||||||
kubeconfig_path: ~/.kube/config # Path to kubeconfig.
|
kubeconfig_path: ~/.kube/config # Path to kubeconfig.
|
||||||
exit_on_failure: False # Exit when a post action scenario fails.
|
exit_on_failure: False # Exit when a post action scenario fails.
|
||||||
litmus_version: v1.13.6 # Litmus version to install.
|
litmus_version: v1.13.6 # Litmus version to install.
|
||||||
@@ -29,9 +29,12 @@ tunings:
|
|||||||
daemon_mode: False # Iterations are set to infinity which means that the kraken will cause chaos forever.
|
daemon_mode: False # Iterations are set to infinity which means that the kraken will cause chaos forever.
|
||||||
telemetry:
|
telemetry:
|
||||||
enabled: False # enable/disables the telemetry collection feature
|
enabled: False # enable/disables the telemetry collection feature
|
||||||
api_url: https://ulnmf9xv7j.execute-api.us-west-2.amazonaws.com/production #telemetry service endpoint
|
api_url: https://yvnn4rfoi7.execute-api.us-west-2.amazonaws.com/test #telemetry service endpoint
|
||||||
username: username # telemetry service username
|
username: $TELEMETRY_USERNAME # telemetry service username
|
||||||
password: password # telemetry service password
|
password: $TELEMETRY_PASSWORD # telemetry service password
|
||||||
|
prometheus_namespace: 'prometheus-k8s' # prometheus namespace
|
||||||
|
prometheus_pod_name: 'prometheus-kind-prometheus-kube-prome-prometheus-0' # prometheus pod_name
|
||||||
|
prometheus_container_name: 'prometheus'
|
||||||
prometheus_backup: True # enables/disables prometheus data collection
|
prometheus_backup: True # enables/disables prometheus data collection
|
||||||
full_prometheus_backup: False # if is set to False only the /prometheus/wal folder will be downloaded.
|
full_prometheus_backup: False # if is set to False only the /prometheus/wal folder will be downloaded.
|
||||||
backup_threads: 5 # number of telemetry download/upload threads
|
backup_threads: 5 # number of telemetry download/upload threads
|
||||||
@@ -39,3 +42,11 @@ telemetry:
|
|||||||
max_retries: 0 # maximum number of upload retries (if 0 will retry forever)
|
max_retries: 0 # maximum number of upload retries (if 0 will retry forever)
|
||||||
run_tag: '' # if set, this will be appended to the run folder in the bucket (useful to group the runs)
|
run_tag: '' # if set, this will be appended to the run folder in the bucket (useful to group the runs)
|
||||||
archive_size: 10000 # the size of the prometheus data archive size in KB. The lower the size of archive is
|
archive_size: 10000 # the size of the prometheus data archive size in KB. The lower the size of archive is
|
||||||
|
logs_backup: True
|
||||||
|
logs_filter_patterns:
|
||||||
|
- "(\\w{3}\\s\\d{1,2}\\s\\d{2}:\\d{2}:\\d{2}\\.\\d+).+" # Sep 9 11:20:36.123425532
|
||||||
|
- "kinit (\\d+/\\d+/\\d+\\s\\d{2}:\\d{2}:\\d{2})\\s+" # kinit 2023/09/15 11:20:36 log
|
||||||
|
- "(\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}\\.\\d+Z).+" # 2023-09-15T11:20:36.123425532Z log
|
||||||
|
oc_cli_path: /usr/bin/oc # optional, if not specified will be search in $PATH
|
||||||
|
events_backup: True # enables/disables cluster events collection
|
||||||
|
telemetry_group: "funtests"
|
||||||
|
|||||||
@@ -39,7 +39,7 @@ echo '-----------------------|--------|---------' >> $results
|
|||||||
failed_tests=()
|
failed_tests=()
|
||||||
for test_name in `cat CI/tests/functional_tests`
|
for test_name in `cat CI/tests/functional_tests`
|
||||||
do
|
do
|
||||||
wait_cluster_become_ready
|
#wait_cluster_become_ready
|
||||||
return_value=`./CI/run_test.sh $test_name $results`
|
return_value=`./CI/run_test.sh $test_name $results`
|
||||||
if [[ $return_value == 1 ]]
|
if [[ $return_value == 1 ]]
|
||||||
then
|
then
|
||||||
@@ -49,6 +49,7 @@ do
|
|||||||
wait_cluster_become_ready
|
wait_cluster_become_ready
|
||||||
done
|
done
|
||||||
|
|
||||||
|
|
||||||
if (( ${#failed_tests[@]}>0 ))
|
if (( ${#failed_tests[@]}>0 ))
|
||||||
then
|
then
|
||||||
echo -e "\n\n======================================================================"
|
echo -e "\n\n======================================================================"
|
||||||
|
|||||||
@@ -1,15 +1,23 @@
|
|||||||
ERRORED=false
|
ERRORED=false
|
||||||
|
|
||||||
function finish {
|
function finish {
|
||||||
if [ $? -eq 1 ] && [ $ERRORED != "true" ]
|
if [ $? != 0 ] && [ $ERRORED != "true" ]
|
||||||
then
|
then
|
||||||
error
|
error
|
||||||
fi
|
fi
|
||||||
}
|
}
|
||||||
|
|
||||||
function error {
|
function error {
|
||||||
echo "Error caught."
|
exit_code=$?
|
||||||
ERRORED=true
|
if [ $exit_code == 1 ]
|
||||||
|
then
|
||||||
|
echo "Error caught."
|
||||||
|
ERRORED=true
|
||||||
|
elif [ $exit_code == 2 ]
|
||||||
|
then
|
||||||
|
echo "Run with exit code 2 detected, it is expected, wrapping the exit code with 0 to avoid pipeline failure"
|
||||||
|
exit 0
|
||||||
|
fi
|
||||||
}
|
}
|
||||||
|
|
||||||
function get_node {
|
function get_node {
|
||||||
|
|||||||
@@ -8,11 +8,11 @@ trap finish EXIT
|
|||||||
pod_file="CI/scenarios/hello_pod.yaml"
|
pod_file="CI/scenarios/hello_pod.yaml"
|
||||||
|
|
||||||
function functional_test_container_crash {
|
function functional_test_container_crash {
|
||||||
yq -i '.scenarios[0].namespace="default"' scenarios/openshift/app_outage.yaml
|
yq -i '.scenarios[0].namespace="default"' scenarios/openshift/container_etcd.yml
|
||||||
yq -i '.scenarios[0].label_selector="scenario=container"' scenarios/openshift/app_outage.yaml
|
yq -i '.scenarios[0].label_selector="scenario=container"' scenarios/openshift/container_etcd.yml
|
||||||
yq -i '.scenarios[0].container_name="fedtools"' scenarios/openshift/app_outage.yaml
|
yq -i '.scenarios[0].container_name="fedtools"' scenarios/openshift/container_etcd.yml
|
||||||
export scenario_type="container_scenarios"
|
export scenario_type="container_scenarios"
|
||||||
export scenario_file="- scenarios/openshift/app_outage.yaml"
|
export scenario_file="- scenarios/openshift/container_etcd.yml"
|
||||||
export post_config=""
|
export post_config=""
|
||||||
envsubst < CI/config/common_test_config.yaml > CI/config/container_config.yaml
|
envsubst < CI/config/common_test_config.yaml > CI/config/container_config.yaml
|
||||||
|
|
||||||
|
|||||||
37
CI/tests/test_telemetry.sh
Normal file
37
CI/tests/test_telemetry.sh
Normal file
@@ -0,0 +1,37 @@
|
|||||||
|
set -xeEo pipefail
|
||||||
|
|
||||||
|
source CI/tests/common.sh
|
||||||
|
|
||||||
|
trap error ERR
|
||||||
|
trap finish EXIT
|
||||||
|
|
||||||
|
|
||||||
|
function functional_test_telemetry {
|
||||||
|
AWS_CLI=`which aws`
|
||||||
|
[ -z "$AWS_CLI" ]&& echo "AWS cli not found in path" && exit 1
|
||||||
|
[ -z "$AWS_BUCKET" ] && echo "AWS bucket not set in environment" && exit 1
|
||||||
|
|
||||||
|
export RUN_TAG="funtest-telemetry"
|
||||||
|
yq -i '.telemetry.enabled=True' CI/config/common_test_config.yaml
|
||||||
|
yq -i '.telemetry.full_prometheus_backup=True' CI/config/common_test_config.yaml
|
||||||
|
yq -i '.performance_monitoring.check_critical_alerts=True' CI/config/common_test_config.yaml
|
||||||
|
yq -i '.performance_monitoring.prometheus_url="http://localhost:9090"' CI/config/common_test_config.yaml
|
||||||
|
yq -i '.telemetry.run_tag=env(RUN_TAG)' CI/config/common_test_config.yaml
|
||||||
|
|
||||||
|
export scenario_type="arcaflow_scenarios"
|
||||||
|
export scenario_file="scenarios/arcaflow/cpu-hog/input.yaml"
|
||||||
|
export post_config=""
|
||||||
|
envsubst < CI/config/common_test_config.yaml > CI/config/telemetry.yaml
|
||||||
|
retval=$(python3 -m coverage run -a run_kraken.py -c CI/config/telemetry.yaml)
|
||||||
|
RUN_FOLDER=`cat CI/out/test_telemetry.out | grep amazonaws.com | sed -rn "s#.*https:\/\/.*\/files/(.*)#\1#p"`
|
||||||
|
$AWS_CLI s3 ls "s3://$AWS_BUCKET/$RUN_FOLDER/" | awk '{ print $4 }' > s3_remote_files
|
||||||
|
echo "checking if telemetry files are uploaded on s3"
|
||||||
|
cat s3_remote_files | grep events-00.json || ( echo "FAILED: events-00.json not uploaded" && exit 1 )
|
||||||
|
cat s3_remote_files | grep critical-alerts-00.log || ( echo "FAILED: critical-alerts-00.log not uploaded" && exit 1 )
|
||||||
|
cat s3_remote_files | grep prometheus-00.tar || ( echo "FAILED: prometheus backup not uploaded" && exit 1 )
|
||||||
|
cat s3_remote_files | grep telemetry.json || ( echo "FAILED: telemetry.json not uploaded" && exit 1 )
|
||||||
|
echo "all files uploaded!"
|
||||||
|
echo "Telemetry Collection: Success"
|
||||||
|
}
|
||||||
|
|
||||||
|
functional_test_telemetry
|
||||||
@@ -1,5 +1,4 @@
|
|||||||
# Krkn aka Kraken
|
# Krkn aka Kraken
|
||||||
[](https://quay.io/repository/krkn-chaos/krkn?tab=tags&tag=latest)
|
|
||||||

|

|
||||||
|
|
||||||

|

|
||||||
|
|||||||
@@ -8,7 +8,7 @@
|
|||||||
description: 10 minutes avg. 99th etcd fsync latency on {{$labels.pod}} higher than 1s. {{$value}}s
|
description: 10 minutes avg. 99th etcd fsync latency on {{$labels.pod}} higher than 1s. {{$value}}s
|
||||||
severity: error
|
severity: error
|
||||||
|
|
||||||
- expr: avg_over_time(histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[2m]))[10m:]) > 0.007
|
- expr: avg_over_time(histogram_quantile(0.99, rate(etcd_disk_backend_commit_duration_seconds_bucket[2m]))[10m:]) > 0.03
|
||||||
description: 10 minutes avg. 99th etcd commit latency on {{$labels.pod}} higher than 30ms. {{$value}}s
|
description: 10 minutes avg. 99th etcd commit latency on {{$labels.pod}} higher than 30ms. {{$value}}s
|
||||||
severity: warning
|
severity: warning
|
||||||
|
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
kraken:
|
kraken:
|
||||||
distribution: openshift # Distribution can be kubernetes or openshift
|
distribution: kubernetes # Distribution can be kubernetes or openshift
|
||||||
kubeconfig_path: ~/.kube/config # Path to kubeconfig
|
kubeconfig_path: ~/.kube/config # Path to kubeconfig
|
||||||
exit_on_failure: False # Exit when a post action scenario fails
|
exit_on_failure: False # Exit when a post action scenario fails
|
||||||
publish_kraken_status: True # Can be accessed at http://0.0.0.0:8081
|
publish_kraken_status: True # Can be accessed at http://0.0.0.0:8081
|
||||||
@@ -15,7 +15,7 @@ kraken:
|
|||||||
- application_outages:
|
- application_outages:
|
||||||
- scenarios/openshift/app_outage.yaml
|
- scenarios/openshift/app_outage.yaml
|
||||||
- container_scenarios: # List of chaos pod scenarios to load
|
- container_scenarios: # List of chaos pod scenarios to load
|
||||||
- - scenarios/openshift/container_etcd.yml
|
- - scenarios/openshift/container_etcd.yml
|
||||||
- plugin_scenarios:
|
- plugin_scenarios:
|
||||||
- scenarios/openshift/etcd.yml
|
- scenarios/openshift/etcd.yml
|
||||||
- scenarios/openshift/regex_openshift_pod_kill.yml
|
- scenarios/openshift/regex_openshift_pod_kill.yml
|
||||||
@@ -23,7 +23,7 @@ kraken:
|
|||||||
- scenarios/openshift/network_chaos_ingress.yml
|
- scenarios/openshift/network_chaos_ingress.yml
|
||||||
- scenarios/openshift/prom_kill.yml
|
- scenarios/openshift/prom_kill.yml
|
||||||
- node_scenarios: # List of chaos node scenarios to load
|
- node_scenarios: # List of chaos node scenarios to load
|
||||||
- scenarios/openshift/node_scenarios_example.yml
|
- scenarios/openshift/node_scenarios_example.yml
|
||||||
- plugin_scenarios:
|
- plugin_scenarios:
|
||||||
- scenarios/openshift/openshift-apiserver.yml
|
- scenarios/openshift/openshift-apiserver.yml
|
||||||
- scenarios/openshift/openshift-kube-apiserver.yml
|
- scenarios/openshift/openshift-kube-apiserver.yml
|
||||||
@@ -51,7 +51,7 @@ cerberus:
|
|||||||
performance_monitoring:
|
performance_monitoring:
|
||||||
deploy_dashboards: False # Install a mutable grafana and load the performance dashboards. Enable this only when running on OpenShift
|
deploy_dashboards: False # Install a mutable grafana and load the performance dashboards. Enable this only when running on OpenShift
|
||||||
repo: "https://github.com/cloud-bulldozer/performance-dashboards.git"
|
repo: "https://github.com/cloud-bulldozer/performance-dashboards.git"
|
||||||
prometheus_url: # The prometheus url/route is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes.
|
prometheus_url: # The prometheus url/route is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes.
|
||||||
prometheus_bearer_token: # The bearer token is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. This is needed to authenticate with prometheus.
|
prometheus_bearer_token: # The bearer token is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. This is needed to authenticate with prometheus.
|
||||||
uuid: # uuid for the run is generated by default if not set
|
uuid: # uuid for the run is generated by default if not set
|
||||||
enable_alerts: False # Runs the queries specified in the alert profile and displays the info or exits 1 when severity=error
|
enable_alerts: False # Runs the queries specified in the alert profile and displays the info or exits 1 when severity=error
|
||||||
@@ -65,14 +65,19 @@ telemetry:
|
|||||||
enabled: False # enable/disables the telemetry collection feature
|
enabled: False # enable/disables the telemetry collection feature
|
||||||
api_url: https://ulnmf9xv7j.execute-api.us-west-2.amazonaws.com/production #telemetry service endpoint
|
api_url: https://ulnmf9xv7j.execute-api.us-west-2.amazonaws.com/production #telemetry service endpoint
|
||||||
username: username # telemetry service username
|
username: username # telemetry service username
|
||||||
password: password # telemetry service password
|
password: password # telemetry service password
|
||||||
prometheus_backup: True # enables/disables prometheus data collection
|
prometheus_backup: True # enables/disables prometheus data collection
|
||||||
|
prometheus_namespace: "" # namespace where prometheus is deployed (if distribution is kubernetes)
|
||||||
|
prometheus_container_name: "" # name of the prometheus container name (if distribution is kubernetes)
|
||||||
|
prometheus_pod_name: "" # name of the prometheus pod (if distribution is kubernetes)
|
||||||
full_prometheus_backup: False # if is set to False only the /prometheus/wal folder will be downloaded.
|
full_prometheus_backup: False # if is set to False only the /prometheus/wal folder will be downloaded.
|
||||||
backup_threads: 5 # number of telemetry download/upload threads
|
backup_threads: 5 # number of telemetry download/upload threads
|
||||||
archive_path: /tmp # local path where the archive files will be temporarly stored
|
archive_path: /tmp # local path where the archive files will be temporarly stored
|
||||||
max_retries: 0 # maximum number of upload retries (if 0 will retry forever)
|
max_retries: 0 # maximum number of upload retries (if 0 will retry forever)
|
||||||
run_tag: '' # if set, this will be appended to the run folder in the bucket (useful to group the runs)
|
run_tag: '' # if set, this will be appended to the run folder in the bucket (useful to group the runs)
|
||||||
archive_size: 500000 # the size of the prometheus data archive size in KB. The lower the size of archive is
|
archive_size: 500000
|
||||||
|
telemetry_group: '' # if set will archive the telemetry in the S3 bucket on a folder named after the value, otherwise will use "default"
|
||||||
|
# the size of the prometheus data archive size in KB. The lower the size of archive is
|
||||||
# the higher the number of archive files will be produced and uploaded (and processed by backup_threads
|
# the higher the number of archive files will be produced and uploaded (and processed by backup_threads
|
||||||
# simultaneously).
|
# simultaneously).
|
||||||
# For unstable/slow connection is better to keep this value low
|
# For unstable/slow connection is better to keep this value low
|
||||||
@@ -85,6 +90,9 @@ telemetry:
|
|||||||
- "(\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}\\.\\d+Z).+" # 2023-09-15T11:20:36.123425532Z log
|
- "(\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}\\.\\d+Z).+" # 2023-09-15T11:20:36.123425532Z log
|
||||||
oc_cli_path: /usr/bin/oc # optional, if not specified will be search in $PATH
|
oc_cli_path: /usr/bin/oc # optional, if not specified will be search in $PATH
|
||||||
events_backup: True # enables/disables cluster events collection
|
events_backup: True # enables/disables cluster events collection
|
||||||
|
elastic:
|
||||||
|
elastic_url: "" # To track results in elasticsearch, give url to server here; will post telemetry details when url and index not blank
|
||||||
|
elastic_index: "" # Elastic search index pattern to post results to
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -77,3 +77,8 @@ telemetry:
|
|||||||
- "kinit (\\d+/\\d+/\\d+\\s\\d{2}:\\d{2}:\\d{2})\\s+" # kinit 2023/09/15 11:20:36 log
|
- "kinit (\\d+/\\d+/\\d+\\s\\d{2}:\\d{2}:\\d{2})\\s+" # kinit 2023/09/15 11:20:36 log
|
||||||
- "(\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}\\.\\d+Z).+" # 2023-09-15T11:20:36.123425532Z log
|
- "(\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}\\.\\d+Z).+" # 2023-09-15T11:20:36.123425532Z log
|
||||||
oc_cli_path: /usr/bin/oc # optional, if not specified will be search in $PATH
|
oc_cli_path: /usr/bin/oc # optional, if not specified will be search in $PATH
|
||||||
|
elastic:
|
||||||
|
elastic_url: "" # To track results in elasticsearch, give url to server here; will post telemetry details when url and index not blank
|
||||||
|
elastic_index: "" # Elastic search index pattern to post results to
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -1,5 +1,5 @@
|
|||||||
application: openshift-etcd
|
application: openshift-etcd
|
||||||
namespace: openshift-etcd
|
namespaces: openshift-etcd
|
||||||
labels: app=openshift-etcd
|
labels: app=openshift-etcd
|
||||||
kubeconfig: ~/.kube/config.yaml
|
kubeconfig: ~/.kube/config.yaml
|
||||||
prometheus_endpoint: <Prometheus_Endpoint>
|
prometheus_endpoint: <Prometheus_Endpoint>
|
||||||
@@ -7,6 +7,8 @@ auth_token: <Auth_Token>
|
|||||||
scrape_duration: 10m
|
scrape_duration: 10m
|
||||||
chaos_library: "kraken"
|
chaos_library: "kraken"
|
||||||
log_level: INFO
|
log_level: INFO
|
||||||
|
json_output_file: False
|
||||||
|
json_output_folder_path:
|
||||||
|
|
||||||
# for output purpose only do not change if not needed
|
# for output purpose only do not change if not needed
|
||||||
chaos_tests:
|
chaos_tests:
|
||||||
@@ -26,4 +28,8 @@ chaos_tests:
|
|||||||
- pod_network_chaos
|
- pod_network_chaos
|
||||||
MEM:
|
MEM:
|
||||||
- node_memory_hog
|
- node_memory_hog
|
||||||
- pvc_disk_fill
|
- pvc_disk_fill
|
||||||
|
|
||||||
|
threshold: .7
|
||||||
|
cpu_threshold: .5
|
||||||
|
mem_threshold: .5
|
||||||
|
|||||||
@@ -12,7 +12,7 @@ COPY --from=azure-cli /usr/local/bin/az /usr/bin/az
|
|||||||
# Install dependencies
|
# Install dependencies
|
||||||
RUN yum install -y git python39 python3-pip jq gettext wget && \
|
RUN yum install -y git python39 python3-pip jq gettext wget && \
|
||||||
python3.9 -m pip install -U pip && \
|
python3.9 -m pip install -U pip && \
|
||||||
git clone https://github.com/krkn-chaos/krkn.git --branch v1.5.5 /root/kraken && \
|
git clone https://github.com/krkn-chaos/krkn.git --branch v1.5.13 /root/kraken && \
|
||||||
mkdir -p /root/.kube && cd /root/kraken && \
|
mkdir -p /root/.kube && cd /root/kraken && \
|
||||||
pip3.9 install -r requirements.txt && \
|
pip3.9 install -r requirements.txt && \
|
||||||
pip3.9 install virtualenv && \
|
pip3.9 install virtualenv && \
|
||||||
@@ -20,7 +20,7 @@ RUN yum install -y git python39 python3-pip jq gettext wget && \
|
|||||||
|
|
||||||
# Get Kubernetes and OpenShift clients from stable releases
|
# Get Kubernetes and OpenShift clients from stable releases
|
||||||
WORKDIR /tmp
|
WORKDIR /tmp
|
||||||
RUN wget https://mirror.openshift.com/pub/openshift-v4/clients/ocp/stable/openshift-client-linux.tar.gz && tar -xvf openshift-client-linux.tar.gz && cp oc /usr/local/bin/oc && cp kubectl /usr/local/bin/kubectl
|
RUN wget https://mirror.openshift.com/pub/openshift-v4/clients/ocp/stable/openshift-client-linux.tar.gz && tar -xvf openshift-client-linux.tar.gz && cp oc /usr/local/bin/oc && cp oc /usr/bin/oc && cp kubectl /usr/local/bin/kubectl && cp kubectl /usr/bin/kubectl
|
||||||
|
|
||||||
WORKDIR /root/kraken
|
WORKDIR /root/kraken
|
||||||
|
|
||||||
|
|||||||
@@ -14,7 +14,7 @@ COPY --from=azure-cli /usr/local/bin/az /usr/bin/az
|
|||||||
# Install dependencies
|
# Install dependencies
|
||||||
RUN yum install -y git python39 python3-pip jq gettext wget && \
|
RUN yum install -y git python39 python3-pip jq gettext wget && \
|
||||||
python3.9 -m pip install -U pip && \
|
python3.9 -m pip install -U pip && \
|
||||||
git clone https://github.com/redhat-chaos/krkn.git --branch v1.5.5 /root/kraken && \
|
git clone https://github.com/redhat-chaos/krkn.git --branch v1.5.13 /root/kraken && \
|
||||||
mkdir -p /root/.kube && cd /root/kraken && \
|
mkdir -p /root/.kube && cd /root/kraken && \
|
||||||
pip3.9 install -r requirements.txt && \
|
pip3.9 install -r requirements.txt && \
|
||||||
pip3.9 install virtualenv && \
|
pip3.9 install virtualenv && \
|
||||||
@@ -22,7 +22,7 @@ RUN yum install -y git python39 python3-pip jq gettext wget && \
|
|||||||
|
|
||||||
# Get Kubernetes and OpenShift clients from stable releases
|
# Get Kubernetes and OpenShift clients from stable releases
|
||||||
WORKDIR /tmp
|
WORKDIR /tmp
|
||||||
RUN wget https://mirror.openshift.com/pub/openshift-v4/clients/ocp/stable/openshift-client-linux.tar.gz && tar -xvf openshift-client-linux.tar.gz && cp oc /usr/local/bin/oc && cp kubectl /usr/local/bin/kubectl
|
RUN wget https://mirror.openshift.com/pub/openshift-v4/clients/ocp/stable/openshift-client-linux.tar.gz && tar -xvf openshift-client-linux.tar.gz && cp oc /usr/local/bin/oc && cp oc /usr/bin/oc && cp kubectl /usr/local/bin/kubectl && cp kubectl /usr/bin/kubectl
|
||||||
|
|
||||||
WORKDIR /root/kraken
|
WORKDIR /root/kraken
|
||||||
|
|
||||||
|
|||||||
@@ -14,11 +14,7 @@ For example, for adding a pod level scenario for a new application, refer to the
|
|||||||
namespace_pattern: ^<namespace>$
|
namespace_pattern: ^<namespace>$
|
||||||
label_selector: <pod label>
|
label_selector: <pod label>
|
||||||
kill: <number of pods to kill>
|
kill: <number of pods to kill>
|
||||||
- id: wait-for-pods
|
krkn_pod_recovery_time: <expected time for the pod to become ready>
|
||||||
config:
|
|
||||||
namespace_pattern: ^<namespace>$
|
|
||||||
label_selector: <pod label>
|
|
||||||
count: <expected number of pods that match namespace and label>
|
|
||||||
```
|
```
|
||||||
|
|
||||||
#### Node Scenario Yaml Template
|
#### Node Scenario Yaml Template
|
||||||
|
|||||||
@@ -22,7 +22,7 @@ the capabilities of the current supported scenarios.
|
|||||||
Pick the latest stable release to install [here](https://github.com/krkn-chaos/krkn/releases).
|
Pick the latest stable release to install [here](https://github.com/krkn-chaos/krkn/releases).
|
||||||
```
|
```
|
||||||
$ git clone https://github.com/krkn-chaos/krkn.git --branch <release version>
|
$ git clone https://github.com/krkn-chaos/krkn.git --branch <release version>
|
||||||
$ cd kraken
|
$ cd krkn
|
||||||
```
|
```
|
||||||
|
|
||||||
#### Install the dependencies
|
#### Install the dependencies
|
||||||
|
|||||||
@@ -17,11 +17,8 @@ You can then create the scenario file with the following contents:
|
|||||||
config:
|
config:
|
||||||
namespace_pattern: ^kube-system$
|
namespace_pattern: ^kube-system$
|
||||||
label_selector: k8s-app=kube-scheduler
|
label_selector: k8s-app=kube-scheduler
|
||||||
- id: wait-for-pods
|
krkn_pod_recovery_time: 120
|
||||||
config:
|
|
||||||
namespace_pattern: ^kube-system$
|
|
||||||
label_selector: k8s-app=kube-scheduler
|
|
||||||
count: 3
|
|
||||||
```
|
```
|
||||||
|
|
||||||
Please adjust the schema reference to point to the [schema file](../scenarios/plugin.schema.json). This file will give you code completion and documentation for the available options in your IDE.
|
Please adjust the schema reference to point to the [schema file](../scenarios/plugin.schema.json). This file will give you code completion and documentation for the available options in your IDE.
|
||||||
|
|||||||
@@ -4,6 +4,7 @@ import time
|
|||||||
import kraken.cerberus.setup as cerberus
|
import kraken.cerberus.setup as cerberus
|
||||||
from jinja2 import Template
|
from jinja2 import Template
|
||||||
import kraken.invoke.command as runcommand
|
import kraken.invoke.command as runcommand
|
||||||
|
from krkn_lib.k8s import KrknKubernetes
|
||||||
from krkn_lib.telemetry.k8s import KrknTelemetryKubernetes
|
from krkn_lib.telemetry.k8s import KrknTelemetryKubernetes
|
||||||
from krkn_lib.models.telemetry import ScenarioTelemetry
|
from krkn_lib.models.telemetry import ScenarioTelemetry
|
||||||
from krkn_lib.utils.functions import get_yaml_item_value, log_exception
|
from krkn_lib.utils.functions import get_yaml_item_value, log_exception
|
||||||
@@ -11,7 +12,7 @@ from krkn_lib.utils.functions import get_yaml_item_value, log_exception
|
|||||||
|
|
||||||
# Reads the scenario config, applies and deletes a network policy to
|
# Reads the scenario config, applies and deletes a network policy to
|
||||||
# block the traffic for the specified duration
|
# block the traffic for the specified duration
|
||||||
def run(scenarios_list, config, wait_duration, telemetry: KrknTelemetryKubernetes) -> (list[str], list[ScenarioTelemetry]):
|
def run(scenarios_list, config, wait_duration,kubecli: KrknKubernetes, telemetry: KrknTelemetryKubernetes) -> (list[str], list[ScenarioTelemetry]):
|
||||||
failed_post_scenarios = ""
|
failed_post_scenarios = ""
|
||||||
scenario_telemetries: list[ScenarioTelemetry] = []
|
scenario_telemetries: list[ScenarioTelemetry] = []
|
||||||
failed_scenarios = []
|
failed_scenarios = []
|
||||||
@@ -49,25 +50,22 @@ spec:
|
|||||||
podSelector:
|
podSelector:
|
||||||
matchLabels: {{ pod_selector }}
|
matchLabels: {{ pod_selector }}
|
||||||
policyTypes: {{ traffic_type }}
|
policyTypes: {{ traffic_type }}
|
||||||
"""
|
"""
|
||||||
t = Template(network_policy_template)
|
t = Template(network_policy_template)
|
||||||
rendered_spec = t.render(pod_selector=pod_selector, traffic_type=traffic_type)
|
rendered_spec = t.render(pod_selector=pod_selector, traffic_type=traffic_type)
|
||||||
# Write the rendered template to a file
|
yaml_spec = yaml.safe_load(rendered_spec)
|
||||||
with open("kraken_network_policy.yaml", "w") as f:
|
|
||||||
f.write(rendered_spec)
|
|
||||||
# Block the traffic by creating network policy
|
# Block the traffic by creating network policy
|
||||||
logging.info("Creating the network policy")
|
logging.info("Creating the network policy")
|
||||||
runcommand.invoke(
|
|
||||||
"kubectl create -f %s -n %s --validate=false" % ("kraken_network_policy.yaml", namespace)
|
|
||||||
)
|
|
||||||
|
|
||||||
|
kubecli.create_net_policy(yaml_spec, namespace)
|
||||||
|
|
||||||
# wait for the specified duration
|
# wait for the specified duration
|
||||||
logging.info("Waiting for the specified duration in the config: %s" % (duration))
|
logging.info("Waiting for the specified duration in the config: %s" % (duration))
|
||||||
time.sleep(duration)
|
time.sleep(duration)
|
||||||
|
|
||||||
# unblock the traffic by deleting the network policy
|
# unblock the traffic by deleting the network policy
|
||||||
logging.info("Deleting the network policy")
|
logging.info("Deleting the network policy")
|
||||||
runcommand.invoke("kubectl delete -f %s -n %s" % ("kraken_network_policy.yaml", namespace))
|
kubecli.delete_net_policy("kraken-deny", namespace)
|
||||||
|
|
||||||
logging.info("End of scenario. Waiting for the specified duration: %s" % (wait_duration))
|
logging.info("End of scenario. Waiting for the specified duration: %s" % (wait_duration))
|
||||||
time.sleep(wait_duration)
|
time.sleep(wait_duration)
|
||||||
|
|||||||
@@ -4,13 +4,10 @@ import pandas as pd
|
|||||||
import kraken.chaos_recommender.kraken_tests as kraken_tests
|
import kraken.chaos_recommender.kraken_tests as kraken_tests
|
||||||
import time
|
import time
|
||||||
|
|
||||||
threshold = .7 # Adjust the threshold as needed
|
|
||||||
heatmap_cpu_threshold = .5
|
|
||||||
heatmap_mem_threshold = .5
|
|
||||||
|
|
||||||
KRAKEN_TESTS_PATH = "./kraken_chaos_tests.txt"
|
KRAKEN_TESTS_PATH = "./kraken_chaos_tests.txt"
|
||||||
|
|
||||||
#Placeholder, this should be done with topology
|
|
||||||
|
# Placeholder, this should be done with topology
|
||||||
def return_critical_services():
|
def return_critical_services():
|
||||||
return ["web", "cart"]
|
return ["web", "cart"]
|
||||||
|
|
||||||
@@ -19,15 +16,18 @@ def load_telemetry_data(file_path):
|
|||||||
data = pd.read_csv(file_path, delimiter=r"\s+")
|
data = pd.read_csv(file_path, delimiter=r"\s+")
|
||||||
return data
|
return data
|
||||||
|
|
||||||
|
|
||||||
def calculate_zscores(data):
|
def calculate_zscores(data):
|
||||||
zscores = pd.DataFrame()
|
zscores = pd.DataFrame()
|
||||||
|
zscores["Namespace"] = data["namespace"]
|
||||||
zscores["Service"] = data["service"]
|
zscores["Service"] = data["service"]
|
||||||
zscores["CPU"] = (data["CPU"] - data["CPU"].mean()) / data["CPU"].std()
|
zscores["CPU"] = (data["CPU"] - data["CPU"].mean()) / data["CPU"].std()
|
||||||
zscores["Memory"] = (data["MEM"] - data["MEM"].mean()) / data["MEM"].std()
|
zscores["Memory"] = (data["MEM"] - data["MEM"].mean()) / data["MEM"].std()
|
||||||
zscores["Network"] = (data["NETWORK"] - data["NETWORK"].mean()) / data["NETWORK"].std()
|
zscores["Network"] = (data["NETWORK"] - data["NETWORK"].mean()) / data["NETWORK"].std()
|
||||||
return zscores
|
return zscores
|
||||||
|
|
||||||
def identify_outliers(data):
|
|
||||||
|
def identify_outliers(data, threshold):
|
||||||
outliers_cpu = data[data["CPU"] > threshold]["Service"].tolist()
|
outliers_cpu = data[data["CPU"] > threshold]["Service"].tolist()
|
||||||
outliers_memory = data[data["Memory"] > threshold]["Service"].tolist()
|
outliers_memory = data[data["Memory"] > threshold]["Service"].tolist()
|
||||||
outliers_network = data[data["Network"] > threshold]["Service"].tolist()
|
outliers_network = data[data["Network"] > threshold]["Service"].tolist()
|
||||||
@@ -47,44 +47,85 @@ def get_services_above_heatmap_threshold(dataframe, cpu_threshold, mem_threshold
|
|||||||
return cpu_services, mem_services
|
return cpu_services, mem_services
|
||||||
|
|
||||||
|
|
||||||
def analysis(file_path, chaos_tests_config):
|
def analysis(file_path, namespaces, chaos_tests_config, threshold,
|
||||||
|
heatmap_cpu_threshold, heatmap_mem_threshold):
|
||||||
# Load the telemetry data from file
|
# Load the telemetry data from file
|
||||||
|
logging.info("Fetching the Telemetry data...")
|
||||||
data = load_telemetry_data(file_path)
|
data = load_telemetry_data(file_path)
|
||||||
|
|
||||||
# Calculate Z-scores for CPU, Memory, and Network columns
|
# Calculate Z-scores for CPU, Memory, and Network columns
|
||||||
zscores = calculate_zscores(data)
|
zscores = calculate_zscores(data)
|
||||||
|
# Dict for saving analysis data -- key is the namespace
|
||||||
|
analysis_data = {}
|
||||||
|
|
||||||
# Identify outliers
|
# Identify outliers for each namespace
|
||||||
outliers_cpu, outliers_memory, outliers_network = identify_outliers(zscores)
|
for namespace in namespaces:
|
||||||
cpu_services, mem_services = get_services_above_heatmap_threshold(data, heatmap_cpu_threshold, heatmap_mem_threshold)
|
|
||||||
|
|
||||||
# Display the identified outliers
|
logging.info(f"Identifying outliers for namespace {namespace}...")
|
||||||
logging.info("======================== Profiling ==================================")
|
|
||||||
logging.info(f"CPU outliers: {outliers_cpu}")
|
namespace_zscores = zscores.loc[zscores["Namespace"] == namespace]
|
||||||
logging.info(f"Memory outliers: {outliers_memory}")
|
namespace_data = data.loc[data["namespace"] == namespace]
|
||||||
logging.info(f"Network outliers: {outliers_network}")
|
outliers_cpu, outliers_memory, outliers_network = identify_outliers(
|
||||||
logging.info("===================== HeatMap Analysis ==============================")
|
namespace_zscores, threshold)
|
||||||
|
cpu_services, mem_services = get_services_above_heatmap_threshold(
|
||||||
|
namespace_data, heatmap_cpu_threshold, heatmap_mem_threshold)
|
||||||
|
|
||||||
|
analysis_data[namespace] = analysis_json(outliers_cpu, outliers_memory,
|
||||||
|
outliers_network,
|
||||||
|
cpu_services, mem_services,
|
||||||
|
chaos_tests_config)
|
||||||
|
|
||||||
|
if cpu_services:
|
||||||
|
logging.info(f"These services use significant CPU compared to "
|
||||||
|
f"their assigned limits: {cpu_services}")
|
||||||
|
else:
|
||||||
|
logging.info("There are no services that are using significant "
|
||||||
|
"CPU compared to their assigned limits "
|
||||||
|
"(infinite in case no limits are set).")
|
||||||
|
if mem_services:
|
||||||
|
logging.info(f"These services use significant MEMORY compared to "
|
||||||
|
f"their assigned limits: {mem_services}")
|
||||||
|
else:
|
||||||
|
logging.info("There are no services that are using significant "
|
||||||
|
"MEMORY compared to their assigned limits "
|
||||||
|
"(infinite in case no limits are set).")
|
||||||
|
time.sleep(2)
|
||||||
|
|
||||||
|
logging.info("Please check data in utilisation.txt for further analysis")
|
||||||
|
|
||||||
|
return analysis_data
|
||||||
|
|
||||||
|
|
||||||
|
def analysis_json(outliers_cpu, outliers_memory, outliers_network,
|
||||||
|
cpu_services, mem_services, chaos_tests_config):
|
||||||
|
|
||||||
|
profiling = {
|
||||||
|
"cpu_outliers": outliers_cpu,
|
||||||
|
"memory_outliers": outliers_memory,
|
||||||
|
"network_outliers": outliers_network
|
||||||
|
}
|
||||||
|
|
||||||
|
heatmap = {
|
||||||
|
"services_with_cpu_heatmap_above_threshold": cpu_services,
|
||||||
|
"services_with_mem_heatmap_above_threshold": mem_services
|
||||||
|
}
|
||||||
|
|
||||||
|
recommendations = {}
|
||||||
|
|
||||||
if cpu_services:
|
if cpu_services:
|
||||||
logging.info("Services with CPU_HEATMAP above threshold:", cpu_services)
|
cpu_recommend = {"services": cpu_services,
|
||||||
else:
|
"tests": chaos_tests_config['CPU']}
|
||||||
logging.info("There are no services that are using siginificant CPU compared to their assigned limits (infinite in case no limits are set).")
|
recommendations["cpu_services_recommendations"] = cpu_recommend
|
||||||
|
|
||||||
if mem_services:
|
if mem_services:
|
||||||
logging.info("Services with MEM_HEATMAP above threshold:", mem_services)
|
mem_recommend = {"services": mem_services,
|
||||||
else:
|
"tests": chaos_tests_config['MEM']}
|
||||||
logging.info("There are no services that are using siginificant MEMORY compared to their assigned limits (infinite in case no limits are set).")
|
recommendations["mem_services_recommendations"] = mem_recommend
|
||||||
time.sleep(2)
|
|
||||||
logging.info("======================= Recommendations =============================")
|
|
||||||
if cpu_services:
|
|
||||||
logging.info(f"Recommended tests for {str(cpu_services)} :\n {chaos_tests_config['CPU']}")
|
|
||||||
logging.info("\n")
|
|
||||||
if mem_services:
|
|
||||||
logging.info(f"Recommended tests for {str(mem_services)} :\n {chaos_tests_config['MEM']}")
|
|
||||||
logging.info("\n")
|
|
||||||
|
|
||||||
if outliers_network:
|
if outliers_network:
|
||||||
logging.info(f"Recommended tests for str(outliers_network) :\n {chaos_tests_config['NETWORK']}")
|
outliers_network_recommend = {"outliers_networks": outliers_network,
|
||||||
logging.info("\n")
|
"tests": chaos_tests_config['NETWORK']}
|
||||||
|
recommendations["outliers_network_recommendations"] = (
|
||||||
|
outliers_network_recommend)
|
||||||
|
|
||||||
logging.info("\n")
|
return [profiling, heatmap, recommendations]
|
||||||
logging.info("Please check data in utilisation.txt for further analysis")
|
|
||||||
|
|||||||
@@ -1,6 +1,5 @@
|
|||||||
import logging
|
import logging
|
||||||
|
|
||||||
import pandas
|
|
||||||
from prometheus_api_client import PrometheusConnect
|
from prometheus_api_client import PrometheusConnect
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
import urllib3
|
import urllib3
|
||||||
@@ -8,6 +7,7 @@ import urllib3
|
|||||||
|
|
||||||
saved_metrics_path = "./utilisation.txt"
|
saved_metrics_path = "./utilisation.txt"
|
||||||
|
|
||||||
|
|
||||||
def convert_data_to_dataframe(data, label):
|
def convert_data_to_dataframe(data, label):
|
||||||
df = pd.DataFrame()
|
df = pd.DataFrame()
|
||||||
df['service'] = [item['metric']['pod'] for item in data]
|
df['service'] = [item['metric']['pod'] for item in data]
|
||||||
@@ -25,21 +25,27 @@ def convert_data(data, service):
|
|||||||
result[pod_name] = value
|
result[pod_name] = value
|
||||||
return result.get(service, '100000000000') # for those pods whose limits are not defined they can take as much resources, there assigning a very high value
|
return result.get(service, '100000000000') # for those pods whose limits are not defined they can take as much resources, there assigning a very high value
|
||||||
|
|
||||||
def save_utilization_to_file(cpu_data, cpu_limits_result, mem_data, mem_limits_result, network_data, filename):
|
|
||||||
df_cpu = convert_data_to_dataframe(cpu_data, "CPU")
|
|
||||||
merged_df = pd.DataFrame(columns=['service','CPU','CPU_LIMITS','MEM','MEM_LIMITS','NETWORK'])
|
|
||||||
services = df_cpu.service.unique()
|
|
||||||
logging.info(services)
|
|
||||||
|
|
||||||
for s in services:
|
def save_utilization_to_file(utilization, filename):
|
||||||
|
merged_df = pd.DataFrame(columns=['namespace', 'service', 'CPU', 'CPU_LIMITS', 'MEM', 'MEM_LIMITS', 'NETWORK'])
|
||||||
new_row_df = pd.DataFrame( {"service": s, "CPU" : convert_data(cpu_data, s),
|
for namespace in utilization:
|
||||||
"CPU_LIMITS" : convert_data(cpu_limits_result, s),
|
# Loading utilization_data[] for namespace
|
||||||
"MEM" : convert_data(mem_data, s), "MEM_LIMITS" : convert_data(mem_limits_result, s),
|
# indexes -- 0 CPU, 1 CPU limits, 2 mem, 3 mem limits, 4 network
|
||||||
"NETWORK" : convert_data(network_data, s)}, index=[0])
|
utilization_data = utilization[namespace]
|
||||||
merged_df = pd.concat([merged_df, new_row_df], ignore_index=True)
|
df_cpu = convert_data_to_dataframe(utilization_data[0], "CPU")
|
||||||
|
services = df_cpu.service.unique()
|
||||||
|
logging.info(f"Services for namespace {namespace}: {services}")
|
||||||
|
|
||||||
|
for s in services:
|
||||||
|
|
||||||
|
new_row_df = pd.DataFrame({
|
||||||
|
"namespace": namespace, "service": s,
|
||||||
|
"CPU": convert_data(utilization_data[0], s),
|
||||||
|
"CPU_LIMITS": convert_data(utilization_data[1], s),
|
||||||
|
"MEM": convert_data(utilization_data[2], s),
|
||||||
|
"MEM_LIMITS": convert_data(utilization_data[3], s),
|
||||||
|
"NETWORK": convert_data(utilization_data[4], s)}, index=[0])
|
||||||
|
merged_df = pd.concat([merged_df, new_row_df], ignore_index=True)
|
||||||
|
|
||||||
# Convert columns to string
|
# Convert columns to string
|
||||||
merged_df['CPU'] = merged_df['CPU'].astype(str)
|
merged_df['CPU'] = merged_df['CPU'].astype(str)
|
||||||
@@ -57,40 +63,50 @@ def save_utilization_to_file(cpu_data, cpu_limits_result, mem_data, mem_limits_r
|
|||||||
|
|
||||||
merged_df.to_csv(filename, sep='\t', index=False)
|
merged_df.to_csv(filename, sep='\t', index=False)
|
||||||
|
|
||||||
def fetch_utilization_from_prometheus(prometheus_endpoint, auth_token, namespace, scrape_duration):
|
|
||||||
|
def fetch_utilization_from_prometheus(prometheus_endpoint, auth_token,
|
||||||
|
namespaces, scrape_duration):
|
||||||
urllib3.disable_warnings()
|
urllib3.disable_warnings()
|
||||||
prometheus = PrometheusConnect(url=prometheus_endpoint, headers={'Authorization':'Bearer {}'.format(auth_token)}, disable_ssl=True)
|
prometheus = PrometheusConnect(url=prometheus_endpoint, headers={
|
||||||
|
'Authorization':'Bearer {}'.format(auth_token)}, disable_ssl=True)
|
||||||
|
|
||||||
# Fetch CPU utilization
|
# Dicts for saving utilisation and queries -- key is namespace
|
||||||
cpu_query = 'sum (rate (container_cpu_usage_seconds_total{image!="", namespace="%s"}[%s])) by (pod) *1000' % (namespace,scrape_duration)
|
utilization = {}
|
||||||
logging.info(cpu_query)
|
queries = {}
|
||||||
cpu_result = prometheus.custom_query(cpu_query)
|
|
||||||
cpu_data = cpu_result
|
logging.info("Fetching utilization...")
|
||||||
|
for namespace in namespaces:
|
||||||
|
|
||||||
cpu_limits_query = '(sum by (pod) (kube_pod_container_resource_limits{resource="cpu", namespace="%s"}))*1000' %(namespace)
|
# Fetch CPU utilization
|
||||||
logging.info(cpu_limits_query)
|
cpu_query = 'sum (rate (container_cpu_usage_seconds_total{image!="", namespace="%s"}[%s])) by (pod) *1000' % (namespace,scrape_duration)
|
||||||
cpu_limits_result = prometheus.custom_query(cpu_limits_query)
|
cpu_result = prometheus.custom_query(cpu_query)
|
||||||
|
|
||||||
|
cpu_limits_query = '(sum by (pod) (kube_pod_container_resource_limits{resource="cpu", namespace="%s"}))*1000' %(namespace)
|
||||||
mem_query = 'sum by (pod) (avg_over_time(container_memory_usage_bytes{image!="", namespace="%s"}[%s]))' % (namespace, scrape_duration)
|
cpu_limits_result = prometheus.custom_query(cpu_limits_query)
|
||||||
logging.info(mem_query)
|
|
||||||
mem_result = prometheus.custom_query(mem_query)
|
mem_query = 'sum by (pod) (avg_over_time(container_memory_usage_bytes{image!="", namespace="%s"}[%s]))' % (namespace, scrape_duration)
|
||||||
mem_data = mem_result
|
mem_result = prometheus.custom_query(mem_query)
|
||||||
|
|
||||||
mem_limits_query = 'sum by (pod) (kube_pod_container_resource_limits{resource="memory", namespace="%s"}) ' %(namespace)
|
mem_limits_query = 'sum by (pod) (kube_pod_container_resource_limits{resource="memory", namespace="%s"}) ' %(namespace)
|
||||||
logging.info(mem_limits_query)
|
mem_limits_result = prometheus.custom_query(mem_limits_query)
|
||||||
mem_limits_result = prometheus.custom_query(mem_limits_query)
|
|
||||||
|
network_query = 'sum by (pod) ((avg_over_time(container_network_transmit_bytes_total{namespace="%s"}[%s])) + \
|
||||||
|
(avg_over_time(container_network_receive_bytes_total{namespace="%s"}[%s])))' % (namespace, scrape_duration, namespace, scrape_duration)
|
||||||
network_query = 'sum by (pod) ((avg_over_time(container_network_transmit_bytes_total{namespace="%s"}[%s])) + \
|
network_result = prometheus.custom_query(network_query)
|
||||||
(avg_over_time(container_network_receive_bytes_total{namespace="%s"}[%s])))' % (namespace, scrape_duration, namespace, scrape_duration)
|
|
||||||
network_result = prometheus.custom_query(network_query)
|
utilization[namespace] = [cpu_result, cpu_limits_result, mem_result, mem_limits_result, network_result]
|
||||||
logging.info(network_query)
|
queries[namespace] = json_queries(cpu_query, cpu_limits_query, mem_query, mem_limits_query, network_query)
|
||||||
network_data = network_result
|
|
||||||
|
save_utilization_to_file(utilization, saved_metrics_path)
|
||||||
|
return saved_metrics_path, queries
|
||||||
save_utilization_to_file(cpu_data, cpu_limits_result, mem_data, mem_limits_result, network_data, saved_metrics_path)
|
|
||||||
return saved_metrics_path
|
|
||||||
|
|
||||||
|
|
||||||
|
def json_queries(cpu_query, cpu_limits_query, mem_query, mem_limits_query, network_query):
|
||||||
|
queries = {
|
||||||
|
"cpu_query": cpu_query,
|
||||||
|
"cpu_limit_query": cpu_limits_query,
|
||||||
|
"memory_query": mem_query,
|
||||||
|
"memory_limit_query": mem_limits_query,
|
||||||
|
"network_query": network_query
|
||||||
|
}
|
||||||
|
return queries
|
||||||
|
|||||||
@@ -2,11 +2,14 @@ import dataclasses
|
|||||||
import json
|
import json
|
||||||
import logging
|
import logging
|
||||||
from os.path import abspath
|
from os.path import abspath
|
||||||
from typing import List, Dict
|
from typing import List, Dict, Any
|
||||||
import time
|
import time
|
||||||
|
|
||||||
from arcaflow_plugin_sdk import schema, serialization, jsonschema
|
from arcaflow_plugin_sdk import schema, serialization, jsonschema
|
||||||
from arcaflow_plugin_kill_pod import kill_pods, wait_for_pods
|
from arcaflow_plugin_kill_pod import kill_pods, wait_for_pods
|
||||||
|
from krkn_lib.k8s import KrknKubernetes
|
||||||
|
from krkn_lib.k8s.pods_monitor_pool import PodsMonitorPool
|
||||||
|
|
||||||
import kraken.plugins.node_scenarios.vmware_plugin as vmware_plugin
|
import kraken.plugins.node_scenarios.vmware_plugin as vmware_plugin
|
||||||
import kraken.plugins.node_scenarios.ibmcloud_plugin as ibmcloud_plugin
|
import kraken.plugins.node_scenarios.ibmcloud_plugin as ibmcloud_plugin
|
||||||
from kraken.plugins.run_python_plugin import run_python_file
|
from kraken.plugins.run_python_plugin import run_python_file
|
||||||
@@ -47,11 +50,14 @@ class Plugins:
|
|||||||
)
|
)
|
||||||
self.steps_by_id[step.schema.id] = step
|
self.steps_by_id[step.schema.id] = step
|
||||||
|
|
||||||
|
def unserialize_scenario(self, file: str) -> Any:
|
||||||
|
return serialization.load_from_file(abspath(file))
|
||||||
|
|
||||||
def run(self, file: str, kubeconfig_path: str, kraken_config: str):
|
def run(self, file: str, kubeconfig_path: str, kraken_config: str):
|
||||||
"""
|
"""
|
||||||
Run executes a series of steps
|
Run executes a series of steps
|
||||||
"""
|
"""
|
||||||
data = serialization.load_from_file(abspath(file))
|
data = self.unserialize_scenario(abspath(file))
|
||||||
if not isinstance(data, list):
|
if not isinstance(data, list):
|
||||||
raise Exception(
|
raise Exception(
|
||||||
"Invalid scenario configuration file: {} expected list, found {}".format(file, type(data).__name__)
|
"Invalid scenario configuration file: {} expected list, found {}".format(file, type(data).__name__)
|
||||||
@@ -213,6 +219,12 @@ PLUGINS = Plugins(
|
|||||||
"error"
|
"error"
|
||||||
]
|
]
|
||||||
),
|
),
|
||||||
|
PluginStep(
|
||||||
|
network_chaos,
|
||||||
|
[
|
||||||
|
"error"
|
||||||
|
]
|
||||||
|
),
|
||||||
PluginStep(
|
PluginStep(
|
||||||
pod_outage,
|
pod_outage,
|
||||||
[
|
[
|
||||||
@@ -235,7 +247,15 @@ PLUGINS = Plugins(
|
|||||||
)
|
)
|
||||||
|
|
||||||
|
|
||||||
def run(scenarios: List[str], kubeconfig_path: str, kraken_config: str, failed_post_scenarios: List[str], wait_duration: int, telemetry: KrknTelemetryKubernetes) -> (List[str], list[ScenarioTelemetry]):
|
def run(scenarios: List[str],
|
||||||
|
kubeconfig_path: str,
|
||||||
|
kraken_config: str,
|
||||||
|
failed_post_scenarios: List[str],
|
||||||
|
wait_duration: int,
|
||||||
|
telemetry: KrknTelemetryKubernetes,
|
||||||
|
kubecli: KrknKubernetes
|
||||||
|
) -> (List[str], list[ScenarioTelemetry]):
|
||||||
|
|
||||||
scenario_telemetries: list[ScenarioTelemetry] = []
|
scenario_telemetries: list[ScenarioTelemetry] = []
|
||||||
for scenario in scenarios:
|
for scenario in scenarios:
|
||||||
scenario_telemetry = ScenarioTelemetry()
|
scenario_telemetry = ScenarioTelemetry()
|
||||||
@@ -243,10 +263,21 @@ def run(scenarios: List[str], kubeconfig_path: str, kraken_config: str, failed_p
|
|||||||
scenario_telemetry.startTimeStamp = time.time()
|
scenario_telemetry.startTimeStamp = time.time()
|
||||||
telemetry.set_parameters_base64(scenario_telemetry, scenario)
|
telemetry.set_parameters_base64(scenario_telemetry, scenario)
|
||||||
logging.info('scenario ' + str(scenario))
|
logging.info('scenario ' + str(scenario))
|
||||||
|
pool = PodsMonitorPool(kubecli)
|
||||||
|
kill_scenarios = [kill_scenario for kill_scenario in PLUGINS.unserialize_scenario(scenario) if kill_scenario["id"] == "kill-pods"]
|
||||||
|
|
||||||
try:
|
try:
|
||||||
|
start_monitoring(pool, kill_scenarios)
|
||||||
PLUGINS.run(scenario, kubeconfig_path, kraken_config)
|
PLUGINS.run(scenario, kubeconfig_path, kraken_config)
|
||||||
|
result = pool.join()
|
||||||
|
scenario_telemetry.affected_pods = result
|
||||||
|
if result.error:
|
||||||
|
raise Exception(f"unrecovered pods: {result.error}")
|
||||||
|
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
|
logging.error(f"scenario exception: {str(e)}")
|
||||||
scenario_telemetry.exitStatus = 1
|
scenario_telemetry.exitStatus = 1
|
||||||
|
pool.cancel()
|
||||||
failed_post_scenarios.append(scenario)
|
failed_post_scenarios.append(scenario)
|
||||||
log_exception(scenario)
|
log_exception(scenario)
|
||||||
else:
|
else:
|
||||||
@@ -257,3 +288,31 @@ def run(scenarios: List[str], kubeconfig_path: str, kraken_config: str, failed_p
|
|||||||
scenario_telemetry.endTimeStamp = time.time()
|
scenario_telemetry.endTimeStamp = time.time()
|
||||||
|
|
||||||
return failed_post_scenarios, scenario_telemetries
|
return failed_post_scenarios, scenario_telemetries
|
||||||
|
|
||||||
|
|
||||||
|
def start_monitoring(pool: PodsMonitorPool, scenarios: list[Any]):
|
||||||
|
for kill_scenario in scenarios:
|
||||||
|
recovery_time = kill_scenario["config"]["krkn_pod_recovery_time"]
|
||||||
|
if ("namespace_pattern" in kill_scenario["config"] and
|
||||||
|
"label_selector" in kill_scenario["config"]):
|
||||||
|
namespace_pattern = kill_scenario["config"]["namespace_pattern"]
|
||||||
|
label_selector = kill_scenario["config"]["label_selector"]
|
||||||
|
pool.select_and_monitor_by_namespace_pattern_and_label(
|
||||||
|
namespace_pattern=namespace_pattern,
|
||||||
|
label_selector=label_selector,
|
||||||
|
max_timeout=recovery_time)
|
||||||
|
logging.info(
|
||||||
|
f"waiting {recovery_time} seconds for pod recovery, "
|
||||||
|
f"pod label selector: {label_selector} namespace pattern: {namespace_pattern}")
|
||||||
|
|
||||||
|
elif ("namespace_pattern" in kill_scenario["config"] and
|
||||||
|
"name_pattern" in kill_scenario["config"]):
|
||||||
|
namespace_pattern = kill_scenario["config"]["namespace_pattern"]
|
||||||
|
name_pattern = kill_scenario["config"]["name_pattern"]
|
||||||
|
pool.select_and_monitor_by_name_pattern_and_namespace_pattern(pod_name_pattern=name_pattern,
|
||||||
|
namespace_pattern=namespace_pattern,
|
||||||
|
max_timeout=recovery_time)
|
||||||
|
logging.info(f"waiting {recovery_time} seconds for pod recovery, "
|
||||||
|
f"pod name pattern: {name_pattern} namespace pattern: {namespace_pattern}")
|
||||||
|
else:
|
||||||
|
raise Exception(f"impossible to determine monitor parameters, check {kill_scenario} configuration")
|
||||||
|
|||||||
@@ -62,7 +62,7 @@ class NetworkScenarioConfig:
|
|||||||
typing.Optional[int],
|
typing.Optional[int],
|
||||||
validation.min(1)
|
validation.min(1)
|
||||||
] = field(
|
] = field(
|
||||||
default=300,
|
default=30,
|
||||||
metadata={
|
metadata={
|
||||||
"name": "Wait Duration",
|
"name": "Wait Duration",
|
||||||
"description":
|
"description":
|
||||||
@@ -864,7 +864,7 @@ def network_chaos(cfg: NetworkScenarioConfig) -> typing.Tuple[
|
|||||||
)
|
)
|
||||||
logging.info("Waiting for parallel job to finish")
|
logging.info("Waiting for parallel job to finish")
|
||||||
start_time = int(time.time())
|
start_time = int(time.time())
|
||||||
wait_for_job(batch_cli, job_list[:], cfg.wait_duration)
|
wait_for_job(batch_cli, job_list[:], cfg.test_duration+100)
|
||||||
end_time = int(time.time())
|
end_time = int(time.time())
|
||||||
if publish:
|
if publish:
|
||||||
cerberus.publish_kraken_status(
|
cerberus.publish_kraken_status(
|
||||||
@@ -893,7 +893,7 @@ def network_chaos(cfg: NetworkScenarioConfig) -> typing.Tuple[
|
|||||||
)
|
)
|
||||||
logging.info("Waiting for serial job to finish")
|
logging.info("Waiting for serial job to finish")
|
||||||
start_time = int(time.time())
|
start_time = int(time.time())
|
||||||
wait_for_job(batch_cli, job_list[:], cfg.wait_duration)
|
wait_for_job(batch_cli, job_list[:], cfg.test_duration+100)
|
||||||
logging.info("Deleting jobs")
|
logging.info("Deleting jobs")
|
||||||
delete_jobs(cli, batch_cli, job_list[:])
|
delete_jobs(cli, batch_cli, job_list[:])
|
||||||
job_list = []
|
job_list = []
|
||||||
|
|||||||
@@ -1,9 +1,13 @@
|
|||||||
import logging
|
import logging
|
||||||
import time
|
import time
|
||||||
|
from typing import Any
|
||||||
|
|
||||||
import yaml
|
import yaml
|
||||||
import sys
|
import sys
|
||||||
import random
|
import random
|
||||||
import arcaflow_plugin_kill_pod
|
import arcaflow_plugin_kill_pod
|
||||||
|
from krkn_lib.k8s.pods_monitor_pool import PodsMonitorPool
|
||||||
|
|
||||||
import kraken.cerberus.setup as cerberus
|
import kraken.cerberus.setup as cerberus
|
||||||
import kraken.post_actions.actions as post_actions
|
import kraken.post_actions.actions as post_actions
|
||||||
from krkn_lib.k8s import KrknKubernetes
|
from krkn_lib.k8s import KrknKubernetes
|
||||||
@@ -79,6 +83,7 @@ def container_run(kubeconfig_path,
|
|||||||
|
|
||||||
failed_scenarios = []
|
failed_scenarios = []
|
||||||
scenario_telemetries: list[ScenarioTelemetry] = []
|
scenario_telemetries: list[ScenarioTelemetry] = []
|
||||||
|
pool = PodsMonitorPool(kubecli)
|
||||||
|
|
||||||
for container_scenario_config in scenarios_list:
|
for container_scenario_config in scenarios_list:
|
||||||
scenario_telemetry = ScenarioTelemetry()
|
scenario_telemetry = ScenarioTelemetry()
|
||||||
@@ -91,23 +96,17 @@ def container_run(kubeconfig_path,
|
|||||||
pre_action_output = ""
|
pre_action_output = ""
|
||||||
with open(container_scenario_config[0], "r") as f:
|
with open(container_scenario_config[0], "r") as f:
|
||||||
cont_scenario_config = yaml.full_load(f)
|
cont_scenario_config = yaml.full_load(f)
|
||||||
|
start_monitoring(kill_scenarios=cont_scenario_config["scenarios"], pool=pool)
|
||||||
for cont_scenario in cont_scenario_config["scenarios"]:
|
for cont_scenario in cont_scenario_config["scenarios"]:
|
||||||
# capture start time
|
# capture start time
|
||||||
start_time = int(time.time())
|
start_time = int(time.time())
|
||||||
try:
|
try:
|
||||||
killed_containers = container_killing_in_pod(cont_scenario, kubecli)
|
killed_containers = container_killing_in_pod(cont_scenario, kubecli)
|
||||||
if len(container_scenario_config) > 1:
|
logging.info(f"killed containers: {str(killed_containers)}")
|
||||||
failed_post_scenarios = post_actions.check_recovery(
|
result = pool.join()
|
||||||
kubeconfig_path,
|
if result.error:
|
||||||
container_scenario_config,
|
raise Exception(f"pods failed to recovery: {result.error}")
|
||||||
failed_post_scenarios,
|
scenario_telemetry.affected_pods = result
|
||||||
pre_action_output
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
failed_post_scenarios = check_failed_containers(
|
|
||||||
killed_containers, cont_scenario.get("retry_wait", 120), kubecli
|
|
||||||
)
|
|
||||||
|
|
||||||
logging.info("Waiting for the specified duration: %s" % (wait_duration))
|
logging.info("Waiting for the specified duration: %s" % (wait_duration))
|
||||||
time.sleep(wait_duration)
|
time.sleep(wait_duration)
|
||||||
|
|
||||||
@@ -117,6 +116,7 @@ def container_run(kubeconfig_path,
|
|||||||
# publish cerberus status
|
# publish cerberus status
|
||||||
cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time)
|
cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time)
|
||||||
except (RuntimeError, Exception):
|
except (RuntimeError, Exception):
|
||||||
|
pool.cancel()
|
||||||
failed_scenarios.append(container_scenario_config[0])
|
failed_scenarios.append(container_scenario_config[0])
|
||||||
log_exception(container_scenario_config[0])
|
log_exception(container_scenario_config[0])
|
||||||
scenario_telemetry.exitStatus = 1
|
scenario_telemetry.exitStatus = 1
|
||||||
@@ -129,6 +129,16 @@ def container_run(kubeconfig_path,
|
|||||||
|
|
||||||
return failed_scenarios, scenario_telemetries
|
return failed_scenarios, scenario_telemetries
|
||||||
|
|
||||||
|
def start_monitoring(kill_scenarios: list[Any], pool: PodsMonitorPool):
|
||||||
|
for kill_scenario in kill_scenarios:
|
||||||
|
namespace_pattern = f"^{kill_scenario['namespace']}$"
|
||||||
|
label_selector = kill_scenario["label_selector"]
|
||||||
|
recovery_time = kill_scenario["expected_recovery_time"]
|
||||||
|
pool.select_and_monitor_by_namespace_pattern_and_label(
|
||||||
|
namespace_pattern=namespace_pattern,
|
||||||
|
label_selector=label_selector,
|
||||||
|
max_timeout=recovery_time)
|
||||||
|
|
||||||
|
|
||||||
def container_killing_in_pod(cont_scenario, kubecli: KrknKubernetes):
|
def container_killing_in_pod(cont_scenario, kubecli: KrknKubernetes):
|
||||||
scenario_name = get_yaml_item_value(cont_scenario, "name", "")
|
scenario_name = get_yaml_item_value(cont_scenario, "name", "")
|
||||||
|
|||||||
@@ -1,10 +1,13 @@
|
|||||||
import datetime
|
import datetime
|
||||||
import os.path
|
import os.path
|
||||||
|
from typing import Optional
|
||||||
|
|
||||||
import urllib3
|
import urllib3
|
||||||
import logging
|
import logging
|
||||||
import sys
|
import sys
|
||||||
|
|
||||||
import yaml
|
import yaml
|
||||||
|
from krkn_lib.models.krkn import ChaosRunAlertSummary, ChaosRunAlert
|
||||||
from krkn_lib.prometheus.krkn_prometheus import KrknPrometheus
|
from krkn_lib.prometheus.krkn_prometheus import KrknPrometheus
|
||||||
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
||||||
def alerts(prom_cli: KrknPrometheus, start_time, end_time, alert_profile):
|
def alerts(prom_cli: KrknPrometheus, start_time, end_time, alert_profile):
|
||||||
@@ -27,4 +30,59 @@ def alerts(prom_cli: KrknPrometheus, start_time, end_time, alert_profile):
|
|||||||
|
|
||||||
prom_cli.process_alert(alert,
|
prom_cli.process_alert(alert,
|
||||||
datetime.datetime.fromtimestamp(start_time),
|
datetime.datetime.fromtimestamp(start_time),
|
||||||
datetime.datetime.fromtimestamp(end_time))
|
datetime.datetime.fromtimestamp(end_time))
|
||||||
|
|
||||||
|
|
||||||
|
def critical_alerts(prom_cli: KrknPrometheus,
|
||||||
|
summary: ChaosRunAlertSummary,
|
||||||
|
run_id,
|
||||||
|
scenario,
|
||||||
|
start_time,
|
||||||
|
end_time):
|
||||||
|
summary.scenario = scenario
|
||||||
|
summary.run_id = run_id
|
||||||
|
query = r"""ALERTS{severity="critical"}"""
|
||||||
|
logging.info("Checking for critical alerts firing post chaos")
|
||||||
|
|
||||||
|
during_critical_alerts = prom_cli.process_prom_query_in_range(
|
||||||
|
query,
|
||||||
|
start_time=datetime.datetime.fromtimestamp(start_time),
|
||||||
|
end_time=end_time
|
||||||
|
|
||||||
|
)
|
||||||
|
|
||||||
|
for alert in during_critical_alerts:
|
||||||
|
if "metric" in alert:
|
||||||
|
alertname = alert["metric"]["alertname"] if "alertname" in alert["metric"] else "none"
|
||||||
|
alertstate = alert["metric"]["alertstate"] if "alertstate" in alert["metric"] else "none"
|
||||||
|
namespace = alert["metric"]["namespace"] if "namespace" in alert["metric"] else "none"
|
||||||
|
severity = alert["metric"]["severity"] if "severity" in alert["metric"] else "none"
|
||||||
|
alert = ChaosRunAlert(alertname, alertstate, namespace, severity)
|
||||||
|
summary.chaos_alerts.append(alert)
|
||||||
|
|
||||||
|
|
||||||
|
post_critical_alerts = prom_cli.process_query(
|
||||||
|
query
|
||||||
|
)
|
||||||
|
|
||||||
|
for alert in post_critical_alerts:
|
||||||
|
if "metric" in alert:
|
||||||
|
alertname = alert["metric"]["alertname"] if "alertname" in alert["metric"] else "none"
|
||||||
|
alertstate = alert["metric"]["alertstate"] if "alertstate" in alert["metric"] else "none"
|
||||||
|
namespace = alert["metric"]["namespace"] if "namespace" in alert["metric"] else "none"
|
||||||
|
severity = alert["metric"]["severity"] if "severity" in alert["metric"] else "none"
|
||||||
|
alert = ChaosRunAlert(alertname, alertstate, namespace, severity)
|
||||||
|
summary.post_chaos_alerts.append(alert)
|
||||||
|
|
||||||
|
during_critical_alerts_count = len(during_critical_alerts)
|
||||||
|
post_critical_alerts_count = len(post_critical_alerts)
|
||||||
|
firing_alerts = False
|
||||||
|
|
||||||
|
if during_critical_alerts_count > 0:
|
||||||
|
firing_alerts = True
|
||||||
|
|
||||||
|
if post_critical_alerts_count > 0:
|
||||||
|
firing_alerts = True
|
||||||
|
|
||||||
|
if not firing_alerts:
|
||||||
|
logging.info("No critical alerts are firing!!")
|
||||||
|
|||||||
@@ -1,40 +1,41 @@
|
|||||||
PyYAML>=5.1
|
|
||||||
aliyun-python-sdk-core==2.13.36
|
aliyun-python-sdk-core==2.13.36
|
||||||
aliyun-python-sdk-ecs==4.24.25
|
aliyun-python-sdk-ecs==4.24.25
|
||||||
arcaflow==0.9.0
|
arcaflow==0.9.0
|
||||||
arcaflow-plugin-sdk==0.10.0
|
arcaflow-plugin-sdk==0.10.0
|
||||||
azure-identity
|
|
||||||
azure-keyvault
|
|
||||||
azure-mgmt-compute
|
|
||||||
boto3==1.28.61
|
boto3==1.28.61
|
||||||
coverage
|
azure-identity==1.15.0
|
||||||
datetime
|
azure-keyvault==4.2.0
|
||||||
docker
|
azure-mgmt-compute==30.5.0
|
||||||
docker-compose
|
|
||||||
git+https://github.com/krkn-chaos/arcaflow-plugin-kill-pod.git
|
|
||||||
git+https://github.com/vmware/vsphere-automation-sdk-python.git@v8.0.0.0
|
|
||||||
gitpython
|
|
||||||
google-api-python-client
|
|
||||||
ibm_cloud_sdk_core
|
|
||||||
ibm_vpc
|
|
||||||
itsdangerous==2.0.1
|
itsdangerous==2.0.1
|
||||||
|
coverage==7.4.1
|
||||||
|
datetime==5.4
|
||||||
|
docker==7.0.0
|
||||||
|
gitpython==3.1.41
|
||||||
|
google-api-python-client==2.116.0
|
||||||
|
ibm_cloud_sdk_core==3.18.0
|
||||||
|
ibm_vpc==0.20.0
|
||||||
jinja2==3.1.3
|
jinja2==3.1.3
|
||||||
krkn-lib >= 1.4.6
|
krkn-lib==2.1.2
|
||||||
kubernetes
|
lxml==5.1.0
|
||||||
lxml >= 4.3.0
|
kubernetes==26.1.0
|
||||||
oauth2client>=4.1.3
|
oauth2client==4.1.3
|
||||||
openshift-client
|
pandas==2.2.0
|
||||||
paramiko
|
openshift-client==1.0.21
|
||||||
podman-compose
|
paramiko==3.4.0
|
||||||
pyVmomi >= 6.7
|
podman-compose==1.0.6
|
||||||
pyfiglet
|
pyVmomi==8.0.2.0.1
|
||||||
pytest
|
pyfiglet==1.0.2
|
||||||
python-ipmi
|
pytest==8.0.0
|
||||||
python-openstackclient
|
python-ipmi==0.5.4
|
||||||
requests
|
python-openstackclient==6.5.0
|
||||||
service_identity
|
requests==2.31.0
|
||||||
|
service_identity==24.1.0
|
||||||
|
PyYAML==6.0
|
||||||
setuptools==65.5.1
|
setuptools==65.5.1
|
||||||
werkzeug==3.0.1
|
werkzeug==3.0.1
|
||||||
wheel>=0.38.0
|
wheel==0.42.0
|
||||||
zope.interface==5.4.0
|
zope.interface==5.4.0
|
||||||
pandas>=2.2.0
|
|
||||||
|
git+https://github.com/krkn-chaos/arcaflow-plugin-kill-pod.git
|
||||||
|
git+https://github.com/vmware/vsphere-automation-sdk-python.git@v8.0.0.0
|
||||||
|
cryptography>=42.0.4 # not directly required, pinned by Snyk to avoid a vulnerability
|
||||||
|
|||||||
121
run_kraken.py
121
run_kraken.py
@@ -9,6 +9,8 @@ import optparse
|
|||||||
import pyfiglet
|
import pyfiglet
|
||||||
import uuid
|
import uuid
|
||||||
import time
|
import time
|
||||||
|
|
||||||
|
from krkn_lib.models.krkn import ChaosRunOutput, ChaosRunAlertSummary
|
||||||
from krkn_lib.prometheus.krkn_prometheus import KrknPrometheus
|
from krkn_lib.prometheus.krkn_prometheus import KrknPrometheus
|
||||||
import kraken.time_actions.common_time_functions as time_actions
|
import kraken.time_actions.common_time_functions as time_actions
|
||||||
import kraken.performance_dashboards.setup as performance_dashboards
|
import kraken.performance_dashboards.setup as performance_dashboards
|
||||||
@@ -27,6 +29,7 @@ import server as server
|
|||||||
from kraken import plugins
|
from kraken import plugins
|
||||||
from krkn_lib.k8s import KrknKubernetes
|
from krkn_lib.k8s import KrknKubernetes
|
||||||
from krkn_lib.ocp import KrknOpenshift
|
from krkn_lib.ocp import KrknOpenshift
|
||||||
|
from krkn_lib.telemetry.elastic import KrknElastic
|
||||||
from krkn_lib.telemetry.k8s import KrknTelemetryKubernetes
|
from krkn_lib.telemetry.k8s import KrknTelemetryKubernetes
|
||||||
from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift
|
from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift
|
||||||
from krkn_lib.models.telemetry import ChaosRunTelemetry
|
from krkn_lib.models.telemetry import ChaosRunTelemetry
|
||||||
@@ -94,6 +97,9 @@ def main(cfg):
|
|||||||
config["performance_monitoring"], "check_critical_alerts", False
|
config["performance_monitoring"], "check_critical_alerts", False
|
||||||
)
|
)
|
||||||
telemetry_api_url = config["telemetry"].get("api_url")
|
telemetry_api_url = config["telemetry"].get("api_url")
|
||||||
|
elastic_config = get_yaml_item_value(config,"elastic",{})
|
||||||
|
elastic_url = get_yaml_item_value(elastic_config,"elastic_url","")
|
||||||
|
elastic_index = get_yaml_item_value(elastic_config,"elastic_index","")
|
||||||
|
|
||||||
# Initialize clients
|
# Initialize clients
|
||||||
if (not os.path.isfile(kubeconfig_path) and
|
if (not os.path.isfile(kubeconfig_path) and
|
||||||
@@ -129,8 +135,6 @@ def main(cfg):
|
|||||||
except:
|
except:
|
||||||
kubecli.initialize_clients(None)
|
kubecli.initialize_clients(None)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
# find node kraken might be running on
|
# find node kraken might be running on
|
||||||
kubecli.find_kraken_node()
|
kubecli.find_kraken_node()
|
||||||
|
|
||||||
@@ -156,12 +160,22 @@ def main(cfg):
|
|||||||
# Cluster info
|
# Cluster info
|
||||||
logging.info("Fetching cluster info")
|
logging.info("Fetching cluster info")
|
||||||
cv = ""
|
cv = ""
|
||||||
if config["kraken"]["distribution"] == "openshift":
|
if distribution == "openshift":
|
||||||
cv = ocpcli.get_clusterversion_string()
|
cv = ocpcli.get_clusterversion_string()
|
||||||
if prometheus_url is None:
|
if prometheus_url is None:
|
||||||
connection_data = ocpcli.get_prometheus_api_connection_data()
|
try:
|
||||||
prometheus_url = connection_data.endpoint
|
connection_data = ocpcli.get_prometheus_api_connection_data()
|
||||||
prometheus_bearer_token = connection_data.token
|
if connection_data:
|
||||||
|
prometheus_url = connection_data.endpoint
|
||||||
|
prometheus_bearer_token = connection_data.token
|
||||||
|
else:
|
||||||
|
# If can't make a connection, set alerts to false
|
||||||
|
enable_alerts = False
|
||||||
|
critical_alerts = False
|
||||||
|
except Exception:
|
||||||
|
logging.error("invalid distribution selected, running openshift scenarios against kubernetes cluster."
|
||||||
|
"Please set 'kubernetes' in config.yaml krkn.platform and try again")
|
||||||
|
sys.exit(1)
|
||||||
if cv != "":
|
if cv != "":
|
||||||
logging.info(cv)
|
logging.info(cv)
|
||||||
else:
|
else:
|
||||||
@@ -170,9 +184,9 @@ def main(cfg):
|
|||||||
# KrknTelemetry init
|
# KrknTelemetry init
|
||||||
telemetry_k8s = KrknTelemetryKubernetes(safe_logger, kubecli)
|
telemetry_k8s = KrknTelemetryKubernetes(safe_logger, kubecli)
|
||||||
telemetry_ocp = KrknTelemetryOpenshift(safe_logger, ocpcli)
|
telemetry_ocp = KrknTelemetryOpenshift(safe_logger, ocpcli)
|
||||||
|
telemetry_elastic = KrknElastic(safe_logger,elastic_url)
|
||||||
|
summary = ChaosRunAlertSummary()
|
||||||
if enable_alerts:
|
if enable_alerts or check_critical_alerts:
|
||||||
prometheus = KrknPrometheus(prometheus_url, prometheus_bearer_token)
|
prometheus = KrknPrometheus(prometheus_url, prometheus_bearer_token)
|
||||||
|
|
||||||
logging.info("Server URL: %s" % kubecli.get_host())
|
logging.info("Server URL: %s" % kubecli.get_host())
|
||||||
@@ -203,7 +217,8 @@ def main(cfg):
|
|||||||
|
|
||||||
# Capture the start time
|
# Capture the start time
|
||||||
start_time = int(time.time())
|
start_time = int(time.time())
|
||||||
|
post_critical_alerts = 0
|
||||||
|
chaos_output = ChaosRunOutput()
|
||||||
chaos_telemetry = ChaosRunTelemetry()
|
chaos_telemetry = ChaosRunTelemetry()
|
||||||
chaos_telemetry.run_uuid = run_uuid
|
chaos_telemetry.run_uuid = run_uuid
|
||||||
# Loop to run the chaos starts here
|
# Loop to run the chaos starts here
|
||||||
@@ -249,7 +264,8 @@ def main(cfg):
|
|||||||
kraken_config,
|
kraken_config,
|
||||||
failed_post_scenarios,
|
failed_post_scenarios,
|
||||||
wait_duration,
|
wait_duration,
|
||||||
telemetry_k8s
|
telemetry_k8s,
|
||||||
|
kubecli
|
||||||
)
|
)
|
||||||
chaos_telemetry.scenarios.extend(scenario_telemetries)
|
chaos_telemetry.scenarios.extend(scenario_telemetries)
|
||||||
# krkn_lib
|
# krkn_lib
|
||||||
@@ -317,7 +333,7 @@ def main(cfg):
|
|||||||
elif scenario_type == "application_outages":
|
elif scenario_type == "application_outages":
|
||||||
logging.info("Injecting application outage")
|
logging.info("Injecting application outage")
|
||||||
failed_post_scenarios, scenario_telemetries = application_outage.run(
|
failed_post_scenarios, scenario_telemetries = application_outage.run(
|
||||||
scenarios_list, config, wait_duration, telemetry_k8s)
|
scenarios_list, config, wait_duration, kubecli, telemetry_k8s)
|
||||||
chaos_telemetry.scenarios.extend(scenario_telemetries)
|
chaos_telemetry.scenarios.extend(scenario_telemetries)
|
||||||
|
|
||||||
# PVC scenarios
|
# PVC scenarios
|
||||||
@@ -334,25 +350,21 @@ def main(cfg):
|
|||||||
failed_post_scenarios, scenario_telemetries = network_chaos.run(scenarios_list, config, wait_duration, kubecli, telemetry_k8s)
|
failed_post_scenarios, scenario_telemetries = network_chaos.run(scenarios_list, config, wait_duration, kubecli, telemetry_k8s)
|
||||||
|
|
||||||
# Check for critical alerts when enabled
|
# Check for critical alerts when enabled
|
||||||
if enable_alerts and check_critical_alerts :
|
post_critical_alerts = 0
|
||||||
logging.info("Checking for critical alerts firing post choas")
|
if check_critical_alerts:
|
||||||
|
prometheus_plugin.critical_alerts(prometheus,
|
||||||
|
summary,
|
||||||
|
run_uuid,
|
||||||
|
scenario_type,
|
||||||
|
start_time,
|
||||||
|
datetime.datetime.now())
|
||||||
|
|
||||||
##PROM
|
chaos_output.critical_alerts = summary
|
||||||
query = r"""ALERTS{severity="critical"}"""
|
post_critical_alerts = len(summary.post_chaos_alerts)
|
||||||
end_time = datetime.datetime.now()
|
if post_critical_alerts > 0:
|
||||||
critical_alerts = prometheus.process_prom_query_in_range(
|
logging.error("Post chaos critical alerts firing please check, exiting")
|
||||||
query,
|
break
|
||||||
start_time = datetime.datetime.fromtimestamp(start_time),
|
|
||||||
end_time = end_time
|
|
||||||
|
|
||||||
)
|
|
||||||
critical_alerts_count = len(critical_alerts)
|
|
||||||
if critical_alerts_count > 0:
|
|
||||||
logging.error("Critical alerts are firing: %s", critical_alerts)
|
|
||||||
logging.error("Please check, exiting")
|
|
||||||
sys.exit(1)
|
|
||||||
else:
|
|
||||||
logging.info("No critical alerts are firing!!")
|
|
||||||
|
|
||||||
iteration += 1
|
iteration += 1
|
||||||
logging.info("")
|
logging.info("")
|
||||||
@@ -366,27 +378,52 @@ def main(cfg):
|
|||||||
# if platform is openshift will be collected
|
# if platform is openshift will be collected
|
||||||
# Cloud platform and network plugins metadata
|
# Cloud platform and network plugins metadata
|
||||||
# through OCP specific APIs
|
# through OCP specific APIs
|
||||||
if config["kraken"]["distribution"] == "openshift":
|
if distribution == "openshift":
|
||||||
telemetry_ocp.collect_cluster_metadata(chaos_telemetry)
|
telemetry_ocp.collect_cluster_metadata(chaos_telemetry)
|
||||||
else:
|
else:
|
||||||
telemetry_k8s.collect_cluster_metadata(chaos_telemetry)
|
telemetry_k8s.collect_cluster_metadata(chaos_telemetry)
|
||||||
|
|
||||||
decoded_chaos_run_telemetry = ChaosRunTelemetry(json.loads(chaos_telemetry.to_json()))
|
decoded_chaos_run_telemetry = ChaosRunTelemetry(json.loads(chaos_telemetry.to_json()))
|
||||||
logging.info(f"Telemetry data:\n{decoded_chaos_run_telemetry.to_json()}")
|
chaos_output.telemetry = decoded_chaos_run_telemetry
|
||||||
|
logging.info(f"Chaos data:\n{chaos_output.to_json()}")
|
||||||
|
telemetry_elastic.upload_data_to_elasticsearch(decoded_chaos_run_telemetry.to_json(), elastic_index)
|
||||||
if config["telemetry"]["enabled"]:
|
if config["telemetry"]["enabled"]:
|
||||||
logging.info(f"telemetry data will be stored on s3 bucket folder: {telemetry_api_url}/download/{telemetry_request_id}")
|
logging.info(f'telemetry data will be stored on s3 bucket folder: {telemetry_api_url}/files/'
|
||||||
|
f'{(config["telemetry"]["telemetry_group"] if config["telemetry"]["telemetry_group"] else "default")}/'
|
||||||
|
f'{telemetry_request_id}')
|
||||||
logging.info(f"telemetry upload log: {safe_logger.log_file_name}")
|
logging.info(f"telemetry upload log: {safe_logger.log_file_name}")
|
||||||
try:
|
try:
|
||||||
telemetry_k8s.send_telemetry(config["telemetry"], telemetry_request_id, chaos_telemetry)
|
telemetry_k8s.send_telemetry(config["telemetry"], telemetry_request_id, chaos_telemetry)
|
||||||
telemetry_k8s.put_cluster_events(telemetry_request_id, config["telemetry"], start_time, end_time)
|
telemetry_k8s.put_cluster_events(telemetry_request_id, config["telemetry"], start_time, end_time)
|
||||||
|
telemetry_k8s.put_critical_alerts(telemetry_request_id, config["telemetry"], summary)
|
||||||
# prometheus data collection is available only on Openshift
|
# prometheus data collection is available only on Openshift
|
||||||
if config["telemetry"]["prometheus_backup"] and config["kraken"]["distribution"] == "openshift":
|
if config["telemetry"]["prometheus_backup"]:
|
||||||
safe_logger.info("archives download started:")
|
prometheus_archive_files = ''
|
||||||
prometheus_archive_files = telemetry_ocp.get_ocp_prometheus_data(config["telemetry"], telemetry_request_id)
|
if distribution == "openshift" :
|
||||||
safe_logger.info("archives upload started:")
|
prometheus_archive_files = telemetry_ocp.get_ocp_prometheus_data(config["telemetry"], telemetry_request_id)
|
||||||
telemetry_k8s.put_prometheus_data(config["telemetry"], prometheus_archive_files, telemetry_request_id)
|
else:
|
||||||
if config["telemetry"]["logs_backup"]:
|
if (config["telemetry"]["prometheus_namespace"] and
|
||||||
|
config["telemetry"]["prometheus_pod_name"] and
|
||||||
|
config["telemetry"]["prometheus_container_name"]):
|
||||||
|
try:
|
||||||
|
prometheus_archive_files = telemetry_k8s.get_prometheus_pod_data(
|
||||||
|
config["telemetry"],
|
||||||
|
telemetry_request_id,
|
||||||
|
config["telemetry"]["prometheus_pod_name"],
|
||||||
|
config["telemetry"]["prometheus_container_name"],
|
||||||
|
config["telemetry"]["prometheus_namespace"]
|
||||||
|
)
|
||||||
|
except Exception as e:
|
||||||
|
logging.error(f"failed to get prometheus backup with exception {str(e)}")
|
||||||
|
else:
|
||||||
|
logging.warning("impossible to backup prometheus,"
|
||||||
|
"check if config contains telemetry.prometheus_namespace, "
|
||||||
|
"telemetry.prometheus_pod_name and "
|
||||||
|
"telemetry.prometheus_container_name")
|
||||||
|
if prometheus_archive_files:
|
||||||
|
safe_logger.info("starting prometheus archive upload:")
|
||||||
|
telemetry_k8s.put_prometheus_data(config["telemetry"], prometheus_archive_files, telemetry_request_id)
|
||||||
|
if config["telemetry"]["logs_backup"] and distribution == "openshift":
|
||||||
telemetry_ocp.put_ocp_logs(telemetry_request_id, config["telemetry"], start_time, end_time)
|
telemetry_ocp.put_ocp_logs(telemetry_request_id, config["telemetry"], start_time, end_time)
|
||||||
except Exception as e:
|
except Exception as e:
|
||||||
logging.error(f"failed to send telemetry data: {str(e)}")
|
logging.error(f"failed to send telemetry data: {str(e)}")
|
||||||
@@ -408,11 +445,15 @@ def main(cfg):
|
|||||||
logging.error("Alert profile is not defined")
|
logging.error("Alert profile is not defined")
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
|
if post_critical_alerts > 0:
|
||||||
|
logging.error("Critical alerts are firing, please check; exiting")
|
||||||
|
sys.exit(2)
|
||||||
|
|
||||||
if failed_post_scenarios:
|
if failed_post_scenarios:
|
||||||
logging.error(
|
logging.error(
|
||||||
"Post scenarios are still failing at the end of all iterations"
|
"Post scenarios are still failing at the end of all iterations"
|
||||||
)
|
)
|
||||||
sys.exit(1)
|
sys.exit(2)
|
||||||
|
|
||||||
logging.info(
|
logging.info(
|
||||||
"Successfully finished running Kraken. UUID for the run: "
|
"Successfully finished running Kraken. UUID for the run: "
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ deployers:
|
|||||||
connection: {}
|
connection: {}
|
||||||
deployer_name: kubernetes
|
deployer_name: kubernetes
|
||||||
log:
|
log:
|
||||||
level: debug
|
level: error
|
||||||
logged_outputs:
|
logged_outputs:
|
||||||
error:
|
error:
|
||||||
level: error
|
level: error
|
||||||
|
|||||||
@@ -5,5 +5,9 @@ input_list:
|
|||||||
duration: 1s
|
duration: 1s
|
||||||
kubeconfig: ''
|
kubeconfig: ''
|
||||||
namespace: default
|
namespace: default
|
||||||
node_selector:
|
# set the node selector as a key-value pair eg.
|
||||||
kubernetes.io/hostname: kind-worker2
|
# node_selector:
|
||||||
|
# kubernetes.io/hostname: kind-worker2
|
||||||
|
node_selector: {}
|
||||||
|
|
||||||
|
|
||||||
|
|||||||
@@ -3,7 +3,7 @@ deployers:
|
|||||||
connection: {}
|
connection: {}
|
||||||
deployer_name: kubernetes
|
deployer_name: kubernetes
|
||||||
log:
|
log:
|
||||||
level: debug
|
level: error
|
||||||
logged_outputs:
|
logged_outputs:
|
||||||
error:
|
error:
|
||||||
level: error
|
level: error
|
||||||
|
|||||||
@@ -5,6 +5,9 @@ input_list:
|
|||||||
io_write_bytes: 10m
|
io_write_bytes: 10m
|
||||||
kubeconfig: ''
|
kubeconfig: ''
|
||||||
namespace: default
|
namespace: default
|
||||||
|
# set the node selector as a key-value pair eg.
|
||||||
|
# node_selector:
|
||||||
|
# kubernetes.io/hostname: kind-worker2
|
||||||
node_selector: {}
|
node_selector: {}
|
||||||
target_pod_folder: /hog-data
|
target_pod_folder: /hog-data
|
||||||
target_pod_volume:
|
target_pod_volume:
|
||||||
|
|||||||
@@ -4,7 +4,7 @@ deployers:
|
|||||||
connection: {}
|
connection: {}
|
||||||
deployer_name: kubernetes
|
deployer_name: kubernetes
|
||||||
log:
|
log:
|
||||||
level: debug
|
level: error
|
||||||
logged_outputs:
|
logged_outputs:
|
||||||
error:
|
error:
|
||||||
level: error
|
level: error
|
||||||
|
|||||||
@@ -2,10 +2,10 @@ input_list:
|
|||||||
- duration: 30s
|
- duration: 30s
|
||||||
vm_bytes: 10%
|
vm_bytes: 10%
|
||||||
vm_workers: 2
|
vm_workers: 2
|
||||||
node_selector: { }
|
# set the node selector as a key-value pair eg.
|
||||||
# node selector example
|
|
||||||
# node_selector:
|
# node_selector:
|
||||||
# kubernetes.io/hostname: master
|
# kubernetes.io/hostname: kind-worker2
|
||||||
|
node_selector: { }
|
||||||
kubeconfig: ""
|
kubeconfig: ""
|
||||||
namespace: default
|
namespace: default
|
||||||
|
|
||||||
|
|||||||
@@ -3,8 +3,4 @@
|
|||||||
config:
|
config:
|
||||||
namespace_pattern: ^kube-system$
|
namespace_pattern: ^kube-system$
|
||||||
label_selector: component=kube-scheduler
|
label_selector: component=kube-scheduler
|
||||||
- id: wait-for-pods
|
krkn_pod_recovery_time: 120
|
||||||
config:
|
|
||||||
namespace_pattern: ^kube-system$
|
|
||||||
label_selector: component=kube-scheduler
|
|
||||||
count: 3
|
|
||||||
|
|||||||
@@ -4,3 +4,4 @@
|
|||||||
name_pattern: ^nginx-.*$
|
name_pattern: ^nginx-.*$
|
||||||
namespace_pattern: ^default$
|
namespace_pattern: ^default$
|
||||||
kill: 1
|
kill: 1
|
||||||
|
krkn_pod_recovery_time: 120
|
||||||
|
|||||||
@@ -3,8 +3,4 @@
|
|||||||
config:
|
config:
|
||||||
namespace_pattern: ^kube-system$
|
namespace_pattern: ^kube-system$
|
||||||
label_selector: k8s-app=kube-scheduler
|
label_selector: k8s-app=kube-scheduler
|
||||||
- id: wait-for-pods
|
krkn_pod_recovery_time: 120
|
||||||
config:
|
|
||||||
namespace_pattern: ^kube-system$
|
|
||||||
label_selector: k8s-app=kube-scheduler
|
|
||||||
count: 3
|
|
||||||
|
|||||||
@@ -5,4 +5,4 @@ scenarios:
|
|||||||
container_name: "etcd"
|
container_name: "etcd"
|
||||||
action: 1
|
action: 1
|
||||||
count: 1
|
count: 1
|
||||||
expected_recovery_time: 60
|
expected_recovery_time: 120
|
||||||
|
|||||||
@@ -3,8 +3,4 @@
|
|||||||
config:
|
config:
|
||||||
namespace_pattern: ^acme-air$
|
namespace_pattern: ^acme-air$
|
||||||
name_pattern: .*
|
name_pattern: .*
|
||||||
- id: wait-for-pods
|
krkn_pod_recovery_time: 120
|
||||||
config:
|
|
||||||
namespace_pattern: ^acme-air$
|
|
||||||
name_pattern: .*
|
|
||||||
count: 8
|
|
||||||
@@ -3,8 +3,4 @@
|
|||||||
config:
|
config:
|
||||||
namespace_pattern: ^openshift-etcd$
|
namespace_pattern: ^openshift-etcd$
|
||||||
label_selector: k8s-app=etcd
|
label_selector: k8s-app=etcd
|
||||||
- id: wait-for-pods
|
krkn_pod_recovery_time: 120
|
||||||
config:
|
|
||||||
namespace_pattern: ^openshift-etcd$
|
|
||||||
label_selector: k8s-app=etcd
|
|
||||||
count: 3
|
|
||||||
|
|||||||
@@ -3,8 +3,5 @@
|
|||||||
config:
|
config:
|
||||||
namespace_pattern: ^openshift-apiserver$
|
namespace_pattern: ^openshift-apiserver$
|
||||||
label_selector: app=openshift-apiserver-a
|
label_selector: app=openshift-apiserver-a
|
||||||
- id: wait-for-pods
|
krkn_pod_recovery_time: 120
|
||||||
config:
|
|
||||||
namespace_pattern: ^openshift-apiserver$
|
|
||||||
label_selector: app=openshift-apiserver-a
|
|
||||||
count: 3
|
|
||||||
|
|||||||
@@ -3,8 +3,5 @@
|
|||||||
config:
|
config:
|
||||||
namespace_pattern: ^openshift-kube-apiserver$
|
namespace_pattern: ^openshift-kube-apiserver$
|
||||||
label_selector: app=openshift-kube-apiserver
|
label_selector: app=openshift-kube-apiserver
|
||||||
- id: wait-for-pods
|
krkn_pod_recovery_time: 120
|
||||||
config:
|
|
||||||
namespace_pattern: ^openshift-kube-apiserver$
|
|
||||||
label_selector: app=openshift-kube-apiserver
|
|
||||||
count: 3
|
|
||||||
|
|||||||
@@ -3,8 +3,4 @@
|
|||||||
config:
|
config:
|
||||||
namespace_pattern: ^openshift-monitoring$
|
namespace_pattern: ^openshift-monitoring$
|
||||||
label_selector: app=prometheus
|
label_selector: app=prometheus
|
||||||
- id: wait-for-pods
|
krkn_pod_recovery_time: 120
|
||||||
config:
|
|
||||||
namespace_pattern: ^openshift-monitoring$
|
|
||||||
label_selector: app=prometheus
|
|
||||||
count: 2
|
|
||||||
@@ -2,8 +2,4 @@
|
|||||||
config:
|
config:
|
||||||
namespace_pattern: ^openshift-monitoring$
|
namespace_pattern: ^openshift-monitoring$
|
||||||
label_selector: statefulset.kubernetes.io/pod-name=prometheus-k8s-0
|
label_selector: statefulset.kubernetes.io/pod-name=prometheus-k8s-0
|
||||||
- id: wait-for-pods
|
krkn_pod_recovery_time: 120
|
||||||
config:
|
|
||||||
namespace_pattern: ^openshift-monitoring$
|
|
||||||
label_selector: statefulset.kubernetes.io/pod-name=prometheus-k8s-0
|
|
||||||
count: 1
|
|
||||||
@@ -3,9 +3,4 @@
|
|||||||
config:
|
config:
|
||||||
namespace_pattern: ^openshift-monitoring$
|
namespace_pattern: ^openshift-monitoring$
|
||||||
label_selector: app=prometheus
|
label_selector: app=prometheus
|
||||||
- id: wait-for-pods
|
krkn_pod_recovery_time: 120
|
||||||
config:
|
|
||||||
namespace_pattern: ^openshift-monitoring$
|
|
||||||
label_selector: app=prometheus
|
|
||||||
count: 2
|
|
||||||
timeout: 180
|
|
||||||
|
|||||||
@@ -4,3 +4,4 @@
|
|||||||
namespace_pattern: ^openshift-.*$
|
namespace_pattern: ^openshift-.*$
|
||||||
name_pattern: .*
|
name_pattern: .*
|
||||||
kill: 3
|
kill: 3
|
||||||
|
krkn_pod_recovery_time: 120
|
||||||
|
|||||||
@@ -60,7 +60,14 @@
|
|||||||
"default": 1,
|
"default": 1,
|
||||||
"title": "Backoff",
|
"title": "Backoff",
|
||||||
"description": "How many seconds to wait between checks for the target pod status."
|
"description": "How many seconds to wait between checks for the target pod status."
|
||||||
|
},
|
||||||
|
"krkn_pod_recovery_time": {
|
||||||
|
"type": "integer",
|
||||||
|
"default": 30,
|
||||||
|
"title": "Recovery Time",
|
||||||
|
"description": "The Expected Recovery time fo the pod (used by Krkn to monitor the pod lifecycle)."
|
||||||
}
|
}
|
||||||
|
|
||||||
},
|
},
|
||||||
"required": [
|
"required": [
|
||||||
"namespace_pattern"
|
"namespace_pattern"
|
||||||
@@ -112,6 +119,12 @@
|
|||||||
"default": 1,
|
"default": 1,
|
||||||
"title": "Backoff",
|
"title": "Backoff",
|
||||||
"description": "How many seconds to wait between checks for the target pod status."
|
"description": "How many seconds to wait between checks for the target pod status."
|
||||||
|
},
|
||||||
|
"krkn_pod_recovery_time": {
|
||||||
|
"type": "integer",
|
||||||
|
"default": 30,
|
||||||
|
"title": "Recovery Time",
|
||||||
|
"description": "The Expected Recovery time fo the pod (used by Krkn to monitor the pod lifecycle)."
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
"required": [
|
"required": [
|
||||||
|
|||||||
39
utils/chaos_ai/README.md
Normal file
39
utils/chaos_ai/README.md
Normal file
@@ -0,0 +1,39 @@
|
|||||||
|
# aichaos
|
||||||
|
Enhancing Chaos Engineering with AI-assisted fault injection for better resiliency and non-functional testing.
|
||||||
|
|
||||||
|
## Generate python package wheel file
|
||||||
|
```
|
||||||
|
python3.9 generate_wheel_package.py sdist bdist_wheel
|
||||||
|
```
|
||||||
|
This creates a python package file aichaos-0.0.1-py3-none-any.whl in the dist folder.
|
||||||
|
|
||||||
|
## Build Image
|
||||||
|
```
|
||||||
|
cd docker
|
||||||
|
podman build -t aichaos:1.0 .
|
||||||
|
OR
|
||||||
|
docker build -t aichaos:1.0 .
|
||||||
|
```
|
||||||
|
|
||||||
|
## Run Chaos AI
|
||||||
|
```
|
||||||
|
podman run -v aichaos-config.json:/config/aichaos-config.json --privileged=true --name aichaos -p 5001:5001 aichaos:1.0
|
||||||
|
OR
|
||||||
|
docker run -v aichaos-config.json:/config/aichaos-config.json --privileged -v /var/run/docker.sock:/var/run/docker.sock --name aichaos -p 5001:5001 aichaos:1.0
|
||||||
|
```
|
||||||
|
|
||||||
|
The output should look like:
|
||||||
|
```
|
||||||
|
$ podman run -v aichaos-config.json:/config/aichaos-config.json --privileged=true --name aichaos -p 5001:5001 aichaos:1.0
|
||||||
|
* Serving Flask app 'swagger_api' (lazy loading)
|
||||||
|
* Environment: production
|
||||||
|
WARNING: This is a development server. Do not use it in a production deployment.
|
||||||
|
Use a production WSGI server instead.
|
||||||
|
* Debug mode: on
|
||||||
|
WARNING: This is a development server. Do not use it in a production deployment. Use a production WSGI server instead.
|
||||||
|
* Running on all addresses (0.0.0.0)
|
||||||
|
* Running on http://127.0.0.1:5001
|
||||||
|
* Running on http://172.17.0.2:5001
|
||||||
|
```
|
||||||
|
|
||||||
|
You can try out the APIs in browser at http://<server-ip>:5001/apidocs (eg. http://127.0.0.1:5001/apidocs). For testing out, you can try “GenerateChaos” api with ‘kubeconfig’ file and application URLs to test.
|
||||||
0
utils/chaos_ai/config/experiments/.gitkeep
Normal file
0
utils/chaos_ai/config/experiments/.gitkeep
Normal file
21
utils/chaos_ai/docker/Dockerfile
Normal file
21
utils/chaos_ai/docker/Dockerfile
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
FROM bitnami/kubectl:1.20.9 as kubectl
|
||||||
|
FROM python:3.9
|
||||||
|
WORKDIR /app
|
||||||
|
RUN pip3 install --upgrade pip
|
||||||
|
COPY config config/
|
||||||
|
COPY requirements.txt .
|
||||||
|
RUN mkdir -p /app/logs
|
||||||
|
RUN pip3 install -r requirements.txt
|
||||||
|
|
||||||
|
COPY --from=kubectl /opt/bitnami/kubectl/bin/kubectl /usr/local/bin/
|
||||||
|
|
||||||
|
COPY swagger_api.py .
|
||||||
|
ENV PYTHONUNBUFFERED=1
|
||||||
|
|
||||||
|
RUN curl -fsSLO https://get.docker.com/builds/Linux/x86_64/docker-17.03.1-ce.tgz && tar --strip-components=1 -xvzf docker-17.03.1-ce.tgz -C /usr/local/bin
|
||||||
|
|
||||||
|
RUN apt-get update && apt-get install -y podman
|
||||||
|
|
||||||
|
COPY aichaos-0.0.1-py3-none-any.whl .
|
||||||
|
RUN pip3 install aichaos-0.0.1-py3-none-any.whl
|
||||||
|
CMD ["python3", "swagger_api.py"]
|
||||||
7
utils/chaos_ai/docker/aichaos-config.json
Normal file
7
utils/chaos_ai/docker/aichaos-config.json
Normal file
@@ -0,0 +1,7 @@
|
|||||||
|
{
|
||||||
|
"command": "podman",
|
||||||
|
"chaosengine": "kraken",
|
||||||
|
"faults": "pod-delete",
|
||||||
|
"iterations": 1,
|
||||||
|
"maxfaults": 5
|
||||||
|
}
|
||||||
15
utils/chaos_ai/docker/config/experiments/log.yml
Executable file
15
utils/chaos_ai/docker/config/experiments/log.yml
Executable file
@@ -0,0 +1,15 @@
|
|||||||
|
|
||||||
|
Get Log from the Chaos ID.---
|
||||||
|
tags:
|
||||||
|
- ChaosAI API Results
|
||||||
|
parameters:
|
||||||
|
- name: chaosid
|
||||||
|
in: path
|
||||||
|
type: string
|
||||||
|
required: true
|
||||||
|
description: Chaos-ID
|
||||||
|
responses:
|
||||||
|
500:
|
||||||
|
description: Error!
|
||||||
|
200:
|
||||||
|
description: Results for the given Chaos ID.
|
||||||
36
utils/chaos_ai/docker/config/pod-delete.json
Normal file
36
utils/chaos_ai/docker/config/pod-delete.json
Normal file
@@ -0,0 +1,36 @@
|
|||||||
|
{
|
||||||
|
"apiVersion": "1.0",
|
||||||
|
"kind": "ChaosEngine",
|
||||||
|
"metadata": {
|
||||||
|
"name": "engine-cartns3"
|
||||||
|
},
|
||||||
|
"spec": {
|
||||||
|
"engineState": "active",
|
||||||
|
"annotationCheck": "false",
|
||||||
|
"appinfo": {
|
||||||
|
"appns": "robot-shop",
|
||||||
|
"applabel": "service=payment",
|
||||||
|
"appkind": "deployment"
|
||||||
|
},
|
||||||
|
"chaosServiceAccount": "pod-delete-sa",
|
||||||
|
"experiments": [
|
||||||
|
{
|
||||||
|
"name": "pod-delete",
|
||||||
|
"spec": {
|
||||||
|
"components": {
|
||||||
|
"env": [
|
||||||
|
{
|
||||||
|
"name": "FORCE",
|
||||||
|
"value": "true"
|
||||||
|
},
|
||||||
|
{
|
||||||
|
"name": "TOTAL_CHAOS_DURATION",
|
||||||
|
"value": "120"
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
]
|
||||||
|
}
|
||||||
|
}
|
||||||
40
utils/chaos_ai/docker/config/yml/chaosGen.yml
Executable file
40
utils/chaos_ai/docker/config/yml/chaosGen.yml
Executable file
@@ -0,0 +1,40 @@
|
|||||||
|
|
||||||
|
Generate chaos on an application deployed on a cluster.
|
||||||
|
---
|
||||||
|
tags:
|
||||||
|
- ChaosAI API
|
||||||
|
parameters:
|
||||||
|
- name: file
|
||||||
|
in: formData
|
||||||
|
type: file
|
||||||
|
required: true
|
||||||
|
description: Kube-config file
|
||||||
|
- name: namespace
|
||||||
|
in: formData
|
||||||
|
type: string
|
||||||
|
default: robot-shop
|
||||||
|
required: true
|
||||||
|
description: Namespace to test
|
||||||
|
- name: podlabels
|
||||||
|
in: formData
|
||||||
|
type: string
|
||||||
|
default: service=cart,service=payment
|
||||||
|
required: true
|
||||||
|
description: Pod labels to test
|
||||||
|
- name: nodelabels
|
||||||
|
in: formData
|
||||||
|
type: string
|
||||||
|
required: false
|
||||||
|
description: Node labels to test
|
||||||
|
- name: urls
|
||||||
|
in: formData
|
||||||
|
type: string
|
||||||
|
default: http://<application-url>:8097/api/cart/health,http://<application-url>:8097/api/payment/health
|
||||||
|
required: true
|
||||||
|
description: Application URLs to test
|
||||||
|
|
||||||
|
responses:
|
||||||
|
500:
|
||||||
|
description: Error!
|
||||||
|
200:
|
||||||
|
description: Chaos ID for the initiated chaos.
|
||||||
15
utils/chaos_ai/docker/config/yml/episodes.yml
Executable file
15
utils/chaos_ai/docker/config/yml/episodes.yml
Executable file
@@ -0,0 +1,15 @@
|
|||||||
|
|
||||||
|
Get Episodes from the Chaos ID.---
|
||||||
|
tags:
|
||||||
|
- ChaosAI API Results
|
||||||
|
parameters:
|
||||||
|
- name: chaosid
|
||||||
|
in: path
|
||||||
|
type: string
|
||||||
|
required: true
|
||||||
|
description: Chaos-ID
|
||||||
|
responses:
|
||||||
|
500:
|
||||||
|
description: Error!
|
||||||
|
200:
|
||||||
|
description: Results for the given Chaos ID.
|
||||||
15
utils/chaos_ai/docker/config/yml/log.yml
Executable file
15
utils/chaos_ai/docker/config/yml/log.yml
Executable file
@@ -0,0 +1,15 @@
|
|||||||
|
|
||||||
|
Get Log from the Chaos ID.---
|
||||||
|
tags:
|
||||||
|
- ChaosAI API Results
|
||||||
|
parameters:
|
||||||
|
- name: chaosid
|
||||||
|
in: path
|
||||||
|
type: string
|
||||||
|
required: true
|
||||||
|
description: Chaos-ID
|
||||||
|
responses:
|
||||||
|
500:
|
||||||
|
description: Error!
|
||||||
|
200:
|
||||||
|
description: Results for the given Chaos ID.
|
||||||
15
utils/chaos_ai/docker/config/yml/qtable.yml
Executable file
15
utils/chaos_ai/docker/config/yml/qtable.yml
Executable file
@@ -0,0 +1,15 @@
|
|||||||
|
|
||||||
|
Get QTable from the Chaos ID.---
|
||||||
|
tags:
|
||||||
|
- ChaosAI API Results
|
||||||
|
parameters:
|
||||||
|
- name: chaosid
|
||||||
|
in: path
|
||||||
|
type: string
|
||||||
|
required: true
|
||||||
|
description: Chaos-ID
|
||||||
|
responses:
|
||||||
|
500:
|
||||||
|
description: Error!
|
||||||
|
200:
|
||||||
|
description: Results for the given Chaos ID.
|
||||||
15
utils/chaos_ai/docker/config/yml/status.yml
Executable file
15
utils/chaos_ai/docker/config/yml/status.yml
Executable file
@@ -0,0 +1,15 @@
|
|||||||
|
|
||||||
|
Get status of the Constraints ID.---
|
||||||
|
tags:
|
||||||
|
- ChaosAI API
|
||||||
|
parameters:
|
||||||
|
- name: chaosid
|
||||||
|
in: path
|
||||||
|
type: string
|
||||||
|
required: true
|
||||||
|
description: Chaos-ID
|
||||||
|
responses:
|
||||||
|
500:
|
||||||
|
description: Error!
|
||||||
|
200:
|
||||||
|
description: Chaos for the given ID.
|
||||||
6
utils/chaos_ai/docker/requirements.txt
Normal file
6
utils/chaos_ai/docker/requirements.txt
Normal file
@@ -0,0 +1,6 @@
|
|||||||
|
numpy
|
||||||
|
pandas
|
||||||
|
requests
|
||||||
|
Flask==2.1.0
|
||||||
|
Werkzeug==2.3.8
|
||||||
|
flasgger==0.9.5
|
||||||
186
utils/chaos_ai/docker/swagger_api.py
Normal file
186
utils/chaos_ai/docker/swagger_api.py
Normal file
@@ -0,0 +1,186 @@
|
|||||||
|
import json, os
|
||||||
|
import logging
|
||||||
|
# import numpy as np
|
||||||
|
# import pandas as pd
|
||||||
|
import threading
|
||||||
|
from datetime import datetime
|
||||||
|
from flask import Flask, request
|
||||||
|
from flasgger import Swagger
|
||||||
|
from flasgger.utils import swag_from
|
||||||
|
# import zipfile
|
||||||
|
import sys
|
||||||
|
|
||||||
|
# sys.path.append("..")
|
||||||
|
from src.aichaos_main import AIChaos
|
||||||
|
|
||||||
|
app = Flask(__name__)
|
||||||
|
Swagger(app)
|
||||||
|
flaskdir = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "app", "logs") + '/'
|
||||||
|
|
||||||
|
|
||||||
|
class AIChaosSwagger:
|
||||||
|
def __init__(self, flaskdir=''):
|
||||||
|
self.flaskdir = flaskdir
|
||||||
|
|
||||||
|
@app.route("/")
|
||||||
|
def empty(params=''):
|
||||||
|
return "AI Chaos Repository!"
|
||||||
|
|
||||||
|
def startchaos(self, kubeconfigfile, file_id, params):
|
||||||
|
print('[StartChaos]', file_id, kubeconfigfile)
|
||||||
|
dir = flaskdir
|
||||||
|
outfile = ''.join([dir, 'out-', file_id])
|
||||||
|
initfile = ''.join([dir, 'init-', file_id])
|
||||||
|
with open(initfile, 'w'):
|
||||||
|
pass
|
||||||
|
if os.path.exists(outfile):
|
||||||
|
os.remove(outfile)
|
||||||
|
# kubeconfigfile = params['file']
|
||||||
|
os.environ["KUBECONFIG"] = kubeconfigfile
|
||||||
|
os.system("export KUBECONFIG="+kubeconfigfile)
|
||||||
|
os.system("echo $KUBECONFIG")
|
||||||
|
print('setting kubeconfig')
|
||||||
|
params['command'] = 'podman'
|
||||||
|
params['chaosengine'] = 'kraken'
|
||||||
|
params['faults'] = 'pod-delete'
|
||||||
|
params['iterations'] = 1
|
||||||
|
params['maxfaults'] = 5
|
||||||
|
if os.path.isfile('/config/aichaos-config.json'):
|
||||||
|
with open('/config/aichaos-config.json') as f:
|
||||||
|
config_params = json.load(f)
|
||||||
|
params['command'] = config_params['command']
|
||||||
|
params['chaosengine'] = config_params['chaosengine']
|
||||||
|
params['faults']= config_params['faults']
|
||||||
|
params['iterations'] = config_params['iterations']
|
||||||
|
params['maxfaults'] = config_params['maxfaults']
|
||||||
|
# faults = [f + ':' + p for f in params['faults'].split(',') for p in params['podlabels'].split(',')]
|
||||||
|
faults = []
|
||||||
|
for f in params['faults'].split(','):
|
||||||
|
if f in ['pod-delete']:
|
||||||
|
for p in params['podlabels'].split(','):
|
||||||
|
faults.append(f + ':' + p)
|
||||||
|
elif f in ['network-chaos', 'node-memory-hog', 'node-cpu-hog']:
|
||||||
|
for p in params['nodelabels'].split(','):
|
||||||
|
faults.append(f + ':' + p)
|
||||||
|
else:
|
||||||
|
pass
|
||||||
|
|
||||||
|
print('#faults:', len(faults), faults)
|
||||||
|
states = {'200': 0, '500': 1, '501': 2, '502': 3, '503': 4, '504': 5,
|
||||||
|
'401': 6, '403': 7, '404': 8, '429': 9,
|
||||||
|
'Timeout': 10, 'Other': 11}
|
||||||
|
rewards = {'200': -1, '500': 0.8, '501': 0.8, '502': 0.8, '503': 0.8, '504': 0.8,
|
||||||
|
'401': 1, '403': 1, '404': 1, '429': 1,
|
||||||
|
'Timeout': 1, 'Other': 1}
|
||||||
|
logfile = self.flaskdir + 'log_' + str(file_id)
|
||||||
|
qfile = self.flaskdir + 'qfile_' + str(file_id) + '.csv'
|
||||||
|
efile = self.flaskdir + 'efile_' + str(file_id)
|
||||||
|
epfile = self.flaskdir + 'episodes_' + str(file_id) + '.json'
|
||||||
|
# probe_url = params['probeurl']
|
||||||
|
cexp = {'pod-delete': 'pod-delete.json', 'cpu-hog': 'pod-cpu-hog.json',
|
||||||
|
'disk-fill': 'disk-fill.json', 'network-loss': 'network-loss.json',
|
||||||
|
'network-corruption': 'network-corruption.json', 'io-stress': 'io-stress.json'}
|
||||||
|
aichaos = AIChaos(states=states, faults=faults, rewards=rewards,
|
||||||
|
logfile=logfile, qfile=qfile, efile=efile, epfile=epfile,
|
||||||
|
urls=params['urls'].split(','), namespace=params['namespace'],
|
||||||
|
max_faults=int(params['maxfaults']),
|
||||||
|
num_requests=10, timeout=2,
|
||||||
|
chaos_engine=params['chaosengine'],
|
||||||
|
chaos_dir='config/', kubeconfig=kubeconfigfile,
|
||||||
|
loglevel=logging.DEBUG, chaos_experiment=cexp, iterations=int(params['iterations']),
|
||||||
|
command=params['command'])
|
||||||
|
print('checking kubeconfig')
|
||||||
|
os.system("echo $KUBECONFIG")
|
||||||
|
aichaos.start_chaos()
|
||||||
|
|
||||||
|
file = open(outfile, "w")
|
||||||
|
file.write('done')
|
||||||
|
file.close()
|
||||||
|
os.remove(initfile)
|
||||||
|
# os.remove(csvfile)
|
||||||
|
# ConstraintsInference().remove_temp_files(dir, file_id)
|
||||||
|
return 'WRITE'
|
||||||
|
|
||||||
|
@app.route('/GenerateChaos/', methods=['POST'])
|
||||||
|
@swag_from('config/yml/chaosGen.yml')
|
||||||
|
def chaos_gen():
|
||||||
|
dir = flaskdir
|
||||||
|
sw = AIChaosSwagger(flaskdir=dir)
|
||||||
|
f = request.files['file']
|
||||||
|
list = os.listdir(dir)
|
||||||
|
for i in range(10000):
|
||||||
|
fname = 'kubeconfig-'+str(i)
|
||||||
|
if fname not in list:
|
||||||
|
break
|
||||||
|
kubeconfigfile = ''.join([dir, 'kubeconfig-', str(i)])
|
||||||
|
f.save(kubeconfigfile)
|
||||||
|
# creating empty file
|
||||||
|
open(kubeconfigfile, 'a').close()
|
||||||
|
# print('HEADER:', f.headers)
|
||||||
|
print('[GenerateChaos] reqs:', request.form.to_dict())
|
||||||
|
# print('[GenerateChaos]', f.filename, datetime.now())
|
||||||
|
thread = threading.Thread(target=sw.startchaos, args=(kubeconfigfile, str(i), request.form.to_dict()))
|
||||||
|
thread.daemon = True
|
||||||
|
print(thread.getName())
|
||||||
|
thread.start()
|
||||||
|
return 'Chaos ID: ' + str(i)
|
||||||
|
|
||||||
|
@app.route('/GetStatus/<chaosid>', methods=['GET'])
|
||||||
|
@swag_from('config/yml/status.yml')
|
||||||
|
def get_status(chaosid):
|
||||||
|
print('[GetStatus]', chaosid, flaskdir)
|
||||||
|
epfile = flaskdir + 'episodes_' + str(chaosid) + '.json'
|
||||||
|
initfile = ''.join([flaskdir, 'init-', chaosid])
|
||||||
|
if os.path.exists(epfile):
|
||||||
|
return 'Completed'
|
||||||
|
elif os.path.exists(initfile):
|
||||||
|
return 'Running'
|
||||||
|
else:
|
||||||
|
return 'Does not exist'
|
||||||
|
|
||||||
|
@app.route('/GetQTable/<chaosid>', methods=['GET'])
|
||||||
|
@swag_from('config/yml/qtable.yml')
|
||||||
|
def get_qtable(chaosid):
|
||||||
|
print('[GetQTable]', chaosid)
|
||||||
|
qfile = flaskdir + 'qfile_' + str(chaosid) + '.csv'
|
||||||
|
initfile = ''.join([flaskdir, 'init-', chaosid])
|
||||||
|
if os.path.exists(qfile):
|
||||||
|
f = open(qfile, "r")
|
||||||
|
return f.read()
|
||||||
|
elif os.path.exists(initfile):
|
||||||
|
return 'Running'
|
||||||
|
else:
|
||||||
|
return 'Invalid Chaos ID: ' + chaosid
|
||||||
|
|
||||||
|
@app.route('/GetEpisodes/<chaosid>', methods=['GET'])
|
||||||
|
@swag_from('config/yml/episodes.yml')
|
||||||
|
def get_episodes(chaosid):
|
||||||
|
print('[GetEpisodes]', chaosid)
|
||||||
|
epfile = flaskdir + 'episodes_' + str(chaosid) + '.json'
|
||||||
|
initfile = ''.join([flaskdir, 'init-', chaosid])
|
||||||
|
if os.path.exists(epfile):
|
||||||
|
f = open(epfile, "r")
|
||||||
|
return f.read()
|
||||||
|
elif os.path.exists(initfile):
|
||||||
|
return 'Running'
|
||||||
|
else:
|
||||||
|
return 'Invalid Chaos ID: ' + chaosid
|
||||||
|
|
||||||
|
|
||||||
|
@app.route('/GetLog/<chaosid>', methods=['GET'])
|
||||||
|
@swag_from('config/yml/log.yml')
|
||||||
|
def get_log(chaosid):
|
||||||
|
print('[GetLog]', chaosid)
|
||||||
|
epfile = flaskdir + 'log_' + str(chaosid)
|
||||||
|
initfile = ''.join([flaskdir, 'init-', chaosid])
|
||||||
|
if os.path.exists(epfile):
|
||||||
|
f = open(epfile, "r")
|
||||||
|
return f.read()
|
||||||
|
elif os.path.exists(initfile):
|
||||||
|
return 'Running'
|
||||||
|
else:
|
||||||
|
return 'Invalid Chaos ID: ' + chaosid
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
app.run(debug=True, host='0.0.0.0', port='5001')
|
||||||
21
utils/chaos_ai/generate_wheel_package.py
Normal file
21
utils/chaos_ai/generate_wheel_package.py
Normal file
@@ -0,0 +1,21 @@
|
|||||||
|
import setuptools
|
||||||
|
# from setuptools_cythonize import get_cmdclass
|
||||||
|
|
||||||
|
setuptools.setup(
|
||||||
|
# cmdclass=get_cmdclass(),
|
||||||
|
name="aichaos",
|
||||||
|
version="0.0.1",
|
||||||
|
author="Sandeep Hans",
|
||||||
|
author_email="shans001@in.ibm.com",
|
||||||
|
description="Chaos AI",
|
||||||
|
long_description="Chaos Engineering using AI",
|
||||||
|
long_description_content_type="text/markdown",
|
||||||
|
url="",
|
||||||
|
packages=setuptools.find_packages(),
|
||||||
|
classifiers=[
|
||||||
|
"Programming Language :: Python :: 3",
|
||||||
|
"License :: OSI Approved :: MIT License",
|
||||||
|
"Operating System :: OS Independent",
|
||||||
|
],
|
||||||
|
python_requires='>=3.9',
|
||||||
|
)
|
||||||
10
utils/chaos_ai/requirements.txt
Normal file
10
utils/chaos_ai/requirements.txt
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
numpy
|
||||||
|
pandas
|
||||||
|
notebook
|
||||||
|
jupyterlab
|
||||||
|
jupyter
|
||||||
|
seaborn
|
||||||
|
requests
|
||||||
|
wheel
|
||||||
|
Flask==2.1.0
|
||||||
|
flasgger==0.9.5
|
||||||
0
utils/chaos_ai/src/__init__.py
Normal file
0
utils/chaos_ai/src/__init__.py
Normal file
213
utils/chaos_ai/src/aichaos.py
Normal file
213
utils/chaos_ai/src/aichaos.py
Normal file
@@ -0,0 +1,213 @@
|
|||||||
|
import json
|
||||||
|
import os
|
||||||
|
import random
|
||||||
|
import sys
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import logging
|
||||||
|
|
||||||
|
|
||||||
|
class AIChaos:
|
||||||
|
def __init__(self, states=None, faults=None, rewards=None, pod_names=[], chaos_dir=None,
|
||||||
|
chaos_experiment='experiment.json',
|
||||||
|
chaos_journal='journal.json', iterations=1000, static_run=False):
|
||||||
|
self.faults = faults
|
||||||
|
self.pod_names = pod_names
|
||||||
|
self.states = states
|
||||||
|
self.rewards = rewards
|
||||||
|
self.episodes = []
|
||||||
|
|
||||||
|
self.chaos_dir = chaos_dir
|
||||||
|
self.chaos_experiment = chaos_experiment
|
||||||
|
self.chaos_journal = chaos_journal
|
||||||
|
|
||||||
|
self.iterations = iterations
|
||||||
|
# Initialize parameters
|
||||||
|
self.gamma = 0.75 # Discount factor
|
||||||
|
self.alpha = 0.9 # Learning rate
|
||||||
|
|
||||||
|
# Initializing Q-Values
|
||||||
|
# self.Q = np.array(np.zeros([9, 9]))
|
||||||
|
# self.Q = np.array(np.zeros([len(faults), len(faults)]))
|
||||||
|
# currently action is a single fault, later on we will do multiple faults together
|
||||||
|
# For multiple faults, the no of cols in q-matrix will be all combinations of faults (infinite)
|
||||||
|
# eg. {f1,f2},f3,f4,{f4,f5} - f1,f2 in parallel, then f3, then f4, then f4,f5 in parallel produces end state
|
||||||
|
# self.Q = np.array(np.zeros([len(states), len(states)]))
|
||||||
|
self.Q = np.array(np.zeros([len(states), len(faults)]))
|
||||||
|
self.state_matrix = np.array(np.zeros([len(states), len(states)]))
|
||||||
|
|
||||||
|
# may be Q is a dictionary of dictionaries, for each state there is a dictionary of faults
|
||||||
|
# Q = {'500' = {'f1f2f4': 0.3, 'f1': 0.5}, '404' = {'f2': 0.22}}
|
||||||
|
|
||||||
|
self.logger = logging.getLogger()
|
||||||
|
# run from old static experiment and journal files
|
||||||
|
self.static_run = static_run
|
||||||
|
|
||||||
|
# End state is reached when system is down or return error code like '500','404'
|
||||||
|
def get_next_state(self):
|
||||||
|
self.logger.info('[GET_NEXT_STATE]')
|
||||||
|
f = open(self.chaos_dir + self.chaos_journal)
|
||||||
|
data = json.load(f)
|
||||||
|
|
||||||
|
# before the experiment (if before steady state is false, after is null?)
|
||||||
|
for probe in data['steady_states']['before']['probes']:
|
||||||
|
if not probe['tolerance_met']:
|
||||||
|
# start_state = probe['activity']['tolerance']
|
||||||
|
# end_state = probe['status']
|
||||||
|
start_state, end_state = None, None
|
||||||
|
return start_state, end_state
|
||||||
|
|
||||||
|
# after the experiment
|
||||||
|
for probe in data['steady_states']['after']['probes']:
|
||||||
|
# if probe['output']['status'] == probe['activity']['tolerance']:
|
||||||
|
if not probe['tolerance_met']:
|
||||||
|
# print(probe)
|
||||||
|
start_state = probe['activity']['tolerance']
|
||||||
|
end_state = probe['output']['status']
|
||||||
|
# end_state = probe['status']
|
||||||
|
return start_state, end_state
|
||||||
|
# if tolerances for all probes are met
|
||||||
|
start_state = probe['activity']['tolerance']
|
||||||
|
end_state = probe['activity']['tolerance']
|
||||||
|
return start_state, end_state
|
||||||
|
|
||||||
|
def inject_faults(self, fault, pod_name):
|
||||||
|
self.logger.info('[INJECT_FAULT] ' + fault)
|
||||||
|
f = open(self.chaos_dir + self.chaos_experiment)
|
||||||
|
data = json.load(f)
|
||||||
|
for m in data['method']:
|
||||||
|
if 'provider' in m:
|
||||||
|
if fault == 'kill_microservice':
|
||||||
|
m['name'] = 'kill-microservice'
|
||||||
|
m['provider']['module'] = 'chaosk8s.actions'
|
||||||
|
m['provider']['arguments']['name'] = pod_name
|
||||||
|
else:
|
||||||
|
m['provider']['arguments']['name_pattern'] = pod_name
|
||||||
|
m['provider']['func'] = fault
|
||||||
|
|
||||||
|
print('[INJECT_FAULT] method:', m)
|
||||||
|
# self.logger.info('[INJECT_FAULT] ' + m['provider']['arguments']['name_pattern'])
|
||||||
|
# self.logger.info('[INJECT_FAULT] ' + str(m))
|
||||||
|
|
||||||
|
exp_file = self.chaos_dir + 'experiment_' + str(random.randint(1, 10)) + '.json'
|
||||||
|
with open(exp_file, 'w') as f:
|
||||||
|
json.dump(data, f)
|
||||||
|
exp_file = self.chaos_dir + 'experiment.json'
|
||||||
|
# execute faults
|
||||||
|
# cmd = 'cd ' + self.chaos_dir + ';chaos run ' + self.chaos_experiment
|
||||||
|
cmd = 'cd ' + self.chaos_dir + ';chaos run ' + exp_file
|
||||||
|
if not self.static_run:
|
||||||
|
os.system(cmd)
|
||||||
|
|
||||||
|
def create_episode(self):
|
||||||
|
self.logger.info('[CREATE_EPISODE]')
|
||||||
|
episode = []
|
||||||
|
while True:
|
||||||
|
# inject more faults
|
||||||
|
# TODO: model - choose faults based on q-learning ...
|
||||||
|
fault_pod = random.choice(self.faults)
|
||||||
|
fault = fault_pod.split(':')[0]
|
||||||
|
pod_name = fault_pod.split(':')[1]
|
||||||
|
# fault = random.choice(self.faults)
|
||||||
|
# pod_name = random.choice(self.pod_names)
|
||||||
|
# fault = lstm_model.get_next_fault(episode)
|
||||||
|
# fault = get_max_prob_fault(episode)
|
||||||
|
|
||||||
|
self.inject_faults(fault, pod_name)
|
||||||
|
start_state, next_state = self.get_next_state()
|
||||||
|
print('[CREATE EPISODE]', start_state, next_state)
|
||||||
|
# if before state tolerance is not met
|
||||||
|
if start_state is None and next_state is None:
|
||||||
|
continue
|
||||||
|
|
||||||
|
episode.append({'fault': fault, 'pod_name': pod_name})
|
||||||
|
self.update_q_fault(fault_pod, episode, start_state, next_state)
|
||||||
|
# self.update_q_fault(fault, episode, start_state, next_state)
|
||||||
|
# if an end_state is reached
|
||||||
|
# if next_state is not None:
|
||||||
|
if start_state != next_state:
|
||||||
|
self.logger.info('[CREATE_EPISODE] EPISODE CREATED:' + str(episode))
|
||||||
|
self.logger.info('[CREATE_EPISODE] END STATE:' + str(next_state))
|
||||||
|
return episode, start_state, next_state
|
||||||
|
|
||||||
|
def update_q_fault(self, fault, episode, start_state, end_state):
|
||||||
|
self.logger.info('[UPDATE_Q]')
|
||||||
|
print('[UPDATE_Q] ', str(start_state), str(end_state))
|
||||||
|
if end_state is None:
|
||||||
|
end_state = start_state
|
||||||
|
|
||||||
|
# reward is dependent on the error response (eg. '404') and length of episode
|
||||||
|
reward = self.rewards[str(end_state)] / len(episode)
|
||||||
|
current_state = self.states[str(start_state)]
|
||||||
|
next_state = self.states[str(end_state)]
|
||||||
|
fault_index = self.faults.index(fault)
|
||||||
|
|
||||||
|
TD = reward + \
|
||||||
|
self.gamma * self.Q[next_state, np.argmax(self.Q[next_state,])] - \
|
||||||
|
self.Q[current_state, fault_index]
|
||||||
|
self.Q[current_state, fault_index] += self.alpha * TD
|
||||||
|
|
||||||
|
# update state matrix
|
||||||
|
TD_state = reward + \
|
||||||
|
self.gamma * self.state_matrix[next_state, np.argmax(self.state_matrix[next_state,])] - \
|
||||||
|
self.state_matrix[current_state, next_state]
|
||||||
|
self.state_matrix[current_state, next_state] += self.alpha * TD_state
|
||||||
|
|
||||||
|
# def update_q(self, episode, start_state, end_state):
|
||||||
|
# self.logger.info('[UPDATE_Q]')
|
||||||
|
# if end_state is None:
|
||||||
|
# end_state = start_state
|
||||||
|
#
|
||||||
|
# # reward is dependent on the error response (eg. '404') and length of episode
|
||||||
|
# reward = self.rewards[str(end_state)] / len(episode)
|
||||||
|
# current_state = self.states[str(start_state)]
|
||||||
|
# next_state = self.states[str(end_state)]
|
||||||
|
# TD = reward + \
|
||||||
|
# self.gamma * self.Q[next_state, np.argmax(self.Q[next_state,])] - \
|
||||||
|
# self.Q[current_state, next_state]
|
||||||
|
# self.Q[current_state, next_state] += self.alpha * TD
|
||||||
|
|
||||||
|
def start_chaos(self):
|
||||||
|
for i in range(self.iterations):
|
||||||
|
episode, start_state, end_state = self.create_episode()
|
||||||
|
# update Q matrix
|
||||||
|
# will do it with each fault injection
|
||||||
|
# self.update_q(episode, start_state, end_state)
|
||||||
|
print(self.Q)
|
||||||
|
print(self.state_matrix)
|
||||||
|
|
||||||
|
|
||||||
|
def test_chaos():
|
||||||
|
svc_list = ['cart', 'catalogue', 'dispatch', 'mongodb', 'mysql', 'payment', 'rabbitmq', 'ratings', 'redis',
|
||||||
|
'shipping', 'user', 'web']
|
||||||
|
# Define faults
|
||||||
|
# faults = ['terminate_pods']
|
||||||
|
# faults = ['terminate_pods:' + x for x in pod_names]
|
||||||
|
faults = ['kill_microservice:' + x for x in svc_list]
|
||||||
|
# Define the states
|
||||||
|
states = {
|
||||||
|
'200': 0,
|
||||||
|
'500': 1,
|
||||||
|
'404': 2
|
||||||
|
}
|
||||||
|
# Define rewards, currently not used
|
||||||
|
rewards = {
|
||||||
|
'200': 0,
|
||||||
|
'500': 0.8,
|
||||||
|
'404': 1
|
||||||
|
}
|
||||||
|
|
||||||
|
# cdir = '/Users/sandeephans/Downloads/chaos/chaostoolkit-samples-master/service-down-not-visible-to-users/'
|
||||||
|
cdir = '/Users/sandeephans/Downloads/openshift/'
|
||||||
|
cexp = 'experiment.json'
|
||||||
|
cjournal = 'journal.json'
|
||||||
|
|
||||||
|
aichaos = AIChaos(states=states, faults=faults, rewards=rewards,
|
||||||
|
chaos_dir=cdir, chaos_experiment=cexp, chaos_journal=cjournal,
|
||||||
|
static_run=False)
|
||||||
|
aichaos.start_chaos()
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
|
||||||
|
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
|
||||||
|
test_chaos()
|
||||||
248
utils/chaos_ai/src/aichaos_main.py
Normal file
248
utils/chaos_ai/src/aichaos_main.py
Normal file
@@ -0,0 +1,248 @@
|
|||||||
|
import json
|
||||||
|
import os
|
||||||
|
import random
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
import pandas as pd
|
||||||
|
import logging
|
||||||
|
|
||||||
|
# sys.path.insert(1, os.path.join(sys.path[0], '..'))
|
||||||
|
import src.utils as utils
|
||||||
|
from src.kraken_utils import KrakenUtils
|
||||||
|
from src.qlearning import QLearning
|
||||||
|
from src.test_application import TestApplication
|
||||||
|
|
||||||
|
|
||||||
|
class AIChaos:
|
||||||
|
def __init__(self, namespace='robot-shop', states=None, faults=None, rewards=None, urls=[], max_faults=5,
|
||||||
|
service_weights=None, ctd_subsets=None, pod_names=[], chaos_dir='../config/', kubeconfig='~/.kube/config',
|
||||||
|
chaos_experiment='experiment.json', logfile='log', qfile='qfile.csv', efile='efile', epfile='episodes.json',
|
||||||
|
loglevel=logging.INFO,
|
||||||
|
chaos_journal='journal.json', iterations=10, alpha=0.9, gamma=0.2, epsilon=0.3,
|
||||||
|
num_requests=10, sleep_time=1, timeout=2, chaos_engine='kraken', dstk_probes=None,
|
||||||
|
static_run=False, all_faults=False, command='podman'):
|
||||||
|
self.namespace = namespace
|
||||||
|
self.faults = faults
|
||||||
|
self.unused_faults = faults.copy()
|
||||||
|
self.all_faults = all_faults
|
||||||
|
self.pod_names = pod_names
|
||||||
|
self.states = states
|
||||||
|
self.rewards = rewards
|
||||||
|
self.urls = urls
|
||||||
|
self.max_faults = max_faults
|
||||||
|
self.episodes = []
|
||||||
|
self.service_weights = service_weights
|
||||||
|
self.ctd_subsets = ctd_subsets
|
||||||
|
|
||||||
|
self.kubeconfig = kubeconfig
|
||||||
|
self.chaos_dir = chaos_dir
|
||||||
|
self.chaos_experiment = chaos_experiment
|
||||||
|
self.chaos_journal = chaos_journal
|
||||||
|
self.command = command
|
||||||
|
|
||||||
|
if chaos_engine == 'kraken':
|
||||||
|
self.chaos_engine = KrakenUtils(namespace, kubeconfig=kubeconfig, chaos_dir=chaos_dir, chaos_experiment=chaos_experiment, command=self.command)
|
||||||
|
else:
|
||||||
|
self.chaos_engine = None
|
||||||
|
|
||||||
|
self.iterations = iterations
|
||||||
|
# Initialize RL parameters
|
||||||
|
self.epsilon = epsilon # epsilon decay policy
|
||||||
|
# self.epsdecay = 0
|
||||||
|
|
||||||
|
# log files
|
||||||
|
self.logfile = logfile
|
||||||
|
self.qfile = qfile
|
||||||
|
self.efile = efile
|
||||||
|
self.epfile = epfile
|
||||||
|
open(efile, 'w+').close()
|
||||||
|
open(logfile, 'w+').close()
|
||||||
|
open(logfile, 'r+').truncate(0)
|
||||||
|
logging.getLogger("requests").setLevel(logging.WARNING)
|
||||||
|
logging.getLogger("urllib3").setLevel(logging.WARNING)
|
||||||
|
logging.basicConfig(filename=logfile, filemode='w+', level=loglevel)
|
||||||
|
self.logger = logging.getLogger(logfile.replace('/',''))
|
||||||
|
self.logger.addHandler(logging.FileHandler(logfile))
|
||||||
|
|
||||||
|
self.testapp = TestApplication(num_requests, timeout, sleep_time)
|
||||||
|
self.ql = QLearning(gamma, alpha, faults, states, rewards, urls)
|
||||||
|
|
||||||
|
# run from old static experiment and journal files
|
||||||
|
self.static_run = static_run
|
||||||
|
|
||||||
|
def realistic(self, faults_pods):
|
||||||
|
self.logger.debug('[Realistic] ' + str(faults_pods))
|
||||||
|
fp = faults_pods.copy()
|
||||||
|
for f1 in faults_pods:
|
||||||
|
for f2 in faults_pods:
|
||||||
|
if f1 == f2:
|
||||||
|
continue
|
||||||
|
if f1 in fp and f2 in fp:
|
||||||
|
f1_fault, load_1 = utils.get_load(f1.split(':')[0])
|
||||||
|
f1_pod = f1.split(':')[1]
|
||||||
|
f2_fault, load_2 = utils.get_load(f2.split(':')[0])
|
||||||
|
f2_pod = f2.split(':')[1]
|
||||||
|
if f1_pod == f2_pod:
|
||||||
|
if f1_fault == 'pod-delete':
|
||||||
|
fp.remove(f2)
|
||||||
|
if f1_fault == f2_fault:
|
||||||
|
# if int(load_1) > int(load_2):
|
||||||
|
# randomly remove one fault from same faults with different params
|
||||||
|
fp.remove(f2)
|
||||||
|
if self.service_weights is None:
|
||||||
|
return fp
|
||||||
|
|
||||||
|
fp_copy = fp.copy()
|
||||||
|
for f in fp:
|
||||||
|
f_fault = f.split(':')[0]
|
||||||
|
f_pod = f.split(':')[1].replace('service=', '')
|
||||||
|
self.logger.debug('[ServiceWeights] ' + f + ' ' + str(self.service_weights[f_pod][f_fault]))
|
||||||
|
if self.service_weights[f_pod][f_fault] == 0:
|
||||||
|
fp_copy.remove(f)
|
||||||
|
|
||||||
|
self.logger.debug('[Realistic] ' + str(fp_copy))
|
||||||
|
return fp_copy
|
||||||
|
|
||||||
|
def select_faults(self):
|
||||||
|
max_faults = min(self.max_faults, len(self.unused_faults))
|
||||||
|
num_faults = random.randint(1, max_faults)
|
||||||
|
if self.all_faults:
|
||||||
|
num_faults = len(self.unused_faults)
|
||||||
|
if random.random() > self.epsilon:
|
||||||
|
self.logger.info('[Exploration]')
|
||||||
|
# faults_pods = random.sample(self.faults, k=num_faults)
|
||||||
|
# using used faults list to avoid starvation
|
||||||
|
faults_pods = random.sample(self.unused_faults, k=num_faults)
|
||||||
|
faults_pods = self.realistic(faults_pods)
|
||||||
|
for f in faults_pods:
|
||||||
|
self.unused_faults.remove(f)
|
||||||
|
if len(self.unused_faults) == 0:
|
||||||
|
self.unused_faults = self.faults.copy()
|
||||||
|
else:
|
||||||
|
self.logger.info('[Exploitation]')
|
||||||
|
first_row = self.ql.Q[:, 0, :][0]
|
||||||
|
top_k_indices = np.argpartition(first_row, -num_faults)[-num_faults:]
|
||||||
|
faults_pods = [self.faults[i] for i in top_k_indices]
|
||||||
|
faults_pods = self.realistic(faults_pods)
|
||||||
|
|
||||||
|
return faults_pods
|
||||||
|
|
||||||
|
def create_episode(self, ctd_subset=None):
|
||||||
|
self.logger.debug('[CREATE_EPISODE]')
|
||||||
|
episode = []
|
||||||
|
|
||||||
|
if ctd_subset is None:
|
||||||
|
faults_pods = self.select_faults()
|
||||||
|
else:
|
||||||
|
faults_pods = ctd_subset
|
||||||
|
self.logger.info('CTD Subset: ' + str(faults_pods))
|
||||||
|
|
||||||
|
# faults_pods = self.realistic(faults_pods)
|
||||||
|
if len(faults_pods) == 0:
|
||||||
|
return [], 200, 200
|
||||||
|
|
||||||
|
engines = []
|
||||||
|
for fp in faults_pods:
|
||||||
|
fault = fp.split(':')[0]
|
||||||
|
pod_name = fp.split(':')[1]
|
||||||
|
engine = self.chaos_engine.inject_faults(fault, pod_name)
|
||||||
|
engines.append(engine)
|
||||||
|
episode.append({'fault': fault, 'pod_name': pod_name})
|
||||||
|
self.logger.info('[create_episode]' + str(faults_pods))
|
||||||
|
engines_running = self.chaos_engine.wait_engines(engines)
|
||||||
|
self.logger.info('[create_episode] engines_running' + str(engines_running))
|
||||||
|
if not engines_running:
|
||||||
|
return None, None, None
|
||||||
|
|
||||||
|
# randomly shuffling urls
|
||||||
|
urls = random.sample(self.urls, len(self.urls))
|
||||||
|
ep_json = []
|
||||||
|
for url in urls:
|
||||||
|
start_state, next_state = self.testapp.test_load(url)
|
||||||
|
self.logger.info('[CREATE EPISODE]' + str(start_state) + ',' + str(next_state))
|
||||||
|
# if before state tolerance is not met
|
||||||
|
if start_state is None and next_state is None:
|
||||||
|
# self.cleanup()
|
||||||
|
self.chaos_engine.stop_engines()
|
||||||
|
continue
|
||||||
|
|
||||||
|
### episode.append({'fault': fault, 'pod_name': pod_name})
|
||||||
|
# self.update_q_fault(fault_pod, episode, start_state, next_state)
|
||||||
|
url_index = self.urls.index(url)
|
||||||
|
self.logger.info('[CREATEEPISODE]' + str(url) + ':' + str(url_index))
|
||||||
|
for fp in faults_pods:
|
||||||
|
self.ql.update_q_fault(fp, episode, start_state, next_state, self.urls.index(url))
|
||||||
|
ep_json.append({'start_state': start_state, 'next_state': next_state, 'url': url, 'faults': episode})
|
||||||
|
|
||||||
|
self.logger.debug('[CREATE_EPISODE] EPISODE CREATED:' + str(episode))
|
||||||
|
self.logger.debug('[CREATE_EPISODE] END STATE:' + str(next_state))
|
||||||
|
|
||||||
|
self.chaos_engine.print_result(engines)
|
||||||
|
self.chaos_engine.stop_engines(episode=episode)
|
||||||
|
# ep_json = {'start_state': start_state, 'next_state': next_state, 'faults': episode}
|
||||||
|
|
||||||
|
return ep_json, start_state, next_state
|
||||||
|
|
||||||
|
def start_chaos(self):
    """Main chaos loop.

    Runs `self.iterations` random episodes (or, when `self.ctd_subsets` is
    set, one episode per CTD subset), persisting the Q-table and episode log
    after every successful episode, and writes all episodes to `self.epfile`
    as JSON at the end.
    """
    self.logger.info('[INITIALIZING]')
    self.logger.info('Logfile: '+self.logfile)
    self.logger.info('Loggerfile: '+self.logger.handlers[0].stream.name)
    self.logger.info('Chaos Engine: ' + self.chaos_engine.get_name())
    self.logger.debug('Faults:' + str(self.faults))

    # Remove any engines left over from a previous run.
    self.chaos_engine.cleanup()
    if self.ctd_subsets is None:
        for i in range(self.iterations):
            episode, start_state, end_state = self.create_episode()
            self.logger.debug('[start_chaos]' + str(i) + ' ' + str(episode))
            if episode is None:
                # Episode could not be set up (engines failed to start); retry.
                continue
            # Q-matrix updates happen inside create_episode, per fault injection.
            self.episodes.extend(episode)
            self.logger.info(str(i) + ' ' + str(self.ql.Q[:, 0]))
            self.write_q()
            self.write_episode(episode)
    else:
        for i, subset in enumerate(self.ctd_subsets):
            episode, start_state, end_state = self.create_episode(subset)
            self.logger.debug('[start_chaos]' + str(episode))
            if episode is None:
                continue
            # Consistency fix: this branch used `append`, nesting the
            # per-episode list inside self.episodes, while the branch above
            # flattens with `extend`; write_episode and the final JSON dump
            # both expect a flat list of episode dicts.
            self.episodes.extend(episode)
            self.logger.info(str(i) + ' ' + str(self.ql.Q[:, 0]))
            self.write_q()
            self.write_episode(episode)

    self.chaos_engine.cleanup()
    with open(self.epfile, 'w', encoding='utf-8') as f:
        json.dump(self.episodes, f, ensure_ascii=False, indent=4)
    self.logger.info('COMPLETE!!!')
|
||||||
|
|
||||||
|
def write_q(self):
    """Persist a snapshot of the Q-table to ``self.qfile`` as CSV.

    Rows are the probed URLs, columns the fault labels; only state index 0
    of the (url, state, fault) table is written. Returns the DataFrame.
    """
    # State slice 0 corresponds to the '200' (healthy) start state.
    q_slice = self.ql.Q[:, 0, :]
    snapshot = pd.DataFrame(q_slice, index=self.urls, columns=self.faults)
    snapshot.to_csv(self.qfile)
    return snapshot
|
||||||
|
|
||||||
|
def write_episode(self, episode):
    """Append one CSV line per episode entry to ``self.efile``.

    Each line has the form ``fault:pod,...,url,next_state``.

    :param episode: list of dicts with keys 'faults' (list of
        {'fault', 'pod_name'} dicts), 'url' and 'next_state'.
    """
    # Fixed: the file was re-opened once per entry inside the loop; open it
    # once for the whole episode (append mode, so output is identical).
    with open(self.efile, "a") as outfile:
        for ep in episode:
            fields = [e['fault'] + ':' + e['pod_name'] for e in ep['faults']]
            fields.append(ep['url'])
            fields.append(str(ep['next_state']))
            outfile.write(','.join(fields) + '\n')
|
||||||
|
|
||||||
|
def remove_temp_file(self):
    """Delete leftover experiment ``*.json`` files under
    ``<chaos_dir>/experiments``; a missing directory is a no-op."""
    mydir = self.chaos_dir + 'experiments'
    print('Removing temp files from: '+mydir)
    self.logger.debug('Removing temp files: '+mydir)
    # Bug fix: the guard was inverted (`if os.path.exists(mydir): return`),
    # so the method returned when the directory existed and crashed on
    # os.listdir when it did not.
    if not os.path.exists(mydir):
        return
    filelist = [f for f in os.listdir(mydir) if f.endswith(".json")]
    for f in filelist:
        print(f)
        os.remove(os.path.join(mydir, f))
|
||||||
56
utils/chaos_ai/src/experiments.py
Normal file
56
utils/chaos_ai/src/experiments.py
Normal file
@@ -0,0 +1,56 @@
|
|||||||
|
import random
|
||||||
|
|
||||||
|
|
||||||
|
class Experiments:
    """Ad-hoc driver that checks whether fault effects compose monotonically.

    For random pairs of faults it injects each fault alone and then both
    together, logging the observed responses and a monotonicity verdict to a
    file named ``experiment``.
    """

    def __init__(self):
        # Unused counter, kept for interface compatibility.
        self.k = 0

    def monotonic(self, aichaos, num_sets=3):
        """Run ``num_sets`` rounds; each round samples a random fault pair
        from ``aichaos.faults`` and probes all URLs under fault A, fault B,
        and A+B combined."""
        for round_no in range(num_sets):
            pair = random.sample(aichaos.faults, k=2)
            # Singleton fault sets first, then the combined set.
            combos = [[pair[0]], [pair[1]], [pair[0], pair[1]]]

            resp1, resp2, resp_both = 0, 0, 0
            for combo in combos:
                engines = []
                for entry in combo:
                    fault_name = entry.split(':')[0]
                    target_pod = entry.split(':')[1]
                    engines.append(aichaos.inject_faults_litmus(fault_name, target_pod))
                aichaos.litmus.wait_engines(engines)

                for _, url in enumerate(aichaos.urls):
                    start_state, next_state = aichaos.test_load(url)
                    print(round_no, combo, next_state)
                    # First non-zero observation fills resp1, then resp2,
                    # then resp_both (one slot per combo, in order).
                    if resp1 == 0:
                        resp1 = next_state
                    elif resp2 == 0:
                        resp2 = next_state
                    else:
                        resp_both = next_state

            aichaos.litmus.stop_engines()
            self.write_resp(str(combos[2]), resp1, resp2, resp_both)
        print('Experiment Complete!!!')

    @staticmethod
    def write(fault, next_state):
        """Append a single fault/response observation to the 'experiment' file."""
        with open("experiment", "a") as outfile:
            outfile.write(fault + ',' + str(next_state) + ',' + '\n')

    @staticmethod
    def write_resp(faults, resp1, resp2, resp3):
        """Append the three responses plus a monotonicity verdict.

        Monotone means: the combined fault (resp3) fails exactly when at
        least one single fault fails.
        """
        if resp3 == 200:
            # Combined call succeeded: monotone only if both singles succeeded.
            monotonic = resp1 == 200 and resp2 == 200
        else:
            # Combined call failed: monotone only if at least one single failed.
            monotonic = not (resp1 == 200 and resp2 == 200)

        with open("experiment", "a") as outfile:
            outfile.write(faults + ',' + str(resp1) + ',' + str(resp2) + ',' + str(resp3) + ',' + str(monotonic) + '\n')
|
||||||
99
utils/chaos_ai/src/kraken_utils.py
Normal file
99
utils/chaos_ai/src/kraken_utils.py
Normal file
@@ -0,0 +1,99 @@
|
|||||||
|
import json
|
||||||
|
import os
|
||||||
|
import time
|
||||||
|
import logging
|
||||||
|
|
||||||
|
import src.utils as utils
|
||||||
|
|
||||||
|
|
||||||
|
class KrakenUtils:
    """Drives krkn-hub chaos scenarios by launching engine containers with
    podman/docker.

    Each injected fault becomes a container ("engine"); this class starts
    engines, polls their logs to confirm they are running, and tears them
    down again.
    """

    def __init__(self, namespace='robot-shop', chaos_dir='../config/',
                 chaos_experiment='experiment.json', kubeconfig='~/.kube/config', wait_checks=60, command='podman'):
        self.chaos_dir = chaos_dir
        self.chaos_experiment = chaos_experiment
        self.namespace = namespace
        self.kubeconfig = kubeconfig
        self.logger = logging.getLogger()
        # Names of engine containers currently believed to be running.
        self.engines = []
        # Maximum number of 1-second polls while waiting for an engine to start.
        self.wait_checks = wait_checks
        # Container runtime used to launch engines ('podman' or 'docker').
        self.command = command

    def exp_status(self, engine='engine-cartns3'):
        """Return 'Running' when the engine container's log contains one of
        the known progress messages, else 'Not Running'."""
        substring_list = ['Waiting for the specified duration','Waiting for wait_duration', 'Step workload started, waiting for response']
        substr = '|'.join(substring_list)
        cmd = self.command +" logs "+engine+" 2>&1 | grep -E '"+substr+"'"
        line = os.popen(cmd).read()
        self.logger.debug('[exp_status]'+line)
        if any(map(line.__contains__, substring_list)):
            return 'Running'
        return 'Not Running'

    # print chaos result, check if litmus showed any error
    def print_result(self, engines):
        """Log a result line for each engine (no chaosresult lookup here)."""
        for e in engines:
            self.logger.debug('[KRAKEN][Chaos Result] '+e)

    def wait_engines(self, engines=None):
        """Poll each engine until its log shows it is running.

        Returns True only when every engine reaches 'Running' within
        ``self.wait_checks`` seconds; False as soon as one fails to start.
        """
        # Fixed: mutable default argument ([]) replaced by a None sentinel.
        if engines is None:
            engines = []
        status = 'Completed'
        max_checks = self.wait_checks
        for e in engines:
            self.logger.info('[Wait Engines] ' + e)
            for i in range(max_checks):
                status = self.exp_status(e)
                if status == 'Running':
                    break
                time.sleep(1)
            # return False, if even one engine is not running
            if status != 'Running':
                return False

        self.engines = engines
        # return True if all engines are running
        return True

    def cleanup(self):
        """Stop any tracked engine containers and prune exited containers."""
        self.logger.debug('Removing previous engines')
        if len(self.engines) > 0:
            cmd = self.command+" stop " + " ".join(self.engines) + " >> temp"
            os.system(cmd)
            self.engines = []

        cmd = self.command+" container prune -f >> temp"
        os.system(cmd)
        self.logger.debug('Engines removed')

    def stop_engines(self, episode=None):
        """Stop all engines; ``episode`` is accepted for API parity with other
        chaos backends and is unused here."""
        # Fixed: mutable default argument ([]) replaced by a None sentinel.
        self.cleanup()

    def get_name(self):
        """Identifier of this chaos backend."""
        return 'kraken'

    def inject_faults(self, fault, pod_name):
        """Launch the krkn-hub container for ``fault`` against ``pod_name``
        (a pod label or node name, depending on the scenario).

        Returns the engine (container) name so callers can poll/stop it.
        NOTE(review): the shell command is built by string concatenation from
        fault/pod names; inputs are internal configuration, but they must be
        trusted — do not feed user-controlled strings here.
        """
        self.logger.debug('[KRAKEN][INJECT_FAULT] ' + fault + ':' + pod_name)
        fault, load = utils.get_load(fault)
        engine = 'engine-' + pod_name.replace('=', '-').replace('/','-') + '-' + fault
        if fault == 'pod-delete':
            cmd = self.command+' run -d -e NAMESPACE='+self.namespace+' -e POD_LABEL='+pod_name+' --name='+engine+' --net=host -v '+self.kubeconfig+':/root/.kube/config:Z quay.io/redhat-chaos/krkn-hub:pod-scenarios >> temp'
        elif fault == 'network-chaos':
            cmd = self.command+' run -d -e NODE_NAME='+pod_name+' -e DURATION=120 --name='+engine+' --net=host -v '+self.kubeconfig+':/root/.kube/config:Z -d quay.io/redhat-chaos/krkn-hub:network-chaos >> temp'
        elif fault == 'node-memory-hog':
            cmd = self.command+' run -d -e NODE_NAME='+pod_name+' -e DURATION=120 -e NODES_AFFECTED_PERC=100 --name='+engine+' --net=host -v '+self.kubeconfig+':/root/.kube/config:Z -d quay.io/redhat-chaos/krkn-hub:node-memory-hog >> temp'
        elif fault == 'node-cpu-hog':
            cmd = self.command+' run -e NODE_SELECTORS='+pod_name+' -e NODE_CPU_PERCENTAGE=100 -e NAMESPACE='+self.namespace+' -e TOTAL_CHAOS_DURATION=120 -e NODE_CPU_CORE=100 --name='+engine+' --net=host -env-host=true -v '+self.kubeconfig+':/root/.kube/config:Z -d quay.io/redhat-chaos/krkn-hub:node-cpu-hog'
        else:
            # Unknown fault: run a harmless no-op so the caller still gets an engine name.
            cmd = 'echo'
        self.logger.debug('[KRAKEN][INJECT_FAULT] ' + cmd)
        os.system(cmd)
        return engine
|
||||||
62
utils/chaos_ai/src/qlearning.py
Normal file
62
utils/chaos_ai/src/qlearning.py
Normal file
@@ -0,0 +1,62 @@
|
|||||||
|
import logging
|
||||||
|
|
||||||
|
import numpy as np
|
||||||
|
|
||||||
|
|
||||||
|
class QLearning:
    """Tabular Q-learning over (url, HTTP-state, fault) triples."""

    def __init__(self, gamma=None, alpha=None, faults=None, states=None, rewards=None, urls=None):
        self.gamma = gamma  # Discount factor
        self.alpha = alpha  # Learning rate
        self.faults = faults    # list of 'fault:pod' action labels
        self.states = states    # map: state label (str) -> index
        self.rewards = rewards  # map: state label (str) -> reward value

        # Q[url, state, fault]: one Q-table per probed URL.
        self.Q = np.array(np.zeros([len(urls), len(states), len(faults)]))
        # State-transition value matrix, shared across URLs.
        self.state_matrix = np.array(np.zeros([len(states), len(states)]))

        self.logger = logging.getLogger()

    def update_q_fault(self, fault, episode, start_state, end_state, url_index):
        """Apply one temporal-difference update for ``fault`` observed to
        drive ``start_state`` -> ``end_state`` on the URL at ``url_index``."""
        self.logger.info('[UPDATE_Q] ' + str(url_index) + ' ' + fault + ' ' + str(start_state) + '->' + str(end_state))
        if end_state is None:
            end_state = start_state
        # NOTE(review): membership is tested on the raw value — an int status
        # code (e.g. 500) will not match the str keys and falls through to
        # 'Other'; confirm callers always pass string states.
        if end_state not in self.states:
            end_state = 'Other'

        # Reward scales inversely with episode length, so shorter fault
        # sequences that break the system score higher.
        step_reward = self.rewards[str(end_state)] / len(episode)
        row = self.states[str(start_state)]
        col = self.states[str(end_state)]
        action = self.faults.index(fault)

        best_next = self.Q[url_index, col, np.argmax(self.Q[url_index, col,])]
        td_error = step_reward + self.gamma * best_next - self.Q[url_index, row, action]
        self.Q[url_index, row, action] += self.alpha * td_error

        # Mirror the update in the state-transition matrix.
        best_next_state = self.state_matrix[col, np.argmax(self.state_matrix[col,])]
        td_state = step_reward + self.gamma * best_next_state - self.state_matrix[row, col]
        self.state_matrix[row, col] += self.alpha * td_state
|
||||||
171
utils/chaos_ai/src/swagger_api.py
Normal file
171
utils/chaos_ai/src/swagger_api.py
Normal file
@@ -0,0 +1,171 @@
|
|||||||
|
import json, os
|
||||||
|
import logging
|
||||||
|
# import numpy as np
|
||||||
|
# import pandas as pd
|
||||||
|
import threading
|
||||||
|
from datetime import datetime
|
||||||
|
from flask import Flask, request
|
||||||
|
from flasgger import Swagger
|
||||||
|
from flasgger.utils import swag_from
|
||||||
|
# import zipfile
|
||||||
|
import sys
|
||||||
|
|
||||||
|
sys.path.append("..")
|
||||||
|
from aichaos_main import AIChaos
|
||||||
|
|
||||||
|
# Flask application exposing the AI-chaos REST API; Swagger UI served by flasgger.
app = Flask(__name__)
Swagger(app)
# Working directory for per-request artifacts (uploaded kubeconfigs, logs,
# q-tables, episode files): <repo>/config/experiments/flask/
flaskdir = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "config", "experiments",
                        "flask") + '/'
|
||||||
|
|
||||||
|
|
||||||
|
class AIChaosSwagger:
    """REST facade: accepts a kubeconfig upload, runs AIChaos in a background
    thread and exposes status/result files under ``flaskdir``."""

    def __init__(self, flaskdir=''):
        # Directory where per-chaos-id artifacts (logs, q-tables, episodes) live.
        self.flaskdir = flaskdir

    @app.route("/")
    def empty(params=''):
        """Landing endpoint."""
        return "AI Chaos Repository!"

    def startchaos(self, kubeconfigfile, file_id, params):
        """Run one full AIChaos session for ``file_id`` (blocking; intended to
        be invoked from a worker thread).

        Creates an ``init-<id>`` marker while running and an ``out-<id>`` file
        when done; parameters come from the HTTP form, optionally overridden
        by /config/aichaos-config.json.
        """
        print('[StartChaos]', file_id, kubeconfigfile)
        out_dir = flaskdir  # renamed from `dir` to avoid shadowing the builtin
        outfile = ''.join([out_dir, 'out-', file_id])
        initfile = ''.join([out_dir, 'init-', file_id])
        # Touch the init marker so GetStatus reports 'Running'.
        with open(initfile, 'w'):
            pass
        if os.path.exists(outfile):
            os.remove(outfile)
        os.environ["KUBECONFIG"] = kubeconfigfile
        # Defaults, possibly overridden by the mounted config file below.
        params['command'] = 'podman'
        params['chaos_engine'] = 'kraken'
        params['faults'] = 'pod-delete'
        params['iterations'] = 1
        params['maxfaults'] = 5
        if os.path.isfile('/config/aichaos-config.json'):
            with open('/config/aichaos-config.json') as f:
                config_params = json.load(f)
            params['command'] = config_params['command']
            params['chaos_engine'] = config_params['chaos_engine']
            params['faults'] = config_params['faults']
            params['iterations'] = config_params['iterations']
            params['maxfaults'] = config_params['maxfaults']
        # Cartesian product of faults x pod labels.
        faults = [f + ':' + p for f in params['faults'].split(',') for p in params['podlabels'].split(',')]
        print('#faults:', len(faults), faults)
        states = {'200': 0, '500': 1, '502': 2, '503': 3, '404': 4, 'Timeout': 5}
        rewards = {'200': -1, '500': 0.8, '502': 0.8, '503': 0.8, '404': 1, 'Timeout': 1}
        logfile = self.flaskdir + 'log_' + str(file_id)
        qfile = self.flaskdir + 'qfile_' + str(file_id) + '.csv'
        efile = self.flaskdir + 'efile_' + str(file_id)
        epfile = self.flaskdir + 'episodes_' + str(file_id) + '.json'
        probe_url = params['probeurl']
        # Map fault name -> probe endpoint (relative path, prefixed below).
        probes = {'pod-delete': 'executeprobe', 'cpu-hog': 'wolffi/cpu_load', 'disk-fill': 'wolffi/memory_load',
                  'io_load': 'wolffi/io_load', 'http_delay': 'wolffi/http_delay', 'packet_delay': 'wolffi/packet_delay',
                  'packet_duplication': 'wolffi/packet_duplication', 'packet_loss': 'wolffi/packet_loss',
                  'packet_corruption': 'wolffi/packet_corruption',
                  'packet_reordering': 'wolffi/packet_reordering', 'network_load': 'wolffi/network_load',
                  'http_bad_request': 'wolffi/http_bad_request',
                  'http_unauthorized': 'wolffi/http_unauthorized', 'http_forbidden': 'wolffi/http_forbidden',
                  'http_not_found': 'wolffi/http_not_found',
                  'http_method_not_allowed': 'wolffi/http_method_not_allowed',
                  'http_not_acceptable': 'wolffi/http_not_acceptable',
                  'http_request_timeout': 'wolffi/http_request_timeout',
                  'http_unprocessable_entity': 'wolffi/http_unprocessable_entity',
                  'http_internal_server_error': 'wolffi/http_internal_server_error',
                  'http_not_implemented': 'wolffi/http_not_implemented',
                  'http_bad_gateway': 'wolffi/http_bad_gateway',
                  'http_service_unavailable': 'wolffi/http_service_unavailable',
                  'bandwidth_restrict': 'wolffi/bandwidth_restrict',
                  'pod_cpu_load': 'wolffi/pod_cpu_load', 'pod_memory_load': 'wolffi/pod_memory_load',
                  'pod_io_load': 'wolffi/pod_io_load',
                  'pod_network_load': 'wolffi/pod_network_load'
                  }
        dstk_probes = {k: probe_url + v for k, v in probes.items()}
        cexp = {'pod-delete': 'pod-delete.json', 'cpu-hog': 'pod-cpu-hog.json',
                'disk-fill': 'disk-fill.json', 'network-loss': 'network-loss.json',
                'network-corruption': 'network-corruption.json', 'io-stress': 'io-stress.json'}
        aichaos = AIChaos(states=states, faults=faults, rewards=rewards,
                          logfile=logfile, qfile=qfile, efile=efile, epfile=epfile,
                          urls=params['urls'].split(','), namespace=params['namespace'],
                          max_faults=params['maxfaults'],
                          num_requests=10, timeout=2,
                          chaos_engine=params['chaos_engine'], dstk_probes=dstk_probes, command=params['command'],
                          loglevel=logging.DEBUG, chaos_experiment=cexp, iterations=params['iterations'])
        aichaos.start_chaos()

        # Mark completion and clear the running marker.
        # Fixed: the output file was opened without ever being closed.
        with open(outfile, "w") as file:
            file.write('done')
        os.remove(initfile)
        return 'WRITE'
|
||||||
|
|
||||||
|
@app.route('/GenerateChaos/', methods=['POST'])
@swag_from('../config/yml/chaosGen.yml')
def chaos_gen():
    """Accept a kubeconfig upload, allocate a free numeric chaos id and start
    chaos generation in a daemon thread; returns the id immediately."""
    out_dir = flaskdir  # renamed from `dir` (shadowed the builtin)
    sw = AIChaosSwagger(flaskdir=out_dir)
    f = request.files['file']
    existing = os.listdir(out_dir)  # renamed from `list` (shadowed the builtin)
    # First unused integer becomes this run's identifier.
    for i in range(10000):
        if str(i) not in existing:
            break
    kubeconfigfile = ''.join([out_dir, str(i)])
    f.save(kubeconfigfile)
    print('HEADER:', f.headers)
    print('[GenerateChaos] reqs:', request.form.to_dict())
    print('[GenerateChaos]', f.filename, datetime.now())
    thread = threading.Thread(target=sw.startchaos, args=(kubeconfigfile, str(i), request.form.to_dict()))
    thread.daemon = True
    # Fixed: Thread.getName() is deprecated; use the .name property.
    print(thread.name)
    thread.start()
    return 'Chaos ID: ' + str(i)
|
||||||
|
|
||||||
|
@app.route('/GetStatus/<chaosid>', methods=['GET'])
@swag_from('../config/yml/status.yml')
def get_status(chaosid):
    """Report lifecycle state of a chaos run: the episodes file means the run
    completed, the init marker means it is still running."""
    print('[GetStatus]', chaosid, flaskdir)
    episodes_path = flaskdir + 'episodes_' + str(chaosid) + '.json'
    marker_path = ''.join([flaskdir, 'init-', chaosid])
    if os.path.exists(episodes_path):
        return 'Completed'
    if os.path.exists(marker_path):
        return 'Running'
    return 'Does not exist'
|
||||||
|
|
||||||
|
@app.route('/GetQTable/<chaosid>', methods=['GET'])
@swag_from('../config/yml/qtable.yml')
def get_qtable(chaosid):
    """Return the CSV Q-table for a completed run, 'Running' while in
    progress, or an error string for an unknown id."""
    print('[GetQTable]', chaosid)
    qfile = flaskdir + 'qfile_' + str(chaosid) + '.csv'
    initfile = ''.join([flaskdir, 'init-', chaosid])
    if os.path.exists(qfile):
        # Fixed: the file handle was opened and never closed; use a context manager.
        with open(qfile, "r") as f:
            return f.read()
    elif os.path.exists(initfile):
        return 'Running'
    else:
        return 'Invalid Chaos ID: ' + chaosid
|
||||||
|
|
||||||
|
@app.route('/GetEpisodes/<chaosid>', methods=['GET'])
@swag_from('../config/yml/episodes.yml')
def get_episodes(chaosid):
    """Return the JSON episode log for a completed run, 'Running' while in
    progress, or an error string for an unknown id."""
    print('[GetEpisodes]', chaosid)
    epfile = flaskdir + 'episodes_' + str(chaosid) + '.json'
    initfile = ''.join([flaskdir, 'init-', chaosid])
    if os.path.exists(epfile):
        # Fixed: the file handle was opened and never closed; use a context manager.
        with open(epfile, "r") as f:
            return f.read()
    elif os.path.exists(initfile):
        return 'Running'
    else:
        return 'Invalid Chaos ID: ' + chaosid
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == '__main__':
    # NOTE(review): debug=True and binding to 0.0.0.0 are development-only
    # settings — do not expose this server as-is in production.
    app.run(debug=True, host='0.0.0.0', port='5001')
|
||||||
83
utils/chaos_ai/src/test_application.py
Normal file
83
utils/chaos_ai/src/test_application.py
Normal file
@@ -0,0 +1,83 @@
|
|||||||
|
import json
|
||||||
|
import logging
|
||||||
|
import time
|
||||||
|
import requests
|
||||||
|
|
||||||
|
|
||||||
|
class TestApplication:
    """Probes application URLs with repeated GETs and classifies the response."""

    def __init__(self, num_requests=10, timeout=2, sleep_time=1):
        self.num_requests = num_requests  # GETs issued per probe
        self.timeout = timeout            # per-request timeout (seconds)
        self.sleep_time = sleep_time      # pause between requests (seconds)
        self.logger = logging.getLogger()

    def test_load(self, url=''):
        """Issue up to ``num_requests`` GETs against ``url``.

        Returns ('200', <code>) on the first non-200 response,
        ('200', 'Timeout') after more than three failed requests, or
        ('200', '200') when every request succeeded.
        """
        failures = 0
        total_latency = 0
        for attempt in range(self.num_requests):
            try:
                resp = requests.get(url, verify=False, timeout=self.timeout)
                total_latency += resp.elapsed.total_seconds()
                self.logger.info(
                    url + ' ' + str(attempt) + ':' + str(resp.status_code) + " {:.2f}".format(resp.elapsed.total_seconds())
                    + " {:.2f}".format(total_latency))
                if resp.status_code != 200:
                    return '200', resp.status_code
            except Exception:
                # Any request failure (timeout, refused connection, DNS, ...)
                # counts toward the timeout budget — a deliberate broadening
                # of the original Timeout-only handler.
                self.logger.info(url + ' ' + str(attempt) + ':' + 'Timeout Exception!')
                failures += 1
                if failures > 3:
                    return '200', 'Timeout'
            time.sleep(self.sleep_time)
        self.logger.info(url + "Avg: {:.2f}".format(total_latency / self.num_requests))
        return '200', '200'
|
||||||
|
|
||||||
|
# def test_load_hey(self):
|
||||||
|
# cmd = 'hey -c 2 -z 20s http://192.168.49.2:31902/api/cart/health > temp'
|
||||||
|
# os.system(cmd)
|
||||||
|
# with open('temp') as f:
|
||||||
|
# datafile = f.readlines()
|
||||||
|
# found = False
|
||||||
|
# for line in datafile:
|
||||||
|
# if 'Status code distribution:' in line:
|
||||||
|
# found = True
|
||||||
|
# if found:
|
||||||
|
# print('[test_load]', line)
|
||||||
|
# m = re.search(r"\[([A-Za-z0-9_]+)\]", line)
|
||||||
|
# if m is not None:
|
||||||
|
# resp_code = m.group(1)
|
||||||
|
# if resp_code != 200:
|
||||||
|
# return '200', resp_code
|
||||||
|
# return '200', '200'
|
||||||
|
|
||||||
|
# # End state is reached when system is down or return error code like '500','404'
|
||||||
|
# def get_next_state(self):
|
||||||
|
# self.logger.info('[GET_NEXT_STATE]')
|
||||||
|
# f = open(self.chaos_dir + self.chaos_journal)
|
||||||
|
# data = json.load(f)
|
||||||
|
#
|
||||||
|
# # before the experiment (if before steady state is false, after is null?)
|
||||||
|
# for probe in data['steady_states']['before']['probes']:
|
||||||
|
# if not probe['tolerance_met']:
|
||||||
|
# # start_state = probe['activity']['tolerance']
|
||||||
|
# # end_state = probe['status']
|
||||||
|
# start_state, end_state = None, None
|
||||||
|
# return start_state, end_state
|
||||||
|
#
|
||||||
|
# # after the experiment
|
||||||
|
# for probe in data['steady_states']['after']['probes']:
|
||||||
|
# # if probe['output']['status'] == probe['activity']['tolerance']:
|
||||||
|
# if not probe['tolerance_met']:
|
||||||
|
# # print(probe)
|
||||||
|
# start_state = probe['activity']['tolerance']
|
||||||
|
# end_state = probe['output']['status']
|
||||||
|
# # end_state = probe['status']
|
||||||
|
# return start_state, end_state
|
||||||
|
# # if tolerances for all probes are met
|
||||||
|
# start_state = probe['activity']['tolerance']
|
||||||
|
# end_state = probe['activity']['tolerance']
|
||||||
|
# return start_state, end_state
|
||||||
10
utils/chaos_ai/src/utils.py
Normal file
10
utils/chaos_ai/src/utils.py
Normal file
@@ -0,0 +1,10 @@
|
|||||||
|
import re
|
||||||
|
|
||||||
|
|
||||||
|
def get_load(fault):
    """Split a fault spec like ``'node-cpu-hog(50)'`` into
    ``('node-cpu-hog', '50')``.

    A fault with no parenthesized load returns ``(fault, 100)`` — note the
    default load is an int while an explicit load is returned as a string,
    preserving the original contract.
    """
    params = re.findall(r'\(.*?\)', fault)
    load = 100
    if len(params) > 0:
        load = params[0].strip('()')
        # Fixed: str.strip(params[0]) treated the match as a *character set*
        # and stripped those characters from both ends, which could eat
        # trailing characters of the fault name (e.g. digits shared with the
        # load); remove the exact '(...)' substring instead.
        fault = fault.replace(params[0], '', 1)
    return fault, load
|
||||||
@@ -20,6 +20,8 @@ This tool profiles an application and gathers telemetry data such as CPU, Memory
|
|||||||
$ git clone https://github.com/krkn-chaos/krkn.git
|
$ git clone https://github.com/krkn-chaos/krkn.git
|
||||||
$ cd krkn
|
$ cd krkn
|
||||||
$ pip3 install -r requirements.txt
|
$ pip3 install -r requirements.txt
|
||||||
|
Edit configuration file:
|
||||||
|
$ vi config/recommender_config.yaml
|
||||||
$ python3.9 utils/chaos_recommender/chaos_recommender.py
|
$ python3.9 utils/chaos_recommender/chaos_recommender.py
|
||||||
```
|
```
|
||||||
|
|
||||||
@@ -30,18 +32,23 @@ To run the recommender with a config file specify the config file path with the
|
|||||||
You can customize the default values by editing the `krkn/config/recommender_config.yaml` file. The configuration file contains the following options:
|
You can customize the default values by editing the `krkn/config/recommender_config.yaml` file. The configuration file contains the following options:
|
||||||
|
|
||||||
- `application`: Specify the application name.
|
- `application`: Specify the application name.
|
||||||
- `namespace`: Specify the namespace name. If you want to profile
|
- `namespaces`: Specify the namespace names (separated by comma or space). If you want to profile
|
||||||
- `labels`: Specify the labels (not used).
|
- `labels`: Specify the labels (not used).
|
||||||
- `kubeconfig`: Specify the location of the kubeconfig file (not used).
|
- `kubeconfig`: Specify the location of the kubeconfig file (not used).
|
||||||
- `prometheus_endpoint`: Specify the prometheus endpoint (must).
|
- `prometheus_endpoint`: Specify the prometheus endpoint (must).
|
||||||
- `auth_token`: Auth token to connect to prometheus endpoint (must).
|
- `auth_token`: Auth token to connect to prometheus endpoint (must).
|
||||||
- `scrape_duration`: For how long data should be fetched, e.g., '1m' (must).
|
- `scrape_duration`: For how long data should be fetched, e.g., '1m' (must).
|
||||||
- `chaos_library`: "kraken" (currently it only supports kraken).
|
- `chaos_library`: "kraken" (currently it only supports kraken).
|
||||||
|
- `json_output_file`: True or False (by default False).
|
||||||
|
- `json_output_folder_path`: Specify folder path where output should be saved. If empty the default path is used.
|
||||||
- `chaos_tests`: (for output purpose only do not change if not needed)
|
- `chaos_tests`: (for output purpose only do not change if not needed)
|
||||||
- `GENERAL`: list of general purpose tests available in Krkn
|
- `GENERAL`: list of general purpose tests available in Krkn
|
||||||
- `MEM`: list of memory related tests available in Krkn
|
- `MEM`: list of memory related tests available in Krkn
|
||||||
- `NETWORK`: list of network related tests available in Krkn
|
- `NETWORK`: list of network related tests available in Krkn
|
||||||
- `CPU`: list of cpu related tests available in Krkn
|
- `CPU`: list of cpu related tests available in Krkn
|
||||||
|
- `threshold`: Specify the threshold to use for comparison and identifying outliers
|
||||||
|
- `cpu_threshold`: Specify the cpu threshold to compare with the cpu limits set on the pods and identify outliers
|
||||||
|
- `mem_threshold`: Specify the memory threshold to compare with the memory limits set on the pods and identify outliers
|
||||||
|
|
||||||
*TIP:* to collect prometheus endpoint and token from your OpenShift cluster you can run the following commands:
|
*TIP:* to collect prometheus endpoint and token from your OpenShift cluster you can run the following commands:
|
||||||
```
|
```
|
||||||
@@ -58,8 +65,8 @@ You can also provide the input values through command-line arguments launching t
|
|||||||
-o, --options Evaluate command line options
|
-o, --options Evaluate command line options
|
||||||
-a APPLICATION, --application APPLICATION
|
-a APPLICATION, --application APPLICATION
|
||||||
Kubernetes application name
|
Kubernetes application name
|
||||||
-n NAMESPACE, --namespace NAMESPACE
|
-n NAMESPACES, --namespaces NAMESPACE
|
||||||
Kubernetes application namespace
|
Kubernetes application namespaces separated by space
|
||||||
-l LABELS, --labels LABELS
|
-l LABELS, --labels LABELS
|
||||||
Kubernetes application labels
|
Kubernetes application labels
|
||||||
-p PROMETHEUS_ENDPOINT, --prometheus-endpoint PROMETHEUS_ENDPOINT
|
-p PROMETHEUS_ENDPOINT, --prometheus-endpoint PROMETHEUS_ENDPOINT
|
||||||
@@ -74,6 +81,8 @@ You can also provide the input values through command-line arguments launching t
|
|||||||
Chaos library
|
Chaos library
|
||||||
-L LOG_LEVEL, --log-level LOG_LEVEL
|
-L LOG_LEVEL, --log-level LOG_LEVEL
|
||||||
log level (DEBUG, INFO, WARNING, ERROR, CRITICAL
|
log level (DEBUG, INFO, WARNING, ERROR, CRITICAL
|
||||||
|
-J [FOLDER_PATH], --json-output-file [FOLDER_PATH]
|
||||||
|
Create output file, the path to the folder can be specified, if not specified the default folder is used.
|
||||||
-M MEM [MEM ...], --MEM MEM [MEM ...]
|
-M MEM [MEM ...], --MEM MEM [MEM ...]
|
||||||
Memory related chaos tests (space separated list)
|
Memory related chaos tests (space separated list)
|
||||||
-C CPU [CPU ...], --CPU CPU [CPU ...]
|
-C CPU [CPU ...], --CPU CPU [CPU ...]
|
||||||
@@ -82,7 +91,12 @@ You can also provide the input values through command-line arguments launching t
|
|||||||
Network related chaos tests (space separated list)
|
Network related chaos tests (space separated list)
|
||||||
-G GENERIC [GENERIC ...], --GENERIC GENERIC [GENERIC ...]
|
-G GENERIC [GENERIC ...], --GENERIC GENERIC [GENERIC ...]
|
||||||
Generic chaos tests (space separated list)
|
Generic chaos tests (space separated list)
|
||||||
|
--threshold THRESHOLD
|
||||||
|
Threshold
|
||||||
|
--cpu_threshold CPU_THRESHOLD
|
||||||
|
CPU threshold to compare with the cpu limits
|
||||||
|
--mem_threshold MEM_THRESHOLD
|
||||||
|
Memory threshold to compare with the memory limits
|
||||||
```
|
```
|
||||||
|
|
||||||
If you provide the input values through command-line arguments, the corresponding config file inputs would be ignored.
|
If you provide the input values through command-line arguments, the corresponding config file inputs would be ignored.
|
||||||
@@ -97,7 +111,7 @@ After obtaining telemetry data, sourced either locally or from Prometheus, the t
|
|||||||
|
|
||||||
## Customizing Thresholds and Options
|
## Customizing Thresholds and Options
|
||||||
|
|
||||||
You can customize the thresholds and options used for data analysis by modifying the `krkn/kraken/chaos_recommender/analysis.py` file. For example, you can adjust the threshold for identifying outliers by changing the value of the `threshold` variable in the `identify_outliers` function.
|
You can customize the thresholds and options used for data analysis and identifying the outliers by setting the threshold, cpu_threshold and mem_threshold parameters in the config.
|
||||||
|
|
||||||
## Additional Files
|
## Additional Files
|
||||||
|
|
||||||
|
|||||||
@@ -1,7 +1,10 @@
|
|||||||
import argparse
|
import argparse
|
||||||
|
import json
|
||||||
import logging
|
import logging
|
||||||
import os.path
|
import os.path
|
||||||
|
import re
|
||||||
import sys
|
import sys
|
||||||
|
import time
|
||||||
import yaml
|
import yaml
|
||||||
# kraken module import for running the recommender
|
# kraken module import for running the recommender
|
||||||
# both from the root directory and the recommender
|
# both from the root directory and the recommender
|
||||||
@@ -9,24 +12,28 @@ import yaml
|
|||||||
sys.path.insert(0, './')
|
sys.path.insert(0, './')
|
||||||
sys.path.insert(0, '../../')
|
sys.path.insert(0, '../../')
|
||||||
|
|
||||||
|
from krkn_lib.utils import get_yaml_item_value
|
||||||
|
|
||||||
import kraken.chaos_recommender.analysis as analysis
|
import kraken.chaos_recommender.analysis as analysis
|
||||||
import kraken.chaos_recommender.prometheus as prometheus
|
import kraken.chaos_recommender.prometheus as prometheus
|
||||||
from kubernetes import config as kube_config
|
from kubernetes import config as kube_config
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def parse_arguments(parser):
|
def parse_arguments(parser):
|
||||||
|
|
||||||
# command line options
|
# command line options
|
||||||
parser.add_argument("-c", "--config-file", action="store", help="Config file path")
|
parser.add_argument("-c", "--config-file", action="store", help="Config file path")
|
||||||
parser.add_argument("-o", "--options", action="store_true", help="Evaluate command line options")
|
parser.add_argument("-o", "--options", action="store_true", help="Evaluate command line options")
|
||||||
parser.add_argument("-n", "--namespace", action="store", default="", help="Kubernetes application namespace")
|
parser.add_argument("-n", "--namespaces", action="store", default="", nargs="+", help="Kubernetes application namespaces separated by space")
|
||||||
parser.add_argument("-p", "--prometheus-endpoint", action="store", default="", help="Prometheus endpoint URI")
|
parser.add_argument("-p", "--prometheus-endpoint", action="store", default="", help="Prometheus endpoint URI")
|
||||||
parser.add_argument("-k", "--kubeconfig", action="store", default=kube_config.KUBE_CONFIG_DEFAULT_LOCATION, help="Kubeconfig path")
|
parser.add_argument("-k", "--kubeconfig", action="store", default=kube_config.KUBE_CONFIG_DEFAULT_LOCATION, help="Kubeconfig path")
|
||||||
parser.add_argument("-t", "--token", action="store", default="", help="Kubernetes authentication token")
|
parser.add_argument("-t", "--token", action="store", default="", help="Kubernetes authentication token")
|
||||||
parser.add_argument("-s", "--scrape-duration", action="store", default="10m", help="Prometheus scrape duration")
|
parser.add_argument("-s", "--scrape-duration", action="store", default="10m", help="Prometheus scrape duration")
|
||||||
parser.add_argument("-L", "--log-level", action="store", default="INFO", help="log level (DEBUG, INFO, WARNING, ERROR, CRITICAL")
|
parser.add_argument("-L", "--log-level", action="store", default="INFO", help="log level (DEBUG, INFO, WARNING, ERROR, CRITICAL")
|
||||||
|
|
||||||
|
parser.add_argument("-J", "--json-output-file", default=False, nargs="?", action="store",
|
||||||
|
help="Create output file, the path to the folder can be specified, if not specified the default folder is used")
|
||||||
|
|
||||||
parser.add_argument("-M", "--MEM", nargs='+', action="store", default=[],
|
parser.add_argument("-M", "--MEM", nargs='+', action="store", default=[],
|
||||||
help="Memory related chaos tests (space separated list)")
|
help="Memory related chaos tests (space separated list)")
|
||||||
parser.add_argument("-C", "--CPU", nargs='+', action="store", default=[],
|
parser.add_argument("-C", "--CPU", nargs='+', action="store", default=[],
|
||||||
@@ -35,10 +42,13 @@ def parse_arguments(parser):
|
|||||||
help="Network related chaos tests (space separated list)")
|
help="Network related chaos tests (space separated list)")
|
||||||
parser.add_argument("-G", "--GENERIC", nargs='+', action="store", default=[],
|
parser.add_argument("-G", "--GENERIC", nargs='+', action="store", default=[],
|
||||||
help="Memory related chaos tests (space separated list)")
|
help="Memory related chaos tests (space separated list)")
|
||||||
|
parser.add_argument("--threshold", action="store", default="", help="Threshold")
|
||||||
|
parser.add_argument("--cpu-threshold", action="store", default="", help="CPU threshold")
|
||||||
|
parser.add_argument("--mem-threshold", action="store", default="", help="Memory threshold")
|
||||||
|
|
||||||
return parser.parse_args()
|
return parser.parse_args()
|
||||||
|
|
||||||
|
|
||||||
def read_configuration(config_file_path):
|
def read_configuration(config_file_path):
|
||||||
if not os.path.exists(config_file_path):
|
if not os.path.exists(config_file_path):
|
||||||
logging.error(f"Config file not found: {config_file_path}")
|
logging.error(f"Config file not found: {config_file_path}")
|
||||||
@@ -48,15 +58,26 @@ def read_configuration(config_file_path):
|
|||||||
config = yaml.safe_load(config_file)
|
config = yaml.safe_load(config_file)
|
||||||
|
|
||||||
log_level = config.get("log level", "INFO")
|
log_level = config.get("log level", "INFO")
|
||||||
namespace = config.get("namespace", "")
|
namespaces = config.get("namespaces")
|
||||||
kubeconfig = config.get("kubeconfig", kube_config.KUBE_CONFIG_DEFAULT_LOCATION)
|
namespaces = re.split(r",+\s+|,+|\s+", namespaces)
|
||||||
|
kubeconfig = get_yaml_item_value(config, "kubeconfig", kube_config.KUBE_CONFIG_DEFAULT_LOCATION)
|
||||||
|
|
||||||
|
prometheus_endpoint = config.get("prometheus_endpoint")
|
||||||
|
auth_token = config.get("auth_token")
|
||||||
|
scrape_duration = get_yaml_item_value(config, "scrape_duration", "10m")
|
||||||
|
threshold = get_yaml_item_value(config, "threshold", ".7")
|
||||||
|
heatmap_cpu_threshold = get_yaml_item_value(config, "cpu_threshold", ".5")
|
||||||
|
heatmap_mem_threshold = get_yaml_item_value(config, "mem_threshold", ".3")
|
||||||
|
output_file = config.get("json_output_file", False)
|
||||||
|
if output_file is True:
|
||||||
|
output_path = config.get("json_output_folder_path")
|
||||||
|
else:
|
||||||
|
output_path = False
|
||||||
|
chaos_tests = config.get("chaos_tests", {})
|
||||||
|
return (namespaces, kubeconfig, prometheus_endpoint, auth_token,
|
||||||
|
scrape_duration, chaos_tests, log_level, threshold,
|
||||||
|
heatmap_cpu_threshold, heatmap_mem_threshold, output_path)
|
||||||
|
|
||||||
prometheus_endpoint = config.get("prometheus_endpoint", "")
|
|
||||||
auth_token = config.get("auth_token", "")
|
|
||||||
scrape_duration = config.get("scrape_duration", "10m")
|
|
||||||
chaos_tests = config.get("chaos_tests" , {})
|
|
||||||
return (namespace, kubeconfig, prometheus_endpoint, auth_token, scrape_duration,
|
|
||||||
chaos_tests, log_level)
|
|
||||||
|
|
||||||
def prompt_input(prompt, default_value):
|
def prompt_input(prompt, default_value):
|
||||||
user_input = input(f"{prompt} [{default_value}]: ")
|
user_input = input(f"{prompt} [{default_value}]: ")
|
||||||
@@ -64,6 +85,54 @@ def prompt_input(prompt, default_value):
|
|||||||
return user_input
|
return user_input
|
||||||
return default_value
|
return default_value
|
||||||
|
|
||||||
|
|
||||||
|
def make_json_output(inputs, namespace_data, output_path):
|
||||||
|
time_str = time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime())
|
||||||
|
|
||||||
|
data = {
|
||||||
|
"inputs": inputs,
|
||||||
|
"analysis_outputs": namespace_data
|
||||||
|
}
|
||||||
|
|
||||||
|
logging.info(f"Summary\n{json.dumps(data, indent=4)}")
|
||||||
|
|
||||||
|
if output_path is not False:
|
||||||
|
file = f"recommender_{time_str}.json"
|
||||||
|
path = f"{os.path.expanduser(output_path)}/{file}"
|
||||||
|
|
||||||
|
with open(path, "w") as json_output:
|
||||||
|
logging.info(f"Saving output file in {output_path} folder...")
|
||||||
|
json_output.write(json.dumps(data, indent=4))
|
||||||
|
logging.info(f"Recommendation output saved in {file}.")
|
||||||
|
|
||||||
|
|
||||||
|
def json_inputs(namespaces, kubeconfig, prometheus_endpoint, scrape_duration,
|
||||||
|
chaos_tests, threshold, heatmap_cpu_threshold,
|
||||||
|
heatmap_mem_threshold):
|
||||||
|
inputs = {
|
||||||
|
"namespaces": namespaces,
|
||||||
|
"kubeconfig": kubeconfig,
|
||||||
|
"prometheus_endpoint": prometheus_endpoint,
|
||||||
|
"scrape_duration": scrape_duration,
|
||||||
|
"chaos_tests": chaos_tests,
|
||||||
|
"threshold": threshold,
|
||||||
|
"heatmap_cpu_threshold": heatmap_cpu_threshold,
|
||||||
|
"heatmap_mem_threshold": heatmap_mem_threshold
|
||||||
|
}
|
||||||
|
return inputs
|
||||||
|
|
||||||
|
|
||||||
|
def json_namespace(namespace, queries, analysis_data):
|
||||||
|
data = {
|
||||||
|
"namespace": namespace,
|
||||||
|
"queries": queries,
|
||||||
|
"profiling": analysis_data[0],
|
||||||
|
"heatmap_analysis": analysis_data[1],
|
||||||
|
"recommendations": analysis_data[2]
|
||||||
|
}
|
||||||
|
return data
|
||||||
|
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
parser = argparse.ArgumentParser(description="Krkn Chaos Recommender Command-Line tool")
|
parser = argparse.ArgumentParser(description="Krkn Chaos Recommender Command-Line tool")
|
||||||
args = parse_arguments(parser)
|
args = parse_arguments(parser)
|
||||||
@@ -75,43 +144,67 @@ def main():
|
|||||||
|
|
||||||
if args.config_file is not None:
|
if args.config_file is not None:
|
||||||
(
|
(
|
||||||
namespace,
|
namespaces,
|
||||||
kubeconfig,
|
kubeconfig,
|
||||||
prometheus_endpoint,
|
prometheus_endpoint,
|
||||||
auth_token,
|
auth_token,
|
||||||
scrape_duration,
|
scrape_duration,
|
||||||
chaos_tests,
|
chaos_tests,
|
||||||
log_level
|
log_level,
|
||||||
|
threshold,
|
||||||
|
heatmap_cpu_threshold,
|
||||||
|
heatmap_mem_threshold,
|
||||||
|
output_path
|
||||||
) = read_configuration(args.config_file)
|
) = read_configuration(args.config_file)
|
||||||
|
|
||||||
if args.options:
|
if args.options:
|
||||||
namespace = args.namespace
|
namespaces = args.namespaces
|
||||||
kubeconfig = args.kubeconfig
|
kubeconfig = args.kubeconfig
|
||||||
auth_token = args.token
|
auth_token = args.token
|
||||||
scrape_duration = args.scrape_duration
|
scrape_duration = args.scrape_duration
|
||||||
log_level = args.log_level
|
log_level = args.log_level
|
||||||
prometheus_endpoint = args.prometheus_endpoint
|
prometheus_endpoint = args.prometheus_endpoint
|
||||||
|
output_path = args.json_output_file
|
||||||
chaos_tests = {"MEM": args.MEM, "GENERIC": args.GENERIC, "CPU": args.CPU, "NETWORK": args.NETWORK}
|
chaos_tests = {"MEM": args.MEM, "GENERIC": args.GENERIC, "CPU": args.CPU, "NETWORK": args.NETWORK}
|
||||||
|
threshold = args.threshold
|
||||||
|
heatmap_mem_threshold = args.mem_threshold
|
||||||
|
heatmap_cpu_threshold = args.cpu_threshold
|
||||||
|
|
||||||
if log_level not in ["DEBUG","INFO", "WARNING", "ERROR","CRITICAL"]:
|
if log_level not in ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]:
|
||||||
logging.error(f"{log_level} not a valid log level")
|
logging.error(f"{log_level} not a valid log level")
|
||||||
sys.exit(1)
|
sys.exit(1)
|
||||||
|
|
||||||
logging.basicConfig(level=log_level)
|
logging.basicConfig(level=log_level)
|
||||||
|
|
||||||
logging.info("============================INPUTS===================================")
|
if output_path is not False:
|
||||||
logging.info(f"Namespace: {namespace}")
|
if output_path is None:
|
||||||
logging.info(f"Kubeconfig: {kubeconfig}")
|
output_path = "./recommender_output"
|
||||||
logging.info(f"Prometheus endpoint: {prometheus_endpoint}")
|
logging.info(f"Path for output file not specified. "
|
||||||
logging.info(f"Scrape duration: {scrape_duration}")
|
f"Using default folder {output_path}")
|
||||||
for test in chaos_tests.keys():
|
if not os.path.exists(os.path.expanduser(output_path)):
|
||||||
logging.info(f"Chaos tests {test}: {chaos_tests[test]}")
|
logging.error(f"Folder {output_path} for output not found.")
|
||||||
logging.info("=====================================================================")
|
sys.exit(1)
|
||||||
logging.info("Starting Analysis ...")
|
|
||||||
logging.info("Fetching the Telemetry data")
|
logging.info("Loading inputs...")
|
||||||
|
inputs = json_inputs(namespaces, kubeconfig, prometheus_endpoint,
|
||||||
|
scrape_duration, chaos_tests, threshold,
|
||||||
|
heatmap_cpu_threshold, heatmap_mem_threshold)
|
||||||
|
namespaces_data = []
|
||||||
|
|
||||||
|
logging.info("Starting Analysis...")
|
||||||
|
|
||||||
|
file_path, queries = prometheus.fetch_utilization_from_prometheus(
|
||||||
|
prometheus_endpoint, auth_token, namespaces, scrape_duration)
|
||||||
|
|
||||||
|
analysis_data = analysis(file_path, namespaces, chaos_tests, threshold,
|
||||||
|
heatmap_cpu_threshold, heatmap_mem_threshold)
|
||||||
|
|
||||||
|
for namespace in namespaces:
|
||||||
|
namespace_data = json_namespace(namespace, queries[namespace],
|
||||||
|
analysis_data[namespace])
|
||||||
|
namespaces_data.append(namespace_data)
|
||||||
|
make_json_output(inputs, namespaces_data, output_path)
|
||||||
|
|
||||||
file_path = prometheus.fetch_utilization_from_prometheus(prometheus_endpoint, auth_token, namespace, scrape_duration)
|
|
||||||
analysis(file_path, chaos_tests)
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
main()
|
main()
|
||||||
|
|||||||
Reference in New Issue
Block a user