Mirror of https://github.com/krkn-chaos/krkn.git (synced 2026-02-19 20:40:33 +00:00)

Compare commits: 76 commits
| SHA1 |
|---|
| 9378cd74cd |
| 4d3491da0f |
| d6ce66160b |
| ef1a55438b |
| d8f54b83a2 |
| 4870c86515 |
| 6ae17cf678 |
| ce9f8aa050 |
| 05148317c1 |
| 5f836f294b |
| cfa1bb09a0 |
| 5ddfff5a85 |
| 7d18487228 |
| 08de42c91a |
| dc7d5bb01b |
| ea3444d375 |
| 7b660a0878 |
| 5fe0655f22 |
| 5df343c183 |
| f364e9f283 |
| 86a7427606 |
| 31266fbc3e |
| 57de3769e7 |
| 42fc8eea40 |
| 22d56e2cdc |
| a259b68221 |
| 052f83e7d9 |
| fb3bbe4e26 |
| 96ba9be4b8 |
| 58d5d1d8dc |
| 3fe22a0d8f |
| 21b89a32a7 |
| dbe3ea9718 |
| a142f6e7a4 |
| 2610a7af67 |
| f827f65132 |
| aa6cbbc11a |
| e17354e54d |
| 2dfa5cb0cd |
| 0799008cd5 |
| 2327531e46 |
| 2c14c48a63 |
| ab98e416a6 |
| 19ad2d1a3d |
| 804d7cbf58 |
| 54af2fc6ff |
| b79e526cfd |
| a5efd7d06c |
| a1b81bd382 |
| 782440c8c4 |
| 7e2755cbb7 |
| 2babb53d6e |
| 85f76e9193 |
| 8bf21392f1 |
| 606fb60811 |
| fac7c3c6fb |
| 8dd9b30030 |
| 2d99f17aaf |
| 50742a793c |
| ba6a844544 |
| 7e7a917dba |
| b9c0bb39c7 |
| 706a886151 |
| a1cf9e2c00 |
| 0f5dfcb823 |
| 1e1015e6e7 |
| c71ce31779 |
| 1298f220a6 |
| 24059fb731 |
| ab951adb78 |
| a9a7fb7e51 |
| 5a8d5b0fe1 |
| c440dc4b51 |
| b174c51ee0 |
| fec0434ce1 |
| 1067d5ec8d |
32  .github/workflows/docker-image.yml (vendored)

@@ -1,8 +1,7 @@
name: Docker Image CI
on:
push:
branches:
- main
tags: ['v[0-9].[0-9]+.[0-9]+']
pull_request:

jobs:
@@ -12,30 +11,43 @@ jobs:
- name: Check out code
uses: actions/checkout@v3
- name: Build the Docker images
if: startsWith(github.ref, 'refs/tags')
run: |
docker build --no-cache -t quay.io/krkn-chaos/krkn containers/
docker build --no-cache -t quay.io/krkn-chaos/krkn containers/ --build-arg TAG=${GITHUB_REF#refs/tags/}
docker tag quay.io/krkn-chaos/krkn quay.io/redhat-chaos/krkn
docker tag quay.io/krkn-chaos/krkn quay.io/krkn-chaos/krkn:${GITHUB_REF#refs/tags/}
docker tag quay.io/krkn-chaos/krkn quay.io/redhat-chaos/krkn:${GITHUB_REF#refs/tags/}

- name: Test Build the Docker images
if: ${{ github.event_name == 'pull_request' }}
run: |
docker build --no-cache -t quay.io/krkn-chaos/krkn containers/ --build-arg PR_NUMBER=${{ github.event.pull_request.number }}
- name: Login in quay
if: github.ref == 'refs/heads/main' && github.event_name == 'push'
if: startsWith(github.ref, 'refs/tags')
run: docker login quay.io -u ${QUAY_USER} -p ${QUAY_TOKEN}
env:
QUAY_USER: ${{ secrets.QUAY_USERNAME }}
QUAY_TOKEN: ${{ secrets.QUAY_PASSWORD }}
- name: Push the KrknChaos Docker images
if: github.ref == 'refs/heads/main' && github.event_name == 'push'
run: docker push quay.io/krkn-chaos/krkn
if: startsWith(github.ref, 'refs/tags')
run: |
docker push quay.io/krkn-chaos/krkn
docker push quay.io/krkn-chaos/krkn:${GITHUB_REF#refs/tags/}
- name: Login in to redhat-chaos quay
if: github.ref == 'refs/heads/main' && github.event_name == 'push'
if: startsWith(github.ref, 'refs/tags/v')
run: docker login quay.io -u ${QUAY_USER} -p ${QUAY_TOKEN}
env:
QUAY_USER: ${{ secrets.QUAY_USER_1 }}
QUAY_TOKEN: ${{ secrets.QUAY_TOKEN_1 }}
- name: Push the RedHat Chaos Docker images
if: github.ref == 'refs/heads/main' && github.event_name == 'push'
run: docker push quay.io/redhat-chaos/krkn
if: startsWith(github.ref, 'refs/tags')
run: |
docker push quay.io/redhat-chaos/krkn
docker push quay.io/redhat-chaos/krkn:${GITHUB_REF#refs/tags/}
- name: Rebuild krkn-hub
if: github.ref == 'refs/heads/main' && github.event_name == 'push'
if: startsWith(github.ref, 'refs/tags')
uses: redhat-chaos/actions/krkn-hub@main
with:
QUAY_USER: ${{ secrets.QUAY_USERNAME }}
QUAY_TOKEN: ${{ secrets.QUAY_PASSWORD }}
AUTOPUSH: ${{ secrets.AUTOPUSH }}
70  .github/workflows/tests.yml (vendored)

@@ -61,6 +61,8 @@ jobs:
kubectl create namespace namespace-scenario
kubectl apply -f CI/templates/time_pod.yaml
kubectl wait --for=condition=ready pod -l scenario=time-skew --timeout=300s
kubectl apply -f CI/templates/service_hijacking.yaml
kubectl wait --for=condition=ready pod -l "app.kubernetes.io/name=proxy" --timeout=300s
- name: Get Kind nodes
run: |
kubectl get nodes --show-labels=true
@@ -70,12 +72,14 @@
run: python -m coverage run -a -m unittest discover -s tests -v

- name: Setup Pull Request Functional Tests
if: github.event_name == 'pull_request'
if: |
github.event_name == 'pull_request'
run: |
yq -i '.kraken.port="8081"' CI/config/common_test_config.yaml
yq -i '.kraken.signal_address="0.0.0.0"' CI/config/common_test_config.yaml
yq -i '.kraken.performance_monitoring="localhost:9090"' CI/config/common_test_config.yaml
echo "test_app_outages" > ./CI/tests/functional_tests
echo "test_service_hijacking" > ./CI/tests/functional_tests
echo "test_app_outages" >> ./CI/tests/functional_tests
echo "test_container" >> ./CI/tests/functional_tests
echo "test_namespace" >> ./CI/tests/functional_tests
echo "test_net_chaos" >> ./CI/tests/functional_tests
@@ -84,7 +88,9 @@
echo "test_arca_memory_hog" >> ./CI/tests/functional_tests
echo "test_arca_io_hog" >> ./CI/tests/functional_tests

# Push on main only steps

# Push on main only steps + all other functional to collect coverage
# for the badge
- name: Configure AWS Credentials
if: github.ref == 'refs/heads/main' && github.event_name == 'push'
uses: aws-actions/configure-aws-credentials@v4
@@ -101,6 +107,15 @@
yq -i '.telemetry.username="${{secrets.TELEMETRY_USERNAME}}"' CI/config/common_test_config.yaml
yq -i '.telemetry.password="${{secrets.TELEMETRY_PASSWORD}}"' CI/config/common_test_config.yaml
echo "test_telemetry" > ./CI/tests/functional_tests
echo "test_service_hijacking" >> ./CI/tests/functional_tests
echo "test_app_outages" >> ./CI/tests/functional_tests
echo "test_container" >> ./CI/tests/functional_tests
echo "test_namespace" >> ./CI/tests/functional_tests
echo "test_net_chaos" >> ./CI/tests/functional_tests
echo "test_time" >> ./CI/tests/functional_tests
echo "test_arca_cpu_hog" >> ./CI/tests/functional_tests
echo "test_arca_memory_hog" >> ./CI/tests/functional_tests
echo "test_arca_io_hog" >> ./CI/tests/functional_tests

# Final common steps
- name: Run Functional tests
@@ -119,6 +134,7 @@
- name: Collect coverage report
run: |
python -m coverage html
python -m coverage json
- name: Publish coverage report to job summary
run: |
pip install html2text
@@ -129,6 +145,54 @@
name: coverage
path: htmlcov
if-no-files-found: error
- name: Upload json coverage
uses: actions/upload-artifact@v3
with:
name: coverage.json
path: coverage.json
if-no-files-found: error
- name: Check CI results
run: grep Fail CI/results.markdown && false || true
badge:
permissions:
contents: write
name: Generate Coverage Badge
runs-on: ubuntu-latest
needs:
- tests
if: github.ref == 'refs/heads/main' && github.event_name == 'push'
steps:
- name: Check out doc repo
uses: actions/checkout@master
with:
repository: krkn-chaos/krkn-lib-docs
path: krkn-lib-docs
ssh-key: ${{ secrets.KRKN_LIB_DOCS_PRIV_KEY }}
- name: Download json coverage
uses: actions/download-artifact@v3
with:
name: coverage.json
- name: Set up Python
uses: actions/setup-python@v4
with:
python-version: 3.9
- name: Copy badge on GitHub Page Repo
env:
COLOR: yellow
run: |
# generate coverage badge on previously calculated total coverage
# and copy in the docs page
export TOTAL=$(python -c "import json;print(json.load(open('coverage.json'))['totals']['percent_covered_display'])")
[[ $TOTAL > 40 ]] && COLOR=green
echo "TOTAL: $TOTAL"
echo "COLOR: $COLOR"
curl "https://img.shields.io/badge/coverage-$TOTAL%25-$COLOR" > ./krkn-lib-docs/coverage_badge_krkn.svg
- name: Push updated Coverage Badge
run: |
cd krkn-lib-docs
git add .
git config user.name "krkn-chaos"
git config user.email "<>"
git commit -m "[KRKN] Coverage Badge ${GITHUB_REF##*/}" || echo "no changes to commit"
git push
1  .gitignore (vendored)

@@ -16,6 +16,7 @@ __pycache__/*
*.out
kube-burner*
kube_burner*
recommender_*.json

# Project files
.ropeproject
@@ -29,7 +29,7 @@ tunings:
daemon_mode: False # Iterations are set to infinity which means that the kraken will cause chaos forever.
telemetry:
enabled: False # enable/disables the telemetry collection feature
api_url: https://ulnmf9xv7j.execute-api.us-west-2.amazonaws.com/production #telemetry service endpoint
api_url: https://yvnn4rfoi7.execute-api.us-west-2.amazonaws.com/test #telemetry service endpoint
username: $TELEMETRY_USERNAME # telemetry service username
password: $TELEMETRY_PASSWORD # telemetry service password
prometheus_namespace: 'prometheus-k8s' # prometheus namespace
@@ -49,3 +49,4 @@ telemetry:
- "(\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}\\.\\d+Z).+" # 2023-09-15T11:20:36.123425532Z log
oc_cli_path: /usr/bin/oc # optional, if not specified will be search in $PATH
events_backup: True # enables/disables cluster events collection
telemetry_group: "funtests"
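The $TELEMETRY_USERNAME and $TELEMETRY_PASSWORD placeholders above are resolved with envsubst by the CI scripts later in this compare; a minimal sketch of that pattern follows, with placeholder credentials rather than real values:

```
# Render a runnable config from the template, substituting the telemetry credentials.
export TELEMETRY_USERNAME=ci-user        # placeholder value
export TELEMETRY_PASSWORD=ci-password    # placeholder value
envsubst < CI/config/common_test_config.yaml > CI/config/telemetry.yaml
```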
29  CI/templates/service_hijacking.yaml (Normal file)

@@ -0,0 +1,29 @@
apiVersion: v1
kind: Pod
metadata:
name: nginx
labels:
app.kubernetes.io/name: proxy
spec:
containers:
- name: nginx
image: nginx:stable
ports:
- containerPort: 80
name: http-web-svc

---
apiVersion: v1
kind: Service
metadata:
name: nginx-service
spec:
selector:
app.kubernetes.io/name: proxy
type: NodePort
ports:
- name: name-of-service-port
protocol: TCP
port: 80
targetPort: http-web-svc
nodePort: 30036
@@ -1,15 +1,23 @@
ERRORED=false

function finish {
if [ $? -eq 1 ] && [ $ERRORED != "true" ]
if [ $? != 0 ] && [ $ERRORED != "true" ]
then
error
fi
}

function error {
echo "Error caught."
ERRORED=true
exit_code=$?
if [ $exit_code == 1 ]
then
echo "Error caught."
ERRORED=true
elif [ $exit_code == 2 ]
then
echo "Run with exit code 2 detected, it is expected, wrapping the exit code with 0 to avoid pipeline failure"
exit 0
fi
}

function get_node {
@@ -8,11 +8,11 @@ trap finish EXIT
pod_file="CI/scenarios/hello_pod.yaml"

function functional_test_container_crash {
yq -i '.scenarios[0].namespace="default"' scenarios/openshift/app_outage.yaml
yq -i '.scenarios[0].label_selector="scenario=container"' scenarios/openshift/app_outage.yaml
yq -i '.scenarios[0].container_name="fedtools"' scenarios/openshift/app_outage.yaml
yq -i '.scenarios[0].namespace="default"' scenarios/openshift/container_etcd.yml
yq -i '.scenarios[0].label_selector="scenario=container"' scenarios/openshift/container_etcd.yml
yq -i '.scenarios[0].container_name="fedtools"' scenarios/openshift/container_etcd.yml
export scenario_type="container_scenarios"
export scenario_file="- scenarios/openshift/app_outage.yaml"
export scenario_file="- scenarios/openshift/container_etcd.yml"
export post_config=""
envsubst < CI/config/common_test_config.yaml > CI/config/container_config.yaml
107  CI/tests/test_service_hijacking.sh (Normal file)

@@ -0,0 +1,107 @@
set -xeEo pipefail

source CI/tests/common.sh

trap error ERR
trap finish EXIT
# port mapping has been configured in kind-config.yml
SERVICE_URL=http://localhost:8888
PAYLOAD_GET_1="{ \
\"status\":\"internal server error\" \
}"
STATUS_CODE_GET_1=500

PAYLOAD_PATCH_1="resource patched"
STATUS_CODE_PATCH_1=201

PAYLOAD_POST_1="{ \
\"status\": \"unauthorized\" \
}"
STATUS_CODE_POST_1=401

PAYLOAD_GET_2="{ \
\"status\":\"resource created\" \
}"
STATUS_CODE_GET_2=201

PAYLOAD_PATCH_2="bad request"
STATUS_CODE_PATCH_2=400

PAYLOAD_POST_2="not found"
STATUS_CODE_POST_2=404

JSON_MIME="application/json"
TEXT_MIME="text/plain; charset=utf-8"

function functional_test_service_hijacking {

export scenario_type="service_hijacking"
export scenario_file="scenarios/kube/service_hijacking.yaml"
export post_config=""
envsubst < CI/config/common_test_config.yaml > CI/config/service_hijacking.yaml
python3 -m coverage run -a run_kraken.py -c CI/config/service_hijacking.yaml > /dev/null 2>&1 &
PID=$!
#Waiting the hijacking to have effect
while [ `curl -X GET -s -o /dev/null -I -w "%{http_code}" $SERVICE_URL/list/index.php` == 404 ]; do echo "waiting scenario to kick in."; sleep 1; done;

#Checking Step 1 GET on /list/index.php
OUT_GET="`curl -X GET -s $SERVICE_URL/list/index.php`"
OUT_CONTENT=`curl -X GET -s -o /dev/null -I -w "%{content_type}" $SERVICE_URL/list/index.php`
OUT_STATUS_CODE=`curl -X GET -s -o /dev/null -I -w "%{http_code}" $SERVICE_URL/list/index.php`
[ "${PAYLOAD_GET_1//[$'\t\r\n ']}" == "${OUT_GET//[$'\t\r\n ']}" ] && echo "Step 1 GET Payload OK" || (echo "Payload did not match. Test failed." && exit 1)
[ "$OUT_STATUS_CODE" == "$STATUS_CODE_GET_1" ] && echo "Step 1 GET Status Code OK" || (echo " Step 1 GET status code did not match. Test failed." && exit 1)
[ "$OUT_CONTENT" == "$JSON_MIME" ] && echo "Step 1 GET MIME OK" || (echo " Step 1 GET MIME did not match. Test failed." && exit 1)

#Checking Step 1 POST on /list/index.php
OUT_POST="`curl -s -X POST $SERVICE_URL/list/index.php`"
OUT_STATUS_CODE=`curl -X POST -s -o /dev/null -I -w "%{http_code}" $SERVICE_URL/list/index.php`
OUT_CONTENT=`curl -X POST -s -o /dev/null -I -w "%{content_type}" $SERVICE_URL/list/index.php`
[ "${PAYLOAD_POST_1//[$'\t\r\n ']}" == "${OUT_POST//[$'\t\r\n ']}" ] && echo "Step 1 POST Payload OK" || (echo "Payload did not match. Test failed." && exit 1)
[ "$OUT_STATUS_CODE" == "$STATUS_CODE_POST_1" ] && echo "Step 1 POST Status Code OK" || (echo "Step 1 POST status code did not match. Test failed." && exit 1)
[ "$OUT_CONTENT" == "$JSON_MIME" ] && echo "Step 1 POST MIME OK" || (echo " Step 1 POST MIME did not match. Test failed." && exit 1)

#Checking Step 1 PATCH on /patch
OUT_PATCH="`curl -s -X PATCH $SERVICE_URL/patch`"
OUT_STATUS_CODE=`curl -X PATCH -s -o /dev/null -I -w "%{http_code}" $SERVICE_URL/patch`
OUT_CONTENT=`curl -X PATCH -s -o /dev/null -I -w "%{content_type}" $SERVICE_URL/patch`
[ "${PAYLOAD_PATCH_1//[$'\t\r\n ']}" == "${OUT_PATCH//[$'\t\r\n ']}" ] && echo "Step 1 PATCH Payload OK" || (echo "Payload did not match. Test failed." && exit 1)
[ "$OUT_STATUS_CODE" == "$STATUS_CODE_PATCH_1" ] && echo "Step 1 PATCH Status Code OK" || (echo "Step 1 PATCH status code did not match. Test failed." && exit 1)
[ "$OUT_CONTENT" == "$TEXT_MIME" ] && echo "Step 1 PATCH MIME OK" || (echo " Step 1 PATCH MIME did not match. Test failed." && exit 1)
# wait for the next step
sleep 16

#Checking Step 2 GET on /list/index.php
OUT_GET="`curl -X GET -s $SERVICE_URL/list/index.php`"
OUT_CONTENT=`curl -X GET -s -o /dev/null -I -w "%{content_type}" $SERVICE_URL/list/index.php`
OUT_STATUS_CODE=`curl -X GET -s -o /dev/null -I -w "%{http_code}" $SERVICE_URL/list/index.php`
[ "${PAYLOAD_GET_2//[$'\t\r\n ']}" == "${OUT_GET//[$'\t\r\n ']}" ] && echo "Step 2 GET Payload OK" || (echo "Step 2 GET Payload did not match. Test failed." && exit 1)
[ "$OUT_STATUS_CODE" == "$STATUS_CODE_GET_2" ] && echo "Step 2 GET Status Code OK" || (echo "Step 2 GET status code did not match. Test failed." && exit 1)
[ "$OUT_CONTENT" == "$JSON_MIME" ] && echo "Step 2 GET MIME OK" || (echo " Step 2 GET MIME did not match. Test failed." && exit 1)

#Checking Step 2 POST on /list/index.php
OUT_POST="`curl -s -X POST $SERVICE_URL/list/index.php`"
OUT_CONTENT=`curl -X POST -s -o /dev/null -I -w "%{content_type}" $SERVICE_URL/list/index.php`
OUT_STATUS_CODE=`curl -X POST -s -o /dev/null -I -w "%{http_code}" $SERVICE_URL/list/index.php`
[ "${PAYLOAD_POST_2//[$'\t\r\n ']}" == "${OUT_POST//[$'\t\r\n ']}" ] && echo "Step 2 POST Payload OK" || (echo "Step 2 POST Payload did not match. Test failed." && exit 1)
[ "$OUT_STATUS_CODE" == "$STATUS_CODE_POST_2" ] && echo "Step 2 POST Status Code OK" || (echo "Step 2 POST status code did not match. Test failed." && exit 1)
[ "$OUT_CONTENT" == "$TEXT_MIME" ] && echo "Step 2 POST MIME OK" || (echo " Step 2 POST MIME did not match. Test failed." && exit 1)

#Checking Step 2 PATCH on /patch
OUT_PATCH="`curl -s -X PATCH $SERVICE_URL/patch`"
OUT_CONTENT=`curl -X PATCH -s -o /dev/null -I -w "%{content_type}" $SERVICE_URL/patch`
OUT_STATUS_CODE=`curl -X PATCH -s -o /dev/null -I -w "%{http_code}" $SERVICE_URL/patch`
[ "${PAYLOAD_PATCH_2//[$'\t\r\n ']}" == "${OUT_PATCH//[$'\t\r\n ']}" ] && echo "Step 2 PATCH Payload OK" || (echo "Step 2 PATCH Payload did not match. Test failed." && exit 1)
[ "$OUT_STATUS_CODE" == "$STATUS_CODE_PATCH_2" ] && echo "Step 2 PATCH Status Code OK" || (echo "Step 2 PATCH status code did not match. Test failed." && exit 1)
[ "$OUT_CONTENT" == "$TEXT_MIME" ] && echo "Step 2 PATCH MIME OK" || (echo " Step 2 PATCH MIME did not match. Test failed." && exit 1)
wait $PID

# now checking if service has been restore correctly and nginx responds correctly
curl -s $SERVICE_URL | grep nginx! && echo "BODY: Service restored!" || (echo "BODY: failed to restore service" && exit 1)
OUT_STATUS_CODE=`curl -X GET -s -o /dev/null -I -w "%{http_code}" $SERVICE_URL`
[ "$OUT_STATUS_CODE" == "200" ] && echo "STATUS_CODE: Service restored!" || (echo "STATUS_CODE: failed to restore service" && exit 1)

echo "Service Hijacking Chaos test: Success"
}

functional_test_service_hijacking
@@ -14,18 +14,22 @@ function functional_test_telemetry {
export RUN_TAG="funtest-telemetry"
yq -i '.telemetry.enabled=True' CI/config/common_test_config.yaml
yq -i '.telemetry.full_prometheus_backup=True' CI/config/common_test_config.yaml
yq -i '.performance_monitoring.check_critical_alerts=True' CI/config/common_test_config.yaml
yq -i '.performance_monitoring.prometheus_url="http://localhost:9090"' CI/config/common_test_config.yaml
yq -i '.telemetry.run_tag=env(RUN_TAG)' CI/config/common_test_config.yaml

export scenario_type="arcaflow_scenarios"
export scenario_file="scenarios/arcaflow/cpu-hog/input.yaml"
export post_config=""
envsubst < CI/config/common_test_config.yaml > CI/config/telemetry.yaml
python3 -m coverage run -a run_kraken.py -c CI/config/telemetry.yaml
RUN_FOLDER=`cat CI/out/test_telemetry.out | grep amazonaws.com | sed -rn "s#.*https:\/\/.*\/download/(.*)#\1#p"`
retval=$(python3 -m coverage run -a run_kraken.py -c CI/config/telemetry.yaml)
RUN_FOLDER=`cat CI/out/test_telemetry.out | grep amazonaws.com | sed -rn "s#.*https:\/\/.*\/files/(.*)#\1#p"`
$AWS_CLI s3 ls "s3://$AWS_BUCKET/$RUN_FOLDER/" | awk '{ print $4 }' > s3_remote_files
echo "checking if telemetry files are uploaded on s3"
cat s3_remote_files | grep events-00.json || ( echo "FAILED: events-00.json not uploaded" && exit 1 )
cat s3_remote_files | grep prometheus-00.tar || ( echo "FAILED: prometheus backup not uploaded" && exit 1 )
cat s3_remote_files | grep telemetry.json || ( echo "FAILED: telemetry.json not uploaded" && exit 1 )
cat s3_remote_files | grep critical-alerts-00.log || ( echo "FAILED: critical-alerts-00.log not uploaded" && exit 1 )
cat s3_remote_files | grep prometheus-00.tar || ( echo "FAILED: prometheus backup not uploaded" && exit 1 )
cat s3_remote_files | grep telemetry.json || ( echo "FAILED: telemetry.json not uploaded" && exit 1 )
echo "all files uploaded!"
echo "Telemetry Collection: Success"
}
16  README.md

@@ -1,6 +1,7 @@
# Krkn aka Kraken
[](https://quay.io/repository/krkn-chaos/krkn?tab=tags&tag=latest)




@@ -40,18 +41,6 @@ After installation, refer back to the below sections for supported scenarios and
#### Running Kraken with minimal configuration tweaks
For cases where you want to run Kraken with minimal configuration changes, refer to [krkn-hub](https://github.com/krkn-chaos/krkn-hub). One use case is CI integration where you do not want to carry around different configuration files for the scenarios.

### Setting up infrastructure dependencies
Kraken indexes the metrics specified in the profile into Elasticsearch in addition to leveraging Cerberus for understanding the health of the Kubernetes cluster under test. More information on the features is documented below. The infrastructure pieces can be easily installed and uninstalled by running:

```
$ cd kraken
$ podman-compose up or $ docker-compose up # Spins up the containers specified in the docker-compose.yml file present in the run directory.
$ podman-compose down or $ docker-compose down # Delete the containers installed.
```
This will manage the Cerberus and Elasticsearch containers on the host on which you are running Kraken.

**NOTE**: Make sure you have enough resources (memory and disk) on the machine on top of which the containers are running as Elasticsearch is resource intensive. Cerberus monitors the system components by default, the [config](config/cerberus.yaml) can be tweaked to add applications namespaces, routes and other components to monitor as well. The command will keep running until killed since detached mode is not supported as of now.

### Config
Instructions on how to setup the config and the options supported can be found at [Config](docs/config.md).
@@ -74,6 +63,7 @@ Scenario type | Kubernetes
[PVC scenario](docs/pvc_scenario.md) | :heavy_check_mark: |
[Network_Chaos](docs/network_chaos.md) | :heavy_check_mark: |
[ManagedCluster Scenarios](docs/managedcluster_scenarios.md) | :heavy_check_mark: |
[Service Hijacking Scenarios](docs/service_hijacking_scenarios.md) | :heavy_check_mark: |

### Kraken scenario pass/fail criteria and report
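Illustrative context for the new table row, not part of the README hunk above: once the service_hijacking scenario is wired into config/config.yaml (see the config hunk later in this compare), a run goes through the standard entry point. A minimal sketch, assuming that config is in place:

```
# Assumed invocation: run krkn against a config whose scenario list includes
# the new service_hijacking entry added elsewhere in this compare.
python3.9 run_kraken.py --config=config/config.yaml
```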
@@ -1,5 +1,5 @@
kraken:
distribution: openshift # Distribution can be kubernetes or openshift
distribution: kubernetes # Distribution can be kubernetes or openshift
kubeconfig_path: ~/.kube/config # Path to kubeconfig
exit_on_failure: False # Exit when a post action scenario fails
publish_kraken_status: True # Can be accessed at http://0.0.0.0:8081
@@ -15,7 +15,7 @@ kraken:
- application_outages:
- scenarios/openshift/app_outage.yaml
- container_scenarios: # List of chaos pod scenarios to load
- - scenarios/openshift/container_etcd.yml
- - scenarios/openshift/container_etcd.yml
- plugin_scenarios:
- scenarios/openshift/etcd.yml
- scenarios/openshift/regex_openshift_pod_kill.yml
@@ -23,7 +23,7 @@ kraken:
- scenarios/openshift/network_chaos_ingress.yml
- scenarios/openshift/prom_kill.yml
- node_scenarios: # List of chaos node scenarios to load
- scenarios/openshift/node_scenarios_example.yml
- scenarios/openshift/node_scenarios_example.yml
- plugin_scenarios:
- scenarios/openshift/openshift-apiserver.yml
- scenarios/openshift/openshift-kube-apiserver.yml
@@ -42,6 +42,8 @@ kraken:
- scenarios/openshift/pvc_scenario.yaml
- network_chaos:
- scenarios/openshift/network_chaos.yaml
- service_hijacking:
- scenarios/kube/service_hijacking.yaml

cerberus:
cerberus_enabled: False # Enable it when cerberus is previously installed
@@ -51,7 +53,7 @@ cerberus:
performance_monitoring:
deploy_dashboards: False # Install a mutable grafana and load the performance dashboards. Enable this only when running on OpenShift
repo: "https://github.com/cloud-bulldozer/performance-dashboards.git"
prometheus_url: # The prometheus url/route is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes.
prometheus_url: # The prometheus url/route is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes.
prometheus_bearer_token: # The bearer token is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. This is needed to authenticate with prometheus.
uuid: # uuid for the run is generated by default if not set
enable_alerts: False # Runs the queries specified in the alert profile and displays the info or exits 1 when severity=error
@@ -65,14 +67,19 @@ telemetry:
enabled: False # enable/disables the telemetry collection feature
api_url: https://ulnmf9xv7j.execute-api.us-west-2.amazonaws.com/production #telemetry service endpoint
username: username # telemetry service username
password: password # telemetry service password
password: password # telemetry service password
prometheus_backup: True # enables/disables prometheus data collection
prometheus_namespace: "" # namespace where prometheus is deployed (if distribution is kubernetes)
prometheus_container_name: "" # name of the prometheus container name (if distribution is kubernetes)
prometheus_pod_name: "" # name of the prometheus pod (if distribution is kubernetes)
full_prometheus_backup: False # if is set to False only the /prometheus/wal folder will be downloaded.
backup_threads: 5 # number of telemetry download/upload threads
archive_path: /tmp # local path where the archive files will be temporarly stored
max_retries: 0 # maximum number of upload retries (if 0 will retry forever)
run_tag: '' # if set, this will be appended to the run folder in the bucket (useful to group the runs)
archive_size: 500000 # the size of the prometheus data archive size in KB. The lower the size of archive is
archive_size: 500000
telemetry_group: '' # if set will archive the telemetry in the S3 bucket on a folder named after the value, otherwise will use "default"
# the size of the prometheus data archive size in KB. The lower the size of archive is
# the higher the number of archive files will be produced and uploaded (and processed by backup_threads
# simultaneously).
# For unstable/slow connection is better to keep this value low
@@ -85,6 +92,9 @@ telemetry:
- "(\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}\\.\\d+Z).+" # 2023-09-15T11:20:36.123425532Z log
oc_cli_path: /usr/bin/oc # optional, if not specified will be search in $PATH
events_backup: True # enables/disables cluster events collection
elastic:
elastic_url: "" # To track results in elasticsearch, give url to server here; will post telemetry details when url and index not blank
elastic_index: "" # Elastic search index pattern to post results to
@@ -77,3 +77,8 @@ telemetry:
- "kinit (\\d+/\\d+/\\d+\\s\\d{2}:\\d{2}:\\d{2})\\s+" # kinit 2023/09/15 11:20:36 log
- "(\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}\\.\\d+Z).+" # 2023-09-15T11:20:36.123425532Z log
oc_cli_path: /usr/bin/oc # optional, if not specified will be search in $PATH
elastic:
elastic_url: "" # To track results in elasticsearch, give url to server here; will post telemetry details when url and index not blank
elastic_index: "" # Elastic search index pattern to post results to
@@ -1,5 +1,5 @@
application: openshift-etcd
namespace: openshift-etcd
namespaces: openshift-etcd
labels: app=openshift-etcd
kubeconfig: ~/.kube/config.yaml
prometheus_endpoint: <Prometheus_Endpoint>
@@ -7,6 +7,8 @@ auth_token: <Auth_Token>
scrape_duration: 10m
chaos_library: "kraken"
log_level: INFO
json_output_file: False
json_output_folder_path:

# for output purpose only do not change if not needed
chaos_tests:
@@ -26,4 +28,8 @@ chaos_tests:
- pod_network_chaos
MEM:
- node_memory_hog
- pvc_disk_fill
- pvc_disk_fill

threshold: .7
cpu_threshold: .5
mem_threshold: .5
@@ -1,28 +1,57 @@
# Dockerfile for kraken

FROM mcr.microsoft.com/azure-cli:latest as azure-cli

FROM registry.access.redhat.com/ubi8/ubi:latest

ENV KUBECONFIG /root/.kube/config

# Copy azure client binary from azure-cli image
COPY --from=azure-cli /usr/local/bin/az /usr/bin/az

# Install dependencies
RUN yum install -y git python39 python3-pip jq gettext wget && \
python3.9 -m pip install -U pip && \
git clone https://github.com/krkn-chaos/krkn.git --branch v1.5.7 /root/kraken && \
mkdir -p /root/.kube && cd /root/kraken && \
pip3.9 install -r requirements.txt && \
pip3.9 install virtualenv && \
wget https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64 -O /usr/bin/yq && chmod +x /usr/bin/yq

# Get Kubernetes and OpenShift clients from stable releases
# oc build
FROM golang:1.22.4 AS oc-build
RUN apt-get update && apt-get install -y --no-install-recommends libkrb5-dev
WORKDIR /tmp
RUN wget https://mirror.openshift.com/pub/openshift-v4/clients/ocp/stable/openshift-client-linux.tar.gz && tar -xvf openshift-client-linux.tar.gz && cp oc /usr/local/bin/oc && cp kubectl /usr/local/bin/kubectl
RUN git clone --branch release-4.18 https://github.com/openshift/oc.git
WORKDIR /tmp/oc
RUN go mod edit -go 1.22.3 &&\
go get github.com/moby/buildkit@v0.12.5 &&\
go get github.com/containerd/containerd@v1.7.11&&\
go get github.com/docker/docker@v25.0.5&&\
go mod tidy && go mod vendor
RUN make GO_REQUIRED_MIN_VERSION:= oc

WORKDIR /root/kraken
FROM fedora:40
ARG PR_NUMBER
ARG TAG
RUN groupadd -g 1001 krkn && useradd -m -u 1001 -g krkn krkn
RUN dnf update -y

# krkn version that will be built
ENV KRKN_VERSION v1.6.2

ENV KUBECONFIG /home/krkn/.kube/config

# install kubectl
RUN curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" &&\
cp kubectl /usr/local/bin/kubectl && chmod +x /usr/local/bin/kubectl &&\
cp kubectl /usr/bin/kubectl && chmod +x /usr/bin/kubectl

# This overwrites any existing configuration in /etc/yum.repos.d/kubernetes.repo
RUN dnf update && dnf install -y --setopt=install_weak_deps=False \
git python39 jq yq gettext wget which &&\
dnf clean all

# copy oc client binary from oc-build image
COPY --from=oc-build /tmp/oc/oc /usr/bin/oc

# krkn build
RUN git clone https://github.com/krkn-chaos/krkn.git /home/krkn/kraken && \
mkdir -p /home/krkn/.kube

WORKDIR /home/krkn/kraken

# default behaviour will be to build main
# if it is a PR trigger the PR itself will be checked out
RUN if [ -n "$PR_NUMBER" ]; then git fetch origin pull/${PR_NUMBER}/head:pr-${PR_NUMBER} && git checkout pr-${PR_NUMBER};fi
# if it is a TAG trigger checkout the tag
RUN if [ -n "$TAG" ]; then git checkout "$TAG";fi

RUN python3.9 -m ensurepip
RUN pip3.9 install -r requirements.txt
RUN pip3.9 install jsonschema

RUN chown -R krkn:krkn /home/krkn && chmod 755 /home/krkn
USER krkn
ENTRYPOINT ["python3.9", "run_kraken.py"]
CMD ["--config=config/config.yaml"]
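A local build sketch exercising the PR_NUMBER/TAG build arguments described in the comments above; the image tags and the PR number 123 are illustrative values, not taken from this changeset:

```
# Build from a pull request head, matching the PR_NUMBER checkout logic above.
docker build -t quay.io/krkn-chaos/krkn:pr-123 containers/ --build-arg PR_NUMBER=123

# Build a tagged release, checking out the given tag inside the image.
docker build -t quay.io/krkn-chaos/krkn:v1.6.2 containers/ --build-arg TAG=v1.6.2
```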
@@ -2,19 +2,14 @@

FROM ppc64le/centos:8

FROM mcr.microsoft.com/azure-cli:latest as azure-cli

LABEL org.opencontainers.image.authors="Red Hat OpenShift Chaos Engineering"

ENV KUBECONFIG /root/.kube/config

# Copy azure client binary from azure-cli image
COPY --from=azure-cli /usr/local/bin/az /usr/bin/az

# Install dependencies
RUN yum install -y git python39 python3-pip jq gettext wget && \
python3.9 -m pip install -U pip && \
git clone https://github.com/redhat-chaos/krkn.git --branch v1.5.7 /root/kraken && \
git clone https://github.com/redhat-chaos/krkn.git --branch v1.5.14 /root/kraken && \
mkdir -p /root/.kube && cd /root/kraken && \
pip3.9 install -r requirements.txt && \
pip3.9 install virtualenv && \
@@ -22,7 +17,7 @@ RUN yum install -y git python39 python3-pip jq gettext wget && \

# Get Kubernetes and OpenShift clients from stable releases
WORKDIR /tmp
RUN wget https://mirror.openshift.com/pub/openshift-v4/clients/ocp/stable/openshift-client-linux.tar.gz && tar -xvf openshift-client-linux.tar.gz && cp oc /usr/local/bin/oc && cp kubectl /usr/local/bin/kubectl
RUN wget https://mirror.openshift.com/pub/openshift-v4/clients/ocp/stable/openshift-client-linux.tar.gz && tar -xvf openshift-client-linux.tar.gz && cp oc /usr/local/bin/oc && cp oc /usr/bin/oc && cp kubectl /usr/local/bin/kubectl && cp kubectl /usr/bin/kubectl

WORKDIR /root/kraken
@@ -1,31 +0,0 @@
version: "3"
services:
elastic:
image: docker.elastic.co/elasticsearch/elasticsearch:7.13.2
deploy:
replicas: 1
restart_policy:
condition: on-failure
network_mode: host
environment:
discovery.type: single-node
kibana:
image: docker.elastic.co/kibana/kibana:7.13.2
deploy:
replicas: 1
restart_policy:
condition: on-failure
network_mode: host
environment:
ELASTICSEARCH_HOSTS: "http://0.0.0.0:9200"
cerberus:
image: quay.io/openshift-scale/cerberus:latest
privileged: true
deploy:
replicas: 1
restart_policy:
condition: on-failure
network_mode: host
volumes:
- ./config/cerberus.yaml:/root/cerberus/config/config.yaml:Z # Modify the config in case of the need to monitor additional components
- ${HOME}/.kube/config:/root/.kube/config:Z
@@ -27,14 +27,12 @@ After creating the service account you will need to enable the account using the

## Azure

**NOTE**: For Azure node killing scenarios, make sure [Azure CLI](https://docs.microsoft.com/en-us/cli/azure/install-azure-cli?view=azure-cli-latest) is installed.

You will also need to create a service principal and give it the correct access, see [here](https://docs.openshift.com/container-platform/4.5/installing/installing_azure/installing-azure-account.html) for creating the service principal and setting the proper permissions.
**NOTE**: You will need to create a service principal and give it the correct access, see [here](https://docs.openshift.com/container-platform/4.5/installing/installing_azure/installing-azure-account.html) for creating the service principal and setting the proper permissions.

To properly run the service principal requires “Azure Active Directory Graph/Application.ReadWrite.OwnedBy” api permission granted and “User Access Administrator”.

Before running you will need to set the following:
1. Login using ```az login```
1. ```export AZURE_SUBSCRIPTION_ID=<subscription_id>```

2. ```export AZURE_TENANT_ID=<tenant_id>```
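As an illustration of the steps above, a minimal shell sketch follows; the `az ad sp create-for-rbac` call, the service principal name, and the `<subscription_id>`/`<tenant_id>` placeholders are assumptions about a typical Azure CLI workflow rather than part of the documented procedure:

```
# Log in interactively, then create a service principal (illustrative name and role).
az login
az ad sp create-for-rbac --name krkn-chaos-sp --role Contributor --scopes "/subscriptions/<subscription_id>"

# Export the identifiers the node scenarios expect before running krkn.
export AZURE_SUBSCRIPTION_ID=<subscription_id>
export AZURE_TENANT_ID=<tenant_id>
```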
@@ -14,11 +14,7 @@ For example, for adding a pod level scenario for a new application, refer to the
namespace_pattern: ^<namespace>$
label_selector: <pod label>
kill: <number of pods to kill>
- id: wait-for-pods
config:
namespace_pattern: ^<namespace>$
label_selector: <pod label>
count: <expected number of pods that match namespace and label>
krkn_pod_recovery_time: <expected time for the pod to become ready>
```

#### Node Scenario Yaml Template
@@ -22,7 +22,7 @@ the capabilities of the current supported scenarios.
Pick the latest stable release to install [here](https://github.com/krkn-chaos/krkn/releases).
```
$ git clone https://github.com/krkn-chaos/krkn.git --branch <release version>
$ cd kraken
$ cd krkn
```

#### Install the dependencies
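The dependency installation that follows this heading is not captured in the hunk; a minimal sketch, assuming the pip-based setup used elsewhere in this changeset (requirements.txt at the repository root, Python 3.9):

```
# Assumed follow-up to the clone step above, run from the krkn directory:
# create a virtual environment and install krkn's Python dependencies.
python3.9 -m venv venv && source venv/bin/activate
pip install -r requirements.txt
```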
@@ -17,11 +17,8 @@ You can then create the scenario file with the following contents:
config:
namespace_pattern: ^kube-system$
label_selector: k8s-app=kube-scheduler
- id: wait-for-pods
config:
namespace_pattern: ^kube-system$
label_selector: k8s-app=kube-scheduler
count: 3
krkn_pod_recovery_time: 120

```

Please adjust the schema reference to point to the [schema file](../scenarios/plugin.schema.json). This file will give you code completion and documentation for the available options in your IDE.
80  docs/service_hijacking_scenarios.md (Normal file)

@@ -0,0 +1,80 @@
### Service Hijacking Scenarios

Service Hijacking Scenarios aim to simulate fake HTTP responses from a workload targeted by a
`Service` already deployed in the cluster.
This scenario is executed by deploying a custom-made web service and modifying the target `Service`
selector to direct traffic to this web service for a specified duration.

The web service's source code is available [here](https://github.com/krkn-chaos/krkn-service-hijacking).
It employs a time-based test plan from the scenario configuration file, which specifies the behavior of resources during the chaos scenario as follows:

```yaml
service_target_port: http-web-svc # The port of the service to be hijacked (can be named or numeric, based on the workload and service configuration).
service_name: nginx-service # The name of the service that will be hijacked.
service_namespace: default # The namespace where the target service is located.
image: quay.io/krkn-chaos/krkn-service-hijacking:v0.1.3 # Image of the krkn web service to be deployed to receive traffic.
chaos_duration: 30 # Total duration of the chaos scenario in seconds.
plan:
  - resource: "/list/index.php" # Specifies the resource or path to respond to in the scenario. For paths, both the path and query parameters are captured but ignored. For resources, only query parameters are captured.

    steps: # A time-based plan consisting of steps can be defined for each resource.
      GET: # One or more HTTP methods can be specified for each step. Note: Non-standard methods are supported for fully custom web services (e.g., using NONEXISTENT instead of POST).

        - duration: 15 # Duration in seconds for this step before moving to the next one, if defined. Otherwise, this step will continue until the chaos scenario ends.

          status: 500 # HTTP status code to be returned in this step.
          mime_type: "application/json" # MIME type of the response for this step.
          payload: | # The response payload for this step.
            {
              "status":"internal server error"
            }
        - duration: 15
          status: 201
          mime_type: "application/json"
          payload: |
            {
              "status":"resource created"
            }
      POST:
        - duration: 15
          status: 401
          mime_type: "application/json"
          payload: |
            {
              "status": "unauthorized"
            }
        - duration: 15
          status: 404
          mime_type: "text/plain"
          payload: "not found"

```
The scenario will focus on the `service_name` within the `service_namespace`,
substituting the selector with a randomly generated one, which is added as a label in the mock service manifest.
This allows multiple scenarios to be executed in the same namespace, each targeting different services without
causing conflicts.

The newly deployed mock web service will expose a `service_target_port`,
which can be either a named or numeric port based on the service configuration.
This ensures that the Service correctly routes HTTP traffic to the mock web service during the chaos run.

Each step will last for `duration` seconds from the deployment of the mock web service in the cluster.
For each HTTP resource, defined as a top-level YAML property of the plan
(it could be a specific resource, e.g., /list/index.php, or a path-based resource typical in MVC frameworks),
one or more HTTP request methods can be specified. Both standard and custom request methods are supported.

During this time frame, the web service will respond with:

- `status`: The [HTTP status code](https://datatracker.ietf.org/doc/html/rfc7231#section-6) (can be standard or custom).
- `mime_type`: The [MIME type](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types) (can be standard or custom).
- `payload`: The response body to be returned to the client.

At the end of the step `duration`, the web service will proceed to the next step (if available) until
the global `chaos_duration` concludes. At this point, the original service will be restored,
and the custom web service and its resources will be undeployed.

__NOTE__: Some clients (e.g., cURL, jQuery) may optimize queries using lightweight methods (like HEAD or OPTIONS)
to probe API behavior. If these methods are not defined in the test plan, the web service may respond with
a `405` or `404` status code. If you encounter unexpected behavior, consider this use case.
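Not part of the document above, but as an illustrative check: the behaviour of a step can be observed with plain curl against the hijacked service. The URL assumes the NodePort-to-host mapping used by the CI test in this changeset (http://localhost:8888):

```
# During step 1 of the example plan, GET /list/index.php should return HTTP 500,
# a JSON content type, and the configured error payload.
curl -s -o /dev/null -w "%{http_code} %{content_type}\n" http://localhost:8888/list/index.php
curl -s http://localhost:8888/list/index.php
```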
@@ -2,6 +2,9 @@ kind: Cluster
apiVersion: kind.x-k8s.io/v1alpha4
nodes:
- role: control-plane
extraPortMappings:
- containerPort: 30036
hostPort: 8888
- role: control-plane
- role: control-plane
- role: worker
@@ -4,6 +4,7 @@ import time
import kraken.cerberus.setup as cerberus
from jinja2 import Template
import kraken.invoke.command as runcommand
from krkn_lib.k8s import KrknKubernetes
from krkn_lib.telemetry.k8s import KrknTelemetryKubernetes
from krkn_lib.models.telemetry import ScenarioTelemetry
from krkn_lib.utils.functions import get_yaml_item_value, log_exception
@@ -11,14 +12,14 @@ from krkn_lib.utils.functions import get_yaml_item_value, log_exception

# Reads the scenario config, applies and deletes a network policy to
# block the traffic for the specified duration
def run(scenarios_list, config, wait_duration, telemetry: KrknTelemetryKubernetes) -> (list[str], list[ScenarioTelemetry]):
def run(scenarios_list, config, wait_duration,kubecli: KrknKubernetes, telemetry: KrknTelemetryKubernetes) -> (list[str], list[ScenarioTelemetry]):
failed_post_scenarios = ""
scenario_telemetries: list[ScenarioTelemetry] = []
failed_scenarios = []
for app_outage_config in scenarios_list:
scenario_telemetry = ScenarioTelemetry()
scenario_telemetry.scenario = app_outage_config
scenario_telemetry.startTimeStamp = time.time()
scenario_telemetry.start_timestamp = time.time()
telemetry.set_parameters_base64(scenario_telemetry, app_outage_config)
if len(app_outage_config) > 1:
try:
@@ -49,25 +50,22 @@ spec:
podSelector:
matchLabels: {{ pod_selector }}
policyTypes: {{ traffic_type }}
"""
"""
t = Template(network_policy_template)
rendered_spec = t.render(pod_selector=pod_selector, traffic_type=traffic_type)
# Write the rendered template to a file
with open("kraken_network_policy.yaml", "w") as f:
f.write(rendered_spec)
yaml_spec = yaml.safe_load(rendered_spec)
# Block the traffic by creating network policy
logging.info("Creating the network policy")
runcommand.invoke(
"kubectl create -f %s -n %s --validate=false" % ("kraken_network_policy.yaml", namespace)
)

kubecli.create_net_policy(yaml_spec, namespace)

# wait for the specified duration
logging.info("Waiting for the specified duration in the config: %s" % (duration))
time.sleep(duration)

# unblock the traffic by deleting the network policy
logging.info("Deleting the network policy")
runcommand.invoke("kubectl delete -f %s -n %s" % ("kraken_network_policy.yaml", namespace))
kubecli.delete_net_policy("kraken-deny", namespace)

logging.info("End of scenario. Waiting for the specified duration: %s" % (wait_duration))
time.sleep(wait_duration)
@@ -75,12 +73,12 @@ spec:
end_time = int(time.time())
cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time)
except Exception as e :
scenario_telemetry.exitStatus = 1
scenario_telemetry.exit_status = 1
failed_scenarios.append(app_outage_config)
log_exception(app_outage_config)
else:
scenario_telemetry.exitStatus = 0
scenario_telemetry.endTimeStamp = time.time()
scenario_telemetry.exit_status = 0
scenario_telemetry.end_timestamp = time.time()
scenario_telemetries.append(scenario_telemetry)
return failed_scenarios, scenario_telemetries
@@ -16,12 +16,12 @@ def run(scenarios_list: List[str], kubeconfig_path: str, telemetry: KrknTelemetr
for scenario in scenarios_list:
scenario_telemetry = ScenarioTelemetry()
scenario_telemetry.scenario = scenario
scenario_telemetry.startTimeStamp = time.time()
scenario_telemetry.start_timestamp = time.time()
telemetry.set_parameters_base64(scenario_telemetry,scenario)
engine_args = build_args(scenario)
status_code = run_workflow(engine_args, kubeconfig_path)
scenario_telemetry.endTimeStamp = time.time()
scenario_telemetry.exitStatus = status_code
scenario_telemetry.end_timestamp = time.time()
scenario_telemetry.exit_status = status_code
scenario_telemetries.append(scenario_telemetry)
if status_code != 0:
failed_post_scenarios.append(scenario)
@@ -36,9 +36,10 @@ def run_workflow(engine_args: arcaflow.EngineArgs, kubeconfig_path: str) -> int:

def build_args(input_file: str) -> arcaflow.EngineArgs:
"""sets the kubeconfig parsed by setArcaKubeConfig as an input to the arcaflow workflow"""
context = Path(input_file).parent
workflow = "{}/workflow.yaml".format(context)
config = "{}/config.yaml".format(context)
current_path = Path().resolve()
context = f"{current_path}/{Path(input_file).parent}"
workflow = f"{context}/workflow.yaml"
config = f"{context}/config.yaml"
if not os.path.exists(context):
raise Exception(
"context folder for arcaflow workflow not found: {}".format(
@@ -61,7 +62,8 @@ def build_args(input_file: str) -> arcaflow.EngineArgs:
engine_args = arcaflow.EngineArgs()
engine_args.context = context
engine_args.config = config
engine_args.input = input_file
engine_args.workflow = workflow
engine_args.input = f"{current_path}/{input_file}"
return engine_args
@@ -4,13 +4,10 @@ import pandas as pd
|
||||
import kraken.chaos_recommender.kraken_tests as kraken_tests
|
||||
import time
|
||||
|
||||
threshold = .7 # Adjust the threshold as needed
|
||||
heatmap_cpu_threshold = .5
|
||||
heatmap_mem_threshold = .5
|
||||
|
||||
KRAKEN_TESTS_PATH = "./kraken_chaos_tests.txt"
|
||||
|
||||
#Placeholder, this should be done with topology
|
||||
|
||||
# Placeholder, this should be done with topology
|
||||
def return_critical_services():
|
||||
return ["web", "cart"]
|
||||
|
||||
@@ -19,15 +16,18 @@ def load_telemetry_data(file_path):
|
||||
data = pd.read_csv(file_path, delimiter=r"\s+")
|
||||
return data
|
||||
|
||||
|
||||
def calculate_zscores(data):
|
||||
zscores = pd.DataFrame()
|
||||
zscores["Namespace"] = data["namespace"]
|
||||
zscores["Service"] = data["service"]
|
||||
zscores["CPU"] = (data["CPU"] - data["CPU"].mean()) / data["CPU"].std()
|
||||
zscores["Memory"] = (data["MEM"] - data["MEM"].mean()) / data["MEM"].std()
|
||||
zscores["Network"] = (data["NETWORK"] - data["NETWORK"].mean()) / data["NETWORK"].std()
|
||||
return zscores
|
||||
|
||||
def identify_outliers(data):
|
||||
|
||||
def identify_outliers(data, threshold):
|
||||
outliers_cpu = data[data["CPU"] > threshold]["Service"].tolist()
|
||||
outliers_memory = data[data["Memory"] > threshold]["Service"].tolist()
|
||||
outliers_network = data[data["Network"] > threshold]["Service"].tolist()
|
||||
@@ -47,44 +47,85 @@ def get_services_above_heatmap_threshold(dataframe, cpu_threshold, mem_threshold
|
||||
return cpu_services, mem_services
|
||||
|
||||
|
||||
def analysis(file_path, chaos_tests_config):
|
||||
def analysis(file_path, namespaces, chaos_tests_config, threshold,
|
||||
heatmap_cpu_threshold, heatmap_mem_threshold):
|
||||
# Load the telemetry data from file
|
||||
logging.info("Fetching the Telemetry data...")
|
||||
data = load_telemetry_data(file_path)
|
||||
|
||||
# Calculate Z-scores for CPU, Memory, and Network columns
|
||||
zscores = calculate_zscores(data)
|
||||
# Dict for saving analysis data -- key is the namespace
|
||||
analysis_data = {}
|
||||
|
||||
# Identify outliers
|
||||
outliers_cpu, outliers_memory, outliers_network = identify_outliers(zscores)
|
||||
cpu_services, mem_services = get_services_above_heatmap_threshold(data, heatmap_cpu_threshold, heatmap_mem_threshold)
|
||||
# Identify outliers for each namespace
|
||||
for namespace in namespaces:
|
||||
|
||||
# Display the identified outliers
|
||||
logging.info("======================== Profiling ==================================")
|
||||
logging.info(f"CPU outliers: {outliers_cpu}")
|
||||
logging.info(f"Memory outliers: {outliers_memory}")
|
||||
logging.info(f"Network outliers: {outliers_network}")
|
||||
logging.info("===================== HeatMap Analysis ==============================")
|
||||
logging.info(f"Identifying outliers for namespace {namespace}...")
|
||||
|
||||
namespace_zscores = zscores.loc[zscores["Namespace"] == namespace]
|
||||
namespace_data = data.loc[data["namespace"] == namespace]
|
||||
outliers_cpu, outliers_memory, outliers_network = identify_outliers(
|
||||
namespace_zscores, threshold)
|
||||
cpu_services, mem_services = get_services_above_heatmap_threshold(
|
||||
namespace_data, heatmap_cpu_threshold, heatmap_mem_threshold)
|
||||
|
||||
analysis_data[namespace] = analysis_json(outliers_cpu, outliers_memory,
|
||||
outliers_network,
|
||||
cpu_services, mem_services,
|
||||
chaos_tests_config)
|
||||
|
||||
if cpu_services:
|
||||
logging.info(f"These services use significant CPU compared to "
|
||||
f"their assigned limits: {cpu_services}")
|
||||
else:
|
||||
logging.info("There are no services that are using significant "
|
||||
"CPU compared to their assigned limits "
|
||||
"(infinite in case no limits are set).")
|
||||
if mem_services:
|
||||
logging.info(f"These services use significant MEMORY compared to "
|
||||
f"their assigned limits: {mem_services}")
|
||||
else:
|
||||
logging.info("There are no services that are using significant "
|
||||
"MEMORY compared to their assigned limits "
|
||||
"(infinite in case no limits are set).")
|
||||
time.sleep(2)
|
||||
|
||||
logging.info("Please check data in utilisation.txt for further analysis")
|
||||
|
||||
return analysis_data
|
||||
|
||||
|
||||
def analysis_json(outliers_cpu, outliers_memory, outliers_network,
                  cpu_services, mem_services, chaos_tests_config):

    profiling = {
        "cpu_outliers": outliers_cpu,
        "memory_outliers": outliers_memory,
        "network_outliers": outliers_network
    }

    heatmap = {
        "services_with_cpu_heatmap_above_threshold": cpu_services,
        "services_with_mem_heatmap_above_threshold": mem_services
    }

    recommendations = {}

    if cpu_services:
        logging.info(f"Services with CPU_HEATMAP above threshold: {cpu_services}")
    else:
        logging.info("There are no services that are using significant CPU compared to their assigned limits (infinite in case no limits are set).")
    cpu_recommend = {"services": cpu_services,
                     "tests": chaos_tests_config['CPU']}
    recommendations["cpu_services_recommendations"] = cpu_recommend

    if mem_services:
        logging.info(f"Services with MEM_HEATMAP above threshold: {mem_services}")
    else:
        logging.info("There are no services that are using significant MEMORY compared to their assigned limits (infinite in case no limits are set).")
    time.sleep(2)
    logging.info("======================= Recommendations =============================")
    if cpu_services:
        logging.info(f"Recommended tests for {str(cpu_services)} :\n {chaos_tests_config['CPU']}")
        logging.info("\n")
    if mem_services:
        logging.info(f"Recommended tests for {str(mem_services)} :\n {chaos_tests_config['MEM']}")
        logging.info("\n")
    mem_recommend = {"services": mem_services,
                     "tests": chaos_tests_config['MEM']}
    recommendations["mem_services_recommendations"] = mem_recommend

    if outliers_network:
        logging.info(f"Recommended tests for {str(outliers_network)} :\n {chaos_tests_config['NETWORK']}")
        logging.info("\n")
        outliers_network_recommend = {"outliers_networks": outliers_network,
                                      "tests": chaos_tests_config['NETWORK']}
        recommendations["outliers_network_recommendations"] = (
            outliers_network_recommend)

    logging.info("\n")
    logging.info("Please check data in utilisation.txt for further analysis")
    return [profiling, heatmap, recommendations]
|
||||
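For orientation, a brief usage sketch of the three-part value returned by analysis_json above, assuming the function is importable from this module; the service names and chaos test names are illustrative placeholders, not values taken from this repository.

# Illustrative inputs only -- every name below is made up.
profiling, heatmap, recommendations = analysis_json(
    outliers_cpu=["cart"], outliers_memory=[], outliers_network=["ratings"],
    cpu_services=["cart"], mem_services=[],
    chaos_tests_config={"CPU": ["pod-cpu-hog"], "MEM": ["pod-memory-hog"],
                        "NETWORK": ["network-latency"]})
print(profiling["cpu_outliers"])                                  # ['cart']
print(recommendations["cpu_services_recommendations"]["tests"])   # ['pod-cpu-hog']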
@@ -1,6 +1,5 @@
import logging

import pandas
from prometheus_api_client import PrometheusConnect
import pandas as pd
import urllib3
@@ -8,6 +7,7 @@ import urllib3

saved_metrics_path = "./utilisation.txt"


def convert_data_to_dataframe(data, label):
    df = pd.DataFrame()
    df['service'] = [item['metric']['pod'] for item in data]
@@ -17,29 +17,60 @@ def convert_data_to_dataframe(data, label):

def convert_data(data, service):
|
||||
|
||||
result = {}
|
||||
for entry in data:
|
||||
pod_name = entry['metric']['pod']
|
||||
value = entry['value'][1]
|
||||
result[pod_name] = value
|
||||
return result.get(service, '100000000000') # pods with no limits defined can use as many resources as available, therefore assigning a very high value
|
||||
|
||||
def save_utilization_to_file(cpu_data, cpu_limits_result, mem_data, mem_limits_result, network_data, filename):
|
||||
df_cpu = convert_data_to_dataframe(cpu_data, "CPU")
|
||||
merged_df = pd.DataFrame(columns=['service','CPU','CPU_LIMITS','MEM','MEM_LIMITS','NETWORK'])
|
||||
services = df_cpu.service.unique()
|
||||
logging.info(services)
|
||||
|
||||
for s in services:
|
||||
|
||||
new_row_df = pd.DataFrame( {"service": s, "CPU" : convert_data(cpu_data, s),
|
||||
"CPU_LIMITS" : convert_data(cpu_limits_result, s),
|
||||
"MEM" : convert_data(mem_data, s), "MEM_LIMITS" : convert_data(mem_limits_result, s),
|
||||
"NETWORK" : convert_data(network_data, s)}, index=[0])
|
||||
merged_df = pd.concat([merged_df, new_row_df], ignore_index=True)
|
||||
return result.get(service) # for those pods whose limits are not defined they can take as much resources, there assigning a very high value
|
||||
|
||||
|
||||
def convert_data_limits(data, node_data, service, prometheus):
|
||||
result = {}
|
||||
for entry in data:
|
||||
pod_name = entry['metric']['pod']
|
||||
value = entry['value'][1]
|
||||
result[pod_name] = value
|
||||
return result.get(service, get_node_capacity(node_data, service, prometheus)) # for pods whose limits are not defined, fall back to the capacity of the node they run on (a very high effective limit)
|
||||
|
||||
def get_node_capacity(node_data, pod_name, prometheus ):
|
||||
|
||||
# Get the node name on which the pod is running
|
||||
query = f'kube_pod_info{{pod="{pod_name}"}}'
|
||||
result = prometheus.custom_query(query)
|
||||
if not result:
|
||||
return None
|
||||
|
||||
node_name = result[0]['metric']['node']
|
||||
|
||||
for item in node_data:
|
||||
if item['metric']['node'] == node_name:
|
||||
return item['value'][1]
|
||||
|
||||
return '1000000000'
|
||||
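A standalone sketch of the lookup get_node_capacity performs: resolve the node a pod runs on through kube_pod_info, then read that node's capacity from the kube_node_status_capacity series used above. The endpoint, token and pod name are placeholders, not values from the repository.

from prometheus_api_client import PrometheusConnect

prom = PrometheusConnect(url="https://prometheus-k8s.example:9091",    # placeholder endpoint
                         headers={"Authorization": "Bearer <token>"},  # placeholder token
                         disable_ssl=True)

node_caps = prom.custom_query('kube_node_status_capacity{resource="cpu", unit="core"}*1000')
pod_info = prom.custom_query('kube_pod_info{pod="my-app-0"}')          # hypothetical pod name
if pod_info:
    node_name = pod_info[0]["metric"]["node"]
    capacity = next((item["value"][1] for item in node_caps
                     if item["metric"]["node"] == node_name), "1000000000")
    print(f"CPU capacity (millicores) used as the fallback limit: {capacity}")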
|
||||
|
||||
def save_utilization_to_file(utilization, filename, prometheus):
|
||||
|
||||
merged_df = pd.DataFrame(columns=['namespace', 'service', 'CPU', 'CPU_LIMITS', 'MEM', 'MEM_LIMITS', 'NETWORK'])
|
||||
for namespace in utilization:
|
||||
# Loading utilization_data[] for namespace
|
||||
# indexes -- 0 CPU, 1 CPU limits, 2 mem, 3 mem limits, 4 network, 5 node CPU capacity, 6 node memory capacity
|
||||
utilization_data = utilization[namespace]
|
||||
df_cpu = convert_data_to_dataframe(utilization_data[0], "CPU")
|
||||
services = df_cpu.service.unique()
|
||||
logging.info(f"Services for namespace {namespace}: {services}")
|
||||
|
||||
for s in services:
|
||||
|
||||
new_row_df = pd.DataFrame({
|
||||
"namespace": namespace, "service": s,
|
||||
"CPU": convert_data(utilization_data[0], s),
|
||||
"CPU_LIMITS": convert_data_limits(utilization_data[1],utilization_data[5], s, prometheus),
|
||||
"MEM": convert_data(utilization_data[2], s),
|
||||
"MEM_LIMITS": convert_data_limits(utilization_data[3], utilization_data[6], s, prometheus),
|
||||
"NETWORK": convert_data(utilization_data[4], s)}, index=[0])
|
||||
merged_df = pd.concat([merged_df, new_row_df], ignore_index=True)
|
||||
|
||||
# Convert columns to string
|
||||
merged_df['CPU'] = merged_df['CPU'].astype(str)
|
||||
@@ -49,48 +80,65 @@ def save_utilization_to_file(cpu_data, cpu_limits_result, mem_data, mem_limits_r
|
||||
merged_df['NETWORK'] = merged_df['NETWORK'].astype(str)
|
||||
|
||||
# Extract integer part before the decimal point
|
||||
merged_df['CPU'] = merged_df['CPU'].str.split('.').str[0]
|
||||
merged_df['MEM'] = merged_df['MEM'].str.split('.').str[0]
|
||||
merged_df['CPU_LIMITS'] = merged_df['CPU_LIMITS'].str.split('.').str[0]
|
||||
merged_df['MEM_LIMITS'] = merged_df['MEM_LIMITS'].str.split('.').str[0]
|
||||
merged_df['NETWORK'] = merged_df['NETWORK'].str.split('.').str[0]
|
||||
#merged_df['CPU'] = merged_df['CPU'].str.split('.').str[0]
|
||||
#merged_df['MEM'] = merged_df['MEM'].str.split('.').str[0]
|
||||
#merged_df['CPU_LIMITS'] = merged_df['CPU_LIMITS'].str.split('.').str[0]
|
||||
#merged_df['MEM_LIMITS'] = merged_df['MEM_LIMITS'].str.split('.').str[0]
|
||||
#merged_df['NETWORK'] = merged_df['NETWORK'].str.split('.').str[0]
|
||||
|
||||
merged_df.to_csv(filename, sep='\t', index=False)
|
||||
|
||||
def fetch_utilization_from_prometheus(prometheus_endpoint, auth_token, namespace, scrape_duration):
|
||||
|
||||
def fetch_utilization_from_prometheus(prometheus_endpoint, auth_token,
|
||||
namespaces, scrape_duration):
|
||||
urllib3.disable_warnings()
|
||||
prometheus = PrometheusConnect(url=prometheus_endpoint, headers={'Authorization':'Bearer {}'.format(auth_token)}, disable_ssl=True)
|
||||
prometheus = PrometheusConnect(url=prometheus_endpoint, headers={
|
||||
'Authorization':'Bearer {}'.format(auth_token)}, disable_ssl=True)
|
||||
|
||||
# Fetch CPU utilization
|
||||
cpu_query = 'sum (rate (container_cpu_usage_seconds_total{image!="", namespace="%s"}[%s])) by (pod) *1000' % (namespace,scrape_duration)
|
||||
logging.info(cpu_query)
|
||||
cpu_result = prometheus.custom_query(cpu_query)
|
||||
cpu_data = cpu_result
|
||||
|
||||
|
||||
cpu_limits_query = '(sum by (pod) (kube_pod_container_resource_limits{resource="cpu", namespace="%s"}))*1000' %(namespace)
|
||||
logging.info(cpu_limits_query)
|
||||
cpu_limits_result = prometheus.custom_query(cpu_limits_query)
|
||||
|
||||
|
||||
mem_query = 'sum by (pod) (avg_over_time(container_memory_usage_bytes{image!="", namespace="%s"}[%s]))' % (namespace, scrape_duration)
|
||||
logging.info(mem_query)
|
||||
mem_result = prometheus.custom_query(mem_query)
|
||||
mem_data = mem_result
|
||||
|
||||
mem_limits_query = 'sum by (pod) (kube_pod_container_resource_limits{resource="memory", namespace="%s"}) ' %(namespace)
|
||||
logging.info(mem_limits_query)
|
||||
mem_limits_result = prometheus.custom_query(mem_limits_query)
|
||||
|
||||
|
||||
network_query = 'sum by (pod) ((avg_over_time(container_network_transmit_bytes_total{namespace="%s"}[%s])) + \
|
||||
(avg_over_time(container_network_receive_bytes_total{namespace="%s"}[%s])))' % (namespace, scrape_duration, namespace, scrape_duration)
|
||||
network_result = prometheus.custom_query(network_query)
|
||||
logging.info(network_query)
|
||||
network_data = network_result
|
||||
|
||||
|
||||
save_utilization_to_file(cpu_data, cpu_limits_result, mem_data, mem_limits_result, network_data, saved_metrics_path)
|
||||
return saved_metrics_path
|
||||
# Dicts for saving utilisation and queries -- key is namespace
|
||||
utilization = {}
|
||||
queries = {}
|
||||
|
||||
logging.info("Fetching utilization...")
|
||||
for namespace in namespaces:
|
||||
|
||||
# Fetch CPU utilization
|
||||
cpu_query = 'sum (rate (container_cpu_usage_seconds_total{image!="", namespace="%s"}[%s])) by (pod) *1000' % (namespace,scrape_duration)
|
||||
cpu_result = prometheus.custom_query(cpu_query)
|
||||
|
||||
cpu_limits_query = '(sum by (pod) (kube_pod_container_resource_limits{resource="cpu", namespace="%s"}))*1000' %(namespace)
|
||||
cpu_limits_result = prometheus.custom_query(cpu_limits_query)
|
||||
|
||||
node_cpu_limits_query = 'kube_node_status_capacity{resource="cpu", unit="core"}*1000'
|
||||
node_cpu_limits_result = prometheus.custom_query(node_cpu_limits_query)
|
||||
|
||||
mem_query = 'sum by (pod) (avg_over_time(container_memory_usage_bytes{image!="", namespace="%s"}[%s]))' % (namespace, scrape_duration)
|
||||
mem_result = prometheus.custom_query(mem_query)
|
||||
|
||||
mem_limits_query = 'sum by (pod) (kube_pod_container_resource_limits{resource="memory", namespace="%s"}) ' %(namespace)
|
||||
mem_limits_result = prometheus.custom_query(mem_limits_query)
|
||||
|
||||
node_mem_limits_query = 'kube_node_status_capacity{resource="memory", unit="byte"}'
|
||||
node_mem_limits_result = prometheus.custom_query(node_mem_limits_query)
|
||||
|
||||
network_query = 'sum by (pod) ((avg_over_time(container_network_transmit_bytes_total{namespace="%s"}[%s])) + \
|
||||
(avg_over_time(container_network_receive_bytes_total{namespace="%s"}[%s])))' % (namespace, scrape_duration, namespace, scrape_duration)
|
||||
network_result = prometheus.custom_query(network_query)
|
||||
|
||||
utilization[namespace] = [cpu_result, cpu_limits_result, mem_result, mem_limits_result, network_result, node_cpu_limits_result, node_mem_limits_result ]
|
||||
queries[namespace] = json_queries(cpu_query, cpu_limits_query, mem_query, mem_limits_query, network_query)
|
||||
|
||||
save_utilization_to_file(utilization, saved_metrics_path, prometheus)
|
||||
|
||||
return saved_metrics_path, queries
|
||||
|
||||
|
||||
def json_queries(cpu_query, cpu_limits_query, mem_query, mem_limits_query, network_query):
|
||||
queries = {
|
||||
"cpu_query": cpu_query,
|
||||
"cpu_limit_query": cpu_limits_query,
|
||||
"memory_query": mem_query,
|
||||
"memory_limit_query": mem_limits_query,
|
||||
"network_query": network_query
|
||||
}
|
||||
return queries
|
||||
|
||||
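A possible invocation of the refactored multi-namespace fetch, assuming the function is imported from this module; the Prometheus endpoint, bearer token and namespace names are placeholders.

metrics_file, queries = fetch_utilization_from_prometheus(
    prometheus_endpoint="https://prometheus-k8s.example:9091",  # placeholder
    auth_token="<bearer-token>",                                # placeholder
    namespaces=["robot-shop", "openshift-etcd"],                # placeholder namespaces
    scrape_duration="10m")
print(f"utilisation table written to {metrics_file}")
print(queries["robot-shop"]["cpu_query"])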
@@ -23,7 +23,7 @@ def run(scenarios_list, config, wait_duration, kubecli: KrknKubernetes, telemetr
|
||||
for net_config in scenarios_list:
|
||||
scenario_telemetry = ScenarioTelemetry()
|
||||
scenario_telemetry.scenario = net_config
|
||||
scenario_telemetry.startTimeStamp = time.time()
|
||||
scenario_telemetry.start_timestamp = time.time()
|
||||
telemetry.set_parameters_base64(scenario_telemetry, net_config)
|
||||
try:
|
||||
with open(net_config, "r") as file:
|
||||
@@ -114,11 +114,11 @@ def run(scenarios_list, config, wait_duration, kubecli: KrknKubernetes, telemetr
|
||||
logging.info("Deleting jobs")
|
||||
delete_job(joblst[:], kubecli)
|
||||
except (RuntimeError, Exception):
|
||||
scenario_telemetry.exitStatus = 1
|
||||
scenario_telemetry.exit_status = 1
|
||||
failed_scenarios.append(net_config)
|
||||
log_exception(net_config)
|
||||
else:
|
||||
scenario_telemetry.exitStatus = 0
|
||||
scenario_telemetry.exit_status = 0
|
||||
scenario_telemetries.append(scenario_telemetry)
|
||||
return failed_scenarios, scenario_telemetries
|
||||
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
|
||||
import time
|
||||
import yaml
|
||||
import os
|
||||
import kraken.invoke.command as runcommand
|
||||
import logging
|
||||
import kraken.node_actions.common_node_functions as nodeaction
|
||||
@@ -17,9 +17,9 @@ class Azure:
|
||||
# Acquire a credential object using CLI-based authentication.
|
||||
credentials = DefaultAzureCredential()
|
||||
logging.info("credential " + str(credentials))
|
||||
az_account = runcommand.invoke("az account list -o yaml")
|
||||
az_account_yaml = yaml.safe_load(az_account, Loader=yaml.FullLoader)
|
||||
subscription_id = az_account_yaml[0]["id"]
|
||||
# az_account = runcommand.invoke("az account list -o yaml")
|
||||
# az_account_yaml = yaml.safe_load(az_account, Loader=yaml.FullLoader)
|
||||
subscription_id = os.getenv("AZURE_SUBSCRIPTION_ID")
|
||||
self.compute_client = ComputeManagementClient(credentials, subscription_id)
|
||||
|
||||
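A minimal sketch of the credential bootstrap the Azure class now relies on, assuming AZURE_SUBSCRIPTION_ID is exported before Krkn starts; the trailing VM listing is only a connectivity check added for illustration.

import os
from azure.identity import DefaultAzureCredential
from azure.mgmt.compute import ComputeManagementClient

credentials = DefaultAzureCredential()                    # CLI/env/managed-identity chain
subscription_id = os.getenv("AZURE_SUBSCRIPTION_ID")      # must be set in the environment
compute_client = ComputeManagementClient(credentials, subscription_id)
print([vm.name for vm in compute_client.virtual_machines.list_all()])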
# Get the instance ID of the node
|
||||
|
||||
@@ -1,6 +1,8 @@
|
||||
import os
|
||||
import sys
|
||||
import time
|
||||
import logging
|
||||
import json
|
||||
import kraken.node_actions.common_node_functions as nodeaction
|
||||
from kraken.node_actions.abstract_node_scenarios import abstract_node_scenarios
|
||||
from googleapiclient import discovery
|
||||
@@ -10,11 +12,19 @@ from krkn_lib.k8s import KrknKubernetes
|
||||
|
||||
class GCP:
|
||||
def __init__(self):
|
||||
try:
|
||||
gapp_creds = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")
|
||||
with open(gapp_creds, "r") as f:
|
||||
f_str = f.read()
|
||||
self.project = json.loads(f_str)['project_id']
|
||||
#self.project = runcommand.invoke("gcloud config get-value project").split("/n")[0].strip()
|
||||
logging.info("project " + str(self.project) + "!")
|
||||
credentials = GoogleCredentials.get_application_default()
|
||||
self.client = discovery.build("compute", "v1", credentials=credentials, cache_discovery=False)
|
||||
|
||||
self.project = runcommand.invoke("gcloud config get-value project").split("/n")[0].strip()
|
||||
logging.info("project " + str(self.project) + "!")
|
||||
credentials = GoogleCredentials.get_application_default()
|
||||
self.client = discovery.build("compute", "v1", credentials=credentials, cache_discovery=False)
|
||||
except Exception as e:
|
||||
logging.error("Error on setting up GCP connection: " + str(e))
|
||||
sys.exit(1)
|
||||
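The same project lookup as above, shown standalone: read project_id from the service-account key referenced by GOOGLE_APPLICATION_CREDENTIALS instead of shelling out to gcloud.

import json
import os

creds_path = os.getenv("GOOGLE_APPLICATION_CREDENTIALS")  # path to a service-account key file
with open(creds_path, "r") as f:
    project_id = json.loads(f.read())["project_id"]
print(f"GCP project resolved from the credentials file: {project_id}")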
|
||||
# Get the instance ID of the node
|
||||
def get_instance_id(self, node):
|
||||
|
||||
@@ -15,7 +15,7 @@ import kraken.cerberus.setup as cerberus
|
||||
from krkn_lib.k8s import KrknKubernetes
|
||||
from krkn_lib.telemetry.k8s import KrknTelemetryKubernetes
|
||||
from krkn_lib.models.telemetry import ScenarioTelemetry
|
||||
from krkn_lib.utils.functions import get_yaml_item_value
|
||||
from krkn_lib.utils.functions import get_yaml_item_value, log_exception
|
||||
|
||||
node_general = False
|
||||
|
||||
@@ -61,7 +61,7 @@ def run(scenarios_list, config, wait_duration, kubecli: KrknKubernetes, telemetr
|
||||
for node_scenario_config in scenarios_list:
|
||||
scenario_telemetry = ScenarioTelemetry()
|
||||
scenario_telemetry.scenario = node_scenario_config
|
||||
scenario_telemetry.startTimeStamp = time.time()
|
||||
scenario_telemetry.start_timestamp = time.time()
|
||||
telemetry.set_parameters_base64(scenario_telemetry, node_scenario_config)
|
||||
with open(node_scenario_config, "r") as f:
|
||||
node_scenario_config = yaml.full_load(f)
|
||||
@@ -78,13 +78,13 @@ def run(scenarios_list, config, wait_duration, kubecli: KrknKubernetes, telemetr
|
||||
cerberus.get_status(config, start_time, end_time)
|
||||
logging.info("")
|
||||
except (RuntimeError, Exception) as e:
|
||||
scenario_telemetry.exitStatus = 1
|
||||
scenario_telemetry.exit_status = 1
|
||||
failed_scenarios.append(node_scenario_config)
|
||||
log_exception(node_scenario_config)
|
||||
else:
|
||||
scenario_telemetry.exitStatus = 0
|
||||
scenario_telemetry.exit_status = 0
|
||||
|
||||
scenario_telemetry.endTimeStamp = time.time()
|
||||
scenario_telemetry.end_timestamp = time.time()
|
||||
scenario_telemetries.append(scenario_telemetry)
|
||||
|
||||
return failed_scenarios, scenario_telemetries
|
||||
|
||||
@@ -2,11 +2,14 @@ import dataclasses
|
||||
import json
|
||||
import logging
|
||||
from os.path import abspath
|
||||
from typing import List, Dict
|
||||
from typing import List, Dict, Any
|
||||
import time
|
||||
|
||||
from arcaflow_plugin_sdk import schema, serialization, jsonschema
|
||||
from arcaflow_plugin_kill_pod import kill_pods, wait_for_pods
|
||||
from krkn_lib.k8s import KrknKubernetes
|
||||
from krkn_lib.k8s.pods_monitor_pool import PodsMonitorPool
|
||||
|
||||
import kraken.plugins.node_scenarios.vmware_plugin as vmware_plugin
|
||||
import kraken.plugins.node_scenarios.ibmcloud_plugin as ibmcloud_plugin
|
||||
from kraken.plugins.run_python_plugin import run_python_file
|
||||
@@ -47,11 +50,14 @@ class Plugins:
|
||||
)
|
||||
self.steps_by_id[step.schema.id] = step
|
||||
|
||||
def run(self, file: str, kubeconfig_path: str, kraken_config: str):
|
||||
def unserialize_scenario(self, file: str) -> Any:
|
||||
return serialization.load_from_file(abspath(file))
|
||||
|
||||
def run(self, file: str, kubeconfig_path: str, kraken_config: str, run_uuid:str):
|
||||
"""
|
||||
Run executes a series of steps
|
||||
"""
|
||||
data = serialization.load_from_file(abspath(file))
|
||||
data = self.unserialize_scenario(abspath(file))
|
||||
if not isinstance(data, list):
|
||||
raise Exception(
|
||||
"Invalid scenario configuration file: {} expected list, found {}".format(file, type(data).__name__)
|
||||
@@ -96,7 +102,8 @@ class Plugins:
|
||||
unserialized_input.kubeconfig_path = kubeconfig_path
|
||||
if "kraken_config" in step.schema.input.properties:
|
||||
unserialized_input.kraken_config = kraken_config
|
||||
output_id, output_data = step.schema(unserialized_input)
|
||||
output_id, output_data = step.schema(params=unserialized_input, run_id=run_uuid)
|
||||
|
||||
logging.info(step.render_output(output_id, output_data) + "\n")
|
||||
if output_id in step.error_output_ids:
|
||||
raise Exception(
|
||||
@@ -213,6 +220,12 @@ PLUGINS = Plugins(
|
||||
"error"
|
||||
]
|
||||
),
|
||||
PluginStep(
|
||||
network_chaos,
|
||||
[
|
||||
"error"
|
||||
]
|
||||
),
|
||||
PluginStep(
|
||||
pod_outage,
|
||||
[
|
||||
@@ -235,25 +248,73 @@ PLUGINS = Plugins(
|
||||
)
|
||||
|
||||
|
||||
def run(scenarios: List[str], kubeconfig_path: str, kraken_config: str, failed_post_scenarios: List[str], wait_duration: int, telemetry: KrknTelemetryKubernetes) -> (List[str], list[ScenarioTelemetry]):
|
||||
def run(scenarios: List[str],
|
||||
kubeconfig_path: str,
|
||||
kraken_config: str,
|
||||
failed_post_scenarios: List[str],
|
||||
wait_duration: int,
|
||||
telemetry: KrknTelemetryKubernetes,
|
||||
kubecli: KrknKubernetes,
|
||||
run_uuid: str
|
||||
) -> (List[str], list[ScenarioTelemetry]):
|
||||
|
||||
scenario_telemetries: list[ScenarioTelemetry] = []
|
||||
for scenario in scenarios:
|
||||
scenario_telemetry = ScenarioTelemetry()
|
||||
scenario_telemetry.scenario = scenario
|
||||
scenario_telemetry.startTimeStamp = time.time()
|
||||
scenario_telemetry.start_timestamp = time.time()
|
||||
telemetry.set_parameters_base64(scenario_telemetry, scenario)
|
||||
logging.info('scenario ' + str(scenario))
|
||||
pool = PodsMonitorPool(kubecli)
|
||||
kill_scenarios = [kill_scenario for kill_scenario in PLUGINS.unserialize_scenario(scenario) if kill_scenario["id"] == "kill-pods"]
|
||||
|
||||
try:
|
||||
PLUGINS.run(scenario, kubeconfig_path, kraken_config)
|
||||
start_monitoring(pool, kill_scenarios)
|
||||
PLUGINS.run(scenario, kubeconfig_path, kraken_config, run_uuid)
|
||||
result = pool.join()
|
||||
scenario_telemetry.affected_pods = result
|
||||
if result.error:
|
||||
raise Exception(f"unrecovered pods: {result.error}")
|
||||
|
||||
except Exception as e:
|
||||
scenario_telemetry.exitStatus = 1
|
||||
logging.error(f"scenario exception: {str(e)}")
|
||||
scenario_telemetry.exit_status = 1
|
||||
pool.cancel()
|
||||
failed_post_scenarios.append(scenario)
|
||||
log_exception(scenario)
|
||||
else:
|
||||
scenario_telemetry.exitStatus = 0
|
||||
scenario_telemetry.exit_status = 0
|
||||
logging.info("Waiting for the specified duration: %s" % (wait_duration))
|
||||
time.sleep(wait_duration)
|
||||
scenario_telemetries.append(scenario_telemetry)
|
||||
scenario_telemetry.endTimeStamp = time.time()
|
||||
scenario_telemetry.end_timestamp = time.time()
|
||||
|
||||
return failed_post_scenarios, scenario_telemetries
|
||||
|
||||
|
||||
def start_monitoring(pool: PodsMonitorPool, scenarios: list[Any]):
|
||||
for kill_scenario in scenarios:
|
||||
recovery_time = kill_scenario["config"]["krkn_pod_recovery_time"]
|
||||
if ("namespace_pattern" in kill_scenario["config"] and
|
||||
"label_selector" in kill_scenario["config"]):
|
||||
namespace_pattern = kill_scenario["config"]["namespace_pattern"]
|
||||
label_selector = kill_scenario["config"]["label_selector"]
|
||||
pool.select_and_monitor_by_namespace_pattern_and_label(
|
||||
namespace_pattern=namespace_pattern,
|
||||
label_selector=label_selector,
|
||||
max_timeout=recovery_time)
|
||||
logging.info(
|
||||
f"waiting {recovery_time} seconds for pod recovery, "
|
||||
f"pod label selector: {label_selector} namespace pattern: {namespace_pattern}")
|
||||
|
||||
elif ("namespace_pattern" in kill_scenario["config"] and
|
||||
"name_pattern" in kill_scenario["config"]):
|
||||
namespace_pattern = kill_scenario["config"]["namespace_pattern"]
|
||||
name_pattern = kill_scenario["config"]["name_pattern"]
|
||||
pool.select_and_monitor_by_name_pattern_and_namespace_pattern(pod_name_pattern=name_pattern,
|
||||
namespace_pattern=namespace_pattern,
|
||||
max_timeout=recovery_time)
|
||||
logging.info(f"waiting {recovery_time} seconds for pod recovery, "
|
||||
f"pod name pattern: {name_pattern} namespace pattern: {namespace_pattern}")
|
||||
else:
|
||||
raise Exception(f"impossible to determine monitor parameters, check {kill_scenario} configuration")
|
||||
|
||||
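For reference, the shape of a kill-pods entry that start_monitoring expects after PLUGINS.unserialize_scenario, wired to a PodsMonitorPool; the namespace pattern, label selector, recovery time and kubeconfig path are illustrative, and the KrknKubernetes constructor argument is an assumption, not taken from this diff.

from krkn_lib.k8s import KrknKubernetes
from krkn_lib.k8s.pods_monitor_pool import PodsMonitorPool

kill_scenarios = [{
    "id": "kill-pods",
    "config": {
        "namespace_pattern": "^openshift-etcd$",   # illustrative
        "label_selector": "k8s-app=etcd",          # illustrative
        "krkn_pod_recovery_time": 120,             # seconds
    },
}]

kubecli = KrknKubernetes(kubeconfig_path="~/.kube/config")   # assumed constructor parameter
pool = PodsMonitorPool(kubecli)
start_monitoring(pool, kill_scenarios)
result = pool.join()    # blocks until monitored pods recover or the timeout expires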
@@ -62,7 +62,7 @@ class NetworkScenarioConfig:
|
||||
typing.Optional[int],
|
||||
validation.min(1)
|
||||
] = field(
|
||||
default=300,
|
||||
default=30,
|
||||
metadata={
|
||||
"name": "Wait Duration",
|
||||
"description":
|
||||
@@ -864,7 +864,7 @@ def network_chaos(cfg: NetworkScenarioConfig) -> typing.Tuple[
|
||||
)
|
||||
logging.info("Waiting for parallel job to finish")
|
||||
start_time = int(time.time())
|
||||
wait_for_job(batch_cli, job_list[:], cfg.wait_duration)
|
||||
wait_for_job(batch_cli, job_list[:], cfg.test_duration+100)
|
||||
end_time = int(time.time())
|
||||
if publish:
|
||||
cerberus.publish_kraken_status(
|
||||
@@ -893,7 +893,7 @@ def network_chaos(cfg: NetworkScenarioConfig) -> typing.Tuple[
|
||||
)
|
||||
logging.info("Waiting for serial job to finish")
|
||||
start_time = int(time.time())
|
||||
wait_for_job(batch_cli, job_list[:], cfg.wait_duration)
|
||||
wait_for_job(batch_cli, job_list[:], cfg.test_duration+100)
|
||||
logging.info("Deleting jobs")
|
||||
delete_jobs(cli, batch_cli, job_list[:])
|
||||
job_list = []
|
||||
|
||||
@@ -119,11 +119,11 @@ class vSphere:
|
||||
vm = self.get_vm(instance_id)
|
||||
try:
|
||||
self.client.vcenter.vm.Power.stop(vm)
|
||||
logging.info("Stopped VM -- '{}-({})'", instance_id, vm)
|
||||
logging.info(f"Stopped VM -- '{instance_id}-({vm})'")
|
||||
return True
|
||||
except AlreadyInDesiredState:
|
||||
logging.info(
|
||||
"VM '{}'-'({})' is already Powered Off", instance_id, vm
|
||||
f"VM '{instance_id}'-'({vm})' is already Powered Off"
|
||||
)
|
||||
return False
|
||||
|
||||
@@ -136,11 +136,11 @@ class vSphere:
|
||||
vm = self.get_vm(instance_id)
|
||||
try:
|
||||
self.client.vcenter.vm.Power.start(vm)
|
||||
logging.info("Started VM -- '{}-({})'", instance_id, vm)
|
||||
logging.info(f"Started VM -- '{instance_id}-({vm})'")
|
||||
return True
|
||||
except AlreadyInDesiredState:
|
||||
logging.info(
|
||||
"VM '{}'-'({})' is already Powered On", instance_id, vm
|
||||
f"VM '{instance_id}'-'({vm})' is already Powered On"
|
||||
)
|
||||
return False
|
||||
|
||||
@@ -318,12 +318,12 @@ class vSphere:
|
||||
try:
|
||||
vm = self.get_vm(instance_id)
|
||||
state = self.client.vcenter.vm.Power.get(vm).state
|
||||
logging.info("Check instance %s status", instance_id)
|
||||
logging.info(f"Check instance {instance_id} status")
|
||||
return state
|
||||
except Exception as e:
|
||||
logging.error(
|
||||
"Failed to get node instance status %s. Encountered following "
|
||||
"exception: %s.", instance_id, e
|
||||
f"Failed to get node instance status {instance_id}. Encountered following "
|
||||
f"exception: {str(e)}. "
|
||||
)
|
||||
return None
|
||||
|
||||
@@ -338,16 +338,14 @@ class vSphere:
|
||||
while vm is not None:
|
||||
vm = self.get_vm(instance_id)
|
||||
logging.info(
|
||||
"VM %s is still being deleted, "
|
||||
"sleeping for 5 seconds",
|
||||
instance_id
|
||||
f"VM {instance_id} is still being deleted, "
|
||||
f"sleeping for 5 seconds"
|
||||
)
|
||||
time.sleep(5)
|
||||
time_counter += 5
|
||||
if time_counter >= timeout:
|
||||
logging.info(
|
||||
"VM %s is still not deleted in allotted time",
|
||||
instance_id
|
||||
f"VM {instance_id} is still not deleted in allotted time"
|
||||
)
|
||||
return False
|
||||
return True
|
||||
@@ -371,8 +369,7 @@ class vSphere:
|
||||
time_counter += 5
|
||||
if time_counter >= timeout:
|
||||
logging.info(
|
||||
"VM %s is still not ready in allotted time",
|
||||
instance_id
|
||||
f"VM {instance_id} is still not ready in allotted time"
|
||||
)
|
||||
return False
|
||||
return True
|
||||
@@ -388,16 +385,14 @@ class vSphere:
|
||||
while status != Power.State.POWERED_OFF:
|
||||
status = self.get_vm_status(instance_id)
|
||||
logging.info(
|
||||
"VM %s is still not running, "
|
||||
"sleeping for 5 seconds",
|
||||
instance_id
|
||||
f"VM {instance_id} is still not running, "
|
||||
f"sleeping for 5 seconds"
|
||||
)
|
||||
time.sleep(5)
|
||||
time_counter += 5
|
||||
if time_counter >= timeout:
|
||||
logging.info(
|
||||
"VM %s is still not ready in allotted time",
|
||||
instance_id
|
||||
f"VM {instance_id} is still not ready in allotted time"
|
||||
)
|
||||
return False
|
||||
return True
|
||||
@@ -561,7 +556,7 @@ def node_start(
|
||||
try:
|
||||
for _ in range(cfg.runs):
|
||||
logging.info("Starting node_start_scenario injection")
|
||||
logging.info("Starting the node %s ", name)
|
||||
logging.info(f"Starting the node {name} ")
|
||||
vm_started = vsphere.start_instances(name)
|
||||
if vm_started:
|
||||
vsphere.wait_until_running(name, cfg.timeout)
|
||||
@@ -571,7 +566,7 @@ def node_start(
|
||||
)
|
||||
nodes_started[int(time.time_ns())] = Node(name=name)
|
||||
logging.info(
|
||||
"Node with instance ID: %s is in running state", name
|
||||
f"Node with instance ID: {name} is in running state"
|
||||
)
|
||||
logging.info(
|
||||
"node_start_scenario has been successfully injected!"
|
||||
@@ -579,8 +574,8 @@ def node_start(
|
||||
except Exception as e:
|
||||
logging.error("Failed to start node instance. Test Failed")
|
||||
logging.error(
|
||||
"node_start_scenario injection failed! "
|
||||
"Error was: %s", str(e)
|
||||
f"node_start_scenario injection failed! "
|
||||
f"Error was: {str(e)}"
|
||||
)
|
||||
return "error", NodeScenarioErrorOutput(
|
||||
format_exc(), kube_helper.Actions.START
|
||||
@@ -620,7 +615,7 @@ def node_stop(
|
||||
try:
|
||||
for _ in range(cfg.runs):
|
||||
logging.info("Starting node_stop_scenario injection")
|
||||
logging.info("Stopping the node %s ", name)
|
||||
logging.info(f"Stopping the node {name} ")
|
||||
vm_stopped = vsphere.stop_instances(name)
|
||||
if vm_stopped:
|
||||
vsphere.wait_until_stopped(name, cfg.timeout)
|
||||
@@ -630,7 +625,7 @@ def node_stop(
|
||||
)
|
||||
nodes_stopped[int(time.time_ns())] = Node(name=name)
|
||||
logging.info(
|
||||
"Node with instance ID: %s is in stopped state", name
|
||||
f"Node with instance ID: {name} is in stopped state"
|
||||
)
|
||||
logging.info(
|
||||
"node_stop_scenario has been successfully injected!"
|
||||
@@ -638,8 +633,8 @@ def node_stop(
|
||||
except Exception as e:
|
||||
logging.error("Failed to stop node instance. Test Failed")
|
||||
logging.error(
|
||||
"node_stop_scenario injection failed! "
|
||||
"Error was: %s", str(e)
|
||||
f"node_stop_scenario injection failed! "
|
||||
f"Error was: {str(e)}"
|
||||
)
|
||||
return "error", NodeScenarioErrorOutput(
|
||||
format_exc(), kube_helper.Actions.STOP
|
||||
@@ -679,7 +674,7 @@ def node_reboot(
|
||||
try:
|
||||
for _ in range(cfg.runs):
|
||||
logging.info("Starting node_reboot_scenario injection")
|
||||
logging.info("Rebooting the node %s ", name)
|
||||
logging.info(f"Rebooting the node {name} ")
|
||||
vsphere.reboot_instances(name)
|
||||
if not cfg.skip_openshift_checks:
|
||||
kube_helper.wait_for_unknown_status(
|
||||
@@ -690,8 +685,8 @@ def node_reboot(
|
||||
)
|
||||
nodes_rebooted[int(time.time_ns())] = Node(name=name)
|
||||
logging.info(
|
||||
"Node with instance ID: %s has rebooted "
|
||||
"successfully", name
|
||||
f"Node with instance ID: {name} has rebooted "
|
||||
"successfully"
|
||||
)
|
||||
logging.info(
|
||||
"node_reboot_scenario has been successfully injected!"
|
||||
@@ -699,8 +694,8 @@ def node_reboot(
|
||||
except Exception as e:
|
||||
logging.error("Failed to reboot node instance. Test Failed")
|
||||
logging.error(
|
||||
"node_reboot_scenario injection failed! "
|
||||
"Error was: %s", str(e)
|
||||
f"node_reboot_scenario injection failed! "
|
||||
f"Error was: {str(e)}"
|
||||
)
|
||||
return "error", NodeScenarioErrorOutput(
|
||||
format_exc(), kube_helper.Actions.REBOOT
|
||||
@@ -739,13 +734,13 @@ def node_terminate(
|
||||
vsphere.stop_instances(name)
|
||||
vsphere.wait_until_stopped(name, cfg.timeout)
|
||||
logging.info(
|
||||
"Releasing the node with instance ID: %s ", name
|
||||
f"Releasing the node with instance ID: {name} "
|
||||
)
|
||||
vsphere.release_instances(name)
|
||||
vsphere.wait_until_released(name, cfg.timeout)
|
||||
nodes_terminated[int(time.time_ns())] = Node(name=name)
|
||||
logging.info(
|
||||
"Node with instance ID: %s has been released", name
|
||||
f"Node with instance ID: {name} has been released"
|
||||
)
|
||||
logging.info(
|
||||
"node_terminate_scenario has been "
|
||||
@@ -754,8 +749,8 @@ def node_terminate(
|
||||
except Exception as e:
|
||||
logging.error("Failed to terminate node instance. Test Failed")
|
||||
logging.error(
|
||||
"node_terminate_scenario injection failed! "
|
||||
"Error was: %s", str(e)
|
||||
f"node_terminate_scenario injection failed! "
|
||||
f"Error was: {str(e)}"
|
||||
)
|
||||
return "error", NodeScenarioErrorOutput(
|
||||
format_exc(), kube_helper.Actions.TERMINATE
|
||||
|
||||
@@ -1,9 +1,13 @@
|
||||
import logging
|
||||
import time
|
||||
from typing import Any
|
||||
|
||||
import yaml
|
||||
import sys
|
||||
import random
|
||||
import arcaflow_plugin_kill_pod
|
||||
from krkn_lib.k8s.pods_monitor_pool import PodsMonitorPool
|
||||
|
||||
import kraken.cerberus.setup as cerberus
|
||||
import kraken.post_actions.actions as post_actions
|
||||
from krkn_lib.k8s import KrknKubernetes
|
||||
@@ -79,11 +83,12 @@ def container_run(kubeconfig_path,
|
||||
|
||||
failed_scenarios = []
|
||||
scenario_telemetries: list[ScenarioTelemetry] = []
|
||||
pool = PodsMonitorPool(kubecli)
|
||||
|
||||
for container_scenario_config in scenarios_list:
|
||||
scenario_telemetry = ScenarioTelemetry()
|
||||
scenario_telemetry.scenario = container_scenario_config[0]
|
||||
scenario_telemetry.startTimeStamp = time.time()
|
||||
scenario_telemetry.start_timestamp = time.time()
|
||||
telemetry.set_parameters_base64(scenario_telemetry, container_scenario_config[0])
|
||||
if len(container_scenario_config) > 1:
|
||||
pre_action_output = post_actions.run(kubeconfig_path, container_scenario_config[1])
|
||||
@@ -91,23 +96,17 @@ def container_run(kubeconfig_path,
|
||||
pre_action_output = ""
|
||||
with open(container_scenario_config[0], "r") as f:
|
||||
cont_scenario_config = yaml.full_load(f)
|
||||
start_monitoring(kill_scenarios=cont_scenario_config["scenarios"], pool=pool)
|
||||
for cont_scenario in cont_scenario_config["scenarios"]:
|
||||
# capture start time
|
||||
start_time = int(time.time())
|
||||
try:
|
||||
killed_containers = container_killing_in_pod(cont_scenario, kubecli)
|
||||
if len(container_scenario_config) > 1:
|
||||
failed_post_scenarios = post_actions.check_recovery(
|
||||
kubeconfig_path,
|
||||
container_scenario_config,
|
||||
failed_post_scenarios,
|
||||
pre_action_output
|
||||
)
|
||||
else:
|
||||
failed_post_scenarios = check_failed_containers(
|
||||
killed_containers, cont_scenario.get("retry_wait", 120), kubecli
|
||||
)
|
||||
|
||||
logging.info(f"killed containers: {str(killed_containers)}")
|
||||
result = pool.join()
|
||||
if result.error:
|
||||
raise Exception(f"pods failed to recovery: {result.error}")
|
||||
scenario_telemetry.affected_pods = result
|
||||
logging.info("Waiting for the specified duration: %s" % (wait_duration))
|
||||
time.sleep(wait_duration)
|
||||
|
||||
@@ -117,18 +116,29 @@ def container_run(kubeconfig_path,
|
||||
# publish cerberus status
|
||||
cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time)
|
||||
except (RuntimeError, Exception):
|
||||
pool.cancel()
|
||||
failed_scenarios.append(container_scenario_config[0])
|
||||
log_exception(container_scenario_config[0])
|
||||
scenario_telemetry.exitStatus = 1
|
||||
scenario_telemetry.exit_status = 1
|
||||
# removed_exit
|
||||
# sys.exit(1)
|
||||
else:
|
||||
scenario_telemetry.exitStatus = 0
|
||||
scenario_telemetry.endTimeStamp = time.time()
|
||||
scenario_telemetry.exit_status = 0
|
||||
scenario_telemetry.end_timestamp = time.time()
|
||||
scenario_telemetries.append(scenario_telemetry)
|
||||
|
||||
return failed_scenarios, scenario_telemetries
|
||||
|
||||
def start_monitoring(kill_scenarios: list[Any], pool: PodsMonitorPool):
|
||||
for kill_scenario in kill_scenarios:
|
||||
namespace_pattern = f"^{kill_scenario['namespace']}$"
|
||||
label_selector = kill_scenario["label_selector"]
|
||||
recovery_time = kill_scenario["expected_recovery_time"]
|
||||
pool.select_and_monitor_by_namespace_pattern_and_label(
|
||||
namespace_pattern=namespace_pattern,
|
||||
label_selector=label_selector,
|
||||
max_timeout=recovery_time)
|
||||
|
||||
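The corresponding entry of a container scenario's scenarios list, as consumed by the start_monitoring helper above; all values are illustrative. Note that the namespace is anchored into the pattern ^namespace$ before monitoring starts.

kill_scenario = {
    "name": "kill etcd container",        # illustrative
    "namespace": "openshift-etcd",        # becomes the pattern ^openshift-etcd$
    "label_selector": "k8s-app=etcd",     # illustrative
    "expected_recovery_time": 60,         # seconds, passed as max_timeout
}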
|
||||
def container_killing_in_pod(cont_scenario, kubecli: KrknKubernetes):
|
||||
scenario_name = get_yaml_item_value(cont_scenario, "name", "")
|
||||
|
||||
@@ -1,10 +1,13 @@
|
||||
import datetime
|
||||
import os.path
|
||||
from typing import Optional
|
||||
|
||||
import urllib3
|
||||
import logging
|
||||
import sys
|
||||
|
||||
import yaml
|
||||
from krkn_lib.models.krkn import ChaosRunAlertSummary, ChaosRunAlert
|
||||
from krkn_lib.prometheus.krkn_prometheus import KrknPrometheus
|
||||
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
|
||||
def alerts(prom_cli: KrknPrometheus, start_time, end_time, alert_profile):
|
||||
@@ -27,4 +30,59 @@ def alerts(prom_cli: KrknPrometheus, start_time, end_time, alert_profile):
|
||||
|
||||
prom_cli.process_alert(alert,
|
||||
datetime.datetime.fromtimestamp(start_time),
|
||||
datetime.datetime.fromtimestamp(end_time))
|
||||
datetime.datetime.fromtimestamp(end_time))
|
||||
|
||||
|
||||
def critical_alerts(prom_cli: KrknPrometheus,
|
||||
summary: ChaosRunAlertSummary,
|
||||
run_id,
|
||||
scenario,
|
||||
start_time,
|
||||
end_time):
|
||||
summary.scenario = scenario
|
||||
summary.run_id = run_id
|
||||
query = r"""ALERTS{severity="critical"}"""
|
||||
logging.info("Checking for critical alerts firing post chaos")
|
||||
|
||||
during_critical_alerts = prom_cli.process_prom_query_in_range(
|
||||
query,
|
||||
start_time=datetime.datetime.fromtimestamp(start_time),
|
||||
end_time=end_time
|
||||
|
||||
)
|
||||
|
||||
for alert in during_critical_alerts:
|
||||
if "metric" in alert:
|
||||
alertname = alert["metric"]["alertname"] if "alertname" in alert["metric"] else "none"
|
||||
alertstate = alert["metric"]["alertstate"] if "alertstate" in alert["metric"] else "none"
|
||||
namespace = alert["metric"]["namespace"] if "namespace" in alert["metric"] else "none"
|
||||
severity = alert["metric"]["severity"] if "severity" in alert["metric"] else "none"
|
||||
alert = ChaosRunAlert(alertname, alertstate, namespace, severity)
|
||||
summary.chaos_alerts.append(alert)
|
||||
|
||||
|
||||
post_critical_alerts = prom_cli.process_query(
|
||||
query
|
||||
)
|
||||
|
||||
for alert in post_critical_alerts:
|
||||
if "metric" in alert:
|
||||
alertname = alert["metric"]["alertname"] if "alertname" in alert["metric"] else "none"
|
||||
alertstate = alert["metric"]["alertstate"] if "alertstate" in alert["metric"] else "none"
|
||||
namespace = alert["metric"]["namespace"] if "namespace" in alert["metric"] else "none"
|
||||
severity = alert["metric"]["severity"] if "severity" in alert["metric"] else "none"
|
||||
alert = ChaosRunAlert(alertname, alertstate, namespace, severity)
|
||||
summary.post_chaos_alerts.append(alert)
|
||||
|
||||
during_critical_alerts_count = len(during_critical_alerts)
|
||||
post_critical_alerts_count = len(post_critical_alerts)
|
||||
firing_alerts = False
|
||||
|
||||
if during_critical_alerts_count > 0:
|
||||
firing_alerts = True
|
||||
|
||||
if post_critical_alerts_count > 0:
|
||||
firing_alerts = True
|
||||
|
||||
if not firing_alerts:
|
||||
logging.info("No critical alerts are firing!!")
|
||||
|
||||
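A sketch of how the new critical_alerts helper is driven, mirroring the call added to run_kraken.py later in this change set; the endpoint, token, run id and scenario label are placeholders.

import datetime
from krkn_lib.models.krkn import ChaosRunAlertSummary
from krkn_lib.prometheus.krkn_prometheus import KrknPrometheus

prom_cli = KrknPrometheus("https://prometheus-k8s.example:9091", "<bearer-token>")  # placeholders
summary = ChaosRunAlertSummary()
start_time = int(datetime.datetime.now().timestamp()) - 600   # e.g. chaos started 10 minutes ago

critical_alerts(prom_cli, summary, "1234-abcd", "pod_scenarios",
                start_time, datetime.datetime.now())
if summary.post_chaos_alerts:
    print(f"{len(summary.post_chaos_alerts)} critical alerts still firing after chaos")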
@@ -11,7 +11,7 @@ from krkn_lib.utils.functions import get_yaml_item_value, log_exception
|
||||
|
||||
|
||||
# krkn_lib
|
||||
def run(scenarios_list, config, kubecli: KrknKubernetes, telemetry: KrknTelemetryKubernetes) -> (list[str], list[ScenarioTelemetry]):
|
||||
def run(scenarios_list, config, wait_duration, kubecli: KrknKubernetes, telemetry: KrknTelemetryKubernetes) -> (list[str], list[ScenarioTelemetry]):
|
||||
"""
|
||||
Reads the scenario config and creates a temp file to fill up the PVC
|
||||
"""
|
||||
@@ -21,7 +21,7 @@ def run(scenarios_list, config, kubecli: KrknKubernetes, telemetry: KrknTelemetr
|
||||
for app_config in scenarios_list:
|
||||
scenario_telemetry = ScenarioTelemetry()
|
||||
scenario_telemetry.scenario = app_config
|
||||
scenario_telemetry.startTimeStamp = time.time()
|
||||
scenario_telemetry.start_timestamp = time.time()
|
||||
telemetry.set_parameters_base64(scenario_telemetry, app_config)
|
||||
try:
|
||||
if len(app_config) > 1:
|
||||
@@ -305,7 +305,9 @@ def run(scenarios_list, config, kubecli: KrknKubernetes, telemetry: KrknTelemetr
|
||||
file_size_kb,
|
||||
kubecli
|
||||
)
|
||||
|
||||
logging.info("End of scenario. Waiting for the specified duration: %s" % (wait_duration))
|
||||
time.sleep(wait_duration)
|
||||
|
||||
end_time = int(time.time())
|
||||
cerberus.publish_kraken_status(
|
||||
config,
|
||||
@@ -314,11 +316,11 @@ def run(scenarios_list, config, kubecli: KrknKubernetes, telemetry: KrknTelemetr
|
||||
end_time
|
||||
)
|
||||
except (RuntimeError, Exception):
|
||||
scenario_telemetry.exitStatus = 1
|
||||
scenario_telemetry.exit_status = 1
|
||||
failed_scenarios.append(app_config)
|
||||
log_exception(app_config)
|
||||
else:
|
||||
scenario_telemetry.exitStatus = 0
|
||||
scenario_telemetry.exit_status = 0
|
||||
scenario_telemetries.append(scenario_telemetry)
|
||||
|
||||
return failed_scenarios, scenario_telemetries
|
||||
|
||||
@@ -165,7 +165,7 @@ def run(
|
||||
for scenario_config in scenarios_list:
|
||||
scenario_telemetry = ScenarioTelemetry()
|
||||
scenario_telemetry.scenario = scenario_config[0]
|
||||
scenario_telemetry.startTimeStamp = time.time()
|
||||
scenario_telemetry.start_timestamp = time.time()
|
||||
telemetry.set_parameters_base64(scenario_telemetry, scenario_config[0])
|
||||
try:
|
||||
if len(scenario_config) > 1:
|
||||
@@ -249,12 +249,12 @@ def run(
|
||||
end_time = int(time.time())
|
||||
cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time)
|
||||
except (Exception, RuntimeError):
|
||||
scenario_telemetry.exitStatus = 1
|
||||
scenario_telemetry.exit_status = 1
|
||||
failed_scenarios.append(scenario_config[0])
|
||||
log_exception(scenario_config[0])
|
||||
else:
|
||||
scenario_telemetry.exitStatus = 0
|
||||
scenario_telemetry.endTimeStamp = time.time()
|
||||
scenario_telemetry.exit_status = 0
|
||||
scenario_telemetry.end_timestamp = time.time()
|
||||
scenario_telemetries.append(scenario_telemetry)
|
||||
return failed_scenarios, scenario_telemetries
|
||||
|
||||
|
||||
0   kraken/service_hijacking/__init__.py   Normal file
90  kraken/service_hijacking/service_hijacking.py   Normal file
@@ -0,0 +1,90 @@
import logging
import time

import yaml
from krkn_lib.k8s import KrknKubernetes
from krkn_lib.models.telemetry import ScenarioTelemetry
from krkn_lib.telemetry.k8s import KrknTelemetryKubernetes


def run(scenarios_list: list[str], wait_duration: int, krkn_lib: KrknKubernetes, telemetry: KrknTelemetryKubernetes) -> (list[str], list[ScenarioTelemetry]):
    scenario_telemetries = list[ScenarioTelemetry]()
    failed_post_scenarios = []
    for scenario in scenarios_list:
        scenario_telemetry = ScenarioTelemetry()
        scenario_telemetry.scenario = scenario
        scenario_telemetry.start_timestamp = time.time()
        telemetry.set_parameters_base64(scenario_telemetry, scenario)
        with open(scenario) as stream:
            scenario_config = yaml.safe_load(stream)

        service_name = scenario_config['service_name']
        service_namespace = scenario_config['service_namespace']
        plan = scenario_config["plan"]
        image = scenario_config["image"]
        target_port = scenario_config["service_target_port"]
        chaos_duration = scenario_config["chaos_duration"]

        logging.info(f"checking service {service_name} in namespace: {service_namespace}")
        if not krkn_lib.service_exists(service_name, service_namespace):
            logging.error(f"service: {service_name} not found in namespace: {service_namespace}, failed to run scenario.")
            fail(scenario_telemetry, scenario_telemetries)
            failed_post_scenarios.append(scenario)
            break
        try:
            logging.info(f"service: {service_name} found in namespace: {service_namespace}")
            logging.info("creating webservice and initializing test plan...")
            # both named ports and port numbers can be used
            if isinstance(target_port, int):
                logging.info(f"webservice will listen on port {target_port}")
                webservice = krkn_lib.deploy_service_hijacking(service_namespace, plan, image, port_number=target_port)
            else:
                logging.info(f"traffic will be redirected to named port: {target_port}")
                webservice = krkn_lib.deploy_service_hijacking(service_namespace, plan, image, port_name=target_port)
            logging.info(f"successfully deployed pod: {webservice.pod_name} "
                         f"in namespace:{service_namespace} with selector {webservice.selector}!"
                         )
            logging.info(f"patching service: {service_name} to hijack traffic towards: {webservice.pod_name}")
            original_service = krkn_lib.replace_service_selector([webservice.selector], service_name, service_namespace)
            if original_service is None:
                logging.error(f"failed to patch service: {service_name}, namespace: {service_namespace} with selector {webservice.selector}")
                fail(scenario_telemetry, scenario_telemetries)
                failed_post_scenarios.append(scenario)
                break

            logging.info(f"service: {service_name} successfully patched!")
            logging.info(f"original service manifest:\n\n{yaml.dump(original_service)}")
            logging.info(f"waiting {chaos_duration} seconds before restoring the service")
            time.sleep(chaos_duration)
            selectors = ["=".join([key, original_service["spec"]["selector"][key]]) for key in original_service["spec"]["selector"].keys()]
            logging.info(f"restoring the service selectors {selectors}")
            original_service = krkn_lib.replace_service_selector(selectors, service_name, service_namespace)
            if original_service is None:
                logging.error(f"failed to restore original service: {service_name}, namespace: {service_namespace} with selectors: {selectors}")
                fail(scenario_telemetry, scenario_telemetries)
                failed_post_scenarios.append(scenario)
                break
            logging.info("selectors successfully restored")
            logging.info("undeploying service-hijacking resources...")
            krkn_lib.undeploy_service_hijacking(webservice)

            logging.info("End of scenario. Waiting for the specified duration: %s" % (wait_duration))
            time.sleep(wait_duration)

            scenario_telemetry.exit_status = 0
            scenario_telemetry.end_timestamp = time.time()
            scenario_telemetries.append(scenario_telemetry)
            logging.info("success")
        except Exception as e:
            logging.error(f"scenario {scenario} failed with exception: {e}")
            fail(scenario_telemetry, scenario_telemetries)
            failed_post_scenarios.append(scenario)

    return failed_post_scenarios, scenario_telemetries


def fail(scenario_telemetry: ScenarioTelemetry, scenario_telemetries: list[ScenarioTelemetry]):
    scenario_telemetry.exit_status = 1
    scenario_telemetry.end_timestamp = time.time()
    scenario_telemetries.append(scenario_telemetry)
|
||||
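The keys that the loop above reads from each service-hijacking scenario file, shown as the dict yaml.safe_load would produce; every value is a placeholder, and a real run needs a non-empty plan understood by the hijacking webservice image.

scenario_config = {
    "service_name": "payment",                          # placeholder service
    "service_namespace": "robot-shop",                  # placeholder namespace
    "image": "<service-hijacking webservice image>",    # placeholder image reference
    "service_target_port": "http",                      # a named port, or an integer such as 8080
    "chaos_duration": 120,                              # seconds before the original selector is restored
    "plan": [],                                         # request/response plan passed to deploy_service_hijacking()
}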
@@ -147,7 +147,7 @@ def run(scenarios_list, config, wait_duration, kubecli: KrknKubernetes, telemetr
|
||||
|
||||
scenario_telemetry = ScenarioTelemetry()
|
||||
scenario_telemetry.scenario = config_path
|
||||
scenario_telemetry.startTimeStamp = time.time()
|
||||
scenario_telemetry.start_timestamp = time.time()
|
||||
telemetry.set_parameters_base64(scenario_telemetry, config_path)
|
||||
|
||||
with open(config_path, "r") as f:
|
||||
@@ -175,11 +175,11 @@ def run(scenarios_list, config, wait_duration, kubecli: KrknKubernetes, telemetr
|
||||
except (RuntimeError, Exception):
|
||||
log_exception(config_path)
|
||||
failed_scenarios.append(config_path)
|
||||
scenario_telemetry.exitStatus = 1
|
||||
scenario_telemetry.exit_status = 1
|
||||
else:
|
||||
scenario_telemetry.exitStatus = 0
|
||||
scenario_telemetry.exit_status = 0
|
||||
|
||||
scenario_telemetry.endTimeStamp = time.time()
|
||||
scenario_telemetry.end_timestamp = time.time()
|
||||
scenario_telemetries.append(scenario_telemetry)
|
||||
|
||||
return failed_scenarios, scenario_telemetries
|
||||
|
||||
@@ -354,7 +354,7 @@ def run(scenarios_list, config, wait_duration, kubecli:KrknKubernetes, telemetry
|
||||
for time_scenario_config in scenarios_list:
|
||||
scenario_telemetry = ScenarioTelemetry()
|
||||
scenario_telemetry.scenario = time_scenario_config
|
||||
scenario_telemetry.startTimeStamp = time.time()
|
||||
scenario_telemetry.start_timestamp = time.time()
|
||||
telemetry.set_parameters_base64(scenario_telemetry, time_scenario_config)
|
||||
try:
|
||||
with open(time_scenario_config, "r") as f:
|
||||
@@ -377,12 +377,12 @@ def run(scenarios_list, config, wait_duration, kubecli:KrknKubernetes, telemetry
|
||||
end_time
|
||||
)
|
||||
except (RuntimeError, Exception):
|
||||
scenario_telemetry.exitStatus = 1
|
||||
scenario_telemetry.exit_status = 1
|
||||
log_exception(time_scenario_config)
|
||||
failed_scenarios.append(time_scenario_config)
|
||||
else:
|
||||
scenario_telemetry.exitStatus = 0
|
||||
scenario_telemetry.endTimeStamp = time.time()
|
||||
scenario_telemetry.exit_status = 0
|
||||
scenario_telemetry.end_timestamp = time.time()
|
||||
scenario_telemetries.append(scenario_telemetry)
|
||||
|
||||
return failed_scenarios, scenario_telemetries
|
||||
|
||||
@@ -19,7 +19,7 @@ def run(scenarios_list, config, wait_duration, telemetry: KrknTelemetryKubernete
|
||||
for zone_outage_config in scenarios_list:
|
||||
scenario_telemetry = ScenarioTelemetry()
|
||||
scenario_telemetry.scenario = zone_outage_config
|
||||
scenario_telemetry.startTimeStamp = time.time()
|
||||
scenario_telemetry.start_timestamp = time.time()
|
||||
telemetry.set_parameters_base64(scenario_telemetry, zone_outage_config)
|
||||
try:
|
||||
if len(zone_outage_config) > 1:
|
||||
@@ -110,12 +110,12 @@ def run(scenarios_list, config, wait_duration, telemetry: KrknTelemetryKubernete
|
||||
end_time
|
||||
)
|
||||
except (RuntimeError, Exception):
|
||||
scenario_telemetry.exitStatus = 1
|
||||
scenario_telemetry.exit_status = 1
|
||||
failed_scenarios.append(zone_outage_config)
|
||||
log_exception(zone_outage_config)
|
||||
else:
|
||||
scenario_telemetry.exitStatus = 0
|
||||
scenario_telemetry.endTimeStamp = time.time()
|
||||
scenario_telemetry.exit_status = 0
|
||||
scenario_telemetry.end_timestamp = time.time()
|
||||
scenario_telemetries.append(scenario_telemetry)
|
||||
return failed_scenarios, scenario_telemetries
|
||||
|
||||
|
||||
@@ -1,41 +1,40 @@
|
||||
aliyun-python-sdk-core==2.13.36
|
||||
aliyun-python-sdk-ecs==4.24.25
|
||||
arcaflow==0.9.0
|
||||
arcaflow-plugin-sdk==0.10.0
|
||||
arcaflow-plugin-sdk==0.14.0
|
||||
arcaflow==0.17.2
|
||||
boto3==1.28.61
|
||||
azure-identity==1.15.0
|
||||
azure-identity==1.16.1
|
||||
azure-keyvault==4.2.0
|
||||
azure-mgmt-compute==30.5.0
|
||||
itsdangerous==2.0.1
|
||||
coverage==7.4.1
|
||||
datetime==5.4
|
||||
docker==7.0.0
|
||||
docker-compose==1.29.2
|
||||
gitpython==3.1.41
|
||||
google-api-python-client==2.116.0
|
||||
ibm_cloud_sdk_core==3.18.0
|
||||
ibm_vpc==0.20.0
|
||||
jinja2==3.1.3
|
||||
krkn-lib==1.4.9
|
||||
jinja2==3.1.4
|
||||
krkn-lib==2.1.6
|
||||
lxml==5.1.0
|
||||
kubernetes==26.1.0
|
||||
kubernetes==28.1.0
|
||||
oauth2client==4.1.3
|
||||
pandas==2.2.0
|
||||
openshift-client==1.0.21
|
||||
paramiko==3.4.0
|
||||
podman-compose==1.0.6
|
||||
pyVmomi==8.0.2.0.1
|
||||
pyfiglet==1.0.2
|
||||
pytest==8.0.0
|
||||
python-ipmi==0.5.4
|
||||
python-openstackclient==6.5.0
|
||||
requests==2.31.0
|
||||
requests==2.32.2
|
||||
service_identity==24.1.0
|
||||
PyYAML==5.4.1
|
||||
PyYAML==6.0.1
|
||||
setuptools==65.5.1
|
||||
werkzeug==3.0.1
|
||||
werkzeug==3.0.3
|
||||
wheel==0.42.0
|
||||
zope.interface==5.4.0
|
||||
|
||||
git+https://github.com/krkn-chaos/arcaflow-plugin-kill-pod.git
|
||||
git+https://github.com/vmware/vsphere-automation-sdk-python.git@v8.0.0.0
|
||||
git+https://github.com/krkn-chaos/arcaflow-plugin-kill-pod.git@v0.1.0
|
||||
git+https://github.com/vmware/vsphere-automation-sdk-python.git@v8.0.0.0
|
||||
cryptography>=42.0.4 # not directly required, pinned by Snyk to avoid a vulnerability
|
||||
|
||||
@@ -9,6 +9,8 @@ import optparse
|
||||
import pyfiglet
|
||||
import uuid
|
||||
import time
|
||||
|
||||
from krkn_lib.models.krkn import ChaosRunOutput, ChaosRunAlertSummary
|
||||
from krkn_lib.prometheus.krkn_prometheus import KrknPrometheus
|
||||
import kraken.time_actions.common_time_functions as time_actions
|
||||
import kraken.performance_dashboards.setup as performance_dashboards
|
||||
@@ -23,10 +25,12 @@ import kraken.pvc.pvc_scenario as pvc_scenario
|
||||
import kraken.network_chaos.actions as network_chaos
|
||||
import kraken.arcaflow_plugin as arcaflow_plugin
|
||||
import kraken.prometheus as prometheus_plugin
|
||||
import kraken.service_hijacking.service_hijacking as service_hijacking_plugin
|
||||
import server as server
|
||||
from kraken import plugins
|
||||
from krkn_lib.k8s import KrknKubernetes
|
||||
from krkn_lib.ocp import KrknOpenshift
|
||||
from krkn_lib.telemetry.elastic import KrknElastic
|
||||
from krkn_lib.telemetry.k8s import KrknTelemetryKubernetes
|
||||
from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift
|
||||
from krkn_lib.models.telemetry import ChaosRunTelemetry
|
||||
@@ -94,6 +98,9 @@ def main(cfg):
|
||||
config["performance_monitoring"], "check_critical_alerts", False
|
||||
)
|
||||
telemetry_api_url = config["telemetry"].get("api_url")
|
||||
elastic_config = get_yaml_item_value(config,"elastic",{})
|
||||
elastic_url = get_yaml_item_value(elastic_config,"elastic_url","")
|
||||
elastic_index = get_yaml_item_value(elastic_config,"elastic_index","")
|
||||
|
||||
# Initialize clients
|
||||
if (not os.path.isfile(kubeconfig_path) and
|
||||
@@ -129,8 +136,6 @@ def main(cfg):
|
||||
except:
|
||||
kubecli.initialize_clients(None)
|
||||
|
||||
|
||||
|
||||
# find node kraken might be running on
|
||||
kubecli.find_kraken_node()
|
||||
|
||||
@@ -161,8 +166,13 @@ def main(cfg):
|
||||
if prometheus_url is None:
|
||||
try:
|
||||
connection_data = ocpcli.get_prometheus_api_connection_data()
|
||||
prometheus_url = connection_data.endpoint
|
||||
prometheus_bearer_token = connection_data.token
|
||||
if connection_data:
|
||||
prometheus_url = connection_data.endpoint
|
||||
prometheus_bearer_token = connection_data.token
|
||||
else:
|
||||
# If can't make a connection, set alerts to false
|
||||
enable_alerts = False
|
||||
critical_alerts = False
|
||||
except Exception:
|
||||
logging.error("invalid distribution selected, running openshift scenarios against kubernetes cluster."
|
||||
"Please set 'kubernetes' in config.yaml krkn.platform and try again")
|
||||
@@ -175,9 +185,9 @@ def main(cfg):
|
||||
# KrknTelemetry init
|
||||
telemetry_k8s = KrknTelemetryKubernetes(safe_logger, kubecli)
|
||||
telemetry_ocp = KrknTelemetryOpenshift(safe_logger, ocpcli)
|
||||
|
||||
|
||||
if enable_alerts:
|
||||
telemetry_elastic = KrknElastic(safe_logger,elastic_url)
|
||||
summary = ChaosRunAlertSummary()
|
||||
if enable_alerts or check_critical_alerts:
|
||||
prometheus = KrknPrometheus(prometheus_url, prometheus_bearer_token)
|
||||
|
||||
logging.info("Server URL: %s" % kubecli.get_host())
|
||||
@@ -208,7 +218,8 @@ def main(cfg):
|
||||
|
||||
# Capture the start time
|
||||
start_time = int(time.time())
|
||||
|
||||
post_critical_alerts = 0
|
||||
chaos_output = ChaosRunOutput()
|
||||
chaos_telemetry = ChaosRunTelemetry()
|
||||
chaos_telemetry.run_uuid = run_uuid
|
||||
# Loop to run the chaos starts here
|
||||
@@ -254,7 +265,9 @@ def main(cfg):
|
||||
kraken_config,
|
||||
failed_post_scenarios,
|
||||
wait_duration,
|
||||
telemetry_k8s
|
||||
telemetry_k8s,
|
||||
kubecli,
|
||||
run_uuid
|
||||
)
|
||||
chaos_telemetry.scenarios.extend(scenario_telemetries)
|
||||
# krkn_lib
|
||||
@@ -322,14 +335,14 @@ def main(cfg):
|
||||
elif scenario_type == "application_outages":
|
||||
logging.info("Injecting application outage")
|
||||
failed_post_scenarios, scenario_telemetries = application_outage.run(
|
||||
scenarios_list, config, wait_duration, telemetry_k8s)
|
||||
scenarios_list, config, wait_duration, kubecli, telemetry_k8s)
|
||||
chaos_telemetry.scenarios.extend(scenario_telemetries)
|
||||
|
||||
# PVC scenarios
|
||||
# krkn_lib
|
||||
elif scenario_type == "pvc_scenarios":
|
||||
logging.info("Running PVC scenario")
|
||||
failed_post_scenarios, scenario_telemetries = pvc_scenario.run(scenarios_list, config, kubecli, telemetry_k8s)
|
||||
failed_post_scenarios, scenario_telemetries = pvc_scenario.run(scenarios_list, config, wait_duration, kubecli, telemetry_k8s)
|
||||
chaos_telemetry.scenarios.extend(scenario_telemetries)
|
||||
|
||||
# Network scenarios
|
||||
@@ -337,27 +350,27 @@ def main(cfg):
|
||||
elif scenario_type == "network_chaos":
|
||||
logging.info("Running Network Chaos")
|
||||
failed_post_scenarios, scenario_telemetries = network_chaos.run(scenarios_list, config, wait_duration, kubecli, telemetry_k8s)
|
||||
elif scenario_type == "service_hijacking":
|
||||
logging.info("Running Service Hijacking Chaos")
|
||||
failed_post_scenarios, scenario_telemetries = service_hijacking_plugin.run(scenarios_list, wait_duration, kubecli, telemetry_k8s)
|
||||
chaos_telemetry.scenarios.extend(scenario_telemetries)
|
||||
|
||||
# Check for critical alerts when enabled
|
||||
if enable_alerts and check_critical_alerts :
|
||||
logging.info("Checking for critical alerts firing post choas")
|
||||
post_critical_alerts = 0
|
||||
if check_critical_alerts:
|
||||
prometheus_plugin.critical_alerts(prometheus,
|
||||
summary,
|
||||
run_uuid,
|
||||
scenario_type,
|
||||
start_time,
|
||||
datetime.datetime.now())
|
||||
|
||||
##PROM
|
||||
query = r"""ALERTS{severity="critical"}"""
|
||||
end_time = datetime.datetime.now()
|
||||
critical_alerts = prometheus.process_prom_query_in_range(
|
||||
query,
|
||||
start_time = datetime.datetime.fromtimestamp(start_time),
|
||||
end_time = end_time
|
||||
chaos_output.critical_alerts = summary
|
||||
post_critical_alerts = len(summary.post_chaos_alerts)
|
||||
if post_critical_alerts > 0:
|
||||
logging.error("Post chaos critical alerts firing please check, exiting")
|
||||
break
|
||||
|
||||
)
|
||||
critical_alerts_count = len(critical_alerts)
|
||||
if critical_alerts_count > 0:
|
||||
logging.error("Critical alerts are firing: %s", critical_alerts)
|
||||
logging.error("Please check, exiting")
|
||||
sys.exit(1)
|
||||
else:
|
||||
logging.info("No critical alerts are firing!!")
|
||||
|
||||
iteration += 1
|
||||
logging.info("")
|
||||
@@ -377,14 +390,18 @@ def main(cfg):
|
||||
telemetry_k8s.collect_cluster_metadata(chaos_telemetry)
|
||||
|
||||
decoded_chaos_run_telemetry = ChaosRunTelemetry(json.loads(chaos_telemetry.to_json()))
|
||||
logging.info(f"Telemetry data:\n{decoded_chaos_run_telemetry.to_json()}")
|
||||
|
||||
chaos_output.telemetry = decoded_chaos_run_telemetry
|
||||
logging.info(f"Chaos data:\n{chaos_output.to_json()}")
|
||||
telemetry_elastic.upload_data_to_elasticsearch(decoded_chaos_run_telemetry.to_json(), elastic_index)
|
||||
if config["telemetry"]["enabled"]:
|
||||
logging.info(f"telemetry data will be stored on s3 bucket folder: {telemetry_api_url}/download/{telemetry_request_id}")
|
||||
logging.info(f'telemetry data will be stored on s3 bucket folder: {telemetry_api_url}/files/'
|
||||
f'{(config["telemetry"]["telemetry_group"] if config["telemetry"]["telemetry_group"] else "default")}/'
|
||||
f'{telemetry_request_id}')
|
||||
logging.info(f"telemetry upload log: {safe_logger.log_file_name}")
|
||||
try:
|
||||
telemetry_k8s.send_telemetry(config["telemetry"], telemetry_request_id, chaos_telemetry)
|
||||
telemetry_k8s.put_cluster_events(telemetry_request_id, config["telemetry"], start_time, end_time)
|
||||
telemetry_k8s.put_critical_alerts(telemetry_request_id, config["telemetry"], summary)
|
||||
# prometheus data collection is available only on Openshift
|
||||
if config["telemetry"]["prometheus_backup"]:
|
||||
prometheus_archive_files = ''
|
||||
@@ -434,11 +451,15 @@ def main(cfg):
|
||||
logging.error("Alert profile is not defined")
|
||||
sys.exit(1)
|
||||
|
||||
if post_critical_alerts > 0:
|
||||
logging.error("Critical alerts are firing, please check; exiting")
|
||||
sys.exit(2)
|
||||
|
||||
if failed_post_scenarios:
|
||||
logging.error(
|
||||
"Post scenarios are still failing at the end of all iterations"
|
||||
)
|
||||
sys.exit(1)
|
||||
sys.exit(2)
|
||||
|
||||
logging.info(
|
||||
"Successfully finished running Kraken. UUID for the run: "
|
||||
|
||||
@@ -4,7 +4,7 @@ deployers:
|
||||
connection: {}
|
||||
deployer_name: kubernetes
|
||||
log:
|
||||
level: debug
|
||||
level: error
|
||||
logged_outputs:
|
||||
error:
|
||||
level: error
|
||||
|
||||
@@ -2,7 +2,7 @@ input_list:
|
||||
- cpu_count: 1
|
||||
cpu_load_percentage: 80
|
||||
cpu_method: all
|
||||
duration: 1s
|
||||
duration: 30
|
||||
kubeconfig: ''
|
||||
namespace: default
|
||||
# set the node selector as a key-value pair eg.
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
version: v0.2.0
|
||||
input:
|
||||
root: RootObject
|
||||
root: SubRootObject
|
||||
objects:
|
||||
RootObject:
|
||||
id: input_item
|
||||
SubRootObject:
|
||||
id: SubRootObject
|
||||
properties:
|
||||
kubeconfig:
|
||||
display:
|
||||
@@ -35,7 +35,7 @@ input:
|
||||
description: stop stress test after T seconds. One can also specify the units of time in
|
||||
seconds, minutes, hours, days or years with the suffix s, m, h, d or y
|
||||
type:
|
||||
type_id: string
|
||||
type_id: integer
|
||||
required: true
|
||||
cpu_count:
|
||||
display:
|
||||
@@ -68,18 +68,18 @@ steps:
|
||||
kubeconfig: !expr $.input.kubeconfig
|
||||
stressng:
|
||||
plugin:
|
||||
src: quay.io/arcalot/arcaflow-plugin-stressng:0.5.0
|
||||
src: quay.io/arcalot/arcaflow-plugin-stressng:0.6.0
|
||||
deployment_type: image
|
||||
step: workload
|
||||
input:
|
||||
cleanup: "true"
|
||||
StressNGParams:
|
||||
timeout: !expr $.input.duration
|
||||
stressors:
|
||||
- stressor: cpu
|
||||
cpu_count: !expr $.input.cpu_count
|
||||
cpu_method: !expr $.input.cpu_method
|
||||
cpu_load: !expr $.input.cpu_load_percentage
|
||||
|
||||
timeout: !expr $.input.duration
|
||||
stressors:
|
||||
- stressor: cpu
|
||||
workers: !expr $.input.cpu_count
|
||||
cpu-method: "all"
|
||||
cpu-load: !expr $.input.cpu_load_percentage
|
||||
deploy:
|
||||
deployer_name: kubernetes
|
||||
connection: !expr $.steps.kubeconfig.outputs.success.connection
|
||||
|
||||
@@ -9,62 +9,10 @@ input:
|
||||
type:
|
||||
type_id: list
|
||||
items:
|
||||
id: input_item
|
||||
type_id: object
|
||||
properties:
|
||||
kubeconfig:
|
||||
display:
|
||||
description: The complete kubeconfig file as a string
|
||||
name: Kubeconfig file contents
|
||||
type:
|
||||
type_id: string
|
||||
required: true
|
||||
namespace:
|
||||
display:
|
||||
description: The namespace where the container will be deployed
|
||||
name: Namespace
|
||||
type:
|
||||
type_id: string
|
||||
required: true
|
||||
node_selector:
|
||||
display:
|
||||
description: kubernetes node name where the plugin must be deployed
|
||||
type:
|
||||
type_id: map
|
||||
values:
|
||||
type_id: string
|
||||
keys:
|
||||
type_id: string
|
||||
required: true
|
||||
duration:
|
||||
display:
|
||||
name: duration the scenario expressed in seconds
|
||||
description: stop stress test after T seconds. One can also specify the units of time in
|
||||
seconds, minutes, hours, days or years with the suffix s, m, h, d or y
|
||||
type:
|
||||
type_id: string
|
||||
required: true
|
||||
cpu_count:
|
||||
display:
|
||||
description: Number of CPU cores to be used (0 means all)
|
||||
name: number of CPUs
|
||||
type:
|
||||
type_id: integer
|
||||
required: true
|
||||
cpu_method:
|
||||
display:
|
||||
description: CPU stress method
|
||||
name: fine grained control of which cpu stressors to use (ackermann, cfloat etc.)
|
||||
type:
|
||||
type_id: string
|
||||
required: true
|
||||
cpu_load_percentage:
|
||||
display:
|
||||
description: load CPU by percentage
|
||||
name: CPU load
|
||||
type:
|
||||
type_id: integer
|
||||
required: true
|
||||
id: SubRootObject
|
||||
type_id: ref
|
||||
namespace: $.steps.workload_loop.execute.inputs.items
|
||||
|
||||
steps:
|
||||
workload_loop:
|
||||
kind: foreach
|
||||
|
||||
@@ -3,7 +3,7 @@ deployers:
|
||||
connection: {}
|
||||
deployer_name: kubernetes
|
||||
log:
|
||||
level: debug
|
||||
level: error
|
||||
logged_outputs:
|
||||
error:
|
||||
level: error
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
input_list:
|
||||
- duration: 30s
|
||||
- duration: 30
|
||||
io_block_size: 1m
|
||||
io_workers: 1
|
||||
io_write_bytes: 10m
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
version: v0.2.0
|
||||
input:
|
||||
root: RootObject
|
||||
root: SubRootObject
|
||||
objects:
|
||||
hostPath:
|
||||
id: HostPathVolumeSource
|
||||
@@ -18,8 +18,8 @@ input:
|
||||
type:
|
||||
id: hostPath
|
||||
type_id: ref
|
||||
RootObject:
|
||||
id: input_item
|
||||
SubRootObject:
|
||||
id: SubRootObject
|
||||
properties:
|
||||
kubeconfig:
|
||||
display:
|
||||
@@ -51,7 +51,7 @@ input:
|
||||
description: stop stress test after T seconds. One can also specify the units of time in
|
||||
seconds, minutes, hours, days or years with the suffix s, m, h, d or y
|
||||
type:
|
||||
type_id: string
|
||||
type_id: integer
|
||||
required: true
|
||||
io_workers:
|
||||
display:
|
||||
@@ -102,19 +102,18 @@ steps:
|
||||
kubeconfig: !expr $.input.kubeconfig
|
||||
stressng:
|
||||
plugin:
|
||||
src: quay.io/arcalot/arcaflow-plugin-stressng:0.5.0
|
||||
src: quay.io/arcalot/arcaflow-plugin-stressng:0.6.0
|
||||
deployment_type: image
|
||||
step: workload
|
||||
input:
|
||||
cleanup: "true"
|
||||
StressNGParams:
|
||||
timeout: !expr $.input.duration
|
||||
workdir: !expr $.input.target_pod_folder
|
||||
stressors:
|
||||
- stressor: hdd
|
||||
hdd: !expr $.input.io_workers
|
||||
hdd_bytes: !expr $.input.io_write_bytes
|
||||
hdd_write_size: !expr $.input.io_block_size
|
||||
timeout: !expr $.input.duration
|
||||
workdir: !expr $.input.target_pod_folder
|
||||
stressors:
|
||||
- stressor: hdd
|
||||
workers: !expr $.input.io_workers
|
||||
hdd-bytes: !expr $.input.io_write_bytes
|
||||
hdd-write-size: !expr $.input.io_block_size
|
||||
|
||||
deploy:
|
||||
deployer_name: kubernetes
|
||||
|
||||
@@ -2,22 +2,6 @@ version: v0.2.0
|
||||
input:
|
||||
root: RootObject
|
||||
objects:
|
||||
hostPath:
|
||||
id: HostPathVolumeSource
|
||||
properties:
|
||||
path:
|
||||
type:
|
||||
type_id: string
|
||||
Volume:
|
||||
id: Volume
|
||||
properties:
|
||||
name:
|
||||
type:
|
||||
type_id: string
|
||||
hostPath:
|
||||
type:
|
||||
id: hostPath
|
||||
type_id: ref
|
||||
RootObject:
|
||||
id: RootObject
|
||||
properties:
|
||||
@@ -25,80 +9,9 @@ input:
|
||||
type:
|
||||
type_id: list
|
||||
items:
|
||||
id: input_item
|
||||
type_id: object
|
||||
properties:
|
||||
kubeconfig:
|
||||
display:
|
||||
description: The complete kubeconfig file as a string
|
||||
name: Kubeconfig file contents
|
||||
type:
|
||||
type_id: string
|
||||
required: true
|
||||
namespace:
|
||||
display:
|
||||
description: The namespace where the container will be deployed
|
||||
name: Namespace
|
||||
type:
|
||||
type_id: string
|
||||
required: true
|
||||
node_selector:
|
||||
display:
|
||||
description: kubernetes node name where the plugin must be deployed
|
||||
type:
|
||||
type_id: map
|
||||
values:
|
||||
type_id: string
|
||||
keys:
|
||||
type_id: string
|
||||
required: true
|
||||
duration:
|
||||
display:
|
||||
name: duration the scenario expressed in seconds
|
||||
description: stop stress test after T seconds. One can also specify the units of time in
|
||||
seconds, minutes, hours, days or years with the suffix s, m, h, d or y
|
||||
type:
|
||||
type_id: string
|
||||
required: true
|
||||
io_workers:
|
||||
display:
|
||||
description: number of workers
|
||||
name: start N workers continually writing, reading and removing temporary files
|
||||
type:
|
||||
type_id: integer
|
||||
required: true
|
||||
io_block_size:
|
||||
display:
|
||||
description: single write size
|
||||
name: specify size of each write in bytes. Size can be from 1 byte to 4MB.
|
||||
type:
|
||||
type_id: string
|
||||
required: true
|
||||
io_write_bytes:
|
||||
display:
|
||||
description: Total number of bytes written
|
||||
name: write N bytes for each hdd process, the default is 1 GB. One can specify the size
|
||||
as % of free space on the file system or in units of Bytes, KBytes, MBytes and
|
||||
GBytes using the suffix b, k, m or g
|
||||
type:
|
||||
type_id: string
|
||||
required: true
|
||||
target_pod_folder:
|
||||
display:
|
||||
description: Target Folder
|
||||
name: Folder in the pod where the test will be executed and the test files will be written
|
||||
type:
|
||||
type_id: string
|
||||
required: true
|
||||
target_pod_volume:
|
||||
display:
|
||||
name: kubernetes volume definition
|
||||
description: the volume that will be attached to the pod. In order to stress
|
||||
the node storage only hostPath mode is currently supported
|
||||
type:
|
||||
type_id: ref
|
||||
id: Volume
|
||||
required: true
|
||||
id: SubRootObject
|
||||
type_id: ref
|
||||
namespace: $.steps.workload_loop.execute.inputs.items
|
||||
steps:
|
||||
workload_loop:
|
||||
kind: foreach
|
||||
|
||||
@@ -4,7 +4,7 @@ deployers:
|
||||
connection: {}
|
||||
deployer_name: kubernetes
|
||||
log:
|
||||
level: debug
|
||||
level: error
|
||||
logged_outputs:
|
||||
error:
|
||||
level: error
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
input_list:
|
||||
- duration: 30s
|
||||
- duration: 30
|
||||
vm_bytes: 10%
|
||||
vm_workers: 2
|
||||
# set the node selector as a key-value pair eg.
|
||||
|
||||
@@ -1,9 +1,9 @@
|
||||
version: v0.2.0
|
||||
input:
|
||||
root: RootObject
|
||||
root: SubRootObject
|
||||
objects:
|
||||
RootObject:
|
||||
id: input_item
|
||||
SubRootObject:
|
||||
id: SubRootObject
|
||||
properties:
|
||||
kubeconfig:
|
||||
display:
|
||||
@@ -34,7 +34,7 @@ input:
|
||||
name: duration the scenario expressed in seconds
|
||||
description: stop stress test after T seconds. One can also specify the units of time in seconds, minutes, hours, days or years with the suffix s, m, h, d or y
|
||||
type:
|
||||
type_id: string
|
||||
type_id: integer
|
||||
required: true
|
||||
vm_workers:
|
||||
display:
|
||||
@@ -60,17 +60,16 @@ steps:
|
||||
kubeconfig: !expr $.input.kubeconfig
|
||||
stressng:
|
||||
plugin:
|
||||
src: quay.io/arcalot/arcaflow-plugin-stressng:0.5.0
|
||||
src: quay.io/arcalot/arcaflow-plugin-stressng:0.6.0
|
||||
deployment_type: image
|
||||
step: workload
|
||||
input:
|
||||
cleanup: "true"
|
||||
StressNGParams:
|
||||
timeout: !expr $.input.duration
|
||||
stressors:
|
||||
- stressor: vm
|
||||
vm: !expr $.input.vm_workers
|
||||
vm_bytes: !expr $.input.vm_bytes
|
||||
timeout: !expr $.input.duration
|
||||
stressors:
|
||||
- stressor: vm
|
||||
workers: !expr $.input.vm_workers
|
||||
vm-bytes: !expr $.input.vm_bytes
|
||||
deploy:
|
||||
deployer_name: kubernetes
|
||||
connection: !expr $.steps.kubeconfig.outputs.success.connection
|
||||
|
||||
@@ -9,54 +9,10 @@ input:
|
||||
type:
|
||||
type_id: list
|
||||
items:
|
||||
id: input_item
|
||||
type_id: object
|
||||
properties:
|
||||
kubeconfig:
|
||||
display:
|
||||
description: The complete kubeconfig file as a string
|
||||
name: Kubeconfig file contents
|
||||
type:
|
||||
type_id: string
|
||||
required: true
|
||||
namespace:
|
||||
display:
|
||||
description: The namespace where the container will be deployed
|
||||
name: Namespace
|
||||
type:
|
||||
type_id: string
|
||||
required: true
|
||||
node_selector:
|
||||
display:
|
||||
description: kubernetes node name where the plugin must be deployed
|
||||
type:
|
||||
type_id: map
|
||||
values:
|
||||
type_id: string
|
||||
keys:
|
||||
type_id: string
|
||||
required: true
|
||||
duration:
|
||||
display:
|
||||
name: duration the scenario expressed in seconds
|
||||
description: stop stress test after T seconds. One can also specify the units of time in seconds, minutes, hours, days or years with the suffix s, m, h, d or y
|
||||
type:
|
||||
type_id: string
|
||||
required: true
|
||||
vm_workers:
|
||||
display:
|
||||
description: Number of VM stressors to be run (0 means 1 stressor per CPU)
|
||||
name: Number of VM stressors
|
||||
type:
|
||||
type_id: integer
|
||||
required: true
|
||||
vm_bytes:
|
||||
display:
|
||||
description: N bytes per vm process, the default is 256MB. The size can be expressed in units of Bytes, KBytes, MBytes and GBytes using the suffix b, k, m or g.
|
||||
name: Kubeconfig file contents
|
||||
type:
|
||||
type_id: string
|
||||
required: true
|
||||
id: SubRootObject
|
||||
type_id: ref
|
||||
namespace: $.steps.workload_loop.execute.inputs.items
|
||||
|
||||
steps:
|
||||
workload_loop:
|
||||
kind: foreach
|
||||
|
||||
@@ -3,8 +3,4 @@
|
||||
config:
|
||||
namespace_pattern: ^kube-system$
|
||||
label_selector: component=kube-scheduler
|
||||
- id: wait-for-pods
|
||||
config:
|
||||
namespace_pattern: ^kube-system$
|
||||
label_selector: component=kube-scheduler
|
||||
count: 3
|
||||
krkn_pod_recovery_time: 120
|
||||
|
||||
@@ -4,3 +4,4 @@
|
||||
name_pattern: ^nginx-.*$
|
||||
namespace_pattern: ^default$
|
||||
kill: 1
|
||||
krkn_pod_recovery_time: 120
|
||||
|
||||
@@ -3,8 +3,4 @@
|
||||
config:
|
||||
namespace_pattern: ^kube-system$
|
||||
label_selector: k8s-app=kube-scheduler
|
||||
- id: wait-for-pods
|
||||
config:
|
||||
namespace_pattern: ^kube-system$
|
||||
label_selector: k8s-app=kube-scheduler
|
||||
count: 3
|
||||
krkn_pod_recovery_time: 120
|
||||
|
||||
56
scenarios/kube/service_hijacking.yaml
Normal file
@@ -0,0 +1,56 @@
# refer to the documentation for further info: https://github.com/krkn-chaos/krkn/blob/main/docs/service_hijacking.md

service_target_port: http-web-svc # The port of the service to be hijacked (can be named or numeric, based on the workload and service configuration).
service_name: nginx-service # The name of the service to be hijacked.
service_namespace: default # The namespace where the target service is located.
image: quay.io/krkn-chaos/krkn-service-hijacking:v0.1.3 # Image of the krkn web service to be deployed to receive traffic.
chaos_duration: 30 # Total duration of the chaos scenario in seconds.
plan:
  - resource: "/list/index.php" # Specifies the resource or path to respond to in the scenario. For paths, both the path and query parameters are captured but ignored.
                                # For resources, only query parameters are captured.

    steps:        # A time-based plan consisting of steps can be defined for each resource.
      GET:        # One or more HTTP methods can be specified for each step.
                  # Note: Non-standard methods are supported
                  # for fully custom web services (e.g., using NONEXISTENT instead of POST).

        - duration: 15 # Duration in seconds for this step before moving to the next one, if defined. Otherwise,
                       # this step will continue until the chaos scenario ends.

          status: 500  # HTTP status code to be returned in this step.
          mime_type: "application/json" # MIME type of the response for this step.
          payload: |   # The response payload for this step.
            {
              "status":"internal server error"
            }
        - duration: 15
          status: 201
          mime_type: "application/json"
          payload: |
            {
              "status":"resource created"
            }
      POST:
        - duration: 15
          status: 401
          mime_type: "application/json"
          payload: |
            {
              "status": "unauthorized"
            }
        - duration: 15
          status: 404
          mime_type: "text/plain"
          payload: "not found"

  - resource: "/patch"
    steps:
      PATCH:
        - duration: 15
          status: 201
          mime_type: "text/plain"
          payload: "resource patched"
        - duration: 15
          status: 400
          mime_type: "text/plain"
          payload: "bad request"
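For reference, a plan like the one above can be sanity-checked from outside the cluster while it runs. The short sketch below is a hypothetical helper (not part of this change): it assumes the hijacked nginx-service has been made reachable locally (for example via `kubectl port-forward`) at SERVICE_URL, and simply reports the status codes returned over time.

```
# Hypothetical probe: polls the hijacked service while the plan above is active
# and prints the HTTP status returned at each step (e.g. 500 for the first 15s
# of the GET plan, then 201). SERVICE_URL is an assumption; adjust to your setup.
import time
import requests

SERVICE_URL = "http://localhost:8080/list/index.php"  # assumed port-forward target

def probe(duration_seconds: int = 30, interval: int = 5) -> None:
    end = time.time() + duration_seconds
    while time.time() < end:
        try:
            resp = requests.get(SERVICE_URL, timeout=2)
            print(f"{time.strftime('%H:%M:%S')} GET {SERVICE_URL} -> {resp.status_code}")
        except requests.RequestException as exc:
            print(f"{time.strftime('%H:%M:%S')} GET {SERVICE_URL} -> error: {exc}")
        time.sleep(interval)

if __name__ == "__main__":
    probe()
```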
@@ -5,4 +5,4 @@ scenarios:
|
||||
container_name: "etcd"
|
||||
action: 1
|
||||
count: 1
|
||||
expected_recovery_time: 60
|
||||
expected_recovery_time: 120
|
||||
|
||||
@@ -3,8 +3,4 @@
|
||||
config:
|
||||
namespace_pattern: ^acme-air$
|
||||
name_pattern: .*
|
||||
- id: wait-for-pods
|
||||
config:
|
||||
namespace_pattern: ^acme-air$
|
||||
name_pattern: .*
|
||||
count: 8
|
||||
krkn_pod_recovery_time: 120
|
||||
@@ -3,8 +3,4 @@
|
||||
config:
|
||||
namespace_pattern: ^openshift-etcd$
|
||||
label_selector: k8s-app=etcd
|
||||
- id: wait-for-pods
|
||||
config:
|
||||
namespace_pattern: ^openshift-etcd$
|
||||
label_selector: k8s-app=etcd
|
||||
count: 3
|
||||
krkn_pod_recovery_time: 120
|
||||
|
||||
@@ -3,8 +3,5 @@
|
||||
config:
|
||||
namespace_pattern: ^openshift-apiserver$
|
||||
label_selector: app=openshift-apiserver-a
|
||||
- id: wait-for-pods
|
||||
config:
|
||||
namespace_pattern: ^openshift-apiserver$
|
||||
label_selector: app=openshift-apiserver-a
|
||||
count: 3
|
||||
krkn_pod_recovery_time: 120
|
||||
|
||||
|
||||
@@ -3,8 +3,5 @@
|
||||
config:
|
||||
namespace_pattern: ^openshift-kube-apiserver$
|
||||
label_selector: app=openshift-kube-apiserver
|
||||
- id: wait-for-pods
|
||||
config:
|
||||
namespace_pattern: ^openshift-kube-apiserver$
|
||||
label_selector: app=openshift-kube-apiserver
|
||||
count: 3
|
||||
krkn_pod_recovery_time: 120
|
||||
|
||||
|
||||
@@ -3,8 +3,4 @@
|
||||
config:
|
||||
namespace_pattern: ^openshift-monitoring$
|
||||
label_selector: app=prometheus
|
||||
- id: wait-for-pods
|
||||
config:
|
||||
namespace_pattern: ^openshift-monitoring$
|
||||
label_selector: app=prometheus
|
||||
count: 2
|
||||
krkn_pod_recovery_time: 120
|
||||
@@ -2,8 +2,4 @@
|
||||
config:
|
||||
namespace_pattern: ^openshift-monitoring$
|
||||
label_selector: statefulset.kubernetes.io/pod-name=prometheus-k8s-0
|
||||
- id: wait-for-pods
|
||||
config:
|
||||
namespace_pattern: ^openshift-monitoring$
|
||||
label_selector: statefulset.kubernetes.io/pod-name=prometheus-k8s-0
|
||||
count: 1
|
||||
krkn_pod_recovery_time: 120
|
||||
@@ -3,9 +3,4 @@
|
||||
config:
|
||||
namespace_pattern: ^openshift-monitoring$
|
||||
label_selector: app=prometheus
|
||||
- id: wait-for-pods
|
||||
config:
|
||||
namespace_pattern: ^openshift-monitoring$
|
||||
label_selector: app=prometheus
|
||||
count: 2
|
||||
timeout: 180
|
||||
krkn_pod_recovery_time: 120
|
||||
|
||||
@@ -4,3 +4,4 @@
|
||||
namespace_pattern: ^openshift-.*$
|
||||
name_pattern: .*
|
||||
kill: 3
|
||||
krkn_pod_recovery_time: 120
|
||||
|
||||
@@ -60,7 +60,14 @@
|
||||
"default": 1,
|
||||
"title": "Backoff",
|
||||
"description": "How many seconds to wait between checks for the target pod status."
|
||||
},
|
||||
"krkn_pod_recovery_time": {
|
||||
"type": "integer",
|
||||
"default": 30,
|
||||
"title": "Recovery Time",
|
||||
"description": "The Expected Recovery time fo the pod (used by Krkn to monitor the pod lifecycle)."
|
||||
}
|
||||
|
||||
},
|
||||
"required": [
|
||||
"namespace_pattern"
|
||||
@@ -112,6 +119,12 @@
|
||||
"default": 1,
|
||||
"title": "Backoff",
|
||||
"description": "How many seconds to wait between checks for the target pod status."
|
||||
},
|
||||
"krkn_pod_recovery_time": {
|
||||
"type": "integer",
|
||||
"default": 30,
|
||||
"title": "Recovery Time",
|
||||
"description": "The Expected Recovery time fo the pod (used by Krkn to monitor the pod lifecycle)."
|
||||
}
|
||||
},
|
||||
"required": [
|
||||
|
||||
@@ -39,7 +39,7 @@ class NetworkScenariosTest(unittest.TestCase):
|
||||
|
||||
def test_network_chaos(self):
|
||||
output_id, output_data = ingress_shaping.network_chaos(
|
||||
ingress_shaping.NetworkScenarioConfig(
|
||||
params=ingress_shaping.NetworkScenarioConfig(
|
||||
label_selector="node-role.kubernetes.io/control-plane",
|
||||
instance_count=1,
|
||||
network_params={
|
||||
@@ -47,7 +47,8 @@ class NetworkScenariosTest(unittest.TestCase):
|
||||
"loss": "0.02",
|
||||
"bandwidth": "100mbit"
|
||||
}
|
||||
)
|
||||
),
|
||||
run_id="network-shaping-test"
|
||||
)
|
||||
if output_id == "error":
|
||||
logging.error(output_data.error)
|
||||
|
||||
@@ -10,7 +10,7 @@ class RunPythonPluginTest(unittest.TestCase):
|
||||
tmp_file = tempfile.NamedTemporaryFile()
|
||||
tmp_file.write(bytes("print('Hello world!')", 'utf-8'))
|
||||
tmp_file.flush()
|
||||
output_id, output_data = run_python_file(RunPythonFileInput(tmp_file.name))
|
||||
output_id, output_data = run_python_file(params=RunPythonFileInput(tmp_file.name), run_id="test-python-plugin-success")
|
||||
self.assertEqual("success", output_id)
|
||||
self.assertEqual("Hello world!\n", output_data.stdout)
|
||||
|
||||
@@ -18,7 +18,7 @@ class RunPythonPluginTest(unittest.TestCase):
|
||||
tmp_file = tempfile.NamedTemporaryFile()
|
||||
tmp_file.write(bytes("import sys\nprint('Hello world!')\nsys.exit(42)\n", 'utf-8'))
|
||||
tmp_file.flush()
|
||||
output_id, output_data = run_python_file(RunPythonFileInput(tmp_file.name))
|
||||
output_id, output_data = run_python_file(params=RunPythonFileInput(tmp_file.name), run_id="test-python-plugin-error")
|
||||
self.assertEqual("error", output_id)
|
||||
self.assertEqual(42, output_data.exit_code)
|
||||
self.assertEqual("Hello world!\n", output_data.stdout)
|
||||
|
||||
40
utils/chaos_ai/README.md
Normal file
@@ -0,0 +1,40 @@
# aichaos
Enhancing Chaos Engineering with AI-assisted fault injection for better resiliency and non-functional testing.

## Generate the Python package wheel file
```
$ python3.9 generate_wheel_package.py sdist bdist_wheel
$ cp dist/aichaos-0.0.1-py3-none-any.whl docker/
```
This creates the Python package file aichaos-0.0.1-py3-none-any.whl in the dist folder.

## Build Image
```
$ cd docker
$ podman build -t aichaos:1.0 .
OR
$ docker build -t aichaos:1.0 .
```

## Run Chaos AI
```
$ podman run -v aichaos-config.json:/config/aichaos-config.json --privileged=true --name aichaos -p 5001:5001 aichaos:1.0
OR
$ docker run -v aichaos-config.json:/config/aichaos-config.json --privileged -v /var/run/docker.sock:/var/run/docker.sock --name aichaos -p 5001:5001 aichaos:1.0
```

The output should look like:
```
$ podman run -v aichaos-config.json:/config/aichaos-config.json --privileged=true --name aichaos -p 5001:5001 aichaos:1.0
 * Serving Flask app 'swagger_api' (lazy loading)
 * Environment: production
   WARNING: This is a development server. Do not use it in a production deployment.
   Use a production WSGI server instead.
 * Debug mode: on
WARNING: This is a development server. Do not use it in a production deployment. Use a production WSGI server instead.
 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5001
 * Running on http://172.17.0.2:5001
```

You can try out the APIs in a browser at http://<server-ip>:5001/apidocs (e.g. http://127.0.0.1:5001/apidocs). To get started, try the "GenerateChaos" API with a kubeconfig file and the application URLs to test.
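Besides the Swagger UI, the same endpoints can be driven programmatically. The snippet below is a minimal sketch (not part of this change) that assumes the container above is reachable on 127.0.0.1:5001 and that a kubeconfig file sits in the working directory; it submits a GenerateChaos request and polls the run until it completes.

```
# Hypothetical client for the Chaos AI REST API exposed by swagger_api.py.
import time
import requests

BASE = "http://127.0.0.1:5001"

with open("kubeconfig", "rb") as kubeconfig:
    resp = requests.post(
        f"{BASE}/GenerateChaos/",
        files={"file": kubeconfig},
        data={
            "namespace": "robot-shop",
            "podlabels": "service=cart,service=payment",
            "urls": "http://<application-url>:8097/api/cart/health",
        },
        timeout=30,
    )
resp.raise_for_status()
chaos_id = resp.text.split(":")[-1].strip()  # response body is "Chaos ID: <n>"

# Poll until the run finishes, then fetch the learned Q-table and the episodes.
while requests.get(f"{BASE}/GetStatus/{chaos_id}", timeout=10).text == "Running":
    time.sleep(30)
print(requests.get(f"{BASE}/GetQTable/{chaos_id}", timeout=10).text)
print(requests.get(f"{BASE}/GetEpisodes/{chaos_id}", timeout=10).text)
```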
0
utils/chaos_ai/config/experiments/.gitkeep
Normal file
21
utils/chaos_ai/docker/Dockerfile
Normal file
@@ -0,0 +1,21 @@
|
||||
FROM bitnami/kubectl:1.20.9 as kubectl
|
||||
FROM python:3.9
|
||||
WORKDIR /app
|
||||
RUN pip3 install --upgrade pip
|
||||
COPY config config/
|
||||
COPY requirements.txt .
|
||||
RUN mkdir -p /app/logs
|
||||
RUN pip3 install -r requirements.txt
|
||||
|
||||
COPY --from=kubectl /opt/bitnami/kubectl/bin/kubectl /usr/local/bin/
|
||||
|
||||
COPY swagger_api.py .
|
||||
ENV PYTHONUNBUFFERED=1
|
||||
|
||||
RUN curl -fsSLO https://get.docker.com/builds/Linux/x86_64/docker-17.03.1-ce.tgz && tar --strip-components=1 -xvzf docker-17.03.1-ce.tgz -C /usr/local/bin
|
||||
|
||||
RUN apt-get update && apt-get install -y podman
|
||||
|
||||
COPY aichaos-0.0.1-py3-none-any.whl .
|
||||
RUN pip3 install aichaos-0.0.1-py3-none-any.whl
|
||||
CMD ["python3", "swagger_api.py"]
|
||||
7
utils/chaos_ai/docker/aichaos-config.json
Normal file
@@ -0,0 +1,7 @@
|
||||
{
|
||||
"command": "podman",
|
||||
"chaosengine": "kraken",
|
||||
"faults": "pod-delete",
|
||||
"iterations": 1,
|
||||
"maxfaults": 5
|
||||
}
|
||||
15
utils/chaos_ai/docker/config/experiments/log.yml
Executable file
@@ -0,0 +1,15 @@
|
||||
|
||||
Get Log from the Chaos ID.
---
|
||||
tags:
|
||||
- ChaosAI API Results
|
||||
parameters:
|
||||
- name: chaosid
|
||||
in: path
|
||||
type: string
|
||||
required: true
|
||||
description: Chaos-ID
|
||||
responses:
|
||||
500:
|
||||
description: Error!
|
||||
200:
|
||||
description: Results for the given Chaos ID.
|
||||
36
utils/chaos_ai/docker/config/pod-delete.json
Normal file
@@ -0,0 +1,36 @@
|
||||
{
|
||||
"apiVersion": "1.0",
|
||||
"kind": "ChaosEngine",
|
||||
"metadata": {
|
||||
"name": "engine-cartns3"
|
||||
},
|
||||
"spec": {
|
||||
"engineState": "active",
|
||||
"annotationCheck": "false",
|
||||
"appinfo": {
|
||||
"appns": "robot-shop",
|
||||
"applabel": "service=payment",
|
||||
"appkind": "deployment"
|
||||
},
|
||||
"chaosServiceAccount": "pod-delete-sa",
|
||||
"experiments": [
|
||||
{
|
||||
"name": "pod-delete",
|
||||
"spec": {
|
||||
"components": {
|
||||
"env": [
|
||||
{
|
||||
"name": "FORCE",
|
||||
"value": "true"
|
||||
},
|
||||
{
|
||||
"name": "TOTAL_CHAOS_DURATION",
|
||||
"value": "120"
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
}
|
||||
]
|
||||
}
|
||||
}
|
||||
40
utils/chaos_ai/docker/config/yml/chaosGen.yml
Executable file
@@ -0,0 +1,40 @@
|
||||
|
||||
Generate chaos on an application deployed on a cluster.
|
||||
---
|
||||
tags:
|
||||
- ChaosAI API
|
||||
parameters:
|
||||
- name: file
|
||||
in: formData
|
||||
type: file
|
||||
required: true
|
||||
description: Kube-config file
|
||||
- name: namespace
|
||||
in: formData
|
||||
type: string
|
||||
default: robot-shop
|
||||
required: true
|
||||
description: Namespace to test
|
||||
- name: podlabels
|
||||
in: formData
|
||||
type: string
|
||||
default: service=cart,service=payment
|
||||
required: true
|
||||
description: Pod labels to test
|
||||
- name: nodelabels
|
||||
in: formData
|
||||
type: string
|
||||
required: false
|
||||
description: Node labels to test
|
||||
- name: urls
|
||||
in: formData
|
||||
type: string
|
||||
default: http://<application-url>:8097/api/cart/health,http://<application-url>:8097/api/payment/health
|
||||
required: true
|
||||
description: Application URLs to test
|
||||
|
||||
responses:
|
||||
500:
|
||||
description: Error!
|
||||
200:
|
||||
description: Chaos ID for the initiated chaos.
|
||||
15
utils/chaos_ai/docker/config/yml/episodes.yml
Executable file
@@ -0,0 +1,15 @@
|
||||
|
||||
Get Episodes from the Chaos ID.
---
|
||||
tags:
|
||||
- ChaosAI API Results
|
||||
parameters:
|
||||
- name: chaosid
|
||||
in: path
|
||||
type: string
|
||||
required: true
|
||||
description: Chaos-ID
|
||||
responses:
|
||||
500:
|
||||
description: Error!
|
||||
200:
|
||||
description: Results for the given Chaos ID.
|
||||
15
utils/chaos_ai/docker/config/yml/log.yml
Executable file
@@ -0,0 +1,15 @@
|
||||
|
||||
Get Log from the Chaos ID.
---
|
||||
tags:
|
||||
- ChaosAI API Results
|
||||
parameters:
|
||||
- name: chaosid
|
||||
in: path
|
||||
type: string
|
||||
required: true
|
||||
description: Chaos-ID
|
||||
responses:
|
||||
500:
|
||||
description: Error!
|
||||
200:
|
||||
description: Results for the given Chaos ID.
|
||||
15
utils/chaos_ai/docker/config/yml/qtable.yml
Executable file
@@ -0,0 +1,15 @@
|
||||
|
||||
Get QTable from the Chaos ID.
---
|
||||
tags:
|
||||
- ChaosAI API Results
|
||||
parameters:
|
||||
- name: chaosid
|
||||
in: path
|
||||
type: string
|
||||
required: true
|
||||
description: Chaos-ID
|
||||
responses:
|
||||
500:
|
||||
description: Error!
|
||||
200:
|
||||
description: Results for the given Chaos ID.
|
||||
15
utils/chaos_ai/docker/config/yml/status.yml
Executable file
@@ -0,0 +1,15 @@
|
||||
|
||||
Get status of the Chaos ID.
---
|
||||
tags:
|
||||
- ChaosAI API
|
||||
parameters:
|
||||
- name: chaosid
|
||||
in: path
|
||||
type: string
|
||||
required: true
|
||||
description: Chaos-ID
|
||||
responses:
|
||||
500:
|
||||
description: Error!
|
||||
200:
|
||||
description: Chaos for the given ID.
|
||||
6
utils/chaos_ai/docker/requirements.txt
Normal file
@@ -0,0 +1,6 @@
|
||||
numpy
|
||||
pandas
|
||||
requests
|
||||
Flask==2.2.5
|
||||
Werkzeug==3.0.3
|
||||
flasgger==0.9.5
|
||||
186
utils/chaos_ai/docker/swagger_api.py
Normal file
@@ -0,0 +1,186 @@
|
||||
import json, os
|
||||
import logging
|
||||
# import numpy as np
|
||||
# import pandas as pd
|
||||
import threading
|
||||
from datetime import datetime
|
||||
from flask import Flask, request
|
||||
from flasgger import Swagger
|
||||
from flasgger.utils import swag_from
|
||||
# import zipfile
|
||||
import sys
|
||||
|
||||
# sys.path.append("..")
|
||||
from src.aichaos_main import AIChaos
|
||||
|
||||
app = Flask(__name__)
|
||||
Swagger(app)
|
||||
flaskdir = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "app", "logs") + '/'
|
||||
|
||||
|
||||
class AIChaosSwagger:
|
||||
def __init__(self, flaskdir=''):
|
||||
self.flaskdir = flaskdir
|
||||
|
||||
@app.route("/")
|
||||
def empty(params=''):
|
||||
return "AI Chaos Repository!"
|
||||
|
||||
def startchaos(self, kubeconfigfile, file_id, params):
|
||||
print('[StartChaos]', file_id, kubeconfigfile)
|
||||
dir = flaskdir
|
||||
outfile = ''.join([dir, 'out-', file_id])
|
||||
initfile = ''.join([dir, 'init-', file_id])
|
||||
with open(initfile, 'w'):
|
||||
pass
|
||||
if os.path.exists(outfile):
|
||||
os.remove(outfile)
|
||||
# kubeconfigfile = params['file']
|
||||
os.environ["KUBECONFIG"] = kubeconfigfile
|
||||
os.system("export KUBECONFIG="+kubeconfigfile)
|
||||
os.system("echo $KUBECONFIG")
|
||||
print('setting kubeconfig')
|
||||
params['command'] = 'podman'
|
||||
params['chaosengine'] = 'kraken'
|
||||
params['faults'] = 'pod-delete'
|
||||
params['iterations'] = 1
|
||||
params['maxfaults'] = 5
|
||||
if os.path.isfile('/config/aichaos-config.json'):
|
||||
with open('/config/aichaos-config.json') as f:
|
||||
config_params = json.load(f)
|
||||
params['command'] = config_params['command']
|
||||
params['chaosengine'] = config_params['chaosengine']
|
||||
params['faults']= config_params['faults']
|
||||
params['iterations'] = config_params['iterations']
|
||||
params['maxfaults'] = config_params['maxfaults']
|
||||
# faults = [f + ':' + p for f in params['faults'].split(',') for p in params['podlabels'].split(',')]
|
||||
faults = []
|
||||
for f in params['faults'].split(','):
|
||||
if f in ['pod-delete']:
|
||||
for p in params['podlabels'].split(','):
|
||||
faults.append(f + ':' + p)
|
||||
elif f in ['network-chaos', 'node-memory-hog', 'node-cpu-hog']:
|
||||
for p in params['nodelabels'].split(','):
|
||||
faults.append(f + ':' + p)
|
||||
else:
|
||||
pass
|
||||
|
||||
print('#faults:', len(faults), faults)
|
||||
states = {'200': 0, '500': 1, '501': 2, '502': 3, '503': 4, '504': 5,
|
||||
'401': 6, '403': 7, '404': 8, '429': 9,
|
||||
'Timeout': 10, 'Other': 11}
|
||||
rewards = {'200': -1, '500': 0.8, '501': 0.8, '502': 0.8, '503': 0.8, '504': 0.8,
|
||||
'401': 1, '403': 1, '404': 1, '429': 1,
|
||||
'Timeout': 1, 'Other': 1}
|
||||
logfile = self.flaskdir + 'log_' + str(file_id)
|
||||
qfile = self.flaskdir + 'qfile_' + str(file_id) + '.csv'
|
||||
efile = self.flaskdir + 'efile_' + str(file_id)
|
||||
epfile = self.flaskdir + 'episodes_' + str(file_id) + '.json'
|
||||
# probe_url = params['probeurl']
|
||||
cexp = {'pod-delete': 'pod-delete.json', 'cpu-hog': 'pod-cpu-hog.json',
|
||||
'disk-fill': 'disk-fill.json', 'network-loss': 'network-loss.json',
|
||||
'network-corruption': 'network-corruption.json', 'io-stress': 'io-stress.json'}
|
||||
aichaos = AIChaos(states=states, faults=faults, rewards=rewards,
|
||||
logfile=logfile, qfile=qfile, efile=efile, epfile=epfile,
|
||||
urls=params['urls'].split(','), namespace=params['namespace'],
|
||||
max_faults=int(params['maxfaults']),
|
||||
num_requests=10, timeout=2,
|
||||
chaos_engine=params['chaosengine'],
|
||||
chaos_dir='config/', kubeconfig=kubeconfigfile,
|
||||
loglevel=logging.DEBUG, chaos_experiment=cexp, iterations=int(params['iterations']),
|
||||
command=params['command'])
|
||||
print('checking kubeconfig')
|
||||
os.system("echo $KUBECONFIG")
|
||||
aichaos.start_chaos()
|
||||
|
||||
file = open(outfile, "w")
|
||||
file.write('done')
|
||||
file.close()
|
||||
os.remove(initfile)
|
||||
# os.remove(csvfile)
|
||||
# ConstraintsInference().remove_temp_files(dir, file_id)
|
||||
return 'WRITE'
|
||||
|
||||
@app.route('/GenerateChaos/', methods=['POST'])
|
||||
@swag_from('config/yml/chaosGen.yml')
|
||||
def chaos_gen():
|
||||
dir = flaskdir
|
||||
sw = AIChaosSwagger(flaskdir=dir)
|
||||
f = request.files['file']
|
||||
list = os.listdir(dir)
|
||||
for i in range(10000):
|
||||
fname = 'kubeconfig-'+str(i)
|
||||
if fname not in list:
|
||||
break
|
||||
kubeconfigfile = ''.join([dir, 'kubeconfig-', str(i)])
|
||||
f.save(kubeconfigfile)
|
||||
# creating empty file
|
||||
open(kubeconfigfile, 'a').close()
|
||||
# print('HEADER:', f.headers)
|
||||
print('[GenerateChaos] reqs:', request.form.to_dict())
|
||||
# print('[GenerateChaos]', f.filename, datetime.now())
|
||||
thread = threading.Thread(target=sw.startchaos, args=(kubeconfigfile, str(i), request.form.to_dict()))
|
||||
thread.daemon = True
|
||||
print(thread.getName())
|
||||
thread.start()
|
||||
return 'Chaos ID: ' + str(i)
|
||||
|
||||
@app.route('/GetStatus/<chaosid>', methods=['GET'])
|
||||
@swag_from('config/yml/status.yml')
|
||||
def get_status(chaosid):
|
||||
print('[GetStatus]', chaosid, flaskdir)
|
||||
epfile = flaskdir + 'episodes_' + str(chaosid) + '.json'
|
||||
initfile = ''.join([flaskdir, 'init-', chaosid])
|
||||
if os.path.exists(epfile):
|
||||
return 'Completed'
|
||||
elif os.path.exists(initfile):
|
||||
return 'Running'
|
||||
else:
|
||||
return 'Does not exist'
|
||||
|
||||
@app.route('/GetQTable/<chaosid>', methods=['GET'])
|
||||
@swag_from('config/yml/qtable.yml')
|
||||
def get_qtable(chaosid):
|
||||
print('[GetQTable]', chaosid)
|
||||
qfile = flaskdir + 'qfile_' + str(chaosid) + '.csv'
|
||||
initfile = ''.join([flaskdir, 'init-', chaosid])
|
||||
if os.path.exists(qfile):
|
||||
f = open(qfile, "r")
|
||||
return f.read()
|
||||
elif os.path.exists(initfile):
|
||||
return 'Running'
|
||||
else:
|
||||
return 'Invalid Chaos ID: ' + chaosid
|
||||
|
||||
@app.route('/GetEpisodes/<chaosid>', methods=['GET'])
|
||||
@swag_from('config/yml/episodes.yml')
|
||||
def get_episodes(chaosid):
|
||||
print('[GetEpisodes]', chaosid)
|
||||
epfile = flaskdir + 'episodes_' + str(chaosid) + '.json'
|
||||
initfile = ''.join([flaskdir, 'init-', chaosid])
|
||||
if os.path.exists(epfile):
|
||||
f = open(epfile, "r")
|
||||
return f.read()
|
||||
elif os.path.exists(initfile):
|
||||
return 'Running'
|
||||
else:
|
||||
return 'Invalid Chaos ID: ' + chaosid
|
||||
|
||||
|
||||
@app.route('/GetLog/<chaosid>', methods=['GET'])
|
||||
@swag_from('config/yml/log.yml')
|
||||
def get_log(chaosid):
|
||||
print('[GetLog]', chaosid)
|
||||
epfile = flaskdir + 'log_' + str(chaosid)
|
||||
initfile = ''.join([flaskdir, 'init-', chaosid])
|
||||
if os.path.exists(epfile):
|
||||
f = open(epfile, "r")
|
||||
return f.read()
|
||||
elif os.path.exists(initfile):
|
||||
return 'Running'
|
||||
else:
|
||||
return 'Invalid Chaos ID: ' + chaosid
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
app.run(debug=True, host='0.0.0.0', port='5001')
|
||||
21
utils/chaos_ai/generate_wheel_package.py
Normal file
@@ -0,0 +1,21 @@
|
||||
import setuptools
|
||||
# from setuptools_cythonize import get_cmdclass
|
||||
|
||||
setuptools.setup(
|
||||
# cmdclass=get_cmdclass(),
|
||||
name="aichaos",
|
||||
version="0.0.1",
|
||||
author="Sandeep Hans",
|
||||
author_email="shans001@in.ibm.com",
|
||||
description="Chaos AI",
|
||||
long_description="Chaos Engineering using AI",
|
||||
long_description_content_type="text/markdown",
|
||||
url="",
|
||||
packages=setuptools.find_packages(),
|
||||
classifiers=[
|
||||
"Programming Language :: Python :: 3",
|
||||
"License :: OSI Approved :: MIT License",
|
||||
"Operating System :: OS Independent",
|
||||
],
|
||||
python_requires='>=3.9',
|
||||
)
|
||||
10
utils/chaos_ai/requirements.txt
Normal file
@@ -0,0 +1,10 @@
|
||||
numpy
|
||||
pandas
|
||||
notebook
|
||||
jupyterlab
|
||||
jupyter
|
||||
seaborn
|
||||
requests
|
||||
wheel
|
||||
Flask==2.1.0
|
||||
flasgger==0.9.5
|
||||
0
utils/chaos_ai/src/__init__.py
Normal file
213
utils/chaos_ai/src/aichaos.py
Normal file
@@ -0,0 +1,213 @@
|
||||
import json
|
||||
import os
|
||||
import random
|
||||
import sys
|
||||
|
||||
import numpy as np
|
||||
import logging
|
||||
|
||||
|
||||
class AIChaos:
|
||||
def __init__(self, states=None, faults=None, rewards=None, pod_names=[], chaos_dir=None,
|
||||
chaos_experiment='experiment.json',
|
||||
chaos_journal='journal.json', iterations=1000, static_run=False):
|
||||
self.faults = faults
|
||||
self.pod_names = pod_names
|
||||
self.states = states
|
||||
self.rewards = rewards
|
||||
self.episodes = []
|
||||
|
||||
self.chaos_dir = chaos_dir
|
||||
self.chaos_experiment = chaos_experiment
|
||||
self.chaos_journal = chaos_journal
|
||||
|
||||
self.iterations = iterations
|
||||
# Initialize parameters
|
||||
self.gamma = 0.75 # Discount factor
|
||||
self.alpha = 0.9 # Learning rate
|
||||
|
||||
# Initializing Q-Values
|
||||
# self.Q = np.array(np.zeros([9, 9]))
|
||||
# self.Q = np.array(np.zeros([len(faults), len(faults)]))
|
||||
# currently action is a single fault, later on we will do multiple faults together
|
||||
# For multiple faults, the no of cols in q-matrix will be all combinations of faults (infinite)
|
||||
# eg. {f1,f2},f3,f4,{f4,f5} - f1,f2 in parallel, then f3, then f4, then f4,f5 in parallel produces end state
|
||||
# self.Q = np.array(np.zeros([len(states), len(states)]))
|
||||
self.Q = np.array(np.zeros([len(states), len(faults)]))
|
||||
self.state_matrix = np.array(np.zeros([len(states), len(states)]))
|
||||
|
||||
# may be Q is a dictionary of dictionaries, for each state there is a dictionary of faults
|
||||
# Q = {'500' = {'f1f2f4': 0.3, 'f1': 0.5}, '404' = {'f2': 0.22}}
|
||||
|
||||
self.logger = logging.getLogger()
|
||||
# run from old static experiment and journal files
|
||||
self.static_run = static_run
|
||||
|
||||
# End state is reached when system is down or return error code like '500','404'
|
||||
def get_next_state(self):
|
||||
self.logger.info('[GET_NEXT_STATE]')
|
||||
f = open(self.chaos_dir + self.chaos_journal)
|
||||
data = json.load(f)
|
||||
|
||||
# before the experiment (if before steady state is false, after is null?)
|
||||
for probe in data['steady_states']['before']['probes']:
|
||||
if not probe['tolerance_met']:
|
||||
# start_state = probe['activity']['tolerance']
|
||||
# end_state = probe['status']
|
||||
start_state, end_state = None, None
|
||||
return start_state, end_state
|
||||
|
||||
# after the experiment
|
||||
for probe in data['steady_states']['after']['probes']:
|
||||
# if probe['output']['status'] == probe['activity']['tolerance']:
|
||||
if not probe['tolerance_met']:
|
||||
# print(probe)
|
||||
start_state = probe['activity']['tolerance']
|
||||
end_state = probe['output']['status']
|
||||
# end_state = probe['status']
|
||||
return start_state, end_state
|
||||
# if tolerances for all probes are met
|
||||
start_state = probe['activity']['tolerance']
|
||||
end_state = probe['activity']['tolerance']
|
||||
return start_state, end_state
|
||||
|
||||
def inject_faults(self, fault, pod_name):
|
||||
self.logger.info('[INJECT_FAULT] ' + fault)
|
||||
f = open(self.chaos_dir + self.chaos_experiment)
|
||||
data = json.load(f)
|
||||
for m in data['method']:
|
||||
if 'provider' in m:
|
||||
if fault == 'kill_microservice':
|
||||
m['name'] = 'kill-microservice'
|
||||
m['provider']['module'] = 'chaosk8s.actions'
|
||||
m['provider']['arguments']['name'] = pod_name
|
||||
else:
|
||||
m['provider']['arguments']['name_pattern'] = pod_name
|
||||
m['provider']['func'] = fault
|
||||
|
||||
print('[INJECT_FAULT] method:', m)
|
||||
# self.logger.info('[INJECT_FAULT] ' + m['provider']['arguments']['name_pattern'])
|
||||
# self.logger.info('[INJECT_FAULT] ' + str(m))
|
||||
|
||||
exp_file = self.chaos_dir + 'experiment_' + str(random.randint(1, 10)) + '.json'
|
||||
with open(exp_file, 'w') as f:
|
||||
json.dump(data, f)
|
||||
exp_file = self.chaos_dir + 'experiment.json'
|
||||
# execute faults
|
||||
# cmd = 'cd ' + self.chaos_dir + ';chaos run ' + self.chaos_experiment
|
||||
cmd = 'cd ' + self.chaos_dir + ';chaos run ' + exp_file
|
||||
if not self.static_run:
|
||||
os.system(cmd)
|
||||
|
||||
def create_episode(self):
|
||||
self.logger.info('[CREATE_EPISODE]')
|
||||
episode = []
|
||||
while True:
|
||||
# inject more faults
|
||||
# TODO: model - choose faults based on q-learning ...
|
||||
fault_pod = random.choice(self.faults)
|
||||
fault = fault_pod.split(':')[0]
|
||||
pod_name = fault_pod.split(':')[1]
|
||||
# fault = random.choice(self.faults)
|
||||
# pod_name = random.choice(self.pod_names)
|
||||
# fault = lstm_model.get_next_fault(episode)
|
||||
# fault = get_max_prob_fault(episode)
|
||||
|
||||
self.inject_faults(fault, pod_name)
|
||||
start_state, next_state = self.get_next_state()
|
||||
print('[CREATE EPISODE]', start_state, next_state)
|
||||
# if before state tolerance is not met
|
||||
if start_state is None and next_state is None:
|
||||
continue
|
||||
|
||||
episode.append({'fault': fault, 'pod_name': pod_name})
|
||||
self.update_q_fault(fault_pod, episode, start_state, next_state)
|
||||
# self.update_q_fault(fault, episode, start_state, next_state)
|
||||
# if an end_state is reached
|
||||
# if next_state is not None:
|
||||
if start_state != next_state:
|
||||
self.logger.info('[CREATE_EPISODE] EPISODE CREATED:' + str(episode))
|
||||
self.logger.info('[CREATE_EPISODE] END STATE:' + str(next_state))
|
||||
return episode, start_state, next_state
|
||||
|
||||
def update_q_fault(self, fault, episode, start_state, end_state):
|
||||
self.logger.info('[UPDATE_Q]')
|
||||
print('[UPDATE_Q] ', str(start_state), str(end_state))
|
||||
if end_state is None:
|
||||
end_state = start_state
|
||||
|
||||
# reward is dependent on the error response (eg. '404') and length of episode
|
||||
reward = self.rewards[str(end_state)] / len(episode)
|
||||
current_state = self.states[str(start_state)]
|
||||
next_state = self.states[str(end_state)]
|
||||
fault_index = self.faults.index(fault)
|
||||
|
||||
TD = reward + \
|
||||
self.gamma * self.Q[next_state, np.argmax(self.Q[next_state,])] - \
|
||||
self.Q[current_state, fault_index]
|
||||
self.Q[current_state, fault_index] += self.alpha * TD
|
||||
|
||||
# update state matrix
|
||||
TD_state = reward + \
|
||||
self.gamma * self.state_matrix[next_state, np.argmax(self.state_matrix[next_state,])] - \
|
||||
self.state_matrix[current_state, next_state]
|
||||
self.state_matrix[current_state, next_state] += self.alpha * TD_state
|
||||
|
||||
# def update_q(self, episode, start_state, end_state):
|
||||
# self.logger.info('[UPDATE_Q]')
|
||||
# if end_state is None:
|
||||
# end_state = start_state
|
||||
#
|
||||
# # reward is dependent on the error response (eg. '404') and length of episode
|
||||
# reward = self.rewards[str(end_state)] / len(episode)
|
||||
# current_state = self.states[str(start_state)]
|
||||
# next_state = self.states[str(end_state)]
|
||||
# TD = reward + \
|
||||
# self.gamma * self.Q[next_state, np.argmax(self.Q[next_state,])] - \
|
||||
# self.Q[current_state, next_state]
|
||||
# self.Q[current_state, next_state] += self.alpha * TD
|
||||
|
||||
def start_chaos(self):
|
||||
for i in range(self.iterations):
|
||||
episode, start_state, end_state = self.create_episode()
|
||||
# update Q matrix
|
||||
# will do it with each fault injection
|
||||
# self.update_q(episode, start_state, end_state)
|
||||
print(self.Q)
|
||||
print(self.state_matrix)
|
||||
|
||||
|
||||
def test_chaos():
|
||||
svc_list = ['cart', 'catalogue', 'dispatch', 'mongodb', 'mysql', 'payment', 'rabbitmq', 'ratings', 'redis',
|
||||
'shipping', 'user', 'web']
|
||||
# Define faults
|
||||
# faults = ['terminate_pods']
|
||||
# faults = ['terminate_pods:' + x for x in pod_names]
|
||||
faults = ['kill_microservice:' + x for x in svc_list]
|
||||
# Define the states
|
||||
states = {
|
||||
'200': 0,
|
||||
'500': 1,
|
||||
'404': 2
|
||||
}
|
||||
# Define rewards, currently not used
|
||||
rewards = {
|
||||
'200': 0,
|
||||
'500': 0.8,
|
||||
'404': 1
|
||||
}
|
||||
|
||||
# cdir = '/Users/sandeephans/Downloads/chaos/chaostoolkit-samples-master/service-down-not-visible-to-users/'
|
||||
cdir = '/Users/sandeephans/Downloads/openshift/'
|
||||
cexp = 'experiment.json'
|
||||
cjournal = 'journal.json'
|
||||
|
||||
aichaos = AIChaos(states=states, faults=faults, rewards=rewards,
|
||||
chaos_dir=cdir, chaos_experiment=cexp, chaos_journal=cjournal,
|
||||
static_run=False)
|
||||
aichaos.start_chaos()
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
|
||||
test_chaos()
|
||||
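The update_q_fault method above is a plain tabular Q-learning step. The toy sketch below (illustrative only, with made-up states and a single fault) walks through one temporal-difference update using the same discount factor and learning rate as the class above, so the arithmetic is easy to follow.

```
# Illustrative single Q-learning update mirroring AIChaos.update_q_fault above,
# with made-up values: two states ('200' -> row 0, '500' -> row 1) and one fault.
import numpy as np

gamma, alpha = 0.75, 0.9      # discount factor and learning rate, as in AIChaos
Q = np.zeros((2, 1))          # rows: states, columns: faults

reward = 0.8 / 1              # reward for ending in '500', divided by episode length 1
current_state, next_state, fault_index = 0, 1, 0

# The TD error uses the best action value reachable from the next state.
TD = reward + gamma * Q[next_state, np.argmax(Q[next_state, :])] - Q[current_state, fault_index]
Q[current_state, fault_index] += alpha * TD
print(Q[current_state, fault_index])  # ≈ 0.72 after this single update
```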
248
utils/chaos_ai/src/aichaos_main.py
Normal file
@@ -0,0 +1,248 @@
|
||||
import json
|
||||
import os
|
||||
import random
|
||||
|
||||
import numpy as np
|
||||
import pandas as pd
|
||||
import logging
|
||||
|
||||
# sys.path.insert(1, os.path.join(sys.path[0], '..'))
|
||||
import src.utils as utils
|
||||
from src.kraken_utils import KrakenUtils
|
||||
from src.qlearning import QLearning
|
||||
from src.test_application import TestApplication
|
||||
|
||||
|
||||
class AIChaos:
|
||||
def __init__(self, namespace='robot-shop', states=None, faults=None, rewards=None, urls=[], max_faults=5,
|
||||
service_weights=None, ctd_subsets=None, pod_names=[], chaos_dir='../config/', kubeconfig='~/.kube/config',
|
||||
chaos_experiment='experiment.json', logfile='log', qfile='qfile.csv', efile='efile', epfile='episodes.json',
|
||||
loglevel=logging.INFO,
|
||||
chaos_journal='journal.json', iterations=10, alpha=0.9, gamma=0.2, epsilon=0.3,
|
||||
num_requests=10, sleep_time=1, timeout=2, chaos_engine='kraken', dstk_probes=None,
|
||||
static_run=False, all_faults=False, command='podman'):
|
||||
self.namespace = namespace
|
||||
self.faults = faults
|
||||
self.unused_faults = faults.copy()
|
||||
self.all_faults = all_faults
|
||||
self.pod_names = pod_names
|
||||
self.states = states
|
||||
self.rewards = rewards
|
||||
self.urls = urls
|
||||
self.max_faults = max_faults
|
||||
self.episodes = []
|
||||
self.service_weights = service_weights
|
||||
self.ctd_subsets = ctd_subsets
|
||||
|
||||
self.kubeconfig = kubeconfig
|
||||
self.chaos_dir = chaos_dir
|
||||
self.chaos_experiment = chaos_experiment
|
||||
self.chaos_journal = chaos_journal
|
||||
self.command = command
|
||||
|
||||
if chaos_engine == 'kraken':
|
||||
self.chaos_engine = KrakenUtils(namespace, kubeconfig=kubeconfig, chaos_dir=chaos_dir, chaos_experiment=chaos_experiment, command=self.command)
|
||||
else:
|
||||
self.chaos_engine = None
|
||||
|
||||
self.iterations = iterations
|
||||
# Initialize RL parameters
|
||||
self.epsilon = epsilon # epsilon decay policy
|
||||
# self.epsdecay = 0
|
||||
|
||||
# log files
|
||||
self.logfile = logfile
|
||||
self.qfile = qfile
|
||||
self.efile = efile
|
||||
self.epfile = epfile
|
||||
open(efile, 'w+').close()
|
||||
open(logfile, 'w+').close()
|
||||
open(logfile, 'r+').truncate(0)
|
||||
logging.getLogger("requests").setLevel(logging.WARNING)
|
||||
logging.getLogger("urllib3").setLevel(logging.WARNING)
|
||||
logging.basicConfig(filename=logfile, filemode='w+', level=loglevel)
|
||||
self.logger = logging.getLogger(logfile.replace('/',''))
|
||||
self.logger.addHandler(logging.FileHandler(logfile))
|
||||
|
||||
self.testapp = TestApplication(num_requests, timeout, sleep_time)
|
||||
self.ql = QLearning(gamma, alpha, faults, states, rewards, urls)
|
||||
|
||||
# run from old static experiment and journal files
|
||||
self.static_run = static_run
|
||||
|
||||
def realistic(self, faults_pods):
|
||||
self.logger.debug('[Realistic] ' + str(faults_pods))
|
||||
fp = faults_pods.copy()
|
||||
for f1 in faults_pods:
|
||||
for f2 in faults_pods:
|
||||
if f1 == f2:
|
||||
continue
|
||||
if f1 in fp and f2 in fp:
|
||||
f1_fault, load_1 = utils.get_load(f1.split(':')[0])
|
||||
f1_pod = f1.split(':')[1]
|
||||
f2_fault, load_2 = utils.get_load(f2.split(':')[0])
|
||||
f2_pod = f2.split(':')[1]
|
||||
if f1_pod == f2_pod:
|
||||
if f1_fault == 'pod-delete':
|
||||
fp.remove(f2)
|
||||
if f1_fault == f2_fault:
|
||||
# if int(load_1) > int(load_2):
|
||||
# randomly remove one fault from same faults with different params
|
||||
fp.remove(f2)
|
||||
if self.service_weights is None:
|
||||
return fp
|
||||
|
||||
fp_copy = fp.copy()
|
||||
for f in fp:
|
||||
f_fault = f.split(':')[0]
|
||||
f_pod = f.split(':')[1].replace('service=', '')
|
||||
self.logger.debug('[ServiceWeights] ' + f + ' ' + str(self.service_weights[f_pod][f_fault]))
|
||||
if self.service_weights[f_pod][f_fault] == 0:
|
||||
fp_copy.remove(f)
|
||||
|
||||
self.logger.debug('[Realistic] ' + str(fp_copy))
|
||||
return fp_copy
|
||||
|
||||
def select_faults(self):
|
||||
max_faults = min(self.max_faults, len(self.unused_faults))
|
||||
num_faults = random.randint(1, max_faults)
|
||||
if self.all_faults:
|
||||
num_faults = len(self.unused_faults)
|
||||
if random.random() > self.epsilon:
|
||||
self.logger.info('[Exploration]')
|
||||
# faults_pods = random.sample(self.faults, k=num_faults)
|
||||
# using used faults list to avoid starvation
|
||||
faults_pods = random.sample(self.unused_faults, k=num_faults)
|
||||
faults_pods = self.realistic(faults_pods)
|
||||
for f in faults_pods:
|
||||
self.unused_faults.remove(f)
|
||||
if len(self.unused_faults) == 0:
|
||||
self.unused_faults = self.faults.copy()
|
||||
else:
|
||||
self.logger.info('[Exploitation]')
|
||||
first_row = self.ql.Q[:, 0, :][0]
|
||||
top_k_indices = np.argpartition(first_row, -num_faults)[-num_faults:]
|
||||
faults_pods = [self.faults[i] for i in top_k_indices]
|
||||
faults_pods = self.realistic(faults_pods)
|
||||
|
||||
return faults_pods
|
||||
|
||||
def create_episode(self, ctd_subset=None):
|
||||
self.logger.debug('[CREATE_EPISODE]')
|
||||
episode = []
|
||||
|
||||
if ctd_subset is None:
|
||||
faults_pods = self.select_faults()
|
||||
else:
|
||||
faults_pods = ctd_subset
|
||||
self.logger.info('CTD Subset: ' + str(faults_pods))
|
||||
|
||||
# faults_pods = self.realistic(faults_pods)
|
||||
        if len(faults_pods) == 0:
            return [], 200, 200

        engines = []
        for fp in faults_pods:
            fault = fp.split(':')[0]
            pod_name = fp.split(':')[1]
            engine = self.chaos_engine.inject_faults(fault, pod_name)
            engines.append(engine)
            episode.append({'fault': fault, 'pod_name': pod_name})
        self.logger.info('[create_episode]' + str(faults_pods))
        engines_running = self.chaos_engine.wait_engines(engines)
        self.logger.info('[create_episode] engines_running' + str(engines_running))
        if not engines_running:
            return None, None, None

        # randomly shuffling urls
        urls = random.sample(self.urls, len(self.urls))
        ep_json = []
        for url in urls:
            start_state, next_state = self.testapp.test_load(url)
            self.logger.info('[CREATE EPISODE]' + str(start_state) + ',' + str(next_state))
            # if before state tolerance is not met
            if start_state is None and next_state is None:
                # self.cleanup()
                self.chaos_engine.stop_engines()
                continue

            ### episode.append({'fault': fault, 'pod_name': pod_name})
            # self.update_q_fault(fault_pod, episode, start_state, next_state)
            url_index = self.urls.index(url)
            self.logger.info('[CREATEEPISODE]' + str(url) + ':' + str(url_index))
            for fp in faults_pods:
                self.ql.update_q_fault(fp, episode, start_state, next_state, self.urls.index(url))
            ep_json.append({'start_state': start_state, 'next_state': next_state, 'url': url, 'faults': episode})

        self.logger.debug('[CREATE_EPISODE] EPISODE CREATED:' + str(episode))
        self.logger.debug('[CREATE_EPISODE] END STATE:' + str(next_state))

        self.chaos_engine.print_result(engines)
        self.chaos_engine.stop_engines(episode=episode)
        # ep_json = {'start_state': start_state, 'next_state': next_state, 'faults': episode}

        return ep_json, start_state, next_state

    def start_chaos(self):
        self.logger.info('[INITIALIZING]')
        self.logger.info('Logfile: ' + self.logfile)
        self.logger.info('Loggerfile: ' + self.logger.handlers[0].stream.name)
        self.logger.info('Chaos Engine: ' + self.chaos_engine.get_name())
        self.logger.debug('Faults:' + str(self.faults))

        self.chaos_engine.cleanup()
        if self.ctd_subsets is None:
            for i in range(self.iterations):
                episode, start_state, end_state = self.create_episode()
                self.logger.debug('[start_chaos]' + str(i) + ' ' + str(episode))
                if episode is None:
                    continue
                # update Q matrix
                # will do it with each fault injection
                # self.update_q(episode, start_state, end_state)
                # if episode['next_state'] != '200':
                self.episodes.extend(episode)
                self.logger.info(str(i) + ' ' + str(self.ql.Q[:, 0]))
                # print(i, self.state_matrix)
                self.write_q()
                self.write_episode(episode)
        else:
            for i, subset in enumerate(self.ctd_subsets):
                episode, start_state, end_state = self.create_episode(subset)
                self.logger.debug('[start_chaos]' + str(episode))
                if episode is None:
                    continue
                self.episodes.append(episode)
                self.logger.info(str(i) + ' ' + str(self.ql.Q[:, 0]))
                self.write_q()
                self.write_episode(episode)

        self.chaos_engine.cleanup()
        # self.remove_temp_file()
        with open(self.epfile, 'w', encoding='utf-8') as f:
            json.dump(self.episodes, f, ensure_ascii=False, indent=4)
        self.logger.info('COMPLETE!!!')

    def write_q(self):
        df = pd.DataFrame(self.ql.Q[:, 0, :], index=self.urls, columns=self.faults)
        df.to_csv(self.qfile)
        return df

    def write_episode(self, episode):
        for ep in episode:
            with open(self.efile, "a") as outfile:
                x = [e['fault'] + ':' + e['pod_name'] for e in ep['faults']]
                x.append(ep['url'])
                x.append(str(ep['next_state']))
                outfile.write(','.join(x) + '\n')

    def remove_temp_file(self):
        mydir = self.chaos_dir + 'experiments'
        print('Removing temp files from: ' + mydir)
        self.logger.debug('Removing temp files: ' + mydir)
        # nothing to clean up if the experiments directory does not exist
        if not os.path.exists(mydir):
            return
        filelist = [f for f in os.listdir(mydir) if f.endswith(".json")]
        for f in filelist:
            print(f)
            os.remove(os.path.join(mydir, f))
56 utils/chaos_ai/src/experiments.py Normal file
@@ -0,0 +1,56 @@
import random


class Experiments:
    def __init__(self):
        self.k = 0

    # Samples pairs of faults, injects each fault alone and then both together,
    # and records whether the combined response is consistent (monotonic) with
    # the individual responses.
    def monotonic(self, aichaos, num_sets=3):
        for i in range(num_sets):
            faults_pods = random.sample(aichaos.faults, k=2)
            faults_set = [[faults_pods[0]], [faults_pods[1]], [faults_pods[0], faults_pods[1]]]

            resp1, resp2, resp_both = 0, 0, 0
            for fl in faults_set:
                engines = []
                for fp in fl:
                    fault = fp.split(':')[0]
                    pod_name = fp.split(':')[1]
                    engine = aichaos.inject_faults_litmus(fault, pod_name)
                    engines.append(engine)
                aichaos.litmus.wait_engines(engines)

                for index, url in enumerate(aichaos.urls):
                    start_state, next_state = aichaos.test_load(url)
                    print(i, fl, next_state)
                    # self.write(str(fl), next_state)
                    if resp1 == 0:
                        resp1 = next_state
                    elif resp2 == 0:
                        resp2 = next_state
                    else:
                        resp_both = next_state

                aichaos.litmus.stop_engines()
            self.write_resp(str(faults_set[2]), resp1, resp2, resp_both)
        print('Experiment Complete!!!')

    @staticmethod
    def write(fault, next_state):
        with open("experiment", "a") as outfile:
            outfile.write(fault + ',' + str(next_state) + ',' + '\n')

    @staticmethod
    def write_resp(faults, resp1, resp2, resp3):
        monotonic = True
        if resp3 == 200:
            if resp1 != 200 or resp2 != 200:
                monotonic = False
        else:
            if resp1 == 200 and resp2 == 200:
                monotonic = False

        with open("experiment", "a") as outfile:
            # outfile.write(faults + ',' + str(resp1) + ',' + '\n')
            outfile.write(faults + ',' + str(resp1) + ',' + str(resp2) + ',' + str(resp3) + ',' + str(monotonic) + '\n')
99 utils/chaos_ai/src/kraken_utils.py Normal file
@@ -0,0 +1,99 @@
import json
import os
import time
import logging

import src.utils as utils


class KrakenUtils:
    def __init__(self, namespace='robot-shop', chaos_dir='../config/',
                 chaos_experiment='experiment.json', kubeconfig='~/.kube/config', wait_checks=60, command='podman'):
        self.chaos_dir = chaos_dir
        self.chaos_experiment = chaos_experiment
        self.namespace = namespace
        self.kubeconfig = kubeconfig
        self.logger = logging.getLogger()
        self.engines = []
        self.wait_checks = wait_checks
        self.command = command

    def exp_status(self, engine='engine-cartns3'):
        substring_list = ['Waiting for the specified duration', 'Waiting for wait_duration', 'Step workload started, waiting for response']
        substr = '|'.join(substring_list)
        # cmd = "docker logs "+engine+" 2>&1 | grep Waiting"
        # cmd = "docker logs "+engine+" 2>&1 | grep -E '"+substr+"'"
        cmd = self.command + " logs " + engine + " 2>&1 | grep -E '" + substr + "'"
        line = os.popen(cmd).read()
        self.logger.debug('[exp_status]' + line)
        # if 'Waiting for the specified duration' in line:
        # if 'Waiting for' in line or 'waiting for' in line:
        # if 'Waiting for the specified duration' in line or 'Waiting for wait_duration' in line or 'Step workload started, waiting for response' in line:
        if any(map(line.__contains__, substring_list)):
            return 'Running'
        return 'Not Running'

    # print chaos result, check if litmus showed any error
    def print_result(self, engines):
        # self.logger.debug('')
        for e in engines:
            # cmd = 'kubectl describe chaosresult ' + e + ' -n ' + self.namespace + ' | grep "Fail Step:"'
            # line = os.popen(cmd).read()
            # self.logger.debug('[Chaos Result] '+e+' : '+line)
            self.logger.debug('[KRAKEN][Chaos Result] ' + e)

    def wait_engines(self, engines=[]):
        status = 'Completed'
        max_checks = self.wait_checks
        for e in engines:
            self.logger.info('[Wait Engines] ' + e)
            for i in range(max_checks):
                status = self.exp_status(e)
                if status == 'Running':
                    break
                time.sleep(1)
            # return False, if even one engine is not running
            if status != 'Running':
                return False

        self.engines = engines
        # return True if all engines are running
        return True

    def cleanup(self):
        self.logger.debug('Removing previous engines')
        # cmd = "docker rm $(docker ps -q -f 'status=exited')"
        if len(self.engines) > 0:
            cmd = self.command + " stop " + " ".join(self.engines) + " >> temp"
            os.system(cmd)
            self.engines = []

        cmd = self.command + " container prune -f >> temp"
        os.system(cmd)
        self.logger.debug('Engines removed')

    def stop_engines(self, episode=[]):
        self.cleanup()

    def get_name(self):
        return 'kraken'

    def inject_faults(self, fault, pod_name):
        self.logger.debug('[KRAKEN][INJECT_FAULT] ' + fault + ':' + pod_name)
        fault, load = utils.get_load(fault)
        engine = 'engine-' + pod_name.replace('=', '-').replace('/', '-') + '-' + fault
        if fault == 'pod-delete':
            cmd = self.command + ' run -d -e NAMESPACE=' + self.namespace + ' -e POD_LABEL=' + pod_name + ' --name=' + engine + ' --net=host -v ' + self.kubeconfig + ':/root/.kube/config:Z quay.io/redhat-chaos/krkn-hub:pod-scenarios >> temp'
        elif fault == 'network-chaos':
            # 'docker run -e NODE_NAME=minikube-m03 -e DURATION=10 --name=knetwork --net=host -v /home/chaos/.kube/kube-config-raw:/root/.kube/config:Z -d quay.io/redhat-chaos/krkn-hub:network-chaos >> temp'
            cmd = self.command + ' run -d -e NODE_NAME=' + pod_name + ' -e DURATION=120 --name=' + engine + ' --net=host -v ' + self.kubeconfig + ':/root/.kube/config:Z -d quay.io/redhat-chaos/krkn-hub:network-chaos >> temp'
        elif fault == 'node-memory-hog':
            cmd = self.command + ' run -d -e NODE_NAME=' + pod_name + ' -e DURATION=120 -e NODES_AFFECTED_PERC=100 --name=' + engine + ' --net=host -v ' + self.kubeconfig + ':/root/.kube/config:Z -d quay.io/redhat-chaos/krkn-hub:node-memory-hog >> temp'
        elif fault == 'node-cpu-hog':
            cmd = self.command + ' run -e NODE_SELECTORS=' + pod_name + ' -e NODE_CPU_PERCENTAGE=100 -e NAMESPACE=' + self.namespace + ' -e TOTAL_CHAOS_DURATION=120 -e NODE_CPU_CORE=100 --name=' + engine + ' --net=host -env-host=true -v ' + self.kubeconfig + ':/root/.kube/config:Z -d quay.io/redhat-chaos/krkn-hub:node-cpu-hog'
        else:
            cmd = 'echo'
        self.logger.debug('[KRAKEN][INJECT_FAULT] ' + cmd)
        os.system(cmd)
        return engine
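A minimal sketch of how this wrapper can be driven (a standalone illustration, not part of the commit; the namespace and pod label are placeholders, and podman plus a reachable cluster are assumed):

```python
from src.kraken_utils import KrakenUtils

krkn = KrakenUtils(namespace='robot-shop', kubeconfig='~/.kube/config', command='podman')

# launch a krkn-hub pod-scenarios container against pods labelled app=cart
engine = krkn.inject_faults('pod-delete', 'app=cart')

# block until the engine container reports it is running, then tear everything down
if krkn.wait_engines([engine]):
    krkn.print_result([engine])
krkn.stop_engines()
```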
62 utils/chaos_ai/src/qlearning.py Normal file
@@ -0,0 +1,62 @@
import logging

import numpy as np


class QLearning:
    def __init__(self, gamma=None, alpha=None, faults=None, states=None, rewards=None, urls=None):
        self.gamma = gamma  # Discount factor
        self.alpha = alpha  # Learning rate
        self.faults = faults
        self.states = states
        self.rewards = rewards

        # Initializing Q-Values
        # self.Q = np.array(np.zeros([len(states), len(states)]))
        self.Q = np.array(np.zeros([len(urls), len(states), len(faults)]))
        self.state_matrix = np.array(np.zeros([len(states), len(states)]))

        self.logger = logging.getLogger()

    def update_q_fault(self, fault, episode, start_state, end_state, url_index):
        self.logger.info('[UPDATE_Q] ' + str(url_index) + ' ' + fault + ' ' + str(start_state) + '->' + str(end_state))
        if end_state is None:
            end_state = start_state
        if end_state not in self.states:
            end_state = 'Other'
        # reward is dependent on the error response (eg. '404') and length of episode
        reward = self.rewards[str(end_state)] / len(episode)
        current_state = self.states[str(start_state)]
        next_state = self.states[str(end_state)]
        fault_index = self.faults.index(fault)
        # self.logger.debug('[update_q]' + fault + ' ' + str(fault_index) + ' ' + str(reward))
        # self.logger.debug('reward, gamma: ' + str(reward) + ' ' + str(self.gamma))
        # self.logger.debug(
        #     'gamma*val' + str(self.gamma * self.Q[url_index, next_state, np.argmax(self.Q[url_index, next_state,])]))
        # self.logger.debug('current state val:' + str(self.Q[url_index, current_state, fault_index]))

        TD = reward + \
            self.gamma * self.Q[url_index, next_state, np.argmax(self.Q[url_index, next_state,])] - \
            self.Q[url_index, current_state, fault_index]
        self.Q[url_index, current_state, fault_index] += self.alpha * TD

        # update state matrix
        TD_state = reward + \
            self.gamma * self.state_matrix[next_state, np.argmax(self.state_matrix[next_state,])] - \
            self.state_matrix[current_state, next_state]
        self.state_matrix[current_state, next_state] += self.alpha * TD_state
        # self.logger.debug('updated Q' + str(self.Q[url_index, current_state, fault_index]))

    # def update_q(self, episode, start_state, end_state):
    #     self.logger.info('[UPDATE_Q]')
    #     if end_state is None:
    #         end_state = start_state
    #
    #     # reward is dependent on the error response (eg. '404') and length of episode
    #     reward = self.rewards[str(end_state)] / len(episode)
    #     current_state = self.states[str(start_state)]
    #     next_state = self.states[str(end_state)]
    #     TD = reward + \
    #         self.gamma * self.Q[next_state, np.argmax(self.Q[next_state,])] - \
    #         self.Q[current_state, next_state]
    #     self.Q[current_state, next_state] += self.alpha * TD
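As a quick illustration of the temporal-difference rule that `update_q_fault` applies, here is a standalone toy sketch (not part of the module; the numbers are made up):

```python
import numpy as np

# One URL, three response states, two faults; all Q-values start at zero.
Q = np.zeros([1, 3, 2])
gamma, alpha = 0.9, 0.5            # discount factor and learning rate
reward = 1.0                       # e.g. rewards['404'] / len(episode) for a one-fault episode
url, current_state, next_state, fault_index = 0, 0, 2, 1

# TD target uses the best value available in the next state, mirroring update_q_fault.
TD = reward + gamma * Q[url, next_state, np.argmax(Q[url, next_state])] - Q[url, current_state, fault_index]
Q[url, current_state, fault_index] += alpha * TD
print(Q[url, current_state, fault_index])  # 0.5
```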
171 utils/chaos_ai/src/swagger_api.py Normal file
@@ -0,0 +1,171 @@
import json, os
import logging
# import numpy as np
# import pandas as pd
import threading
from datetime import datetime
from flask import Flask, request
from flasgger import Swagger
from flasgger.utils import swag_from
# import zipfile
import sys

sys.path.append("..")
from aichaos_main import AIChaos

app = Flask(__name__)
Swagger(app)
flaskdir = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "config", "experiments",
                        "flask") + '/'


class AIChaosSwagger:
    def __init__(self, flaskdir=''):
        self.flaskdir = flaskdir

    @app.route("/")
    def empty(params=''):
        return "AI Chaos Repository!"

    def startchaos(self, kubeconfigfile, file_id, params):
        print('[StartChaos]', file_id, kubeconfigfile)
        dir = flaskdir
        outfile = ''.join([dir, 'out-', file_id])
        initfile = ''.join([dir, 'init-', file_id])
        with open(initfile, 'w'):
            pass
        if os.path.exists(outfile):
            os.remove(outfile)
        # cons = ConstraintsInference(outdir=dir).get_constraints(csvfile, file_id, params, verbose=False,
        #                                                         write_local=False)
        os.environ["KUBECONFIG"] = kubeconfigfile
        params['command'] = 'podman'
        params['chaos_engine'] = 'kraken'
        params['faults'] = 'pod-delete'
        params['iterations'] = 1
        params['maxfaults'] = 5
        if os.path.isfile('/config/aichaos-config.json'):
            with open('/config/aichaos-config.json') as f:
                config_params = json.load(f)
            params['command'] = config_params['command']
            params['chaos_engine'] = config_params['chaos_engine']
            params['faults'] = config_params['faults']
            params['iterations'] = config_params['iterations']
            params['maxfaults'] = config_params['maxfaults']
        faults = [f + ':' + p for f in params['faults'].split(',') for p in params['podlabels'].split(',')]
        print('#faults:', len(faults), faults)
        states = {'200': 0, '500': 1, '502': 2, '503': 3, '404': 4, 'Timeout': 5}
        rewards = {'200': -1, '500': 0.8, '502': 0.8, '503': 0.8, '404': 1, 'Timeout': 1}
        logfile = self.flaskdir + 'log_' + str(file_id)
        qfile = self.flaskdir + 'qfile_' + str(file_id) + '.csv'
        efile = self.flaskdir + 'efile_' + str(file_id)
        epfile = self.flaskdir + 'episodes_' + str(file_id) + '.json'
        probe_url = params['probeurl']
        probes = {'pod-delete': 'executeprobe', 'cpu-hog': 'wolffi/cpu_load', 'disk-fill': 'wolffi/memory_load',
                  'io_load': 'wolffi/io_load', 'http_delay': 'wolffi/http_delay', 'packet_delay': 'wolffi/packet_delay',
                  'packet_duplication': 'wolffi/packet_duplication', 'packet_loss': 'wolffi/packet_loss',
                  'packet_corruption': 'wolffi/packet_corruption',
                  'packet_reordering': 'wolffi/packet_reordering', 'network_load': 'wolffi/network_load',
                  'http_bad_request': 'wolffi/http_bad_request',
                  'http_unauthorized': 'wolffi/http_unauthorized', 'http_forbidden': 'wolffi/http_forbidden',
                  'http_not_found': 'wolffi/http_not_found',
                  'http_method_not_allowed': 'wolffi/http_method_not_allowed',
                  'http_not_acceptable': 'wolffi/http_not_acceptable',
                  'http_request_timeout': 'wolffi/http_request_timeout',
                  'http_unprocessable_entity': 'wolffi/http_unprocessable_entity',
                  'http_internal_server_error': 'wolffi/http_internal_server_error',
                  'http_not_implemented': 'wolffi/http_not_implemented',
                  'http_bad_gateway': 'wolffi/http_bad_gateway',
                  'http_service_unavailable': 'wolffi/http_service_unavailable',
                  'bandwidth_restrict': 'wolffi/bandwidth_restrict',
                  'pod_cpu_load': 'wolffi/pod_cpu_load', 'pod_memory_load': 'wolffi/pod_memory_load',
                  'pod_io_load': 'wolffi/pod_io_load',
                  'pod_network_load': 'wolffi/pod_network_load'
                  }
        dstk_probes = {k: probe_url + v for k, v in probes.items()}
        cexp = {'pod-delete': 'pod-delete.json', 'cpu-hog': 'pod-cpu-hog.json',
                'disk-fill': 'disk-fill.json', 'network-loss': 'network-loss.json',
                'network-corruption': 'network-corruption.json', 'io-stress': 'io-stress.json'}
        aichaos = AIChaos(states=states, faults=faults, rewards=rewards,
                          logfile=logfile, qfile=qfile, efile=efile, epfile=epfile,
                          urls=params['urls'].split(','), namespace=params['namespace'],
                          max_faults=params['maxfaults'],
                          num_requests=10, timeout=2,
                          chaos_engine=params['chaos_engine'], dstk_probes=dstk_probes, command=params['command'],
                          loglevel=logging.DEBUG, chaos_experiment=cexp, iterations=params['iterations'])
        aichaos.start_chaos()

        file = open(outfile, "w")
        file.write('done')
        file.close()
        os.remove(initfile)
        # os.remove(csvfile)
        # ConstraintsInference().remove_temp_files(dir, file_id)
        return 'WRITE'

    @app.route('/GenerateChaos/', methods=['POST'])
    @swag_from('../config/yml/chaosGen.yml')
    def chaos_gen():
        dir = flaskdir
        sw = AIChaosSwagger(flaskdir=dir)
        f = request.files['file']
        list = os.listdir(dir)
        for i in range(10000):
            if str(i) not in list:
                break
        kubeconfigfile = ''.join([dir, str(i)])
        f.save(kubeconfigfile)
        print('HEADER:', f.headers)
        print('[GenerateChaos] reqs:', request.form.to_dict())
        print('[GenerateChaos]', f.filename, datetime.now())
        # thread = threading.Thread(target=sw.write_constraints, args=(csvfile, str(i), parameters))
        thread = threading.Thread(target=sw.startchaos, args=(kubeconfigfile, str(i), request.form.to_dict()))
        thread.daemon = True
        print(thread.getName())
        thread.start()
        return 'Chaos ID: ' + str(i)

    @app.route('/GetStatus/<chaosid>', methods=['GET'])
    @swag_from('../config/yml/status.yml')
    def get_status(chaosid):
        print('[GetStatus]', chaosid, flaskdir)
        epfile = flaskdir + 'episodes_' + str(chaosid) + '.json'
        initfile = ''.join([flaskdir, 'init-', chaosid])
        if os.path.exists(epfile):
            return 'Completed'
        elif os.path.exists(initfile):
            return 'Running'
        else:
            return 'Does not exist'

    @app.route('/GetQTable/<chaosid>', methods=['GET'])
    @swag_from('../config/yml/qtable.yml')
    def get_qtable(chaosid):
        print('[GetQTable]', chaosid)
        qfile = flaskdir + 'qfile_' + str(chaosid) + '.csv'
        initfile = ''.join([flaskdir, 'init-', chaosid])
        if os.path.exists(qfile):
            f = open(qfile, "r")
            return f.read()
        elif os.path.exists(initfile):
            return 'Running'
        else:
            return 'Invalid Chaos ID: ' + chaosid

    @app.route('/GetEpisodes/<chaosid>', methods=['GET'])
    @swag_from('../config/yml/episodes.yml')
    def get_episodes(chaosid):
        print('[GetEpisodes]', chaosid)
        epfile = flaskdir + 'episodes_' + str(chaosid) + '.json'
        initfile = ''.join([flaskdir, 'init-', chaosid])
        if os.path.exists(epfile):
            f = open(epfile, "r")
            return f.read()
        elif os.path.exists(initfile):
            return 'Running'
        else:
            return 'Invalid Chaos ID: ' + chaosid


if __name__ == '__main__':
    app.run(debug=True, host='0.0.0.0', port='5001')
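A hypothetical way to drive this API once the Flask app is running, based on the routes defined above (the host, namespace, pod labels, and URLs below are placeholders):

```
$ curl -X POST http://localhost:5001/GenerateChaos/ \
       -F "file=@$HOME/.kube/config" \
       -F "namespace=robot-shop" \
       -F "podlabels=app=cart" \
       -F "urls=http://<app-host>/api/cart/health" \
       -F "probeurl=http://<probe-host>/"
Chaos ID: 0
$ curl http://localhost:5001/GetStatus/0
Running
```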
83 utils/chaos_ai/src/test_application.py Normal file
@@ -0,0 +1,83 @@
import json
import logging
import time
import requests


class TestApplication:
    def __init__(self, num_requests=10, timeout=2, sleep_time=1):
        self.num_requests = num_requests
        self.timeout = timeout
        self.sleep_time = sleep_time
        self.logger = logging.getLogger()

    def test_load(self, url=''):
        # url = 'http://192.168.49.2:31902/api/cart/health'
        timeout_count = 0
        avg_lat = 0
        for i in range(self.num_requests):
            try:
                r = requests.get(url, verify=False, timeout=self.timeout)
                avg_lat += r.elapsed.total_seconds()
                self.logger.info(
                    url + ' ' + str(i) + ':' + str(r.status_code) + " {:.2f}".format(r.elapsed.total_seconds())
                    + " {:.2f}".format(avg_lat))
                if r.status_code != 200:
                    return '200', r.status_code
            # except requests.exceptions.Timeout as toe:
            except Exception as toe:
                self.logger.info(url + ' ' + str(i) + ':' + 'Timeout Exception!')
                timeout_count += 1
                if timeout_count > 3:
                    return '200', 'Timeout'
            # except Exception as e:
            #     self.logger.debug('Connection refused!'+str(e))
            time.sleep(self.sleep_time)
        self.logger.info(url + "Avg: {:.2f}".format(avg_lat / self.num_requests))
        return '200', '200'

    # def test_load_hey(self):
    #     cmd = 'hey -c 2 -z 20s http://192.168.49.2:31902/api/cart/health > temp'
    #     os.system(cmd)
    #     with open('temp') as f:
    #         datafile = f.readlines()
    #     found = False
    #     for line in datafile:
    #         if 'Status code distribution:' in line:
    #             found = True
    #         if found:
    #             print('[test_load]', line)
    #             m = re.search(r"\[([A-Za-z0-9_]+)\]", line)
    #             if m is not None:
    #                 resp_code = m.group(1)
    #                 if resp_code != 200:
    #                     return '200', resp_code
    #     return '200', '200'

    # # End state is reached when system is down or return error code like '500','404'
    # def get_next_state(self):
    #     self.logger.info('[GET_NEXT_STATE]')
    #     f = open(self.chaos_dir + self.chaos_journal)
    #     data = json.load(f)
    #
    #     # before the experiment (if before steady state is false, after is null?)
    #     for probe in data['steady_states']['before']['probes']:
    #         if not probe['tolerance_met']:
    #             # start_state = probe['activity']['tolerance']
    #             # end_state = probe['status']
    #             start_state, end_state = None, None
    #             return start_state, end_state
    #
    #     # after the experiment
    #     for probe in data['steady_states']['after']['probes']:
    #         # if probe['output']['status'] == probe['activity']['tolerance']:
    #         if not probe['tolerance_met']:
    #             # print(probe)
    #             start_state = probe['activity']['tolerance']
    #             end_state = probe['output']['status']
    #             # end_state = probe['status']
    #             return start_state, end_state
    #     # if tolerances for all probes are met
    #     start_state = probe['activity']['tolerance']
    #     end_state = probe['activity']['tolerance']
    #     return start_state, end_state
10 utils/chaos_ai/src/utils.py Normal file
@@ -0,0 +1,10 @@
import re


def get_load(fault):
    # Faults may carry an optional load parameter in parentheses, e.g. 'node-cpu-hog(80)'.
    params = re.findall(r'\(.*?\)', fault)
    load = 100
    if len(params) > 0:
        load = params[0].strip('()')
        # drop the '(load)' suffix from the fault name
        fault = fault.replace(params[0], '')
    return fault, load
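For context, a small usage sketch of this helper (standalone and with a made-up fault string, assuming it is run from `utils/chaos_ai` like the rest of the package):

```python
from src.utils import get_load

fault, load = get_load('node-cpu-hog(80)')
print(fault, load)  # -> node-cpu-hog 80
```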
@@ -7,8 +7,8 @@ This tool profiles an application and gathers telemetry data such as CPU, Memory
## Pre-requisites

- Openshift Or Kubernetes Environment where the application is hosted
- Access to the telemetry data via the exposed Prometheus endpoint
- Python3
- Access to the metrics via the exposed Prometheus endpoint
- Python3.9

## Usage

@@ -20,28 +20,35 @@ This tool profiles an application and gathers telemetry data such as CPU, Memory
$ git clone https://github.com/krkn-chaos/krkn.git
$ cd krkn
$ pip3 install -r requirements.txt
$ python3.9 utils/chaos_recommender/chaos_recommender.py
Edit configuration file:
$ vi config/recommender_config.yaml
$ python3.9 utils/chaos_recommender/chaos_recommender.py -c utils/chaos_recommender/recommender_config.yaml
```

2. Follow the prompts to provide the required information.

## Configuration
To run the recommender with a config file specify the config file path with the `-c` argument.
You can customize the default values by editing the `krkn/config/recommender_config.yaml` file. The configuration file contains the following options:
You can customize the default values by editing the `recommender_config.yaml` file. The configuration file contains the following options:

- `application`: Specify the application name.
- `namespace`: Specify the namespace name. If you want to profile
- `namespaces`: Specify the namespace names (separated by comma or space). If you want to profile
- `labels`: Specify the labels (not used).
- `kubeconfig`: Specify the location of the kubeconfig file (not used).
- `prometheus_endpoint`: Specify the prometheus endpoint (must).
- `auth_token`: Auth token to connect to prometheus endpoint (must).
- `scrape_duration`: For how long data should be fetched, e.g., '1m' (must).
- `chaos_library`: "kraken" (currently it only supports kraken).
- `json_output_file`: True or False (by default False).
- `json_output_folder_path`: Specify folder path where output should be saved. If empty the default path is used.
- `chaos_tests`: (for output purpose only, do not change if not needed)
  - `GENERAL`: list of general purpose tests available in Krkn
  - `MEM`: list of memory related tests available in Krkn
  - `NETWORK`: list of network related tests available in Krkn
  - `CPU`: list of CPU related tests available in Krkn
- `threshold`: Specify the threshold to use for comparison and identifying outliers
- `cpu_threshold`: Specify the cpu threshold to compare with the cpu limits set on the pods and identify outliers
- `mem_threshold`: Specify the memory threshold to compare with the memory limits set on the pods and identify outliers

*TIP:* to collect prometheus endpoint and token from your OpenShift cluster you can run the following commands:
```
@@ -58,8 +65,8 @@ You can also provide the input values through command-line arguments launching t
  -o, --options         Evaluate command line options
  -a APPLICATION, --application APPLICATION
                        Kubernetes application name
  -n NAMESPACE, --namespace NAMESPACE
                        Kubernetes application namespace
  -n NAMESPACES, --namespaces NAMESPACES
                        Kubernetes application namespaces separated by space
  -l LABELS, --labels LABELS
                        Kubernetes application labels
  -p PROMETHEUS_ENDPOINT, --prometheus-endpoint PROMETHEUS_ENDPOINT
@@ -74,6 +81,8 @@ You can also provide the input values through command-line arguments launching t
                        Chaos library
  -L LOG_LEVEL, --log-level LOG_LEVEL
                        log level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
  -J [FOLDER_PATH], --json-output-file [FOLDER_PATH]
                        Create output file, the path to the folder can be specified, if not specified the default folder is used.
  -M MEM [MEM ...], --MEM MEM [MEM ...]
                        Memory related chaos tests (space separated list)
  -C CPU [CPU ...], --CPU CPU [CPU ...]
@@ -82,7 +91,12 @@ You can also provide the input values through command-line arguments launching t
                        Network related chaos tests (space separated list)
  -G GENERIC [GENERIC ...], --GENERIC GENERIC [GENERIC ...]
                        Generic chaos tests (space separated list)

  --threshold THRESHOLD
                        Threshold
  --cpu-threshold CPU_THRESHOLD
                        CPU threshold to compare with the cpu limits
  --mem-threshold MEM_THRESHOLD
                        Memory threshold to compare with the memory limits
```

If you provide the input values through command-line arguments, the corresponding config file inputs would be ignored.
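For example, an options-driven run against a hypothetical cluster (the endpoint, token, and namespace below are placeholders) could look like:

```
$ python3.9 utils/chaos_recommender/chaos_recommender.py -o \
    -n openshift-etcd \
    -p https://<prometheus-route> \
    -t <auth-token> \
    -s 10m \
    --threshold .7 --cpu-threshold .5 --mem-threshold .5
```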
@@ -97,10 +111,10 @@ After obtaining telemetry data, sourced either locally or from Prometheus, the t

## Customizing Thresholds and Options

You can customize the thresholds and options used for data analysis by modifying the `krkn/kraken/chaos_recommender/analysis.py` file. For example, you can adjust the threshold for identifying outliers by changing the value of the `threshold` variable in the `identify_outliers` function.
You can customize the thresholds and options used for data analysis and identifying the outliers by setting the threshold, cpu_threshold and mem_threshold parameters in the config.
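For instance, the relevant excerpt of `recommender_config.yaml`, using the default values shipped with this change, looks like:

```yaml
threshold: .7
cpu_threshold: .5
mem_threshold: .5
```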
## Additional Files

- `config/recommender_config.yaml`: The configuration file containing default values for application, namespace, labels, and kubeconfig.
- `recommender_config.yaml`: The configuration file containing default values for application, namespace, labels, and kubeconfig.

Happy Chaos!

@@ -1,7 +1,10 @@
import argparse
import json
import logging
import os.path
import re
import sys
import time
import yaml
# kraken module import for running the recommender
# both from the root directory and the recommender
@@ -9,24 +12,28 @@ import yaml
sys.path.insert(0, './')
sys.path.insert(0, '../../')

from krkn_lib.utils import get_yaml_item_value

import kraken.chaos_recommender.analysis as analysis
import kraken.chaos_recommender.prometheus as prometheus
from kubernetes import config as kube_config


def parse_arguments(parser):

    # command line options
    parser.add_argument("-c", "--config-file", action="store", help="Config file path")
    parser.add_argument("-o", "--options", action="store_true", help="Evaluate command line options")
    parser.add_argument("-n", "--namespace", action="store", default="", help="Kubernetes application namespace")
    parser.add_argument("-n", "--namespaces", action="store", default="", nargs="+", help="Kubernetes application namespaces separated by space")
    parser.add_argument("-p", "--prometheus-endpoint", action="store", default="", help="Prometheus endpoint URI")
    parser.add_argument("-k", "--kubeconfig", action="store", default=kube_config.KUBE_CONFIG_DEFAULT_LOCATION, help="Kubeconfig path")
    parser.add_argument("-t", "--token", action="store", default="", help="Kubernetes authentication token")
    parser.add_argument("-s", "--scrape-duration", action="store", default="10m", help="Prometheus scrape duration")
    parser.add_argument("-L", "--log-level", action="store", default="INFO", help="log level (DEBUG, INFO, WARNING, ERROR, CRITICAL)")

    parser.add_argument("-J", "--json-output-file", default=False, nargs="?", action="store",
                        help="Create output file, the path to the folder can be specified, if not specified the default folder is used")

    parser.add_argument("-M", "--MEM", nargs='+', action="store", default=[],
                        help="Memory related chaos tests (space separated list)")
    parser.add_argument("-C", "--CPU", nargs='+', action="store", default=[],
@@ -35,10 +42,13 @@ def parse_arguments(parser):
                        help="Network related chaos tests (space separated list)")
    parser.add_argument("-G", "--GENERIC", nargs='+', action="store", default=[],
                        help="Generic chaos tests (space separated list)")

    parser.add_argument("--threshold", action="store", default="", help="Threshold")
    parser.add_argument("--cpu-threshold", action="store", default="", help="CPU threshold")
    parser.add_argument("--mem-threshold", action="store", default="", help="Memory threshold")

    return parser.parse_args()


def read_configuration(config_file_path):
    if not os.path.exists(config_file_path):
        logging.error(f"Config file not found: {config_file_path}")
@@ -48,15 +58,26 @@ def read_configuration(config_file_path):
        config = yaml.safe_load(config_file)

    log_level = config.get("log level", "INFO")
    namespace = config.get("namespace", "")
    kubeconfig = config.get("kubeconfig", kube_config.KUBE_CONFIG_DEFAULT_LOCATION)
    namespaces = config.get("namespaces")
    namespaces = re.split(r",+\s+|,+|\s+", namespaces)
    kubeconfig = get_yaml_item_value(config, "kubeconfig", kube_config.KUBE_CONFIG_DEFAULT_LOCATION)

    prometheus_endpoint = config.get("prometheus_endpoint")
    auth_token = config.get("auth_token")
    scrape_duration = get_yaml_item_value(config, "scrape_duration", "10m")
    threshold = get_yaml_item_value(config, "threshold", ".7")
    heatmap_cpu_threshold = get_yaml_item_value(config, "cpu_threshold", ".5")
    heatmap_mem_threshold = get_yaml_item_value(config, "mem_threshold", ".3")
    output_file = config.get("json_output_file", False)
    if output_file is True:
        output_path = config.get("json_output_folder_path")
    else:
        output_path = False
    chaos_tests = config.get("chaos_tests", {})
    return (namespaces, kubeconfig, prometheus_endpoint, auth_token,
            scrape_duration, chaos_tests, log_level, threshold,
            heatmap_cpu_threshold, heatmap_mem_threshold, output_path)

    prometheus_endpoint = config.get("prometheus_endpoint", "")
    auth_token = config.get("auth_token", "")
    scrape_duration = config.get("scrape_duration", "10m")
    chaos_tests = config.get("chaos_tests" , {})
    return (namespace, kubeconfig, prometheus_endpoint, auth_token, scrape_duration,
            chaos_tests, log_level)

def prompt_input(prompt, default_value):
    user_input = input(f"{prompt} [{default_value}]: ")
@@ -64,6 +85,54 @@ def prompt_input(prompt, default_value):
        return user_input
    return default_value


def make_json_output(inputs, namespace_data, output_path):
    time_str = time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime())

    data = {
        "inputs": inputs,
        "analysis_outputs": namespace_data
    }

    logging.info(f"Summary\n{json.dumps(data, indent=4)}")

    if output_path is not False:
        file = f"recommender_{time_str}.json"
        path = f"{os.path.expanduser(output_path)}/{file}"

        with open(path, "w") as json_output:
            logging.info(f"Saving output file in {output_path} folder...")
            json_output.write(json.dumps(data, indent=4))
            logging.info(f"Recommendation output saved in {file}.")


def json_inputs(namespaces, kubeconfig, prometheus_endpoint, scrape_duration,
                chaos_tests, threshold, heatmap_cpu_threshold,
                heatmap_mem_threshold):
    inputs = {
        "namespaces": namespaces,
        "kubeconfig": kubeconfig,
        "prometheus_endpoint": prometheus_endpoint,
        "scrape_duration": scrape_duration,
        "chaos_tests": chaos_tests,
        "threshold": threshold,
        "heatmap_cpu_threshold": heatmap_cpu_threshold,
        "heatmap_mem_threshold": heatmap_mem_threshold
    }
    return inputs


def json_namespace(namespace, queries, analysis_data):
    data = {
        "namespace": namespace,
        "queries": queries,
        "profiling": analysis_data[0],
        "heatmap_analysis": analysis_data[1],
        "recommendations": analysis_data[2]
    }
    return data


def main():
    parser = argparse.ArgumentParser(description="Krkn Chaos Recommender Command-Line tool")
    args = parse_arguments(parser)
@@ -75,43 +144,67 @@ def main():

    if args.config_file is not None:
        (
            namespace,
            namespaces,
            kubeconfig,
            prometheus_endpoint,
            auth_token,
            scrape_duration,
            chaos_tests,
            log_level
            log_level,
            threshold,
            heatmap_cpu_threshold,
            heatmap_mem_threshold,
            output_path
        ) = read_configuration(args.config_file)

    if args.options:
        namespace = args.namespace
        namespaces = args.namespaces
        kubeconfig = args.kubeconfig
        auth_token = args.token
        scrape_duration = args.scrape_duration
        log_level = args.log_level
        prometheus_endpoint = args.prometheus_endpoint
        output_path = args.json_output_file
        chaos_tests = {"MEM": args.MEM, "GENERIC": args.GENERIC, "CPU": args.CPU, "NETWORK": args.NETWORK}
        threshold = args.threshold
        heatmap_mem_threshold = args.mem_threshold
        heatmap_cpu_threshold = args.cpu_threshold

    if log_level not in ["DEBUG","INFO", "WARNING", "ERROR","CRITICAL"]:
    if log_level not in ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]:
        logging.error(f"{log_level} not a valid log level")
        sys.exit(1)

    logging.basicConfig(level=log_level)

    logging.info("============================INPUTS===================================")
    logging.info(f"Namespace: {namespace}")
    logging.info(f"Kubeconfig: {kubeconfig}")
    logging.info(f"Prometheus endpoint: {prometheus_endpoint}")
    logging.info(f"Scrape duration: {scrape_duration}")
    for test in chaos_tests.keys():
        logging.info(f"Chaos tests {test}: {chaos_tests[test]}")
    logging.info("=====================================================================")
    logging.info("Starting Analysis ...")
    logging.info("Fetching the Telemetry data")
    if output_path is not False:
        if output_path is None:
            output_path = "./recommender_output"
            logging.info(f"Path for output file not specified. "
                         f"Using default folder {output_path}")
        if not os.path.exists(os.path.expanduser(output_path)):
            logging.error(f"Folder {output_path} for output not found.")
            sys.exit(1)

    logging.info("Loading inputs...")
    inputs = json_inputs(namespaces, kubeconfig, prometheus_endpoint,
                         scrape_duration, chaos_tests, threshold,
                         heatmap_cpu_threshold, heatmap_mem_threshold)
    namespaces_data = []

    logging.info("Starting Analysis...")

    file_path, queries = prometheus.fetch_utilization_from_prometheus(
        prometheus_endpoint, auth_token, namespaces, scrape_duration)

    analysis_data = analysis(file_path, namespaces, chaos_tests, threshold,
                             heatmap_cpu_threshold, heatmap_mem_threshold)

    for namespace in namespaces:
        namespace_data = json_namespace(namespace, queries[namespace],
                                        analysis_data[namespace])
        namespaces_data.append(namespace_data)
    make_json_output(inputs, namespaces_data, output_path)

    file_path = prometheus.fetch_utilization_from_prometheus(prometheus_endpoint, auth_token, namespace, scrape_duration)
    analysis(file_path, chaos_tests)


if __name__ == "__main__":
    main()
35 utils/chaos_recommender/recommender_config.yaml Normal file
@@ -0,0 +1,35 @@
application: openshift-etcd
namespaces: openshift-etcd
labels: app=openshift-etcd
kubeconfig: ~/.kube/config.yaml
prometheus_endpoint: <Prometheus_Endpoint>
auth_token: <Auth_Token>
scrape_duration: 10m
chaos_library: "kraken"
log_level: INFO
json_output_file: False
json_output_folder_path:

# for output purpose only do not change if not needed
chaos_tests:
  GENERIC:
    - pod_failure
    - container_failure
    - node_failure
    - zone_outage
    - time_skew
    - namespace_failure
    - power_outage
  CPU:
    - node_cpu_hog
  NETWORK:
    - application_outage
    - node_network_chaos
    - pod_network_chaos
  MEM:
    - node_memory_hog
    - pvc_disk_fill

threshold: .7
cpu_threshold: .5
mem_threshold: .5