Mirror of https://github.com/krkn-chaos/krkn.git (synced 2026-02-19 20:40:33 +00:00)

Compare commits (28 commits)
| SHA1 |
|---|
| 7d18487228 |
| 08de42c91a |
| dc7d5bb01b |
| ea3444d375 |
| 7b660a0878 |
| 5fe0655f22 |
| 5df343c183 |
| f364e9f283 |
| 86a7427606 |
| 31266fbc3e |
| 57de3769e7 |
| 42fc8eea40 |
| 22d56e2cdc |
| a259b68221 |
| 052f83e7d9 |
| fb3bbe4e26 |
| 96ba9be4b8 |
| 58d5d1d8dc |
| 3fe22a0d8f |
| 21b89a32a7 |
| dbe3ea9718 |
| a142f6e7a4 |
| 2610a7af67 |
| f827f65132 |
| aa6cbbc11a |
| e17354e54d |
| 2dfa5cb0cd |
| 0799008cd5 |
**.github/workflows/tests.yml** (vendored, 70 changed lines)
```diff
@@ -61,6 +61,8 @@ jobs:
 kubectl create namespace namespace-scenario
 kubectl apply -f CI/templates/time_pod.yaml
 kubectl wait --for=condition=ready pod -l scenario=time-skew --timeout=300s
+kubectl apply -f CI/templates/service_hijacking.yaml
+kubectl wait --for=condition=ready pod -l "app.kubernetes.io/name=proxy" --timeout=300s
 - name: Get Kind nodes
 run: |
 kubectl get nodes --show-labels=true
@@ -70,12 +72,14 @@ jobs:
 run: python -m coverage run -a -m unittest discover -s tests -v

 - name: Setup Pull Request Functional Tests
-if: github.event_name == 'pull_request'
+if: |
+  github.event_name == 'pull_request'
 run: |
 yq -i '.kraken.port="8081"' CI/config/common_test_config.yaml
 yq -i '.kraken.signal_address="0.0.0.0"' CI/config/common_test_config.yaml
 yq -i '.kraken.performance_monitoring="localhost:9090"' CI/config/common_test_config.yaml
-echo "test_app_outages" > ./CI/tests/functional_tests
+echo "test_service_hijacking" > ./CI/tests/functional_tests
+echo "test_app_outages" >> ./CI/tests/functional_tests
 echo "test_container" >> ./CI/tests/functional_tests
 echo "test_namespace" >> ./CI/tests/functional_tests
 echo "test_net_chaos" >> ./CI/tests/functional_tests
@@ -84,7 +88,9 @@ jobs:
 echo "test_arca_memory_hog" >> ./CI/tests/functional_tests
 echo "test_arca_io_hog" >> ./CI/tests/functional_tests

-# Push on main only steps
+# Push on main only steps + all other functional to collect coverage
+# for the badge
 - name: Configure AWS Credentials
 if: github.ref == 'refs/heads/main' && github.event_name == 'push'
 uses: aws-actions/configure-aws-credentials@v4
@@ -101,6 +107,15 @@ jobs:
 yq -i '.telemetry.username="${{secrets.TELEMETRY_USERNAME}}"' CI/config/common_test_config.yaml
 yq -i '.telemetry.password="${{secrets.TELEMETRY_PASSWORD}}"' CI/config/common_test_config.yaml
 echo "test_telemetry" > ./CI/tests/functional_tests
+echo "test_service_hijacking" >> ./CI/tests/functional_tests
+echo "test_app_outages" >> ./CI/tests/functional_tests
+echo "test_container" >> ./CI/tests/functional_tests
+echo "test_namespace" >> ./CI/tests/functional_tests
+echo "test_net_chaos" >> ./CI/tests/functional_tests
+echo "test_time" >> ./CI/tests/functional_tests
+echo "test_arca_cpu_hog" >> ./CI/tests/functional_tests
+echo "test_arca_memory_hog" >> ./CI/tests/functional_tests
+echo "test_arca_io_hog" >> ./CI/tests/functional_tests

 # Final common steps
 - name: Run Functional tests
@@ -119,6 +134,7 @@ jobs:
 - name: Collect coverage report
 run: |
 python -m coverage html
+python -m coverage json
 - name: Publish coverage report to job summary
 run: |
 pip install html2text
@@ -129,6 +145,54 @@ jobs:
 name: coverage
 path: htmlcov
 if-no-files-found: error
+- name: Upload json coverage
+  uses: actions/upload-artifact@v3
+  with:
+    name: coverage.json
+    path: coverage.json
+    if-no-files-found: error
 - name: Check CI results
 run: grep Fail CI/results.markdown && false || true
+badge:
+  permissions:
+    contents: write
+  name: Generate Coverage Badge
+  runs-on: ubuntu-latest
+  needs:
+    - tests
+  if: github.ref == 'refs/heads/main' && github.event_name == 'push'
+  steps:
+    - name: Check out doc repo
+      uses: actions/checkout@master
+      with:
+        repository: krkn-chaos/krkn-lib-docs
+        path: krkn-lib-docs
+        ssh-key: ${{ secrets.KRKN_LIB_DOCS_PRIV_KEY }}
+    - name: Download json coverage
+      uses: actions/download-artifact@v3
+      with:
+        name: coverage.json
+    - name: Set up Python
+      uses: actions/setup-python@v4
+      with:
+        python-version: 3.9
+    - name: Copy badge on GitHub Page Repo
+      env:
+        COLOR: yellow
+      run: |
+        # generate coverage badge on previously calculated total coverage
+        # and copy in the docs page
+        export TOTAL=$(python -c "import json;print(json.load(open('coverage.json'))['totals']['percent_covered_display'])")
+        [[ $TOTAL > 40 ]] && COLOR=green
+        echo "TOTAL: $TOTAL"
+        echo "COLOR: $COLOR"
+        curl "https://img.shields.io/badge/coverage-$TOTAL%25-$COLOR" > ./krkn-lib-docs/coverage_badge_krkn.svg
+    - name: Push updated Coverage Badge
+      run: |
+        cd krkn-lib-docs
+        git add .
+        git config user.name "krkn-chaos"
+        git config user.email "<>"
+        git commit -m "[KRKN] Coverage Badge ${GITHUB_REF##*/}" || echo "no changes to commit"
+        git push
```
**CI/templates/service_hijacking.yaml** (new file, 29 lines)
@@ -0,0 +1,29 @@
```yaml
apiVersion: v1
kind: Pod
metadata:
  name: nginx
  labels:
    app.kubernetes.io/name: proxy
spec:
  containers:
  - name: nginx
    image: nginx:stable
    ports:
    - containerPort: 80
      name: http-web-svc

---
apiVersion: v1
kind: Service
metadata:
  name: nginx-service
spec:
  selector:
    app.kubernetes.io/name: proxy
  type: NodePort
  ports:
  - name: name-of-service-port
    protocol: TCP
    port: 80
    targetPort: http-web-svc
    nodePort: 30036
```
**CI/tests/test_service_hijacking.sh** (new file, 107 lines)
@@ -0,0 +1,107 @@
```bash
set -xeEo pipefail

source CI/tests/common.sh

trap error ERR
trap finish EXIT
# port mapping has been configured in kind-config.yml
SERVICE_URL=http://localhost:8888
PAYLOAD_GET_1="{ \
\"status\":\"internal server error\" \
}"
STATUS_CODE_GET_1=500

PAYLOAD_PATCH_1="resource patched"
STATUS_CODE_PATCH_1=201

PAYLOAD_POST_1="{ \
\"status\": \"unauthorized\" \
}"
STATUS_CODE_POST_1=401

PAYLOAD_GET_2="{ \
\"status\":\"resource created\" \
}"
STATUS_CODE_GET_2=201

PAYLOAD_PATCH_2="bad request"
STATUS_CODE_PATCH_2=400

PAYLOAD_POST_2="not found"
STATUS_CODE_POST_2=404

JSON_MIME="application/json"
TEXT_MIME="text/plain; charset=utf-8"

function functional_test_service_hijacking {

export scenario_type="service_hijacking"
export scenario_file="scenarios/kube/service_hijacking.yaml"
export post_config=""
envsubst < CI/config/common_test_config.yaml > CI/config/service_hijacking.yaml
python3 -m coverage run -a run_kraken.py -c CI/config/service_hijacking.yaml > /dev/null 2>&1 &
PID=$!
#Waiting the hijacking to have effect
while [ `curl -X GET -s -o /dev/null -I -w "%{http_code}" $SERVICE_URL/list/index.php` == 404 ]; do echo "waiting scenario to kick in."; sleep 1; done;

#Checking Step 1 GET on /list/index.php
OUT_GET="`curl -X GET -s $SERVICE_URL/list/index.php`"
OUT_CONTENT=`curl -X GET -s -o /dev/null -I -w "%{content_type}" $SERVICE_URL/list/index.php`
OUT_STATUS_CODE=`curl -X GET -s -o /dev/null -I -w "%{http_code}" $SERVICE_URL/list/index.php`
[ "${PAYLOAD_GET_1//[$'\t\r\n ']}" == "${OUT_GET//[$'\t\r\n ']}" ] && echo "Step 1 GET Payload OK" || (echo "Payload did not match. Test failed." && exit 1)
[ "$OUT_STATUS_CODE" == "$STATUS_CODE_GET_1" ] && echo "Step 1 GET Status Code OK" || (echo " Step 1 GET status code did not match. Test failed." && exit 1)
[ "$OUT_CONTENT" == "$JSON_MIME" ] && echo "Step 1 GET MIME OK" || (echo " Step 1 GET MIME did not match. Test failed." && exit 1)

#Checking Step 1 POST on /list/index.php
OUT_POST="`curl -s -X POST $SERVICE_URL/list/index.php`"
OUT_STATUS_CODE=`curl -X POST -s -o /dev/null -I -w "%{http_code}" $SERVICE_URL/list/index.php`
OUT_CONTENT=`curl -X POST -s -o /dev/null -I -w "%{content_type}" $SERVICE_URL/list/index.php`
[ "${PAYLOAD_POST_1//[$'\t\r\n ']}" == "${OUT_POST//[$'\t\r\n ']}" ] && echo "Step 1 POST Payload OK" || (echo "Payload did not match. Test failed." && exit 1)
[ "$OUT_STATUS_CODE" == "$STATUS_CODE_POST_1" ] && echo "Step 1 POST Status Code OK" || (echo "Step 1 POST status code did not match. Test failed." && exit 1)
[ "$OUT_CONTENT" == "$JSON_MIME" ] && echo "Step 1 POST MIME OK" || (echo " Step 1 POST MIME did not match. Test failed." && exit 1)

#Checking Step 1 PATCH on /patch
OUT_PATCH="`curl -s -X PATCH $SERVICE_URL/patch`"
OUT_STATUS_CODE=`curl -X PATCH -s -o /dev/null -I -w "%{http_code}" $SERVICE_URL/patch`
OUT_CONTENT=`curl -X PATCH -s -o /dev/null -I -w "%{content_type}" $SERVICE_URL/patch`
[ "${PAYLOAD_PATCH_1//[$'\t\r\n ']}" == "${OUT_PATCH//[$'\t\r\n ']}" ] && echo "Step 1 PATCH Payload OK" || (echo "Payload did not match. Test failed." && exit 1)
[ "$OUT_STATUS_CODE" == "$STATUS_CODE_PATCH_1" ] && echo "Step 1 PATCH Status Code OK" || (echo "Step 1 PATCH status code did not match. Test failed." && exit 1)
[ "$OUT_CONTENT" == "$TEXT_MIME" ] && echo "Step 1 PATCH MIME OK" || (echo " Step 1 PATCH MIME did not match. Test failed." && exit 1)
# wait for the next step
sleep 16

#Checking Step 2 GET on /list/index.php
OUT_GET="`curl -X GET -s $SERVICE_URL/list/index.php`"
OUT_CONTENT=`curl -X GET -s -o /dev/null -I -w "%{content_type}" $SERVICE_URL/list/index.php`
OUT_STATUS_CODE=`curl -X GET -s -o /dev/null -I -w "%{http_code}" $SERVICE_URL/list/index.php`
[ "${PAYLOAD_GET_2//[$'\t\r\n ']}" == "${OUT_GET//[$'\t\r\n ']}" ] && echo "Step 2 GET Payload OK" || (echo "Step 2 GET Payload did not match. Test failed." && exit 1)
[ "$OUT_STATUS_CODE" == "$STATUS_CODE_GET_2" ] && echo "Step 2 GET Status Code OK" || (echo "Step 2 GET status code did not match. Test failed." && exit 1)
[ "$OUT_CONTENT" == "$JSON_MIME" ] && echo "Step 2 GET MIME OK" || (echo " Step 2 GET MIME did not match. Test failed." && exit 1)

#Checking Step 2 POST on /list/index.php
OUT_POST="`curl -s -X POST $SERVICE_URL/list/index.php`"
OUT_CONTENT=`curl -X POST -s -o /dev/null -I -w "%{content_type}" $SERVICE_URL/list/index.php`
OUT_STATUS_CODE=`curl -X POST -s -o /dev/null -I -w "%{http_code}" $SERVICE_URL/list/index.php`
[ "${PAYLOAD_POST_2//[$'\t\r\n ']}" == "${OUT_POST//[$'\t\r\n ']}" ] && echo "Step 2 POST Payload OK" || (echo "Step 2 POST Payload did not match. Test failed." && exit 1)
[ "$OUT_STATUS_CODE" == "$STATUS_CODE_POST_2" ] && echo "Step 2 POST Status Code OK" || (echo "Step 2 POST status code did not match. Test failed." && exit 1)
[ "$OUT_CONTENT" == "$TEXT_MIME" ] && echo "Step 2 POST MIME OK" || (echo " Step 2 POST MIME did not match. Test failed." && exit 1)

#Checking Step 2 PATCH on /patch
OUT_PATCH="`curl -s -X PATCH $SERVICE_URL/patch`"
OUT_CONTENT=`curl -X PATCH -s -o /dev/null -I -w "%{content_type}" $SERVICE_URL/patch`
OUT_STATUS_CODE=`curl -X PATCH -s -o /dev/null -I -w "%{http_code}" $SERVICE_URL/patch`
[ "${PAYLOAD_PATCH_2//[$'\t\r\n ']}" == "${OUT_PATCH//[$'\t\r\n ']}" ] && echo "Step 2 PATCH Payload OK" || (echo "Step 2 PATCH Payload did not match. Test failed." && exit 1)
[ "$OUT_STATUS_CODE" == "$STATUS_CODE_PATCH_2" ] && echo "Step 2 PATCH Status Code OK" || (echo "Step 2 PATCH status code did not match. Test failed." && exit 1)
[ "$OUT_CONTENT" == "$TEXT_MIME" ] && echo "Step 2 PATCH MIME OK" || (echo " Step 2 PATCH MIME did not match. Test failed." && exit 1)
wait $PID

# now checking if service has been restore correctly and nginx responds correctly
curl -s $SERVICE_URL | grep nginx! && echo "BODY: Service restored!" || (echo "BODY: failed to restore service" && exit 1)
OUT_STATUS_CODE=`curl -X GET -s -o /dev/null -I -w "%{http_code}" $SERVICE_URL`
[ "$OUT_STATUS_CODE" == "200" ] && echo "STATUS_CODE: Service restored!" || (echo "STATUS_CODE: failed to restore service" && exit 1)

echo "Service Hijacking Chaos test: Success"
}


functional_test_service_hijacking
```
```diff
@@ -1,5 +1,7 @@
 # Krkn aka Kraken
 [badge image]
 [badge image]
 [badge image]
+
+[badge image]
```
```diff
@@ -73,6 +75,7 @@ Scenario type | Kubernetes
 [PVC scenario](docs/pvc_scenario.md) | :heavy_check_mark: |
 [Network_Chaos](docs/network_chaos.md) | :heavy_check_mark: |
 [ManagedCluster Scenarios](docs/managedcluster_scenarios.md) | :heavy_check_mark: |
+[Service Hijacking Scenarios](docs/service_hijacking_scenarios.md) | :heavy_check_mark: |


 ### Kraken scenario pass/fail criteria and report
```
```diff
@@ -42,6 +42,8 @@ kraken:
 - scenarios/openshift/pvc_scenario.yaml
 - network_chaos:
   - scenarios/openshift/network_chaos.yaml
+- service_hijacking:
+  - scenarios/kube/service_hijacking.yaml

 cerberus:
 cerberus_enabled: False # Enable it when cerberus is previously installed
```
```diff
@@ -1,28 +1,50 @@
 # Dockerfile for kraken

 # azure-client
 FROM mcr.microsoft.com/azure-cli:latest as azure-cli

-FROM registry.access.redhat.com/ubi8/ubi:latest
+# oc build
+FROM golang:1.22.4 AS oc-build
+RUN apt-get update && apt-get install -y libkrb5-dev
+WORKDIR /tmp
+RUN git clone --branch release-4.18 https://github.com/openshift/oc.git
+WORKDIR /tmp/oc
+RUN go mod edit -go 1.22.3 &&\
+    go get github.com/moby/buildkit@v0.12.5 &&\
+    go get github.com/containerd/containerd@v1.7.11&&\
+    go get github.com/docker/docker@v25.0.5&&\
+    go mod tidy && go mod vendor
+RUN make GO_REQUIRED_MIN_VERSION:= oc

-ENV KUBECONFIG /root/.kube/config
+FROM fedora:40
+RUN groupadd -g 1001 krkn && useradd -m -u 1001 -g krkn krkn
+RUN dnf update -y

-# Copy azure client binary from azure-cli image
+# krkn version that will be built
+ENV KRKN_VERSION v1.6.1
+
+ENV KUBECONFIG /home/krkn/.kube/config
+
+# install kubectl
+RUN curl -LO "https://dl.k8s.io/release/$(curl -L -s https://dl.k8s.io/release/stable.txt)/bin/linux/amd64/kubectl" &&\
+    cp kubectl /usr/local/bin/kubectl && chmod +x /usr/local/bin/kubectl &&\
+    cp kubectl /usr/bin/kubectl && chmod +x /usr/bin/kubectl
+
+# This overwrites any existing configuration in /etc/yum.repos.d/kubernetes.repo
+RUN dnf update && dnf install -y git python39 jq yq gettext wget which
+# copy azure client binary from azure-cli image
 COPY --from=azure-cli /usr/local/bin/az /usr/bin/az

-# Install dependencies
-RUN yum install -y git python39 python3-pip jq gettext wget && \
-    python3.9 -m pip install -U pip && \
-    git clone https://github.com/krkn-chaos/krkn.git --branch v1.5.13 /root/kraken && \
-    mkdir -p /root/.kube && cd /root/kraken && \
-    pip3.9 install -r requirements.txt && \
-    pip3.9 install virtualenv && \
-    wget https://github.com/mikefarah/yq/releases/latest/download/yq_linux_amd64 -O /usr/bin/yq && chmod +x /usr/bin/yq
+# copy oc client binary from oc-build image
+COPY --from=oc-build /tmp/oc/oc /usr/bin/oc

-# Get Kubernetes and OpenShift clients from stable releases
-WORKDIR /tmp
-RUN wget https://mirror.openshift.com/pub/openshift-v4/clients/ocp/stable/openshift-client-linux.tar.gz && tar -xvf openshift-client-linux.tar.gz && cp oc /usr/local/bin/oc && cp oc /usr/bin/oc && cp kubectl /usr/local/bin/kubectl && cp kubectl /usr/bin/kubectl

-WORKDIR /root/kraken
+# krkn build
+RUN git clone https://github.com/krkn-chaos/krkn.git --branch $KRKN_VERSION /home/krkn/kraken && \
+    mkdir -p /home/krkn/.kube
+WORKDIR /home/krkn/kraken
+RUN python3.9 -m ensurepip
+RUN pip3.9 install -r requirements.txt
+RUN pip3.9 install jsonschema
+
+RUN chown -R krkn:krkn /home/krkn
+USER krkn
 ENTRYPOINT ["python3.9", "run_kraken.py"]
-CMD ["--config=config/config.yaml"]
+CMD ["--config=config/config.yaml"]
```
```diff
@@ -14,7 +14,7 @@ COPY --from=azure-cli /usr/local/bin/az /usr/bin/az
 # Install dependencies
 RUN yum install -y git python39 python3-pip jq gettext wget && \
     python3.9 -m pip install -U pip && \
-    git clone https://github.com/redhat-chaos/krkn.git --branch v1.5.13 /root/kraken && \
+    git clone https://github.com/redhat-chaos/krkn.git --branch v1.5.14 /root/kraken && \
     mkdir -p /root/.kube && cd /root/kraken && \
     pip3.9 install -r requirements.txt && \
     pip3.9 install virtualenv && \
```
**docs/service_hijacking_scenarios.md** (new file, 80 lines)
@@ -0,0 +1,80 @@

### Service Hijacking Scenarios

Service Hijacking Scenarios aim to simulate fake HTTP responses from a workload targeted by a `Service` already deployed in the cluster. This scenario is executed by deploying a custom-made web service and modifying the target `Service` selector to direct traffic to this web service for a specified duration.

The web service's source code is available [here](https://github.com/krkn-chaos/krkn-service-hijacking). It employs a time-based test plan from the scenario configuration file, which specifies the behavior of resources during the chaos scenario as follows:

```yaml
service_target_port: http-web-svc # The port of the service to be hijacked (can be named or numeric, based on the workload and service configuration).
service_name: nginx-service # The name of the service that will be hijacked.
service_namespace: default # The namespace where the target service is located.
image: quay.io/krkn-chaos/krkn-service-hijacking:v0.1.3 # Image of the krkn web service to be deployed to receive traffic.
chaos_duration: 30 # Total duration of the chaos scenario in seconds.
plan:
  - resource: "/list/index.php" # Specifies the resource or path to respond to in the scenario. For paths, both the path and query parameters are captured but ignored. For resources, only query parameters are captured.
    steps: # A time-based plan consisting of steps can be defined for each resource.
      GET: # One or more HTTP methods can be specified for each step. Note: Non-standard methods are supported for fully custom web services (e.g., using NONEXISTENT instead of POST).
        - duration: 15 # Duration in seconds for this step before moving to the next one, if defined. Otherwise, this step will continue until the chaos scenario ends.
          status: 500 # HTTP status code to be returned in this step.
          mime_type: "application/json" # MIME type of the response for this step.
          payload: | # The response payload for this step.
            {
              "status":"internal server error"
            }
        - duration: 15
          status: 201
          mime_type: "application/json"
          payload: |
            {
              "status":"resource created"
            }
      POST:
        - duration: 15
          status: 401
          mime_type: "application/json"
          payload: |
            {
              "status": "unauthorized"
            }
        - duration: 15
          status: 404
          mime_type: "text/plain"
          payload: "not found"
```
The scenario will focus on the `service_name` within the `service_namespace`, substituting the selector with a randomly generated one, which is added as a label in the mock service manifest. This allows multiple scenarios to be executed in the same namespace, each targeting different services without causing conflicts; a sketch of the selector swap follows.
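To make the mechanism concrete, here is a minimal sketch of such a selector swap using the official `kubernetes` Python client (already in requirements.txt). This is not krkn's actual implementation, which goes through krkn_lib's service-hijacking helpers; the `swap_selector` function name and the `krkn-hijack` label are illustrative only.

```python
# Minimal sketch of the selector swap, assuming the official `kubernetes`
# Python client; krkn itself delegates this to krkn_lib, so the function
# name and the "krkn-hijack" label below are illustrative only.
from kubernetes import client, config


def swap_selector(name: str, namespace: str, new_selector: dict) -> dict:
    """Replace a Service's selector and return the original one."""
    config.load_kube_config()
    v1 = client.CoreV1Api()
    svc = v1.read_namespaced_service(name, namespace)
    original = dict(svc.spec.selector or {})
    svc.spec.selector = new_selector  # full replacement, not a merge
    v1.replace_namespaced_service(name, namespace, svc)
    return original  # kept so the service can be restored after chaos_duration


# Example: point nginx-service at a mock pod labeled with a random value.
# original = swap_selector("nginx-service", "default", {"krkn-hijack": "x7f3a9c"})
```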
The newly deployed mock web service will expose a `service_target_port`, which can be either a named or numeric port based on the service configuration. This ensures that the Service correctly routes HTTP traffic to the mock web service during the chaos run.
Each step will last for `duration` seconds from the deployment of the mock web service in the cluster. For each HTTP resource, defined as a top-level YAML property of the plan (it could be a specific resource, e.g., `/list/index.php`, or a path-based resource typical in MVC frameworks), one or more HTTP request methods can be specified. Both standard and custom request methods are supported.

During this time frame, the web service will respond with:

- `status`: The [HTTP status code](https://datatracker.ietf.org/doc/html/rfc7231#section-6) (can be standard or custom).
- `mime_type`: The [MIME type](https://developer.mozilla.org/en-US/docs/Web/HTTP/Basics_of_HTTP/MIME_types) (can be standard or custom).
- `payload`: The response body to be returned to the client.

At the end of the step `duration`, the web service will proceed to the next step (if available) until the global `chaos_duration` concludes. At this point, the original service will be restored, and the custom web service and its resources will be undeployed. A minimal sketch of this step resolution follows.
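The sketch below shows one way such a time-based plan can be resolved, assuming the plan structure from the example above. It is not the actual krkn-service-hijacking code; the `resolve_step` name is illustrative only.

```python
# Minimal sketch of time-based step resolution: given the seconds elapsed
# since the mock web service was deployed, pick the step whose time window
# contains the current instant. Not the actual krkn-service-hijacking code.
import time


def resolve_step(steps: list[dict], deployed_at: float) -> dict:
    """Return the plan step that should answer a request arriving now.

    `steps` is the list under a method key, e.g. plan[0]["steps"]["GET"];
    each step carries `duration`, `status`, `mime_type` and `payload`.
    """
    elapsed = time.time() - deployed_at
    for step in steps:
        if elapsed < step["duration"]:
            return step
        elapsed -= step["duration"]
    # the last step keeps answering until chaos_duration ends
    return steps[-1]


# 20 seconds into the GET plan above (two 15-second steps) -> second step.
get_steps = [
    {"duration": 15, "status": 500, "mime_type": "application/json",
     "payload": '{"status":"internal server error"}'},
    {"duration": 15, "status": 201, "mime_type": "application/json",
     "payload": '{"status":"resource created"}'},
]
print(resolve_step(get_steps, time.time() - 20)["status"])  # prints 201
```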
__NOTE__: Some clients (e.g., cURL, jQuery) may optimize queries using lightweight methods (like HEAD or OPTIONS) to probe API behavior. If these methods are not defined in the test plan, the web service may respond with a `405` or `404` status code. If you encounter unexpected behavior, consider this use case; the probe sketch below always sends explicit methods.
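For a quick external check while the scenario runs, here is a hedged probe sketch using `requests`, which sends exactly the method you ask for. The base URL assumes the CI kind port mapping (nodePort 30036 exposed as localhost:8888), and the expected codes follow step 1 of the example plan above.

```python
# Hedged probe sketch: exercise the hijacked endpoints with explicit HTTP
# methods so no client-side HEAD/OPTIONS optimization kicks in. The base URL
# assumes the CI kind-config port mapping (nodePort 30036 -> localhost:8888),
# and the expected codes match step 1 of the example plan above.
import requests

BASE = "http://localhost:8888"

for method, path, expected in [
    ("GET", "/list/index.php", 500),   # step 1 of the GET plan
    ("POST", "/list/index.php", 401),  # step 1 of the POST plan
]:
    resp = requests.request(method, f"{BASE}{path}")
    print(method, path, resp.status_code, resp.headers.get("Content-Type"))
    assert resp.status_code == expected
```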
```diff
@@ -2,6 +2,9 @@ kind: Cluster
 apiVersion: kind.x-k8s.io/v1alpha4
 nodes:
 - role: control-plane
+  extraPortMappings:
+  - containerPort: 30036
+    hostPort: 8888
 - role: control-plane
 - role: control-plane
 - role: worker
```
```diff
@@ -19,7 +19,7 @@ def run(scenarios_list, config, wait_duration,kubecli: KrknKubernetes, telemetry
 for app_outage_config in scenarios_list:
     scenario_telemetry = ScenarioTelemetry()
     scenario_telemetry.scenario = app_outage_config
-    scenario_telemetry.startTimeStamp = time.time()
+    scenario_telemetry.start_timestamp = time.time()
     telemetry.set_parameters_base64(scenario_telemetry, app_outage_config)
     if len(app_outage_config) > 1:
         try:
@@ -73,12 +73,12 @@ spec:
 end_time = int(time.time())
 cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time)
 except Exception as e :
-    scenario_telemetry.exitStatus = 1
+    scenario_telemetry.exit_status = 1
     failed_scenarios.append(app_outage_config)
     log_exception(app_outage_config)
 else:
-    scenario_telemetry.exitStatus = 0
-    scenario_telemetry.endTimeStamp = time.time()
+    scenario_telemetry.exit_status = 0
+    scenario_telemetry.end_timestamp = time.time()
 scenario_telemetries.append(scenario_telemetry)
 return failed_scenarios, scenario_telemetries
```
```diff
@@ -16,12 +16,12 @@ def run(scenarios_list: List[str], kubeconfig_path: str, telemetry: KrknTelemetry
 for scenario in scenarios_list:
     scenario_telemetry = ScenarioTelemetry()
     scenario_telemetry.scenario = scenario
-    scenario_telemetry.startTimeStamp = time.time()
+    scenario_telemetry.start_timestamp = time.time()
     telemetry.set_parameters_base64(scenario_telemetry,scenario)
     engine_args = build_args(scenario)
     status_code = run_workflow(engine_args, kubeconfig_path)
-    scenario_telemetry.endTimeStamp = time.time()
-    scenario_telemetry.exitStatus = status_code
+    scenario_telemetry.end_timestamp = time.time()
+    scenario_telemetry.exit_status = status_code
     scenario_telemetries.append(scenario_telemetry)
     if status_code != 0:
         failed_post_scenarios.append(scenario)
@@ -36,9 +36,10 @@ def run_workflow(engine_args: arcaflow.EngineArgs, kubeconfig_path: str) -> int:

 def build_args(input_file: str) -> arcaflow.EngineArgs:
     """sets the kubeconfig parsed by setArcaKubeConfig as an input to the arcaflow workflow"""
-    context = Path(input_file).parent
-    workflow = "{}/workflow.yaml".format(context)
-    config = "{}/config.yaml".format(context)
+    current_path = Path().resolve()
+    context = f"{current_path}/{Path(input_file).parent}"
+    workflow = f"{context}/workflow.yaml"
+    config = f"{context}/config.yaml"
     if not os.path.exists(context):
         raise Exception(
             "context folder for arcaflow workflow not found: {}".format(
@@ -61,7 +62,8 @@ def build_args(input_file: str) -> arcaflow.EngineArgs:
 engine_args = arcaflow.EngineArgs()
 engine_args.context = context
 engine_args.config = config
-engine_args.input = input_file
 engine_args.workflow = workflow
+engine_args.input = f"{current_path}/{input_file}"
 return engine_args
```
```diff
@@ -17,16 +17,41 @@ def convert_data_to_dataframe(data, label):

 def convert_data(data, service):
     result = {}
     for entry in data:
         pod_name = entry['metric']['pod']
         value = entry['value'][1]
         result[pod_name] = value
-    return result.get(service, '100000000000') # for those pods whose limits are not defined they can take as much resources, there assigning a very high value
+    return result.get(service)


-def save_utilization_to_file(utilization, filename):
+def convert_data_limits(data, node_data, service, prometheus):
+    result = {}
+    for entry in data:
+        pod_name = entry['metric']['pod']
+        value = entry['value'][1]
+        result[pod_name] = value
+    return result.get(service, get_node_capacity(node_data, service, prometheus)) # for those pods whose limits are not defined they can take as much resources, there assigning a very high value
+
+def get_node_capacity(node_data, pod_name, prometheus):
+    # Get the node name on which the pod is running
+    query = f'kube_pod_info{{pod="{pod_name}"}}'
+    result = prometheus.custom_query(query)
+    if not result:
+        return None
+
+    node_name = result[0]['metric']['node']
+
+    for item in node_data:
+        if item['metric']['node'] == node_name:
+            return item['value'][1]
+
+    return '1000000000'
+
+def save_utilization_to_file(utilization, filename, prometheus):
 merged_df = pd.DataFrame(columns=['namespace', 'service', 'CPU', 'CPU_LIMITS', 'MEM', 'MEM_LIMITS', 'NETWORK'])
 for namespace in utilization:
     # Loading utilization_data[] for namespace
@@ -41,9 +66,9 @@ def save_utilization_to_file(utilization, filename):
 new_row_df = pd.DataFrame({
     "namespace": namespace, "service": s,
     "CPU": convert_data(utilization_data[0], s),
-    "CPU_LIMITS": convert_data(utilization_data[1], s),
+    "CPU_LIMITS": convert_data_limits(utilization_data[1],utilization_data[5], s, prometheus),
     "MEM": convert_data(utilization_data[2], s),
-    "MEM_LIMITS": convert_data(utilization_data[3], s),
+    "MEM_LIMITS": convert_data_limits(utilization_data[3], utilization_data[6], s, prometheus),
     "NETWORK": convert_data(utilization_data[4], s)}, index=[0])
 merged_df = pd.concat([merged_df, new_row_df], ignore_index=True)
@@ -55,11 +80,11 @@ def save_utilization_to_file(utilization, filename):
 merged_df['NETWORK'] = merged_df['NETWORK'].astype(str)

 # Extract integer part before the decimal point
-merged_df['CPU'] = merged_df['CPU'].str.split('.').str[0]
-merged_df['MEM'] = merged_df['MEM'].str.split('.').str[0]
-merged_df['CPU_LIMITS'] = merged_df['CPU_LIMITS'].str.split('.').str[0]
-merged_df['MEM_LIMITS'] = merged_df['MEM_LIMITS'].str.split('.').str[0]
-merged_df['NETWORK'] = merged_df['NETWORK'].str.split('.').str[0]
+#merged_df['CPU'] = merged_df['CPU'].str.split('.').str[0]
+#merged_df['MEM'] = merged_df['MEM'].str.split('.').str[0]
+#merged_df['CPU_LIMITS'] = merged_df['CPU_LIMITS'].str.split('.').str[0]
+#merged_df['MEM_LIMITS'] = merged_df['MEM_LIMITS'].str.split('.').str[0]
+#merged_df['NETWORK'] = merged_df['NETWORK'].str.split('.').str[0]

 merged_df.to_csv(filename, sep='\t', index=False)
@@ -84,20 +109,27 @@ def fetch_utilization_from_prometheus(prometheus_endpoint, auth_token,
 cpu_limits_query = '(sum by (pod) (kube_pod_container_resource_limits{resource="cpu", namespace="%s"}))*1000' %(namespace)
 cpu_limits_result = prometheus.custom_query(cpu_limits_query)

+node_cpu_limits_query = 'kube_node_status_capacity{resource="cpu", unit="core"}*1000'
+node_cpu_limits_result = prometheus.custom_query(node_cpu_limits_query)
+
 mem_query = 'sum by (pod) (avg_over_time(container_memory_usage_bytes{image!="", namespace="%s"}[%s]))' % (namespace, scrape_duration)
 mem_result = prometheus.custom_query(mem_query)

 mem_limits_query = 'sum by (pod) (kube_pod_container_resource_limits{resource="memory", namespace="%s"}) ' %(namespace)
 mem_limits_result = prometheus.custom_query(mem_limits_query)

+node_mem_limits_query = 'kube_node_status_capacity{resource="memory", unit="byte"}'
+node_mem_limits_result = prometheus.custom_query(node_mem_limits_query)
+
 network_query = 'sum by (pod) ((avg_over_time(container_network_transmit_bytes_total{namespace="%s"}[%s])) + \
     (avg_over_time(container_network_receive_bytes_total{namespace="%s"}[%s])))' % (namespace, scrape_duration, namespace, scrape_duration)
 network_result = prometheus.custom_query(network_query)

-utilization[namespace] = [cpu_result, cpu_limits_result, mem_result, mem_limits_result, network_result]
+utilization[namespace] = [cpu_result, cpu_limits_result, mem_result, mem_limits_result, network_result, node_cpu_limits_result, node_mem_limits_result ]
 queries[namespace] = json_queries(cpu_query, cpu_limits_query, mem_query, mem_limits_query, network_query)

-save_utilization_to_file(utilization, saved_metrics_path)
+save_utilization_to_file(utilization, saved_metrics_path, prometheus)

 return saved_metrics_path, queries
```
```diff
@@ -23,7 +23,7 @@ def run(scenarios_list, config, wait_duration, kubecli: KrknKubernetes, telemetry
 for net_config in scenarios_list:
     scenario_telemetry = ScenarioTelemetry()
     scenario_telemetry.scenario = net_config
-    scenario_telemetry.startTimeStamp = time.time()
+    scenario_telemetry.start_timestamp = time.time()
     telemetry.set_parameters_base64(scenario_telemetry, net_config)
     try:
         with open(net_config, "r") as file:
@@ -114,11 +114,11 @@ def run(scenarios_list, config, wait_duration, kubecli: KrknKubernetes, telemetry
 logging.info("Deleting jobs")
 delete_job(joblst[:], kubecli)
 except (RuntimeError, Exception):
-    scenario_telemetry.exitStatus = 1
+    scenario_telemetry.exit_status = 1
     failed_scenarios.append(net_config)
     log_exception(net_config)
 else:
-    scenario_telemetry.exitStatus = 0
+    scenario_telemetry.exit_status = 0
 scenario_telemetries.append(scenario_telemetry)
 return failed_scenarios, scenario_telemetries
```
```diff
@@ -15,7 +15,7 @@ import kraken.cerberus.setup as cerberus
 from krkn_lib.k8s import KrknKubernetes
 from krkn_lib.telemetry.k8s import KrknTelemetryKubernetes
 from krkn_lib.models.telemetry import ScenarioTelemetry
-from krkn_lib.utils.functions import get_yaml_item_value
+from krkn_lib.utils.functions import get_yaml_item_value, log_exception

 node_general = False

@@ -61,7 +61,7 @@ def run(scenarios_list, config, wait_duration, kubecli: KrknKubernetes, telemetry
 for node_scenario_config in scenarios_list:
     scenario_telemetry = ScenarioTelemetry()
     scenario_telemetry.scenario = node_scenario_config
-    scenario_telemetry.startTimeStamp = time.time()
+    scenario_telemetry.start_timestamp = time.time()
     telemetry.set_parameters_base64(scenario_telemetry, node_scenario_config)
     with open(node_scenario_config, "r") as f:
         node_scenario_config = yaml.full_load(f)
@@ -78,13 +78,13 @@ def run(scenarios_list, config, wait_duration, kubecli: KrknKubernetes, telemetry
 cerberus.get_status(config, start_time, end_time)
 logging.info("")
 except (RuntimeError, Exception) as e:
-    scenario_telemetry.exitStatus = 1
+    scenario_telemetry.exit_status = 1
     failed_scenarios.append(node_scenario_config)
     log_exception(node_scenario_config)
 else:
-    scenario_telemetry.exitStatus = 0
+    scenario_telemetry.exit_status = 0

-scenario_telemetry.endTimeStamp = time.time()
+scenario_telemetry.end_timestamp = time.time()
 scenario_telemetries.append(scenario_telemetry)

 return failed_scenarios, scenario_telemetries
```
```diff
@@ -260,7 +260,7 @@ def run(scenarios: List[str],
 for scenario in scenarios:
     scenario_telemetry = ScenarioTelemetry()
     scenario_telemetry.scenario = scenario
-    scenario_telemetry.startTimeStamp = time.time()
+    scenario_telemetry.start_timestamp = time.time()
     telemetry.set_parameters_base64(scenario_telemetry, scenario)
     logging.info('scenario ' + str(scenario))
     pool = PodsMonitorPool(kubecli)
@@ -276,16 +276,16 @@ def run(scenarios: List[str],

 except Exception as e:
     logging.error(f"scenario exception: {str(e)}")
-    scenario_telemetry.exitStatus = 1
+    scenario_telemetry.exit_status = 1
     pool.cancel()
     failed_post_scenarios.append(scenario)
     log_exception(scenario)
 else:
-    scenario_telemetry.exitStatus = 0
+    scenario_telemetry.exit_status = 0
     logging.info("Waiting for the specified duration: %s" % (wait_duration))
     time.sleep(wait_duration)
     scenario_telemetries.append(scenario_telemetry)
-    scenario_telemetry.endTimeStamp = time.time()
+    scenario_telemetry.end_timestamp = time.time()

 return failed_post_scenarios, scenario_telemetries
```
```diff
@@ -119,11 +119,11 @@ class vSphere:
 vm = self.get_vm(instance_id)
 try:
     self.client.vcenter.vm.Power.stop(vm)
-    logging.info("Stopped VM -- '{}-({})'", instance_id, vm)
+    logging.info(f"Stopped VM -- '{instance_id}-({vm})'")
     return True
 except AlreadyInDesiredState:
     logging.info(
-        "VM '{}'-'({})' is already Powered Off", instance_id, vm
+        f"VM '{instance_id}'-'({vm})' is already Powered Off"
     )
     return False
@@ -136,11 +136,11 @@ class vSphere:
 vm = self.get_vm(instance_id)
 try:
     self.client.vcenter.vm.Power.start(vm)
-    logging.info("Started VM -- '{}-({})'", instance_id, vm)
+    logging.info(f"Started VM -- '{instance_id}-({vm})'")
     return True
 except AlreadyInDesiredState:
     logging.info(
-        "VM '{}'-'({})' is already Powered On", instance_id, vm
+        f"VM '{instance_id}'-'({vm})' is already Powered On"
     )
     return False
@@ -318,12 +318,12 @@ class vSphere:
 try:
     vm = self.get_vm(instance_id)
     state = self.client.vcenter.vm.Power.get(vm).state
-    logging.info("Check instance %s status", instance_id)
+    logging.info(f"Check instance {instance_id} status")
     return state
 except Exception as e:
     logging.error(
-        "Failed to get node instance status %s. Encountered following "
-        "exception: %s.", instance_id, e
+        f"Failed to get node instance status {instance_id}. Encountered following "
+        f"exception: {str(e)}. "
     )
     return None
@@ -338,16 +338,14 @@ class vSphere:
 while vm is not None:
     vm = self.get_vm(instance_id)
     logging.info(
-        "VM %s is still being deleted, "
-        "sleeping for 5 seconds",
-        instance_id
+        f"VM {instance_id} is still being deleted, "
+        f"sleeping for 5 seconds"
     )
     time.sleep(5)
     time_counter += 5
     if time_counter >= timeout:
         logging.info(
-            "VM %s is still not deleted in allotted time",
-            instance_id
+            f"VM {instance_id} is still not deleted in allotted time"
         )
         return False
 return True
@@ -371,8 +369,7 @@ class vSphere:
 time_counter += 5
 if time_counter >= timeout:
     logging.info(
-        "VM %s is still not ready in allotted time",
-        instance_id
+        f"VM {instance_id} is still not ready in allotted time"
     )
     return False
 return True
@@ -388,16 +385,14 @@ class vSphere:
 while status != Power.State.POWERED_OFF:
     status = self.get_vm_status(instance_id)
     logging.info(
-        "VM %s is still not running, "
-        "sleeping for 5 seconds",
-        instance_id
+        f"VM {instance_id} is still not running, "
+        f"sleeping for 5 seconds"
     )
     time.sleep(5)
     time_counter += 5
     if time_counter >= timeout:
         logging.info(
-            "VM %s is still not ready in allotted time",
-            instance_id
+            f"VM {instance_id} is still not ready in allotted time"
         )
         return False
 return True
@@ -561,7 +556,7 @@ def node_start(
 try:
     for _ in range(cfg.runs):
         logging.info("Starting node_start_scenario injection")
-        logging.info("Starting the node %s ", name)
+        logging.info(f"Starting the node {name} ")
         vm_started = vsphere.start_instances(name)
         if vm_started:
             vsphere.wait_until_running(name, cfg.timeout)
@@ -571,7 +566,7 @@ def node_start(
 )
 nodes_started[int(time.time_ns())] = Node(name=name)
 logging.info(
-    "Node with instance ID: %s is in running state", name
+    f"Node with instance ID: {name} is in running state"
 )
 logging.info(
     "node_start_scenario has been successfully injected!"
@@ -579,8 +574,8 @@ def node_start(
 except Exception as e:
     logging.error("Failed to start node instance. Test Failed")
     logging.error(
-        "node_start_scenario injection failed! "
-        "Error was: %s", str(e)
+        f"node_start_scenario injection failed! "
+        f"Error was: {str(e)}"
     )
     return "error", NodeScenarioErrorOutput(
         format_exc(), kube_helper.Actions.START
@@ -620,7 +615,7 @@ def node_stop(
 try:
     for _ in range(cfg.runs):
         logging.info("Starting node_stop_scenario injection")
-        logging.info("Stopping the node %s ", name)
+        logging.info(f"Stopping the node {name} ")
         vm_stopped = vsphere.stop_instances(name)
         if vm_stopped:
             vsphere.wait_until_stopped(name, cfg.timeout)
@@ -630,7 +625,7 @@ def node_stop(
 )
 nodes_stopped[int(time.time_ns())] = Node(name=name)
 logging.info(
-    "Node with instance ID: %s is in stopped state", name
+    f"Node with instance ID: {name} is in stopped state"
 )
 logging.info(
     "node_stop_scenario has been successfully injected!"
@@ -638,8 +633,8 @@ def node_stop(
 except Exception as e:
     logging.error("Failed to stop node instance. Test Failed")
     logging.error(
-        "node_stop_scenario injection failed! "
-        "Error was: %s", str(e)
+        f"node_stop_scenario injection failed! "
+        f"Error was: {str(e)}"
     )
     return "error", NodeScenarioErrorOutput(
         format_exc(), kube_helper.Actions.STOP
@@ -679,7 +674,7 @@ def node_reboot(
 try:
     for _ in range(cfg.runs):
         logging.info("Starting node_reboot_scenario injection")
-        logging.info("Rebooting the node %s ", name)
+        logging.info(f"Rebooting the node {name} ")
         vsphere.reboot_instances(name)
         if not cfg.skip_openshift_checks:
             kube_helper.wait_for_unknown_status(
@@ -690,8 +685,8 @@ def node_reboot(
 )
 nodes_rebooted[int(time.time_ns())] = Node(name=name)
 logging.info(
-    "Node with instance ID: %s has rebooted "
-    "successfully", name
+    f"Node with instance ID: {name} has rebooted "
+    "successfully"
 )
 logging.info(
     "node_reboot_scenario has been successfully injected!"
@@ -699,8 +694,8 @@ def node_reboot(
 except Exception as e:
     logging.error("Failed to reboot node instance. Test Failed")
     logging.error(
-        "node_reboot_scenario injection failed! "
-        "Error was: %s", str(e)
+        f"node_reboot_scenario injection failed! "
+        f"Error was: {str(e)}"
     )
     return "error", NodeScenarioErrorOutput(
         format_exc(), kube_helper.Actions.REBOOT
@@ -739,13 +734,13 @@ def node_terminate(
 vsphere.stop_instances(name)
 vsphere.wait_until_stopped(name, cfg.timeout)
 logging.info(
-    "Releasing the node with instance ID: %s ", name
+    f"Releasing the node with instance ID: {name} "
 )
 vsphere.release_instances(name)
 vsphere.wait_until_released(name, cfg.timeout)
 nodes_terminated[int(time.time_ns())] = Node(name=name)
 logging.info(
-    "Node with instance ID: %s has been released", name
+    f"Node with instance ID: {name} has been released"
 )
 logging.info(
     "node_terminate_scenario has been "
@@ -754,8 +749,8 @@ def node_terminate(
 except Exception as e:
     logging.error("Failed to terminate node instance. Test Failed")
     logging.error(
-        "node_terminate_scenario injection failed! "
-        "Error was: %s", str(e)
+        f"node_terminate_scenario injection failed! "
+        f"Error was: {str(e)}"
     )
     return "error", NodeScenarioErrorOutput(
         format_exc(), kube_helper.Actions.TERMINATE
```
```diff
@@ -88,7 +88,7 @@ def container_run(kubeconfig_path,
 for container_scenario_config in scenarios_list:
     scenario_telemetry = ScenarioTelemetry()
     scenario_telemetry.scenario = container_scenario_config[0]
-    scenario_telemetry.startTimeStamp = time.time()
+    scenario_telemetry.start_timestamp = time.time()
     telemetry.set_parameters_base64(scenario_telemetry, container_scenario_config[0])
     if len(container_scenario_config) > 1:
         pre_action_output = post_actions.run(kubeconfig_path, container_scenario_config[1])
@@ -119,12 +119,12 @@ def container_run(kubeconfig_path,
 pool.cancel()
 failed_scenarios.append(container_scenario_config[0])
 log_exception(container_scenario_config[0])
-scenario_telemetry.exitStatus = 1
+scenario_telemetry.exit_status = 1
 # removed_exit
 # sys.exit(1)
 else:
-    scenario_telemetry.exitStatus = 0
-    scenario_telemetry.endTimeStamp = time.time()
+    scenario_telemetry.exit_status = 0
+    scenario_telemetry.end_timestamp = time.time()
 scenario_telemetries.append(scenario_telemetry)

 return failed_scenarios, scenario_telemetries
```
```diff
@@ -11,7 +11,7 @@ from krkn_lib.utils.functions import get_yaml_item_value, log_exception


 # krkn_lib
-def run(scenarios_list, config, kubecli: KrknKubernetes, telemetry: KrknTelemetryKubernetes) -> (list[str], list[ScenarioTelemetry]):
+def run(scenarios_list, config, wait_duration, kubecli: KrknKubernetes, telemetry: KrknTelemetryKubernetes) -> (list[str], list[ScenarioTelemetry]):
     """
     Reads the scenario config and creates a temp file to fill up the PVC
     """
@@ -21,7 +21,7 @@ def run(scenarios_list, config, kubecli: KrknKubernetes, telemetry: KrknTelemetr
 for app_config in scenarios_list:
     scenario_telemetry = ScenarioTelemetry()
     scenario_telemetry.scenario = app_config
-    scenario_telemetry.startTimeStamp = time.time()
+    scenario_telemetry.start_timestamp = time.time()
     telemetry.set_parameters_base64(scenario_telemetry, app_config)
     try:
         if len(app_config) > 1:
@@ -305,7 +305,9 @@ def run(scenarios_list, config, kubecli: KrknKubernetes, telemetry: KrknTelemetr
 file_size_kb,
 kubecli
 )
+
+logging.info("End of scenario. Waiting for the specified duration: %s" % (wait_duration))
+time.sleep(wait_duration)

 end_time = int(time.time())
 cerberus.publish_kraken_status(
     config,
@@ -314,11 +316,11 @@ def run(scenarios_list, config, kubecli: KrknKubernetes, telemetry: KrknTelemetr
 end_time
 )
 except (RuntimeError, Exception):
-    scenario_telemetry.exitStatus = 1
+    scenario_telemetry.exit_status = 1
     failed_scenarios.append(app_config)
     log_exception(app_config)
 else:
-    scenario_telemetry.exitStatus = 0
+    scenario_telemetry.exit_status = 0
 scenario_telemetries.append(scenario_telemetry)

 return failed_scenarios, scenario_telemetries
```
```diff
@@ -165,7 +165,7 @@ def run(
 for scenario_config in scenarios_list:
     scenario_telemetry = ScenarioTelemetry()
     scenario_telemetry.scenario = scenario_config[0]
-    scenario_telemetry.startTimeStamp = time.time()
+    scenario_telemetry.start_timestamp = time.time()
     telemetry.set_parameters_base64(scenario_telemetry, scenario_config[0])
     try:
         if len(scenario_config) > 1:
@@ -249,12 +249,12 @@ def run(
 end_time = int(time.time())
 cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time)
 except (Exception, RuntimeError):
-    scenario_telemetry.exitStatus = 1
+    scenario_telemetry.exit_status = 1
     failed_scenarios.append(scenario_config[0])
     log_exception(scenario_config[0])
 else:
-    scenario_telemetry.exitStatus = 0
-    scenario_telemetry.endTimeStamp = time.time()
+    scenario_telemetry.exit_status = 0
+    scenario_telemetry.end_timestamp = time.time()
 scenario_telemetries.append(scenario_telemetry)
 return failed_scenarios, scenario_telemetries
```
**kraken/service_hijacking/__init__.py** (new file, empty)
**kraken/service_hijacking/service_hijacking.py** (new file, 90 lines)
@@ -0,0 +1,90 @@
```python
import logging
import time

import yaml
from krkn_lib.k8s import KrknKubernetes
from krkn_lib.models.telemetry import ScenarioTelemetry
from krkn_lib.telemetry.k8s import KrknTelemetryKubernetes


def run(scenarios_list: list[str], wait_duration: int, krkn_lib: KrknKubernetes, telemetry: KrknTelemetryKubernetes) -> (list[str], list[ScenarioTelemetry]):
    scenario_telemetries = list[ScenarioTelemetry]()
    failed_post_scenarios = []
    for scenario in scenarios_list:
        scenario_telemetry = ScenarioTelemetry()
        scenario_telemetry.scenario = scenario
        scenario_telemetry.start_timestamp = time.time()
        telemetry.set_parameters_base64(scenario_telemetry, scenario)
        with open(scenario) as stream:
            scenario_config = yaml.safe_load(stream)

        service_name = scenario_config['service_name']
        service_namespace = scenario_config['service_namespace']
        plan = scenario_config["plan"]
        image = scenario_config["image"]
        target_port = scenario_config["service_target_port"]
        chaos_duration = scenario_config["chaos_duration"]

        logging.info(f"checking service {service_name} in namespace: {service_namespace}")
        if not krkn_lib.service_exists(service_name, service_namespace):
            logging.error(f"service: {service_name} not found in namespace: {service_namespace}, failed to run scenario.")
            fail(scenario_telemetry, scenario_telemetries)
            failed_post_scenarios.append(scenario)
            break
        try:
            logging.info(f"service: {service_name} found in namespace: {service_namespace}")
            logging.info(f"creating webservice and initializing test plan...")
            # both named ports and port numbers can be used
            if isinstance(target_port, int):
                logging.info(f"webservice will listen on port {target_port}")
                webservice = krkn_lib.deploy_service_hijacking(service_namespace, plan, image, port_number=target_port)
            else:
                logging.info(f"traffic will be redirected to named port: {target_port}")
                webservice = krkn_lib.deploy_service_hijacking(service_namespace, plan, image, port_name=target_port)
            logging.info(f"successfully deployed pod: {webservice.pod_name} "
                         f"in namespace:{service_namespace} with selector {webservice.selector}!"
                         )
            logging.info(f"patching service: {service_name} to hijack traffic towards: {webservice.pod_name}")
            original_service = krkn_lib.replace_service_selector([webservice.selector], service_name, service_namespace)
            if original_service is None:
                logging.error(f"failed to patch service: {service_name}, namespace: {service_namespace} with selector {webservice.selector}")
                fail(scenario_telemetry, scenario_telemetries)
                failed_post_scenarios.append(scenario)
                break

            logging.info(f"service: {service_name} successfully patched!")
            logging.info(f"original service manifest:\n\n{yaml.dump(original_service)}")
            logging.info(f"waiting {chaos_duration} before restoring the service")
            time.sleep(chaos_duration)
            selectors = ["=".join([key, original_service["spec"]["selector"][key]]) for key in original_service["spec"]["selector"].keys()]
            logging.info(f"restoring the service selectors {selectors}")
            original_service = krkn_lib.replace_service_selector(selectors, service_name, service_namespace)
            if original_service is None:
                logging.error(f"failed to restore original service: {service_name}, namespace: {service_namespace} with selectors: {selectors}")
                fail(scenario_telemetry, scenario_telemetries)
                failed_post_scenarios.append(scenario)
                break
            logging.info("selectors successfully restored")
            logging.info("undeploying service-hijacking resources...")
            krkn_lib.undeploy_service_hijacking(webservice)

            logging.info("End of scenario. Waiting for the specified duration: %s" % (wait_duration))
            time.sleep(wait_duration)

            scenario_telemetry.exit_status = 0
            scenario_telemetry.end_timestamp = time.time()
            scenario_telemetries.append(scenario_telemetry)
            logging.info("success")
        except Exception as e:
            logging.error(f"scenario {scenario} failed with exception: {e}")
            fail(scenario_telemetry, scenario_telemetries)
            failed_post_scenarios.append(scenario)

    return failed_post_scenarios, scenario_telemetries


def fail(scenario_telemetry: ScenarioTelemetry, scenario_telemetries: list[ScenarioTelemetry]):
    scenario_telemetry.exit_status = 1
    scenario_telemetry.end_timestamp = time.time()
    scenario_telemetries.append(scenario_telemetry)
```
```diff
@@ -147,7 +147,7 @@ def run(scenarios_list, config, wait_duration, kubecli: KrknKubernetes, telemetry
 scenario_telemetry = ScenarioTelemetry()
 scenario_telemetry.scenario = config_path
-scenario_telemetry.startTimeStamp = time.time()
+scenario_telemetry.start_timestamp = time.time()
 telemetry.set_parameters_base64(scenario_telemetry, config_path)

 with open(config_path, "r") as f:
@@ -175,11 +175,11 @@ def run(scenarios_list, config, wait_duration, kubecli: KrknKubernetes, telemetry
 except (RuntimeError, Exception):
     log_exception(config_path)
     failed_scenarios.append(config_path)
-    scenario_telemetry.exitStatus = 1
+    scenario_telemetry.exit_status = 1
 else:
-    scenario_telemetry.exitStatus = 0
+    scenario_telemetry.exit_status = 0

-scenario_telemetry.endTimeStamp = time.time()
+scenario_telemetry.end_timestamp = time.time()
 scenario_telemetries.append(scenario_telemetry)

 return failed_scenarios, scenario_telemetries
```
```diff
@@ -354,7 +354,7 @@ def run(scenarios_list, config, wait_duration, kubecli:KrknKubernetes, telemetry
 for time_scenario_config in scenarios_list:
     scenario_telemetry = ScenarioTelemetry()
     scenario_telemetry.scenario = time_scenario_config
-    scenario_telemetry.startTimeStamp = time.time()
+    scenario_telemetry.start_timestamp = time.time()
     telemetry.set_parameters_base64(scenario_telemetry, time_scenario_config)
     try:
         with open(time_scenario_config, "r") as f:
@@ -377,12 +377,12 @@ def run(scenarios_list, config, wait_duration, kubecli:KrknKubernetes, telemetry
 end_time
 )
 except (RuntimeError, Exception):
-    scenario_telemetry.exitStatus = 1
+    scenario_telemetry.exit_status = 1
     log_exception(time_scenario_config)
     failed_scenarios.append(time_scenario_config)
 else:
-    scenario_telemetry.exitStatus = 0
-    scenario_telemetry.endTimeStamp = time.time()
+    scenario_telemetry.exit_status = 0
+    scenario_telemetry.end_timestamp = time.time()
 scenario_telemetries.append(scenario_telemetry)

 return failed_scenarios, scenario_telemetries
```
```diff
@@ -19,7 +19,7 @@ def run(scenarios_list, config, wait_duration, telemetry: KrknTelemetryKubernetes
 for zone_outage_config in scenarios_list:
     scenario_telemetry = ScenarioTelemetry()
     scenario_telemetry.scenario = zone_outage_config
-    scenario_telemetry.startTimeStamp = time.time()
+    scenario_telemetry.start_timestamp = time.time()
     telemetry.set_parameters_base64(scenario_telemetry, zone_outage_config)
     try:
         if len(zone_outage_config) > 1:
@@ -110,12 +110,12 @@ def run(scenarios_list, config, wait_duration, telemetry: KrknTelemetryKubernetes
 end_time
 )
 except (RuntimeError, Exception):
-    scenario_telemetry.exitStatus = 1
+    scenario_telemetry.exit_status = 1
     failed_scenarios.append(zone_outage_config)
     log_exception(zone_outage_config)
 else:
-    scenario_telemetry.exitStatus = 0
-    scenario_telemetry.endTimeStamp = time.time()
+    scenario_telemetry.exit_status = 0
+    scenario_telemetry.end_timestamp = time.time()
 scenario_telemetries.append(scenario_telemetry)
 return failed_scenarios, scenario_telemetries
```
@@ -1,9 +1,9 @@
|
||||
aliyun-python-sdk-core==2.13.36
|
||||
aliyun-python-sdk-ecs==4.24.25
|
||||
arcaflow==0.9.0
|
||||
arcaflow==0.17.2
|
||||
arcaflow-plugin-sdk==0.10.0
|
||||
boto3==1.28.61
|
||||
azure-identity==1.15.0
|
||||
azure-identity==1.16.1
|
||||
azure-keyvault==4.2.0
|
||||
azure-mgmt-compute==30.5.0
|
||||
itsdangerous==2.0.1
|
||||
@@ -14,8 +14,8 @@ gitpython==3.1.41
|
||||
google-api-python-client==2.116.0
|
||||
ibm_cloud_sdk_core==3.18.0
|
||||
ibm_vpc==0.20.0
|
||||
jinja2==3.1.3
|
||||
krkn-lib==2.1.2
|
||||
jinja2==3.1.4
|
||||
krkn-lib==2.1.3
|
||||
lxml==5.1.0
|
||||
kubernetes==26.1.0
|
||||
oauth2client==4.1.3
|
||||
@@ -28,11 +28,11 @@ pyfiglet==1.0.2
|
||||
pytest==8.0.0
|
||||
python-ipmi==0.5.4
|
||||
python-openstackclient==6.5.0
|
||||
requests==2.31.0
|
||||
requests==2.32.0
|
||||
service_identity==24.1.0
|
||||
PyYAML==6.0
|
||||
setuptools==65.5.1
|
||||
werkzeug==3.0.1
|
||||
werkzeug==3.0.3
|
||||
wheel==0.42.0
|
||||
zope.interface==5.4.0
@@ -25,6 +25,7 @@ import kraken.pvc.pvc_scenario as pvc_scenario
 import kraken.network_chaos.actions as network_chaos
 import kraken.arcaflow_plugin as arcaflow_plugin
 import kraken.prometheus as prometheus_plugin
+import kraken.service_hijacking.service_hijacking as service_hijacking_plugin
 import server as server
 from kraken import plugins
 from krkn_lib.k8s import KrknKubernetes
@@ -340,7 +341,7 @@ def main(cfg):
             # krkn_lib
             elif scenario_type == "pvc_scenarios":
                 logging.info("Running PVC scenario")
-                failed_post_scenarios, scenario_telemetries = pvc_scenario.run(scenarios_list, config, kubecli, telemetry_k8s)
+                failed_post_scenarios, scenario_telemetries = pvc_scenario.run(scenarios_list, config, wait_duration, kubecli, telemetry_k8s)
                 chaos_telemetry.scenarios.extend(scenario_telemetries)

             # Network scenarios
@@ -348,6 +349,10 @@ def main(cfg):
             elif scenario_type == "network_chaos":
                 logging.info("Running Network Chaos")
                 failed_post_scenarios, scenario_telemetries = network_chaos.run(scenarios_list, config, wait_duration, kubecli, telemetry_k8s)
+            elif scenario_type == "service_hijacking":
+                logging.info("Running Service Hijacking Chaos")
+                failed_post_scenarios, scenario_telemetries = service_hijacking_plugin.run(scenarios_list, wait_duration, kubecli, telemetry_k8s)
                 chaos_telemetry.scenarios.extend(scenario_telemetries)

         # Check for critical alerts when enabled
         post_critical_alerts = 0
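With the new `service_hijacking` branch in place, a config can list that scenario type alongside the existing ones. A minimal sketch of parsing such a config fragment (the `kraken.chaos_scenarios` key layout mirrors krkn's existing scenario lists; treat the exact keys as an assumption):

```python
import yaml  # pip install pyyaml

# Hypothetical krkn config fragment enabling the new scenario type;
# the chaos_scenarios layout follows krkn's existing convention.
cfg = yaml.safe_load("""
kraken:
  chaos_scenarios:
    - service_hijacking:
        - scenarios/kube/service_hijacking.yaml
""")

for entry in cfg["kraken"]["chaos_scenarios"]:
    for scenario_type, scenarios_list in entry.items():
        print(scenario_type, "->", scenarios_list)
```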
@@ -2,7 +2,7 @@ input_list:
 - cpu_count: 1
   cpu_load_percentage: 80
   cpu_method: all
-  duration: 1s
+  duration: 30
   kubeconfig: ''
   namespace: default
   # set the node selector as a key-value pair eg.
@@ -1,9 +1,9 @@
 version: v0.2.0
 input:
-  root: RootObject
+  root: SubRootObject
   objects:
-    RootObject:
-      id: input_item
+    SubRootObject:
+      id: SubRootObject
       properties:
         kubeconfig:
           display:
@@ -35,7 +35,7 @@ input:
           description: stop stress test after T seconds. One can also specify the units of time in
             seconds, minutes, hours, days or years with the suffix s, m, h, d or y
         type:
-          type_id: string
+          type_id: integer
         required: true
       cpu_count:
         display:
@@ -68,18 +68,18 @@ steps:
       kubeconfig: !expr $.input.kubeconfig
   stressng:
     plugin:
-      src: quay.io/arcalot/arcaflow-plugin-stressng:0.5.0
+      src: quay.io/arcalot/arcaflow-plugin-stressng:0.6.0
       deployment_type: image
     step: workload
     input:
       cleanup: "true"
-      StressNGParams:
-        timeout: !expr $.input.duration
-        stressors:
-          - stressor: cpu
-            cpu_count: !expr $.input.cpu_count
-            cpu_method: !expr $.input.cpu_method
-            cpu_load: !expr $.input.cpu_load_percentage
+      timeout: !expr $.input.duration
+      stressors:
+        - stressor: cpu
+          workers: !expr $.input.cpu_count
+          cpu-method: "all"
+          cpu-load: !expr $.input.cpu_load_percentage
     deploy:
       deployer_name: kubernetes
       connection: !expr $.steps.kubeconfig.outputs.success.connection
@@ -9,62 +9,10 @@ input:
     type:
       type_id: list
       items:
-        id: input_item
-        type_id: object
-        properties:
-          kubeconfig:
-            display:
-              description: The complete kubeconfig file as a string
-              name: Kubeconfig file contents
-            type:
-              type_id: string
-            required: true
-          namespace:
-            display:
-              description: The namespace where the container will be deployed
-              name: Namespace
-            type:
-              type_id: string
-            required: true
-          node_selector:
-            display:
-              description: kubernetes node name where the plugin must be deployed
-            type:
-              type_id: map
-              values:
-                type_id: string
-              keys:
-                type_id: string
-            required: true
-          duration:
-            display:
-              name: duration the scenario expressed in seconds
-              description: stop stress test after T seconds. One can also specify the units of time in
-                seconds, minutes, hours, days or years with the suffix s, m, h, d or y
-            type:
-              type_id: string
-            required: true
-          cpu_count:
-            display:
-              description: Number of CPU cores to be used (0 means all)
-              name: number of CPUs
-            type:
-              type_id: integer
-            required: true
-          cpu_method:
-            display:
-              description: CPU stress method
-              name: fine grained control of which cpu stressors to use (ackermann, cfloat etc.)
-            type:
-              type_id: string
-            required: true
-          cpu_load_percentage:
-            display:
-              description: load CPU by percentage
-              name: CPU load
-            type:
-              type_id: integer
-            required: true
+        id: SubRootObject
+        type_id: ref
+        namespace: $.steps.workload_loop.execute.inputs.items

 steps:
   workload_loop:
     kind: foreach
@@ -1,5 +1,5 @@
 input_list:
-- duration: 30s
+- duration: 30
   io_block_size: 1m
   io_workers: 1
   io_write_bytes: 10m
@@ -1,6 +1,6 @@
 version: v0.2.0
 input:
-  root: RootObject
+  root: SubRootObject
   objects:
     hostPath:
       id: HostPathVolumeSource
@@ -18,8 +18,8 @@ input:
         type:
           id: hostPath
           type_id: ref
-    RootObject:
-      id: input_item
+    SubRootObject:
+      id: SubRootObject
       properties:
         kubeconfig:
           display:
@@ -51,7 +51,7 @@ input:
           description: stop stress test after T seconds. One can also specify the units of time in
             seconds, minutes, hours, days or years with the suffix s, m, h, d or y
         type:
-          type_id: string
+          type_id: integer
         required: true
       io_workers:
         display:
@@ -102,19 +102,18 @@ steps:
       kubeconfig: !expr $.input.kubeconfig
   stressng:
     plugin:
-      src: quay.io/arcalot/arcaflow-plugin-stressng:0.5.0
+      src: quay.io/arcalot/arcaflow-plugin-stressng:0.6.0
       deployment_type: image
     step: workload
     input:
       cleanup: "true"
-      StressNGParams:
-        timeout: !expr $.input.duration
-        workdir: !expr $.input.target_pod_folder
-        stressors:
-          - stressor: hdd
-            hdd: !expr $.input.io_workers
-            hdd_bytes: !expr $.input.io_write_bytes
-            hdd_write_size: !expr $.input.io_block_size
+      timeout: !expr $.input.duration
+      workdir: !expr $.input.target_pod_folder
+      stressors:
+        - stressor: hdd
+          workers: !expr $.input.io_workers
+          hdd-bytes: !expr $.input.io_write_bytes
+          hdd-write-size: !expr $.input.io_block_size

     deploy:
       deployer_name: kubernetes
@@ -2,22 +2,6 @@ version: v0.2.0
 input:
   root: RootObject
   objects:
-    hostPath:
-      id: HostPathVolumeSource
-      properties:
-        path:
-          type:
-            type_id: string
-    Volume:
-      id: Volume
-      properties:
-        name:
-          type:
-            type_id: string
-        hostPath:
-          type:
-            id: hostPath
-            type_id: ref
     RootObject:
       id: RootObject
       properties:
@@ -25,80 +9,9 @@ input:
     type:
       type_id: list
       items:
-        id: input_item
-        type_id: object
-        properties:
-          kubeconfig:
-            display:
-              description: The complete kubeconfig file as a string
-              name: Kubeconfig file contents
-            type:
-              type_id: string
-            required: true
-          namespace:
-            display:
-              description: The namespace where the container will be deployed
-              name: Namespace
-            type:
-              type_id: string
-            required: true
-          node_selector:
-            display:
-              description: kubernetes node name where the plugin must be deployed
-            type:
-              type_id: map
-              values:
-                type_id: string
-              keys:
-                type_id: string
-            required: true
-          duration:
-            display:
-              name: duration the scenario expressed in seconds
-              description: stop stress test after T seconds. One can also specify the units of time in
-                seconds, minutes, hours, days or years with the suffix s, m, h, d or y
-            type:
-              type_id: string
-            required: true
-          io_workers:
-            display:
-              description: number of workers
-              name: start N workers continually writing, reading and removing temporary files
-            type:
-              type_id: integer
-            required: true
-          io_block_size:
-            display:
-              description: single write size
-              name: specify size of each write in bytes. Size can be from 1 byte to 4MB.
-            type:
-              type_id: string
-            required: true
-          io_write_bytes:
-            display:
-              description: Total number of bytes written
-              name: write N bytes for each hdd process, the default is 1 GB. One can specify the size
-                as % of free space on the file system or in units of Bytes, KBytes, MBytes and
-                GBytes using the suffix b, k, m or g
-            type:
-              type_id: string
-            required: true
-          target_pod_folder:
-            display:
-              description: Target Folder
-              name: Folder in the pod where the test will be executed and the test files will be written
-            type:
-              type_id: string
-            required: true
-          target_pod_volume:
-            display:
-              name: kubernetes volume definition
-              description: the volume that will be attached to the pod. In order to stress
-                the node storage only hosPath mode is currently supported
-            type:
-              type_id: ref
-              id: Volume
-            required: true
+        id: SubRootObject
+        type_id: ref
+        namespace: $.steps.workload_loop.execute.inputs.items
 steps:
   workload_loop:
     kind: foreach
@@ -1,5 +1,5 @@
 input_list:
-- duration: 30s
+- duration: 30
   vm_bytes: 10%
   vm_workers: 2
   # set the node selector as a key-value pair eg.
@@ -1,9 +1,9 @@
 version: v0.2.0
 input:
-  root: RootObject
+  root: SubRootObject
   objects:
-    RootObject:
-      id: input_item
+    SubRootObject:
+      id: SubRootObject
       properties:
         kubeconfig:
           display:
@@ -34,7 +34,7 @@ input:
           name: duration the scenario expressed in seconds
           description: stop stress test after T seconds. One can also specify the units of time in seconds, minutes, hours, days or years with the suffix s, m, h, d or y
         type:
-          type_id: string
+          type_id: integer
         required: true
       vm_workers:
         display:
@@ -60,17 +60,16 @@ steps:
       kubeconfig: !expr $.input.kubeconfig
   stressng:
     plugin:
-      src: quay.io/arcalot/arcaflow-plugin-stressng:0.5.0
+      src: quay.io/arcalot/arcaflow-plugin-stressng:0.6.0
       deployment_type: image
     step: workload
     input:
       cleanup: "true"
-      StressNGParams:
-        timeout: !expr $.input.duration
-        stressors:
-          - stressor: vm
-            vm: !expr $.input.vm_workers
-            vm_bytes: !expr $.input.vm_bytes
+      timeout: !expr $.input.duration
+      stressors:
+        - stressor: vm
+          workers: !expr $.input.vm_workers
+          vm-bytes: !expr $.input.vm_bytes
     deploy:
       deployer_name: kubernetes
       connection: !expr $.steps.kubeconfig.outputs.success.connection
@@ -9,54 +9,10 @@ input:
     type:
       type_id: list
       items:
-        id: input_item
-        type_id: object
-        properties:
-          kubeconfig:
-            display:
-              description: The complete kubeconfig file as a string
-              name: Kubeconfig file contents
-            type:
-              type_id: string
-            required: true
-          namespace:
-            display:
-              description: The namespace where the container will be deployed
-              name: Namespace
-            type:
-              type_id: string
-            required: true
-          node_selector:
-            display:
-              description: kubernetes node name where the plugin must be deployed
-            type:
-              type_id: map
-              values:
-                type_id: string
-              keys:
-                type_id: string
-            required: true
-          duration:
-            display:
-              name: duration the scenario expressed in seconds
-              description: stop stress test after T seconds. One can also specify the units of time in seconds, minutes, hours, days or years with the suffix s, m, h, d or y
-            type:
-              type_id: string
-            required: true
-          vm_workers:
-            display:
-              description: Number of VM stressors to be run (0 means 1 stressor per CPU)
-              name: Number of VM stressors
-            type:
-              type_id: integer
-            required: true
-          vm_bytes:
-            display:
-              description: N bytes per vm process, the default is 256MB. The size can be expressed in units of Bytes, KBytes, MBytes and GBytes using the suffix b, k, m or g.
-              name: Kubeconfig file contents
-            type:
-              type_id: string
-            required: true
+        id: SubRootObject
+        type_id: ref
+        namespace: $.steps.workload_loop.execute.inputs.items

 steps:
   workload_loop:
     kind: foreach
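Taken together, the three workflow diffs above show the same migration to arcaflow-plugin-stressng 0.6.0: the `StressNGParams` wrapper is dropped (timeout and stressors now sit directly under the step input), `duration` becomes a plain integer, the per-input schemas are deduplicated behind a `SubRootObject` ref into the foreach loop, and the per-stressor fields are renamed. Summarized as a mapping (derived from the hunks above, not from the plugin's docs):

```python
# Old 0.5.0 field -> new 0.6.0 field, per stressor, as seen in the diffs above.
STRESSNG_FIELD_RENAMES = {
    "cpu": {"cpu_count": "workers", "cpu_method": "cpu-method", "cpu_load": "cpu-load"},
    "hdd": {"hdd": "workers", "hdd_bytes": "hdd-bytes", "hdd_write_size": "hdd-write-size"},
    "vm": {"vm": "workers", "vm_bytes": "vm-bytes"},
}
```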
scenarios/kube/service_hijacking.yaml (new file, 56 lines)
@@ -0,0 +1,56 @@
+# refer to the documentation for further info: https://github.com/krkn-chaos/krkn/blob/main/docs/service_hijacking.md
+
+service_target_port: http-web-svc # The port of the service to be hijacked (can be named or numeric, based on the workload and service configuration).
+service_name: nginx-service # The name of the service to be hijacked.
+service_namespace: default # The namespace where the target service is located.
+image: quay.io/krkn-chaos/krkn-service-hijacking:v0.1.3 # Image of the krkn web service deployed to receive the traffic.
+chaos_duration: 30 # Total duration of the chaos scenario in seconds.
+plan:
+  - resource: "/list/index.php" # The resource or path to respond to in the scenario. For paths, both the path and query parameters are captured but ignored.
+                                # For resources, only query parameters are captured.
+    steps: # A time-based plan consisting of steps can be defined for each resource.
+      GET: # One or more HTTP methods can be specified for each step.
+           # Note: non-standard methods are supported
+           # for fully custom web services (e.g., using NONEXISTENT instead of POST).
+        - duration: 15 # Duration in seconds for this step before moving to the next one, if defined. Otherwise,
+                       # this step will continue until the chaos scenario ends.
+          status: 500 # HTTP status code to be returned in this step.
+          mime_type: "application/json" # MIME type of the response for this step.
+          payload: | # The response payload for this step.
+            {
+              "status":"internal server error"
+            }
+        - duration: 15
+          status: 201
+          mime_type: "application/json"
+          payload: |
+            {
+              "status":"resource created"
+            }
+      POST:
+        - duration: 15
+          status: 401
+          mime_type: "application/json"
+          payload: |
+            {
+              "status": "unauthorized"
+            }
+        - duration: 15
+          status: 404
+          mime_type: "text/plain"
+          payload: "not found"
+
+  - resource: "/patch"
+    steps:
+      PATCH:
+        - duration: 15
+          status: 201
+          mime_type: "text/plain"
+          payload: "resource patched"
+        - duration: 15
+          status: 400
+          mime_type: "text/plain"
+          payload: "bad request"
@@ -3,23 +3,24 @@ Enhancing Chaos Engineering with AI-assisted fault injection for better resilien

 ## Generate python package wheel file
 ```
-python3.9 generate_wheel_package.py sdist bdist_wheel
+$ python3.9 generate_wheel_package.py sdist bdist_wheel
+$ cp dist/aichaos-0.0.1-py3-none-any.whl docker/
 ```
 This creates a python package file aichaos-0.0.1-py3-none-any.whl in the dist folder.

 ## Build Image
 ```
-cd docker
-podman build -t aichaos:1.0 .
+$ cd docker
+$ podman build -t aichaos:1.0 .
 OR
-docker build -t aichaos:1.0 .
+$ docker build -t aichaos:1.0 .
 ```

 ## Run Chaos AI
 ```
-podman run -v aichaos-config.json:/config/aichaos-config.json --privileged=true --name aichaos -p 5001:5001 aichaos:1.0
+$ podman run -v aichaos-config.json:/config/aichaos-config.json --privileged=true --name aichaos -p 5001:5001 aichaos:1.0
 OR
-docker run -v aichaos-config.json:/config/aichaos-config.json --privileged -v /var/run/docker.sock:/var/run/docker.sock --name aichaos -p 5001:5001 aichaos:1.0
+$ docker run -v aichaos-config.json:/config/aichaos-config.json --privileged -v /var/run/docker.sock:/var/run/docker.sock --name aichaos -p 5001:5001 aichaos:1.0
 ```

 The output should look like:
@@ -1,6 +1,6 @@
 numpy
 pandas
 requests
-Flask==2.1.0
-Werkzeug==2.3.8
+Flask==2.2.5
+Werkzeug==3.0.3
 flasgger==0.9.5
@@ -7,8 +7,8 @@ This tool profiles an application and gathers telemetry data such as CPU, Memory

 ## Pre-requisites

 - Openshift Or Kubernetes Environment where the application is hosted
-- Access to the telemetry data via the exposed Prometheus endpoint
-- Python3
+- Access to the metrics via the exposed Prometheus endpoint
+- Python3.9

 ## Usage

@@ -22,14 +22,14 @@ This tool profiles an application and gathers telemetry data such as CPU, Memory
 $ pip3 install -r requirements.txt
 Edit configuration file:
 $ vi config/recommender_config.yaml
-$ python3.9 utils/chaos_recommender/chaos_recommender.py
+$ python3.9 utils/chaos_recommender/chaos_recommender.py -c utils/chaos_recommender/recommender_config.yaml
 ```

 2. Follow the prompts to provide the required information.

 ## Configuration
+To run the recommender with a config file specify the config file path with the `-c` argument.
-You can customize the default values by editing the `krkn/config/recommender_config.yaml` file. The configuration file contains the following options:
+You can customize the default values by editing the `recommender_config.yaml` file. The configuration file contains the following options:

 - `application`: Specify the application name.
 - `namespaces`: Specify the namespaces names (separated by coma or space). If you want to profile
@@ -115,6 +115,6 @@ You can customize the thresholds and options used for data analysis and identify

 ## Additional Files

-- `config/recommender_config.yaml`: The configuration file containing default values for application, namespace, labels, and kubeconfig.
+- `recommender_config.yaml`: The configuration file containing default values for application, namespace, labels, and kubeconfig.

 Happy Chaos!
utils/chaos_recommender/recommender_config.yaml (new file, 35 lines)
@@ -0,0 +1,35 @@
+application: openshift-etcd
+namespaces: openshift-etcd
+labels: app=openshift-etcd
+kubeconfig: ~/.kube/config.yaml
+prometheus_endpoint: <Prometheus_Endpoint>
+auth_token: <Auth_Token>
+scrape_duration: 10m
+chaos_library: "kraken"
+log_level: INFO
+json_output_file: False
+json_output_folder_path:
+
+# for output purpose only do not change if not needed
+chaos_tests:
+  GENERIC:
+    - pod_failure
+    - container_failure
+    - node_failure
+    - zone_outage
+    - time_skew
+    - namespace_failure
+    - power_outage
+  CPU:
+    - node_cpu_hog
+  NETWORK:
+    - application_outage
+    - node_network_chaos
+    - pod_network_chaos
+  MEM:
+    - node_memory_hog
+    - pvc_disk_fill
+
+threshold: .7
+cpu_threshold: .5
+mem_threshold: .5
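Read loosely, the trailing values act as cutoffs on the profiled utilization: metrics whose normalized usage clears `cpu_threshold` or `mem_threshold` pull in the CPU or MEM test lists, with `threshold` as the general cutoff. A toy illustration of that reading (assumed behavior for illustration only, not the recommender's actual code):

```python
# Toy mapping from profiled utilization to the test categories above;
# the gating logic is an assumption, not the recommender's implementation.
CHAOS_TESTS = {
    "CPU": ["node_cpu_hog"],
    "MEM": ["node_memory_hog", "pvc_disk_fill"],
    "NETWORK": ["application_outage", "node_network_chaos", "pod_network_chaos"],
}

def recommend(cpu_util, mem_util, net_util,
              cpu_threshold=0.5, mem_threshold=0.5, threshold=0.7):
    tests = []
    if cpu_util >= cpu_threshold:
        tests += CHAOS_TESTS["CPU"]
    if mem_util >= mem_threshold:
        tests += CHAOS_TESTS["MEM"]
    if net_util >= threshold:  # assumption: the general threshold gates network tests
        tests += CHAOS_TESTS["NETWORK"]
    return tests

print(recommend(cpu_util=0.8, mem_util=0.3, net_util=0.9))
```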