mirror of
https://github.com/krkn-chaos/krkn.git
synced 2026-03-16 16:40:42 +00:00
Compare commits
27 Commits
v5.0.0
...
custom_wei
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
b9d7c8ba12 | ||
|
|
e8075743ab | ||
|
|
ec5511b2db | ||
|
|
4e7dca9474 | ||
|
|
edf0f3d1c9 | ||
|
|
8c9bce6987 | ||
|
|
5608482f1b | ||
|
|
a14d3955a6 | ||
|
|
f655ec1a73 | ||
|
|
dfc350ac03 | ||
|
|
c474b810b2 | ||
|
|
072e8d0e87 | ||
|
|
aee61061ac | ||
|
|
544cac8bbb | ||
|
|
49b1affdb8 | ||
|
|
c1dd43fe87 | ||
|
|
8dad2a3996 | ||
|
|
cebc60f5a8 | ||
|
|
2065443622 | ||
|
|
b6ef7fa052 | ||
|
|
4f305e78aa | ||
|
|
b17e933134 | ||
|
|
beea484597 | ||
|
|
0222b0f161 | ||
|
|
f7e674d5ad | ||
|
|
7aea12ce6c | ||
|
|
625e1e90cf |
@@ -2,3 +2,4 @@
|
||||
omit =
|
||||
tests/*
|
||||
krkn/tests/**
|
||||
CI/tests_v2/*
|
||||
|
||||
20
.github/workflows/tests.yml
vendored
20
.github/workflows/tests.yml
vendored
@@ -43,11 +43,11 @@ jobs:
|
||||
|
||||
- name: Deploy test workloads
|
||||
run: |
|
||||
es_pod_name=$(kubectl get pods -l "app=elasticsearch-master" -o name)
|
||||
echo "POD_NAME: $es_pod_name"
|
||||
kubectl --namespace default port-forward $es_pod_name 9200 &
|
||||
prom_name=$(kubectl get pods -n monitoring -l "app.kubernetes.io/name=prometheus" -o name)
|
||||
kubectl --namespace monitoring port-forward $prom_name 9090 &
|
||||
# es_pod_name=$(kubectl get pods -l "app=elasticsearch-master" -o name)
|
||||
# echo "POD_NAME: $es_pod_name"
|
||||
# kubectl --namespace default port-forward $es_pod_name 9200 &
|
||||
# prom_name=$(kubectl get pods -n monitoring -l "app.kubernetes.io/name=prometheus" -o name)
|
||||
# kubectl --namespace monitoring port-forward $prom_name 9090 &
|
||||
|
||||
# Wait for Elasticsearch to be ready
|
||||
echo "Waiting for Elasticsearch to be ready..."
|
||||
@@ -85,7 +85,7 @@ jobs:
|
||||
yq -i '.elastic.enable_elastic=False' CI/config/common_test_config.yaml
|
||||
yq -i '.elastic.password="${{env.ELASTIC_PASSWORD}}"' CI/config/common_test_config.yaml
|
||||
yq -i '.performance_monitoring.prometheus_url="http://localhost:9090"' CI/config/common_test_config.yaml
|
||||
echo "test_app_outages" >> ./CI/tests/functional_tests
|
||||
echo "test_app_outages" > ./CI/tests/functional_tests
|
||||
echo "test_container" >> ./CI/tests/functional_tests
|
||||
echo "test_cpu_hog" >> ./CI/tests/functional_tests
|
||||
echo "test_customapp_pod" >> ./CI/tests/functional_tests
|
||||
@@ -94,13 +94,17 @@ jobs:
|
||||
echo "test_namespace" >> ./CI/tests/functional_tests
|
||||
echo "test_net_chaos" >> ./CI/tests/functional_tests
|
||||
echo "test_node" >> ./CI/tests/functional_tests
|
||||
echo "test_pod" >> ./CI/tests/functional_tests
|
||||
echo "test_pod_error" >> ./CI/tests/functional_tests
|
||||
echo "test_service_hijacking" >> ./CI/tests/functional_tests
|
||||
echo "test_pod_network_filter" >> ./CI/tests/functional_tests
|
||||
echo "test_pod_server" >> ./CI/tests/functional_tests
|
||||
echo "test_time" >> ./CI/tests/functional_tests
|
||||
echo "test_node_network_chaos" >> ./CI/tests/functional_tests
|
||||
echo "test_pod_network_chaos" >> ./CI/tests/functional_tests
|
||||
echo "test_cerberus_unhealthy" >> ./CI/tests/functional_tests
|
||||
echo "test_pod_error" >> ./CI/tests/functional_tests
|
||||
echo "test_pod" >> ./CI/tests/functional_tests
|
||||
# echo "test_pvc" >> ./CI/tests/functional_tests
|
||||
|
||||
|
||||
# Push on main only steps + all other functional to collect coverage
|
||||
# for the badge
|
||||
|
||||
53
.github/workflows/tests_v2.yml
vendored
Normal file
53
.github/workflows/tests_v2.yml
vendored
Normal file
@@ -0,0 +1,53 @@
|
||||
# Runs the pytest-based functional suite in CI/tests_v2 against a KinD cluster
# on every pull request and on pushes to main, then uploads the HTML/JUnit
# reports as a build artifact.
name: Tests v2 (pytest functional)
on:
  pull_request:
  push:
    branches:
      - main

# The job only reads the repository; do not grant the token anything more.
permissions:
  contents: read

jobs:
  tests-v2:
    name: Tests v2 (pytest functional)
    runs-on: ubuntu-latest
    steps:
      - name: Check out code
        uses: actions/checkout@v4

      - name: Create KinD cluster
        uses: redhat-chaos/actions/kind@main

      # Side-load the images the tests use so pods do not depend on registry
      # pulls from inside the cluster.
      - name: Pre-load test images into KinD
        run: |
          docker pull nginx:alpine
          kind load docker-image nginx:alpine
          docker pull quay.io/krkn-chaos/krkn:tools
          kind load docker-image quay.io/krkn-chaos/krkn:tools

      - name: Install Python
        uses: actions/setup-python@v5
        with:
          python-version: '3.11'
          architecture: 'x64'
          cache: 'pip'
          # Both requirement files are installed below, so both must feed the
          # cache key; otherwise a change to the tests_v2 file reuses a stale cache.
          cache-dependency-path: |
            requirements.txt
            CI/tests_v2/requirements.txt

      - name: Install dependencies
        run: |
          # Refresh the package index first: the runner image's index can be
          # stale, which makes apt-get install fail with 404s.
          sudo apt-get update
          sudo apt-get install -y build-essential python3-dev
          pip install --upgrade pip
          pip install -r requirements.txt
          pip install -r CI/tests_v2/requirements.txt

      - name: Run tests_v2
        run: |
          KRKN_TEST_COVERAGE=1 python -m pytest CI/tests_v2/ -v --timeout=300 --reruns=1 --reruns-delay=5 \
            --html=CI/tests_v2/report.html -n auto --junitxml=CI/tests_v2/results.xml

      # always(): publish the reports even when the test step failed.
      - name: Upload tests_v2 artifacts
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: tests-v2-results
          path: |
            CI/tests_v2/report.html
            CI/tests_v2/results.xml
            CI/tests_v2/assets/
          if-no-files-found: ignore
|
||||
5
.gitignore
vendored
5
.gitignore
vendored
@@ -17,6 +17,7 @@ __pycache__/*
|
||||
kube-burner*
|
||||
kube_burner*
|
||||
recommender_*.json
|
||||
resiliency*.json
|
||||
|
||||
# Project files
|
||||
.ropeproject
|
||||
@@ -64,6 +65,10 @@ CI/out/*
|
||||
CI/ci_results
|
||||
CI/legacy/*node.yaml
|
||||
CI/results.markdown
|
||||
# CI tests_v2 (pytest-html / pytest outputs)
|
||||
CI/tests_v2/results.xml
|
||||
CI/tests_v2/report.html
|
||||
CI/tests_v2/assets/
|
||||
|
||||
#env
|
||||
chaos/*
|
||||
|
||||
@@ -42,7 +42,7 @@ telemetry:
|
||||
prometheus_backup: True # enables/disables prometheus data collection
|
||||
full_prometheus_backup: False # if is set to False only the /prometheus/wal folder will be downloaded.
|
||||
backup_threads: 5 # number of telemetry download/upload threads
|
||||
archive_path: /tmp # local path where the archive files will be temporarly stored
|
||||
archive_path: /tmp # local path where the archive files will be temporarily stored
|
||||
max_retries: 0 # maximum number of upload retries (if 0 will retry forever)
|
||||
run_tag: '' # if set, this will be appended to the run folder in the bucket (useful to group the runs)
|
||||
archive_size: 10000 # the size of the prometheus data archive size in KB. The lower the size of archive is
|
||||
|
||||
79
CI/templates/mock_cerberus.yaml
Normal file
79
CI/templates/mock_cerberus.yaml
Normal file
@@ -0,0 +1,79 @@
|
||||
# ConfigMap carrying the source of a minimal mock Cerberus HTTP server that
# always reports a healthy cluster. It is mounted into the mock-cerberus Pod
# as /app/server.py.
apiVersion: v1
kind: ConfigMap
metadata:
  name: mock-cerberus-server
  namespace: default
data:
  server.py: |
    #!/usr/bin/env python3
    from http.server import HTTPServer, BaseHTTPRequestHandler
    import json

    class MockCerberusHandler(BaseHTTPRequestHandler):
        """Serves GET / (health signal) and GET /history* (failure history)."""

        def do_GET(self):
            if self.path == '/':
                # Return True to indicate cluster is healthy
                self.send_response(200)
                self.send_header('Content-type', 'text/plain')
                self.end_headers()
                self.wfile.write(b'True')
            elif self.path.startswith('/history'):
                # Return empty history (no failures)
                self.send_response(200)
                self.send_header('Content-type', 'application/json')
                self.end_headers()
                response = {
                    "history": {
                        "failures": []
                    }
                }
                self.wfile.write(json.dumps(response).encode())
            else:
                self.send_response(404)
                self.end_headers()

        def log_message(self, format, *args):
            # stdout is not a TTY inside the container, so it is block-buffered;
            # flush every line or the request log may never reach `kubectl logs`.
            print(f"[MockCerberus] {format % args}", flush=True)

    if __name__ == '__main__':
        server = HTTPServer(('0.0.0.0', 8080), MockCerberusHandler)
        print("[MockCerberus] Starting mock cerberus server on port 8080...", flush=True)
        server.serve_forever()
|
||||
---
|
||||
# Pod running the mock Cerberus server from the mock-cerberus-server ConfigMap.
apiVersion: v1
kind: Pod
metadata:
  name: mock-cerberus
  namespace: default
  labels:
    app: mock-cerberus
spec:
  containers:
    - name: mock-cerberus
      image: python:3.9-slim
      command: ["python3", "/app/server.py"]
      ports:
        - containerPort: 8080
          name: http
      # Without a probe the Pod is Ready as soon as the container starts,
      # which can be before the server is listening. A TCP check keeps
      # `kubectl wait --for=condition=ready` honest without adding HTTP
      # requests to the server's log.
      readinessProbe:
        tcpSocket:
          port: http
        initialDelaySeconds: 1
        periodSeconds: 2
      volumeMounts:
        - name: server-script
          mountPath: /app
  volumes:
    - name: server-script
      configMap:
        name: mock-cerberus-server
        # Octal file mode (rwxr-xr-x) for the mounted script.
        defaultMode: 0755
|
||||
---
|
||||
# In-cluster endpoint for the mock Cerberus server: forwards TCP 8080 to the
# Pod labelled app=mock-cerberus.
apiVersion: v1
kind: Service
metadata:
  namespace: default
  name: mock-cerberus
spec:
  type: ClusterIP
  selector:
    app: mock-cerberus
  ports:
    - port: 8080
      targetPort: 8080
      protocol: TCP
|
||||
85
CI/templates/mock_cerberus_unhealthy.yaml
Normal file
85
CI/templates/mock_cerberus_unhealthy.yaml
Normal file
@@ -0,0 +1,85 @@
|
||||
# ConfigMap carrying the source of a minimal mock Cerberus HTTP server that
# always reports an unhealthy cluster. It is mounted into the
# mock-cerberus-unhealthy Pod as /app/server.py.
apiVersion: v1
kind: ConfigMap
metadata:
  name: mock-cerberus-unhealthy-server
  namespace: default
data:
  server.py: |
    #!/usr/bin/env python3
    from http.server import HTTPServer, BaseHTTPRequestHandler
    import json

    class MockCerberusUnhealthyHandler(BaseHTTPRequestHandler):
        """Serves GET / (health signal) and GET /history* (failure history)."""

        def do_GET(self):
            if self.path == '/':
                # Return False to indicate cluster is unhealthy
                self.send_response(200)
                self.send_header('Content-type', 'text/plain')
                self.end_headers()
                self.wfile.write(b'False')
            elif self.path.startswith('/history'):
                # Return history with failures
                self.send_response(200)
                self.send_header('Content-type', 'application/json')
                self.end_headers()
                response = {
                    "history": {
                        "failures": [
                            {
                                "component": "node",
                                "name": "test-node",
                                "timestamp": "2024-01-01T00:00:00Z"
                            }
                        ]
                    }
                }
                self.wfile.write(json.dumps(response).encode())
            else:
                self.send_response(404)
                self.end_headers()

        def log_message(self, format, *args):
            # stdout is not a TTY inside the container, so it is block-buffered;
            # flush every line or the request log may never reach `kubectl logs`.
            print(f"[MockCerberusUnhealthy] {format % args}", flush=True)

    if __name__ == '__main__':
        server = HTTPServer(('0.0.0.0', 8080), MockCerberusUnhealthyHandler)
        print("[MockCerberusUnhealthy] Starting mock cerberus unhealthy server on port 8080...", flush=True)
        server.serve_forever()
|
||||
---
|
||||
# Pod running the unhealthy mock Cerberus server from the
# mock-cerberus-unhealthy-server ConfigMap.
apiVersion: v1
kind: Pod
metadata:
  name: mock-cerberus-unhealthy
  namespace: default
  labels:
    app: mock-cerberus-unhealthy
spec:
  containers:
    - name: mock-cerberus-unhealthy
      image: python:3.9-slim
      command: ["python3", "/app/server.py"]
      ports:
        - containerPort: 8080
          name: http
      # Without a probe the Pod is Ready as soon as the container starts,
      # which can be before the server is listening. A TCP check keeps
      # `kubectl wait --for=condition=ready` honest without adding HTTP
      # requests to the server's log.
      readinessProbe:
        tcpSocket:
          port: http
        initialDelaySeconds: 1
        periodSeconds: 2
      volumeMounts:
        - name: server-script
          mountPath: /app
  volumes:
    - name: server-script
      configMap:
        name: mock-cerberus-unhealthy-server
        # Octal file mode (rwxr-xr-x) for the mounted script.
        defaultMode: 0755
|
||||
---
|
||||
# In-cluster endpoint for the unhealthy mock Cerberus server: forwards TCP 8080
# to the Pod labelled app=mock-cerberus-unhealthy.
apiVersion: v1
kind: Service
metadata:
  namespace: default
  name: mock-cerberus-unhealthy
spec:
  type: ClusterIP
  selector:
    app: mock-cerberus-unhealthy
  ports:
    - port: 8080
      targetPort: 8080
      protocol: TCP
|
||||
79
CI/tests/test_cerberus_unhealthy.sh
Executable file
79
CI/tests/test_cerberus_unhealthy.sh
Executable file
@@ -0,0 +1,79 @@
|
||||
set -xeEo pipefail

source CI/tests/common.sh

trap error ERR
trap finish EXIT

# Functional test: run krkn with Cerberus integration enabled against a mock
# Cerberus server that always answers "False" (cluster unhealthy).
#
# Steps: deploy the mock, check it from inside the cluster, expose it to this
# host through a port-forward, generate a krkn config that points at it with
# exit_on_failure disabled, run krkn, dump the mock's logs, clean up.
#
# krkn exiting non-zero is tolerated and only reported (see the run step).
function functional_test_cerberus_unhealthy {
    # Local port on this host that is forwarded to the mock Cerberus service.
    local mock_cerberus_port=8092
    local port_forward_pid

    echo "========================================"
    echo "Starting Cerberus Unhealthy Test"
    echo "========================================"

    # Deploy mock cerberus unhealthy server
    echo "Deploying mock cerberus unhealthy server..."
    kubectl apply -f CI/templates/mock_cerberus_unhealthy.yaml

    # Wait for mock cerberus unhealthy pod to be ready
    echo "Waiting for mock cerberus unhealthy to be ready..."
    kubectl wait --for=condition=ready pod -l app=mock-cerberus-unhealthy --timeout=300s

    # Verify mock cerberus service is accessible
    echo "Verifying mock cerberus unhealthy service..."
    mock_cerberus_ip=$(kubectl get service mock-cerberus-unhealthy -o jsonpath='{.spec.clusterIP}')
    echo "Mock Cerberus Unhealthy IP: $mock_cerberus_ip"

    # Test cerberus endpoint from within the cluster (should return False).
    # Best effort: a failure here is reported but does not abort the test.
    kubectl run cerberus-unhealthy-test --image=curlimages/curl:latest --rm -i --restart=Never -- \
        curl -s http://mock-cerberus-unhealthy.default.svc.cluster.local:8080/ || echo "Cerberus unhealthy test curl completed"

    # run_kraken.py runs on this host, not in a pod, so it cannot use the
    # Service ClusterIP; expose the mock on localhost the same way the other
    # CI tests reach in-cluster services.
    echo "Setting up port-forward to mock cerberus unhealthy service..."
    kubectl port-forward service/mock-cerberus-unhealthy "${mock_cerberus_port}:8080" &
    port_forward_pid=$!
    sleep 3 # Give port-forward time to start

    # Configure scenario for pod disruption with cerberus enabled
    export scenario_type="pod_disruption_scenarios"
    export scenario_file="scenarios/kind/pod_etcd.yml"
    export post_config=""

    # Generate config with cerberus enabled
    envsubst < CI/config/common_test_config.yaml > CI/config/cerberus_unhealthy_test_config.yaml

    # Enable cerberus in the config but DON'T exit_on_failure (so the test can verify the behavior)
    yq -i '.cerberus.cerberus_enabled = true' CI/config/cerberus_unhealthy_test_config.yaml
    yq -i ".cerberus.cerberus_url = \"http://localhost:${mock_cerberus_port}\"" CI/config/cerberus_unhealthy_test_config.yaml
    yq -i '.kraken.exit_on_failure = false' CI/config/cerberus_unhealthy_test_config.yaml

    echo "========================================"
    echo "Cerberus Unhealthy Configuration:"
    yq '.cerberus' CI/config/cerberus_unhealthy_test_config.yaml
    echo "exit_on_failure:"
    yq '.kraken.exit_on_failure' CI/config/cerberus_unhealthy_test_config.yaml
    echo "========================================"

    # Run kraken with cerberus unhealthy (should detect unhealthy but not exit due to exit_on_failure=false)
    echo "Running kraken with cerberus unhealthy integration..."

    # A non-zero exit is tolerated and only reported: krkn may still exit 1
    # when cerberus reports the cluster as unhealthy.
    python3 -m coverage run -a run_kraken.py -c CI/config/cerberus_unhealthy_test_config.yaml || {
        exit_code=$?
        echo "Kraken exited with code: $exit_code"
        if [ $exit_code -eq 1 ]; then
            echo "WARNING: Kraken exited with 1, which may indicate cerberus detected unhealthy cluster"
        fi
    }

    # Show what the mock received so a reviewer can confirm cerberus was called
    echo "Checking mock cerberus unhealthy logs..."
    kubectl logs -l app=mock-cerberus-unhealthy --tail=50

    # Cleanup
    echo "Cleaning up mock cerberus unhealthy..."
    kill "$port_forward_pid" 2>/dev/null || true
    kubectl delete -f CI/templates/mock_cerberus_unhealthy.yaml || true

    echo "========================================"
    echo "Cerberus unhealthy functional test: Success"
    echo "========================================"
}

functional_test_cerberus_unhealthy
|
||||
165
CI/tests/test_node_network_chaos.sh
Executable file
165
CI/tests/test_node_network_chaos.sh
Executable file
@@ -0,0 +1,165 @@
|
||||
set -xeEo pipefail

source CI/tests/common.sh

trap error ERR
trap finish EXIT

# Prints the HTTP status code returned by URL $1, giving up after $2 seconds.
# curl's --write-out already prints "000" when the transfer fails, so the
# failure branch must not echo a second code (that would yield "000\n000").
function node_net_chaos_http_status {
    local url=$1
    local max_time=$2
    local code
    code=$(curl -s -o /dev/null -w "%{http_code}" --max-time "$max_time" "$url" || true)
    echo "${code:-000}"
}

# Stops the port-forward (if one was started) and removes the nginx workload.
function node_net_chaos_cleanup {
    if [ -n "${PORT_FORWARD_PID:-}" ]; then
        kill "$PORT_FORWARD_PID" 2>/dev/null || true
    fi
    kubectl delete deployment nginx-node-net-chaos --ignore-not-found=true
    kubectl delete service nginx-node-net-chaos-svc --ignore-not-found=true
}

# Functional test for the node network chaos scenario.
#
# Pins an nginx pod to a worker node, checks it answers through a port-forward,
# runs krkn with scenarios/kube/node-network-chaos.yml against that node while
# probing nginx, then requires nginx to answer 200 again once krkn is done.
# Only the baseline and recovery checks can fail the test; the probes made
# during chaos and krkn's exit status are reported but not enforced.
function functional_test_node_network_chaos {
    echo "Starting node network chaos functional test"

    # Get a worker node
    get_node
    export TARGET_NODE=$(echo $WORKER_NODE | awk '{print $1}')
    echo "Target node: $TARGET_NODE"

    # Deploy nginx workload on the target node
    echo "Deploying nginx workload on $TARGET_NODE..."
    kubectl create deployment nginx-node-net-chaos --image=nginx:latest

    # Add node selector to ensure pod runs on target node
    kubectl patch deployment nginx-node-net-chaos \
        -p "{\"spec\":{\"template\":{\"spec\":{\"nodeSelector\":{\"kubernetes.io/hostname\":\"${TARGET_NODE}\"}}}}}"

    # Expose service
    kubectl expose deployment nginx-node-net-chaos --port=80 --target-port=80 --name=nginx-node-net-chaos-svc

    # Wait for nginx to be ready. The patch above starts a new rollout, so
    # wait for it to finish and then take the newest pod; selecting by label
    # alone can pick up the pre-patch pod that is still terminating.
    echo "Waiting for nginx pod to be ready on $TARGET_NODE..."
    kubectl rollout status deployment/nginx-node-net-chaos --timeout=120s
    export POD_NAME=$(kubectl get pods -l app=nginx-node-net-chaos \
        --sort-by=.metadata.creationTimestamp -o jsonpath='{.items[-1:].metadata.name}')
    kubectl wait --for=condition=ready "pod/$POD_NAME" --timeout=120s

    # Verify pod is on correct node
    export POD_NODE=$(kubectl get pod $POD_NAME -o jsonpath='{.spec.nodeName}')
    echo "Pod $POD_NAME is running on node $POD_NODE"

    if [ "$POD_NODE" != "$TARGET_NODE" ]; then
        echo "ERROR: Pod is not on target node (expected $TARGET_NODE, got $POD_NODE)"
        kubectl get pods -l app=nginx-node-net-chaos -o wide
        node_net_chaos_cleanup || true
        exit 1
    fi

    # Setup port-forward to access nginx
    echo "Setting up port-forward to nginx service..."
    kubectl port-forward service/nginx-node-net-chaos-svc 8091:80 &
    PORT_FORWARD_PID=$!

    # Test baseline connectivity. Poll instead of sleeping a fixed time: the
    # port-forward needs a moment before it accepts connections.
    echo "Testing baseline connectivity..."
    response="000"
    for attempt in {1..10}; do
        response=$(node_net_chaos_http_status http://localhost:8091 5)
        if [ "$response" == "200" ]; then
            break
        fi
        sleep 1
    done
    if [ "$response" != "200" ]; then
        echo "ERROR: Nginx not responding correctly (got $response, expected 200)"
        kubectl get pods -l app=nginx-node-net-chaos
        kubectl describe pod $POD_NAME
        node_net_chaos_cleanup || true
        exit 1
    fi
    echo "Baseline test passed: nginx responding with 200"

    # Measure baseline latency
    echo "Measuring baseline latency..."
    baseline_start=$(date +%s%3N)
    curl -s http://localhost:8091 > /dev/null || true
    baseline_end=$(date +%s%3N)
    baseline_latency=$((baseline_end - baseline_start))
    echo "Baseline latency: ${baseline_latency}ms"

    # Configure node network chaos scenario
    echo "Configuring node network chaos scenario..."
    yq -i '.[0].config.target="'"$TARGET_NODE"'"' scenarios/kube/node-network-chaos.yml
    yq -i '.[0].config.namespace="default"' scenarios/kube/node-network-chaos.yml
    yq -i '.[0].config.test_duration=20' scenarios/kube/node-network-chaos.yml
    yq -i '.[0].config.latency="200ms"' scenarios/kube/node-network-chaos.yml
    yq -i '.[0].config.loss=15' scenarios/kube/node-network-chaos.yml
    yq -i '.[0].config.bandwidth="10mbit"' scenarios/kube/node-network-chaos.yml
    yq -i '.[0].config.ingress=true' scenarios/kube/node-network-chaos.yml
    yq -i '.[0].config.egress=true' scenarios/kube/node-network-chaos.yml
    yq -i '.[0].config.force=false' scenarios/kube/node-network-chaos.yml
    yq -i 'del(.[0].config.interfaces)' scenarios/kube/node-network-chaos.yml

    # Prepare krkn config
    export scenario_type="network_chaos_ng_scenarios"
    export scenario_file="scenarios/kube/node-network-chaos.yml"
    export post_config=""
    envsubst < CI/config/common_test_config.yaml > CI/config/node_network_chaos_config.yaml

    # Run krkn in background
    echo "Starting krkn with node network chaos scenario..."
    python3 -m coverage run -a run_kraken.py -c CI/config/node_network_chaos_config.yaml &
    KRKN_PID=$!
    echo "Krkn started with PID: $KRKN_PID"

    # Wait for chaos to start (give it time to inject chaos)
    echo "Waiting for chaos injection to begin..."
    sleep 10

    # Probe nginx during chaos. Increased latency or failed requests are both
    # acceptable outcomes, so the results are reported but never fail the test.
    echo "Testing network behavior during chaos..."
    chaos_test_count=0
    chaos_ok=0
    chaos_failed=0
    max_chaos_latency=0

    for i in {1..5}; do
        chaos_test_count=$((chaos_test_count + 1))
        chaos_start=$(date +%s%3N)
        response=$(node_net_chaos_http_status http://localhost:8091 10)
        chaos_end=$(date +%s%3N)
        chaos_latency=$((chaos_end - chaos_start))

        echo "Attempt $i: HTTP $response, latency: ${chaos_latency}ms"

        if [ "$response" == "200" ]; then
            chaos_ok=$((chaos_ok + 1))
        else
            chaos_failed=$((chaos_failed + 1))
        fi
        if [ "$chaos_latency" -gt "$max_chaos_latency" ]; then
            max_chaos_latency=$chaos_latency
        fi

        sleep 2
    done

    echo "Chaos test results: $chaos_ok/$chaos_test_count requests succeeded, $chaos_failed failed or timed out"
    echo "Latency on $TARGET_NODE: baseline ${baseline_latency}ms, worst during chaos ${max_chaos_latency}ms"

    # Wait for krkn to complete. Its exit status does not fail the test, but
    # it is reported instead of being silently discarded.
    echo "Waiting for krkn to complete..."
    krkn_exit_code=0
    wait $KRKN_PID || krkn_exit_code=$?
    if [ "$krkn_exit_code" -ne 0 ]; then
        echo "WARNING: krkn exited with code $krkn_exit_code"
    fi
    echo "Krkn completed"

    # Wait a bit for cleanup
    sleep 5

    # Verify recovery - nginx should respond normally again
    echo "Verifying service recovery..."
    recovery_attempts=0
    max_recovery_attempts=10

    while [ $recovery_attempts -lt $max_recovery_attempts ]; do
        recovery_attempts=$((recovery_attempts + 1))
        response=$(node_net_chaos_http_status http://localhost:8091 5)

        if [ "$response" == "200" ]; then
            echo "Recovery verified: nginx responding normally (attempt $recovery_attempts)"
            break
        fi

        echo "Recovery attempt $recovery_attempts/$max_recovery_attempts: got $response, retrying..."
        sleep 3
    done

    if [ "$response" != "200" ]; then
        echo "ERROR: Service did not recover after chaos (got $response)"
        kubectl get pods -l app=nginx-node-net-chaos
        kubectl describe pod $POD_NAME
        node_net_chaos_cleanup || true
        exit 1
    fi

    # Cleanup
    echo "Cleaning up test resources..."
    node_net_chaos_cleanup

    echo "Node network chaos test: Success"
}

functional_test_node_network_chaos
|
||||
@@ -7,14 +7,15 @@ trap finish EXIT
|
||||
|
||||
function functional_test_pod_crash {
|
||||
export scenario_type="pod_disruption_scenarios"
|
||||
export scenario_file="scenarios/kind/pod_etcd.yml"
|
||||
export scenario_file="scenarios/kind/pod_path_provisioner.yml"
|
||||
|
||||
export post_config=""
|
||||
envsubst < CI/config/common_test_config.yaml > CI/config/pod_config.yaml
|
||||
|
||||
python3 -m coverage run -a run_kraken.py -c CI/config/pod_config.yaml
|
||||
echo "Pod disruption scenario test: Success"
|
||||
date
|
||||
kubectl get pods -n kube-system -l component=etcd -o yaml
|
||||
kubectl get pods -n local-path-storage -l app=local-path-provisioner -o yaml
|
||||
}
|
||||
|
||||
functional_test_pod_crash
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
|
||||
|
||||
source CI/tests/common.sh
|
||||
|
||||
trap error ERR
|
||||
@@ -8,7 +9,9 @@ function functional_test_pod_error {
|
||||
export scenario_type="pod_disruption_scenarios"
|
||||
export scenario_file="scenarios/kind/pod_etcd.yml"
|
||||
export post_config=""
|
||||
# this test will check if krkn exits with an error when too many pods are targeted
|
||||
yq -i '.[0].config.kill=5' scenarios/kind/pod_etcd.yml
|
||||
yq -i '.[0].config.krkn_pod_recovery_time=1' scenarios/kind/pod_etcd.yml
|
||||
envsubst < CI/config/common_test_config.yaml > CI/config/pod_config.yaml
|
||||
cat CI/config/pod_config.yaml
|
||||
|
||||
|
||||
143
CI/tests/test_pod_network_chaos.sh
Executable file
143
CI/tests/test_pod_network_chaos.sh
Executable file
@@ -0,0 +1,143 @@
|
||||
set -xeEo pipefail

source CI/tests/common.sh

trap error ERR
trap finish EXIT

# Prints the HTTP status code returned by URL $1, giving up after $2 seconds.
# curl's --write-out already prints "000" when the transfer fails, so the
# failure branch must not echo a second code (that would yield "000\n000").
function pod_net_chaos_http_status {
    local url=$1
    local max_time=$2
    local code
    code=$(curl -s -o /dev/null -w "%{http_code}" --max-time "$max_time" "$url" || true)
    echo "${code:-000}"
}

# Stops the port-forward (if one was started) and removes the nginx workload.
function pod_net_chaos_cleanup {
    if [ -n "${PORT_FORWARD_PID:-}" ]; then
        kill "$PORT_FORWARD_PID" 2>/dev/null || true
    fi
    kubectl delete deployment nginx-pod-net-chaos --ignore-not-found=true
    kubectl delete service nginx-pod-net-chaos-svc --ignore-not-found=true
}

# Functional test for the pod network chaos scenario.
#
# Deploys an nginx pod, checks it answers through a port-forward, runs krkn
# with scenarios/kube/pod-network-chaos.yml against that pod while probing
# nginx, then requires nginx to answer 200 again once krkn is done.
# Only the baseline and recovery checks can fail the test; the probes made
# during chaos and krkn's exit status are reported but not enforced.
function functional_test_pod_network_chaos {
    echo "Starting pod network chaos functional test"

    # Deploy nginx workload
    echo "Deploying nginx workload..."
    kubectl create deployment nginx-pod-net-chaos --image=nginx:latest
    kubectl expose deployment nginx-pod-net-chaos --port=80 --target-port=80 --name=nginx-pod-net-chaos-svc

    # Wait for nginx to be ready. `kubectl wait -l ...` errors out when no pod
    # matches the selector yet, so let the rollout finish first.
    echo "Waiting for nginx pod to be ready..."
    kubectl rollout status deployment/nginx-pod-net-chaos --timeout=120s
    kubectl wait --for=condition=ready pod -l app=nginx-pod-net-chaos --timeout=120s

    # Get pod name
    export POD_NAME=$(kubectl get pods -l app=nginx-pod-net-chaos -o jsonpath='{.items[0].metadata.name}')
    echo "Target pod: $POD_NAME"

    # Setup port-forward to access nginx
    echo "Setting up port-forward to nginx service..."
    kubectl port-forward service/nginx-pod-net-chaos-svc 8090:80 &
    PORT_FORWARD_PID=$!

    # Test baseline connectivity. Poll instead of sleeping a fixed time: the
    # port-forward needs a moment before it accepts connections.
    echo "Testing baseline connectivity..."
    response="000"
    for attempt in {1..10}; do
        response=$(pod_net_chaos_http_status http://localhost:8090 5)
        if [ "$response" == "200" ]; then
            break
        fi
        sleep 1
    done
    if [ "$response" != "200" ]; then
        echo "ERROR: Nginx not responding correctly (got $response, expected 200)"
        kubectl get pods -l app=nginx-pod-net-chaos
        kubectl describe pod $POD_NAME
        pod_net_chaos_cleanup || true
        exit 1
    fi
    echo "Baseline test passed: nginx responding with 200"

    # Measure baseline latency
    echo "Measuring baseline latency..."
    baseline_start=$(date +%s%3N)
    curl -s http://localhost:8090 > /dev/null || true
    baseline_end=$(date +%s%3N)
    baseline_latency=$((baseline_end - baseline_start))
    echo "Baseline latency: ${baseline_latency}ms"

    # Configure pod network chaos scenario
    echo "Configuring pod network chaos scenario..."
    yq -i '.[0].config.target="'"$POD_NAME"'"' scenarios/kube/pod-network-chaos.yml
    yq -i '.[0].config.namespace="default"' scenarios/kube/pod-network-chaos.yml
    yq -i '.[0].config.test_duration=20' scenarios/kube/pod-network-chaos.yml
    yq -i '.[0].config.latency="200ms"' scenarios/kube/pod-network-chaos.yml
    yq -i '.[0].config.loss=15' scenarios/kube/pod-network-chaos.yml
    yq -i '.[0].config.bandwidth="10mbit"' scenarios/kube/pod-network-chaos.yml
    yq -i '.[0].config.ingress=true' scenarios/kube/pod-network-chaos.yml
    yq -i '.[0].config.egress=true' scenarios/kube/pod-network-chaos.yml
    yq -i 'del(.[0].config.interfaces)' scenarios/kube/pod-network-chaos.yml

    # Prepare krkn config
    export scenario_type="network_chaos_ng_scenarios"
    export scenario_file="scenarios/kube/pod-network-chaos.yml"
    export post_config=""
    envsubst < CI/config/common_test_config.yaml > CI/config/pod_network_chaos_config.yaml

    # Run krkn in background
    echo "Starting krkn with pod network chaos scenario..."
    python3 -m coverage run -a run_kraken.py -c CI/config/pod_network_chaos_config.yaml &
    KRKN_PID=$!
    echo "Krkn started with PID: $KRKN_PID"

    # Wait for chaos to start (give it time to inject chaos)
    echo "Waiting for chaos injection to begin..."
    sleep 10

    # Probe nginx during chaos. Increased latency or failed requests are both
    # acceptable outcomes, so the results are reported but never fail the test.
    echo "Testing network behavior during chaos..."
    chaos_test_count=0
    chaos_ok=0
    chaos_failed=0
    max_chaos_latency=0

    for i in {1..5}; do
        chaos_test_count=$((chaos_test_count + 1))
        chaos_start=$(date +%s%3N)
        response=$(pod_net_chaos_http_status http://localhost:8090 10)
        chaos_end=$(date +%s%3N)
        chaos_latency=$((chaos_end - chaos_start))

        echo "Attempt $i: HTTP $response, latency: ${chaos_latency}ms"

        if [ "$response" == "200" ]; then
            chaos_ok=$((chaos_ok + 1))
        else
            chaos_failed=$((chaos_failed + 1))
        fi
        if [ "$chaos_latency" -gt "$max_chaos_latency" ]; then
            max_chaos_latency=$chaos_latency
        fi

        sleep 2
    done

    echo "Chaos test results: $chaos_ok/$chaos_test_count requests succeeded, $chaos_failed failed or timed out"
    echo "Latency: baseline ${baseline_latency}ms, worst during chaos ${max_chaos_latency}ms"

    # Wait for krkn to complete. Its exit status does not fail the test, but
    # it is reported instead of being silently discarded.
    echo "Waiting for krkn to complete..."
    krkn_exit_code=0
    wait $KRKN_PID || krkn_exit_code=$?
    if [ "$krkn_exit_code" -ne 0 ]; then
        echo "WARNING: krkn exited with code $krkn_exit_code"
    fi
    echo "Krkn completed"

    # Wait a bit for cleanup
    sleep 5

    # Verify recovery - nginx should respond normally again
    echo "Verifying service recovery..."
    recovery_attempts=0
    max_recovery_attempts=10

    while [ $recovery_attempts -lt $max_recovery_attempts ]; do
        recovery_attempts=$((recovery_attempts + 1))
        response=$(pod_net_chaos_http_status http://localhost:8090 5)

        if [ "$response" == "200" ]; then
            echo "Recovery verified: nginx responding normally (attempt $recovery_attempts)"
            break
        fi

        echo "Recovery attempt $recovery_attempts/$max_recovery_attempts: got $response, retrying..."
        sleep 3
    done

    if [ "$response" != "200" ]; then
        echo "ERROR: Service did not recover after chaos (got $response)"
        kubectl get pods -l app=nginx-pod-net-chaos
        kubectl describe pod $POD_NAME
        pod_net_chaos_cleanup || true
        exit 1
    fi

    # Cleanup
    echo "Cleaning up test resources..."
    pod_net_chaos_cleanup

    echo "Pod network chaos test: Success"
}

functional_test_pod_network_chaos
|
||||
@@ -19,12 +19,12 @@ function functional_test_telemetry {
|
||||
yq -i '.telemetry.run_tag=env(RUN_TAG)' CI/config/common_test_config.yaml
|
||||
|
||||
export scenario_type="pod_disruption_scenarios"
|
||||
export scenario_file="scenarios/kind/pod_etcd.yml"
|
||||
export scenario_file="scenarios/kind/pod_path_provisioner.yml"
|
||||
|
||||
export post_config=""
|
||||
envsubst < CI/config/common_test_config.yaml > CI/config/telemetry.yaml
|
||||
retval=$(python3 -m coverage run -a run_kraken.py -c CI/config/telemetry.yaml)
|
||||
RUN_FOLDER=`cat CI/out/test_telemetry.out | grep amazonaws.com | sed -rn "s#.*https:\/\/.*\/files/(.*)#\1#p"`
|
||||
RUN_FOLDER=`cat CI/out/test_telemetry.out | grep amazonaws.com | sed -rn "s#.*https:\/\/.*\/files/(.*)#\1#p" | sed 's/\x1b\[[0-9;]*m//g'`
|
||||
$AWS_CLI s3 ls "s3://$AWS_BUCKET/$RUN_FOLDER/" | awk '{ print $4 }' > s3_remote_files
|
||||
echo "checking if telemetry files are uploaded on s3"
|
||||
cat s3_remote_files | grep critical-alerts-00.log || ( echo "FAILED: critical-alerts-00.log not uploaded" && exit 1 )
|
||||
|
||||
175
CI/tests_v2/CONTRIBUTING_TESTS.md
Normal file
175
CI/tests_v2/CONTRIBUTING_TESTS.md
Normal file
@@ -0,0 +1,175 @@
|
||||
# Adding a New Scenario Test (CI/tests_v2)
|
||||
|
||||
This guide explains how to add a new chaos scenario test to the v2 pytest framework. The layout is **folder-per-scenario**: each scenario has its own directory under `scenarios/<scenario_name>/` containing the test file, Kubernetes resources, and the Krkn scenario base YAML.
|
||||
|
||||
## Option 1: Scaffold script (recommended)
|
||||
|
||||
From the **repository root**:
|
||||
|
||||
```bash
|
||||
python CI/tests_v2/scaffold.py --scenario service_hijacking
|
||||
```
|
||||
|
||||
This creates:
|
||||
|
||||
- `CI/tests_v2/scenarios/service_hijacking/test_service_hijacking.py` — A test class extending `BaseScenarioTest` with a stub `test_happy_path` and `WORKLOAD_MANIFEST` pointing to the folder’s `resource.yaml`.
|
||||
- `CI/tests_v2/scenarios/service_hijacking/resource.yaml` — A placeholder Deployment (namespace is patched at deploy time).
|
||||
- `CI/tests_v2/scenarios/service_hijacking/scenario_base.yaml` — A placeholder Krkn scenario; edit this with the structure expected by your scenario type.
|
||||
|
||||
The script automatically registers the marker in `CI/tests_v2/pytest.ini`. For example, it adds:
|
||||
|
||||
```
|
||||
service_hijacking: marks a test as a service_hijacking scenario test
|
||||
```
|
||||
|
||||
**Next steps after scaffolding:**
|
||||
|
||||
1. Verify the marker was added to `pytest.ini` (the scaffold does this automatically).
|
||||
2. Edit `scenario_base.yaml` with the structure your Krkn scenario type expects (see `scenarios/application_outage/scenario_base.yaml` and `scenarios/pod_disruption/scenario_base.yaml` for examples). The top-level key should match `SCENARIO_NAME`.
|
||||
3. If your scenario uses a **list** structure (like pod_disruption) instead of a **dict** with a top-level key, set `NAMESPACE_KEY_PATH` (e.g. `[0, "config", "namespace_pattern"]`) and `NAMESPACE_IS_REGEX = True` if the namespace is a regex pattern.
|
||||
4. The generated `test_happy_path` already uses `self.run_scenario(self.tmp_path, ns)` and assertions. Add more test methods (e.g. negative tests with `@pytest.mark.no_workload`) as needed.
|
||||
5. Adjust `resource.yaml` if your scenario needs a different workload (e.g. specific image or labels).
|
||||
|
||||
If your Kraken scenario type string is not `<scenario>_scenarios`, pass it explicitly:
|
||||
|
||||
```bash
|
||||
python CI/tests_v2/scaffold.py --scenario node_disruption --scenario-type node_scenarios
|
||||
```
|
||||
|
||||
## Option 2: Manual setup
|
||||
|
||||
1. **Create the scenario folder**
|
||||
`CI/tests_v2/scenarios/<scenario_name>/`.
|
||||
|
||||
2. **Add resource.yaml**
|
||||
Kubernetes manifest(s) for the workload (Deployment or Pod). Use a distinct label (e.g. `app: <scenario>-target`). You can omit `metadata.namespace` or leave any value in it; the framework patches it at deploy time.
|
||||
|
||||
3. **Add scenario_base.yaml**
|
||||
The canonical Krkn scenario structure. Tests will load this, patch namespace (and any overrides), write to `tmp_path`, and pass to `build_config`. See existing scenarios for the format your scenario type expects.
|
||||
|
||||
4. **Add test_<scenario>.py**
|
||||
- Import `BaseScenarioTest` from `lib.base` and helpers from `lib.utils` (e.g. `assert_kraken_success`, `get_pods_list`, `scenario_dir` if needed).
|
||||
- Define a class extending `BaseScenarioTest` with:
|
||||
- `WORKLOAD_MANIFEST = "CI/tests_v2/scenarios/<scenario_name>/resource.yaml"`
|
||||
- `WORKLOAD_IS_PATH = True`
|
||||
- `LABEL_SELECTOR = "app=<label>"`
|
||||
- `SCENARIO_NAME = "<scenario_name>"`
|
||||
- `SCENARIO_TYPE = "<scenario_type>"` (e.g. `application_outages_scenarios`)
|
||||
- `NAMESPACE_KEY_PATH`: path to the namespace field (e.g. `["application_outage", "namespace"]` for dict-based, or `[0, "config", "namespace_pattern"]` for list-based)
|
||||
- `NAMESPACE_IS_REGEX = False` (or `True` for regex patterns like pod_disruption)
|
||||
- `OVERRIDES_KEY_PATH = ["<top-level key>"]` if the scenario supports overrides (e.g. duration, block).
|
||||
- Add `@pytest.mark.functional` and `@pytest.mark.<scenario>` on the class.
|
||||
- In at least one test, call `self.run_scenario(self.tmp_path, self.ns)` and assert with `assert_kraken_success`, `assert_pod_count_unchanged`, and `assert_all_pods_running_and_ready`. Use `self.k8s_core`, `self.tmp_path`, etc. (injected by the base class).
|
||||
|
||||
5. **Register the marker**
|
||||
In `CI/tests_v2/pytest.ini`, under `markers`:
|
||||
```
|
||||
<scenario>: marks a test as a <scenario> scenario test
|
||||
```
|
||||
|
||||
## Conventions
|
||||
|
||||
- **Folder-per-scenario**: One directory per scenario under `scenarios/`. All assets (test, resource.yaml, scenario_base.yaml, and any extra YAMLs) live there for easy tracking and onboarding.
|
||||
- **Ephemeral namespace**: Every test gets a unique `krkn-test-<uuid>` namespace. The base class deploys the workload into it before the test; no manual deploy is required.
|
||||
- **Negative tests**: For tests that don’t need a workload (e.g. invalid scenario, bad namespace), use `@pytest.mark.no_workload`. The test will still get a namespace but no workload will be deployed.
|
||||
- **Scenario type**: `SCENARIO_TYPE` must match the key in Kraken’s config (e.g. `application_outages_scenarios`, `pod_disruption_scenarios`). See `CI/tests_v2/config/common_test_config.yaml` and the scenario plugin’s `get_scenario_types()`.
|
||||
- **Assertions**: Use `assert_kraken_success(result, context=f"namespace={ns}", tmp_path=self.tmp_path)` so failures include stdout/stderr and optional log files.
|
||||
- **Timeouts**: Use constants from `lib.base` (`READINESS_TIMEOUT`, `POLICY_WAIT_TIMEOUT`, etc.) instead of magic numbers.
|
||||
|
||||
## Exit Code Handling
|
||||
|
||||
Kraken uses the following exit codes: **0** = success; **1** = scenario failure (e.g. post scenarios still failing); **2** = critical alerts fired; **3+** = health check / KubeVirt check failures; **-1** = infrastructure error (bad config, no kubeconfig).
|
||||
|
||||
- **Happy-path tests**: Use `assert_kraken_success(result, ...)`. By default only exit code 0 is accepted.
|
||||
- **Alert-aware tests**: If you enable `check_critical_alerts` and expect alerts, use `assert_kraken_success(result, allowed_codes=(0, 2), ...)` so exit code 2 is treated as acceptable.
|
||||
- **Expected-failure tests**: Use `assert_kraken_failure(result, context=..., tmp_path=self.tmp_path)` for negative tests (invalid scenario, bad namespace, etc.). This gives the same diagnostic quality (log dump, tmp_path hint) as success assertions. Prefer this over a bare `assert result.returncode != 0`.
|
||||
|
||||
## Running your new tests
|
||||
|
||||
```bash
|
||||
pytest CI/tests_v2/ -v -m <scenario>
|
||||
```
|
||||
|
||||
For debugging with logs and keeping failed namespaces:
|
||||
|
||||
```bash
|
||||
pytest CI/tests_v2/ -v -m <scenario> --log-cli-level=DEBUG --keep-ns-on-fail
|
||||
```
|
||||
|
||||
---
|
||||
|
||||
## Naming Conventions
|
||||
|
||||
Follow these conventions so the framework stays consistent as new scenarios are added.
|
||||
|
||||
### Quick Reference
|
||||
|
||||
| Element | Pattern | Example |
|
||||
|---|---|---|
|
||||
| Scenario folder | `scenarios/<snake_case>/` | `scenarios/node_disruption/` |
|
||||
| Test file | `test_<scenario>.py` | `test_node_disruption.py` |
|
||||
| Test class | `Test<CamelCase>(BaseScenarioTest)` | `TestNodeDisruption` |
|
||||
| Pytest marker | `@pytest.mark.<scenario>` (matches folder) | `@pytest.mark.node_disruption` |
|
||||
| Scenario YAML | `scenario_base.yaml` | — |
|
||||
| Workload YAML | `resource.yaml` | — |
|
||||
| Extra YAMLs | `<descriptive_name>.yaml` | `nginx_http.yaml` |
|
||||
| Lib modules | `lib/<concern>.py` | `lib/deploy.py` |
|
||||
| Public fixtures | `<verb>_<noun>` or `<noun>` | `run_kraken`, `test_namespace` |
|
||||
| Private/autouse fixtures | `_<descriptive>` | `_cleanup_stale_namespaces` |
|
||||
| Assertion helpers | `assert_<condition>` | `assert_pod_count_unchanged` |
|
||||
| Query helpers | `get_<resource>` or `find_<resource>_by_<criteria>` | `get_pods_list`, `find_network_policy_by_prefix` |
|
||||
| Env var overrides | `KRKN_TEST_<NAME>` | `KRKN_TEST_READINESS_TIMEOUT` |
|
||||
|
||||
### Folders
|
||||
|
||||
- One folder per scenario under `scenarios/`. The folder name is `snake_case` and must match the `SCENARIO_NAME` class attribute in the test.
|
||||
- Shared framework code lives in `lib/`. Each module covers a single concern (`k8s`, `namespace`, `deploy`, `kraken`, `utils`, `base`, `preflight`).
|
||||
- Do **not** add scenario-specific code to `lib/`; keep it in the scenario folder as module-level helpers.
|
||||
|
||||
### Files
|
||||
|
||||
- Test files: `test_<scenario>.py`. This is required for pytest discovery (`test_*.py`).
|
||||
- Workload manifests: always `resource.yaml`. If a scenario needs additional K8s resources (e.g. a Service for traffic testing), use a descriptive name like `nginx_http.yaml`.
|
||||
- Scenario config: always `scenario_base.yaml`. This is the template that `load_and_patch_scenario` loads and patches.
|
||||
|
||||
### Classes
|
||||
|
||||
- One test class per file: `Test<CamelCase>` extending `BaseScenarioTest`.
|
||||
- The CamelCase name must be the PascalCase equivalent of the folder name (e.g. `pod_disruption` -> `TestPodDisruption`).
|
||||
|
||||
### Test Methods
|
||||
|
||||
- Prefix: `test_` (pytest requirement).
|
||||
- Use descriptive names that convey **what is being verified**, not implementation details.
|
||||
- Good: `test_pod_crash_and_recovery`, `test_traffic_blocked_during_outage`, `test_invalid_scenario_fails`.
|
||||
- Avoid: `test_run_1`, `test_scenario`, `test_it_works`.
|
||||
|
||||
### Fixtures
|
||||
|
||||
- **Public fixtures** (intended for use in tests): use `<verb>_<noun>` or plain `<noun>`. Examples: `run_kraken`, `deploy_workload`, `test_namespace`, `kubectl`.
|
||||
- **Private/autouse fixtures** (framework internals): prefix with `_`. Examples: `_kube_config_loaded`, `_preflight_checks`, `_inject_common_fixtures`.
|
||||
- K8s client fixtures use the `k8s_` prefix: `k8s_core`, `k8s_apps`, `k8s_networking`, `k8s_client`.
|
||||
|
||||
### Helpers and Utilities
|
||||
|
||||
- **Assertions**: `assert_<what_is_expected>`. Always raise `AssertionError` with a message that includes the namespace.
|
||||
- **K8s queries**: `get_<resource>_list` for direct API calls, `find_<resource>_by_<criteria>` for filtered lookups.
|
||||
- **Private helpers**: prefix with `_` for module-internal functions (e.g. `_pods`, `_policies`, `_get_nested`).
|
||||
|
||||
### Constants and Environment Variables
|
||||
|
||||
- Timeout constants: `UPPER_CASE` in `lib/base.py`. Each is overridable via an env var prefixed `KRKN_TEST_`.
|
||||
- Feature flags: `KRKN_TEST_DRY_RUN`, `KRKN_TEST_COVERAGE`. Always use the `KRKN_TEST_` prefix so all tunables are discoverable with `grep KRKN_TEST_`.
|
||||
|
||||
### Markers
|
||||
|
||||
- Every test class gets `@pytest.mark.functional` (framework-wide) and `@pytest.mark.<scenario>` (scenario-specific).
|
||||
- The scenario marker name matches the folder name exactly.
|
||||
- Behavioral modifiers use plain descriptive names: `no_workload`, `order`.
|
||||
- Register all custom markers in `pytest.ini` to avoid warnings.
|
||||
|
||||
## Adding Dependencies
|
||||
|
||||
- **Runtime (Kraken needs it)**: Add to the **root** `requirements.txt`. Pin a version (e.g. `package==1.2.3` or `package>=1.2,<2`).
|
||||
- **Test-only (only CI/tests_v2 needs it)**: Add to **`CI/tests_v2/requirements.txt`**. Pin a version there as well.
|
||||
- After changing either file, run `make -f CI/tests_v2/Makefile setup` from the repo root (or `make setup` from `CI/tests_v2`) to verify both files install cleanly together.
|
||||
97
CI/tests_v2/Makefile
Normal file
97
CI/tests_v2/Makefile
Normal file
@@ -0,0 +1,97 @@
|
||||
# CI/tests_v2 functional tests - single entry point.
|
||||
# Run from repo root: make -f CI/tests_v2/Makefile <target>
|
||||
# Or from CI/tests_v2: make <target> (REPO_ROOT is resolved automatically).
|
||||
|
||||
# Resolve repo root: go to Makefile dir then up two levels (CI/tests_v2 -> repo root)
|
||||
REPO_ROOT := $(shell cd "$(dir $(firstword $(MAKEFILE_LIST)))" && cd ../.. && pwd)
|
||||
VENV := $(REPO_ROOT)/venv
|
||||
PYTHON := $(VENV)/bin/python
|
||||
PIP := $(VENV)/bin/pip
|
||||
CLUSTER_NAME ?= ci-krkn
|
||||
TESTS_DIR := $(REPO_ROOT)/CI/tests_v2
|
||||
|
||||
.PHONY: setup preflight test test-fast test-debug test-scenario test-dry-run clean help
|
||||
|
||||
help:
|
||||
@echo "CI/tests_v2 functional tests - usage: make [target]"
|
||||
@echo ""
|
||||
@echo "Targets:"
|
||||
@echo " setup Create venv (if missing), install Python deps, create KinD cluster (kind-config-dev.yml)."
|
||||
@echo " Run once before first test. Override cluster config: KIND_CONFIG=path make setup"
|
||||
@echo ""
|
||||
@echo " preflight Check Python 3.9+, kind, kubectl, Docker, cluster reachability, test deps."
|
||||
@echo " Invoked automatically by test targets; run standalone to validate environment."
|
||||
@echo ""
|
||||
@echo " test Full run: retries (2), timeout 300s, HTML report, JUnit XML, coverage."
|
||||
@echo " Use for CI or final verification. Output: report.html, results.xml"
|
||||
@echo ""
|
||||
@echo " test-fast Quick run: no retries, 120s timeout, no report. For fast local iteration."
|
||||
@echo ""
|
||||
@echo " test-debug Debug run: verbose (-s), keep failed namespaces (--keep-ns-on-fail), DEBUG logging."
|
||||
@echo " Use when investigating failures; inspect kept namespaces with kubectl."
|
||||
@echo ""
|
||||
@echo " test-scenario Run only one scenario. Requires SCENARIO=<marker>."
|
||||
@echo " Example: make test-scenario SCENARIO=pod_disruption"
|
||||
@echo ""
|
||||
@echo " test-dry-run Validate scenario plumbing only (no Kraken execution). Sets KRKN_TEST_DRY_RUN=1."
|
||||
@echo ""
|
||||
@echo " clean Delete KinD cluster $(CLUSTER_NAME) and remove report.html, results.xml."
|
||||
@echo ""
|
||||
@echo " help Show this help."
|
||||
@echo ""
|
||||
@echo "Run from repo root: make -f CI/tests_v2/Makefile <target>"
|
||||
@echo "Or from CI/tests_v2: make <target>"
|
||||
|
||||
setup: $(VENV)/.installed
|
||||
@echo "Running cluster setup..."
|
||||
$(MAKE) -f $(TESTS_DIR)/Makefile preflight
|
||||
cd $(REPO_ROOT) && ./CI/tests_v2/setup_env.sh
|
||||
@echo "Setup complete. Run 'make test' or 'make -f CI/tests_v2/Makefile test' from repo root."
|
||||
|
||||
$(VENV)/.installed: $(REPO_ROOT)/requirements.txt $(TESTS_DIR)/requirements.txt
|
||||
@if [ ! -d "$(VENV)" ]; then python3 -m venv $(VENV); echo "Created venv at $(VENV)"; fi
|
||||
$(PYTHON) -m pip install -q --upgrade pip
|
||||
# Root = Kraken runtime; tests_v2 = test-only plugins; both required for functional tests.
|
||||
$(PIP) install -q -r $(REPO_ROOT)/requirements.txt
|
||||
$(PIP) install -q -r $(TESTS_DIR)/requirements.txt
|
||||
@touch $(VENV)/.installed
|
||||
@echo "Python deps installed."
|
||||
|
||||
preflight:
|
||||
@echo "Preflight: checking Python, tools, and cluster..."
|
||||
@command -v python3 >/dev/null 2>&1 || { echo "Error: python3 not found."; exit 1; }
|
||||
@python3 -c "import sys; exit(0 if sys.version_info >= (3, 9) else 1)" || { echo "Error: Python 3.9+ required."; exit 1; }
|
||||
@command -v kind >/dev/null 2>&1 || { echo "Error: kind not installed."; exit 1; }
|
||||
@command -v kubectl >/dev/null 2>&1 || { echo "Error: kubectl not installed."; exit 1; }
|
||||
@docker info >/dev/null 2>&1 || { echo "Error: Docker not running (required for KinD)."; exit 1; }
|
||||
@if kind get clusters 2>/dev/null | grep -qx "$(CLUSTER_NAME)"; then \
|
||||
kubectl cluster-info >/dev/null 2>&1 || { echo "Error: Cluster $(CLUSTER_NAME) exists but cluster-info failed."; exit 1; }; \
|
||||
else \
|
||||
echo "Note: Cluster $(CLUSTER_NAME) not found. Run 'make setup' to create it."; \
|
||||
fi
|
||||
@$(PYTHON) -c "import pytest_rerunfailures, pytest_html, pytest_timeout, pytest_order" 2>/dev/null || \
|
||||
{ echo "Error: Install test deps with 'make setup' or pip install -r CI/tests_v2/requirements.txt"; exit 1; }
|
||||
@echo "Preflight OK."
|
||||
|
||||
test: preflight
|
||||
cd $(REPO_ROOT) && KRKN_TEST_COVERAGE=1 $(PYTHON) -m pytest $(TESTS_DIR)/ -v --timeout=300 --reruns=2 --reruns-delay=10 \
|
||||
--html=$(TESTS_DIR)/report.html -n auto --junitxml=$(TESTS_DIR)/results.xml
|
||||
|
||||
test-fast: preflight
|
||||
cd $(REPO_ROOT) && $(PYTHON) -m pytest $(TESTS_DIR)/ -v -p no:rerunfailures -n auto --timeout=120
|
||||
|
||||
test-debug: preflight
|
||||
cd $(REPO_ROOT) && $(PYTHON) -m pytest $(TESTS_DIR)/ -v -s -p no:rerunfailures --timeout=300 \
|
||||
--keep-ns-on-fail --log-cli-level=DEBUG
|
||||
|
||||
test-scenario: preflight
|
||||
@if [ -z "$(SCENARIO)" ]; then echo "Error: set SCENARIO=pod_disruption (or application_outage, etc.)"; exit 1; fi
|
||||
cd $(REPO_ROOT) && $(PYTHON) -m pytest $(TESTS_DIR)/ -v -m "$(SCENARIO)" --timeout=300 --reruns=2 --reruns-delay=10
|
||||
|
||||
test-dry-run: preflight
|
||||
cd $(REPO_ROOT) && KRKN_TEST_DRY_RUN=1 $(PYTHON) -m pytest $(TESTS_DIR)/ -v
|
||||
|
||||
clean:
|
||||
@kind delete cluster --name $(CLUSTER_NAME) 2>/dev/null || true
|
||||
@rm -f $(TESTS_DIR)/report.html $(TESTS_DIR)/results.xml
|
||||
@echo "Cleaned cluster and report artifacts."
|
||||
198
CI/tests_v2/README.md
Normal file
198
CI/tests_v2/README.md
Normal file
@@ -0,0 +1,198 @@
|
||||
# Pytest Functional Tests (tests_v2)
|
||||
|
||||
This directory contains a pytest-based functional test framework that runs **alongside** the existing bash tests in `CI/tests/`. It covers the **pod disruption** and **application outage** scenarios with proper assertions, retries, and reporting.
|
||||
|
||||
Each test runs in its **own ephemeral Kubernetes namespace** (`krkn-test-<uuid>`). Before the test, the framework creates the namespace, deploys the target workload, and waits for pods to be ready. After the test, the namespace is deleted (cascading all resources). **You do not need to deploy any workloads manually.**
|
||||
|
||||
## Prerequisites
|
||||
|
||||
Without a cluster, tests that need one will **skip** with a clear message (e.g. *"Could not load kube config"*). No manual workload deployment is required; workloads are deployed automatically into ephemeral namespaces per test.
|
||||
|
||||
- **KinD cluster** (or any Kubernetes cluster) running with `kubectl` configured (e.g. `KUBECONFIG` or default `~/.kube/config`).
|
||||
- **Python 3.9+** and main repo deps: `pip install -r requirements.txt`.
|
||||
|
||||
### Supported clusters
|
||||
|
||||
- **KinD** (recommended): Use `make -f CI/tests_v2/Makefile setup` from the repo root. Fastest for local dev; uses a 2-node dev config by default. Override with `KIND_CONFIG=/path/to/kind-config.yml` for a larger cluster.
|
||||
- **Minikube**: Should work; ensure `kubectl` context is set. Not tested in CI.
|
||||
- **Remote/cloud cluster**: Tests create and delete namespaces; use with caution. Use `--require-kind` to avoid accidentally running against production (tests will skip unless context is kind/minikube).
|
||||
|
||||
### Setting up the cluster
|
||||
|
||||
**Option A: Use the setup script (recommended)**
|
||||
|
||||
From the repository root, with `kind` and `kubectl` installed:
|
||||
|
||||
```bash
|
||||
# Create KinD cluster (defaults to CI/tests_v2/kind-config-dev.yml; override with KIND_CONFIG=...)
|
||||
./CI/tests_v2/setup_env.sh
|
||||
```
|
||||
|
||||
Then in the same shell (or after `export KUBECONFIG=~/.kube/config` in another terminal), create and activate a venv and install the Python deps:
|
||||
|
||||
```bash
|
||||
python3 -m venv venv
|
||||
source venv/bin/activate # or: source venv/Scripts/activate on Windows
|
||||
pip install -r requirements.txt
|
||||
pip install -r CI/tests_v2/requirements.txt
|
||||
```
|
||||
|
||||
**Option B: Manual setup**
|
||||
|
||||
1. Install [kind](https://kind.sigs.k8s.io/docs/user/quick-start/) and [kubectl](https://kubernetes.io/docs/tasks/tools/).
|
||||
2. Create a cluster (from repo root):
|
||||
```bash
|
||||
kind create cluster --name kind --config kind-config.yml
|
||||
```
|
||||
3. Wait for the cluster:
|
||||
```bash
|
||||
kubectl wait --for=condition=Ready nodes --all --timeout=120s
|
||||
```
|
||||
4. Create a virtualenv, activate it, and install dependencies (as in Option A).
|
||||
5. Run tests from repo root: `pytest CI/tests_v2/ -v ...`
|
||||
|
||||
## Install test dependencies
|
||||
|
||||
From the repository root:
|
||||
|
||||
```bash
|
||||
pip install -r CI/tests_v2/requirements.txt
|
||||
```
|
||||
|
||||
This adds `pytest-rerunfailures`, `pytest-html`, `pytest-timeout`, `pytest-order`, and `pytest-xdist` (pytest and coverage come from the main `requirements.txt`).
|
||||
|
||||
## Dependency Management
|
||||
|
||||
Dependencies are split into two files:
|
||||
|
||||
- **Root `requirements.txt`** — Kraken runtime (cloud SDKs, Kubernetes client, krkn-lib, pytest, coverage, etc.). Required to run Kraken.
|
||||
- **`CI/tests_v2/requirements.txt`** — Test-only pytest plugins (rerunfailures, html, timeout, order, xdist). Not needed by Kraken itself.
|
||||
|
||||
**Rule of thumb:** If Kraken needs it at runtime, add to root. If only the functional tests need it, add to `CI/tests_v2/requirements.txt`.
|
||||
|
||||
Running `make -f CI/tests_v2/Makefile setup` (or `make setup` from `CI/tests_v2`) creates the venv and installs **both** files automatically; you do not need to install them separately. The Makefile re-installs when either file changes (via the `.installed` sentinel).
|
||||
|
||||
## Run tests
|
||||
|
||||
All commands below are from the **repository root**.
|
||||
|
||||
### Basic run (with retries and HTML report)
|
||||
|
||||
```bash
|
||||
pytest CI/tests_v2/ -v --timeout=300 --reruns=2 --reruns-delay=10 --html=CI/tests_v2/report.html --junitxml=CI/tests_v2/results.xml
|
||||
```
|
||||
|
||||
- Failed tests are **retried up to 2 times** with a 10s delay (configurable in `CI/tests_v2/pytest.ini`).
|
||||
- Each test has a **5-minute timeout**.
|
||||
- Open `CI/tests_v2/report.html` in a browser for a detailed report.
|
||||
|
||||
### Run in parallel (faster suite)
|
||||
|
||||
```bash
|
||||
pytest CI/tests_v2/ -v -n 4 --timeout=300
|
||||
```
|
||||
|
||||
Ephemeral namespaces make tests parallel-safe; use `-n` with the number of workers (e.g. 4).
|
||||
|
||||
### Run without retries (for debugging)
|
||||
|
||||
```bash
|
||||
pytest CI/tests_v2/ -v -p no:rerunfailures
|
||||
```
|
||||
|
||||
### Run with coverage
|
||||
|
||||
```bash
|
||||
python -m coverage run -m pytest CI/tests_v2/ -v
|
||||
python -m coverage report
|
||||
```
|
||||
|
||||
To append to existing coverage from unit tests, ensure coverage was started with `coverage run -a` for earlier runs, or run the full test suite in one go.
|
||||
|
||||
### Run only pod disruption tests
|
||||
|
||||
```bash
|
||||
pytest CI/tests_v2/ -v -m pod_disruption
|
||||
```
|
||||
|
||||
### Run only application outage tests
|
||||
|
||||
```bash
|
||||
pytest CI/tests_v2/ -v -m application_outage
|
||||
```
|
||||
|
||||
### Run with verbose output and no capture
|
||||
|
||||
```bash
|
||||
pytest CI/tests_v2/ -v -s
|
||||
```
|
||||
|
||||
### Keep failed test namespaces for debugging
|
||||
|
||||
When a test fails, its ephemeral namespace is normally deleted. To **keep** the namespace so you can inspect pods, logs, and network policies:
|
||||
|
||||
```bash
|
||||
pytest CI/tests_v2/ -v --keep-ns-on-fail
|
||||
```
|
||||
|
||||
On failure, the namespace name is printed (e.g. `[keep-ns-on-fail] Keeping namespace krkn-test-a1b2c3d4 for debugging`). Use `kubectl get pods -n krkn-test-a1b2c3d4` (and similar) to debug, then delete the namespace manually when done.
|
||||
|
||||
### Logging and cluster options
|
||||
|
||||
- **Structured logging**: Use `--log-cli-level=DEBUG` to see namespace creation, workload deploy, and readiness in the console. Use `--log-file=test.log` to capture logs to a file.
|
||||
- **Require dev cluster**: To avoid running against the wrong cluster, use `--require-kind`. Tests will skip unless the current kube context cluster name contains "kind" or "minikube".
|
||||
- **Stale namespace cleanup**: At session start, namespaces matching `krkn-test-*` that are older than 30 minutes are deleted (e.g. from a previous crashed run).
|
||||
- **Timeout overrides**: Set env vars to tune timeouts (e.g. in CI): `KRKN_TEST_READINESS_TIMEOUT`, `KRKN_TEST_DEPLOY_TIMEOUT`, `KRKN_TEST_NS_CLEANUP_TIMEOUT`, `KRKN_TEST_POLICY_WAIT_TIMEOUT`, `KRKN_TEST_KRAKEN_PROC_WAIT_TIMEOUT`, `KRKN_TEST_TIMEOUT_BUDGET`.
|
||||
|
||||
## Architecture
|
||||
|
||||
- **Folder-per-scenario**: Each scenario lives under `scenarios/<scenario_name>/` with:
|
||||
- **test_<scenario>.py** — Test class extending `BaseScenarioTest`; sets `WORKLOAD_MANIFEST`, `SCENARIO_NAME`, `SCENARIO_TYPE`, `NAMESPACE_KEY_PATH`, and optionally `OVERRIDES_KEY_PATH`.
|
||||
- **resource.yaml** — Kubernetes resources (Deployment/Pod) for the scenario; namespace is patched at deploy time.
|
||||
- **scenario_base.yaml** — Canonical Krkn scenario; the base class loads it, patches namespace (and overrides), and passes it to Kraken via `run_scenario()`. Optional extra YAMLs (e.g. `nginx_http.yaml` for application_outage) can live in the same folder.
|
||||
- **lib/**: Shared framework — `lib/base.py` defines `BaseScenarioTest`, timeout constants (env-overridable), and scenario helpers (`load_and_patch_scenario`, `run_scenario`); `lib/utils.py` provides assertion and K8s helpers; `lib/k8s.py` provides K8s client fixtures; `lib/namespace.py` provides namespace lifecycle; `lib/deploy.py` provides `deploy_workload`, `wait_for_pods_running`, `wait_for_deployment_replicas`; `lib/kraken.py` provides `run_kraken`, `build_config` (using `CI/tests_v2/config/common_test_config.yaml`).
|
||||
- **conftest.py**: Re-exports fixtures from the lib modules and defines `pytest_addoption`, logging, and `repo_root`.
|
||||
- **Adding a new scenario**: Use the scaffold script (see [CONTRIBUTING_TESTS.md](CONTRIBUTING_TESTS.md)) to create `scenarios/<name>/` with test file, `resource.yaml`, and `scenario_base.yaml`, or copy an existing scenario folder and adapt.
|
||||
|
||||
## What is tested
|
||||
|
||||
Each test runs in an isolated ephemeral namespace; workloads are deployed automatically before the test and the namespace is deleted after (unless `--keep-ns-on-fail` is set and the test failed).
|
||||
|
||||
- **scenarios/pod_disruption/**
|
||||
Pod disruption scenario. `resource.yaml` is a deployment with label `app=krkn-pod-disruption-target`; `scenario_base.yaml` is loaded and `namespace_pattern` is patched to the test namespace. The test:
|
||||
1. Records baseline pod UIDs and restart counts.
|
||||
2. Runs Kraken with the pod disruption scenario.
|
||||
3. Asserts that chaos had an effect (UIDs changed or restart count increased).
|
||||
4. Waits for pods to be Running and all containers Ready.
|
||||
5. Asserts pod count is unchanged and all pods are healthy.
|
||||
|
||||
- **scenarios/application_outage/**
|
||||
Application outage scenario (block Ingress/Egress to target pods, then restore). `resource.yaml` is the main workload (outage pod); `scenario_base.yaml` is loaded and patched with namespace (and duration/block as needed). Optional `nginx_http.yaml` is used by the traffic test. Tests include:
|
||||
- **test_app_outage_block_restore_and_variants**: Happy path with default, exclude_label, and block variants (Ingress, Egress, both); Krkn exit 0, pods still Running/Ready.
|
||||
- **test_network_policy_created_then_deleted**: Policy with prefix `krkn-deny-` appears during run and is gone after.
|
||||
- **test_traffic_blocked_during_outage** (disabled, planned): Deploys nginx with label `scenario=outage`, port-forwards; during outage curl fails, after run curl succeeds.
|
||||
- **test_invalid_scenario_fails**: Invalid scenario file (missing `application_outage` key) causes Kraken to exit non-zero.
|
||||
- **test_bad_namespace_fails**: Scenario targeting a non-existent namespace causes Kraken to exit non-zero.
|
||||
|
||||
## Configuration
|
||||
|
||||
- **pytest.ini**: Markers (`functional`, `pod_disruption`, `application_outage`, `no_workload`). Use `--timeout=300`, `--reruns=2`, `--reruns-delay=10` on the command line for full runs.
|
||||
- **conftest.py**: Re-exports fixtures from `lib/k8s.py`, `lib/namespace.py`, `lib/deploy.py`, `lib/kraken.py` (e.g. `test_namespace`, `deploy_workload`, `k8s_core`, `wait_for_pods_running`, `run_kraken`, `build_config`). Configs are built from `CI/tests_v2/config/common_test_config.yaml` with monitoring disabled for local runs. Timeout constants in `lib/base.py` can be overridden via env vars.
|
||||
- **Cluster access**: Reads and applies use the Kubernetes Python client; `kubectl` is still used for `port-forward` and for running Kraken.
|
||||
- **lib/utils.py**: Pod/network policy helpers and assertion helpers (`assert_all_pods_running_and_ready`, `assert_pod_count_unchanged`, `assert_kraken_success`, `assert_kraken_failure`, `patch_namespace_in_docs`).
|
||||
|
||||
## Relationship to existing CI
|
||||
|
||||
- The **existing** bash tests in `CI/tests/` and `CI/run.sh` are **unchanged**. They continue to run as before in GitHub Actions.
|
||||
- This framework is **additive**. To run it in CI later, add a separate job or step that runs `pytest CI/tests_v2/ ...` from the repo root.
|
||||
|
||||
## Troubleshooting
|
||||
|
||||
- **`pytest.skip: Could not load kube config`** — No cluster or bad KUBECONFIG. Run `make -f CI/tests_v2/Makefile setup` (or `make setup` from `CI/tests_v2`) or check `kubectl cluster-info`.
|
||||
- **KinD cluster creation hangs** — Docker is not running. Start Docker Desktop or run `systemctl start docker`.
|
||||
- **`Bind for 0.0.0.0:9090 failed: port is already allocated`** — Another process (e.g. Prometheus) is using the port. The default dev config (`kind-config-dev.yml`) no longer maps host ports; if you use `KIND_CONFIG=kind-config.yml` or a custom config with `extraPortMappings`, free the port or switch to `kind-config-dev.yml`.
|
||||
- **`TimeoutError: Pods did not become ready`** — Slow image pull or node resource limits. Increase `KRKN_TEST_READINESS_TIMEOUT` or check node resources.
|
||||
- **`ModuleNotFoundError: pytest_rerunfailures`** — Missing test deps. Run `pip install -r CI/tests_v2/requirements.txt` (or `make setup`).
|
||||
- **Stale `krkn-test-*` namespaces** — Left over from a previous crashed run. They are auto-cleaned at session start (older than 30 min). To remove cluster and reports: `make -f CI/tests_v2/Makefile clean`.
|
||||
- **Wrong cluster targeted** — Multiple kube contexts. Use `--require-kind` to skip unless context is kind/minikube, or set context explicitly: `kubectl config use-context kind-ci-krkn`.
|
||||
- **`OSError: [Errno 48] Address already in use` when running tests in parallel** — Kraken normally starts an HTTP status server on port 8081. With `-n auto` (pytest-xdist), multiple Kraken processes would all try to bind to 8081. The test framework disables this server (`publish_kraken_status: False`) in the generated config, so parallel runs should not hit this. If you see it, ensure you're using the framework's `build_config` and not a config that has `publish_kraken_status: True`.
|
||||
74
CI/tests_v2/config/common_test_config.yaml
Normal file
74
CI/tests_v2/config/common_test_config.yaml
Normal file
@@ -0,0 +1,74 @@
|
||||
kraken:
|
||||
distribution: kubernetes # Distribution can be kubernetes or openshift.
|
||||
kubeconfig_path: ~/.kube/config # Path to kubeconfig.
|
||||
exit_on_failure: False # Exit when a post action scenario fails.
|
||||
publish_kraken_status: True # Can be accessed at http://0.0.0.0:8081
|
||||
signal_state: RUN # Will wait for the RUN signal when set to PAUSE before running the scenarios, refer docs/signal.md for more details
|
||||
signal_address: 0.0.0.0 # Signal listening address
|
||||
port: 8081 # Signal port
|
||||
auto_rollback: True # Enable auto rollback for scenarios.
|
||||
rollback_versions_directory: /tmp/kraken-rollback # Directory to store rollback version files.
|
||||
chaos_scenarios: # List of policies/chaos scenarios to load.
|
||||
- $scenario_type: # List of chaos pod scenarios to load.
|
||||
- $scenario_file
|
||||
cerberus:
|
||||
cerberus_enabled: False # Enable it when cerberus is previously installed.
|
||||
cerberus_url: # When cerberus_enabled is set to True, provide the url where cerberus publishes go/no-go signal.
|
||||
|
||||
performance_monitoring:
|
||||
capture_metrics: False
|
||||
metrics_profile_path: config/metrics-aggregated.yaml
|
||||
prometheus_url: # The prometheus url/route is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes.
|
||||
prometheus_bearer_token: # The bearer token is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. This is needed to authenticate with prometheus.
|
||||
uuid: # uuid for the run is generated by default if not set.
|
||||
enable_alerts: True # Runs the queries specified in the alert profile and displays the info or exits 1 when severity=error
|
||||
enable_metrics: True
|
||||
alert_profile: config/alerts.yaml # Path or URL to alert profile with the prometheus queries
|
||||
metrics_profile: config/metrics-report.yaml
|
||||
check_critical_alerts: True # When enabled, checks Prometheus for critical alerts firing after the chaos run.
|
||||
|
||||
tunings:
|
||||
wait_duration: 6 # Duration to wait between each chaos scenario.
|
||||
iterations: 1 # Number of times to execute the scenarios.
|
||||
daemon_mode: False # Iterations are set to infinity which means that the kraken will cause chaos forever.
|
||||
telemetry:
|
||||
enabled: False # enable/disables the telemetry collection feature
|
||||
api_url: https://yvnn4rfoi7.execute-api.us-west-2.amazonaws.com/test #telemetry service endpoint
|
||||
username: $TELEMETRY_USERNAME # telemetry service username
|
||||
password: $TELEMETRY_PASSWORD # telemetry service password
|
||||
prometheus_namespace: 'monitoring' # prometheus namespace
|
||||
prometheus_pod_name: 'prometheus-kind-prometheus-kube-prome-prometheus-0' # prometheus pod_name
|
||||
prometheus_container_name: 'prometheus'
|
||||
prometheus_backup: True # enables/disables prometheus data collection
|
||||
full_prometheus_backup: False # if is set to False only the /prometheus/wal folder will be downloaded.
|
||||
backup_threads: 5 # number of telemetry download/upload threads
|
||||
archive_path: /tmp # local path where the archive files will be temporarily stored
|
||||
max_retries: 0 # maximum number of upload retries (if 0 will retry forever)
|
||||
run_tag: '' # if set, this will be appended to the run folder in the bucket (useful to group the runs)
|
||||
archive_size: 10000 # size of each prometheus data archive in KB; the lower the archive size, the more archive files are produced and uploaded
|
||||
logs_backup: True
|
||||
logs_filter_patterns:
|
||||
- "(\\w{3}\\s\\d{1,2}\\s\\d{2}:\\d{2}:\\d{2}\\.\\d+).+" # Sep 9 11:20:36.123425532
|
||||
- "kinit (\\d+/\\d+/\\d+\\s\\d{2}:\\d{2}:\\d{2})\\s+" # kinit 2023/09/15 11:20:36 log
|
||||
- "(\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}\\.\\d+Z).+" # 2023-09-15T11:20:36.123425532Z log
|
||||
oc_cli_path: /usr/bin/oc # optional, if not specified it will be searched for in $PATH
|
||||
events_backup: True # enables/disables cluster events collection
|
||||
telemetry_group: "funtests"
|
||||
elastic:
|
||||
enable_elastic: False
|
||||
verify_certs: False
|
||||
elastic_url: "https://192.168.39.196" # To track results in elasticsearch, give url to server here; will post telemetry details when url and index not blank
|
||||
elastic_port: 32766
|
||||
username: "elastic"
|
||||
password: "test"
|
||||
metrics_index: "krkn-metrics"
|
||||
alerts_index: "krkn-alerts"
|
||||
telemetry_index: "krkn-telemetry"
|
||||
|
||||
health_checks: # Utilizing health check endpoints to observe application behavior during chaos injection.
|
||||
interval: # Interval in seconds to perform health checks, default value is 2 seconds
|
||||
config: # Provide list of health check configurations for applications
|
||||
- url: # Provide application endpoint
|
||||
bearer_token: # Bearer token for authentication if any
|
||||
auth: # Provide authentication credentials (username , password) in tuple format if any, ex:("admin","secretpassword")
|
||||
exit_on_failure: # If value is True exits when health check failed for application, values can be True/False
|
||||
67
CI/tests_v2/conftest.py
Normal file
67
CI/tests_v2/conftest.py
Normal file
@@ -0,0 +1,67 @@
|
||||
"""
|
||||
Shared fixtures for pytest functional tests (CI/tests_v2).
|
||||
Tests must be run from the repository root so run_kraken.py and config paths resolve.
|
||||
"""
|
||||
|
||||
import logging
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
|
||||
|
||||
def pytest_addoption(parser):
    """Register the command-line flags specific to the CI/tests_v2 suite.

    Both flags are boolean switches (``store_true``) that default to False.
    """
    boolean_flags = (
        (
            "--keep-ns-on-fail",
            "Don't delete test namespaces on failure (for debugging)",
        ),
        (
            "--require-kind",
            "Skip tests unless current context is a known dev cluster (kind, minikube)",
        ),
    )
    for flag, description in boolean_flags:
        parser.addoption(flag, action="store_true", default=False, help=description)
|
||||
|
||||
|
||||
@pytest.hookimpl(tryfirst=True, hookwrapper=True)
def pytest_runtest_makereport(item, call):
    """Attach each phase's report to the test item as ``rep_<when>``.

    Fixtures can then inspect the outcome during teardown, e.g. via
    ``request.node.rep_call``.
    """
    hook_outcome = yield
    report = hook_outcome.get_result()
    setattr(item, "rep_{}".format(report.when), report)
|
||||
|
||||
|
||||
def _repo_root() -> Path:
|
||||
"""Repository root (directory containing run_kraken.py and CI/)."""
|
||||
return Path(__file__).resolve().parent.parent.parent
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def repo_root():
    """Session-scoped fixture exposing the repository root as a ``Path``."""
    return _repo_root()
|
||||
|
||||
|
||||
@pytest.fixture(scope="session", autouse=True)
def _configure_logging():
    """Configure root logging once per session: timestamped records at INFO level."""
    log_settings = {
        "format": "%(asctime)s %(levelname)s [%(name)s] %(message)s",
        "datefmt": "%Y-%m-%d %H:%M:%S",
        "level": logging.INFO,
    }
    logging.basicConfig(**log_settings)
|
||||
|
||||
|
||||
# Re-export fixtures from lib modules so pytest discovers them
|
||||
from lib.deploy import deploy_workload, wait_for_pods_running # noqa: E402, F401
|
||||
from lib.kraken import build_config, run_kraken, run_kraken_background # noqa: E402, F401
|
||||
from lib.k8s import ( # noqa: E402, F401
|
||||
_kube_config_loaded,
|
||||
_log_cluster_context,
|
||||
k8s_apps,
|
||||
k8s_client,
|
||||
k8s_core,
|
||||
k8s_networking,
|
||||
kubectl,
|
||||
)
|
||||
from lib.namespace import _cleanup_stale_namespaces, test_namespace # noqa: E402, F401
|
||||
from lib.preflight import _preflight_checks # noqa: E402, F401
|
||||
8
CI/tests_v2/kind-config-dev.yml
Normal file
8
CI/tests_v2/kind-config-dev.yml
Normal file
@@ -0,0 +1,8 @@
|
||||
# Lean KinD config for local dev (faster than full 5-node). Use KIND_CONFIG to override.
|
||||
# No extraPortMappings so setup works when 9090/30080 are in use (e.g. local Prometheus).
|
||||
# For Prometheus/ES port mapping, use the repo root kind-config.yml.
|
||||
kind: Cluster
|
||||
apiVersion: kind.x-k8s.io/v1alpha4
|
||||
nodes:
|
||||
- role: control-plane
|
||||
- role: worker
|
||||
7
CI/tests_v2/lib/__init__.py
Normal file
7
CI/tests_v2/lib/__init__.py
Normal file
@@ -0,0 +1,7 @@
|
||||
# Shared framework for CI/tests_v2 functional tests.
|
||||
# base: BaseScenarioTest, timeout constants
|
||||
# utils: assertions, K8s helpers, patch_namespace_in_docs
|
||||
# k8s: K8s client fixtures, cluster context checks
|
||||
# namespace: test_namespace, stale namespace cleanup
|
||||
# deploy: deploy_workload, wait_for_pods_running, wait_for_deployment_replicas
|
||||
# kraken: run_kraken, run_kraken_background, build_config
|
||||
155
CI/tests_v2/lib/base.py
Normal file
155
CI/tests_v2/lib/base.py
Normal file
@@ -0,0 +1,155 @@
|
||||
"""
|
||||
Base class for CI/tests_v2 scenario tests.
|
||||
Encapsulates the shared lifecycle: ephemeral namespace, optional workload deploy, teardown.
|
||||
"""
|
||||
|
||||
import copy
|
||||
import logging
|
||||
import os
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
import yaml
|
||||
|
||||
from lib.utils import load_scenario_base
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _get_nested(obj, path):
|
||||
"""Walk path (list of keys/indices) and return the value. Supports list and dict."""
|
||||
for key in path:
|
||||
obj = obj[key]
|
||||
return obj
|
||||
|
||||
|
||||
def _set_nested(obj, path, value):
|
||||
"""Walk path to the parent and set the last key to value."""
|
||||
if not path:
|
||||
return
|
||||
parent_path, last_key = path[:-1], path[-1]
|
||||
parent = obj
|
||||
for key in parent_path:
|
||||
parent = parent[key]
|
||||
parent[last_key] = value
|
||||
|
||||
|
||||
# Timeout constants (seconds). Each one can be overridden through the environment
# variable named next to it (e.g. KRKN_TEST_READINESS_TIMEOUT).
# Keep them consistent with the pytest-timeout budget (e.g. 300s).
def _timeout_from_env(variable, default_seconds):
    """Read an integer number of seconds from the environment, falling back to the default."""
    return int(os.environ.get(variable, default_seconds))


TIMEOUT_BUDGET = _timeout_from_env("KRKN_TEST_TIMEOUT_BUDGET", "300")
DEPLOY_TIMEOUT = _timeout_from_env("KRKN_TEST_DEPLOY_TIMEOUT", "90")
READINESS_TIMEOUT = _timeout_from_env("KRKN_TEST_READINESS_TIMEOUT", "90")
NS_CLEANUP_TIMEOUT = _timeout_from_env("KRKN_TEST_NS_CLEANUP_TIMEOUT", "60")
POLICY_WAIT_TIMEOUT = _timeout_from_env("KRKN_TEST_POLICY_WAIT_TIMEOUT", "30")
KRAKEN_PROC_WAIT_TIMEOUT = _timeout_from_env("KRKN_TEST_KRAKEN_PROC_WAIT_TIMEOUT", "60")
|
||||
|
||||
|
||||
class BaseScenarioTest:
    """
    Shared lifecycle for scenario tests: ephemeral namespace, optional workload
    deployment, scenario patching and Kraken invocation.

    Subclasses configure behaviour through class attributes:
    - WORKLOAD_MANIFEST: path (str), or callable(namespace) -> YAML str for an inline manifest
    - WORKLOAD_IS_PATH: True if WORKLOAD_MANIFEST is a file path, False if inline YAML
    - LABEL_SELECTOR: label selector for pods to wait on (e.g. "app=my-target")
    - SCENARIO_NAME: e.g. "pod_disruption", "application_outage"
    - SCENARIO_TYPE: e.g. "pod_disruption_scenarios", "application_outages_scenarios"
    - NAMESPACE_KEY_PATH: path to the namespace field, e.g. [0, "config", "namespace_pattern"]
      or ["application_outage", "namespace"]
    - NAMESPACE_IS_REGEX: True to wrap the namespace in ^...$
    - OVERRIDES_KEY_PATH: path to the dict receiving **overrides
      (e.g. ["application_outage"]), or [] if none
    """

    WORKLOAD_MANIFEST = None
    WORKLOAD_IS_PATH = True
    LABEL_SELECTOR = None
    SCENARIO_NAME = ""
    SCENARIO_TYPE = ""
    NAMESPACE_KEY_PATH = []
    NAMESPACE_IS_REGEX = False
    OVERRIDES_KEY_PATH = []

    @pytest.fixture(autouse=True)
    def _inject_common_fixtures(
        self,
        repo_root,
        tmp_path,
        build_config,
        run_kraken,
        run_kraken_background,
        k8s_core,
        k8s_apps,
        k8s_networking,
        k8s_client,
    ):
        """Expose the commonly used fixtures as attributes of the test instance."""
        shared = {
            "repo_root": repo_root,
            "tmp_path": tmp_path,
            "build_config": build_config,
            "run_kraken": run_kraken,
            "run_kraken_background": run_kraken_background,
            "k8s_core": k8s_core,
            "k8s_apps": k8s_apps,
            "k8s_networking": k8s_networking,
            "k8s_client": k8s_client,
        }
        for attribute, fixture_value in shared.items():
            setattr(self, attribute, fixture_value)
        yield

    @pytest.fixture(autouse=True)
    def _setup_workload(self, request, repo_root):
        """Create the test namespace and, unless marked ``no_workload``, deploy the workload.

        The namespace name is stored on the test instance as ``ns``.
        """
        if "no_workload" in request.keywords:
            # Namespace only: the test manages its own resources.
            request.instance.ns = request.getfixturevalue("test_namespace")
            logger.debug("no_workload marker: skipping workload deploy, ns=%s", request.instance.ns)
            yield
            return

        deploy = request.getfixturevalue("deploy_workload")
        test_namespace = request.getfixturevalue("test_namespace")
        manifest = self.WORKLOAD_MANIFEST

        if not callable(manifest):
            is_path = self.WORKLOAD_IS_PATH
            # Relative manifest paths are interpreted relative to the repository root.
            if is_path and manifest and not Path(manifest).is_absolute():
                manifest = repo_root / manifest
            logger.info("Deploying workload from %s in ns=%s, label_selector=%s", manifest, test_namespace, self.LABEL_SELECTOR)
        else:
            # A callable renders inline YAML for this specific namespace.
            manifest = manifest(test_namespace)
            is_path = False
            logger.info("Deploying inline workload in ns=%s, label_selector=%s", test_namespace, self.LABEL_SELECTOR)

        request.instance.ns = deploy(
            manifest,
            self.LABEL_SELECTOR,
            is_path=is_path,
            timeout=DEPLOY_TIMEOUT,
        )
        yield

    def load_and_patch_scenario(self, repo_root, namespace, **overrides):
        """Return a private copy of the scenario base with namespace and overrides applied."""
        patched = copy.deepcopy(load_scenario_base(repo_root, self.SCENARIO_NAME))

        if self.NAMESPACE_KEY_PATH:
            ns_value = f"^{namespace}$" if self.NAMESPACE_IS_REGEX else namespace
            _set_nested(patched, self.NAMESPACE_KEY_PATH, ns_value)

        if overrides and self.OVERRIDES_KEY_PATH:
            section = _get_nested(patched, self.OVERRIDES_KEY_PATH)
            for option, option_value in overrides.items():
                section[option] = option_value

        return patched

    def write_scenario(self, tmp_path, scenario_data, suffix=""):
        """Serialize ``scenario_data`` as YAML into ``tmp_path`` and return the file path."""
        destination = tmp_path / f"{self.SCENARIO_NAME}_scenario{suffix}.yaml"
        rendered = yaml.dump(scenario_data, default_flow_style=False, sort_keys=False)
        destination.write_text(rendered)
        return destination

    def run_scenario(self, tmp_path, namespace, *, overrides=None, config_filename=None):
        """Patch and write the scenario, build the Kraken config, then run Kraken.

        Returns the CompletedProcess of the run. When KRKN_TEST_DRY_RUN=1 Kraken is
        not started and a synthetic successful CompletedProcess is returned instead.
        """
        patched = self.load_and_patch_scenario(self.repo_root, namespace, **(overrides or {}))
        scenario_path = self.write_scenario(tmp_path, patched)
        config_path = self.build_config(
            self.SCENARIO_TYPE,
            str(scenario_path),
            filename=config_filename or "test_config.yaml",
        )

        dry_run = os.environ.get("KRKN_TEST_DRY_RUN", "0") == "1"
        if not dry_run:
            return self.run_kraken(config_path)

        logger.info(
            "[dry-run] Would run Kraken with config=%s, scenario=%s",
            config_path,
            scenario_path,
        )
        return subprocess.CompletedProcess(
            args=[], returncode=0, stdout="[dry-run] skipped", stderr=""
        )
|
||||
145
CI/tests_v2/lib/deploy.py
Normal file
145
CI/tests_v2/lib/deploy.py
Normal file
@@ -0,0 +1,145 @@
|
||||
"""
|
||||
Workload deploy and pod/deployment readiness fixtures for CI/tests_v2.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import time
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
import yaml
|
||||
from kubernetes import utils as k8s_utils
|
||||
|
||||
from lib.base import READINESS_TIMEOUT
|
||||
from lib.utils import patch_namespace_in_docs
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def wait_for_deployment_replicas(k8s_apps, namespace: str, name: str, timeout: int = 120) -> None:
    """
    Poll (every 2s) until the deployment has ready_replicas >= spec.replicas.

    Args:
        k8s_apps: client exposing ``read_namespaced_deployment(name=..., namespace=...)``.
        namespace: namespace of the deployment.
        name: deployment name.
        timeout: maximum time to wait, in seconds.

    Raises:
        TimeoutError: if the deployment is not ready in time; the message carries
            the last observed ready/desired counts when available.
    """
    deadline = time.monotonic() + timeout
    last_dep = None
    attempts = 0
    while time.monotonic() < deadline:
        try:
            dep = k8s_apps.read_namespaced_deployment(name=name, namespace=namespace)
        except Exception as e:
            # Best effort: the deployment may not exist yet or the API may be flaky.
            logger.debug("Deployment %s/%s poll attempt %s failed: %s", namespace, name, attempts, e)
            time.sleep(2)
            attempts += 1
            continue
        last_dep = dep
        # status may be unset right after creation; treat it as zero ready replicas.
        ready = (dep.status.ready_replicas if dep.status else None) or 0
        # Only an *unset* replica count falls back to 1; an explicit 0
        # (deployment scaled to zero) must stay 0, otherwise it can never be "ready".
        desired = 1 if dep.spec.replicas is None else dep.spec.replicas
        if ready >= desired:
            logger.debug("Deployment %s/%s ready (%s/%s)", namespace, name, ready, desired)
            return
        logger.debug("Deployment %s/%s not ready yet: %s/%s", namespace, name, ready, desired)
        time.sleep(2)
        attempts += 1
    diag = ""
    if last_dep is not None and last_dep.status:
        diag = f" ready_replicas={last_dep.status.ready_replicas}, desired={last_dep.spec.replicas}"
    raise TimeoutError(
        f"Deployment {namespace}/{name} did not become ready within {timeout}s.{diag}"
    )
|
||||
|
||||
|
||||
@pytest.fixture
def wait_for_pods_running(k8s_core):
    """
    Fixture returning ``_wait(namespace, label_selector, timeout=READINESS_TIMEOUT)``.

    ``_wait`` polls until at least one pod matches the selector and every matching
    pod is in phase Running with all of its containers ready.
    Uses exponential backoff between polls: 1s, 2s, 4s, ... capped at 10s, and never
    sleeps past the deadline.
    Raises TimeoutError with diagnostic details on failure.
    """

    def _pod_ready(pod) -> bool:
        """True when the pod is Running and reports every container as ready."""
        status = pod.status
        if not status or status.phase != "Running":
            return False
        container_statuses = status.container_statuses
        if not container_statuses:
            return False
        return all(getattr(cs, "ready", False) for cs in container_statuses)

    def _wait(namespace: str, label_selector: str, timeout: int = READINESS_TIMEOUT):
        deadline = time.monotonic() + timeout
        interval = 1.0
        max_interval = 10.0
        last_list = None
        while time.monotonic() < deadline:
            try:
                pod_list = k8s_core.list_namespaced_pod(
                    namespace=namespace,
                    label_selector=label_selector,
                )
            except Exception as e:
                # Best effort: transient API errors are retried, but leave a trace.
                logger.debug(
                    "Listing pods in %s (selector=%s) failed, will retry: %s",
                    namespace, label_selector, e,
                )
            else:
                last_list = pod_list
                items = pod_list.items or []
                # No pods yet counts as "not ready": keep waiting.
                if items and all(_pod_ready(p) for p in items):
                    return
            remaining = deadline - time.monotonic()
            if remaining <= 0:
                break
            # Do not sleep beyond the deadline.
            time.sleep(min(interval, remaining))
            interval = min(interval * 2, max_interval)

        diag = ""
        if last_list is not None:
            pods = last_list.items or []
            if not pods:
                diag = " No pods matched the selector."
            else:
                # Prefer a pod that is actually not ready for the diagnostic.
                culprit = next((p for p in pods if not _pod_ready(p)), pods[0])
                diag = f" e.g. pod {culprit.metadata.name}: phase={getattr(culprit.status, 'phase', None)}"
        raise TimeoutError(
            f"Pods in {namespace} with label {label_selector} did not become ready within {timeout}s.{diag}"
        )

    return _wait
|
||||
|
||||
|
||||
@pytest.fixture(scope="function")
def deploy_workload(test_namespace, k8s_client, wait_for_pods_running, repo_root, tmp_path):
    """
    Fixture returning ``deploy(manifest_path_or_content, label_selector, *, is_path=True, timeout=...)``.

    ``deploy`` loads the manifest (from a file, or from inline YAML when
    ``is_path`` is False), rewrites it for the test namespace, creates the
    resources, waits for the selected pods to be ready and returns the
    namespace name.
    """

    def _load_docs(source, is_path):
        """Parse every YAML document from a manifest file or an inline string."""
        if not is_path:
            return list(yaml.safe_load_all(source))
        manifest_file = Path(source)
        if not manifest_file.is_absolute():
            # Relative paths are resolved against the repository root.
            manifest_file = repo_root / manifest_file
        with open(manifest_file) as handle:
            return list(yaml.safe_load_all(handle))

    def _deploy(manifest_path_or_content, label_selector, *, is_path=True, timeout=READINESS_TIMEOUT):
        try:
            docs = patch_namespace_in_docs(
                _load_docs(manifest_path_or_content, is_path),
                test_namespace,
            )
            k8s_utils.create_from_yaml(
                k8s_client,
                yaml_objects=docs,
                namespace=test_namespace,
            )
        except k8s_utils.FailToCreateError as e:
            # Collapse the per-resource API errors into a single readable message.
            details = "; ".join(str(exc) for exc in e.api_exceptions)
            raise RuntimeError(f"Failed to create resources: {details}") from e

        logger.info("Workload applied in namespace=%s, waiting for pods with selector=%s", test_namespace, label_selector)
        wait_for_pods_running(test_namespace, label_selector, timeout=timeout)
        logger.info("Pods ready in namespace=%s", test_namespace)
        return test_namespace

    return _deploy
|
||||
88
CI/tests_v2/lib/k8s.py
Normal file
88
CI/tests_v2/lib/k8s.py
Normal file
@@ -0,0 +1,88 @@
|
||||
"""
|
||||
Kubernetes client fixtures and cluster context checks for CI/tests_v2.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import subprocess
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
from kubernetes import client, config
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def _kube_config_loaded():
    """Load the kubeconfig once per session; skip when it cannot be loaded."""
    try:
        config.load_kube_config()
    except config.ConfigException as err:
        logger.warning("Could not load kube config: %s", err)
        pytest.skip(f"Could not load kube config (is a cluster running?): {err}")
    else:
        logger.info("Kube config loaded successfully")
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def k8s_core(_kube_config_loaded):
    """Session-wide CoreV1Api client (pods, namespaces, ...), created after the kubeconfig is loaded."""
    return client.CoreV1Api()


@pytest.fixture(scope="session")
def k8s_networking(_kube_config_loaded):
    """Session-wide NetworkingV1Api client, used for network policies."""
    return client.NetworkingV1Api()


@pytest.fixture(scope="session")
def k8s_client(_kube_config_loaded):
    """Session-wide generic ApiClient, e.g. for ``create_from_yaml``."""
    return client.ApiClient()


@pytest.fixture(scope="session")
def k8s_apps(_kube_config_loaded):
    """Session-wide AppsV1Api client, used to poll deployment status."""
    return client.AppsV1Api()
|
||||
|
||||
|
||||
@pytest.fixture(scope="session", autouse=True)
def _log_cluster_context(request):
    """Log the active kube context; with --require-kind, skip unless the cluster name looks like kind/minikube."""
    try:
        _, active = config.list_kube_config_contexts()
    except Exception as err:
        logger.warning("Could not list kube config contexts: %s", err)
        return

    if not active:
        return

    context_name = active.get("name", "?")
    cluster = (active.get("context") or {}).get("cluster", "?")
    logger.info("Running tests against cluster: context=%s cluster=%s", context_name, cluster)

    require_kind = request.config.getoption("--require-kind", False)
    if require_kind:
        normalized = (cluster or "").lower()
        looks_like_dev = any(marker in normalized for marker in ("kind", "minikube"))
        if not looks_like_dev:
            pytest.skip(
                f"Cluster '{cluster}' does not look like kind/minikube. "
                "Use default kubeconfig or pass --require-kind only on dev clusters."
            )
|
||||
|
||||
|
||||
@pytest.fixture
def kubectl(repo_root):
    """
    Fixture returning ``run(args, timeout=120)`` which executes kubectl from the repo root.

    ``args`` may be a list/tuple of arguments or a single command-line string
    (split with shell-like rules, e.g. ``"get pods -n default"``).
    Returns the CompletedProcess with captured text stdout/stderr; a non-zero exit
    status is not raised, callers must check ``returncode``.
    """
    import shlex  # local import: only needed to split string arguments

    def run(args, timeout=120):
        # list("get pods") would explode a string into single characters,
        # so strings are tokenized instead.
        argv = shlex.split(args) if isinstance(args, str) else list(args)
        return subprocess.run(
            ["kubectl", *argv],
            cwd=repo_root,
            capture_output=True,
            text=True,
            timeout=timeout,
        )

    return run
|
||||
94
CI/tests_v2/lib/kraken.py
Normal file
94
CI/tests_v2/lib/kraken.py
Normal file
@@ -0,0 +1,94 @@
|
||||
"""
|
||||
Kraken execution and config building fixtures for CI/tests_v2.
|
||||
"""
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
import yaml
|
||||
|
||||
|
||||
def _kraken_cmd(config_path: str, repo_root: Path):
|
||||
"""Use the same Python as the test process so venv/.venv and coverage match."""
|
||||
python = sys.executable
|
||||
if os.environ.get("KRKN_TEST_COVERAGE", "0") == "1":
|
||||
return [
|
||||
python, "-m", "coverage", "run", "-a",
|
||||
"run_kraken.py", "-c", str(config_path),
|
||||
]
|
||||
return [python, "run_kraken.py", "-c", str(config_path)]
|
||||
|
||||
|
||||
@pytest.fixture
def run_kraken(repo_root):
    """Fixture returning ``run(config_path, timeout=300, extra_args=None)``.

    ``run`` executes Kraken synchronously from the repository root and returns the
    CompletedProcess with captured text output.
    """

    def run(config_path, timeout=300, extra_args=None):
        argv = _kraken_cmd(config_path, repo_root)
        argv.extend(extra_args or [])
        completed = subprocess.run(
            argv,
            cwd=repo_root,
            capture_output=True,
            text=True,
            timeout=timeout,
        )
        return completed

    return run
|
||||
|
||||
|
||||
@pytest.fixture
def run_kraken_background(repo_root):
    """Fixture returning ``start(config_path)`` which launches Kraken without waiting.

    ``start`` returns the Popen handle; stdout and stderr are text-mode pipes.
    The caller is responsible for stopping/reaping the process
    (``proc.terminate()`` / ``proc.wait()``).
    """

    def start(config_path):
        popen_options = {
            "cwd": repo_root,
            "stdout": subprocess.PIPE,
            "stderr": subprocess.PIPE,
            "text": True,
        }
        return subprocess.Popen(_kraken_cmd(config_path, repo_root), **popen_options)

    return start
|
||||
|
||||
|
||||
@pytest.fixture
def build_config(repo_root, tmp_path):
    """
    Fixture returning ``_build(scenario_type, scenario_file, filename="test_config.yaml")``.

    ``_build`` renders CI/tests_v2/config/common_test_config.yaml with the scenario
    placeholders substituted, forces the settings needed for local/parallel runs
    (status server, Prometheus checks and Elastic off; 1s wait between scenarios)
    and writes the result into ``tmp_path``. Returns the written path as a string.
    """
    template_path = repo_root / "CI" / "tests_v2" / "config" / "common_test_config.yaml"

    # (section, key, value) applied in this order, only when the section exists.
    forced_settings = (
        # Disable status server so parallel test workers don't all bind to port 8081
        ("kraken", "publish_kraken_status", False),
        ("performance_monitoring", "check_critical_alerts", False),
        ("performance_monitoring", "enable_alerts", False),
        ("performance_monitoring", "enable_metrics", False),
        ("elastic", "enable_elastic", False),
        ("tunings", "wait_duration", 1),
    )

    def _build(scenario_type: str, scenario_file: str, filename: str = "test_config.yaml"):
        substitutions = (
            ("$scenario_type", scenario_type),
            ("$scenario_file", scenario_file),
            ("$post_config", ""),
        )
        rendered = template_path.read_text()
        for placeholder, replacement in substitutions:
            rendered = rendered.replace(placeholder, replacement)

        settings = yaml.safe_load(rendered)
        for section, key, forced_value in forced_settings:
            if section in settings:
                settings[section][key] = forced_value

        destination = tmp_path / filename
        with open(destination, "w") as handle:
            yaml.dump(settings, handle, default_flow_style=False, sort_keys=False)
        return str(destination)

    return _build
|
||||
114
CI/tests_v2/lib/namespace.py
Normal file
114
CI/tests_v2/lib/namespace.py
Normal file
@@ -0,0 +1,114 @@
|
||||
"""
|
||||
Namespace lifecycle fixtures for CI/tests_v2: create, delete, stale cleanup.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import os
|
||||
import time
|
||||
import uuid
|
||||
from datetime import datetime
|
||||
|
||||
import pytest
|
||||
from kubernetes import client
|
||||
from kubernetes.client.rest import ApiException
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
STALE_NS_AGE_MINUTES = 30
|
||||
|
||||
|
||||
def _namespace_age_minutes(metadata) -> float:
|
||||
"""Return age of namespace in minutes from its creation_timestamp."""
|
||||
if not metadata or not metadata.creation_timestamp:
|
||||
return 0.0
|
||||
created = metadata.creation_timestamp
|
||||
if hasattr(created, "timestamp"):
|
||||
created_ts = created.timestamp()
|
||||
else:
|
||||
try:
|
||||
dt = datetime.fromisoformat(created.replace("Z", "+00:00"))
|
||||
created_ts = dt.timestamp()
|
||||
except Exception:
|
||||
return 0.0
|
||||
return (time.time() - created_ts) / 60.0
|
||||
|
||||
|
||||
def _wait_for_namespace_gone(k8s_core, name: str, timeout: int = 60):
|
||||
"""Poll until the namespace no longer exists."""
|
||||
deadline = time.monotonic() + timeout
|
||||
while time.monotonic() < deadline:
|
||||
try:
|
||||
k8s_core.read_namespace(name=name)
|
||||
except ApiException as e:
|
||||
if e.status == 404:
|
||||
return
|
||||
raise
|
||||
time.sleep(1)
|
||||
raise TimeoutError(f"Namespace {name} did not disappear within {timeout}s")
|
||||
|
||||
|
||||
@pytest.fixture(scope="function")
def test_namespace(request, k8s_core):
    """
    Create a uniquely named ``krkn-test-<8 hex chars>`` namespace and yield its name.

    On teardown the namespace is deleted (background propagation, not awaited),
    unless --keep-ns-on-fail was given and the test's call phase failed.
    Deletion errors are logged, not raised.
    """
    name = f"krkn-test-{uuid.uuid4().hex[:8]}"
    namespace_labels = {
        "pod-security.kubernetes.io/audit": "privileged",
        "pod-security.kubernetes.io/enforce": "privileged",
        "pod-security.kubernetes.io/enforce-version": "v1.24",
        "pod-security.kubernetes.io/warn": "privileged",
        "security.openshift.io/scc.podSecurityLabelSync": "false",
    }
    metadata = client.V1ObjectMeta(name=name, labels=namespace_labels)
    k8s_core.create_namespace(body=client.V1Namespace(metadata=metadata))
    logger.info("Created test namespace: %s", name)

    yield name

    if request.config.getoption("--keep-ns-on-fail", False):
        # rep_call is attached to the node by the pytest_runtest_makereport hook.
        call_report = getattr(request.node, "rep_call", None)
        if call_report is not None and call_report.failed:
            logger.info("[keep-ns-on-fail] Keeping namespace %s for debugging", name)
            return

    try:
        delete_options = client.V1DeleteOptions(propagation_policy="Background")
        k8s_core.delete_namespace(name=name, body=delete_options)
        logger.debug("Scheduled background deletion for namespace: %s", name)
    except Exception as err:
        logger.warning("Failed to delete namespace %s: %s", name, err)
|
||||
|
||||
|
||||
@pytest.fixture(scope="session", autouse=True)
def _cleanup_stale_namespaces(k8s_core):
    """
    Delete krkn-test-* namespaces older than STALE_NS_AGE_MINUTES at session start.

    Runs in a non-parallel session, or on the first pytest-xdist worker (``gw0``)
    only. Under xdist, session fixtures are instantiated in the worker processes,
    so skipping every worker would mean the cleanup never runs with ``-n``.
    Failures are logged and never abort the session.
    """
    worker_id = os.environ.get("PYTEST_XDIST_WORKER")
    if worker_id and worker_id != "gw0":
        # One worker is enough; the others skip to avoid redundant deletes.
        return
    try:
        namespaces = k8s_core.list_namespace()
    except Exception as e:
        logger.warning("Could not list namespaces for stale cleanup: %s", e)
        return
    for ns in namespaces.items or []:
        name = ns.metadata.name if ns.metadata else ""
        if not name.startswith("krkn-test-"):
            continue
        # Young namespaces may belong to a run that is still in progress.
        if _namespace_age_minutes(ns.metadata) <= STALE_NS_AGE_MINUTES:
            continue
        try:
            logger.warning("Deleting stale namespace: %s", name)
            k8s_core.delete_namespace(
                name=name,
                body=client.V1DeleteOptions(propagation_policy="Background"),
            )
        except Exception as e:
            logger.warning("Failed to delete stale namespace %s: %s", name, e)
|
||||
48
CI/tests_v2/lib/preflight.py
Normal file
48
CI/tests_v2/lib/preflight.py
Normal file
@@ -0,0 +1,48 @@
|
||||
"""
|
||||
Preflight checks for CI/tests_v2: cluster reachability and test deps at session start.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import subprocess
|
||||
|
||||
import pytest
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
@pytest.fixture(scope="session", autouse=True)
def _preflight_checks(repo_root):
    """
    Verify cluster is reachable and test deps are importable at session start.

    Skips the session if a required pytest plugin is missing, if ``kubectl`` is
    not installed, or if ``kubectl cluster-info`` fails or does not answer
    within 10 seconds.
    """
    # Check test deps (pytest plugins)
    try:
        import pytest_rerunfailures  # noqa: F401
        import pytest_html  # noqa: F401
        import pytest_timeout  # noqa: F401
        import pytest_order  # noqa: F401
        import xdist  # noqa: F401
    except ImportError as e:
        pytest.skip(
            f"Missing test dependency: {e}. "
            "Run: pip install -r CI/tests_v2/requirements.txt"
        )

    # Check cluster reachable and log server URL
    try:
        result = subprocess.run(
            ["kubectl", "cluster-info"],
            cwd=repo_root,
            capture_output=True,
            text=True,
            timeout=10,
        )
    except FileNotFoundError:
        # kubectl binary is not on PATH: skip instead of erroring the whole session.
        pytest.skip(
            "Cluster not reachable (kubectl not found on PATH). "
            "Install kubectl and start a cluster (e.g. make setup)."
        )
    except subprocess.TimeoutExpired:
        # An unreachable API server can make cluster-info hang until the timeout.
        pytest.skip(
            "Cluster not reachable (kubectl cluster-info timed out after 10s). "
            "Start a cluster (e.g. make setup) or check KUBECONFIG."
        )
    if result.returncode != 0:
        pytest.skip(
            f"Cluster not reachable (kubectl cluster-info failed). "
            f"Start a cluster (e.g. make setup) or check KUBECONFIG. stderr: {result.stderr or '(none)'}"
        )
    # Log first line of cluster-info (server URL) for debugging
    if result.stdout:
        first_line = result.stdout.strip().split("\n")[0]
        logger.info("Preflight: %s", first_line)
|
||||
212
CI/tests_v2/lib/utils.py
Normal file
212
CI/tests_v2/lib/utils.py
Normal file
@@ -0,0 +1,212 @@
|
||||
"""
|
||||
Shared helpers for CI/tests_v2 functional tests.
|
||||
"""
|
||||
|
||||
import logging
|
||||
import time
|
||||
from pathlib import Path
|
||||
from typing import List, Optional, Union
|
||||
|
||||
import pytest
|
||||
import yaml
|
||||
from kubernetes.client import V1NetworkPolicy, V1NetworkPolicyList, V1Pod, V1PodList
|
||||
|
||||
logger = logging.getLogger(__name__)
|
||||
|
||||
|
||||
def _pods(pod_list: Union[V1PodList, List[V1Pod]]) -> List[V1Pod]:
    """Return the pods as a plain list, unwrapping ``.items`` when the argument has one."""
    if hasattr(pod_list, "items"):
        return pod_list.items
    return pod_list
|
||||
|
||||
|
||||
def _policies(
    policy_list: Union[V1NetworkPolicyList, List[V1NetworkPolicy]],
) -> List[V1NetworkPolicy]:
    """Return the policies as a plain list, unwrapping ``.items`` when the argument has one."""
    if hasattr(policy_list, "items"):
        return policy_list.items
    return policy_list
|
||||
|
||||
|
||||
def scenario_dir(repo_root: Path, scenario_name: str) -> Path:
    """Build ``<repo_root>/CI/tests_v2/scenarios/<scenario_name>`` (the path is not checked for existence)."""
    return repo_root.joinpath("CI", "tests_v2", "scenarios", scenario_name)
|
||||
|
||||
|
||||
def load_scenario_base(
    repo_root: Path,
    scenario_name: str,
    filename: str = "scenario_base.yaml",
) -> Union[dict, list]:
    """
    Load and parse the scenario base YAML for a scenario.

    Args:
        repo_root: repository root used to locate CI/tests_v2/scenarios/.
        scenario_name: name of the scenario folder.
        filename: YAML file inside the scenario folder.

    Returns:
        dict or list depending on the YAML structure.

    Raises:
        ValueError: if the file parses to nothing (empty document).
    """
    path = scenario_dir(repo_root, scenario_name) / filename
    # Read explicitly as UTF-8 so parsing does not depend on the platform's
    # default locale encoding.
    text = path.read_text(encoding="utf-8")
    data = yaml.safe_load(text)
    if data is None:
        raise ValueError(f"Empty or invalid YAML in {path}")
    return data
|
||||
|
||||
|
||||
def patch_namespace_in_docs(docs: list, namespace: str) -> list:
    """Set ``metadata.namespace`` on every mapping document that has a non-null ``metadata``.

    The documents are modified in place and the same list object is returned.
    Entries that are not dicts, or that lack ``metadata``, are left untouched.
    """
    for entry in docs:
        if not isinstance(entry, dict):
            continue
        meta = entry.get("metadata")
        if meta is None:
            continue
        meta["namespace"] = namespace
    return docs
|
||||
|
||||
|
||||
def get_pods_list(k8s_core, namespace: str, label_selector: str) -> V1PodList:
    """Query the API via ``k8s_core.list_namespaced_pod`` for pods in ``namespace`` matching ``label_selector``."""
    pods = k8s_core.list_namespaced_pod(
        label_selector=label_selector,
        namespace=namespace,
    )
    return pods
|
||||
|
||||
|
||||
def get_pods_or_skip(
    k8s_core,
    namespace: str,
    label_selector: str,
    no_pods_reason: Optional[str] = None,
) -> V1PodList:
    """
    Fetch matching pods, skipping the current test when that is not possible.

    The test is skipped when the list call raises (reported as an unreachable
    cluster) or when no pod matches the selector. ``no_pods_reason`` replaces
    the default skip message for the "no pods" case when it is non-empty.
    """
    try:
        found = k8s_core.list_namespaced_pod(
            namespace=namespace,
            label_selector=label_selector,
        )
    except Exception as e:
        pytest.skip(f"Cluster unreachable: {e}")

    if found.items:
        return found

    reason = no_pods_reason or (
        f"No pods in {namespace} with label {label_selector}. "
        "Start a KinD cluster with default storage (local-path-provisioner)."
    )
    pytest.skip(reason)
    return found
|
||||
|
||||
|
||||
def pod_uids(pod_list: Union[V1PodList, List[V1Pod]]) -> list:
    """Collect ``metadata.uid`` of every pod, in the order the pods are given."""
    uids = []
    for pod in _pods(pod_list):
        uids.append(pod.metadata.uid)
    return uids
|
||||
|
||||
|
||||
def restart_counts(pod_list: Union[V1PodList, List[V1Pod]]) -> int:
    """Sum ``restart_count`` over every container status of every pod.

    Pods without a status or without container statuses contribute nothing;
    a container status lacking ``restart_count`` counts as 0.
    """
    return sum(
        getattr(status, "restart_count", 0)
        for pod in _pods(pod_list)
        if pod.status and pod.status.container_statuses
        for status in pod.status.container_statuses
    )
|
||||
|
||||
|
||||
def get_network_policies_list(k8s_networking, namespace: str) -> V1NetworkPolicyList:
    """Query the API via ``k8s_networking.list_namespaced_network_policy`` for ``namespace``."""
    policies = k8s_networking.list_namespaced_network_policy(namespace=namespace)
    return policies
|
||||
|
||||
|
||||
def find_network_policy_by_prefix(
    policy_list: Union[V1NetworkPolicyList, List[V1NetworkPolicy]],
    name_prefix: str,
) -> Optional[V1NetworkPolicy]:
    """Find the first policy whose ``metadata.name`` begins with ``name_prefix``.

    Policies without metadata or without a name are ignored. Returns ``None``
    when nothing matches.
    """
    for candidate in _policies(policy_list):
        meta = candidate.metadata
        if not meta or not meta.name:
            continue
        if meta.name.startswith(name_prefix):
            return candidate
    return None
|
||||
|
||||
|
||||
def assert_all_pods_running_and_ready(
    pod_list: Union[V1PodList, List[V1Pod]],
    namespace: str = "",
) -> None:
    """
    Fail unless every pod is in phase ``Running`` with every reported container ready.

    Pods that report no container statuses only get the phase check. When
    ``namespace`` is given it is appended to the failure messages.
    """
    ns_suffix = f" (namespace={namespace})" if namespace else ""
    for pod in _pods(pod_list):
        is_running = pod.status and pod.status.phase == "Running"
        assert is_running, (
            f"Pod {pod.metadata.name} not Running after scenario: {pod.status}{ns_suffix}"
        )
        for cs in pod.status.container_statuses or []:
            ready = getattr(cs, "ready", False)
            assert ready is True, (
                f"Container {getattr(cs, 'name', '?')} not ready in pod {pod.metadata.name}{ns_suffix}"
            )
|
||||
|
||||
|
||||
def assert_pod_count_unchanged(
    before: Union[V1PodList, List[V1Pod]],
    after: Union[V1PodList, List[V1Pod]],
    namespace: str = "",
) -> None:
    """Fail when ``after`` holds a different number of pods than ``before``.

    ``namespace``, when given, is appended to the failure message.
    """
    expected = len(_pods(before))
    actual = len(_pods(after))
    ns_suffix = f" (namespace={namespace})" if namespace else ""
    assert actual == expected, (
        f"Pod count changed after scenario: expected {expected}, got {actual}.{ns_suffix}"
    )
|
||||
|
||||
|
||||
def assert_kraken_success(result, context: str = "", tmp_path=None, allowed_codes=(0,)) -> None:
    """
    Assert Kraken run succeeded (returncode in allowed_codes).

    Args:
        result: object exposing ``returncode``, ``stdout`` and ``stderr``.
        context: optional text appended to the failure headline.
        tmp_path: optional directory; on failure the full stdout/stderr are
            written to ``kraken_stdout.log`` / ``kraken_stderr.log`` inside it.
        allowed_codes: return codes treated as success. Default ``(0,)``; for
            alert-aware tests use ``(0, 2)``.

    Raises:
        AssertionError: when the return code is not allowed. The message holds
            stderr, the last 20 lines of stdout and, only if the log files
            were actually written, their location.
    """
    if result.returncode in allowed_codes:
        return

    logs_written = False
    if tmp_path is not None:
        try:
            (tmp_path / "kraken_stdout.log").write_text(result.stdout or "")
            (tmp_path / "kraken_stderr.log").write_text(result.stderr or "")
            logs_written = True
        except Exception as e:
            logger.warning("Could not write Kraken logs to tmp_path: %s", e)

    lines = (result.stdout or "").splitlines()
    tail_stdout = "\n".join(lines[-20:]) if lines else "(empty)"
    context_str = f" {context}" if context else ""
    # Only advertise the log files when writing them succeeded; otherwise the
    # message would point at files that do not exist.
    path_hint = (
        f"\nFull logs: {tmp_path}/kraken_stdout.log, {tmp_path}/kraken_stderr.log"
        if logs_written
        else ""
    )
    raise AssertionError(
        f"Krkn failed (rc={result.returncode}){context_str}.{path_hint}\n"
        f"--- stderr ---\n{result.stderr or '(empty)'}\n"
        f"--- stdout (last 20 lines) ---\n{tail_stdout}"
    )
|
||||
|
||||
|
||||
def assert_kraken_failure(result, context: str = "", tmp_path=None) -> None:
    """
    Assert Kraken run failed (returncode != 0).

    Args:
        result: object exposing ``returncode``, ``stdout`` and ``stderr``.
        context: optional text appended to the failure headline.
        tmp_path: optional directory; when Kraken unexpectedly succeeded the
            full stdout/stderr are written to ``kraken_stdout.log`` /
            ``kraken_stderr.log`` inside it.

    Raises:
        AssertionError: when the return code is 0. The message holds stderr,
            the last 20 lines of stdout and, only if the log files were
            actually written, their location.
    """
    if result.returncode != 0:
        return

    logs_written = False
    if tmp_path is not None:
        try:
            (tmp_path / "kraken_stdout.log").write_text(result.stdout or "")
            (tmp_path / "kraken_stderr.log").write_text(result.stderr or "")
            logs_written = True
        except Exception as e:
            logger.warning("Could not write Kraken logs to tmp_path: %s", e)

    lines = (result.stdout or "").splitlines()
    tail_stdout = "\n".join(lines[-20:]) if lines else "(empty)"
    context_str = f" {context}" if context else ""
    # Only advertise the log files when writing them succeeded; otherwise the
    # message would point at files that do not exist.
    path_hint = (
        f"\nFull logs: {tmp_path}/kraken_stdout.log, {tmp_path}/kraken_stderr.log"
        if logs_written
        else ""
    )
    raise AssertionError(
        f"Expected Krkn to fail but it succeeded (rc=0){context_str}.{path_hint}\n"
        f"--- stderr ---\n{result.stderr or '(empty)'}\n"
        f"--- stdout (last 20 lines) ---\n{tail_stdout}"
    )
|
||||
14
CI/tests_v2/pytest.ini
Normal file
14
CI/tests_v2/pytest.ini
Normal file
@@ -0,0 +1,14 @@
|
||||
[pytest]
|
||||
testpaths = .
|
||||
python_files = test_*.py
|
||||
python_functions = test_*
|
||||
# Install CI/tests_v2/requirements.txt for --timeout, --reruns, --reruns-delay.
|
||||
# Example full run: pytest CI/tests_v2/ -v --timeout=300 --reruns=2 --reruns-delay=10 --html=... --junitxml=...
|
||||
addopts = -v
|
||||
markers =
|
||||
functional: marks a test as a functional test (deselect with '-m "not functional"')
|
||||
pod_disruption: marks a test as a pod disruption scenario test
|
||||
application_outage: marks a test as an application outage scenario test
|
||||
no_workload: skip workload deployment for this test (e.g. negative tests)
|
||||
order: set test order (pytest-order)
|
||||
junit_family = xunit2
|
||||
15
CI/tests_v2/requirements.txt
Normal file
15
CI/tests_v2/requirements.txt
Normal file
@@ -0,0 +1,15 @@
|
||||
# Pytest plugin deps for CI/tests_v2 functional tests.
|
||||
#
|
||||
# Kept separate from the root requirements.txt because:
|
||||
# - Root deps are Kraken runtime (cloud SDKs, K8s client, etc.)
|
||||
# - These are test-only plugins not needed by Kraken itself
|
||||
# - Merging would bloat installs for users who don't run functional tests
|
||||
# - Separate files reduce version-conflict risk between test and runtime deps
|
||||
#
|
||||
# pytest and coverage are already in root requirements.txt; do NOT duplicate here.
|
||||
# The Makefile installs both files automatically via `make setup`.
|
||||
pytest-rerunfailures>=14.0
|
||||
pytest-html>=4.1.0
|
||||
pytest-timeout>=2.2.0
|
||||
pytest-order>=1.2.0
|
||||
pytest-xdist>=3.5.0
|
||||
230
CI/tests_v2/scaffold.py
Normal file
230
CI/tests_v2/scaffold.py
Normal file
@@ -0,0 +1,230 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Generate boilerplate for a new scenario test in CI/tests_v2.
|
||||
|
||||
Usage (from repository root):
|
||||
python CI/tests_v2/scaffold.py --scenario service_hijacking
|
||||
python CI/tests_v2/scaffold.py --scenario node_disruption --scenario-type node_scenarios
|
||||
|
||||
Creates (folder-per-scenario layout):
|
||||
- CI/tests_v2/scenarios/<scenario>/test_<scenario>.py (BaseScenarioTest subclass + stub test)
|
||||
- CI/tests_v2/scenarios/<scenario>/resource.yaml (placeholder workload)
|
||||
- CI/tests_v2/scenarios/<scenario>/scenario_base.yaml (placeholder Krkn scenario; edit for your scenario_type)
|
||||
- Adds the scenario marker to pytest.ini (if not already present)
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import re
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def snake_to_camel(snake: str) -> str:
    """Turn ``snake_case`` into ``CamelCase`` by capitalizing each ``_``-separated part and joining them."""
    parts = snake.split("_")
    return "".join(map(str.capitalize, parts))
|
||||
|
||||
|
||||
def scenario_type_default(scenario: str) -> str:
    """Derive the default build_config scenario_type: the scenario name followed by ``_scenarios``."""
    return "{}_scenarios".format(scenario)
|
||||
|
||||
|
||||
TEST_FILE_TEMPLATE = '''"""
|
||||
Functional test for {scenario} scenario.
|
||||
Each test runs in its own ephemeral namespace with workload deployed automatically.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
|
||||
from lib.base import BaseScenarioTest
|
||||
from lib.utils import (
|
||||
assert_all_pods_running_and_ready,
|
||||
assert_kraken_failure,
|
||||
assert_kraken_success,
|
||||
assert_pod_count_unchanged,
|
||||
get_pods_list,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.functional
|
||||
@pytest.mark.{marker}
|
||||
class Test{class_name}(BaseScenarioTest):
|
||||
"""{scenario} scenario."""
|
||||
|
||||
WORKLOAD_MANIFEST = "CI/tests_v2/scenarios/{scenario}/resource.yaml"
|
||||
WORKLOAD_IS_PATH = True
|
||||
LABEL_SELECTOR = "app={app_label}"
|
||||
SCENARIO_NAME = "{scenario}"
|
||||
SCENARIO_TYPE = "{scenario_type}"
|
||||
NAMESPACE_KEY_PATH = {namespace_key_path}
|
||||
NAMESPACE_IS_REGEX = {namespace_is_regex}
|
||||
OVERRIDES_KEY_PATH = {overrides_key_path}
|
||||
|
||||
@pytest.mark.order(1)
|
||||
def test_happy_path(self):
|
||||
"""Run {scenario} scenario and assert pods remain healthy."""
|
||||
ns = self.ns
|
||||
before = get_pods_list(self.k8s_core, ns, self.LABEL_SELECTOR)
|
||||
|
||||
result = self.run_scenario(self.tmp_path, ns)
|
||||
assert_kraken_success(result, context=f"namespace={{ns}}", tmp_path=self.tmp_path)
|
||||
|
||||
after = get_pods_list(self.k8s_core, ns, self.LABEL_SELECTOR)
|
||||
assert_pod_count_unchanged(before, after, namespace=ns)
|
||||
assert_all_pods_running_and_ready(after, namespace=ns)
|
||||
'''
|
||||
|
||||
RESOURCE_YAML_TEMPLATE = '''# Target workload for {scenario} scenario tests.
|
||||
# Namespace is patched at deploy time by the test framework.
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: {app_label}
|
||||
spec:
|
||||
replicas: 1
|
||||
selector:
|
||||
matchLabels:
|
||||
app: {app_label}
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: {app_label}
|
||||
spec:
|
||||
containers:
|
||||
- name: app
|
||||
image: nginx:alpine
|
||||
ports:
|
||||
- containerPort: 80
|
||||
'''
|
||||
|
||||
SCENARIO_BASE_DICT_TEMPLATE = '''# Base scenario for {scenario} (used by build_config with scenario_type: {scenario_type}).
|
||||
# Edit this file with the structure expected by Krkn. Top-level key must match SCENARIO_NAME.
|
||||
# See scenarios/application_outage/scenario_base.yaml and scenarios/pod_disruption/scenario_base.yaml for examples.
|
||||
{scenario}:
|
||||
namespace: default
|
||||
# Add fields required by your scenario plugin.
|
||||
'''
|
||||
|
||||
SCENARIO_BASE_LIST_TEMPLATE = '''# Base scenario for {scenario} (list format). Tests patch config.namespace_pattern with ^<ns>$.
|
||||
# Edit with the structure expected by your scenario plugin. See scenarios/pod_disruption/scenario_base.yaml.
|
||||
- id: {scenario}-default
|
||||
config:
|
||||
namespace_pattern: "^default$"
|
||||
# Add fields required by your scenario plugin.
|
||||
'''
|
||||
|
||||
|
||||
def _marker_declared(ini_text: str, marker: str) -> bool:
    """Return True when ``marker`` is declared as a whole marker name in pytest.ini text."""
    # Anchor on the start of a line (indented continuation line, or directly
    # after "markers =") so that e.g. "outage" is not mistaken for the
    # already-registered "application_outage".
    pattern = rf"^(?:markers[ \t]*=[ \t]*|[ \t]+){re.escape(marker)}[ \t]*:"
    return re.search(pattern, ini_text, flags=re.MULTILINE) is not None


def _insert_marker(ini_text: str, marker_text: str):
    """Insert ``marker_text`` after the last indented ``name: description`` line.

    The new line reuses the indentation of that last marker line. Returns the
    updated text, or None when no such line exists.
    """
    lines = ini_text.splitlines(keepends=True)
    insert_at = None
    indent = ""
    for i, line in enumerate(lines):
        found = re.match(r"^([ \t]+)\w+:\s*.+", line)
        if found:
            insert_at = i + 1
            indent = found.group(1)
    if insert_at is None:
        return None
    # Guard against a file whose last marker line has no trailing newline.
    if not lines[insert_at - 1].endswith("\n"):
        lines[insert_at - 1] += "\n"
    lines.insert(insert_at, f"{indent}{marker_text}\n")
    return "".join(lines)


def main() -> int:
    """Command-line entry point: scaffold a new scenario test folder.

    Parses ``--scenario`` (required), ``--scenario-type``, ``--list-based`` and
    ``--regex-namespace``, writes ``test_<scenario>.py``, ``resource.yaml`` and
    ``scenario_base.yaml`` under ``CI/tests_v2/scenarios/<scenario>/`` and
    registers the scenario marker in ``CI/tests_v2/pytest.ini`` when missing.

    Returns:
        0 on success; 1 when the scenario name is not snake_case or the
        target directory/test file already exists.
    """
    parser = argparse.ArgumentParser(description="Scaffold a new scenario test in CI/tests_v2 (folder-per-scenario)")
    parser.add_argument(
        "--scenario",
        required=True,
        help="Scenario name in snake_case (e.g. service_hijacking)",
    )
    parser.add_argument(
        "--scenario-type",
        default=None,
        help="Kraken scenario_type for build_config (default: <scenario>_scenarios)",
    )
    parser.add_argument(
        "--list-based",
        action="store_true",
        help="Use list-based scenario (NAMESPACE_KEY_PATH [0, 'config', 'namespace_pattern'], OVERRIDES_KEY_PATH [0, 'config'])",
    )
    parser.add_argument(
        "--regex-namespace",
        action="store_true",
        help="Set NAMESPACE_IS_REGEX = True (namespace wrapped in ^...$)",
    )
    args = parser.parse_args()

    scenario = args.scenario.strip().lower()
    if not re.match(r"^[a-z][a-z0-9_]*$", scenario):
        print("Error: --scenario must be snake_case (e.g. service_hijacking)", file=sys.stderr)
        return 1

    scenario_type = args.scenario_type or scenario_type_default(scenario)
    class_name = snake_to_camel(scenario)
    marker = scenario
    app_label = scenario.replace("_", "-")

    if args.list_based:
        namespace_key_path = [0, "config", "namespace_pattern"]
        namespace_is_regex = True
        overrides_key_path = [0, "config"]
        scenario_base_template = SCENARIO_BASE_LIST_TEMPLATE
    else:
        namespace_key_path = [scenario, "namespace"]
        namespace_is_regex = args.regex_namespace
        overrides_key_path = [scenario]
        scenario_base_template = SCENARIO_BASE_DICT_TEMPLATE

    repo_root = Path(__file__).resolve().parent.parent.parent
    scenario_dir_path = repo_root / "CI" / "tests_v2" / "scenarios" / scenario
    test_path = scenario_dir_path / f"test_{scenario}.py"
    resource_path = scenario_dir_path / "resource.yaml"
    scenario_base_path = scenario_dir_path / "scenario_base.yaml"

    # Refuse to overwrite anything that is already there.
    if scenario_dir_path.exists() and any(scenario_dir_path.iterdir()):
        print(f"Error: scenario directory already exists and is non-empty: {scenario_dir_path}", file=sys.stderr)
        return 1
    if test_path.exists():
        print(f"Error: {test_path} already exists", file=sys.stderr)
        return 1

    scenario_dir_path.mkdir(parents=True, exist_ok=True)

    test_content = TEST_FILE_TEMPLATE.format(
        scenario=scenario,
        marker=marker,
        class_name=class_name,
        app_label=app_label,
        scenario_type=scenario_type,
        namespace_key_path=repr(namespace_key_path),
        namespace_is_regex=namespace_is_regex,
        overrides_key_path=repr(overrides_key_path),
    )
    resource_content = RESOURCE_YAML_TEMPLATE.format(scenario=scenario, app_label=app_label)
    scenario_base_content = scenario_base_template.format(
        scenario=scenario,
        scenario_type=scenario_type,
    )

    test_path.write_text(test_content, encoding="utf-8")
    resource_path.write_text(resource_content, encoding="utf-8")
    scenario_base_path.write_text(scenario_base_content, encoding="utf-8")

    # Auto-add marker to pytest.ini if not already present
    pytest_ini_path = repo_root / "CI" / "tests_v2" / "pytest.ini"
    marker_line = f" {marker}: marks a test as a {scenario} scenario test"
    if pytest_ini_path.exists():
        content = pytest_ini_path.read_text(encoding="utf-8")
        if not _marker_declared(content, marker):
            updated = _insert_marker(content, marker_line.strip())
            if updated is not None:
                pytest_ini_path.write_text(updated, encoding="utf-8")
                print("Added marker to pytest.ini")
            else:
                print("Could not find markers block in pytest.ini; add manually:")
                print(marker_line)
        else:
            print("Marker already in pytest.ini")
    else:
        print("pytest.ini not found; add this marker under 'markers':")
        print(marker_line)

    print(f"Created: {test_path}")
    print(f"Created: {resource_path}")
    print(f"Created: {scenario_base_path}")
    print()
    print("Then edit scenario_base.yaml with your scenario structure (top-level key should match SCENARIO_NAME).")
    return 0
|
||||
|
||||
|
||||
# Run the scaffolder when executed as a script; main()'s return value becomes the exit status.
if __name__ == "__main__":
    raise SystemExit(main())
|
||||
34
CI/tests_v2/scenarios/application_outage/nginx_http.yaml
Normal file
34
CI/tests_v2/scenarios/application_outage/nginx_http.yaml
Normal file
@@ -0,0 +1,34 @@
|
||||
# Nginx Deployment + Service for application outage traffic test.
|
||||
# Namespace is patched at deploy time by the test framework.
|
||||
apiVersion: apps/v1
|
||||
kind: Deployment
|
||||
metadata:
|
||||
name: nginx-outage-http
|
||||
spec:
|
||||
replicas: 1
|
||||
selector:
|
||||
matchLabels:
|
||||
app: nginx-outage-http
|
||||
scenario: outage
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app: nginx-outage-http
|
||||
scenario: outage
|
||||
spec:
|
||||
containers:
|
||||
- name: nginx
|
||||
image: nginx:alpine
|
||||
ports:
|
||||
- containerPort: 80
|
||||
---
|
||||
apiVersion: v1
|
||||
kind: Service
|
||||
metadata:
|
||||
name: nginx-outage-http
|
||||
spec:
|
||||
selector:
|
||||
app: nginx-outage-http
|
||||
ports:
|
||||
- port: 80
|
||||
targetPort: 80
|
||||
15
CI/tests_v2/scenarios/application_outage/resource.yaml
Normal file
15
CI/tests_v2/scenarios/application_outage/resource.yaml
Normal file
@@ -0,0 +1,15 @@
|
||||
apiVersion: v1
|
||||
kind: Pod
|
||||
metadata:
|
||||
name: outage
|
||||
labels:
|
||||
scenario: outage
|
||||
spec:
|
||||
containers:
|
||||
- name: fedtools
|
||||
image: quay.io/krkn-chaos/krkn:tools
|
||||
command:
|
||||
- /bin/sh
|
||||
- -c
|
||||
- |
|
||||
sleep infinity
|
||||
10
CI/tests_v2/scenarios/application_outage/scenario_base.yaml
Normal file
10
CI/tests_v2/scenarios/application_outage/scenario_base.yaml
Normal file
@@ -0,0 +1,10 @@
|
||||
# Base application_outage scenario. Tests load this and patch namespace (and optionally duration, block, exclude_label).
|
||||
application_outage:
|
||||
duration: 10
|
||||
namespace: default
|
||||
pod_selector:
|
||||
scenario: outage
|
||||
block:
|
||||
- Ingress
|
||||
- Egress
|
||||
exclude_label: ""
|
||||
@@ -0,0 +1,229 @@
|
||||
"""
|
||||
Functional test for application outage scenario (block network to target pods, then restore).
|
||||
Equivalent to CI/tests/test_app_outages.sh with proper assertions.
|
||||
The main happy-path test reuses one namespace and workload for multiple scenario runs (default, exclude_label, block variants); other tests use their own ephemeral namespace as needed.
|
||||
"""
|
||||
|
||||
import time
|
||||
|
||||
import pytest
|
||||
|
||||
from lib.base import (
|
||||
BaseScenarioTest,
|
||||
KRAKEN_PROC_WAIT_TIMEOUT,
|
||||
POLICY_WAIT_TIMEOUT,
|
||||
)
|
||||
from lib.utils import (
|
||||
assert_all_pods_running_and_ready,
|
||||
assert_kraken_failure,
|
||||
assert_kraken_success,
|
||||
assert_pod_count_unchanged,
|
||||
find_network_policy_by_prefix,
|
||||
get_network_policies_list,
|
||||
get_pods_list,
|
||||
)
|
||||
|
||||
|
||||
def _wait_for_network_policy(k8s_networking, namespace: str, prefix: str, timeout: int = 30):
    """Wait for a NetworkPolicy whose name starts with ``prefix`` and return that name.

    The namespace is polled once per second until ``timeout`` seconds have
    elapsed. Raises ``TimeoutError`` when no matching policy shows up in time.
    """
    stop_at = time.monotonic() + timeout
    while time.monotonic() < stop_at:
        found = find_network_policy_by_prefix(
            get_network_policies_list(k8s_networking, namespace),
            prefix,
        )
        if found:
            return found.metadata.name
        time.sleep(1)
    raise TimeoutError(f"No NetworkPolicy with prefix {prefix!r} in {namespace} within {timeout}s")
|
||||
|
||||
|
||||
def _assert_no_network_policy_with_prefix(k8s_networking, namespace: str, prefix: str):
    """Fail if ``namespace`` currently holds a NetworkPolicy whose name starts with ``prefix``."""
    found = find_network_policy_by_prefix(
        get_network_policies_list(k8s_networking, namespace),
        prefix,
    )
    # Name is only needed for the failure message; "?" when it cannot be read.
    if found and found.metadata:
        found_name = found.metadata.name
    else:
        found_name = "?"
    assert found is None, (
        f"Expected no NetworkPolicy with prefix {prefix!r} in namespace={namespace}, found {found_name}"
    )
|
||||
|
||||
|
||||
@pytest.mark.functional
@pytest.mark.application_outage
class TestApplicationOutage(BaseScenarioTest):
    """Application outage scenario: block network to target pods, then restore."""

    # Configuration consumed by BaseScenarioTest (defined in lib.base).
    WORKLOAD_MANIFEST = "CI/tests_v2/scenarios/application_outage/resource.yaml"
    WORKLOAD_IS_PATH = True
    LABEL_SELECTOR = "scenario=outage"
    # Name prefix the tests look for on the NetworkPolicy created during a run.
    POLICY_PREFIX = "krkn-deny-"
    SCENARIO_NAME = "application_outage"
    SCENARIO_TYPE = "application_outages_scenarios"
    NAMESPACE_KEY_PATH = ["application_outage", "namespace"]
    NAMESPACE_IS_REGEX = False
    OVERRIDES_KEY_PATH = ["application_outage"]

    @pytest.mark.order(1)
    def test_app_outage_block_restore_and_variants(self):
        """Default, exclude_label, and block-type variants (Ingress, Egress, both) run successfully in one namespace; each run restores and pods stay ready."""
        ns = self.ns
        before = get_pods_list(self.k8s_core, ns, self.LABEL_SELECTOR)

        # (label for messages, scenario overrides, generated config file name)
        cases = [
            ("default", {}, "app_outage_config.yaml"),
            ("exclude_label", {"exclude_label": {"env": "prod"}}, "app_outage_exclude_config.yaml"),
            ("block=Ingress", {"block": ["Ingress"]}, "app_outage_block_ingress_config.yaml"),
            ("block=Egress", {"block": ["Egress"]}, "app_outage_block_egress_config.yaml"),
            ("block=Ingress,Egress", {"block": ["Ingress", "Egress"]}, "app_outage_block_ingress_egress_config.yaml"),
        ]
        for context_name, overrides, config_filename in cases:
            result = self.run_scenario(
                self.tmp_path, ns,
                overrides=overrides if overrides else None,
                config_filename=config_filename,
            )
            assert_kraken_success(
                result, context=f"{context_name} namespace={ns}", tmp_path=self.tmp_path
            )
            # Verify the workload after every variant, not only after the last one.
            after = get_pods_list(self.k8s_core, ns, self.LABEL_SELECTOR)
            assert_pod_count_unchanged(before, after, namespace=ns)
            assert_all_pods_running_and_ready(after, namespace=ns)

    def test_network_policy_created_then_deleted(self):
        """NetworkPolicy with prefix krkn-deny- is created during run and deleted after."""
        import subprocess  # local import: only needed for TimeoutExpired below

        ns = self.ns
        scenario = self.load_and_patch_scenario(self.repo_root, ns, duration=12)
        scenario_path = self.write_scenario(self.tmp_path, scenario, suffix="_np_lifecycle")
        config_path = self.build_config(
            self.SCENARIO_TYPE, str(scenario_path),
            filename="app_outage_np_lifecycle.yaml",
        )
        proc = self.run_kraken_background(config_path)
        try:
            policy_name = _wait_for_network_policy(
                self.k8s_networking, ns, self.POLICY_PREFIX, timeout=POLICY_WAIT_TIMEOUT
            )
            assert policy_name.startswith(self.POLICY_PREFIX), (
                f"Policy name {policy_name!r} should start with {self.POLICY_PREFIX!r} (namespace={ns})"
            )
            policy_list = get_network_policies_list(self.k8s_networking, ns)
            policy = find_network_policy_by_prefix(policy_list, self.POLICY_PREFIX)
            assert policy is not None and policy.spec is not None, (
                f"Expected NetworkPolicy with spec (namespace={ns})"
            )
            assert policy.spec.pod_selector is not None, f"Policy should have pod_selector (namespace={ns})"
            assert policy.spec.policy_types is not None, f"Policy should have policy_types (namespace={ns})"
        finally:
            try:
                proc.wait(timeout=KRAKEN_PROC_WAIT_TIMEOUT)
            except subprocess.TimeoutExpired:
                # Do not leave the background Kraken process running after the
                # test. NOTE(review): assumes run_kraken_background returns a
                # subprocess.Popen-like object - confirm in lib.base.
                proc.kill()
                proc.wait()
                raise
        _assert_no_network_policy_with_prefix(self.k8s_networking, ns, self.POLICY_PREFIX)

    # NOTE(review): disabled test kept for reference. It uses names this module
    # does not import (subprocess, requests, yaml, k8s_utils, scenario_dir,
    # patch_namespace_in_docs, wait_for_deployment_replicas, READINESS_TIMEOUT,
    # _get_free_port); they must be provided before re-enabling it.
    # def test_traffic_blocked_during_outage(self, request):
    #     """During outage, ingress to target pods is blocked; after run, traffic is restored."""
    #     ns = self.ns
    #     nginx_path = scenario_dir(self.repo_root, "application_outage") / "nginx_http.yaml"
    #     docs = list(yaml.safe_load_all(nginx_path.read_text()))
    #     docs = patch_namespace_in_docs(docs, ns)
    #     try:
    #         k8s_utils.create_from_yaml(
    #             self.k8s_client,
    #             yaml_objects=docs,
    #             namespace=ns,
    #         )
    #     except k8s_utils.FailToCreateError as e:
    #         msgs = [str(exc) for exc in e.api_exceptions]
    #         raise AssertionError(
    #             f"Failed to create nginx resources (namespace={ns}): {'; '.join(msgs)}"
    #         ) from e
    #     wait_for_deployment_replicas(self.k8s_apps, ns, "nginx-outage-http", timeout=READINESS_TIMEOUT)
    #     port = _get_free_port()
    #     pf_ref = []

    #     def _kill_port_forward():
    #         if pf_ref and pf_ref[0].poll() is None:
    #             pf_ref[0].terminate()
    #             try:
    #                 pf_ref[0].wait(timeout=5)
    #             except subprocess.TimeoutExpired:
    #                 pf_ref[0].kill()

    #     request.addfinalizer(_kill_port_forward)
    #     pf = subprocess.Popen(
    #         ["kubectl", "port-forward", "-n", ns, "service/nginx-outage-http", f"{port}:80"],
    #         cwd=self.repo_root,
    #         stdout=subprocess.DEVNULL,
    #         stderr=subprocess.DEVNULL,
    #     )
    #     pf_ref.append(pf)
    #     url = f"http://127.0.0.1:{port}/"
    #     try:
    #         time.sleep(2)
    #         baseline_ok = False
    #         for _ in range(10):
    #             try:
    #                 resp = requests.get(url, timeout=3)
    #                 if resp.ok:
    #                     baseline_ok = True
    #                     break
    #             except (requests.ConnectionError, requests.Timeout):
    #                 pass
    #             time.sleep(1)
    #         assert baseline_ok, f"Baseline: HTTP request to nginx should succeed (namespace={ns})"

    #         scenario = self.load_and_patch_scenario(self.repo_root, ns, duration=15)
    #         scenario_path = self.write_scenario(self.tmp_path, scenario, suffix="_traffic")
    #         config_path = self.build_config(
    #             self.SCENARIO_TYPE, str(scenario_path),
    #             filename="app_outage_traffic_config.yaml",
    #         )
    #         proc = self.run_kraken_background(config_path)
    #         policy_name = _wait_for_network_policy(
    #             self.k8s_networking, ns, self.POLICY_PREFIX, timeout=POLICY_WAIT_TIMEOUT
    #         )
    #         assert policy_name, f"Expected policy to exist (namespace={ns})"
    #         time.sleep(2)
    #         failed = False
    #         for _ in range(5):
    #             try:
    #                 resp = requests.get(url, timeout=2)
    #                 if not resp.ok:
    #                     failed = True
    #                     break
    #             except (requests.ConnectionError, requests.Timeout):
    #                 failed = True
    #                 break
    #             time.sleep(1)
    #         assert failed, f"During outage, HTTP request to nginx should fail (namespace={ns})"
    #         proc.wait(timeout=KRAKEN_PROC_WAIT_TIMEOUT)
    #         time.sleep(1)
    #         resp = requests.get(url, timeout=5)
    #         assert resp.ok, f"After scenario, HTTP request to nginx should succeed (namespace={ns})"
    #     finally:
    #         pf.terminate()
    #         pf.wait(timeout=5)

    @pytest.mark.no_workload
    def test_invalid_scenario_fails(self):
        """Invalid scenario file (missing application_outage) causes Kraken to exit non-zero."""
        invalid_scenario_path = self.tmp_path / "invalid_scenario.yaml"
        invalid_scenario_path.write_text("foo: bar\n")
        config_path = self.build_config(
            self.SCENARIO_TYPE, str(invalid_scenario_path),
            filename="invalid_config.yaml",
        )
        result = self.run_kraken(config_path)
        assert_kraken_failure(
            result, context=f"namespace={self.ns}", tmp_path=self.tmp_path
        )

    @pytest.mark.no_workload
    def test_bad_namespace_fails(self):
        """Scenario targeting non-existent namespace causes Kraken to exit non-zero."""
        scenario = self.load_and_patch_scenario(self.repo_root, "nonexistent-namespace-xyz-12345")
        scenario_path = self.write_scenario(self.tmp_path, scenario, suffix="_bad_ns")
        config_path = self.build_config(
            self.SCENARIO_TYPE, str(scenario_path),
            filename="app_outage_bad_ns_config.yaml",
        )
        result = self.run_kraken(config_path)
        assert_kraken_failure(
            result,
            context=f"test namespace={self.ns}",
            tmp_path=self.tmp_path,
        )
|
||||
21
CI/tests_v2/scenarios/pod_disruption/resource.yaml
Normal file
21
CI/tests_v2/scenarios/pod_disruption/resource.yaml
Normal file
@@ -0,0 +1,21 @@
|
||||
---
# Deployment with a single replica; the pod disruption scenario targets it.
# The test framework patches the namespace at deploy time.
apiVersion: apps/v1
kind: Deployment
metadata:
  name: krkn-pod-disruption-target
spec:
  replicas: 1
  selector:
    matchLabels:
      # Must match the pod template labels below.
      app: krkn-pod-disruption-target
  template:
    metadata:
      labels:
        app: krkn-pod-disruption-target
    spec:
      containers:
        - name: app
          image: nginx:alpine
          ports:
            - containerPort: 80
|
||||
7
CI/tests_v2/scenarios/pod_disruption/scenario_base.yaml
Normal file
7
CI/tests_v2/scenarios/pod_disruption/scenario_base.yaml
Normal file
@@ -0,0 +1,7 @@
|
||||
---
# Base pod_disruption scenario, expressed as a one-element list.
# Tests load this file and patch namespace_pattern with ^<ns>$.
- id: kill-pods
  config:
    namespace_pattern: '^default$'
    label_selector: 'app=krkn-pod-disruption-target'
    krkn_pod_recovery_time: 5
    kill: 1
|
||||
58
CI/tests_v2/scenarios/pod_disruption/test_pod_disruption.py
Normal file
58
CI/tests_v2/scenarios/pod_disruption/test_pod_disruption.py
Normal file
@@ -0,0 +1,58 @@
|
||||
"""
|
||||
Functional test for pod disruption scenario (pod crash and recovery).
|
||||
Equivalent to CI/tests/test_pod.sh with proper before/after assertions.
|
||||
Each test runs in its own ephemeral namespace with workload deployed automatically.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
|
||||
from lib.base import BaseScenarioTest, READINESS_TIMEOUT
|
||||
from lib.utils import (
|
||||
assert_all_pods_running_and_ready,
|
||||
assert_kraken_success,
|
||||
assert_pod_count_unchanged,
|
||||
get_pods_list,
|
||||
pod_uids,
|
||||
restart_counts,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.functional
@pytest.mark.pod_disruption
class TestPodDisruption(BaseScenarioTest):
    """Functional test: run the pod disruption scenario and check the pods recover."""

    # Workload and scenario settings read by the base class.
    WORKLOAD_MANIFEST = "CI/tests_v2/scenarios/pod_disruption/resource.yaml"
    WORKLOAD_IS_PATH = True
    LABEL_SELECTOR = "app=krkn-pod-disruption-target"
    SCENARIO_NAME = "pod_disruption"
    SCENARIO_TYPE = "pod_disruption_scenarios"
    NAMESPACE_KEY_PATH = [0, "config", "namespace_pattern"]
    NAMESPACE_IS_REGEX = True

    @pytest.mark.order(1)
    def test_pod_crash_and_recovery(self, wait_for_pods_running):
        """Run the scenario, assert it changed the pods, then assert they recovered."""
        namespace = self.ns
        selector = self.LABEL_SELECTOR

        # Snapshot of the target pods before chaos.
        pods_before = get_pods_list(self.k8s_core, namespace, selector)
        uids_before = pod_uids(pods_before)
        restarts_before = restart_counts(pods_before)

        outcome = self.run_scenario(self.tmp_path, namespace)
        assert_kraken_success(
            outcome, context=f"namespace={namespace}", tmp_path=self.tmp_path
        )

        # Snapshot right after the run: either the pod set was replaced
        # (different UIDs) or the restart counts went up.
        pods_after = get_pods_list(self.k8s_core, namespace, selector)
        uids_after = pod_uids(pods_after)
        restarts_after = restart_counts(pods_after)
        pod_set_replaced = set(uids_after) != set(uids_before)
        more_restarts = restarts_after > restarts_before
        assert pod_set_replaced or more_restarts, (
            f"Chaos had no effect in namespace={namespace}: pod UIDs unchanged and restart count did not increase. "
            f"Before UIDs: {uids_before}, restarts: {restarts_before}. "
            f"After UIDs: {uids_after}, restarts: {restarts_after}."
        )

        # Give the workload time to come back before the final checks.
        wait_for_pods_running(namespace, selector, timeout=READINESS_TIMEOUT)

        pods_final = get_pods_list(self.k8s_core, namespace, selector)
        assert_pod_count_unchanged(pods_before, pods_final, namespace=namespace)
        assert_all_pods_running_and_ready(pods_final, namespace=namespace)
|
||||
74
CI/tests_v2/setup_env.sh
Executable file
74
CI/tests_v2/setup_env.sh
Executable file
@@ -0,0 +1,74 @@
|
||||
#!/usr/bin/env bash
# Setup environment for CI/tests_v2 pytest functional tests.
# Can be invoked from anywhere; the script changes to the repository root itself:
#   ./CI/tests_v2/setup_env.sh
#
# - Creates a KinD cluster using kind-config-dev.yml (override with KIND_CONFIG=...).
# - Waits for the cluster and for local-path-provisioner pods (required by pod disruption test).
# - Exits non-zero if the local-path-provisioner pods are missing or never become Ready.
# - Does not install Python deps; use a venv and pip install -r requirements.txt and CI/tests_v2/requirements.txt yourself.

set -e

REPO_ROOT="$(cd "$(dirname "$0")/../.." && pwd)"
KIND_CONFIG="${KIND_CONFIG:-${REPO_ROOT}/CI/tests_v2/kind-config-dev.yml}"
CLUSTER_NAME="${KIND_CLUSTER_NAME:-ci-krkn}"

echo "Repository root: $REPO_ROOT"
cd "$REPO_ROOT"

# Check required tools
command -v kind >/dev/null 2>&1 || { echo "Error: kind is not installed. Install from https://kind.sigs.k8s.io/docs/user/quick-start/"; exit 1; }
command -v kubectl >/dev/null 2>&1 || { echo "Error: kubectl is not installed."; exit 1; }

# Python 3.9+
python3 -c "import sys; exit(0 if sys.version_info >= (3, 9) else 1)" 2>/dev/null || { echo "Error: Python 3.9+ required. Check: python3 --version"; exit 1; }

# Docker running (required for KinD)
docker info >/dev/null 2>&1 || { echo "Error: Docker is not running. Start Docker Desktop or run: systemctl start docker"; exit 1; }

# Tool versions for reproducibility
echo "kind: $(kind --version 2>/dev/null || kind version 2>/dev/null)"
echo "kubectl: $(kubectl version --client --short 2>/dev/null || kubectl version --client 2>/dev/null)"

# Create cluster if it doesn't exist (use "kind get clusters" so we skip when nodes exist even if kubeconfig check would fail)
if kind get clusters 2>/dev/null | grep -qx "$CLUSTER_NAME"; then
  echo "KinD cluster '$CLUSTER_NAME' already exists, skipping creation."
else
  echo "Creating KinD cluster '$CLUSTER_NAME' from $KIND_CONFIG ..."
  kind create cluster --name "$CLUSTER_NAME" --config "$KIND_CONFIG"
fi

# echo "Pre-pulling test workload images into KinD cluster..."
# docker pull nginx:alpine
# kind load docker-image nginx:alpine --name "$CLUSTER_NAME"

# kind merges into default kubeconfig (~/.kube/config), so kubectl should work in this shell.
# If you need to use this cluster from another terminal: export KUBECONFIG=~/.kube/config
# and ensure context: kubectl config use-context kind-$CLUSTER_NAME

echo "Waiting for cluster nodes to be Ready..."
# Best effort: a timeout here is not fatal, the provisioner check below is the real gate.
kubectl wait --for=condition=Ready nodes --all --timeout=120s 2>/dev/null || true

echo "Waiting for local-path-provisioner pods (namespace local-path-storage, label app=local-path-provisioner)..."
# Remember whether the pods ever reported Ready, so that running out of
# attempts is reported as a failure instead of falling through to success.
provisioner_ready=false
for i in {1..60}; do
  if kubectl get pods -n local-path-storage -l app=local-path-provisioner -o name 2>/dev/null | grep -q .; then
    echo "Found local-path-provisioner pod(s). Waiting for Ready..."
    if kubectl wait --for=condition=ready pod -l app=local-path-provisioner -n local-path-storage --timeout=120s 2>/dev/null; then
      provisioner_ready=true
      break
    fi
  fi
  echo "Attempt $i: local-path-provisioner not ready yet..."
  sleep 3
done

if ! kubectl get pods -n local-path-storage -l app=local-path-provisioner -o name 2>/dev/null | grep -q .; then
  echo "Warning: No pods with label app=local-path-provisioner in local-path-storage."
  echo "KinD usually deploys this by default. Check: kubectl get pods -n local-path-storage"
  exit 1
fi

if [ "$provisioner_ready" != "true" ]; then
  echo "Error: local-path-provisioner pod(s) exist but did not become Ready."
  echo "Check: kubectl get pods -n local-path-storage"
  exit 1
fi

echo ""
echo "Cluster is ready for CI/tests_v2."
echo "  kubectl uses the default kubeconfig (kind merged it). For another terminal: export KUBECONFIG=~/.kube/config"
echo ""
echo "Next: activate your venv, install deps, and run tests from repo root:"
echo "  pip install -r requirements.txt"
echo "  pip install -r CI/tests_v2/requirements.txt"
echo "  pytest CI/tests_v2/ -v --timeout=300 --reruns=2 --reruns-delay=10"
|
||||
@@ -26,7 +26,7 @@ Here is an excerpt:
|
||||
## Maintainer Levels
|
||||
|
||||
### Contributor
|
||||
Contributors contributor to the community. Anyone can become a contributor by participating in discussions, reporting bugs, or contributing code or documentation.
|
||||
Contributors contribute to the community. Anyone can become a contributor by participating in discussions, reporting bugs, or contributing code or documentation.
|
||||
|
||||
#### Responsibilities:
|
||||
|
||||
@@ -80,4 +80,4 @@ Represent the project in the broader open-source community.
|
||||
|
||||
|
||||
# Credits
|
||||
Sections of this documents have been borrowed from [Kubernetes governance](https://github.com/kubernetes/community/blob/master/governance.md)
|
||||
Sections of this document have been borrowed from [Kubernetes governance](https://github.com/kubernetes/community/blob/master/governance.md)
|
||||
10
ROADMAP.md
10
ROADMAP.md
@@ -16,5 +16,11 @@ Following are a list of enhancements that we are planning to work on adding supp
|
||||
- [x] [Krknctl - client for running Krkn scenarios with ease](https://github.com/krkn-chaos/krknctl)
|
||||
- [x] [AI Chat bot to help get started with Krkn and commands](https://github.com/krkn-chaos/krkn-lightspeed)
|
||||
- [ ] [Ability to roll back cluster to original state if chaos fails](https://github.com/krkn-chaos/krkn/issues/804)
|
||||
- [ ] Add recovery time metrics to each scenario for each better regression analysis
|
||||
- [ ] [Add resiliency scoring to chaos scenarios ran on cluster](https://github.com/krkn-chaos/krkn/issues/125)
|
||||
- [ ] Add recovery time metrics to each scenario for better regression analysis
|
||||
- [ ] [Add resiliency scoring to chaos scenarios ran on cluster](https://github.com/krkn-chaos/krkn/issues/125)
|
||||
- [ ] [Add AI-based Chaos Configuration Generator](https://github.com/krkn-chaos/krkn/issues/1166)
|
||||
- [ ] [Introduce Security Chaos Engineering Scenarios](https://github.com/krkn-chaos/krkn/issues/1165)
|
||||
- [ ] [Add AWS-native Chaos Scenarios (S3, Lambda, Networking)](https://github.com/krkn-chaos/krkn/issues/1164)
|
||||
- [ ] [Unify Krkn Ecosystem under krknctl for Enhanced UX](https://github.com/krkn-chaos/krknctl/issues/113)
|
||||
- [ ] [Build Web UI for Creating, Monitoring, and Reviewing Chaos Scenarios](https://github.com/krkn-chaos/krkn/issues/1167)
|
||||
- [ ] [Add Predefined Chaos Scenario Templates (KRKN Chaos Library)](https://github.com/krkn-chaos/krkn/issues/1168)
|
||||
|
||||
@@ -40,4 +40,4 @@ The security team currently consists of the [Maintainers of Krkn](https://github
|
||||
|
||||
## Process and Supported Releases
|
||||
|
||||
The Krkn security team will investigate and provide a fix in a timely mannner depending on the severity. The fix will be included in the new release of Krkn and details will be included in the release notes.
|
||||
The Krkn security team will investigate and provide a fix in a timely manner depending on the severity. The fix will be included in the new release of Krkn and details will be included in the release notes.
|
||||
|
||||
@@ -39,7 +39,7 @@ cerberus:
|
||||
Sunday:
|
||||
slack_team_alias: # The slack team alias to be tagged while reporting failures in the slack channel when no watcher is assigned
|
||||
|
||||
custom_checks: # Relative paths of files conataining additional user defined checks
|
||||
custom_checks: # Relative paths of files containing additional user defined checks
|
||||
|
||||
tunings:
|
||||
timeout: 3 # Number of seconds before requests fail
|
||||
|
||||
@@ -50,6 +50,8 @@ kraken:
|
||||
- network_chaos_ng_scenarios:
|
||||
- scenarios/kube/pod-network-filter.yml
|
||||
- scenarios/kube/node-network-filter.yml
|
||||
- scenarios/kube/node-network-chaos.yml
|
||||
- scenarios/kube/pod-network-chaos.yml
|
||||
- kubevirt_vm_outage:
|
||||
- scenarios/kubevirt/kubevirt-vm-outage.yaml
|
||||
|
||||
@@ -77,6 +79,7 @@ elastic:
|
||||
metrics_index: "krkn-metrics"
|
||||
alerts_index: "krkn-alerts"
|
||||
telemetry_index: "krkn-telemetry"
|
||||
run_tag: ""
|
||||
|
||||
tunings:
|
||||
wait_duration: 1 # Duration to wait between each chaos scenario
|
||||
@@ -93,7 +96,7 @@ telemetry:
|
||||
prometheus_pod_name: "" # name of the prometheus pod (if distribution is kubernetes)
|
||||
full_prometheus_backup: False # if is set to False only the /prometheus/wal folder will be downloaded.
|
||||
backup_threads: 5 # number of telemetry download/upload threads
|
||||
archive_path: /tmp # local path where the archive files will be temporarly stored
|
||||
archive_path: /tmp # local path where the archive files will be temporarily stored
|
||||
max_retries: 0 # maximum number of upload retries (if 0 will retry forever)
|
||||
run_tag: '' # if set, this will be appended to the run folder in the bucket (useful to group the runs)
|
||||
archive_size: 500000
|
||||
@@ -128,4 +131,5 @@ kubevirt_checks: # Utilizing virt che
|
||||
disconnected: False # Boolean of how to try to connect to the VMIs; if True will use the ip_address to try ssh from within a node, if false will use the name and uses virtctl to try to connect; Default is False
|
||||
ssh_node: "" # If set, will be a backup way to ssh to a node. Will want to set to a node that isn't targeted in chaos
|
||||
node_names: ""
|
||||
exit_on_failure: # If value is True and VMI's are failing post chaos returns failure, values can be True/False
|
||||
exit_on_failure: # If value is True and VMI's are failing post chaos returns failure, values can be True/False
|
||||
|
||||
|
||||
@@ -32,7 +32,7 @@ tunings:
|
||||
|
||||
telemetry:
|
||||
enabled: False # enable/disables the telemetry collection feature
|
||||
archive_path: /tmp # local path where the archive files will be temporarly stored
|
||||
archive_path: /tmp # local path where the archive files will be temporarily stored
|
||||
events_backup: False # enables/disables cluster events collection
|
||||
logs_backup: False
|
||||
|
||||
|
||||
@@ -61,7 +61,7 @@ telemetry:
|
||||
prometheus_backup: True # enables/disables prometheus data collection
|
||||
full_prometheus_backup: False # if is set to False only the /prometheus/wal folder will be downloaded.
|
||||
backup_threads: 5 # number of telemetry download/upload threads
|
||||
archive_path: /tmp # local path where the archive files will be temporarly stored
|
||||
archive_path: /tmp # local path where the archive files will be temporarily stored
|
||||
max_retries: 0 # maximum number of upload retries (if 0 will retry forever)
|
||||
run_tag: '' # if set, this will be appended to the run folder in the bucket (useful to group the runs)
|
||||
archive_size: 500000 # the size of the prometheus data archive size in KB. The lower the size of archive is
|
||||
|
||||
@@ -1,4 +1,5 @@
|
||||
#!/bin/bash
|
||||
set -e
|
||||
# Run SSH setup
|
||||
./containers/setup-ssh.sh
|
||||
# Change to kraken directory
|
||||
|
||||
@@ -3,10 +3,16 @@ apiVersion: kind.x-k8s.io/v1alpha4
|
||||
nodes:
|
||||
- role: control-plane
|
||||
extraPortMappings:
|
||||
- containerPort: 30000
|
||||
hostPort: 9090
|
||||
- containerPort: 32766
|
||||
hostPort: 9200
|
||||
- containerPort: 30036
|
||||
hostPort: 8888
|
||||
- containerPort: 30037
|
||||
hostPort: 8889
|
||||
- containerPort: 30080
|
||||
hostPort: 30080
|
||||
- role: control-plane
|
||||
- role: control-plane
|
||||
- role: worker
|
||||
|
||||
@@ -2,19 +2,33 @@ import logging
|
||||
import requests
|
||||
import sys
|
||||
import json
|
||||
from krkn_lib.utils.functions import get_yaml_item_value
|
||||
|
||||
check_application_routes = ""
|
||||
cerberus_url = None
|
||||
exit_on_failure = False
|
||||
cerberus_enabled = False
|
||||
|
||||
def get_status(config, start_time, end_time):
|
||||
def set_url(config):
|
||||
global exit_on_failure
|
||||
exit_on_failure = get_yaml_item_value(config["kraken"], "exit_on_failure", False)
|
||||
global cerberus_enabled
|
||||
cerberus_enabled = get_yaml_item_value(config["cerberus"],"cerberus_enabled", False)
|
||||
if cerberus_enabled:
|
||||
global cerberus_url
|
||||
cerberus_url = get_yaml_item_value(config["cerberus"],"cerberus_url", "")
|
||||
global check_application_routes
|
||||
check_application_routes = \
|
||||
get_yaml_item_value(config["cerberus"],"check_applicaton_routes","")
|
||||
|
||||
def get_status(start_time, end_time):
|
||||
"""
|
||||
Get cerberus status
|
||||
"""
|
||||
cerberus_status = True
|
||||
check_application_routes = False
|
||||
application_routes_status = True
|
||||
if config["cerberus"]["cerberus_enabled"]:
|
||||
cerberus_url = config["cerberus"]["cerberus_url"]
|
||||
check_application_routes = \
|
||||
config["cerberus"]["check_application_routes"]
|
||||
if cerberus_enabled:
|
||||
if not cerberus_url:
|
||||
logging.error(
|
||||
"url where Cerberus publishes True/False signal "
|
||||
@@ -61,40 +75,38 @@ def get_status(config, start_time, end_time):
|
||||
return cerberus_status
|
||||
|
||||
|
||||
def publish_kraken_status(config, failed_post_scenarios, start_time, end_time):
|
||||
def publish_kraken_status( start_time, end_time):
|
||||
"""
|
||||
Publish kraken status to cerberus
|
||||
"""
|
||||
cerberus_status = get_status(config, start_time, end_time)
|
||||
cerberus_status = get_status(start_time, end_time)
|
||||
if not cerberus_status:
|
||||
if failed_post_scenarios:
|
||||
if config["kraken"]["exit_on_failure"]:
|
||||
logging.info(
|
||||
"Cerberus status is not healthy and post action scenarios "
|
||||
"are still failing, exiting kraken run"
|
||||
)
|
||||
sys.exit(1)
|
||||
else:
|
||||
logging.info(
|
||||
"Cerberus status is not healthy and post action scenarios "
|
||||
"are still failing"
|
||||
)
|
||||
if exit_on_failure:
|
||||
logging.info(
|
||||
"Cerberus status is not healthy and post action scenarios "
|
||||
"are still failing, exiting kraken run"
|
||||
)
|
||||
sys.exit(1)
|
||||
else:
|
||||
logging.info(
|
||||
"Cerberus status is not healthy and post action scenarios "
|
||||
"are still failing"
|
||||
)
|
||||
else:
|
||||
if failed_post_scenarios:
|
||||
if config["kraken"]["exit_on_failure"]:
|
||||
logging.info(
|
||||
"Cerberus status is healthy but post action scenarios "
|
||||
"are still failing, exiting kraken run"
|
||||
)
|
||||
sys.exit(1)
|
||||
else:
|
||||
logging.info(
|
||||
"Cerberus status is healthy but post action scenarios "
|
||||
"are still failing"
|
||||
)
|
||||
if exit_on_failure:
|
||||
logging.info(
|
||||
"Cerberus status is healthy but post action scenarios "
|
||||
"are still failing, exiting kraken run"
|
||||
)
|
||||
sys.exit(1)
|
||||
else:
|
||||
logging.info(
|
||||
"Cerberus status is healthy but post action scenarios "
|
||||
"are still failing"
|
||||
)
|
||||
|
||||
|
||||
def application_status(cerberus_url, start_time, end_time):
|
||||
def application_status( start_time, end_time):
|
||||
"""
|
||||
Check application availability
|
||||
"""
|
||||
|
||||
@@ -46,7 +46,7 @@ def alerts(
|
||||
sys.exit(1)
|
||||
|
||||
for alert in profile_yaml:
|
||||
if list(alert.keys()).sort() != ["expr", "description", "severity"].sort():
|
||||
if sorted(alert.keys()) != sorted(["expr", "description", "severity"]):
|
||||
logging.error(f"wrong alert {alert}, skipping")
|
||||
continue
|
||||
|
||||
@@ -205,8 +205,8 @@ def metrics(
|
||||
query
|
||||
)
|
||||
elif (
|
||||
list(metric_query.keys()).sort()
|
||||
== ["query", "metricName"].sort()
|
||||
sorted(metric_query.keys())
|
||||
== sorted(["query", "metricName"])
|
||||
):
|
||||
metrics_result = prom_cli.process_prom_query_in_range(
|
||||
query,
|
||||
@@ -214,7 +214,7 @@ def metrics(
|
||||
end_time=datetime.datetime.fromtimestamp(end_time), granularity=30
|
||||
)
|
||||
else:
|
||||
logging.info('didnt match keys')
|
||||
logging.info("didn't match keys")
|
||||
continue
|
||||
|
||||
for returned_metric in metrics_result:
|
||||
|
||||
79
krkn/prometheus/collector.py
Normal file
79
krkn/prometheus/collector.py
Normal file
@@ -0,0 +1,79 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import datetime
|
||||
import logging
|
||||
from typing import Dict, Any, List, Optional
|
||||
|
||||
from krkn_lib.prometheus.krkn_prometheus import KrknPrometheus
|
||||
|
||||
|
||||
# -----------------------------------------------------------------------------
|
||||
# SLO evaluation helpers (used by krkn.resiliency)
|
||||
# -----------------------------------------------------------------------------
|
||||
|
||||
|
||||
def slo_passed(prometheus_result: List[Any]) -> Optional[bool]:
    """Decide whether an SLO passed, given the series returned by Prometheus.

    Each element of ``prometheus_result`` is inspected by key:

    * a series with ``"values"`` (list of ``(timestamp, value)`` pairs) fails
      the SLO as soon as one sample converts to a float greater than 0;
      samples that cannot be converted are skipped;
    * a series with ``"value"`` (a single ``[timestamp, value]`` pair) fails
      the SLO when the value is not equal to 0 or cannot be converted.

    Every series is checked; the SLO passes only if none of them fails.

    Args:
        prometheus_result: List of series dicts as returned by the query.

    Returns:
        ``None`` when the result is empty or no series carries samples,
        ``False`` when any series violates the SLO, ``True`` otherwise.
    """
    if not prometheus_result:
        return None
    has_samples = False
    for series in prometheus_result:
        if "values" in series:
            has_samples = True
            for _ts, val in series["values"]:
                try:
                    if float(val) > 0:
                        return False
                except (TypeError, ValueError):
                    continue
        elif "value" in series:
            has_samples = True
            try:
                # Do not return on a zero sample: later series must still be
                # checked, otherwise only the first series decides the result.
                if float(series["value"][1]) != 0:
                    return False
            except (TypeError, ValueError):
                return False

    # If we reached here and never saw any samples, skip
    return True if has_samples else None
|
||||
|
||||
|
||||
def evaluate_slos(
    prom_cli: KrknPrometheus,
    slo_list: List[Dict[str, Any]],
    start_time: datetime.datetime,
    end_time: datetime.datetime,
    granularity: Optional[int] = None,
) -> Dict[str, bool]:
    """Evaluate a list of SLO expressions against Prometheus.

    Args:
        prom_cli: Configured Prometheus client.
        slo_list: List of dicts with keys ``name``, ``expr``.
        start_time: Start timestamp.
        end_time: End timestamp.
        granularity: Optional step in seconds for range queries. When left as
            ``None`` the argument is not forwarded, so the client's own
            default applies.

    Returns:
        Mapping name -> bool indicating pass status. ``True`` means the SLO
        passed (or its query returned no data); ``False`` means it failed or
        the query raised an exception.
    """
    results: Dict[str, bool] = {}
    logging.info("Evaluating %d SLOs over window %s – %s", len(slo_list), start_time, end_time)

    # Only forward granularity when the caller asked for one.
    query_kwargs: Dict[str, Any] = {"start_time": start_time, "end_time": end_time}
    if granularity is not None:
        query_kwargs["granularity"] = granularity

    for slo in slo_list:
        expr = slo["expr"]
        name = slo["name"]
        try:
            response = prom_cli.process_prom_query_in_range(expr, **query_kwargs)

            passed = slo_passed(response)
            if passed is None:
                # Absence of data indicates the condition did not trigger; treat as pass.
                logging.debug("SLO '%s' query returned no data; assuming pass.", name)
                results[name] = True
            else:
                results[name] = passed
        except Exception as exc:
            # A failed query is recorded as a failed SLO rather than aborting the run.
            logging.error("PromQL query failed for SLO '%s': %s", name, exc)
            results[name] = False
    return results
|
||||
4
krkn/resiliency/__init__.py
Normal file
4
krkn/resiliency/__init__.py
Normal file
@@ -0,0 +1,4 @@
|
||||
"""krkn.resiliency package public interface."""
|
||||
|
||||
from .resiliency import Resiliency # noqa: F401
|
||||
from .score import calculate_resiliency_score # noqa: F401
|
||||
366
krkn/resiliency/resiliency.py
Normal file
366
krkn/resiliency/resiliency.py
Normal file
@@ -0,0 +1,366 @@
|
||||
"""Resiliency evaluation orchestrator for Krkn chaos runs.
|
||||
|
||||
This module provides the `Resiliency` class which loads the canonical
|
||||
`alerts.yaml`, executes every SLO expression against Prometheus in the
|
||||
chaos-test time window, determines pass/fail status and calculates an
|
||||
overall resiliency score using the generic weighted model implemented
|
||||
in `krkn.resiliency.score`.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import datetime
|
||||
import logging
|
||||
import os
|
||||
from typing import Dict, List, Any, Optional, Tuple
|
||||
|
||||
import yaml
|
||||
import json
|
||||
import dataclasses
|
||||
from krkn_lib.models.telemetry import ChaosRunTelemetry
|
||||
|
||||
from krkn_lib.prometheus.krkn_prometheus import KrknPrometheus
|
||||
from krkn.prometheus.collector import evaluate_slos
|
||||
from krkn.resiliency.score import calculate_resiliency_score
|
||||
|
||||
|
||||
class Resiliency:
|
||||
"""Central orchestrator for resiliency scoring."""
|
||||
|
||||
def __init__(self, alerts_yaml_path: str):
|
||||
|
||||
if not os.path.exists(alerts_yaml_path):
|
||||
raise FileNotFoundError(f"alerts file not found: {alerts_yaml_path}")
|
||||
with open(alerts_yaml_path, "r", encoding="utf-8") as fp:
|
||||
raw_yaml_data = yaml.safe_load(fp)
|
||||
logging.info("Loaded SLO configuration from %s", alerts_yaml_path)
|
||||
|
||||
self._slos = self._normalise_alerts(raw_yaml_data)
|
||||
self._results: Dict[str, bool] = {}
|
||||
self._score: Optional[int] = None
|
||||
self._breakdown: Optional[Dict[str, int]] = None
|
||||
self._health_check_results: Dict[str, bool] = {}
|
||||
self.scenario_reports: List[Dict[str, Any]] = []
|
||||
self.summary: Optional[Dict[str, Any]] = None
|
||||
self.detailed_report: Optional[Dict[str, Any]] = None
|
||||
|
||||
# ---------------------------------------------------------------------
|
||||
# Public API
|
||||
# ---------------------------------------------------------------------
|
||||
|
||||
def calculate_score(
    self,
    *,
    health_check_results: Optional[Dict[str, bool]] = None,
) -> int:
    """Compute the resiliency score from the stored SLO results and cache it.

    Args:
        health_check_results: Optional mapping of health-check name to
            pass/fail; an empty mapping is used when omitted.

    Returns:
        The score returned by ``calculate_resiliency_score``.
    """
    # Per-SLO severity and optional weight, keyed by SLO name.
    definitions = {}
    for slo in self._slos:
        definitions[slo["name"]] = {
            "severity": slo["severity"],
            "weight": slo.get("weight"),
        }

    outcome = calculate_resiliency_score(
        slo_definitions=definitions,
        prometheus_results=self._results,
        health_check_results=health_check_results or {},
    )
    self._score, self._breakdown = outcome
    self._health_check_results = health_check_results or {}
    return self._score
|
||||
|
||||
def to_dict(self) -> Dict[str, Any]:
|
||||
"""Return a dictionary ready for telemetry output."""
|
||||
if self._score is None:
|
||||
raise RuntimeError("calculate_score() must be called before to_dict()")
|
||||
return {
|
||||
"score": self._score,
|
||||
"breakdown": self._breakdown,
|
||||
"slo_results": self._results,
|
||||
"health_check_results": getattr(self, "_health_check_results", {}),
|
||||
}
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Scenario-based resiliency evaluation
|
||||
# ------------------------------------------------------------------
|
||||
def add_scenario_report(
    self,
    *,
    scenario_name: str,
    prom_cli: KrknPrometheus,
    start_time: datetime.datetime,
    end_time: datetime.datetime,
    weight: float | int = 1,
    health_check_results: Optional[Dict[str, bool]] = None,
) -> int:
    """Score one scenario window and append its report to ``scenario_reports``.

    Args:
        scenario_name: Human-friendly scenario identifier.
        prom_cli: Initialized KrknPrometheus instance.
        start_time: Window start.
        end_time: Window end.
        weight: Weight stored with the report for the final weighted average.
        health_check_results: Optional mapping of custom health-check name to bool.

    Returns:
        The score calculated for this scenario.
    """
    window_results = evaluate_slos(
        prom_cli=prom_cli,
        slo_list=self._slos,
        start_time=start_time,
        end_time=end_time,
    )

    # Per-SLO severity and optional weight, keyed by SLO name.
    definitions = {}
    for slo in self._slos:
        definitions[slo["name"]] = {
            "severity": slo["severity"],
            "weight": slo.get("weight"),
        }

    scenario_score, scenario_breakdown = calculate_resiliency_score(
        slo_definitions=definitions,
        prometheus_results=window_results,
        health_check_results=health_check_results or {},
    )

    report = {
        "name": scenario_name,
        "window": {
            "start": start_time.isoformat(),
            "end": end_time.isoformat(),
        },
        "score": scenario_score,
        "weight": weight,
        "breakdown": scenario_breakdown,
        "slo_results": window_results,
        "health_check_results": health_check_results or {},
    }
    self.scenario_reports.append(report)
    return scenario_score
|
||||
|
||||
def finalize_report(
    self,
    *,
    prom_cli: KrknPrometheus,
    total_start_time: datetime.datetime,
    total_end_time: datetime.datetime,
) -> None:
    """Build ``summary`` and ``detailed_report`` from the collected scenario reports.

    The primary ``resiliency_score`` is the weighted average of the
    per-scenario scores, truncated to an int. If all scenario weights sum to
    zero, the unweighted average is used instead so the report can still be
    produced. The passed/total SLO counts come from re-evaluating every SLO
    over the full test window.

    Args:
        prom_cli: Initialized KrknPrometheus instance.
        total_start_time: Start of the full test window.
        total_end_time: End of the full test window.

    Raises:
        RuntimeError: If no scenario report has been added.
    """
    if not self.scenario_reports:
        raise RuntimeError("No scenario reports added – nothing to finalize")

    # ---------------- Weighted average (primary resiliency_score) ----------
    total_weight = sum(rep["weight"] for rep in self.scenario_reports)
    if total_weight:
        resiliency_score = int(
            sum(rep["score"] * rep["weight"] for rep in self.scenario_reports) / total_weight
        )
    else:
        # Every weight is zero: dividing by the total would raise
        # ZeroDivisionError, so fall back to a plain average of the scores.
        resiliency_score = int(
            sum(rep["score"] for rep in self.scenario_reports) / len(self.scenario_reports)
        )

    # ---------------- Overall SLO evaluation across full test window -----------------------------
    full_slo_results = evaluate_slos(
        prom_cli=prom_cli,
        slo_list=self._slos,
        start_time=total_start_time,
        end_time=total_end_time,
    )
    slo_defs = {slo["name"]: {"severity": slo["severity"], "weight": slo.get("weight")} for slo in self._slos}
    # Only the breakdown of the full-window evaluation is used; its score is discarded.
    _overall_score, full_breakdown = calculate_resiliency_score(
        slo_definitions=slo_defs,
        prometheus_results=full_slo_results,
        health_check_results={},
    )

    self.summary = {
        "scenarios": {rep["name"]: rep["score"] for rep in self.scenario_reports},
        "resiliency_score": resiliency_score,
        "passed_slos": full_breakdown.get("passed", 0),
        "total_slos": full_breakdown.get("passed", 0) + full_breakdown.get("failed", 0),
    }

    # Detailed report currently limited to per-scenario information; system stability section removed
    self.detailed_report = {
        "scenarios": self.scenario_reports,
    }
|
||||
|
||||
def get_summary(self) -> Dict[str, Any]:
|
||||
"""Return the concise resiliency_summary structure."""
|
||||
if not hasattr(self, "summary") or self.summary is None:
|
||||
raise RuntimeError("finalize_report() must be called first")
|
||||
return self.summary
|
||||
|
||||
def get_detailed_report(self) -> Dict[str, Any]:
|
||||
"""Return the full resiliency-report structure."""
|
||||
if not hasattr(self, "detailed_report") or self.detailed_report is None:
|
||||
raise RuntimeError("finalize_report() must be called first")
|
||||
return self.detailed_report
|
||||
|
||||
@staticmethod
|
||||
def compact_breakdown(report: Dict[str, Any]) -> Dict[str, int]:
|
||||
"""Return a compact summary dict for a single scenario report."""
|
||||
try:
|
||||
passed = report["breakdown"]["passed"]
|
||||
failed = report["breakdown"]["failed"]
|
||||
score_val = report["score"]
|
||||
except Exception:
|
||||
passed = report.get("breakdown", {}).get("passed", 0)
|
||||
failed = report.get("breakdown", {}).get("failed", 0)
|
||||
score_val = report.get("score", 0)
|
||||
return {
|
||||
"resiliency_score": score_val,
|
||||
"passed_slos": passed,
|
||||
"total_slos": passed + failed,
|
||||
}
|
||||
|
||||
def attach_compact_to_telemetry(self, chaos_telemetry: ChaosRunTelemetry) -> None:
    """Embed per-scenario compact resiliency reports into a telemetry object.

    Every entry of ``chaos_telemetry.scenarios`` ends up as a dict in the
    replacement list: dict entries are reused (and mutated in place), other
    entries are converted with ``dataclasses.asdict`` or, if that fails, by
    copying their non-dunder, non-callable attributes. When an entry's
    ``scenario`` value matches the ``name`` of a stored scenario report, the
    compact breakdown is added under ``"resiliency_report"``.

    Args:
        chaos_telemetry: Object whose ``scenarios`` attribute is read and then
            overwritten with the new list of dicts.
    """
    compact_by_name = {}
    for report in self.scenario_reports:
        compact_by_name[report["name"]] = self.compact_breakdown(report)

    enriched = []
    for entry in getattr(chaos_telemetry, "scenarios", []):
        if isinstance(entry, dict):
            scenario_name = entry.get("scenario")
            record = entry
        else:
            scenario_name = getattr(entry, "scenario", None)
            try:
                record = dataclasses.asdict(entry)
            except Exception:
                # Not a dataclass instance: fall back to a plain attribute dump.
                record = {}
                for attr in dir(entry):
                    if attr.startswith("__") or callable(getattr(entry, attr)):
                        continue
                    record[attr] = getattr(entry, attr)

        if scenario_name in compact_by_name:
            record["resiliency_report"] = compact_by_name[scenario_name]
        enriched.append(record)

    chaos_telemetry.scenarios = enriched
|
||||
|
||||
def add_scenario_reports(
    self,
    *,
    scenario_telemetries,
    prom_cli: KrknPrometheus,
    scenario_type: str,
    batch_start_dt: datetime.datetime,
    batch_end_dt: datetime.datetime,
    weight: int | float = 1,
) -> None:
    """Score every telemetry item of a scenario batch and tag it with the result.

    For each item a scenario report is added via ``add_scenario_report`` and
    the compact breakdown of the most recent stored report is written back to
    the item as ``resiliency_report`` (dict key or attribute). A failure on
    one item is logged and does not stop the remaining items.

    Args:
        scenario_telemetries: Iterable of telemetry dicts or objects for the
            current scenario batch window.
        prom_cli: Prometheus client forwarded to ``add_scenario_report``.
        scenario_type: Scenario name used when an item has no ``scenario``.
        batch_start_dt: Window start used when an item lacks a truthy
            ``start_timestamp`` / ``end_timestamp`` pair.
        batch_end_dt: Window end used under the same condition.
        weight: Weight forwarded to ``add_scenario_report`` for every item.
    """
    for telemetry in scenario_telemetries:
        try:
            is_mapping = isinstance(telemetry, dict)

            # Pull the timing window and the scenario name out of the item.
            if is_mapping:
                raw_start = telemetry.get("start_timestamp")
                raw_end = telemetry.get("end_timestamp")
                label = telemetry.get("scenario", scenario_type)
            else:
                raw_start = getattr(telemetry, "start_timestamp", None)
                raw_end = getattr(telemetry, "end_timestamp", None)
                label = getattr(telemetry, "scenario", scenario_type)

            # Both timestamps must be present, otherwise use the batch window.
            if not (raw_start and raw_end):
                window_start, window_end = batch_start_dt, batch_end_dt
            else:
                window_start = datetime.datetime.fromtimestamp(int(raw_start))
                window_end = datetime.datetime.fromtimestamp(int(raw_end))

            self.add_scenario_report(
                scenario_name=str(label),
                prom_cli=prom_cli,
                start_time=window_start,
                end_time=window_end,
                weight=weight,
                health_check_results=None,
            )

            summary = self.compact_breakdown(self.scenario_reports[-1])
            if is_mapping:
                telemetry["resiliency_report"] = summary
            else:
                setattr(telemetry, "resiliency_report", summary)
        except Exception as exc:
            logging.error("Resiliency per-scenario evaluation failed: %s", exc)
|
||||
|
||||
def finalize_and_save(
    self,
    *,
    prom_cli: KrknPrometheus,
    total_start_time: datetime.datetime,
    total_end_time: datetime.datetime,
    run_mode: str = "standalone",
    detailed_path: str = "resiliency-report.json",
) -> Tuple[Dict[str, Any], Dict[str, Any]]:
    """Finalize resiliency scoring and publish the detailed report.

    Calls ``finalize_report`` for the whole test window, then emits the
    detailed report either on stdout or to a JSON file. All failures are
    logged rather than raised.

    Args:
        prom_cli: Prometheus client forwarded to ``finalize_report``.
        total_start_time: Start time for the full test window.
        total_end_time: End time for the full test window.
        run_mode: ``"controller"`` prints the report on stdout prefixed with
            ``KRKN_RESILIENCY_REPORT_JSON:``; any other value writes it to
            ``detailed_path``.
        detailed_path: Destination file for the non-controller mode.

    Returns:
        Nothing is returned explicitly on any path.
        NOTE(review): the signature advertises a tuple of two dicts, but the
        body has no ``return`` statement - confirm which is intended.
    """
    try:
        self.finalize_report(
            prom_cli=prom_cli,
            total_start_time=total_start_time,
            total_end_time=total_end_time,
        )
        report = self.get_detailed_report()

        if run_mode != "controller":
            # Stand-alone mode: persist to disk for post-run consumption.
            try:
                with open(detailed_path, "w", encoding="utf-8") as handle:
                    json.dump(report, handle, indent=2)
                logging.info("Resiliency report written: %s", detailed_path)
            except Exception as io_exc:
                logging.error("Failed to write resiliency report files: %s", io_exc)
        else:
            # krknctl expects the detailed report on stdout in a special format.
            try:
                detailed_json = json.dumps(report)
                print(f"KRKN_RESILIENCY_REPORT_JSON:{detailed_json}")
                logging.info("Resiliency report logged to stdout for krknctl.")
            except Exception as exc:
                logging.error("Failed to serialize and log detailed resiliency report: %s", exc)
    except Exception as exc:
        logging.error("Failed to finalize resiliency scoring: %s", exc)
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# Internal helpers
|
||||
# ------------------------------------------------------------------
|
||||
@staticmethod
|
||||
def _normalise_alerts(raw_alerts: Any) -> List[Dict[str, Any]]:
|
||||
"""Convert raw YAML alerts data into internal SLO list structure."""
|
||||
if not isinstance(raw_alerts, list):
|
||||
raise ValueError("SLO configuration must be a list under key 'slos' or top-level list")
|
||||
|
||||
slos: List[Dict[str, Any]] = []
|
||||
for idx, alert in enumerate(raw_alerts):
|
||||
if not (isinstance(alert, dict) and "expr" in alert and "severity" in alert):
|
||||
logging.warning("Skipping invalid alert entry at index %d: %s", idx, alert)
|
||||
continue
|
||||
name = alert.get("description") or f"slo_{idx}"
|
||||
slos.append(
|
||||
{
|
||||
"name": name,
|
||||
"expr": alert["expr"],
|
||||
"severity": str(alert["severity"]).lower(),
|
||||
"weight": alert.get("weight")
|
||||
}
|
||||
)
|
||||
return slos
|
||||
76
krkn/resiliency/score.py
Normal file
76
krkn/resiliency/score.py
Normal file
@@ -0,0 +1,76 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Dict, List, Tuple
|
||||
|
||||
DEFAULT_WEIGHTS = {"critical": 3, "warning": 1}
|
||||
|
||||
|
||||
class SLOResult:
    """Outcome of evaluating one SLO: its name, severity and pass flag.

    An optional per-SLO weight can be supplied; when present it takes
    precedence over the severity-derived weight.
    """

    def __init__(self, name: str, severity: str, passed: bool, weight: int | None = None):
        self.name, self.severity, self.passed = name, severity, passed
        # None means "no override": weight() then falls back to severity.
        self._custom_weight = weight

    def weight(self, severity_weights: Dict[str, int]) -> int:
        """Return the weight of this SLO.

        Args:
            severity_weights: Mapping of severity name to weight.

        Returns:
            The custom weight when one was set; otherwise the entry for this
            SLO's severity, falling back to the ``"warning"`` entry and
            finally to 1.
        """
        if self._custom_weight is None:
            fallback = severity_weights.get("warning", 1)
            return severity_weights.get(self.severity, fallback)
        return self._custom_weight
|
||||
|
||||
|
||||
def calculate_resiliency_score(
    slo_definitions: Dict[str, str] | Dict[str, Dict[str, int | str | None]],
    prometheus_results: Dict[str, bool],
    health_check_results: Dict[str, bool],
) -> Tuple[int, Dict[str, int]]:
    """Compute a resiliency score between 0-100 based on SLO pass/fail results.

    Args:
        slo_definitions: Mapping of SLO name -> severity ("critical" | "warning") OR
            SLO name -> {"severity": str, "weight": int | None}. In the dict
            form a missing severity defaults to "warning".
        prometheus_results: Mapping of SLO name -> bool indicating whether the SLO
            passed. An SLO missing from this mapping was not evaluated and is
            left out of the score entirely (it counts neither as passed nor
            as failed).
        health_check_results: Mapping of custom health-check name -> bool pass flag.
            These checks are always treated as *critical*.

    Returns:
        Tuple ``(score, breakdown)``. ``score`` is the percentage of weighted
        points kept, truncated to an int (0 when nothing was evaluated).
        ``breakdown`` holds the aggregate ``total_points``, ``points_lost``,
        ``passed`` and ``failed`` values.
    """
    slo_objects: List[SLOResult] = []
    for slo_name, slo_def in slo_definitions.items():
        # Exclude SLOs that were not evaluated (query returned no data)
        if slo_name not in prometheus_results:
            continue
        passed = bool(prometheus_results[slo_name])

        # Support both old format (str) and new format (dict)
        if isinstance(slo_def, str):
            severity, slo_weight = slo_def, None
        else:
            severity = slo_def.get("severity", "warning")
            slo_weight = slo_def.get("weight")

        slo_objects.append(SLOResult(slo_name, severity, passed, weight=slo_weight))

    # Health-check SLOs (by default keeping them critical)
    for hc_name, hc_passed in health_check_results.items():
        slo_objects.append(SLOResult(hc_name, "critical", bool(hc_passed)))

    total_points = sum(slo.weight(DEFAULT_WEIGHTS) for slo in slo_objects)
    points_lost = sum(slo.weight(DEFAULT_WEIGHTS) for slo in slo_objects if not slo.passed)

    if total_points == 0:
        score = 0
    else:
        # Multiply before dividing: (57 / 100) * 100 evaluates to
        # 56.99999999999999 in binary floating point and would truncate to 56,
        # whereas 57 * 100 / 100 is exactly 57.0.
        score = int((total_points - points_lost) * 100 / total_points)

    passed_count = sum(1 for slo in slo_objects if slo.passed)
    breakdown = {
        "total_points": total_points,
        "points_lost": points_lost,
        "passed": passed_count,
        "failed": len(slo_objects) - passed_count,
    }
    return score, breakdown
|
||||
@@ -4,7 +4,7 @@ from abc import ABC, abstractmethod
|
||||
from krkn_lib.models.telemetry import ScenarioTelemetry
|
||||
from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift
|
||||
|
||||
from krkn import utils
|
||||
from krkn import utils, cerberus
|
||||
from krkn.rollback.handler import (
|
||||
RollbackHandler,
|
||||
execute_rollback_version_files,
|
||||
@@ -30,7 +30,6 @@ class AbstractScenarioPlugin(ABC):
|
||||
self,
|
||||
run_uuid: str,
|
||||
scenario: str,
|
||||
krkn_config: dict[str, any],
|
||||
lib_telemetry: KrknTelemetryOpenshift,
|
||||
scenario_telemetry: ScenarioTelemetry,
|
||||
) -> int:
|
||||
@@ -104,7 +103,6 @@ class AbstractScenarioPlugin(ABC):
|
||||
return_value = self.run(
|
||||
run_uuid=run_uuid,
|
||||
scenario=scenario_config,
|
||||
krkn_config=krkn_config,
|
||||
lib_telemetry=telemetry,
|
||||
scenario_telemetry=scenario_telemetry,
|
||||
)
|
||||
@@ -126,12 +124,14 @@ class AbstractScenarioPlugin(ABC):
|
||||
)
|
||||
scenario_telemetry.exit_status = return_value
|
||||
scenario_telemetry.end_timestamp = time.time()
|
||||
start_time = int(scenario_telemetry.start_timestamp)
|
||||
end_time = int(scenario_telemetry.end_timestamp)
|
||||
utils.collect_and_put_ocp_logs(
|
||||
telemetry,
|
||||
parsed_scenario_config,
|
||||
telemetry.get_telemetry_request_id(),
|
||||
int(scenario_telemetry.start_timestamp),
|
||||
int(scenario_telemetry.end_timestamp),
|
||||
start_time,
|
||||
end_time
|
||||
)
|
||||
|
||||
if events_backup:
|
||||
@@ -139,15 +139,17 @@ class AbstractScenarioPlugin(ABC):
|
||||
krkn_config,
|
||||
parsed_scenario_config,
|
||||
telemetry.get_lib_kubernetes(),
|
||||
int(scenario_telemetry.start_timestamp),
|
||||
int(scenario_telemetry.end_timestamp),
|
||||
start_time,
|
||||
end_time
|
||||
)
|
||||
|
||||
if scenario_telemetry.exit_status != 0:
|
||||
failed_scenarios.append(scenario_config)
|
||||
scenario_telemetries.append(scenario_telemetry)
|
||||
logging.info(f"waiting {wait_duration} before running the next scenario")
|
||||
cerberus.publish_kraken_status(start_time,end_time)
|
||||
logging.info(f"wating {wait_duration} before running the next scenario")
|
||||
time.sleep(wait_duration)
|
||||
|
||||
return failed_scenarios, scenario_telemetries
|
||||
|
||||
|
||||
|
||||
@@ -5,7 +5,6 @@ from krkn_lib.models.telemetry import ScenarioTelemetry
|
||||
from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift
|
||||
from krkn_lib.utils import get_yaml_item_value, get_random_string
|
||||
from jinja2 import Template
|
||||
from krkn import cerberus
|
||||
from krkn.scenario_plugins.abstract_scenario_plugin import AbstractScenarioPlugin
|
||||
from krkn.rollback.config import RollbackContent
|
||||
from krkn.rollback.handler import set_rollback_context_decorator
|
||||
@@ -17,11 +16,9 @@ class ApplicationOutageScenarioPlugin(AbstractScenarioPlugin):
|
||||
self,
|
||||
run_uuid: str,
|
||||
scenario: str,
|
||||
krkn_config: dict[str, any],
|
||||
lib_telemetry: KrknTelemetryOpenshift,
|
||||
scenario_telemetry: ScenarioTelemetry,
|
||||
) -> int:
|
||||
wait_duration = krkn_config["tunings"]["wait_duration"]
|
||||
try:
|
||||
with open(scenario, "r") as f:
|
||||
app_outage_config_yaml = yaml.full_load(f)
|
||||
@@ -110,14 +107,8 @@ class ApplicationOutageScenarioPlugin(AbstractScenarioPlugin):
|
||||
policy_name, namespace
|
||||
)
|
||||
|
||||
logging.info(
|
||||
"End of scenario. Waiting for the specified duration: %s"
|
||||
% wait_duration
|
||||
)
|
||||
time.sleep(wait_duration)
|
||||
|
||||
end_time = int(time.time())
|
||||
cerberus.publish_kraken_status(krkn_config, [], start_time, end_time)
|
||||
|
||||
except Exception as e:
|
||||
logging.error(
|
||||
"ApplicationOutageScenarioPlugin exiting due to Exception %s" % e
|
||||
|
||||
@@ -10,7 +10,6 @@ from krkn_lib.models.telemetry import ScenarioTelemetry
|
||||
from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift
|
||||
from krkn_lib.utils import get_yaml_item_value
|
||||
|
||||
|
||||
from krkn.scenario_plugins.abstract_scenario_plugin import AbstractScenarioPlugin
|
||||
|
||||
|
||||
@@ -19,7 +18,6 @@ class ContainerScenarioPlugin(AbstractScenarioPlugin):
|
||||
self,
|
||||
run_uuid: str,
|
||||
scenario: str,
|
||||
krkn_config: dict[str, any],
|
||||
lib_telemetry: KrknTelemetryOpenshift,
|
||||
scenario_telemetry: ScenarioTelemetry,
|
||||
) -> int:
|
||||
|
||||
@@ -23,7 +23,7 @@ from krkn.rollback.handler import set_rollback_context_decorator
|
||||
class HogsScenarioPlugin(AbstractScenarioPlugin):
|
||||
|
||||
@set_rollback_context_decorator
|
||||
def run(self, run_uuid: str, scenario: str, krkn_config: dict[str, any], lib_telemetry: KrknTelemetryOpenshift,
|
||||
def run(self, run_uuid: str, scenario: str, lib_telemetry: KrknTelemetryOpenshift,
|
||||
scenario_telemetry: ScenarioTelemetry) -> int:
|
||||
try:
|
||||
with open(scenario, "r") as f:
|
||||
|
||||
@@ -46,7 +46,7 @@ class KubevirtVmOutageScenarioPlugin(AbstractScenarioPlugin):
|
||||
try:
|
||||
with open(scenario, "r") as f:
|
||||
scenario_config = yaml.full_load(f)
|
||||
|
||||
|
||||
self.init_clients(lib_telemetry.get_lib_kubernetes())
|
||||
pods_status = PodsStatus()
|
||||
for config in scenario_config["scenarios"]:
|
||||
@@ -71,75 +71,14 @@ class KubevirtVmOutageScenarioPlugin(AbstractScenarioPlugin):
|
||||
self.custom_object_client = k8s_client.custom_object_client
|
||||
logging.info("Successfully initialized Kubernetes client for KubeVirt operations")
|
||||
|
||||
def get_vmi(self, name: str, namespace: str) -> Optional[Dict]:
    """
    Fetch a single Virtual Machine Instance through the custom-object client.

    :param name: Name of the VMI to retrieve
    :param namespace: Namespace of the VMI
    :return: The VMI object, or None when the API answers with status 404
    :raises ApiException: for any API error other than 404 (logged first)
    :raises Exception: any other failure is logged and re-raised
    """
    try:
        return self.custom_object_client.get_namespaced_custom_object(
            group="kubevirt.io",
            version="v1",
            namespace=namespace,
            plural="virtualmachineinstances",
            name=name,
        )
    except ApiException as e:
        if e.status != 404:
            logging.error(f"Error getting VMI {name}: {e}")
            raise
        # A missing VMI is an expected outcome, not an error.
        logging.warning(f"VMI {name} not found in namespace {namespace}")
        return None
    except Exception as e:
        logging.error(f"Unexpected error getting VMI {name}: {e}")
        raise
|
||||
|
||||
def get_vmis(self, regex_name: str, namespace: str) -> Optional[Dict]:
    """
    Collect Virtual Machine Instances whose name matches a regex.

    Namespaces are resolved with ``list_namespaces_by_regex``; every VMI in
    those namespaces whose name matches ``regex_name`` (``re.match``, i.e.
    anchored at the start) is appended to ``self.vmis_list``.

    :param regex_name: Regular expression applied to each VMI name
    :param namespace: Namespace pattern passed to ``list_namespaces_by_regex``
    :return: None on success; an empty list when the API answers with 404
    :raises ApiException: for any API error other than 404 (logged first)
    :raises Exception: any other failure is logged and re-raised
    """
    try:
        # The loop variable intentionally reuses ``namespace`` so the 404
        # warning below reports the namespace that was being listed.
        for namespace in self.k8s_client.list_namespaces_by_regex(namespace):
            listing = self.custom_object_client.list_namespaced_custom_object(
                group="kubevirt.io",
                version="v1",
                namespace=namespace,
                plural="virtualmachineinstances",
            )
            for vmi in listing.get("items"):
                candidate = vmi.get("metadata", {}).get("name")
                if re.match(regex_name, candidate):
                    self.vmis_list.append(vmi)
    except ApiException as e:
        if e.status != 404:
            logging.error(f"Error getting VMI {regex_name}: {e}")
            raise
        logging.warning(f"VMI {regex_name} not found in namespace {namespace}")
        return []
    except Exception as e:
        logging.error(f"Unexpected error getting VMI {regex_name}: {e}")
        raise
|
||||
|
||||
def execute_scenario(self, config: Dict[str, Any], scenario_telemetry: ScenarioTelemetry) -> int:
|
||||
def execute_scenario(self, config: Dict[str, Any], scenario_telemetry: ScenarioTelemetry) -> PodsStatus:
|
||||
"""
|
||||
Execute a KubeVirt VM outage scenario based on the provided configuration.
|
||||
|
||||
|
||||
:param config: The scenario configuration
|
||||
:param scenario_telemetry: The telemetry object for recording metrics
|
||||
:return: 0 for success, 1 for failure
|
||||
:return: PodsStatus object containing recovered and unrecovered pods
|
||||
"""
|
||||
self.pods_status = PodsStatus()
|
||||
try:
|
||||
@@ -149,12 +88,12 @@ class KubevirtVmOutageScenarioPlugin(AbstractScenarioPlugin):
|
||||
timeout = params.get("timeout", 60)
|
||||
kill_count = params.get("kill_count", 1)
|
||||
disable_auto_restart = params.get("disable_auto_restart", False)
|
||||
|
||||
|
||||
if not vm_name:
|
||||
logging.error("vm_name parameter is required")
|
||||
return 1
|
||||
return self.pods_status
|
||||
self.pods_status = PodsStatus()
|
||||
self.get_vmis(vm_name,namespace)
|
||||
self.vmis_list = self.k8s_client.get_vmis(vm_name,namespace)
|
||||
for _ in range(kill_count):
|
||||
|
||||
rand_int = random.randint(0, len(self.vmis_list) - 1)
|
||||
@@ -163,17 +102,22 @@ class KubevirtVmOutageScenarioPlugin(AbstractScenarioPlugin):
|
||||
logging.info(f"Starting KubeVirt VM outage scenario for VM: {vm_name} in namespace: {namespace}")
|
||||
vmi_name = vmi.get("metadata").get("name")
|
||||
vmi_namespace = vmi.get("metadata").get("namespace")
|
||||
if not self.validate_environment(vmi_name, vmi_namespace):
|
||||
return 1
|
||||
|
||||
vmi = self.get_vmi(vmi_name, vmi_namespace)
|
||||
|
||||
# Create affected_pod early so we can track failures
|
||||
self.affected_pod = AffectedPod(
|
||||
pod_name=vmi_name,
|
||||
namespace=vmi_namespace,
|
||||
)
|
||||
|
||||
if not self.validate_environment(vmi_name, vmi_namespace):
|
||||
self.pods_status.unrecovered.append(self.affected_pod)
|
||||
continue
|
||||
|
||||
vmi = self.k8s_client.get_vmi(vmi_name, vmi_namespace)
|
||||
if not vmi:
|
||||
logging.error(f"VMI {vm_name} not found in namespace {namespace}")
|
||||
return 1
|
||||
self.pods_status.unrecovered.append(self.affected_pod)
|
||||
continue
|
||||
|
||||
self.original_vmi = vmi
|
||||
logging.info(f"Captured initial state of VMI: {vm_name}")
|
||||
@@ -212,15 +156,13 @@ class KubevirtVmOutageScenarioPlugin(AbstractScenarioPlugin):
|
||||
"""
|
||||
try:
|
||||
# Check if KubeVirt CRDs exist
|
||||
crd_list = self.custom_object_client.list_namespaced_custom_object("kubevirt.io","v1",namespace,"virtualmachines")
|
||||
kubevirt_crds = [crd for crd in crd_list.items() ]
|
||||
|
||||
kubevirt_crds = self.k8s_client.get_vms(vm_name, namespace)
|
||||
if not kubevirt_crds:
|
||||
logging.error("KubeVirt CRDs not found. Ensure KubeVirt/CNV is installed in the cluster")
|
||||
return False
|
||||
|
||||
# Check if VMI exists
|
||||
vmi = self.get_vmi(vm_name, namespace)
|
||||
vmi = self.k8s_client.get_vmi(vm_name, namespace)
|
||||
if not vmi:
|
||||
logging.error(f"VMI {vm_name} not found in namespace {namespace}")
|
||||
return False
|
||||
@@ -243,13 +185,7 @@ class KubevirtVmOutageScenarioPlugin(AbstractScenarioPlugin):
|
||||
"""
|
||||
try:
|
||||
# Get the VM object first to get its current spec
|
||||
vm = self.custom_object_client.get_namespaced_custom_object(
|
||||
group="kubevirt.io",
|
||||
version="v1",
|
||||
namespace=namespace,
|
||||
plural="virtualmachines",
|
||||
name=vm_name
|
||||
)
|
||||
vm = self.k8s_client.get_vm(vm_name, namespace)
|
||||
|
||||
# Update the running state
|
||||
if 'spec' not in vm:
|
||||
@@ -257,14 +193,7 @@ class KubevirtVmOutageScenarioPlugin(AbstractScenarioPlugin):
|
||||
vm['spec']['running'] = running
|
||||
|
||||
# Apply the patch
|
||||
self.custom_object_client.patch_namespaced_custom_object(
|
||||
group="kubevirt.io",
|
||||
version="v1",
|
||||
namespace=namespace,
|
||||
plural="virtualmachines",
|
||||
name=vm_name,
|
||||
body=vm
|
||||
)
|
||||
self.k8s_client.patch_vm(vm_name,namespace,vm)
|
||||
return True
|
||||
|
||||
except ApiException as e:
|
||||
@@ -293,26 +222,12 @@ class KubevirtVmOutageScenarioPlugin(AbstractScenarioPlugin):
|
||||
" - proceeding with deletion but VM may auto-restart")
|
||||
start_creation_time = self.original_vmi.get('metadata', {}).get('creationTimestamp')
|
||||
start_time = time.time()
|
||||
try:
|
||||
self.custom_object_client.delete_namespaced_custom_object(
|
||||
group="kubevirt.io",
|
||||
version="v1",
|
||||
namespace=namespace,
|
||||
plural="virtualmachineinstances",
|
||||
name=vm_name
|
||||
)
|
||||
except ApiException as e:
|
||||
if e.status == 404:
|
||||
logging.warning(f"VMI {vm_name} not found during deletion")
|
||||
return 1
|
||||
else:
|
||||
logging.error(f"API error during VMI deletion: {e}")
|
||||
return 1
|
||||
self.k8s_client.delete_vmi(vm_name, namespace)
|
||||
|
||||
# Wait for the VMI to be deleted
|
||||
|
||||
while time.time() - start_time < timeout:
|
||||
deleted_vmi = self.get_vmi(vm_name, namespace)
|
||||
deleted_vmi = self.k8s_client.get_vmi(vm_name, namespace)
|
||||
if deleted_vmi:
|
||||
if start_creation_time != deleted_vmi.get('metadata', {}).get('creationTimestamp'):
|
||||
logging.info(f"VMI {vm_name} successfully recreated")
|
||||
@@ -337,7 +252,7 @@ class KubevirtVmOutageScenarioPlugin(AbstractScenarioPlugin):
|
||||
while time.time() - start_time < timeout:
|
||||
|
||||
# Check current state once since we've already waited for the duration
|
||||
vmi = self.get_vmi(vm_name, namespace)
|
||||
vmi = self.k8s_client.get_vmi(vm_name, namespace)
|
||||
|
||||
if vmi:
|
||||
if vmi.get('status', {}).get('phase') == "Running":
|
||||
@@ -378,13 +293,7 @@ class KubevirtVmOutageScenarioPlugin(AbstractScenarioPlugin):
|
||||
del metadata[field]
|
||||
|
||||
# Create the VMI
|
||||
self.custom_object_client.create_namespaced_custom_object(
|
||||
group="kubevirt.io",
|
||||
version="v1",
|
||||
namespace=namespace,
|
||||
plural="virtualmachineinstances",
|
||||
body=vmi_dict
|
||||
)
|
||||
self.k8s_client.create_vmi(vm_name, namespace, vmi_dict)
|
||||
logging.info(f"Successfully recreated VMI {vm_name}")
|
||||
|
||||
# Wait for VMI to start running
|
||||
|
||||
@@ -7,7 +7,6 @@ from krkn_lib.models.telemetry import ScenarioTelemetry
|
||||
from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift
|
||||
from krkn_lib.utils import get_yaml_item_value
|
||||
|
||||
from krkn import cerberus, utils
|
||||
from krkn.scenario_plugins.abstract_scenario_plugin import AbstractScenarioPlugin
|
||||
from krkn.scenario_plugins.managed_cluster.common_functions import get_managedcluster
|
||||
from krkn.scenario_plugins.managed_cluster.scenarios import Scenarios
|
||||
@@ -18,7 +17,6 @@ class ManagedClusterScenarioPlugin(AbstractScenarioPlugin):
|
||||
self,
|
||||
run_uuid: str,
|
||||
scenario: str,
|
||||
krkn_config: dict[str, any],
|
||||
lib_telemetry: KrknTelemetryOpenshift,
|
||||
scenario_telemetry: ScenarioTelemetry,
|
||||
) -> int:
|
||||
@@ -38,8 +36,6 @@ class ManagedClusterScenarioPlugin(AbstractScenarioPlugin):
|
||||
managedcluster_scenario_object,
|
||||
lib_telemetry.get_lib_kubernetes(),
|
||||
)
|
||||
end_time = int(time.time())
|
||||
cerberus.get_status(krkn_config, start_time, end_time)
|
||||
except Exception as e:
|
||||
logging.error(
|
||||
"ManagedClusterScenarioPlugin exiting due to Exception %s"
|
||||
|
||||
@@ -12,7 +12,6 @@ class NativeScenarioPlugin(AbstractScenarioPlugin):
|
||||
self,
|
||||
run_uuid: str,
|
||||
scenario: str,
|
||||
krkn_config: dict[str, any],
|
||||
lib_telemetry: KrknTelemetryOpenshift,
|
||||
scenario_telemetry: ScenarioTelemetry,
|
||||
) -> int:
|
||||
@@ -21,7 +20,6 @@ class NativeScenarioPlugin(AbstractScenarioPlugin):
|
||||
PLUGINS.run(
|
||||
scenario,
|
||||
lib_telemetry.get_lib_kubernetes().get_kubeconfig_path(),
|
||||
krkn_config,
|
||||
run_uuid,
|
||||
)
|
||||
|
||||
|
||||
@@ -1,141 +0,0 @@
|
||||
import logging
|
||||
import requests
|
||||
import sys
|
||||
import json
|
||||
|
||||
|
||||
def get_status(config, start_time, end_time):
    """
    Function to get Cerberus status

    When Cerberus is disabled in the config the status is reported as True
    without contacting anything. Otherwise the go/no-go signal is fetched
    from the Cerberus URL, the monitored application routes are optionally
    checked for the chaos window, and the process exits with code 1 if
    either check fails.

    Args:
        config
            - Kraken config dictionary

        start_time
            - The time when chaos is injected

        end_time
            - The time when chaos is removed

    Returns:
        Cerberus status
    """

    cerberus_settings = config["cerberus"]
    if not cerberus_settings["cerberus_enabled"]:
        return True

    cerberus_url = cerberus_settings["cerberus_url"]
    check_application_routes = cerberus_settings["check_application_routes"]
    if not cerberus_url:
        logging.error("url where Cerberus publishes True/False signal is not provided.")
        sys.exit(1)

    # Cerberus answers with the literal bytes b"True" for a healthy cluster.
    cerberus_status = requests.get(cerberus_url, timeout=60).content == b"True"

    # Fail if the application routes monitored by cerberus experience downtime during the chaos
    application_routes_status = True
    if check_application_routes:
        application_routes_status, unavailable_routes = application_status(cerberus_url, start_time, end_time)
        if application_routes_status:
            logging.info("Application routes being monitored didn't encounter any downtime during the run!")
        else:
            logging.error(
                "Application routes: %s monitored by cerberus encountered downtime during the run, failing"
                % unavailable_routes
            )

    if not cerberus_status:
        logging.error(
            "Received a no-go signal from Cerberus, looks like "
            "the cluster is unhealthy. Please check the Cerberus "
            "report for more details. Test failed."
        )

    if application_routes_status and cerberus_status:
        logging.info("Received a go signal from Ceberus, the cluster is healthy. " "Test passed.")
    else:
        sys.exit(1)
    return cerberus_status
|
||||
|
||||
|
||||
def publish_kraken_status(config, failed_post_scenarios, start_time, end_time):
    """
    Function to publish Kraken status to Cerberus

    Fetches the Cerberus status and, when post action scenarios are still
    failing, logs the combined state; the process exits with code 1 if
    ``config["kraken"]["exit_on_failure"]`` is set.

    Args:
        config
            - Kraken config dictionary

        failed_post_scenarios
            - String containing the failed post scenarios

        start_time
            - The time when chaos is injected

        end_time
            - The time when chaos is removed
    """

    cerberus_healthy = get_status(config, start_time, end_time)

    # Nothing to report unless post action scenarios are still failing.
    if not failed_post_scenarios:
        return

    exit_requested = config["kraken"]["exit_on_failure"]
    if cerberus_healthy:
        if exit_requested:
            logging.info(
                "Cerberus status is healthy but post action scenarios " "are still failing, exiting kraken run"
            )
            sys.exit(1)
        logging.info("Cerberus status is healthy but post action scenarios " "are still failing")
    else:
        if exit_requested:
            logging.info(
                "Cerberus status is not healthy and post action scenarios " "are still failing, exiting kraken run"
            )
            sys.exit(1)
        logging.info("Cerberus status is not healthy and post action scenarios " "are still failing")
|
||||
|
||||
|
||||
def application_status(cerberus_url, start_time, end_time):
    """
    Function to check application availability

    Queries the Cerberus ``/history`` endpoint for the chaos window and
    collects the names of failure entries whose component is "route". The
    process exits with code 1 when no URL is given or the query fails.

    Args:
        cerberus_url
            - url where Cerberus publishes True/False signal

        start_time
            - The time when chaos is injected

        end_time
            - The time when chaos is removed

    Returns:
        Application status and failed routes
    """

    if not cerberus_url:
        logging.error("url where Cerberus publishes True/False signal is not provided.")
        sys.exit(1)

    # The window length is divided by 60 before being sent as "loopback".
    duration = (end_time - start_time) / 60
    url = cerberus_url + "/history?loopback=" + str(duration)
    logging.info("Scraping the metrics for the test duration from cerberus url: %s" % url)
    try:
        history = json.loads(requests.get(url, timeout=60).content)
        failed_routes = [
            entry["name"]
            for entry in history["history"]["failures"]
            if entry["component"] == "route"
        ]
    except Exception as e:
        logging.error("Failed to scrape metrics from cerberus API at %s: %s" % (url, e))
        sys.exit(1)

    # Healthy only when no route failure was recorded.
    return not failed_routes, set(failed_routes)
|
||||
@@ -9,7 +9,6 @@ import random
|
||||
from traceback import format_exc
|
||||
from jinja2 import Environment, FileSystemLoader
|
||||
from . import kubernetes_functions as kube_helper
|
||||
from . import cerberus
|
||||
import typing
|
||||
from arcaflow_plugin_sdk import validation, plugin
|
||||
from kubernetes.client.api.core_v1_api import CoreV1Api as CoreV1Api
|
||||
@@ -100,13 +99,13 @@ class NetworkScenarioConfig:
|
||||
default=None,
|
||||
metadata={
|
||||
"name": "Network Parameters",
|
||||
"description": "The network filters that are applied on the interface. "
|
||||
"The currently supported filters are latency, "
|
||||
"loss and bandwidth",
|
||||
},
|
||||
"description":
|
||||
"The network filters that are applied on the interface. "
|
||||
"The currently supported filters are latency, "
|
||||
"loss and bandwidth"
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
@dataclass
|
||||
class NetworkScenarioSuccessOutput:
|
||||
filter_direction: str = field(
|
||||
@@ -773,8 +772,7 @@ def network_chaos(
|
||||
logging.info("Deleting jobs")
|
||||
delete_jobs(cli, batch_cli, job_list[:])
|
||||
job_list = []
|
||||
logging.info("Waiting for wait_duration : %ss" % cfg.wait_duration)
|
||||
time.sleep(cfg.wait_duration)
|
||||
|
||||
create_interfaces = False
|
||||
else:
|
||||
|
||||
|
||||
@@ -49,7 +49,7 @@ class Plugins:
|
||||
def unserialize_scenario(self, file: str) -> Any:
|
||||
return serialization.load_from_file(abspath(file))
|
||||
|
||||
def run(self, file: str, kubeconfig_path: str, kraken_config: str, run_uuid: str):
|
||||
def run(self, file: str, kubeconfig_path: str, run_uuid: str):
|
||||
"""
|
||||
Run executes a series of steps
|
||||
"""
|
||||
@@ -93,8 +93,6 @@ class Plugins:
|
||||
unserialized_input = step.schema.input.unserialize(entry["config"])
|
||||
if "kubeconfig_path" in step.schema.input.properties:
|
||||
unserialized_input.kubeconfig_path = kubeconfig_path
|
||||
if "kraken_config" in step.schema.input.properties:
|
||||
unserialized_input.kraken_config = kraken_config
|
||||
output_id, output_data = step.schema(
|
||||
params=unserialized_input, run_id=run_uuid
|
||||
)
|
||||
|
||||
@@ -1,157 +0,0 @@
|
||||
import logging
|
||||
import requests
|
||||
import sys
|
||||
import json
|
||||
|
||||
|
||||
def get_status(config, start_time, end_time):
    """
    Function to get Cerberus status

    Args:
        config
            - Kraken config dictionary; the "cerberus" section is read for
              cerberus_enabled, cerberus_url and check_application_routes

        start_time
            - The time when chaos is injected

        end_time
            - The time when chaos is removed

    Returns:
        Cerberus status. True when Cerberus is disabled or answered with a
        go signal. A missing url, a no-go signal or downtime on the monitored
        application routes ends the process with exit code 1 instead.
    """

    cerberus_status = True
    check_application_routes = False
    application_routes_status = True
    if config["cerberus"]["cerberus_enabled"]:
        cerberus_url = config["cerberus"]["cerberus_url"]
        check_application_routes = config["cerberus"]["check_application_routes"]
        if not cerberus_url:
            logging.error(
                "url where Cerberus publishes True/False signal is not provided.")
            sys.exit(1)
        # Only the exact payload b"True" counts as a go signal; any other
        # response body is treated as no-go.
        cerberus_status = requests.get(cerberus_url, timeout=60).content == b"True"

        # Fail if the application routes monitored by cerberus experience
        # downtime during the chaos
        if check_application_routes:
            application_routes_status, unavailable_routes = application_status(
                cerberus_url, start_time, end_time)
            if not application_routes_status:
                logging.error(
                    "Application routes: %s monitored by cerberus encountered downtime during the run, failing",
                    unavailable_routes,
                )
            else:
                logging.info(
                    "Application routes being monitored didn't encounter any downtime during the run!")

        if not cerberus_status:
            logging.error(
                "Received a no-go signal from Cerberus, looks like "
                "the cluster is unhealthy. Please check the Cerberus "
                "report for more details. Test failed."
            )

        if not application_routes_status or not cerberus_status:
            sys.exit(1)
        else:
            logging.info(
                "Received a go signal from Cerberus, the cluster is healthy. "
                "Test passed.")
    return cerberus_status
|
||||
|
||||
|
||||
def publish_kraken_status(config, failed_post_scenarios, start_time, end_time):
    """
    Check the Cerberus status after a chaos run and report whether post
    action scenarios are still failing.

    Args:
        config
            - Kraken config dictionary

        failed_post_scenarios
            - String containing the failed post scenarios

        start_time
            - The time when chaos is injected

        end_time
            - The time when chaos is removed
    """

    healthy = get_status(config, start_time, end_time)

    # Nothing to report when every post action scenario passed.
    if not failed_post_scenarios:
        return

    abort_run = config["kraken"]["exit_on_failure"]
    if healthy:
        if abort_run:
            message = "Cerberus status is healthy but post action scenarios " "are still failing, exiting kraken run"
        else:
            message = (
                "Cerberus status is healthy but post action scenarios "
                "are still failing")
    else:
        if abort_run:
            message = "Cerberus status is not healthy and post action scenarios " "are still failing, exiting kraken run"
        else:
            message = (
                "Cerberus status is not healthy and post action scenarios "
                "are still failing")

    logging.info(message)
    # exit_on_failure turns a failing post action scenario into a hard stop.
    if abort_run:
        sys.exit(1)
|
||||
|
||||
|
||||
def application_status(cerberus_url, start_time, end_time):
    """
    Function to check application availability

    Args:
        cerberus_url
            - url where Cerberus publishes True/False signal

        start_time
            - The time when chaos is injected

        end_time
            - The time when chaos is removed

    Returns:
        Application status and failed routes: a tuple (status, failed_routes)
        where failed_routes is the set of route names listed as failures in
        the Cerberus history and status is True only when that set is empty.
        A missing url or any error while fetching/parsing the history ends
        the process with exit code 1.
    """

    if not cerberus_url:
        logging.error(
            "url where Cerberus publishes True/False signal is not provided.")
        sys.exit(1)

    # The history endpoint is asked for (end_time - start_time) / 60 of
    # history; presumably minutes when the timestamps are epoch seconds.
    duration = (end_time - start_time) / 60
    # Tolerate a configured url that ends with "/" so the request does not
    # go to ".../​/history".
    url = cerberus_url.rstrip("/") + "/history?loopback=" + str(duration)
    logging.info(
        "Scraping the metrics for the test duration from cerberus url: %s", url)
    try:
        metrics = requests.get(url, timeout=60).content
        failures = json.loads(metrics)["history"]["failures"]
        # Only failures of component type "route" count as application downtime.
        failed_routes = {
            entry["name"] for entry in failures if entry["component"] == "route"
        }
    except Exception as e:
        logging.error(
            "Failed to scrape metrics from cerberus API at %s: %s", url, e)
        sys.exit(1)
    return not failed_routes, failed_routes
|
||||
@@ -15,7 +15,6 @@ from arcaflow_plugin_sdk import plugin, validation
|
||||
from kubernetes import client
|
||||
from kubernetes.client.api.apiextensions_v1_api import ApiextensionsV1Api
|
||||
from kubernetes.client.api.custom_objects_api import CustomObjectsApi
|
||||
from . import cerberus
|
||||
|
||||
|
||||
def get_test_pods(
|
||||
@@ -36,7 +35,7 @@ def get_test_pods(
|
||||
- pods matching the label on which network policy
|
||||
need to be applied
|
||||
|
||||
namepsace (string)
|
||||
namespace (string)
|
||||
- namespace in which the pod is present
|
||||
|
||||
kubecli (KrknKubernetes)
|
||||
@@ -1079,9 +1078,6 @@ def pod_outage(
|
||||
job_list = []
|
||||
publish = False
|
||||
|
||||
if params.kraken_config:
|
||||
publish = True
|
||||
|
||||
for i in params.direction:
|
||||
filter_dict[i] = eval(f"params.{i}_ports")
|
||||
|
||||
@@ -1137,11 +1133,6 @@ def pod_outage(
|
||||
start_time = int(time.time())
|
||||
logging.info("Waiting for job to finish")
|
||||
wait_for_job(job_list[:], kubecli, params.test_duration + 300)
|
||||
end_time = int(time.time())
|
||||
if publish:
|
||||
cerberus.publish_kraken_status(
|
||||
params.kraken_config, "", start_time, end_time
|
||||
)
|
||||
|
||||
return "success", PodOutageSuccessOutput(
|
||||
test_pods=pods_list,
|
||||
@@ -1412,24 +1403,13 @@ def pod_egress_shaping(
|
||||
wait_for_job(job_list[:], kubecli, params.test_duration + 20)
|
||||
logging.info("Waiting for wait_duration %s" % params.test_duration)
|
||||
time.sleep(params.test_duration)
|
||||
end_time = int(time.time())
|
||||
if publish:
|
||||
cerberus.publish_kraken_status(
|
||||
params.kraken_config, "", start_time, end_time
|
||||
)
|
||||
if params.execution_type == "parallel":
|
||||
break
|
||||
if params.execution_type == "parallel":
|
||||
logging.info("Waiting for parallel job to finish")
|
||||
start_time = int(time.time())
|
||||
wait_for_job(job_list[:], kubecli, params.test_duration + 300)
|
||||
logging.info("Waiting for wait_duration %s" % params.test_duration)
|
||||
time.sleep(params.test_duration)
|
||||
end_time = int(time.time())
|
||||
if publish:
|
||||
cerberus.publish_kraken_status(
|
||||
params.kraken_config, "", start_time, end_time
|
||||
)
|
||||
|
||||
return "success", PodEgressNetShapingSuccessOutput(
|
||||
test_pods=pods_list,
|
||||
@@ -1696,15 +1676,12 @@ def pod_ingress_shaping(
|
||||
)
|
||||
if params.execution_type == "serial":
|
||||
logging.info("Waiting for serial job to finish")
|
||||
start_time = int(time.time())
|
||||
wait_for_job(job_list[:], kubecli, params.test_duration + 20)
|
||||
logging.info("Waiting for wait_duration %s" % params.test_duration)
|
||||
wait_for_job(job_list[:], kubecli,
|
||||
params.test_duration + 20)
|
||||
logging.info("Waiting for wait_duration %s" %
|
||||
params.test_duration)
|
||||
time.sleep(params.test_duration)
|
||||
end_time = int(time.time())
|
||||
if publish:
|
||||
cerberus.publish_kraken_status(
|
||||
params.kraken_config, "", start_time, end_time
|
||||
)
|
||||
|
||||
if params.execution_type == "parallel":
|
||||
break
|
||||
if params.execution_type == "parallel":
|
||||
@@ -1713,11 +1690,6 @@ def pod_ingress_shaping(
|
||||
wait_for_job(job_list[:], kubecli, params.test_duration + 300)
|
||||
logging.info("Waiting for wait_duration %s" % params.test_duration)
|
||||
time.sleep(params.test_duration)
|
||||
end_time = int(time.time())
|
||||
if publish:
|
||||
cerberus.publish_kraken_status(
|
||||
params.kraken_config, "", start_time, end_time
|
||||
)
|
||||
|
||||
return "success", PodIngressNetShapingSuccessOutput(
|
||||
test_pods=pods_list,
|
||||
|
||||
@@ -10,7 +10,6 @@ from krkn_lib.models.telemetry import ScenarioTelemetry
|
||||
from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift
|
||||
from krkn_lib.utils import get_yaml_item_value, log_exception
|
||||
|
||||
from krkn import cerberus, utils
|
||||
from krkn.scenario_plugins.node_actions import common_node_functions
|
||||
from krkn.scenario_plugins.abstract_scenario_plugin import AbstractScenarioPlugin
|
||||
|
||||
@@ -20,7 +19,6 @@ class NetworkChaosScenarioPlugin(AbstractScenarioPlugin):
|
||||
self,
|
||||
run_uuid: str,
|
||||
scenario: str,
|
||||
krkn_config: dict[str, any],
|
||||
lib_telemetry: KrknTelemetryOpenshift,
|
||||
scenario_telemetry: ScenarioTelemetry,
|
||||
) -> int:
|
||||
@@ -112,34 +110,21 @@ class NetworkChaosScenarioPlugin(AbstractScenarioPlugin):
|
||||
return 1
|
||||
if test_execution == "serial":
|
||||
logging.info("Waiting for serial job to finish")
|
||||
start_time = int(time.time())
|
||||
self.wait_for_job(
|
||||
joblst[:],
|
||||
lib_telemetry.get_lib_kubernetes(),
|
||||
test_duration + 300,
|
||||
)
|
||||
|
||||
end_time = int(time.time())
|
||||
cerberus.publish_kraken_status(
|
||||
krkn_config,
|
||||
None,
|
||||
start_time,
|
||||
end_time,
|
||||
)
|
||||
if test_execution == "parallel":
|
||||
break
|
||||
if test_execution == "parallel":
|
||||
logging.info("Waiting for parallel job to finish")
|
||||
start_time = int(time.time())
|
||||
self.wait_for_job(
|
||||
joblst[:],
|
||||
lib_telemetry.get_lib_kubernetes(),
|
||||
test_duration + 300,
|
||||
)
|
||||
end_time = int(time.time())
|
||||
cerberus.publish_kraken_status(
|
||||
krkn_config, [], start_time, end_time
|
||||
)
|
||||
except Exception as e:
|
||||
logging.error(
|
||||
"NetworkChaosScenarioPlugin exiting due to Exception %s" % e
|
||||
|
||||
@@ -1,5 +1,7 @@
|
||||
import re
|
||||
from dataclasses import dataclass
|
||||
from enum import Enum
|
||||
from typing import TypeVar, Optional
|
||||
|
||||
|
||||
class NetworkChaosScenarioType(Enum):
|
||||
@@ -9,16 +11,21 @@ class NetworkChaosScenarioType(Enum):
|
||||
|
||||
@dataclass
|
||||
class BaseNetworkChaosConfig:
|
||||
supported_execution = ["serial", "parallel"]
|
||||
id: str
|
||||
image: str
|
||||
wait_duration: int
|
||||
test_duration: int
|
||||
label_selector: str
|
||||
service_account: str
|
||||
taints: list[str]
|
||||
namespace: str
|
||||
instance_count: int
|
||||
execution: str
|
||||
namespace: str
|
||||
taints: list[str]
|
||||
supported_execution = ["serial", "parallel"]
|
||||
interfaces: list[str]
|
||||
target: str
|
||||
ingress: bool
|
||||
egress: bool
|
||||
|
||||
def validate(self) -> list[str]:
|
||||
errors = []
|
||||
@@ -41,12 +48,7 @@ class BaseNetworkChaosConfig:
|
||||
|
||||
@dataclass
|
||||
class NetworkFilterConfig(BaseNetworkChaosConfig):
|
||||
ingress: bool
|
||||
egress: bool
|
||||
interfaces: list[str]
|
||||
target: str
|
||||
ports: list[int]
|
||||
image: str
|
||||
protocols: list[str]
|
||||
|
||||
def validate(self) -> list[str]:
|
||||
@@ -58,3 +60,30 @@ class NetworkFilterConfig(BaseNetworkChaosConfig):
|
||||
f"{self.protocols} contains not allowed protocols only tcp and udp is allowed"
|
||||
)
|
||||
return errors
|
||||
|
||||
|
||||
@dataclass
class NetworkChaosConfig(BaseNetworkChaosConfig):
    """
    Settings for a traffic-shaping network chaos scenario, on top of the
    fields inherited from BaseNetworkChaosConfig.
    """

    # delay expressed as <number><us|ms|s>, e.g. "100ms"
    latency: Optional[str] = None
    # packet loss as a plain number, written without the `%` sign
    loss: Optional[str] = None
    # rate expressed as <number><bit|kbit|mbit|gbit|tbit>, e.g. "10mbit"
    bandwidth: Optional[str] = None
    # opt-in flag read by the node scenario before overriding existing tc rules
    force: Optional[bool] = None

    def validate(self) -> list[str]:
        """
        Validate the shaping parameters in addition to the base checks.

        Returns:
            The list of error messages produced by the base class validation
            followed by one message per malformed latency, bandwidth or loss
            value. Unset (None or empty) values are not checked.
        """
        errors = super().validate()
        # fullmatch is used so that a trailing newline is not silently accepted
        latency_regex = re.compile(r"(\d+)(us|ms|s)")
        bandwidth_regex = re.compile(r"(\d+)(bit|kbit|mbit|gbit|tbit)")
        if self.latency and not latency_regex.fullmatch(self.latency):
            errors.append(
                "latency must be a number followed by `us` (microseconds) or `ms` (milliseconds), or `s` (seconds)"
            )
        if self.bandwidth and not bandwidth_regex.fullmatch(self.bandwidth):
            errors.append(
                "bandwidth must be a number followed by `bit`, `kbit`, `mbit`, `gbit` or `tbit`"
            )
        # A value containing `%` is never all digits, so isdigit() covers both cases.
        if self.loss and not self.loss.isdigit():
            errors.append("loss must be a number without the `%` symbol")
        return errors
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
import abc
|
||||
import logging
|
||||
import queue
|
||||
from typing import Tuple
|
||||
|
||||
from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift
|
||||
from krkn.scenario_plugins.network_chaos_ng.models import (
|
||||
@@ -27,7 +28,7 @@ class AbstractNetworkChaosModule(abc.ABC):
|
||||
pass
|
||||
|
||||
@abc.abstractmethod
|
||||
def get_config(self) -> (NetworkChaosScenarioType, BaseNetworkChaosConfig):
|
||||
def get_config(self) -> Tuple[NetworkChaosScenarioType, BaseNetworkChaosConfig]:
|
||||
"""
|
||||
returns the common subset of settings shared by all the scenarios `BaseNetworkChaosConfig` and the type of Network
|
||||
Chaos Scenario that is running (Pod Scenario or Node Scenario)
|
||||
@@ -41,6 +42,42 @@ class AbstractNetworkChaosModule(abc.ABC):
|
||||
|
||||
pass
|
||||
|
||||
def get_node_targets(self, config: BaseNetworkChaosConfig):
    """
    Resolve the nodes a node-level scenario must act on.

    When the module's base config carries a label selector, the nodes
    returned by `list_nodes` for that selector are used. Otherwise
    `config.target` must name a node present in the `list_nodes()` result.

    :param config: scenario configuration providing the `target` node name
    :return: the nodes matching the label selector, or a single-element list
        holding `config.target`
    :raises Exception: when neither a selector nor a target is set, or the
        target node is not found
    """
    selector = self.base_network_config.label_selector
    if selector:
        return self.kubecli.get_lib_kubernetes().list_nodes(selector)

    if not config.target:
        raise Exception(
            "neither node selector nor node_name (target) specified, aborting."
        )

    known_nodes = self.kubecli.get_lib_kubernetes().list_nodes()
    if config.target in known_nodes:
        return [config.target]
    raise Exception(f"node {config.target} not found, aborting")
|
||||
|
||||
def get_pod_targets(self, config: BaseNetworkChaosConfig):
    """
    Resolve the pods a pod-level scenario must act on.

    When the module's base config carries a label selector, the pods
    returned by `list_pods` for `config.namespace` and `config.label_selector`
    are used. Otherwise `config.target` must name a pod that exists in
    `config.namespace`.

    :param config: scenario configuration providing `namespace`,
        `label_selector` and `target`
    :return: the pods matching the label selector, or a single-element list
        holding `config.target`
    :raises Exception: when the namespace is missing, when neither a selector
        nor a target is set, or when the target pod does not exist
    """
    if not config.namespace:
        raise Exception("namespace not specified, aborting")

    # NOTE(review): the selector is tested on self.base_network_config but
    # read from `config` for the query; presumably both are the same object.
    if self.base_network_config.label_selector:
        return self.kubecli.get_lib_kubernetes().list_pods(
            config.namespace, config.label_selector
        )

    if not config.target:
        # The previous text was copied from the node lookup and mentioned
        # "node selector" / "node_name", which is misleading for pods.
        raise Exception(
            "neither pod selector nor pod name (target) specified, aborting."
        )
    if not self.kubecli.get_lib_kubernetes().check_if_pod_exists(
        config.target, config.namespace
    ):
        raise Exception(
            f"pod {config.target} not found in namespace {config.namespace}"
        )
    return [config.target]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
base_network_config: BaseNetworkChaosConfig,
|
||||
|
||||
@@ -0,0 +1,156 @@
|
||||
import queue
|
||||
import time
|
||||
from typing import Tuple
|
||||
|
||||
from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift
|
||||
from krkn_lib.utils import get_random_string
|
||||
|
||||
from krkn.scenario_plugins.network_chaos_ng.models import (
|
||||
NetworkChaosScenarioType,
|
||||
BaseNetworkChaosConfig,
|
||||
NetworkChaosConfig,
|
||||
)
|
||||
from krkn.scenario_plugins.network_chaos_ng.modules.abstract_network_chaos_module import (
|
||||
AbstractNetworkChaosModule,
|
||||
)
|
||||
from krkn.scenario_plugins.network_chaos_ng.modules.utils import (
|
||||
log_info,
|
||||
setup_network_chaos_ng_scenario,
|
||||
log_error,
|
||||
log_warning,
|
||||
)
|
||||
from krkn.scenario_plugins.network_chaos_ng.modules.utils_network_chaos import (
|
||||
common_set_limit_rules,
|
||||
common_delete_limit_rules,
|
||||
node_qdisc_is_simple,
|
||||
)
|
||||
|
||||
|
||||
class NodeNetworkChaosModule(AbstractNetworkChaosModule):
    """
    Network chaos module that applies latency, packet loss and bandwidth
    limits to the network interfaces of a node through a helper pod.
    """

    def __init__(self, config: NetworkChaosConfig, kubecli: KrknTelemetryOpenshift):
        super().__init__(config, kubecli)
        self.config = config

    def run(self, target: str, error_queue: queue.Queue = None):
        """
        Run the scenario against one node.

        :param target: name of the node to disrupt
        :param error_queue: when provided the run is treated as parallel:
            log lines are tagged with the helper pod name and any exception
            is put on the queue as a string instead of being raised
        """
        parallel = bool(error_queue)
        try:
            network_chaos_pod_name = f"node-network-chaos-{get_random_string(5)}"
            container_name = f"fedora-container-{get_random_string(5)}"

            log_info(
                f"creating workload to inject network chaos in node {target} network: "
                f"latency:{self.config.latency or '0'}, "
                f"packet drop:{self.config.loss or '0'}, "
                f"bandwidth restriction:{self.config.bandwidth or '0'}",
                parallel,
                network_chaos_pod_name,
            )

            _, interfaces = setup_network_chaos_ng_scenario(
                self.config,
                target,
                network_chaos_pod_name,
                container_name,
                self.kubecli.get_lib_kubernetes(),
                target,
                parallel,
                True,
            )

            if len(self.config.interfaces) == 0:
                if len(interfaces) == 0:
                    log_error(
                        "no network interface found in pod, impossible to execute the network chaos scenario",
                        parallel,
                        network_chaos_pod_name,
                    )
                    # Do not leave the helper pod behind when giving up early.
                    self.kubecli.get_lib_kubernetes().delete_pod(
                        network_chaos_pod_name, self.config.namespace
                    )
                    return
                log_info(
                    f"detected network interfaces: {','.join(interfaces)}",
                    parallel,
                    network_chaos_pod_name,
                )
            else:
                # Interfaces listed in the config take precedence over detection.
                interfaces = self.config.interfaces

            log_info(
                f"targeting node {target}",
                parallel,
                network_chaos_pod_name,
            )

            # Interfaces whose qdisc is not "simple" already carry tc rules
            # that this scenario would overwrite.
            complex_config_interfaces = []
            for interface in interfaces:
                is_simple = node_qdisc_is_simple(
                    self.kubecli.get_lib_kubernetes(),
                    network_chaos_pod_name,
                    self.config.namespace,
                    interface,
                )
                if not is_simple:
                    complex_config_interfaces.append(interface)

            if len(complex_config_interfaces) > 0 and not self.config.force:
                log_warning(
                    f"node already has tc rules set for {','.join(complex_config_interfaces)}, "
                    "this action might damage the cluster, "
                    "if you want to continue set `force` to True in the node network "
                    "chaos scenario config file and try again",
                    parallel,
                    network_chaos_pod_name,
                )
            else:
                if len(complex_config_interfaces) > 0 and self.config.force:
                    log_warning(
                        f"you are forcing node network configuration override for {','.join(complex_config_interfaces)}, "
                        "this action might lead to unpredictable node behaviour, "
                        "you're doing it in your own responsibility, "
                        "waiting 10 seconds before continuing",
                        parallel,
                        network_chaos_pod_name,
                    )
                    # Grace period that lets the operator abort a forced run.
                    time.sleep(10)
                common_set_limit_rules(
                    self.config.egress,
                    self.config.ingress,
                    interfaces,
                    self.config.bandwidth,
                    self.config.latency,
                    self.config.loss,
                    parallel,
                    network_chaos_pod_name,
                    self.kubecli.get_lib_kubernetes(),
                    network_chaos_pod_name,
                    self.config.namespace,
                    None,
                )

                # Keep the limits in place for the configured test duration.
                time.sleep(self.config.test_duration)

                log_info("removing tc rules", parallel, network_chaos_pod_name)

                common_delete_limit_rules(
                    self.config.egress,
                    self.config.ingress,
                    interfaces,
                    network_chaos_pod_name,
                    self.config.namespace,
                    self.kubecli.get_lib_kubernetes(),
                    None,
                    parallel,
                    network_chaos_pod_name,
                )

            # NOTE(review): an exception raised after the helper pod was
            # created still skips this cleanup (and the tc rule removal).
            self.kubecli.get_lib_kubernetes().delete_pod(
                network_chaos_pod_name, self.config.namespace
            )

        except Exception as e:
            if error_queue is None:
                raise e
            else:
                error_queue.put(str(e))

    def get_config(self) -> Tuple[NetworkChaosScenarioType, BaseNetworkChaosConfig]:
        """Return the scenario type (Node) together with this module's config."""
        return NetworkChaosScenarioType.Node, self.config

    def get_targets(self) -> list[str]:
        """Return the node names to target, resolved by `get_node_targets`."""
        return self.get_node_targets(self.config)
|
||||
@@ -1,5 +1,6 @@
|
||||
import queue
|
||||
import time
|
||||
from typing import Tuple
|
||||
|
||||
from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift
|
||||
from krkn_lib.utils import get_random_string
|
||||
@@ -11,14 +12,16 @@ from krkn.scenario_plugins.network_chaos_ng.models import (
|
||||
from krkn.scenario_plugins.network_chaos_ng.modules.abstract_network_chaos_module import (
|
||||
AbstractNetworkChaosModule,
|
||||
)
|
||||
from krkn.scenario_plugins.network_chaos_ng.modules.utils import log_info
|
||||
from krkn.scenario_plugins.network_chaos_ng.modules.utils import (
|
||||
log_info,
|
||||
deploy_network_chaos_ng_pod,
|
||||
get_pod_default_interface,
|
||||
)
|
||||
|
||||
from krkn.scenario_plugins.network_chaos_ng.modules.utils_network_filter import (
|
||||
deploy_network_filter_pod,
|
||||
apply_network_rules,
|
||||
clean_network_rules,
|
||||
generate_rules,
|
||||
get_default_interface,
|
||||
)
|
||||
|
||||
|
||||
@@ -41,7 +44,7 @@ class NodeNetworkFilterModule(AbstractNetworkChaosModule):
|
||||
)
|
||||
|
||||
pod_name = f"node-filter-{get_random_string(5)}"
|
||||
deploy_network_filter_pod(
|
||||
deploy_network_chaos_ng_pod(
|
||||
self.config,
|
||||
target,
|
||||
pod_name,
|
||||
@@ -50,7 +53,7 @@ class NodeNetworkFilterModule(AbstractNetworkChaosModule):
|
||||
|
||||
if len(self.config.interfaces) == 0:
|
||||
interfaces = [
|
||||
get_default_interface(
|
||||
get_pod_default_interface(
|
||||
pod_name,
|
||||
self.config.namespace,
|
||||
self.kubecli.get_lib_kubernetes(),
|
||||
@@ -108,21 +111,8 @@ class NodeNetworkFilterModule(AbstractNetworkChaosModule):
|
||||
super().__init__(config, kubecli)
|
||||
self.config = config
|
||||
|
||||
def get_config(self) -> (NetworkChaosScenarioType, BaseNetworkChaosConfig):
|
||||
def get_config(self) -> Tuple[NetworkChaosScenarioType, BaseNetworkChaosConfig]:
|
||||
return NetworkChaosScenarioType.Node, self.config
|
||||
|
||||
def get_targets(self) -> list[str]:
|
||||
if self.base_network_config.label_selector:
|
||||
return self.kubecli.get_lib_kubernetes().list_nodes(
|
||||
self.base_network_config.label_selector
|
||||
)
|
||||
else:
|
||||
if not self.config.target:
|
||||
raise Exception(
|
||||
"neither node selector nor node_name (target) specified, aborting."
|
||||
)
|
||||
node_info = self.kubecli.get_lib_kubernetes().list_nodes()
|
||||
if self.config.target not in node_info:
|
||||
raise Exception(f"node {self.config.target} not found, aborting")
|
||||
|
||||
return [self.config.target]
|
||||
return self.get_node_targets(self.config)
|
||||
|
||||
@@ -0,0 +1,159 @@
|
||||
import queue
|
||||
import time
|
||||
from typing import Tuple
|
||||
|
||||
from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift
|
||||
from krkn_lib.utils import get_random_string
|
||||
|
||||
from krkn.scenario_plugins.network_chaos_ng.models import (
|
||||
NetworkChaosScenarioType,
|
||||
BaseNetworkChaosConfig,
|
||||
NetworkChaosConfig,
|
||||
)
|
||||
from krkn.scenario_plugins.network_chaos_ng.modules.abstract_network_chaos_module import (
|
||||
AbstractNetworkChaosModule,
|
||||
)
|
||||
from krkn.scenario_plugins.network_chaos_ng.modules.utils import (
|
||||
log_info,
|
||||
setup_network_chaos_ng_scenario,
|
||||
log_error,
|
||||
)
|
||||
from krkn.scenario_plugins.network_chaos_ng.modules.utils_network_chaos import (
|
||||
common_set_limit_rules,
|
||||
common_delete_limit_rules,
|
||||
)
|
||||
|
||||
|
||||
class PodNetworkChaosModule(AbstractNetworkChaosModule):
    """
    Network chaos module that applies latency, packet loss and bandwidth
    limits to a pod, driven from a helper pod scheduled on the pod's node.
    """

    def __init__(self, config: NetworkChaosConfig, kubecli: KrknTelemetryOpenshift):
        super().__init__(config, kubecli)
        self.config = config

    def run(self, target: str, error_queue: queue.Queue = None):
        """
        Run the scenario against one pod of the configured namespace.

        :param target: name of the pod to disrupt
        :param error_queue: when provided the run is treated as parallel:
            log lines are tagged with the helper pod name and any exception
            is put on the queue as a string instead of being raised
        """
        parallel = bool(error_queue)
        try:
            network_chaos_pod_name = f"pod-network-chaos-{get_random_string(5)}"
            container_name = f"fedora-container-{get_random_string(5)}"
            pod_info = self.kubecli.get_lib_kubernetes().get_pod_info(
                target, self.config.namespace
            )

            log_info(
                f"creating workload to inject network chaos in pod {target} network: "
                f"latency:{self.config.latency or '0'}, "
                f"packet drop:{self.config.loss or '0'}, "
                f"bandwidth restriction:{self.config.bandwidth or '0'}",
                parallel,
                network_chaos_pod_name,
            )

            if not pod_info:
                raise Exception(
                    f"impossible to retrieve infos for pod {target} namespace {self.config.namespace}"
                )

            # The helper pod is placed on the node that hosts the target pod.
            container_ids, interfaces = setup_network_chaos_ng_scenario(
                self.config,
                pod_info.nodeName,
                network_chaos_pod_name,
                container_name,
                self.kubecli.get_lib_kubernetes(),
                target,
                parallel,
                False,
            )

            if len(self.config.interfaces) == 0:
                if len(interfaces) == 0:
                    log_error(
                        "no network interface found in pod, impossible to execute the network chaos scenario",
                        parallel,
                        network_chaos_pod_name,
                    )
                    # Do not leave the helper pod behind when giving up early.
                    self.kubecli.get_lib_kubernetes().delete_pod(
                        network_chaos_pod_name, self.config.namespace
                    )
                    return
                log_info(
                    f"detected network interfaces: {','.join(interfaces)}",
                    parallel,
                    network_chaos_pod_name,
                )
            else:
                # Interfaces listed in the config take precedence over detection.
                interfaces = self.config.interfaces

            if len(container_ids) == 0:
                raise Exception(
                    f"impossible to resolve container id for pod {target} namespace {self.config.namespace}"
                )

            # Only the first container id returned by the setup is used.
            log_info(
                f"targeting container {container_ids[0]}",
                parallel,
                network_chaos_pod_name,
            )

            pids = self.kubecli.get_lib_kubernetes().get_pod_pids(
                base_pod_name=network_chaos_pod_name,
                base_pod_namespace=self.config.namespace,
                base_pod_container_name=container_name,
                pod_name=target,
                pod_namespace=self.config.namespace,
                pod_container_id=container_ids[0],
            )

            if not pids:
                raise Exception(f"impossible to resolve pid for pod {target}")

            log_info(
                f"resolved pids {pids} in node {pod_info.nodeName} for pod {target}",
                parallel,
                network_chaos_pod_name,
            )

            common_set_limit_rules(
                self.config.egress,
                self.config.ingress,
                interfaces,
                self.config.bandwidth,
                self.config.latency,
                self.config.loss,
                parallel,
                network_chaos_pod_name,
                self.kubecli.get_lib_kubernetes(),
                network_chaos_pod_name,
                self.config.namespace,
                pids,
            )

            # Keep the limits in place for the configured test duration.
            time.sleep(self.config.test_duration)

            log_info("removing tc rules", parallel, network_chaos_pod_name)

            common_delete_limit_rules(
                self.config.egress,
                self.config.ingress,
                interfaces,
                network_chaos_pod_name,
                self.config.namespace,
                self.kubecli.get_lib_kubernetes(),
                pids,
                parallel,
                network_chaos_pod_name,
            )

            # NOTE(review): an exception raised after the helper pod was
            # created still skips this cleanup (and the tc rule removal).
            self.kubecli.get_lib_kubernetes().delete_pod(
                network_chaos_pod_name, self.config.namespace
            )

        except Exception as e:
            if error_queue is None:
                raise e
            else:
                error_queue.put(str(e))

    def get_config(self) -> Tuple[NetworkChaosScenarioType, BaseNetworkChaosConfig]:
        """Return the scenario type (Pod) together with this module's config."""
        return NetworkChaosScenarioType.Pod, self.config

    def get_targets(self) -> list[str]:
        """Return the pod names to target, resolved by `get_pod_targets`."""
        return self.get_pod_targets(self.config)
|
||||
@@ -1,6 +1,6 @@
|
||||
import logging
|
||||
import queue
|
||||
import time
|
||||
from typing import Tuple
|
||||
|
||||
from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift
|
||||
from krkn_lib.utils import get_random_string
|
||||
@@ -13,12 +13,17 @@ from krkn.scenario_plugins.network_chaos_ng.models import (
|
||||
from krkn.scenario_plugins.network_chaos_ng.modules.abstract_network_chaos_module import (
|
||||
AbstractNetworkChaosModule,
|
||||
)
|
||||
from krkn.scenario_plugins.network_chaos_ng.modules.utils import log_info, log_error
|
||||
from krkn.scenario_plugins.network_chaos_ng.modules.utils import (
|
||||
log_info,
|
||||
log_error,
|
||||
deploy_network_chaos_ng_pod,
|
||||
get_pod_default_interface,
|
||||
setup_network_chaos_ng_scenario,
|
||||
)
|
||||
from krkn.scenario_plugins.network_chaos_ng.modules.utils_network_filter import (
|
||||
deploy_network_filter_pod,
|
||||
generate_namespaced_rules,
|
||||
apply_network_rules,
|
||||
clean_network_rules_namespaced,
|
||||
generate_namespaced_rules,
|
||||
)
|
||||
|
||||
|
||||
@@ -50,22 +55,18 @@ class PodNetworkFilterModule(AbstractNetworkChaosModule):
|
||||
f"impossible to retrieve infos for pod {self.config.target} namespace {self.config.namespace}"
|
||||
)
|
||||
|
||||
deploy_network_filter_pod(
|
||||
container_ids, interfaces = setup_network_chaos_ng_scenario(
|
||||
self.config,
|
||||
pod_info.nodeName,
|
||||
pod_name,
|
||||
self.kubecli.get_lib_kubernetes(),
|
||||
container_name,
|
||||
host_network=False,
|
||||
self.kubecli.get_lib_kubernetes(),
|
||||
target,
|
||||
parallel,
|
||||
False,
|
||||
)
|
||||
|
||||
if len(self.config.interfaces) == 0:
|
||||
interfaces = (
|
||||
self.kubecli.get_lib_kubernetes().list_pod_network_interfaces(
|
||||
target, self.config.namespace
|
||||
)
|
||||
)
|
||||
|
||||
if len(interfaces) == 0:
|
||||
log_error(
|
||||
"no network interface found in pod, impossible to execute the network filter scenario",
|
||||
@@ -157,26 +158,8 @@ class PodNetworkFilterModule(AbstractNetworkChaosModule):
|
||||
super().__init__(config, kubecli)
|
||||
self.config = config
|
||||
|
||||
def get_config(self) -> (NetworkChaosScenarioType, BaseNetworkChaosConfig):
|
||||
def get_config(self) -> Tuple[NetworkChaosScenarioType, BaseNetworkChaosConfig]:
|
||||
return NetworkChaosScenarioType.Pod, self.config
|
||||
|
||||
def get_targets(self) -> list[str]:
|
||||
if not self.config.namespace:
|
||||
raise Exception("namespace not specified, aborting")
|
||||
if self.base_network_config.label_selector:
|
||||
return self.kubecli.get_lib_kubernetes().list_pods(
|
||||
self.config.namespace, self.config.label_selector
|
||||
)
|
||||
else:
|
||||
if not self.config.target:
|
||||
raise Exception(
|
||||
"neither node selector nor node_name (target) specified, aborting."
|
||||
)
|
||||
if not self.kubecli.get_lib_kubernetes().check_if_pod_exists(
|
||||
self.config.target, self.config.namespace
|
||||
):
|
||||
raise Exception(
|
||||
f"pod {self.config.target} not found in namespace {self.config.namespace}"
|
||||
)
|
||||
|
||||
return [self.config.target]
|
||||
return self.get_pod_targets(self.config)
|
||||
|
||||
@@ -1,4 +1,15 @@
|
||||
import logging
|
||||
import os
|
||||
from typing import Tuple
|
||||
|
||||
import yaml
|
||||
from jinja2 import FileSystemLoader, Environment
|
||||
from krkn_lib.k8s import KrknKubernetes
|
||||
from krkn_lib.models.k8s import Pod
|
||||
|
||||
from krkn.scenario_plugins.network_chaos_ng.models import (
|
||||
BaseNetworkChaosConfig,
|
||||
)
|
||||
|
||||
|
||||
def log_info(message: str, parallel: bool = False, node_name: str = ""):
|
||||
@@ -29,3 +40,101 @@ def log_warning(message: str, parallel: bool = False, node_name: str = ""):
|
||||
logging.warning(f"[{node_name}]: {message}")
|
||||
else:
|
||||
logging.warning(message)
|
||||
|
||||
|
||||
def deploy_network_chaos_ng_pod(
    config: BaseNetworkChaosConfig,
    target_node: str,
    pod_name: str,
    kubecli: KrknKubernetes,
    container_name: str = "fedora",
    host_network: bool = True,
):
    """
    Render the network chaos pod manifest and create the pod.

    The ``templates/network-chaos.j2`` template is loaded relative to the
    directory containing this module. Every entry of ``config.taints`` is
    turned into a toleration dict and handed to the template as ``taints``.

    Args:
        config: scenario configuration; ``taints``, ``namespace``, ``image``
            and ``service_account`` are read from it.
        target_node: value passed to the template as ``target``.
        pod_name: name given to the rendered pod.
        kubecli: client used to create the pod.
        container_name: container name passed to the template.
        host_network: ``host_network`` flag passed to the template.

    Raises:
        ValueError: if a taint string contains no ``:`` separator.
    """
    template_dir = os.path.abspath(os.path.dirname(__file__))
    jinja_env = Environment(loader=FileSystemLoader(template_dir), autoescape=True)
    pod_template = jinja_env.get_template("templates/network-chaos.j2")

    tolerations = []
    for taint in config.taints:
        # Taints are written as "key=value:effect" or "key:effect".
        key_value_part, effect = taint.split(":", 1)
        key, separator, value = key_value_part.partition("=")
        toleration = {
            "key": key,
            "operator": "Equal" if separator else "Exists",
            "effect": effect,
        }
        # A value is only emitted when the taint carried an "=" sign.
        if separator:
            toleration["value"] = value
        tolerations.append(toleration)

    rendered_manifest = pod_template.render(
        pod_name=pod_name,
        namespace=config.namespace,
        host_network=host_network,
        target=target_node,
        container_name=container_name,
        workload_image=config.image,
        taints=tolerations,
        service_account=config.service_account,
    )
    pod_body = yaml.safe_load(rendered_manifest)

    # NOTE(review): 300 is presumably a timeout in seconds - confirm against
    # KrknKubernetes.create_pod.
    kubecli.create_pod(pod_body, config.namespace, 300)
|
||||
|
||||
|
||||
def get_pod_default_interface(
    pod_name: str, namespace: str, kubecli: KrknKubernetes
) -> str:
    """
    Return the name of the interface used by the pod's default route.

    The command prints one interface name per default route. When several
    default routes exist (for instance on multi-homed hosts) only the first
    one is returned, instead of concatenating all the names into a single
    invalid interface name.

    Args:
        pod_name: pod in which the command is executed.
        namespace: namespace of the pod.
        kubecli: client used to execute the command in the pod.

    Returns:
        The first interface name printed by the command, or an empty string
        when the command produced no output.
    """
    cmd = "ip r | grep default | awk '/default/ {print $5}'"
    output = kubecli.exec_cmd_in_pod([cmd], pod_name, namespace)
    # split() discards newlines and blank lines, leaving one token per
    # default route.
    interface_names = output.split() if output else []
    return interface_names[0] if interface_names else ""
|
||||
|
||||
|
||||
def setup_network_chaos_ng_scenario(
    config: BaseNetworkChaosConfig,
    node_name: str,
    pod_name: str,
    container_name: str,
    kubecli: KrknKubernetes,
    target: str,
    parallel: bool,
    host_network: bool,
) -> Tuple[list[str], list[str]]:
    """
    Deploy the chaos pod and resolve the interfaces and container ids to act on.

    Args:
        config: scenario configuration (``interfaces`` and ``namespace`` are
            read here, the rest is forwarded to the pod deployment).
        node_name: forwarded to the pod deployment as the target node.
        pod_name: name of the chaos pod to deploy.
        container_name: name of the container inside the chaos pod.
        kubecli: Kubernetes client.
        target: scenario target; used as log tag and, when ``host_network``
            is false, as the pod whose container ids are resolved.
        parallel: forwarded to the logging helper.
        host_network: whether the chaos pod uses the host network.

    Returns:
        A ``(container_ids, interfaces)`` tuple. ``container_ids`` is empty
        when ``host_network`` is true.
    """
    deploy_network_chaos_ng_pod(
        config,
        node_name,
        pod_name,
        kubecli,
        container_name,
        host_network=host_network,
    )

    if len(config.interfaces) > 0:
        interfaces = config.interfaces
    else:
        # No interface configured: fall back to the pod's default one.
        default_interface = get_pod_default_interface(
            pod_name,
            config.namespace,
            kubecli,
        )
        interfaces = [default_interface]
        log_info(f"detected default interface {interfaces[0]}", parallel, target)

    # Container ids only need resolving when the target is a pod, i.e. when
    # the chaos pod is not on the host network.
    container_ids = (
        [] if host_network else kubecli.get_container_ids(target, config.namespace)
    )

    return container_ids, interfaces
|
||||
|
||||
@@ -0,0 +1,263 @@
|
||||
import subprocess
|
||||
import logging
|
||||
from typing import Optional
|
||||
|
||||
from krkn_lib.k8s import KrknKubernetes
|
||||
|
||||
from krkn.scenario_plugins.network_chaos_ng.modules.utils import (
|
||||
log_info,
|
||||
log_warning,
|
||||
log_error,
|
||||
)
|
||||
|
||||
ROOT_HANDLE = "100:"
|
||||
CLASS_ID = "100:1"
|
||||
NETEM_HANDLE = "101:"
|
||||
|
||||
|
||||
def run(cmd: list[str], check: bool = True) -> subprocess.CompletedProcess:
    """
    Execute a command, capturing stdout and stderr as text.

    Args:
        cmd: command and arguments.
        check: when true, a non-zero exit status raises
            ``subprocess.CalledProcessError``.

    Returns:
        The completed process.
    """
    completed = subprocess.run(
        cmd,
        capture_output=True,
        text=True,
        check=check,
    )
    return completed
|
||||
|
||||
|
||||
def tc_node(args: list[str]) -> subprocess.CompletedProcess:
    """
    Run ``tc`` with the given arguments through ``run``.

    Args:
        args: arguments appended after the ``tc`` executable name.

    Returns:
        The completed process returned by ``run``.
    """
    command = ["tc"] + args
    return run(command)
|
||||
|
||||
|
||||
def get_build_tc_tree_commands(devs: list[str]) -> list[str]:
    """
    Build the tc commands that create the shaping tree on each device.

    For every device three commands are produced, in this order: an htb root
    qdisc, an htb class with a 1gbit rate, and a netem qdisc with zero delay
    and zero loss attached to that class.

    Args:
        devs: network device names.

    Returns:
        The flat list of commands, grouped per device.
    """
    per_device_commands = (
        (
            f"tc qdisc add dev {dev} root handle {ROOT_HANDLE} htb default 1",
            f"tc class add dev {dev} parent {ROOT_HANDLE} classid {CLASS_ID} htb rate 1gbit",
            f"tc qdisc add dev {dev} parent {CLASS_ID} handle {NETEM_HANDLE} netem delay 0ms loss 0%",
        )
        for dev in devs
    )
    return [command for group in per_device_commands for command in group]
|
||||
|
||||
|
||||
def namespaced_tc_commands(pids: list[str], commands: list[str]) -> list[str]:
    """
    Prefix every command with an ``nsenter`` call for every pid.

    Args:
        pids: process ids whose network namespace must be entered.
        commands: commands to wrap.

    Returns:
        All commands for the first pid, then all commands for the second
        pid, and so on.
    """
    namespaced = []
    for pid in pids:
        prefix = f"nsenter --target {pid} --net -- "
        namespaced.extend(f"{prefix}{rule}" for rule in commands)
    return namespaced
|
||||
|
||||
|
||||
def get_egress_shaping_comand(
    devices: list[str],
    rate_mbit: Optional[str],
    delay_ms: Optional[str],
    loss_pct: Optional[str],
) -> list[str]:
    """
    Build the tc commands that change rate, delay and loss on each device.

    Args:
        devices: network device names.
        rate_mbit: rate; ``mbit`` is appended. ``None`` means ``1gbit``.
        delay_ms: delay; ``ms`` is appended. ``None`` means ``0``.
        loss_pct: loss; ``%`` is appended. ``None`` means ``0``.

    Returns:
        Two commands per device: the htb class change followed by the netem
        qdisc change.
    """
    rate = "1gbit" if rate_mbit is None else f"{rate_mbit}mbit"
    delay = 0 if delay_ms is None else delay_ms
    loss = 0 if loss_pct is None else loss_pct

    commands = []
    for dev in devices:
        commands.extend(
            [
                f"tc class change dev {dev} parent {ROOT_HANDLE} classid {CLASS_ID} htb rate {rate}",
                f"tc qdisc change dev {dev} parent {CLASS_ID} handle {NETEM_HANDLE} netem delay {delay}ms loss {loss}%",
            ]
        )
    return commands
|
||||
|
||||
|
||||
def get_clear_egress_shaping_commands(devices: list[str]) -> list[str]:
    """
    Build the tc commands that delete the root qdisc of each device.

    Args:
        devices: network device names.

    Returns:
        One ``tc qdisc del`` command per device.
    """
    clear_commands = []
    for dev in devices:
        clear_commands.append(f"tc qdisc del dev {dev} root handle {ROOT_HANDLE}")
    return clear_commands
|
||||
|
||||
|
||||
def get_ingress_shaping_commands(
    devs: list[str],
    rate_mbit: Optional[str],
    delay_ms: Optional[str],
    loss_pct: Optional[str],
    ifb_dev: str = "ifb0",
) -> list[str]:
    """
    Build the commands that shape ingress traffic through an ifb device.

    The list sets up the ifb device, redirects the ingress traffic of every
    device to it, and then installs an htb/netem tree on the ifb device.
    Every command ends with ``|| true``.

    Args:
        devs: network device names whose ingress traffic is redirected.
        rate_mbit: htb rate, used verbatim; falsy means ``1gbit``.
        delay_ms: netem delay, used verbatim; falsy means ``0ms``.
        loss_pct: netem loss, ``%`` is appended; falsy means ``0``.
        ifb_dev: name of the ifb device.

    Returns:
        The ordered list of shell commands.
    """
    # NOTE(review): rate and delay are interpolated without a unit here,
    # while get_egress_shaping_comand appends "mbit" / "ms" to the same
    # kind of arguments - confirm which format the callers actually pass.
    rate = rate_mbit if rate_mbit else "1gbit"
    delay = delay_ms if delay_ms else "0ms"
    loss = loss_pct if loss_pct else "0"

    commands = [
        "modprobe ifb || true",
        f"ip link add {ifb_dev} type ifb || true",
        f"ip link set {ifb_dev} up || true",
    ]

    for dev in devs:
        commands += [
            f"tc qdisc add dev {dev} handle ffff: ingress || true",
            f"tc filter add dev {dev} parent ffff: protocol all prio 1 matchall action mirred egress redirect dev {ifb_dev} || true",
        ]

    commands += [
        f"tc qdisc add dev {ifb_dev} root handle {ROOT_HANDLE} htb default 1 || true",
        f"tc class add dev {ifb_dev} parent {ROOT_HANDLE} classid {CLASS_ID} htb rate {rate} || true",
        f"tc qdisc add dev {ifb_dev} parent {CLASS_ID} handle {NETEM_HANDLE} netem delay {delay} loss {loss}% || true",
    ]

    return commands
|
||||
|
||||
|
||||
def get_clear_ingress_shaping_commands(
    devs: list[str],
    ifb_dev: str = "ifb0",
) -> list[str]:
    """
    Build the commands that remove ingress shaping and the ifb device.

    Every command ends with ``|| true``.

    Args:
        devs: network device names whose ingress qdisc is deleted.
        ifb_dev: name of the ifb device to tear down.

    Returns:
        The per-device ingress qdisc deletions followed by the ifb root
        qdisc deletion, link down and link deletion.
    """
    ingress_removal = [f"tc qdisc del dev {dev} ingress || true" for dev in devs]
    ifb_teardown = [
        f"tc qdisc del dev {ifb_dev} root handle {ROOT_HANDLE} || true",
        f"ip link set {ifb_dev} down || true",
        f"ip link del {ifb_dev} || true",
    ]
    return ingress_removal + ifb_teardown
|
||||
|
||||
|
||||
def node_qdisc_is_simple(
    kubecli: KrknKubernetes, pod_name, namespace: str, interface: str
) -> bool:
    """
    Tell whether an interface has exactly one qdisc that is not htb, netem or clsact.

    Args:
        kubecli: client used to run ``tc qdisc show`` in the pod.
        pod_name: pod in which the command is executed.
        namespace: namespace of the pod.
        interface: interface to inspect.

    Returns:
        ``True`` when the command output has exactly one non-blank line and
        that line mentions none of ``htb``, ``netem`` or ``clsact``
        (case-insensitive); ``False`` otherwise.
    """
    output = kubecli.exec_cmd_in_pod(
        [f"tc qdisc show dev {interface}"], pod_name, namespace
    )
    entries = [entry for entry in output.splitlines() if entry.strip()]
    if len(entries) != 1:
        return False

    qdisc = entries[0].lower()
    return not any(keyword in qdisc for keyword in ("htb", "netem", "clsact"))
|
||||
|
||||
|
||||
def common_set_limit_rules(
    egress: bool,
    ingress: bool,
    interfaces: list[str],
    bandwidth: str,
    latency: str,
    loss: str,
    parallel: bool,
    target: str,
    kubecli: KrknKubernetes,
    network_chaos_pod_name: str,
    namespace: str,
    pids: Optional[list[str]] = None,
):
    """
    Apply egress and/or ingress shaping by running tc commands in the chaos pod.

    Each command is executed on its own; a falsy result from the pod exec is
    treated as success and logged, anything else counts as a failure. An
    error is logged when every tc-tree command (egress) or every ingress
    command failed.

    Args:
        egress: apply egress shaping.
        ingress: apply ingress shaping.
        interfaces: network device names to shape.
        bandwidth: rate forwarded to the command builders.
        latency: delay forwarded to the command builders.
        loss: loss forwarded to the command builders.
        parallel: forwarded to the logging helpers.
        target: log tag forwarded to the logging helpers.
        kubecli: client used to execute the commands.
        network_chaos_pod_name: pod in which the commands are executed.
        namespace: namespace of that pod.
        pids: when non-empty, every command is wrapped so that it runs in the
            network namespace of each pid.
    """

    def execute(command: str):
        return kubecli.exec_cmd_in_pod([command], network_chaos_pod_name, namespace)

    if egress:
        tree_commands = get_build_tc_tree_commands(interfaces)
        shaping_commands = get_egress_shaping_comand(
            interfaces,
            bandwidth,
            latency,
            loss,
        )
        if pids:
            tree_commands = namespaced_tc_commands(pids, tree_commands)
            shaping_commands = namespaced_tc_commands(pids, shaping_commands)

        failures = 0
        for command in tree_commands:
            if execute(command):
                failures += 1
                continue
            log_info(f"created tc tree in pod: {command}", parallel, target)
        if failures == len(tree_commands):
            log_error(
                "failed to apply egress shaping rules on cluster", parallel, target
            )

        # Failures of the shaping commands themselves are not counted.
        for command in shaping_commands:
            if not execute(command):
                log_info(f"applied egress shaping rules: {command}", parallel, target)

    if ingress:
        ingress_commands = get_ingress_shaping_commands(
            interfaces,
            bandwidth,
            latency,
            loss,
        )
        if pids:
            ingress_commands = namespaced_tc_commands(pids, ingress_commands)

        failures = 0
        for command in ingress_commands:
            if execute(command):
                failures += 1
                continue
            # This message is tagged with the chaos pod name, not the target.
            log_info(
                f"applied ingress shaping rule: {command}",
                parallel,
                network_chaos_pod_name,
            )
        if failures == len(ingress_commands):
            log_error(
                "failed to apply ingress shaping rules on cluster", parallel, target
            )
|
||||
|
||||
|
||||
def common_delete_limit_rules(
    egress: bool,
    ingress: bool,
    interfaces: list[str],
    network_chaos_pod_name: str,
    network_chaos_namespace: str,
    kubecli: KrknKubernetes,
    pids: Optional[list[str]],
    parallel: bool,
    target: str,
):
    """
    Remove egress and/or ingress shaping by running tc commands in the chaos pod.

    Each command is executed on its own; a falsy result from the pod exec is
    treated as success and logged, anything else counts as a failure. An
    error is logged for a direction when all of its commands failed.

    Args:
        egress: remove egress shaping.
        ingress: remove ingress shaping.
        interfaces: network device names to clean up.
        network_chaos_pod_name: pod in which the commands are executed.
        network_chaos_namespace: namespace of that pod.
        kubecli: client used to execute the commands.
        pids: when non-empty, every command is wrapped so that it runs in the
            network namespace of each pid.
        parallel: forwarded to the logging helpers.
        target: log tag forwarded to the logging helpers.
    """

    def clear(commands: list[str], success_prefix: str, failure_message: str):
        # Wrap, execute and report one direction's cleanup commands.
        if pids:
            commands = namespaced_tc_commands(pids, commands)
        failures = 0
        for command in commands:
            outcome = kubecli.exec_cmd_in_pod(
                [command], network_chaos_pod_name, network_chaos_namespace
            )
            if outcome:
                failures += 1
                continue
            log_info(f"{success_prefix}{command}", parallel, target)
        if failures == len(commands):
            log_error(failure_message, parallel, target)

    if egress:
        clear(
            get_clear_egress_shaping_commands(interfaces),
            "removed egress shaping rule : ",
            "failed to remove egress shaping rules on cluster",
        )

    if ingress:
        clear(
            get_clear_ingress_shaping_commands(interfaces),
            "removed ingress shaping rule: ",
            "failed to remove ingress shaping rules on cluster",
        )
|
||||
@@ -1,7 +1,5 @@
|
||||
import os
|
||||
from typing import Tuple
|
||||
|
||||
import yaml
|
||||
from jinja2 import FileSystemLoader, Environment
|
||||
from krkn_lib.k8s import KrknKubernetes
|
||||
|
||||
from krkn.scenario_plugins.network_chaos_ng.models import NetworkFilterConfig
|
||||
@@ -10,7 +8,7 @@ from krkn.scenario_plugins.network_chaos_ng.modules.utils import log_info
|
||||
|
||||
def generate_rules(
|
||||
interfaces: list[str], config: NetworkFilterConfig
|
||||
) -> (list[str], list[str]):
|
||||
) -> Tuple[list[str], list[str]]:
|
||||
input_rules = []
|
||||
output_rules = []
|
||||
for interface in interfaces:
|
||||
@@ -29,72 +27,6 @@ def generate_rules(
|
||||
return input_rules, output_rules
|
||||
|
||||
|
||||
def generate_namespaced_rules(
|
||||
interfaces: list[str], config: NetworkFilterConfig, pids: list[str]
|
||||
) -> (list[str], list[str]):
|
||||
namespaced_input_rules: list[str] = []
|
||||
namespaced_output_rules: list[str] = []
|
||||
input_rules, output_rules = generate_rules(interfaces, config)
|
||||
for pid in pids:
|
||||
ns_input_rules = [
|
||||
f"nsenter --target {pid} --net -- {rule}" for rule in input_rules
|
||||
]
|
||||
ns_output_rules = [
|
||||
f"nsenter --target {pid} --net -- {rule}" for rule in output_rules
|
||||
]
|
||||
namespaced_input_rules.extend(ns_input_rules)
|
||||
namespaced_output_rules.extend(ns_output_rules)
|
||||
|
||||
return namespaced_input_rules, namespaced_output_rules
|
||||
|
||||
|
||||
def deploy_network_filter_pod(
|
||||
config: NetworkFilterConfig,
|
||||
target_node: str,
|
||||
pod_name: str,
|
||||
kubecli: KrknKubernetes,
|
||||
container_name: str = "fedora",
|
||||
host_network: bool = True,
|
||||
):
|
||||
file_loader = FileSystemLoader(os.path.abspath(os.path.dirname(__file__)))
|
||||
env = Environment(loader=file_loader, autoescape=True)
|
||||
pod_template = env.get_template("templates/network-chaos.j2")
|
||||
tolerations = []
|
||||
|
||||
for taint in config.taints:
|
||||
key_value_part, effect = taint.split(":", 1)
|
||||
if "=" in key_value_part:
|
||||
key, value = key_value_part.split("=", 1)
|
||||
operator = "Equal"
|
||||
else:
|
||||
key = key_value_part
|
||||
value = None
|
||||
operator = "Exists"
|
||||
toleration = {
|
||||
"key": key,
|
||||
"operator": operator,
|
||||
"effect": effect,
|
||||
}
|
||||
if value is not None:
|
||||
toleration["value"] = value
|
||||
tolerations.append(toleration)
|
||||
|
||||
pod_body = yaml.safe_load(
|
||||
pod_template.render(
|
||||
pod_name=pod_name,
|
||||
namespace=config.namespace,
|
||||
host_network=host_network,
|
||||
target=target_node,
|
||||
container_name=container_name,
|
||||
workload_image=config.image,
|
||||
taints=tolerations,
|
||||
service_account=config.service_account,
|
||||
)
|
||||
)
|
||||
|
||||
kubecli.create_pod(pod_body, config.namespace, 300)
|
||||
|
||||
|
||||
def apply_network_rules(
|
||||
kubecli: KrknKubernetes,
|
||||
input_rules: list[str],
|
||||
@@ -153,9 +85,20 @@ def clean_network_rules_namespaced(
|
||||
)
|
||||
|
||||
|
||||
def get_default_interface(
|
||||
pod_name: str, namespace: str, kubecli: KrknKubernetes
|
||||
) -> str:
|
||||
cmd = "ip r | grep default | awk '/default/ {print $5}'"
|
||||
output = kubecli.exec_cmd_in_pod([cmd], pod_name, namespace)
|
||||
return output.replace("\n", "")
|
||||
def generate_namespaced_rules(
|
||||
interfaces: list[str], config: NetworkFilterConfig, pids: list[str]
|
||||
) -> Tuple[list[str], list[str]]:
|
||||
namespaced_input_rules: list[str] = []
|
||||
namespaced_output_rules: list[str] = []
|
||||
input_rules, output_rules = generate_rules(interfaces, config)
|
||||
for pid in pids:
|
||||
ns_input_rules = [
|
||||
f"nsenter --target {pid} --net -- {rule}" for rule in input_rules
|
||||
]
|
||||
ns_output_rules = [
|
||||
f"nsenter --target {pid} --net -- {rule}" for rule in output_rules
|
||||
]
|
||||
namespaced_input_rules.extend(ns_input_rules)
|
||||
namespaced_output_rules.extend(ns_output_rules)
|
||||
|
||||
return namespaced_input_rules, namespaced_output_rules
|
||||
|
||||
@@ -1,17 +1,31 @@
|
||||
from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift
|
||||
|
||||
from krkn.scenario_plugins.network_chaos_ng.models import NetworkFilterConfig
|
||||
from krkn.scenario_plugins.network_chaos_ng.models import (
|
||||
NetworkFilterConfig,
|
||||
NetworkChaosConfig,
|
||||
)
|
||||
from krkn.scenario_plugins.network_chaos_ng.modules.abstract_network_chaos_module import (
|
||||
AbstractNetworkChaosModule,
|
||||
)
|
||||
from krkn.scenario_plugins.network_chaos_ng.modules.node_network_chaos import (
|
||||
NodeNetworkChaosModule,
|
||||
)
|
||||
from krkn.scenario_plugins.network_chaos_ng.modules.node_network_filter import (
|
||||
NodeNetworkFilterModule,
|
||||
)
|
||||
from krkn.scenario_plugins.network_chaos_ng.modules.pod_network_chaos import (
|
||||
PodNetworkChaosModule,
|
||||
)
|
||||
from krkn.scenario_plugins.network_chaos_ng.modules.pod_network_filter import (
|
||||
PodNetworkFilterModule,
|
||||
)
|
||||
|
||||
supported_modules = ["node_network_filter", "pod_network_filter"]
|
||||
supported_modules = [
|
||||
"node_network_filter",
|
||||
"pod_network_filter",
|
||||
"pod_network_chaos",
|
||||
"node_network_chaos",
|
||||
]
|
||||
|
||||
|
||||
class NetworkChaosFactory:
|
||||
@@ -26,14 +40,28 @@ class NetworkChaosFactory:
|
||||
raise Exception(f"{config['id']} is not a supported network chaos module")
|
||||
|
||||
if config["id"] == "node_network_filter":
|
||||
config = NetworkFilterConfig(**config)
|
||||
errors = config.validate()
|
||||
scenario_config = NetworkFilterConfig(**config)
|
||||
errors = scenario_config.validate()
|
||||
if len(errors) > 0:
|
||||
raise Exception(f"config validation errors: [{';'.join(errors)}]")
|
||||
return NodeNetworkFilterModule(config, kubecli)
|
||||
return NodeNetworkFilterModule(scenario_config, kubecli)
|
||||
if config["id"] == "pod_network_filter":
|
||||
config = NetworkFilterConfig(**config)
|
||||
errors = config.validate()
|
||||
scenario_config = NetworkFilterConfig(**config)
|
||||
errors = scenario_config.validate()
|
||||
if len(errors) > 0:
|
||||
raise Exception(f"config validation errors: [{';'.join(errors)}]")
|
||||
return PodNetworkFilterModule(config, kubecli)
|
||||
return PodNetworkFilterModule(scenario_config, kubecli)
|
||||
if config["id"] == "pod_network_chaos":
|
||||
scenario_config = NetworkChaosConfig(**config)
|
||||
errors = scenario_config.validate()
|
||||
if len(errors) > 0:
|
||||
raise Exception(f"config validation errors: [{';'.join(errors)}]")
|
||||
return PodNetworkChaosModule(scenario_config, kubecli)
|
||||
if config["id"] == "node_network_chaos":
|
||||
scenario_config = NetworkChaosConfig(**config)
|
||||
errors = scenario_config.validate()
|
||||
if len(errors) > 0:
|
||||
raise Exception(f"config validation errors: [{';'.join(errors)}]")
|
||||
return NodeNetworkChaosModule(scenario_config, kubecli)
|
||||
else:
|
||||
raise Exception(f"invalid network chaos id {config['id']}")
|
||||
|
||||
@@ -22,7 +22,6 @@ class NetworkChaosNgScenarioPlugin(AbstractScenarioPlugin):
|
||||
self,
|
||||
run_uuid: str,
|
||||
scenario: str,
|
||||
krkn_config: dict[str, any],
|
||||
lib_telemetry: KrknTelemetryOpenshift,
|
||||
scenario_telemetry: ScenarioTelemetry,
|
||||
) -> int:
|
||||
|
||||
@@ -11,7 +11,7 @@ def get_node_by_name(node_name_list, kubecli: KrknKubernetes):
|
||||
for node_name in node_name_list:
|
||||
if node_name not in killable_nodes:
|
||||
logging.info(
|
||||
f"Node with provided ${node_name} does not exist or the node might "
|
||||
f"Node with provided {node_name} does not exist or the node might "
|
||||
"be in NotReady state."
|
||||
)
|
||||
return
|
||||
|
||||
@@ -40,7 +40,6 @@ class NodeActionsScenarioPlugin(AbstractScenarioPlugin):
|
||||
self,
|
||||
run_uuid: str,
|
||||
scenario: str,
|
||||
krkn_config: dict[str, any],
|
||||
lib_telemetry: KrknTelemetryOpenshift,
|
||||
scenario_telemetry: ScenarioTelemetry,
|
||||
) -> int:
|
||||
@@ -62,7 +61,7 @@ class NodeActionsScenarioPlugin(AbstractScenarioPlugin):
|
||||
scenario_telemetry,
|
||||
)
|
||||
end_time = int(time.time())
|
||||
cerberus.get_status(krkn_config, start_time, end_time)
|
||||
cerberus.get_status(start_time, end_time)
|
||||
except (RuntimeError, Exception) as e:
|
||||
logging.error("Node Actions exiting due to Exception %s" % e)
|
||||
return 1
|
||||
@@ -196,13 +195,11 @@ class NodeActionsScenarioPlugin(AbstractScenarioPlugin):
|
||||
exclude_nodes = common_node_functions.get_node(
|
||||
exclude_label, 0, kubecli
|
||||
)
|
||||
|
||||
for node in nodes:
|
||||
if node in exclude_nodes:
|
||||
logging.info(
|
||||
f"excluding node {node} with exclude label {exclude_nodes}"
|
||||
)
|
||||
nodes.remove(node)
|
||||
if exclude_nodes:
|
||||
logging.info(
|
||||
f"excluding nodes {exclude_nodes} with exclude label {exclude_label}"
|
||||
)
|
||||
nodes = [node for node in nodes if node not in exclude_nodes]
|
||||
|
||||
# GCP api doesn't support multiprocessing calls, will only actually run 1
|
||||
if parallel_nodes:
|
||||
|
||||
@@ -28,7 +28,6 @@ class PodDisruptionScenarioPlugin(AbstractScenarioPlugin):
|
||||
self,
|
||||
run_uuid: str,
|
||||
scenario: str,
|
||||
krkn_config: dict[str, any],
|
||||
lib_telemetry: KrknTelemetryOpenshift,
|
||||
scenario_telemetry: ScenarioTelemetry,
|
||||
) -> int:
|
||||
|
||||
@@ -9,9 +9,8 @@ import yaml
|
||||
from krkn_lib.k8s import KrknKubernetes
|
||||
from krkn_lib.models.telemetry import ScenarioTelemetry
|
||||
from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift
|
||||
from krkn_lib.utils import get_yaml_item_value, log_exception
|
||||
from krkn_lib.utils import get_yaml_item_value
|
||||
|
||||
from krkn import cerberus, utils
|
||||
from krkn.scenario_plugins.abstract_scenario_plugin import AbstractScenarioPlugin
|
||||
from krkn.rollback.config import RollbackContent
|
||||
from krkn.rollback.handler import set_rollback_context_decorator
|
||||
@@ -23,7 +22,6 @@ class PvcScenarioPlugin(AbstractScenarioPlugin):
|
||||
self,
|
||||
run_uuid: str,
|
||||
scenario: str,
|
||||
krkn_config: dict[str, any],
|
||||
lib_telemetry: KrknTelemetryOpenshift,
|
||||
scenario_telemetry: ScenarioTelemetry,
|
||||
) -> int:
|
||||
@@ -181,7 +179,6 @@ class PvcScenarioPlugin(AbstractScenarioPlugin):
|
||||
)
|
||||
)
|
||||
|
||||
start_time = int(time.time())
|
||||
# Create temp file in the PVC
|
||||
full_path = "%s/%s" % (str(mount_path), str(file_name))
|
||||
|
||||
@@ -285,8 +282,6 @@ class PvcScenarioPlugin(AbstractScenarioPlugin):
|
||||
file_size_kb,
|
||||
lib_telemetry.get_lib_kubernetes(),
|
||||
)
|
||||
end_time = int(time.time())
|
||||
cerberus.publish_kraken_status(krkn_config, [], start_time, end_time)
|
||||
except (RuntimeError, Exception) as e:
|
||||
logging.error("PvcScenarioPlugin exiting due to Exception %s" % e)
|
||||
return 1
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
import importlib
|
||||
import inspect
|
||||
import pkgutil
|
||||
from typing import Type, Tuple, Optional
|
||||
from typing import Type, Tuple, Optional, Any
|
||||
from krkn.scenario_plugins.abstract_scenario_plugin import AbstractScenarioPlugin
|
||||
|
||||
|
||||
@@ -11,7 +11,7 @@ class ScenarioPluginNotFound(Exception):
|
||||
|
||||
class ScenarioPluginFactory:
|
||||
|
||||
loaded_plugins: dict[str, any] = {}
|
||||
loaded_plugins: dict[str, Any] = {}
|
||||
failed_plugins: list[Tuple[str, str, str]] = []
|
||||
package_name = None
|
||||
|
||||
|
||||
@@ -6,9 +6,8 @@ import yaml
|
||||
from krkn_lib.k8s import KrknKubernetes
|
||||
from krkn_lib.models.telemetry import ScenarioTelemetry
|
||||
from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift
|
||||
from krkn_lib.utils import get_yaml_item_value, log_exception
|
||||
from krkn_lib.utils import get_yaml_item_value
|
||||
|
||||
from krkn import cerberus, utils
|
||||
from krkn.scenario_plugins.abstract_scenario_plugin import AbstractScenarioPlugin
|
||||
|
||||
|
||||
@@ -17,7 +16,6 @@ class ServiceDisruptionScenarioPlugin(AbstractScenarioPlugin):
|
||||
self,
|
||||
run_uuid: str,
|
||||
scenario: str,
|
||||
krkn_config: dict[str, any],
|
||||
lib_telemetry: KrknTelemetryOpenshift,
|
||||
scenario_telemetry: ScenarioTelemetry,
|
||||
) -> int:
|
||||
@@ -59,8 +57,6 @@ class ServiceDisruptionScenarioPlugin(AbstractScenarioPlugin):
|
||||
+ str(run_sleep)
|
||||
+ str(wait_time)
|
||||
)
|
||||
logging.info("done")
|
||||
start_time = int(time.time())
|
||||
for i in range(run_count):
|
||||
killed_namespaces = {}
|
||||
namespaces = (
|
||||
@@ -114,10 +110,6 @@ class ServiceDisruptionScenarioPlugin(AbstractScenarioPlugin):
|
||||
)
|
||||
time.sleep(run_sleep)
|
||||
|
||||
end_time = int(time.time())
|
||||
cerberus.publish_kraken_status(
|
||||
krkn_config, [], start_time, end_time
|
||||
)
|
||||
except (Exception, RuntimeError) as e:
|
||||
logging.error(
|
||||
"ServiceDisruptionScenarioPlugin exiting due to Exception %s" % e
|
||||
|
||||
@@ -16,7 +16,6 @@ class ServiceHijackingScenarioPlugin(AbstractScenarioPlugin):
|
||||
self,
|
||||
run_uuid: str,
|
||||
scenario: str,
|
||||
krkn_config: dict[str, any],
|
||||
lib_telemetry: KrknTelemetryOpenshift,
|
||||
scenario_telemetry: ScenarioTelemetry,
|
||||
) -> int:
|
||||
|
||||
@@ -7,7 +7,6 @@ from krkn_lib.k8s import KrknKubernetes
|
||||
from krkn_lib.models.telemetry import ScenarioTelemetry
|
||||
from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift
|
||||
|
||||
from krkn import cerberus
|
||||
from krkn.scenario_plugins.abstract_scenario_plugin import AbstractScenarioPlugin
|
||||
from krkn.scenario_plugins.node_actions.aws_node_scenarios import AWS
|
||||
from krkn.scenario_plugins.node_actions.az_node_scenarios import Azure
|
||||
@@ -24,7 +23,6 @@ class ShutDownScenarioPlugin(AbstractScenarioPlugin):
|
||||
self,
|
||||
run_uuid: str,
|
||||
scenario: str,
|
||||
krkn_config: dict[str, any],
|
||||
lib_telemetry: KrknTelemetryOpenshift,
|
||||
scenario_telemetry: ScenarioTelemetry,
|
||||
) -> int:
|
||||
@@ -34,15 +32,12 @@ class ShutDownScenarioPlugin(AbstractScenarioPlugin):
|
||||
shut_down_config_scenario = shut_down_config_yaml[
|
||||
"cluster_shut_down_scenario"
|
||||
]
|
||||
start_time = int(time.time())
|
||||
affected_nodes_status = AffectedNodeStatus()
|
||||
self.cluster_shut_down(
|
||||
shut_down_config_scenario, lib_telemetry.get_lib_kubernetes(), affected_nodes_status
|
||||
)
|
||||
|
||||
scenario_telemetry.affected_nodes = affected_nodes_status.affected_nodes
|
||||
end_time = int(time.time())
|
||||
cerberus.publish_kraken_status(krkn_config, [], start_time, end_time)
|
||||
return 0
|
||||
except Exception as e:
|
||||
logging.error(
|
||||
|
||||
@@ -19,7 +19,6 @@ class SynFloodScenarioPlugin(AbstractScenarioPlugin):
|
||||
self,
|
||||
run_uuid: str,
|
||||
scenario: str,
|
||||
krkn_config: dict[str, any],
|
||||
lib_telemetry: KrknTelemetryOpenshift,
|
||||
scenario_telemetry: ScenarioTelemetry,
|
||||
) -> int:
|
||||
|
||||
@@ -11,7 +11,6 @@ from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift
|
||||
from krkn_lib.utils import get_random_string, get_yaml_item_value, log_exception
|
||||
from kubernetes.client import ApiException
|
||||
|
||||
from krkn import cerberus, utils
|
||||
from krkn.scenario_plugins.abstract_scenario_plugin import AbstractScenarioPlugin
|
||||
|
||||
|
||||
@@ -20,7 +19,6 @@ class TimeActionsScenarioPlugin(AbstractScenarioPlugin):
|
||||
self,
|
||||
run_uuid: str,
|
||||
scenario: str,
|
||||
krkn_config: dict[str, any],
|
||||
lib_telemetry: KrknTelemetryOpenshift,
|
||||
scenario_telemetry: ScenarioTelemetry,
|
||||
) -> int:
|
||||
@@ -28,7 +26,6 @@ class TimeActionsScenarioPlugin(AbstractScenarioPlugin):
|
||||
with open(scenario, "r") as f:
|
||||
scenario_config = yaml.full_load(f)
|
||||
for time_scenario in scenario_config["time_scenarios"]:
|
||||
start_time = int(time.time())
|
||||
object_type, object_names = self.skew_time(
|
||||
time_scenario, lib_telemetry.get_lib_kubernetes()
|
||||
)
|
||||
@@ -39,11 +36,7 @@ class TimeActionsScenarioPlugin(AbstractScenarioPlugin):
|
||||
)
|
||||
if len(not_reset) > 0:
|
||||
logging.info("Object times were not reset")
|
||||
end_time = int(time.time())
|
||||
cerberus.publish_kraken_status(
|
||||
krkn_config, not_reset, start_time, end_time
|
||||
)
|
||||
except (RuntimeError, Exception) as e:
|
||||
except (RuntimeError, Exception):
|
||||
logging.error(
|
||||
f"TimeActionsScenarioPlugin scenario {scenario} failed with exception: {e}"
|
||||
)
|
||||
|
||||
@@ -11,9 +11,8 @@ from krkn_lib.models.k8s import AffectedNodeStatus
|
||||
from krkn_lib.models.telemetry import ScenarioTelemetry
|
||||
from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift
|
||||
|
||||
from krkn_lib.utils import get_yaml_item_value
|
||||
from krkn.scenario_plugins.abstract_scenario_plugin import AbstractScenarioPlugin
|
||||
from krkn.scenario_plugins.native.network import cerberus
|
||||
from krkn_lib.utils import get_yaml_item_value
|
||||
|
||||
from krkn.scenario_plugins.node_actions.aws_node_scenarios import AWS
|
||||
from krkn.scenario_plugins.node_actions.gcp_node_scenarios import gcp_node_scenarios
|
||||
@@ -23,7 +22,6 @@ class ZoneOutageScenarioPlugin(AbstractScenarioPlugin):
|
||||
self,
|
||||
run_uuid: str,
|
||||
scenario: str,
|
||||
krkn_config: dict[str, any],
|
||||
lib_telemetry: KrknTelemetryOpenshift,
|
||||
scenario_telemetry: ScenarioTelemetry,
|
||||
) -> int:
|
||||
@@ -52,8 +50,6 @@ class ZoneOutageScenarioPlugin(AbstractScenarioPlugin):
|
||||
)
|
||||
return 1
|
||||
|
||||
end_time = int(time.time())
|
||||
cerberus.publish_kraken_status(krkn_config, [], start_time, end_time)
|
||||
except (RuntimeError, Exception) as e:
|
||||
logging.error(
|
||||
f"ZoneOutageScenarioPlugin scenario {scenario} failed with exception: {e}"
|
||||
|
||||
71
krkn/utils/ErrorCollectionHandler.py
Normal file
71
krkn/utils/ErrorCollectionHandler.py
Normal file
@@ -0,0 +1,71 @@
|
||||
import logging
|
||||
import threading
|
||||
from datetime import datetime, timezone
|
||||
from krkn.utils.ErrorLog import ErrorLog
|
||||
|
||||
|
||||
class ErrorCollectionHandler(logging.Handler):
    """
    Logging handler that keeps ERROR and CRITICAL records in memory.

    Each captured record is stored as an ErrorLog (UTC timestamp plus the
    formatted message) so it can be retrieved later for telemetry.
    Appending, reading and clearing the collection are guarded by a lock.
    """

    def __init__(self, level=logging.ERROR):
        """
        Create the handler with an empty collection.

        Args:
            level: minimum level handled by this handler (default: ERROR)
        """
        super().__init__(level)
        self.error_logs: list[ErrorLog] = []
        self._lock = threading.Lock()

    def emit(self, record: logging.LogRecord):
        """
        Store the record as an ErrorLog when its level is ERROR or above.

        Args:
            record: LogRecord received from the logging framework
        """
        try:
            # Records below ERROR (40) are ignored regardless of the
            # handler level.
            if record.levelno >= logging.ERROR:
                created_utc = datetime.fromtimestamp(record.created, tz=timezone.utc)
                # ISO 8601 with millisecond precision: drop the last three
                # digits of the microseconds and add the UTC designator.
                millis = created_utc.strftime("%Y-%m-%dT%H:%M:%S.%f")[:-3]
                entry = ErrorLog(
                    timestamp=millis + "Z",
                    message=record.getMessage(),
                )
                with self._lock:
                    self.error_logs.append(entry)
        except Exception:
            # Failures are delegated to the standard handler error hook
            # instead of propagating to the code that logged.
            self.handleError(record)

    def get_error_logs(self) -> list[dict]:
        """
        Return every collected error log as a dictionary.

        Returns:
            A new list with one ``to_dict()`` result per stored ErrorLog
        """
        with self._lock:
            snapshot = []
            for entry in self.error_logs:
                snapshot.append(entry.to_dict())
            return snapshot

    def clear(self):
        """Remove every collected error log (useful for testing)"""
        with self._lock:
            del self.error_logs[:]
|
||||
18
krkn/utils/ErrorLog.py
Normal file
18
krkn/utils/ErrorLog.py
Normal file
@@ -0,0 +1,18 @@
|
||||
from dataclasses import dataclass, asdict
|
||||
|
||||
|
||||
@dataclass
class ErrorLog:
    """
    One captured error log entry, as stored for telemetry.

    Attributes:
        timestamp: timestamp of the entry, as a string
        message: text of the error message
    """

    # Timestamp string supplied by whoever creates the entry.
    timestamp: str
    # Error message text.
    message: str

    def to_dict(self) -> dict:
        """Return the fields of this entry as a plain dictionary"""
        fields_by_name = asdict(self)
        return fields_by_name
|
||||
@@ -77,7 +77,7 @@ class HealthChecker:
|
||||
success_response = {
|
||||
"url": url,
|
||||
"status": True,
|
||||
"status_code": response["status_code"],
|
||||
"status_code": health_check_tracker[url]["status_code"],
|
||||
"start_timestamp": health_check_tracker[url]["start_timestamp"].isoformat(),
|
||||
"end_timestamp": health_check_end_time_stamp.isoformat(),
|
||||
"duration": duration
|
||||
|
||||
@@ -40,16 +40,20 @@ class VirtChecker:
|
||||
self.kube_vm_plugin = KubevirtVmOutageScenarioPlugin()
|
||||
self.kube_vm_plugin.init_clients(k8s_client=krkn_lib)
|
||||
|
||||
self.kube_vm_plugin.get_vmis(vmi_name_match,self.namespace)
|
||||
self.vmis_list = self.kube_vm_plugin.k8s_client.get_vmis(vmi_name_match,self.namespace)
|
||||
except Exception as e:
|
||||
logging.error('Virt Check init exception: ' + str(e))
|
||||
return
|
||||
# See if multiple node names exist
|
||||
node_name_list = [node_name for node_name in self.node_names.split(',') if node_name]
|
||||
for vmi in self.kube_vm_plugin.vmis_list:
|
||||
for vmi in self.vmis_list:
|
||||
node_name = vmi.get("status",{}).get("nodeName")
|
||||
vmi_name = vmi.get("metadata",{}).get("name")
|
||||
ip_address = vmi.get("status",{}).get("interfaces",[])[0].get("ipAddress")
|
||||
interfaces = vmi.get("status",{}).get("interfaces",[])
|
||||
if not interfaces:
|
||||
logging.warning(f"VMI {vmi_name} has no network interfaces, skipping")
|
||||
continue
|
||||
ip_address = interfaces[0].get("ipAddress")
|
||||
namespace = vmi.get("metadata",{}).get("namespace")
|
||||
# If node_name_list exists, only add if node name is in list
|
||||
|
||||
@@ -74,7 +78,8 @@ class VirtChecker:
|
||||
else:
|
||||
logging.debug(f"Disconnected access for {ip_address} on {worker_name} is failed: {output}")
|
||||
vmi = self.kube_vm_plugin.get_vmi(vmi_name,self.namespace)
|
||||
new_ip_address = vmi.get("status",{}).get("interfaces",[])[0].get("ipAddress")
|
||||
interfaces = vmi.get("status",{}).get("interfaces",[])
|
||||
new_ip_address = interfaces[0].get("ipAddress") if interfaces else None
|
||||
new_node_name = vmi.get("status",{}).get("nodeName")
|
||||
# if vm gets deleted, it'll start up with a new ip address
|
||||
if new_ip_address != ip_address:
|
||||
@@ -102,7 +107,7 @@ class VirtChecker:
|
||||
|
||||
def get_vm_access(self, vm_name: str = '', namespace: str = ''):
|
||||
"""
|
||||
This method returns True when the VM is access and an error message when it is not, using virtctl protocol
|
||||
This method returns True when the VM is accessible and an error message when it is not, using virtctl protocol
|
||||
:param vm_name:
|
||||
:param namespace:
|
||||
:return: virtctl_status 'True' if successful, or an error message if it fails.
|
||||
|
||||
@@ -1,2 +1,4 @@
|
||||
from .TeeLogHandler import TeeLogHandler
|
||||
from .ErrorLog import ErrorLog
|
||||
from .ErrorCollectionHandler import ErrorCollectionHandler
|
||||
from .functions import *
|
||||
|
||||
@@ -1,23 +1,23 @@
|
||||
aliyun-python-sdk-core==2.13.36
|
||||
aliyun-python-sdk-ecs==4.24.25
|
||||
arcaflow-plugin-sdk==0.14.0
|
||||
boto3==1.28.61
|
||||
boto3>=1.34.0 # Updated to support urllib3 2.x
|
||||
azure-identity==1.16.1
|
||||
azure-keyvault==4.2.0
|
||||
azure-mgmt-compute==30.5.0
|
||||
azure-mgmt-network==27.0.0
|
||||
coverage==7.6.12
|
||||
datetime==5.4
|
||||
docker>=6.0,<7.0 # docker 7.0+ has breaking changes with Unix sockets
|
||||
docker>=6.0,<7.0 # docker 7.0+ has breaking changes; works with requests<2.32
|
||||
gitpython==3.1.41
|
||||
google-auth==2.37.0
|
||||
google-cloud-compute==1.22.0
|
||||
ibm_cloud_sdk_core==3.18.0
|
||||
ibm_vpc==0.20.0
|
||||
ibm_cloud_sdk_core>=3.20.0 # Requires urllib3>=2.1.0 (compatible with updated boto3)
|
||||
ibm_vpc==0.26.3 # Requires ibm_cloud_sdk_core
|
||||
jinja2==3.1.6
|
||||
krkn-lib==6.0.1
|
||||
lxml==5.1.0
|
||||
kubernetes==34.1.0
|
||||
krkn-lib==6.0.5
|
||||
numpy==1.26.4
|
||||
pandas==2.2.0
|
||||
openshift-client==1.0.21
|
||||
@@ -29,11 +29,13 @@ python-ipmi==0.5.4
|
||||
python-openstackclient==6.5.0
|
||||
requests<2.32 # requests 2.32+ breaks Unix socket support (http+docker scheme)
|
||||
requests-unixsocket>=0.4.0 # Required for Docker Unix socket support
|
||||
urllib3>=2.1.0,<2.4.0 # Compatible with all dependencies
|
||||
service_identity==24.1.0
|
||||
PyYAML==6.0.1
|
||||
setuptools==78.1.1
|
||||
wheel>=0.44.0
|
||||
zope.interface==6.1
|
||||
colorlog==6.10.1
|
||||
|
||||
git+https://github.com/vmware/vsphere-automation-sdk-python.git@v8.0.0.0
|
||||
cryptography>=42.0.4 # not directly required, pinned by Snyk to avoid a vulnerability
|
||||
|
||||
127
run_kraken.py
127
run_kraken.py
@@ -6,28 +6,34 @@ import sys
|
||||
import yaml
|
||||
import logging
|
||||
import optparse
|
||||
from colorlog import ColoredFormatter
|
||||
import pyfiglet
|
||||
import uuid
|
||||
import time
|
||||
import queue
|
||||
import threading
|
||||
from typing import Optional
|
||||
from typing import Optional, Dict
|
||||
|
||||
from krkn import cerberus
|
||||
from krkn_lib.elastic.krkn_elastic import KrknElastic
|
||||
from krkn_lib.models.elastic import ElasticChaosRunTelemetry
|
||||
from krkn_lib.models.krkn import ChaosRunOutput, ChaosRunAlertSummary
|
||||
from krkn_lib.prometheus.krkn_prometheus import KrknPrometheus
|
||||
import krkn.prometheus as prometheus_plugin
|
||||
import server as server
|
||||
from krkn.resiliency.resiliency import (
|
||||
Resiliency
|
||||
)
|
||||
from krkn_lib.k8s import KrknKubernetes
|
||||
from krkn_lib.ocp import KrknOpenshift
|
||||
from krkn_lib.telemetry.k8s import KrknTelemetryKubernetes
|
||||
from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift
|
||||
from krkn_lib.models.telemetry import ChaosRunTelemetry
|
||||
from krkn_lib.models.k8s import ResiliencyReport
|
||||
from krkn_lib.utils import SafeLogger
|
||||
from krkn_lib.utils.functions import get_yaml_item_value, get_junit_test_case
|
||||
|
||||
from krkn.utils import TeeLogHandler
|
||||
from krkn.utils import TeeLogHandler, ErrorCollectionHandler
|
||||
from krkn.utils.HealthChecker import HealthChecker
|
||||
from krkn.utils.VirtChecker import VirtChecker
|
||||
from krkn.scenario_plugins.scenario_plugin_factory import (
|
||||
@@ -52,6 +58,8 @@ def main(options, command: Optional[str]) -> int:
|
||||
print(pyfiglet.figlet_format("kraken"))
|
||||
logging.info("Starting kraken")
|
||||
|
||||
|
||||
|
||||
cfg = options.cfg
|
||||
# Parse and read the config
|
||||
if os.path.isfile(cfg):
|
||||
@@ -63,6 +71,7 @@ def main(options, command: Optional[str]) -> int:
|
||||
get_yaml_item_value(config["kraken"], "kubeconfig_path", "")
|
||||
)
|
||||
kraken_config = cfg
|
||||
|
||||
chaos_scenarios = get_yaml_item_value(config["kraken"], "chaos_scenarios", [])
|
||||
publish_running_status = get_yaml_item_value(
|
||||
config["kraken"], "publish_kraken_status", False
|
||||
@@ -84,14 +93,20 @@ def main(options, command: Optional[str]) -> int:
|
||||
config["kraken"], "signal_address", "0.0.0.0"
|
||||
)
|
||||
run_signal = get_yaml_item_value(config["kraken"], "signal_state", "RUN")
|
||||
|
||||
resiliency_config = get_yaml_item_value(config,"resiliency",{})
|
||||
# Determine execution mode (standalone, detailed, or disabled)
|
||||
run_mode = get_yaml_item_value(resiliency_config, "resiliency_run_mode", "standalone")
|
||||
valid_run_modes = {"standalone", "detailed", "disabled"}
|
||||
if run_mode not in valid_run_modes:
|
||||
logging.warning("Unknown resiliency_run_mode '%s'. Defaulting to 'standalone'", run_mode)
|
||||
run_mode = "standalone"
|
||||
wait_duration = get_yaml_item_value(config["tunings"], "wait_duration", 60)
|
||||
iterations = get_yaml_item_value(config["tunings"], "iterations", 1)
|
||||
daemon_mode = get_yaml_item_value(config["tunings"], "daemon_mode", False)
|
||||
|
||||
prometheus_url = config["performance_monitoring"].get("prometheus_url")
|
||||
prometheus_bearer_token = config["performance_monitoring"].get(
|
||||
"prometheus_bearer_token"
|
||||
)
|
||||
prometheus_bearer_token = config["performance_monitoring"].get("prometheus_bearer_token")
|
||||
run_uuid = config["performance_monitoring"].get("uuid")
|
||||
enable_alerts = get_yaml_item_value(
|
||||
config["performance_monitoring"], "enable_alerts", False
|
||||
@@ -99,9 +114,13 @@ def main(options, command: Optional[str]) -> int:
|
||||
enable_metrics = get_yaml_item_value(
|
||||
config["performance_monitoring"], "enable_metrics", False
|
||||
)
|
||||
|
||||
|
||||
# Default placeholder; will be overridden if a Prometheus URL is available
|
||||
prometheus = None
|
||||
# elastic search
|
||||
enable_elastic = get_yaml_item_value(config["elastic"], "enable_elastic", False)
|
||||
|
||||
elastic_run_tag = get_yaml_item_value(config["elastic"], "run_tag", "")
|
||||
elastic_url = get_yaml_item_value(config["elastic"], "elastic_url", "")
|
||||
|
||||
elastic_verify_certs = get_yaml_item_value(
|
||||
@@ -144,6 +163,9 @@ def main(options, command: Optional[str]) -> int:
|
||||
return -1
|
||||
logging.info("Initializing client to talk to the Kubernetes cluster")
|
||||
|
||||
# Set Cerberus url if enabled
|
||||
cerberus.set_url(config)
|
||||
|
||||
# Generate uuid for the run
|
||||
if run_uuid:
|
||||
logging.info(
|
||||
@@ -226,6 +248,11 @@ def main(options, command: Optional[str]) -> int:
|
||||
else:
|
||||
logging.info("Cluster version CRD not detected, skipping")
|
||||
|
||||
# Final check: ensure Prometheus URL is available; disable resiliency if not
|
||||
if (not prometheus_url or prometheus_url.strip() == "") and run_mode != "disabled":
|
||||
logging.warning("Prometheus URL not provided; disabling resiliency score features.")
|
||||
run_mode = "disabled"
|
||||
|
||||
# KrknTelemetry init
|
||||
telemetry_k8s = KrknTelemetryKubernetes(
|
||||
safe_logger, kubecli, config["telemetry"]
|
||||
@@ -246,9 +273,18 @@ def main(options, command: Optional[str]) -> int:
|
||||
else:
|
||||
elastic_search = None
|
||||
summary = ChaosRunAlertSummary()
|
||||
if enable_metrics or enable_alerts or check_critical_alerts:
|
||||
if enable_metrics or enable_alerts or check_critical_alerts or run_mode != "disabled":
|
||||
prometheus = KrknPrometheus(prometheus_url, prometheus_bearer_token)
|
||||
|
||||
# Quick connectivity probe for Prometheus – disable resiliency if unreachable
|
||||
try:
|
||||
prometheus.process_prom_query_in_range(
|
||||
"up", datetime.datetime.utcnow() - datetime.timedelta(seconds=60), datetime.datetime.utcnow(), granularity=60
|
||||
)
|
||||
except Exception as prom_exc:
|
||||
logging.error("Prometheus connectivity test failed: %s. Disabling resiliency features as Prometheus is required for SLO evaluation.", prom_exc)
|
||||
run_mode = "disabled"
|
||||
resiliency_alerts = get_yaml_item_value(resiliency_config, "resiliency_file", get_yaml_item_value(config['performance_monitoring'],"alert_profile", "config/alerts.yaml"))
|
||||
resiliency_obj = Resiliency(resiliency_alerts) if run_mode != "disabled" else None # Initialize resiliency orchestrator
|
||||
logging.info("Server URL: %s" % kubecli.get_host())
|
||||
|
||||
if command == "list-rollback":
|
||||
@@ -288,6 +324,7 @@ def main(options, command: Optional[str]) -> int:
|
||||
chaos_output = ChaosRunOutput()
|
||||
chaos_telemetry = ChaosRunTelemetry()
|
||||
chaos_telemetry.run_uuid = run_uuid
|
||||
chaos_telemetry.tag = elastic_run_tag
|
||||
scenario_plugin_factory = ScenarioPluginFactory()
|
||||
classes_and_types: dict[str, list[str]] = {}
|
||||
for loaded in scenario_plugin_factory.loaded_plugins.keys():
|
||||
@@ -363,12 +400,24 @@ def main(options, command: Optional[str]) -> int:
|
||||
)
|
||||
sys.exit(-1)
|
||||
|
||||
failed_post_scenarios, scenario_telemetries = (
|
||||
|
||||
batch_window_start_dt = datetime.datetime.utcnow()
|
||||
failed_scenarios_current, scenario_telemetries = (
|
||||
scenario_plugin.run_scenarios(
|
||||
run_uuid, scenarios_list, config, telemetry_ocp
|
||||
)
|
||||
)
|
||||
failed_post_scenarios.extend(failed_scenarios_current)
|
||||
chaos_telemetry.scenarios.extend(scenario_telemetries)
|
||||
batch_window_end_dt = datetime.datetime.utcnow()
|
||||
if resiliency_obj:
|
||||
resiliency_obj.add_scenario_reports(
|
||||
scenario_telemetries=scenario_telemetries,
|
||||
prom_cli=prometheus,
|
||||
scenario_type=scenario_type,
|
||||
batch_start_dt=batch_window_start_dt,
|
||||
batch_end_dt=batch_window_end_dt,
|
||||
)
|
||||
|
||||
post_critical_alerts = 0
|
||||
if check_critical_alerts:
|
||||
@@ -425,16 +474,51 @@ def main(options, command: Optional[str]) -> int:
|
||||
logging.info("collecting Kubernetes cluster metadata....")
|
||||
telemetry_k8s.collect_cluster_metadata(chaos_telemetry)
|
||||
|
||||
# Collect error logs from handler
|
||||
error_logs = error_collection_handler.get_error_logs()
|
||||
if error_logs:
|
||||
logging.info(f"Collected {len(error_logs)} error logs for telemetry")
|
||||
chaos_telemetry.error_logs = error_logs
|
||||
else:
|
||||
logging.info("No error logs collected during chaos run")
|
||||
chaos_telemetry.error_logs = []
|
||||
if resiliency_obj:
|
||||
try:
|
||||
resiliency_obj.attach_compact_to_telemetry(chaos_telemetry)
|
||||
except Exception as exc:
|
||||
logging.error("Failed to embed per-scenario resiliency in telemetry: %s", exc)
|
||||
|
||||
if resiliency_obj:
|
||||
try:
|
||||
resiliency_obj.finalize_and_save(
|
||||
prom_cli=prometheus,
|
||||
total_start_time=datetime.datetime.fromtimestamp(start_time),
|
||||
total_end_time=datetime.datetime.fromtimestamp(end_time),
|
||||
run_mode=run_mode,
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
logging.error("Failed to finalize resiliency scoring: %s", e)
|
||||
|
||||
|
||||
telemetry_json = chaos_telemetry.to_json()
|
||||
decoded_chaos_run_telemetry = ChaosRunTelemetry(json.loads(telemetry_json))
|
||||
if resiliency_obj and hasattr(resiliency_obj, "summary") and resiliency_obj.summary is not None:
|
||||
summary_dict = resiliency_obj.get_summary()
|
||||
decoded_chaos_run_telemetry.overall_resiliency_report = ResiliencyReport(
|
||||
json_object=summary_dict,
|
||||
resiliency_score=summary_dict.get("resiliency_score", 0),
|
||||
passed_slos=summary_dict.get("passed_slos", 0),
|
||||
total_slos=summary_dict.get("total_slos", 0)
|
||||
)
|
||||
chaos_output.telemetry = decoded_chaos_run_telemetry
|
||||
logging.info(f"Chaos data:\n{chaos_output.to_json()}")
|
||||
if enable_elastic:
|
||||
elastic_telemetry = ElasticChaosRunTelemetry(
|
||||
elastic_telemetry = ElasticChaosRunTelemetry(
|
||||
chaos_run_telemetry=decoded_chaos_run_telemetry
|
||||
)
|
||||
result = elastic_search.push_telemetry(
|
||||
elastic_telemetry, elastic_telemetry_index
|
||||
decoded_chaos_run_telemetry, elastic_telemetry_index
|
||||
)
|
||||
if result == -1:
|
||||
safe_logger.error(
|
||||
@@ -646,15 +730,30 @@ if __name__ == "__main__":
|
||||
# If no command or regular execution, continue with existing logic
|
||||
report_file = options.output
|
||||
tee_handler = TeeLogHandler()
|
||||
|
||||
fmt = "%(asctime)s [%(levelname)s] %(message)s"
|
||||
plain = logging.Formatter(fmt)
|
||||
colored = ColoredFormatter(
|
||||
"%(asctime)s [%(log_color)s%(levelname)s%(reset)s] %(message)s",
|
||||
log_colors={'DEBUG': 'white', 'INFO': 'white', 'WARNING': 'yellow', 'ERROR': 'red', 'CRITICAL': 'bold_red'},
|
||||
reset=True, style='%'
|
||||
)
|
||||
file_handler = logging.FileHandler(report_file, mode="w")
|
||||
file_handler.setFormatter(plain)
|
||||
stream_handler = logging.StreamHandler()
|
||||
stream_handler.setFormatter(colored)
|
||||
tee_handler.setFormatter(plain)
|
||||
error_collection_handler = ErrorCollectionHandler(level=logging.ERROR)
|
||||
|
||||
handlers = [
|
||||
logging.FileHandler(report_file, mode="w"),
|
||||
logging.StreamHandler(),
|
||||
file_handler,
|
||||
stream_handler,
|
||||
tee_handler,
|
||||
error_collection_handler,
|
||||
]
|
||||
|
||||
logging.basicConfig(
|
||||
level=logging.DEBUG if options.debug else logging.INFO,
|
||||
format="%(asctime)s [%(levelname)s] %(message)s",
|
||||
handlers=handlers,
|
||||
)
|
||||
option_error = False
|
||||
|
||||
6
scenarios/kind/pod_path_provisioner.yml
Executable file
6
scenarios/kind/pod_path_provisioner.yml
Executable file
@@ -0,0 +1,6 @@
|
||||
# Pod kill scenario targeting pods labelled app=local-path-provisioner
# in namespaces matching "local-path-storage".
# NOTE(review): nesting was reconstructed from a flattened listing; all four
# settings are assumed to belong under `config` — confirm against the plugin schema.
- id: kill-pods
  config:
    namespace_pattern: "local-path-storage"
    label_selector: "app=local-path-provisioner"
    krkn_pod_recovery_time: 20
    kill: 1
|
||||
18
scenarios/kube/node-network-chaos.yml
Normal file
18
scenarios/kube/node-network-chaos.yml
Normal file
@@ -0,0 +1,18 @@
|
||||
# Node network chaos scenario definition.
# `target` holds a <node_name> placeholder; `label_selector` is left empty.
- id: node_network_chaos
  image: "quay.io/krkn-chaos/krkn-network-chaos:latest"
  wait_duration: 1
  test_duration: 60
  label_selector: ""
  service_account: ""
  taints: []
  namespace: 'default'
  instance_count: 1
  target: "<node_name>"
  execution: parallel
  interfaces: []
  ingress: true
  egress: true
  latency: 0s  # supported units are us (microseconds), ms, s
  loss: 10  # percentage
  bandwidth: 1gbit  # supported units are bit kbit mbit gbit tbit
  force: false
|
||||
@@ -4,7 +4,7 @@
|
||||
test_duration: 10
|
||||
label_selector: "<node_selector>"
|
||||
service_account: ""
|
||||
taints: [] # example ["node-role.kubernetes.io/master:NoSchedule"]
|
||||
taints: []
|
||||
namespace: 'default'
|
||||
instance_count: 1
|
||||
execution: parallel
|
||||
|
||||
17
scenarios/kube/pod-network-chaos.yml
Normal file
17
scenarios/kube/pod-network-chaos.yml
Normal file
@@ -0,0 +1,17 @@
|
||||
# Pod network chaos scenario definition.
# `target` holds a <pod_name> placeholder; `label_selector` is left empty.
- id: pod_network_chaos
  image: "quay.io/krkn-chaos/krkn-network-chaos:latest"
  wait_duration: 1
  test_duration: 60
  label_selector: ""
  service_account: ""
  taints: []
  namespace: 'default'
  instance_count: 1
  target: "<pod_name>"
  execution: parallel
  interfaces: []
  ingress: true
  egress: true
  latency: 0s  # supported units are us (microseconds), ms, s
  loss: 10  # percentage
  bandwidth: 1gbit  # supported units are bit kbit mbit gbit tbit
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user