Mirror of https://github.com/krkn-chaos/krkn.git (synced 2026-02-14 09:59:59 +00:00)
initial version of Chaos AI (#606)
* init push
* remove litmus + updated readme
* remove redundant files
* removed generated file + unused reference

Signed-off-by: Sandeep Hans <shans001@in.ibm.com>
Co-authored-by: Naga Ravi Chaitanya Elluri <nelluri@redhat.com>
39 utils/chaos_ai/README.md Normal file
@@ -0,0 +1,39 @@
# aichaos

Enhancing Chaos Engineering with AI-assisted fault injection for better resiliency and non-functional testing.

## Generate Python package wheel file

```
python3.9 generate_wheel_package.py sdist bdist_wheel
```

This creates a Python package file `aichaos-0.0.1-py3-none-any.whl` in the `dist` folder.

## Build Image

```
cd docker
podman build -t aichaos:1.0 .
OR
docker build -t aichaos:1.0 .
```

## Run Chaos AI

```
podman run -v aichaos-config.json:/config/aichaos-config.json --privileged=true --name aichaos -p 5001:5001 aichaos:1.0
OR
docker run -v aichaos-config.json:/config/aichaos-config.json --privileged -v /var/run/docker.sock:/var/run/docker.sock --name aichaos -p 5001:5001 aichaos:1.0
```

The output should look like:

```
$ podman run -v aichaos-config.json:/config/aichaos-config.json --privileged=true --name aichaos -p 5001:5001 aichaos:1.0
 * Serving Flask app 'swagger_api' (lazy loading)
 * Environment: production
   WARNING: This is a development server. Do not use it in a production deployment.
   Use a production WSGI server instead.
 * Debug mode: on
WARNING: This is a development server. Do not use it in a production deployment. Use a production WSGI server instead.
 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5001
 * Running on http://172.17.0.2:5001
```

You can try out the APIs in a browser at http://<server-ip>:5001/apidocs (e.g. http://127.0.0.1:5001/apidocs). To start, try the "GenerateChaos" API with a kubeconfig file and the application URLs to test.
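For example, the `GenerateChaos` endpoint can also be driven programmatically. A minimal sketch using Python `requests`, assuming the server runs locally on port 5001; the kubeconfig path and URLs below are illustrative placeholders matching the form fields defined in `chaosGen.yml`:

```python
import requests

# POST the kubeconfig plus test parameters to the GenerateChaos endpoint
resp = requests.post(
    "http://127.0.0.1:5001/GenerateChaos/",
    files={"file": open("/path/to/kubeconfig", "rb")},
    data={
        "namespace": "robot-shop",
        "podlabels": "service=cart,service=payment",
        "urls": "http://<application-url>:8097/api/cart/health,http://<application-url>:8097/api/payment/health",
    },
)
print(resp.text)  # e.g. "Chaos ID: 0"
```

The returned chaos ID can then be passed to the `GetStatus`, `GetQTable`, `GetEpisodes`, and `GetLog` endpoints (e.g. `GET /GetStatus/<chaosid>`).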
0 utils/chaos_ai/config/experiments/.gitkeep Normal file
21 utils/chaos_ai/docker/Dockerfile Normal file
@@ -0,0 +1,21 @@
FROM bitnami/kubectl:1.20.9 as kubectl
FROM python:3.9
WORKDIR /app
RUN pip3 install --upgrade pip
COPY config config/
COPY requirements.txt .
RUN mkdir -p /app/logs
RUN pip3 install -r requirements.txt

COPY --from=kubectl /opt/bitnami/kubectl/bin/kubectl /usr/local/bin/

COPY swagger_api.py .
ENV PYTHONUNBUFFERED=1

RUN curl -fsSLO https://get.docker.com/builds/Linux/x86_64/docker-17.03.1-ce.tgz && tar --strip-components=1 -xvzf docker-17.03.1-ce.tgz -C /usr/local/bin

RUN apt-get update && apt-get install -y podman

COPY aichaos-0.0.1-py3-none-any.whl .
RUN pip3 install aichaos-0.0.1-py3-none-any.whl
CMD ["python3", "swagger_api.py"]
7 utils/chaos_ai/docker/aichaos-config.json Normal file
@@ -0,0 +1,7 @@
{
    "command": "podman",
    "chaosengine": "kraken",
    "faults": "pod-delete",
    "iterations": 1,
    "maxfaults": 5
}
15 utils/chaos_ai/docker/config/experiments/log.yml Executable file
@@ -0,0 +1,15 @@
Get Log from the Chaos ID.
---
tags:
  - ChaosAI API Results
parameters:
  - name: chaosid
    in: path
    type: string
    required: true
    description: Chaos-ID
responses:
  500:
    description: Error!
  200:
    description: Results for the given Chaos ID.
36 utils/chaos_ai/docker/config/pod-delete.json Normal file
@@ -0,0 +1,36 @@
{
    "apiVersion": "1.0",
    "kind": "ChaosEngine",
    "metadata": {
        "name": "engine-cartns3"
    },
    "spec": {
        "engineState": "active",
        "annotationCheck": "false",
        "appinfo": {
            "appns": "robot-shop",
            "applabel": "service=payment",
            "appkind": "deployment"
        },
        "chaosServiceAccount": "pod-delete-sa",
        "experiments": [
            {
                "name": "pod-delete",
                "spec": {
                    "components": {
                        "env": [
                            {
                                "name": "FORCE",
                                "value": "true"
                            },
                            {
                                "name": "TOTAL_CHAOS_DURATION",
                                "value": "120"
                            }
                        ]
                    }
                }
            }
        ]
    }
}
40 utils/chaos_ai/docker/config/yml/chaosGen.yml Executable file
@@ -0,0 +1,40 @@
Generate chaos on an application deployed on a cluster.
---
tags:
  - ChaosAI API
parameters:
  - name: file
    in: formData
    type: file
    required: true
    description: Kube-config file
  - name: namespace
    in: formData
    type: string
    default: robot-shop
    required: true
    description: Namespace to test
  - name: podlabels
    in: formData
    type: string
    default: service=cart,service=payment
    required: true
    description: Pod labels to test
  - name: nodelabels
    in: formData
    type: string
    required: false
    description: Node labels to test
  - name: urls
    in: formData
    type: string
    default: http://<application-url>:8097/api/cart/health,http://<application-url>:8097/api/payment/health
    required: true
    description: Application URLs to test

responses:
  500:
    description: Error!
  200:
    description: Chaos ID for the initiated chaos.
15 utils/chaos_ai/docker/config/yml/episodes.yml Executable file
@@ -0,0 +1,15 @@
Get Episodes from the Chaos ID.
---
tags:
  - ChaosAI API Results
parameters:
  - name: chaosid
    in: path
    type: string
    required: true
    description: Chaos-ID
responses:
  500:
    description: Error!
  200:
    description: Results for the given Chaos ID.
15 utils/chaos_ai/docker/config/yml/log.yml Executable file
@@ -0,0 +1,15 @@
Get Log from the Chaos ID.
---
tags:
  - ChaosAI API Results
parameters:
  - name: chaosid
    in: path
    type: string
    required: true
    description: Chaos-ID
responses:
  500:
    description: Error!
  200:
    description: Results for the given Chaos ID.
15 utils/chaos_ai/docker/config/yml/qtable.yml Executable file
@@ -0,0 +1,15 @@
Get QTable from the Chaos ID.
---
tags:
  - ChaosAI API Results
parameters:
  - name: chaosid
    in: path
    type: string
    required: true
    description: Chaos-ID
responses:
  500:
    description: Error!
  200:
    description: Results for the given Chaos ID.
15 utils/chaos_ai/docker/config/yml/status.yml Executable file
@@ -0,0 +1,15 @@
Get status of the Chaos ID.
---
tags:
  - ChaosAI API
parameters:
  - name: chaosid
    in: path
    type: string
    required: true
    description: Chaos-ID
responses:
  500:
    description: Error!
  200:
    description: Chaos for the given ID.
6 utils/chaos_ai/docker/requirements.txt Normal file
@@ -0,0 +1,6 @@
numpy
pandas
requests
Flask==2.1.0
Werkzeug==2.2.2
flasgger==0.9.5
186 utils/chaos_ai/docker/swagger_api.py Normal file
@@ -0,0 +1,186 @@
import json, os
import logging
# import numpy as np
# import pandas as pd
import threading
from datetime import datetime
from flask import Flask, request
from flasgger import Swagger
from flasgger.utils import swag_from
# import zipfile
import sys

# sys.path.append("..")
from src.aichaos_main import AIChaos

app = Flask(__name__)
Swagger(app)
flaskdir = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "app", "logs") + '/'


class AIChaosSwagger:
    def __init__(self, flaskdir=''):
        self.flaskdir = flaskdir

    @app.route("/")
    def empty(params=''):
        return "AI Chaos Repository!"

    def startchaos(self, kubeconfigfile, file_id, params):
        print('[StartChaos]', file_id, kubeconfigfile)
        dir = flaskdir
        outfile = ''.join([dir, 'out-', file_id])
        initfile = ''.join([dir, 'init-', file_id])
        with open(initfile, 'w'):
            pass
        if os.path.exists(outfile):
            os.remove(outfile)
        # kubeconfigfile = params['file']
        os.environ["KUBECONFIG"] = kubeconfigfile
        os.system("export KUBECONFIG=" + kubeconfigfile)
        os.system("echo $KUBECONFIG")
        print('setting kubeconfig')
        # defaults, overridden below by the mounted config file if present
        params['command'] = 'podman'
        params['chaosengine'] = 'kraken'
        params['faults'] = 'pod-delete'
        params['iterations'] = 1
        params['maxfaults'] = 5
        if os.path.isfile('/config/aichaos-config.json'):
            with open('/config/aichaos-config.json') as f:
                config_params = json.load(f)
            params['command'] = config_params['command']
            params['chaosengine'] = config_params['chaosengine']
            params['faults'] = config_params['faults']
            params['iterations'] = config_params['iterations']
            params['maxfaults'] = config_params['maxfaults']
        # faults = [f + ':' + p for f in params['faults'].split(',') for p in params['podlabels'].split(',')]
        faults = []
        for f in params['faults'].split(','):
            if f in ['pod-delete']:
                for p in params['podlabels'].split(','):
                    faults.append(f + ':' + p)
            elif f in ['network-chaos', 'node-memory-hog', 'node-cpu-hog']:
                for p in params['nodelabels'].split(','):
                    faults.append(f + ':' + p)
            else:
                pass

        print('#faults:', len(faults), faults)
        states = {'200': 0, '500': 1, '501': 2, '502': 3, '503': 4, '504': 5,
                  '401': 6, '403': 7, '404': 8, '429': 9,
                  'Timeout': 10, 'Other': 11}
        rewards = {'200': -1, '500': 0.8, '501': 0.8, '502': 0.8, '503': 0.8, '504': 0.8,
                   '401': 1, '403': 1, '404': 1, '429': 1,
                   'Timeout': 1, 'Other': 1}
        logfile = self.flaskdir + 'log_' + str(file_id)
        qfile = self.flaskdir + 'qfile_' + str(file_id) + '.csv'
        efile = self.flaskdir + 'efile_' + str(file_id)
        epfile = self.flaskdir + 'episodes_' + str(file_id) + '.json'
        # probe_url = params['probeurl']
        cexp = {'pod-delete': 'pod-delete.json', 'cpu-hog': 'pod-cpu-hog.json',
                'disk-fill': 'disk-fill.json', 'network-loss': 'network-loss.json',
                'network-corruption': 'network-corruption.json', 'io-stress': 'io-stress.json'}
        aichaos = AIChaos(states=states, faults=faults, rewards=rewards,
                          logfile=logfile, qfile=qfile, efile=efile, epfile=epfile,
                          urls=params['urls'].split(','), namespace=params['namespace'],
                          max_faults=int(params['maxfaults']),
                          num_requests=10, timeout=2,
                          chaos_engine=params['chaosengine'],
                          chaos_dir='config/', kubeconfig=kubeconfigfile,
                          loglevel=logging.DEBUG, chaos_experiment=cexp, iterations=int(params['iterations']),
                          command=params['command'])
        print('checking kubeconfig')
        os.system("echo $KUBECONFIG")
        aichaos.start_chaos()

        file = open(outfile, "w")
        file.write('done')
        file.close()
        os.remove(initfile)
        # os.remove(csvfile)
        # ConstraintsInference().remove_temp_files(dir, file_id)
        return 'WRITE'


@app.route('/GenerateChaos/', methods=['POST'])
@swag_from('config/yml/chaosGen.yml')
def chaos_gen():
    dir = flaskdir
    sw = AIChaosSwagger(flaskdir=dir)
    f = request.files['file']
    list = os.listdir(dir)
    # pick the first unused kubeconfig-<i> name as the chaos ID
    for i in range(10000):
        fname = 'kubeconfig-' + str(i)
        if fname not in list:
            break
    kubeconfigfile = ''.join([dir, 'kubeconfig-', str(i)])
    f.save(kubeconfigfile)
    # creating empty file
    open(kubeconfigfile, 'a').close()
    # print('HEADER:', f.headers)
    print('[GenerateChaos] reqs:', request.form.to_dict())
    # print('[GenerateChaos]', f.filename, datetime.now())
    thread = threading.Thread(target=sw.startchaos, args=(kubeconfigfile, str(i), request.form.to_dict()))
    thread.daemon = True
    print(thread.getName())
    thread.start()
    return 'Chaos ID: ' + str(i)


@app.route('/GetStatus/<chaosid>', methods=['GET'])
@swag_from('config/yml/status.yml')
def get_status(chaosid):
    print('[GetStatus]', chaosid, flaskdir)
    epfile = flaskdir + 'episodes_' + str(chaosid) + '.json'
    initfile = ''.join([flaskdir, 'init-', chaosid])
    if os.path.exists(epfile):
        return 'Completed'
    elif os.path.exists(initfile):
        return 'Running'
    else:
        return 'Does not exist'


@app.route('/GetQTable/<chaosid>', methods=['GET'])
@swag_from('config/yml/qtable.yml')
def get_qtable(chaosid):
    print('[GetQTable]', chaosid)
    qfile = flaskdir + 'qfile_' + str(chaosid) + '.csv'
    initfile = ''.join([flaskdir, 'init-', chaosid])
    if os.path.exists(qfile):
        f = open(qfile, "r")
        return f.read()
    elif os.path.exists(initfile):
        return 'Running'
    else:
        return 'Invalid Chaos ID: ' + chaosid


@app.route('/GetEpisodes/<chaosid>', methods=['GET'])
@swag_from('config/yml/episodes.yml')
def get_episodes(chaosid):
    print('[GetEpisodes]', chaosid)
    epfile = flaskdir + 'episodes_' + str(chaosid) + '.json'
    initfile = ''.join([flaskdir, 'init-', chaosid])
    if os.path.exists(epfile):
        f = open(epfile, "r")
        return f.read()
    elif os.path.exists(initfile):
        return 'Running'
    else:
        return 'Invalid Chaos ID: ' + chaosid


@app.route('/GetLog/<chaosid>', methods=['GET'])
@swag_from('config/yml/log.yml')
def get_log(chaosid):
    print('[GetLog]', chaosid)
    epfile = flaskdir + 'log_' + str(chaosid)
    initfile = ''.join([flaskdir, 'init-', chaosid])
    if os.path.exists(epfile):
        f = open(epfile, "r")
        return f.read()
    elif os.path.exists(initfile):
        return 'Running'
    else:
        return 'Invalid Chaos ID: ' + chaosid


if __name__ == '__main__':
    app.run(debug=True, host='0.0.0.0', port='5001')
21 utils/chaos_ai/generate_wheel_package.py Normal file
@@ -0,0 +1,21 @@
import setuptools
# from setuptools_cythonize import get_cmdclass

setuptools.setup(
    # cmdclass=get_cmdclass(),
    name="aichaos",
    version="0.0.1",
    author="Sandeep Hans",
    author_email="shans001@in.ibm.com",
    description="Chaos AI",
    long_description="Chaos Engineering using AI",
    long_description_content_type="text/markdown",
    url="",
    packages=setuptools.find_packages(),
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
    python_requires='>=3.9',
)
10 utils/chaos_ai/requirements.txt Normal file
@@ -0,0 +1,10 @@
numpy
pandas
notebook
jupyterlab
jupyter
seaborn
requests
wheel
Flask==2.1.0
flasgger==0.9.5
0 utils/chaos_ai/src/__init__.py Normal file
213 utils/chaos_ai/src/aichaos.py Normal file
@@ -0,0 +1,213 @@
import json
import os
import random
import sys

import numpy as np
import logging


class AIChaos:
    def __init__(self, states=None, faults=None, rewards=None, pod_names=[], chaos_dir=None,
                 chaos_experiment='experiment.json',
                 chaos_journal='journal.json', iterations=1000, static_run=False):
        self.faults = faults
        self.pod_names = pod_names
        self.states = states
        self.rewards = rewards
        self.episodes = []

        self.chaos_dir = chaos_dir
        self.chaos_experiment = chaos_experiment
        self.chaos_journal = chaos_journal

        self.iterations = iterations
        # Initialize parameters
        self.gamma = 0.75  # Discount factor
        self.alpha = 0.9   # Learning rate

        # Initializing Q-Values
        # self.Q = np.array(np.zeros([9, 9]))
        # self.Q = np.array(np.zeros([len(faults), len(faults)]))
        # currently action is a single fault, later on we will do multiple faults together
        # For multiple faults, the no of cols in q-matrix will be all combinations of faults (infinite)
        # eg. {f1,f2},f3,f4,{f4,f5} - f1,f2 in parallel, then f3, then f4, then f4,f5 in parallel produces end state
        # self.Q = np.array(np.zeros([len(states), len(states)]))
        self.Q = np.array(np.zeros([len(states), len(faults)]))
        self.state_matrix = np.array(np.zeros([len(states), len(states)]))

        # may be Q is a dictionary of dictionaries, for each state there is a dictionary of faults
        # Q = {'500' = {'f1f2f4': 0.3, 'f1': 0.5}, '404' = {'f2': 0.22}}

        self.logger = logging.getLogger()
        # run from old static experiment and journal files
        self.static_run = static_run

    # End state is reached when system is down or return error code like '500','404'
    def get_next_state(self):
        self.logger.info('[GET_NEXT_STATE]')
        f = open(self.chaos_dir + self.chaos_journal)
        data = json.load(f)

        # before the experiment (if before steady state is false, after is null?)
        for probe in data['steady_states']['before']['probes']:
            if not probe['tolerance_met']:
                # start_state = probe['activity']['tolerance']
                # end_state = probe['status']
                start_state, end_state = None, None
                return start_state, end_state

        # after the experiment
        for probe in data['steady_states']['after']['probes']:
            # if probe['output']['status'] == probe['activity']['tolerance']:
            if not probe['tolerance_met']:
                # print(probe)
                start_state = probe['activity']['tolerance']
                end_state = probe['output']['status']
                # end_state = probe['status']
                return start_state, end_state
        # if tolerances for all probes are met
        start_state = probe['activity']['tolerance']
        end_state = probe['activity']['tolerance']
        return start_state, end_state

    def inject_faults(self, fault, pod_name):
        self.logger.info('[INJECT_FAULT] ' + fault)
        f = open(self.chaos_dir + self.chaos_experiment)
        data = json.load(f)
        for m in data['method']:
            if 'provider' in m:
                if fault == 'kill_microservice':
                    m['name'] = 'kill-microservice'
                    m['provider']['module'] = 'chaosk8s.actions'
                    m['provider']['arguments']['name'] = pod_name
                else:
                    m['provider']['arguments']['name_pattern'] = pod_name
                    m['provider']['func'] = fault

                print('[INJECT_FAULT] method:', m)
                # self.logger.info('[INJECT_FAULT] ' + m['provider']['arguments']['name_pattern'])
                # self.logger.info('[INJECT_FAULT] ' + str(m))

        exp_file = self.chaos_dir + 'experiment_' + str(random.randint(1, 10)) + '.json'
        with open(exp_file, 'w') as f:
            json.dump(data, f)
        exp_file = self.chaos_dir + 'experiment.json'
        # execute faults
        # cmd = 'cd ' + self.chaos_dir + ';chaos run ' + self.chaos_experiment
        cmd = 'cd ' + self.chaos_dir + ';chaos run ' + exp_file
        if not self.static_run:
            os.system(cmd)

    def create_episode(self):
        self.logger.info('[CREATE_EPISODE]')
        episode = []
        while True:
            # inject more faults
            # TODO: model - choose faults based on q-learning ...
            fault_pod = random.choice(self.faults)
            fault = fault_pod.split(':')[0]
            pod_name = fault_pod.split(':')[1]
            # fault = random.choice(self.faults)
            # pod_name = random.choice(self.pod_names)
            # fault = lstm_model.get_next_fault(episode)
            # fault = get_max_prob_fault(episode)

            self.inject_faults(fault, pod_name)
            start_state, next_state = self.get_next_state()
            print('[CREATE EPISODE]', start_state, next_state)
            # if before state tolerance is not met
            if start_state is None and next_state is None:
                continue

            episode.append({'fault': fault, 'pod_name': pod_name})
            self.update_q_fault(fault_pod, episode, start_state, next_state)
            # self.update_q_fault(fault, episode, start_state, next_state)
            # if an end_state is reached
            # if next_state is not None:
            if start_state != next_state:
                self.logger.info('[CREATE_EPISODE] EPISODE CREATED:' + str(episode))
                self.logger.info('[CREATE_EPISODE] END STATE:' + str(next_state))
                return episode, start_state, next_state

    def update_q_fault(self, fault, episode, start_state, end_state):
        self.logger.info('[UPDATE_Q]')
        print('[UPDATE_Q] ', str(start_state), str(end_state))
        if end_state is None:
            end_state = start_state

        # reward is dependent on the error response (eg. '404') and length of episode
        reward = self.rewards[str(end_state)] / len(episode)
        current_state = self.states[str(start_state)]
        next_state = self.states[str(end_state)]
        fault_index = self.faults.index(fault)

        TD = reward + \
             self.gamma * self.Q[next_state, np.argmax(self.Q[next_state,])] - \
             self.Q[current_state, fault_index]
        self.Q[current_state, fault_index] += self.alpha * TD

        # update state matrix
        TD_state = reward + \
                   self.gamma * self.state_matrix[next_state, np.argmax(self.state_matrix[next_state,])] - \
                   self.state_matrix[current_state, next_state]
        self.state_matrix[current_state, next_state] += self.alpha * TD_state

    # def update_q(self, episode, start_state, end_state):
    #     self.logger.info('[UPDATE_Q]')
    #     if end_state is None:
    #         end_state = start_state
    #
    #     # reward is dependent on the error response (eg. '404') and length of episode
    #     reward = self.rewards[str(end_state)] / len(episode)
    #     current_state = self.states[str(start_state)]
    #     next_state = self.states[str(end_state)]
    #     TD = reward + \
    #          self.gamma * self.Q[next_state, np.argmax(self.Q[next_state,])] - \
    #          self.Q[current_state, next_state]
    #     self.Q[current_state, next_state] += self.alpha * TD

    def start_chaos(self):
        for i in range(self.iterations):
            episode, start_state, end_state = self.create_episode()
            # update Q matrix
            # will do it with each fault injection
            # self.update_q(episode, start_state, end_state)
            print(self.Q)
            print(self.state_matrix)


def test_chaos():
    svc_list = ['cart', 'catalogue', 'dispatch', 'mongodb', 'mysql', 'payment', 'rabbitmq', 'ratings', 'redis',
                'shipping', 'user', 'web']
    # Define faults
    # faults = ['terminate_pods']
    # faults = ['terminate_pods:' + x for x in pod_names]
    faults = ['kill_microservice:' + x for x in svc_list]
    # Define the states
    states = {
        '200': 0,
        '500': 1,
        '404': 2
    }
    # Define rewards, currently not used
    rewards = {
        '200': 0,
        '500': 0.8,
        '404': 1
    }

    # cdir = '/Users/sandeephans/Downloads/chaos/chaostoolkit-samples-master/service-down-not-visible-to-users/'
    cdir = '/Users/sandeephans/Downloads/openshift/'
    cexp = 'experiment.json'
    cjournal = 'journal.json'

    aichaos = AIChaos(states=states, faults=faults, rewards=rewards,
                      chaos_dir=cdir, chaos_experiment=cexp, chaos_journal=cjournal,
                      static_run=False)
    aichaos.start_chaos()


if __name__ == '__main__':
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)
    test_chaos()
248 utils/chaos_ai/src/aichaos_main.py Normal file
@@ -0,0 +1,248 @@
import json
import os
import random

import numpy as np
import pandas as pd
import logging

# sys.path.insert(1, os.path.join(sys.path[0], '..'))
import src.utils as utils
from src.kraken_utils import KrakenUtils
from src.qlearning import QLearning
from src.test_application import TestApplication


class AIChaos:
    def __init__(self, namespace='robot-shop', states=None, faults=None, rewards=None, urls=[], max_faults=5,
                 service_weights=None, ctd_subsets=None, pod_names=[], chaos_dir='../config/', kubeconfig='~/.kube/config',
                 chaos_experiment='experiment.json', logfile='log', qfile='qfile.csv', efile='efile', epfile='episodes.json',
                 loglevel=logging.INFO,
                 chaos_journal='journal.json', iterations=10, alpha=0.9, gamma=0.2, epsilon=0.3,
                 num_requests=10, sleep_time=1, timeout=2, chaos_engine='kraken', dstk_probes=None,
                 static_run=False, all_faults=False, command='podman'):
        self.namespace = namespace
        self.faults = faults
        self.unused_faults = faults.copy()
        self.all_faults = all_faults
        self.pod_names = pod_names
        self.states = states
        self.rewards = rewards
        self.urls = urls
        self.max_faults = max_faults
        self.episodes = []
        self.service_weights = service_weights
        self.ctd_subsets = ctd_subsets

        self.kubeconfig = kubeconfig
        self.chaos_dir = chaos_dir
        self.chaos_experiment = chaos_experiment
        self.chaos_journal = chaos_journal
        self.command = command

        if chaos_engine == 'kraken':
            self.chaos_engine = KrakenUtils(namespace, kubeconfig=kubeconfig, chaos_dir=chaos_dir, chaos_experiment=chaos_experiment, command=self.command)
        else:
            self.chaos_engine = None

        self.iterations = iterations
        # Initialize RL parameters
        self.epsilon = epsilon  # epsilon decay policy
        # self.epsdecay = 0

        # log files
        self.logfile = logfile
        self.qfile = qfile
        self.efile = efile
        self.epfile = epfile
        open(efile, 'w+').close()
        open(logfile, 'w+').close()
        open(logfile, 'r+').truncate(0)
        logging.getLogger("requests").setLevel(logging.WARNING)
        logging.getLogger("urllib3").setLevel(logging.WARNING)
        logging.basicConfig(filename=logfile, filemode='w+', level=loglevel)
        self.logger = logging.getLogger(logfile.replace('/', ''))
        self.logger.addHandler(logging.FileHandler(logfile))

        self.testapp = TestApplication(num_requests, timeout, sleep_time)
        self.ql = QLearning(gamma, alpha, faults, states, rewards, urls)

        # run from old static experiment and journal files
        self.static_run = static_run

    def realistic(self, faults_pods):
        self.logger.debug('[Realistic] ' + str(faults_pods))
        fp = faults_pods.copy()
        for f1 in faults_pods:
            for f2 in faults_pods:
                if f1 == f2:
                    continue
                if f1 in fp and f2 in fp:
                    f1_fault, load_1 = utils.get_load(f1.split(':')[0])
                    f1_pod = f1.split(':')[1]
                    f2_fault, load_2 = utils.get_load(f2.split(':')[0])
                    f2_pod = f2.split(':')[1]
                    if f1_pod == f2_pod:
                        if f1_fault == 'pod-delete':
                            fp.remove(f2)
                        if f1_fault == f2_fault:
                            # if int(load_1) > int(load_2):
                            # randomly remove one fault from same faults with different params
                            fp.remove(f2)
        if self.service_weights is None:
            return fp

        fp_copy = fp.copy()
        for f in fp:
            f_fault = f.split(':')[0]
            f_pod = f.split(':')[1].replace('service=', '')
            self.logger.debug('[ServiceWeights] ' + f + ' ' + str(self.service_weights[f_pod][f_fault]))
            if self.service_weights[f_pod][f_fault] == 0:
                fp_copy.remove(f)

        self.logger.debug('[Realistic] ' + str(fp_copy))
        return fp_copy

    def select_faults(self):
        max_faults = min(self.max_faults, len(self.unused_faults))
        num_faults = random.randint(1, max_faults)
        if self.all_faults:
            num_faults = len(self.unused_faults)
        if random.random() > self.epsilon:
            self.logger.info('[Exploration]')
            # faults_pods = random.sample(self.faults, k=num_faults)
            # using used faults list to avoid starvation
            faults_pods = random.sample(self.unused_faults, k=num_faults)
            faults_pods = self.realistic(faults_pods)
            for f in faults_pods:
                self.unused_faults.remove(f)
            if len(self.unused_faults) == 0:
                self.unused_faults = self.faults.copy()
        else:
            self.logger.info('[Exploitation]')
            first_row = self.ql.Q[:, 0, :][0]
            top_k_indices = np.argpartition(first_row, -num_faults)[-num_faults:]
            faults_pods = [self.faults[i] for i in top_k_indices]
            faults_pods = self.realistic(faults_pods)

        return faults_pods

    def create_episode(self, ctd_subset=None):
        self.logger.debug('[CREATE_EPISODE]')
        episode = []

        if ctd_subset is None:
            faults_pods = self.select_faults()
        else:
            faults_pods = ctd_subset
            self.logger.info('CTD Subset: ' + str(faults_pods))

        # faults_pods = self.realistic(faults_pods)
        if len(faults_pods) == 0:
            return [], 200, 200

        engines = []
        for fp in faults_pods:
            fault = fp.split(':')[0]
            pod_name = fp.split(':')[1]
            engine = self.chaos_engine.inject_faults(fault, pod_name)
            engines.append(engine)
            episode.append({'fault': fault, 'pod_name': pod_name})
        self.logger.info('[create_episode]' + str(faults_pods))
        engines_running = self.chaos_engine.wait_engines(engines)
        self.logger.info('[create_episode] engines_running' + str(engines_running))
        if not engines_running:
            return None, None, None

        # randomly shuffling urls
        urls = random.sample(self.urls, len(self.urls))
        ep_json = []
        for url in urls:
            start_state, next_state = self.testapp.test_load(url)
            self.logger.info('[CREATE EPISODE]' + str(start_state) + ',' + str(next_state))
            # if before state tolerance is not met
            if start_state is None and next_state is None:
                # self.cleanup()
                self.chaos_engine.stop_engines()
                continue

            ### episode.append({'fault': fault, 'pod_name': pod_name})
            # self.update_q_fault(fault_pod, episode, start_state, next_state)
            url_index = self.urls.index(url)
            self.logger.info('[CREATEEPISODE]' + str(url) + ':' + str(url_index))
            for fp in faults_pods:
                self.ql.update_q_fault(fp, episode, start_state, next_state, self.urls.index(url))
            ep_json.append({'start_state': start_state, 'next_state': next_state, 'url': url, 'faults': episode})

        self.logger.debug('[CREATE_EPISODE] EPISODE CREATED:' + str(episode))
        self.logger.debug('[CREATE_EPISODE] END STATE:' + str(next_state))

        self.chaos_engine.print_result(engines)
        self.chaos_engine.stop_engines(episode=episode)
        # ep_json = {'start_state': start_state, 'next_state': next_state, 'faults': episode}

        return ep_json, start_state, next_state

    def start_chaos(self):
        self.logger.info('[INITIALIZING]')
        self.logger.info('Logfile: ' + self.logfile)
        self.logger.info('Loggerfile: ' + self.logger.handlers[0].stream.name)
        self.logger.info('Chaos Engine: ' + self.chaos_engine.get_name())
        self.logger.debug('Faults:' + str(self.faults))

        self.chaos_engine.cleanup()
        if self.ctd_subsets is None:
            for i in range(self.iterations):
                episode, start_state, end_state = self.create_episode()
                self.logger.debug('[start_chaos]' + str(i) + ' ' + str(episode))
                if episode is None:
                    continue
                # update Q matrix
                # will do it with each fault injection
                # self.update_q(episode, start_state, end_state)
                # if episode['next_state'] != '200':
                self.episodes.extend(episode)
                self.logger.info(str(i) + ' ' + str(self.ql.Q[:, 0]))
                # print(i, self.state_matrix)
                self.write_q()
                self.write_episode(episode)
        else:
            for i, subset in enumerate(self.ctd_subsets):
                episode, start_state, end_state = self.create_episode(subset)
                self.logger.debug('[start_chaos]' + str(episode))
                if episode is None:
                    continue
                self.episodes.append(episode)
                self.logger.info(str(i) + ' ' + str(self.ql.Q[:, 0]))
                self.write_q()
                self.write_episode(episode)

        self.chaos_engine.cleanup()
        # self.remove_temp_file()
        with open(self.epfile, 'w', encoding='utf-8') as f:
            json.dump(self.episodes, f, ensure_ascii=False, indent=4)
        self.logger.info('COMPLETE!!!')

    def write_q(self):
        df = pd.DataFrame(self.ql.Q[:, 0, :], index=self.urls, columns=self.faults)
        df.to_csv(self.qfile)
        return df

    def write_episode(self, episode):
        for ep in episode:
            with open(self.efile, "a") as outfile:
                x = [e['fault'] + ':' + e['pod_name'] for e in ep['faults']]
                x.append(ep['url'])
                x.append(str(ep['next_state']))
                outfile.write(','.join(x) + '\n')

    def remove_temp_file(self):
        mydir = self.chaos_dir + 'experiments'
        print('Removing temp files from: ' + mydir)
        self.logger.debug('Removing temp files: ' + mydir)
        if not os.path.exists(mydir):  # nothing to remove
            return
        filelist = [f for f in os.listdir(mydir) if f.endswith(".json")]
        for f in filelist:
            print(f)
            os.remove(os.path.join(mydir, f))
56 utils/chaos_ai/src/experiments.py Normal file
@@ -0,0 +1,56 @@
import random


class Experiments:
    def __init__(self):
        self.k = 0

    def monotonic(self, aichaos, num_sets=3):
        for i in range(num_sets):
            faults_pods = random.sample(aichaos.faults, k=2)
            faults_set = [[faults_pods[0]], [faults_pods[1]], [faults_pods[0], faults_pods[1]]]

            resp1, resp2, resp_both = 0, 0, 0
            for fl in faults_set:
                engines = []
                for fp in fl:
                    fault = fp.split(':')[0]
                    pod_name = fp.split(':')[1]
                    engine = aichaos.inject_faults_litmus(fault, pod_name)
                    engines.append(engine)
                aichaos.litmus.wait_engines(engines)

                for index, url in enumerate(aichaos.urls):
                    start_state, next_state = aichaos.test_load(url)
                    print(i, fl, next_state)
                    # self.write(str(fl), next_state)
                if resp1 == 0:
                    resp1 = next_state
                elif resp2 == 0:
                    resp2 = next_state
                else:
                    resp_both = next_state

                aichaos.litmus.stop_engines()
            self.write_resp(str(faults_set[2]), resp1, resp2, resp_both)
        print('Experiment Complete!!!')

    @staticmethod
    def write(fault, next_state):
        with open("experiment", "a") as outfile:
            outfile.write(fault + ',' + str(next_state) + ',' + '\n')

    @staticmethod
    def write_resp(faults, resp1, resp2, resp3):
        monotonic = True
        if resp3 == 200:
            if resp1 != 200 or resp2 != 200:
                monotonic = False
        else:
            if resp1 == 200 and resp2 == 200:
                monotonic = False

        with open("experiment", "a") as outfile:
            # outfile.write(faults + ',' + str(resp1) + ',' + '\n')
            outfile.write(faults + ',' + str(resp1) + ',' + str(resp2) + ',' + str(resp3) + ',' + str(monotonic) + '\n')
99 utils/chaos_ai/src/kraken_utils.py Normal file
@@ -0,0 +1,99 @@
import json
import os
import time
import logging

import src.utils as utils


class KrakenUtils:
    def __init__(self, namespace='robot-shop', chaos_dir='../config/',
                 chaos_experiment='experiment.json', kubeconfig='~/.kube/config', wait_checks=60, command='podman'):
        self.chaos_dir = chaos_dir
        self.chaos_experiment = chaos_experiment
        self.namespace = namespace
        self.kubeconfig = kubeconfig
        self.logger = logging.getLogger()
        self.engines = []
        self.wait_checks = wait_checks
        self.command = command

    def exp_status(self, engine='engine-cartns3'):
        substring_list = ['Waiting for the specified duration', 'Waiting for wait_duration', 'Step workload started, waiting for response']
        substr = '|'.join(substring_list)
        # cmd = "docker logs "+engine+" 2>&1 | grep Waiting"
        # cmd = "docker logs "+engine+" 2>&1 | grep -E '"+substr+"'"
        cmd = self.command + " logs " + engine + " 2>&1 | grep -E '" + substr + "'"
        line = os.popen(cmd).read()
        self.logger.debug('[exp_status]' + line)
        # if 'Waiting for the specified duration' in line:
        # if 'Waiting for' in line or 'waiting for' in line:
        # if 'Waiting for the specified duration' in line or 'Waiting for wait_duration' in line or 'Step workload started, waiting for response' in line:
        if any(map(line.__contains__, substring_list)):
            return 'Running'
        return 'Not Running'

    # print chaos result, check if litmus showed any error
    def print_result(self, engines):
        # self.logger.debug('')
        for e in engines:
            # cmd = 'kubectl describe chaosresult ' + e + ' -n ' + self.namespace + ' | grep "Fail Step:"'
            # line = os.popen(cmd).read()
            # self.logger.debug('[Chaos Result] '+e+' : '+line)
            self.logger.debug('[KRAKEN][Chaos Result] ' + e)

    def wait_engines(self, engines=[]):
        status = 'Completed'
        max_checks = self.wait_checks
        for e in engines:
            self.logger.info('[Wait Engines] ' + e)
            for i in range(max_checks):
                status = self.exp_status(e)
                if status == 'Running':
                    break
                time.sleep(1)
            # return False, if even one engine is not running
            if status != 'Running':
                return False

        self.engines = engines
        # return True if all engines are running
        return True

    def cleanup(self):
        self.logger.debug('Removing previous engines')
        # cmd = "docker rm $(docker ps -q -f 'status=exited')"
        if len(self.engines) > 0:
            cmd = self.command + " stop " + " ".join(self.engines) + " >> temp"
            os.system(cmd)
            self.engines = []

        cmd = self.command + " container prune -f >> temp"
        os.system(cmd)
        self.logger.debug('Engines removed')

    def stop_engines(self, episode=[]):
        self.cleanup()

    def get_name(self):
        return 'kraken'

    def inject_faults(self, fault, pod_name):
        self.logger.debug('[KRAKEN][INJECT_FAULT] ' + fault + ':' + pod_name)
        fault, load = utils.get_load(fault)
        engine = 'engine-' + pod_name.replace('=', '-').replace('/', '-') + '-' + fault
        if fault == 'pod-delete':
            cmd = self.command + ' run -d -e NAMESPACE=' + self.namespace + ' -e POD_LABEL=' + pod_name + ' --name=' + engine + ' --net=host -v ' + self.kubeconfig + ':/root/.kube/config:Z quay.io/redhat-chaos/krkn-hub:pod-scenarios >> temp'
        elif fault == 'network-chaos':
            # 'docker run -e NODE_NAME=minikube-m03 -e DURATION=10 --name=knetwork --net=host -v /home/chaos/.kube/kube-config-raw:/root/.kube/config:Z -d quay.io/redhat-chaos/krkn-hub:network-chaos >> temp'
            cmd = self.command + ' run -d -e NODE_NAME=' + pod_name + ' -e DURATION=120 --name=' + engine + ' --net=host -v ' + self.kubeconfig + ':/root/.kube/config:Z -d quay.io/redhat-chaos/krkn-hub:network-chaos >> temp'
        elif fault == 'node-memory-hog':
            cmd = self.command + ' run -d -e NODE_NAME=' + pod_name + ' -e DURATION=120 -e NODES_AFFECTED_PERC=100 --name=' + engine + ' --net=host -v ' + self.kubeconfig + ':/root/.kube/config:Z -d quay.io/redhat-chaos/krkn-hub:node-memory-hog >> temp'
        elif fault == 'node-cpu-hog':
            cmd = self.command + ' run -e NODE_SELECTORS=' + pod_name + ' -e NODE_CPU_PERCENTAGE=100 -e NAMESPACE=' + self.namespace + ' -e TOTAL_CHAOS_DURATION=120 -e NODE_CPU_CORE=100 --name=' + engine + ' --net=host -env-host=true -v ' + self.kubeconfig + ':/root/.kube/config:Z -d quay.io/redhat-chaos/krkn-hub:node-cpu-hog'
        else:
            cmd = 'echo'
        self.logger.debug('[KRAKEN][INJECT_FAULT] ' + cmd)
        os.system(cmd)
        return engine
62 utils/chaos_ai/src/qlearning.py Normal file
@@ -0,0 +1,62 @@
import logging

import numpy as np


class QLearning:
    def __init__(self, gamma=None, alpha=None, faults=None, states=None, rewards=None, urls=None):
        self.gamma = gamma  # Discount factor
        self.alpha = alpha  # Learning rate
        self.faults = faults
        self.states = states
        self.rewards = rewards

        # Initializing Q-Values
        # self.Q = np.array(np.zeros([len(states), len(states)]))
        self.Q = np.array(np.zeros([len(urls), len(states), len(faults)]))
        self.state_matrix = np.array(np.zeros([len(states), len(states)]))

        self.logger = logging.getLogger()

    def update_q_fault(self, fault, episode, start_state, end_state, url_index):
        self.logger.info('[UPDATE_Q] ' + str(url_index) + ' ' + fault + ' ' + str(start_state) + '->' + str(end_state))
        if end_state is None:
            end_state = start_state
        if end_state not in self.states:
            end_state = 'Other'
        # reward is dependent on the error response (eg. '404') and length of episode
        reward = self.rewards[str(end_state)] / len(episode)
        current_state = self.states[str(start_state)]
        next_state = self.states[str(end_state)]
        fault_index = self.faults.index(fault)
        # self.logger.debug('[update_q]' + fault + ' ' + str(fault_index) + ' ' + str(reward))
        # self.logger.debug('reward, gamma: ' + str(reward) + ' ' + str(self.gamma))
        # self.logger.debug(
        #     'gamma*val' + str(self.gamma * self.Q[url_index, next_state, np.argmax(self.Q[url_index, next_state,])]))
        # self.logger.debug('current state val:' + str(self.Q[url_index, current_state, fault_index]))

        TD = reward + \
             self.gamma * self.Q[url_index, next_state, np.argmax(self.Q[url_index, next_state,])] - \
             self.Q[url_index, current_state, fault_index]
        self.Q[url_index, current_state, fault_index] += self.alpha * TD

        # update state matrix
        TD_state = reward + \
                   self.gamma * self.state_matrix[next_state, np.argmax(self.state_matrix[next_state,])] - \
                   self.state_matrix[current_state, next_state]
        self.state_matrix[current_state, next_state] += self.alpha * TD_state
        # self.logger.debug('updated Q' + str(self.Q[url_index, current_state, fault_index]))

    # def update_q(self, episode, start_state, end_state):
    #     self.logger.info('[UPDATE_Q]')
    #     if end_state is None:
    #         end_state = start_state
    #
    #     # reward is dependent on the error response (eg. '404') and length of episode
    #     reward = self.rewards[str(end_state)] / len(episode)
    #     current_state = self.states[str(start_state)]
    #     next_state = self.states[str(end_state)]
    #     TD = reward + \
    #          self.gamma * self.Q[next_state, np.argmax(self.Q[next_state,])] - \
    #          self.Q[current_state, next_state]
    #     self.Q[current_state, next_state] += self.alpha * TD
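For reference, `update_q_fault` above is the standard temporal-difference (Q-learning) rule applied per URL, with the reward discounted by episode length. In the code's own terms, with r = rewards[end_state] / len(episode):

Q[url, s, fault] += alpha * (r + gamma * max_f Q[url, s', f] - Q[url, s, fault])

where s is the start state and s' the observed end state; the state-to-state matrix receives the analogous update.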
171 utils/chaos_ai/src/swagger_api.py Normal file
@@ -0,0 +1,171 @@
import json, os
import logging
# import numpy as np
# import pandas as pd
import threading
from datetime import datetime
from flask import Flask, request
from flasgger import Swagger
from flasgger.utils import swag_from
# import zipfile
import sys

sys.path.append("..")
from aichaos_main import AIChaos

app = Flask(__name__)
Swagger(app)
flaskdir = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "config", "experiments",
                        "flask") + '/'


class AIChaosSwagger:
    def __init__(self, flaskdir=''):
        self.flaskdir = flaskdir

    @app.route("/")
    def empty(params=''):
        return "AI Chaos Repository!"

    def startchaos(self, kubeconfigfile, file_id, params):
        print('[StartChaos]', file_id, kubeconfigfile)
        dir = flaskdir
        outfile = ''.join([dir, 'out-', file_id])
        initfile = ''.join([dir, 'init-', file_id])
        with open(initfile, 'w'):
            pass
        if os.path.exists(outfile):
            os.remove(outfile)
        # cons = ConstraintsInference(outdir=dir).get_constraints(csvfile, file_id, params, verbose=False,
        #                                                         write_local=False)
        os.environ["KUBECONFIG"] = kubeconfigfile
        params['command'] = 'podman'
        params['chaos_engine'] = 'kraken'
        params['faults'] = 'pod-delete'
        params['iterations'] = 1
        params['maxfaults'] = 5
        if os.path.isfile('/config/aichaos-config.json'):
            with open('/config/aichaos-config.json') as f:
                config_params = json.load(f)
            params['command'] = config_params['command']
            params['chaos_engine'] = config_params['chaos_engine']
            params['faults'] = config_params['faults']
            params['iterations'] = config_params['iterations']
            params['maxfaults'] = config_params['maxfaults']
        faults = [f + ':' + p for f in params['faults'].split(',') for p in params['podlabels'].split(',')]
        print('#faults:', len(faults), faults)
        states = {'200': 0, '500': 1, '502': 2, '503': 3, '404': 4, 'Timeout': 5}
        rewards = {'200': -1, '500': 0.8, '502': 0.8, '503': 0.8, '404': 1, 'Timeout': 1}
        logfile = self.flaskdir + 'log_' + str(file_id)
        qfile = self.flaskdir + 'qfile_' + str(file_id) + '.csv'
        efile = self.flaskdir + 'efile_' + str(file_id)
        epfile = self.flaskdir + 'episodes_' + str(file_id) + '.json'
        probe_url = params['probeurl']
        probes = {'pod-delete': 'executeprobe', 'cpu-hog': 'wolffi/cpu_load', 'disk-fill': 'wolffi/memory_load',
                  'io_load': 'wolffi/io_load', 'http_delay': 'wolffi/http_delay', 'packet_delay': 'wolffi/packet_delay',
                  'packet_duplication': 'wolffi/packet_duplication', 'packet_loss': 'wolffi/packet_loss',
                  'packet_corruption': 'wolffi/packet_corruption',
                  'packet_reordering': 'wolffi/packet_reordering', 'network_load': 'wolffi/network_load',
                  'http_bad_request': 'wolffi/http_bad_request',
                  'http_unauthorized': 'wolffi/http_unauthorized', 'http_forbidden': 'wolffi/http_forbidden',
                  'http_not_found': 'wolffi/http_not_found',
                  'http_method_not_allowed': 'wolffi/http_method_not_allowed',
                  'http_not_acceptable': 'wolffi/http_not_acceptable',
                  'http_request_timeout': 'wolffi/http_request_timeout',
                  'http_unprocessable_entity': 'wolffi/http_unprocessable_entity',
                  'http_internal_server_error': 'wolffi/http_internal_server_error',
                  'http_not_implemented': 'wolffi/http_not_implemented',
                  'http_bad_gateway': 'wolffi/http_bad_gateway',
                  'http_service_unavailable': 'wolffi/http_service_unavailable',
                  'bandwidth_restrict': 'wolffi/bandwidth_restrict',
                  'pod_cpu_load': 'wolffi/pod_cpu_load', 'pod_memory_load': 'wolffi/pod_memory_load',
                  'pod_io_load': 'wolffi/pod_io_load',
                  'pod_network_load': 'wolffi/pod_network_load'
                  }
        dstk_probes = {k: probe_url + v for k, v in probes.items()}
        cexp = {'pod-delete': 'pod-delete.json', 'cpu-hog': 'pod-cpu-hog.json',
                'disk-fill': 'disk-fill.json', 'network-loss': 'network-loss.json',
                'network-corruption': 'network-corruption.json', 'io-stress': 'io-stress.json'}
        aichaos = AIChaos(states=states, faults=faults, rewards=rewards,
                          logfile=logfile, qfile=qfile, efile=efile, epfile=epfile,
                          urls=params['urls'].split(','), namespace=params['namespace'],
                          max_faults=params['maxfaults'],
                          num_requests=10, timeout=2,
                          chaos_engine=params['chaos_engine'], dstk_probes=dstk_probes, command=params['command'],
                          loglevel=logging.DEBUG, chaos_experiment=cexp, iterations=params['iterations'])
        aichaos.start_chaos()

        file = open(outfile, "w")
        file.write('done')
        file.close()
        os.remove(initfile)
        # os.remove(csvfile)
        # ConstraintsInference().remove_temp_files(dir, file_id)
        return 'WRITE'


@app.route('/GenerateChaos/', methods=['POST'])
@swag_from('../config/yml/chaosGen.yml')
def chaos_gen():
    dir = flaskdir
    sw = AIChaosSwagger(flaskdir=dir)
    f = request.files['file']
    list = os.listdir(dir)
    for i in range(10000):
        if str(i) not in list:
            break
    kubeconfigfile = ''.join([dir, str(i)])
    f.save(kubeconfigfile)
    print('HEADER:', f.headers)
    print('[GenerateChaos] reqs:', request.form.to_dict())
    print('[GenerateChaos]', f.filename, datetime.now())
    # thread = threading.Thread(target=sw.write_constraints, args=(csvfile, str(i), parameters))
    thread = threading.Thread(target=sw.startchaos, args=(kubeconfigfile, str(i), request.form.to_dict()))
    thread.daemon = True
    print(thread.getName())
    thread.start()
    return 'Chaos ID: ' + str(i)


@app.route('/GetStatus/<chaosid>', methods=['GET'])
@swag_from('../config/yml/status.yml')
def get_status(chaosid):
    print('[GetStatus]', chaosid, flaskdir)
    epfile = flaskdir + 'episodes_' + str(chaosid) + '.json'
    initfile = ''.join([flaskdir, 'init-', chaosid])
    if os.path.exists(epfile):
        return 'Completed'
    elif os.path.exists(initfile):
        return 'Running'
    else:
        return 'Does not exist'


@app.route('/GetQTable/<chaosid>', methods=['GET'])
@swag_from('../config/yml/qtable.yml')
def get_qtable(chaosid):
    print('[GetQTable]', chaosid)
    qfile = flaskdir + 'qfile_' + str(chaosid) + '.csv'
    initfile = ''.join([flaskdir, 'init-', chaosid])
    if os.path.exists(qfile):
        f = open(qfile, "r")
        return f.read()
    elif os.path.exists(initfile):
        return 'Running'
    else:
        return 'Invalid Chaos ID: ' + chaosid


@app.route('/GetEpisodes/<chaosid>', methods=['GET'])
@swag_from('../config/yml/episodes.yml')
def get_episodes(chaosid):
    print('[GetEpisodes]', chaosid)
    epfile = flaskdir + 'episodes_' + str(chaosid) + '.json'
    initfile = ''.join([flaskdir, 'init-', chaosid])
    if os.path.exists(epfile):
        f = open(epfile, "r")
        return f.read()
    elif os.path.exists(initfile):
        return 'Running'
    else:
        return 'Invalid Chaos ID: ' + chaosid


if __name__ == '__main__':
    app.run(debug=True, host='0.0.0.0', port='5001')
83 utils/chaos_ai/src/test_application.py Normal file
@@ -0,0 +1,83 @@
import json
import logging
import time
import requests


class TestApplication:
    def __init__(self, num_requests=10, timeout=2, sleep_time=1):
        self.num_requests = num_requests
        self.timeout = timeout
        self.sleep_time = sleep_time
        self.logger = logging.getLogger()

    def test_load(self, url=''):
        # url = 'http://192.168.49.2:31902/api/cart/health'
        timeout_count = 0
        avg_lat = 0
        for i in range(self.num_requests):
            try:
                r = requests.get(url, verify=False, timeout=self.timeout)
                avg_lat += r.elapsed.total_seconds()
                self.logger.info(
                    url + ' ' + str(i) + ':' + str(r.status_code) + " {:.2f}".format(r.elapsed.total_seconds())
                    + " {:.2f}".format(avg_lat))
                if r.status_code != 200:
                    return '200', r.status_code
            # except requests.exceptions.Timeout as toe:
            except Exception as toe:
                self.logger.info(url + ' ' + str(i) + ':' + 'Timeout Exception!')
                timeout_count += 1
                if timeout_count > 3:
                    return '200', 'Timeout'
            # except Exception as e:
            #     self.logger.debug('Connection refused!'+str(e))
            time.sleep(self.sleep_time)
        self.logger.info(url + "Avg: {:.2f}".format(avg_lat / self.num_requests))
        return '200', '200'

    # def test_load_hey(self):
    #     cmd = 'hey -c 2 -z 20s http://192.168.49.2:31902/api/cart/health > temp'
    #     os.system(cmd)
    #     with open('temp') as f:
    #         datafile = f.readlines()
    #     found = False
    #     for line in datafile:
    #         if 'Status code distribution:' in line:
    #             found = True
    #         if found:
    #             print('[test_load]', line)
    #             m = re.search(r"\[([A-Za-z0-9_]+)\]", line)
    #             if m is not None:
    #                 resp_code = m.group(1)
    #                 if resp_code != 200:
    #                     return '200', resp_code
    #     return '200', '200'

    # # End state is reached when system is down or return error code like '500','404'
    # def get_next_state(self):
    #     self.logger.info('[GET_NEXT_STATE]')
    #     f = open(self.chaos_dir + self.chaos_journal)
    #     data = json.load(f)
    #
    #     # before the experiment (if before steady state is false, after is null?)
    #     for probe in data['steady_states']['before']['probes']:
    #         if not probe['tolerance_met']:
    #             # start_state = probe['activity']['tolerance']
    #             # end_state = probe['status']
    #             start_state, end_state = None, None
    #             return start_state, end_state
    #
    #     # after the experiment
    #     for probe in data['steady_states']['after']['probes']:
    #         # if probe['output']['status'] == probe['activity']['tolerance']:
    #         if not probe['tolerance_met']:
    #             # print(probe)
    #             start_state = probe['activity']['tolerance']
    #             end_state = probe['output']['status']
    #             # end_state = probe['status']
    #             return start_state, end_state
    #     # if tolerances for all probes are met
    #     start_state = probe['activity']['tolerance']
    #     end_state = probe['activity']['tolerance']
    #     return start_state, end_state
10 utils/chaos_ai/src/utils.py Normal file
@@ -0,0 +1,10 @@
import re


def get_load(fault):
    params = re.findall(r'\(.*?\)', fault)
    load = 100
    if len(params) > 0:
        load = params[0].strip('()')
        fault = fault.strip(params[0])
    return fault, load
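A quick sketch of how `get_load` behaves (hypothetical inputs, not part of the committed file): the parenthesized suffix, when present, is returned as the load (a string); otherwise the default load of 100 is used.

```python
# Hypothetical usage of get_load; values are illustrative only
fault, load = get_load('node-cpu-hog(50)')  # -> ('node-cpu-hog', '50')
fault, load = get_load('pod-delete')        # -> ('pod-delete', 100)
```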