Mirror of https://github.com/krkn-chaos/krkn.git (synced 2026-02-14 09:59:59 +00:00)
initial version of Chaos AI (#606)
* init push
* remove litmus + updated readme
* remove redundant files
* removed generated file + unused reference

Signed-off-by: Sandeep Hans <shans001@in.ibm.com>
Co-authored-by: Naga Ravi Chaitanya Elluri <nelluri@redhat.com>
39 utils/chaos_ai/README.md Normal file
@@ -0,0 +1,39 @@
# aichaos

Enhancing Chaos Engineering with AI-assisted fault injection for better resiliency and non-functional testing.

## Generate Python package wheel file

```
python3.9 generate_wheel_package.py sdist bdist_wheel
```

This creates a Python package file `aichaos-0.0.1-py3-none-any.whl` in the `dist` folder.

## Build Image

```
cd docker
podman build -t aichaos:1.0 .
OR
docker build -t aichaos:1.0 .
```

## Run Chaos AI

```
podman run -v aichaos-config.json:/config/aichaos-config.json --privileged=true --name aichaos -p 5001:5001 aichaos:1.0
OR
docker run -v aichaos-config.json:/config/aichaos-config.json --privileged -v /var/run/docker.sock:/var/run/docker.sock --name aichaos -p 5001:5001 aichaos:1.0
```

The output should look like:

```
$ podman run -v aichaos-config.json:/config/aichaos-config.json --privileged=true --name aichaos -p 5001:5001 aichaos:1.0
 * Serving Flask app 'swagger_api' (lazy loading)
 * Environment: production
   WARNING: This is a development server. Do not use it in a production deployment.
   Use a production WSGI server instead.
 * Debug mode: on
WARNING: This is a development server. Do not use it in a production deployment. Use a production WSGI server instead.
 * Running on all addresses (0.0.0.0)
 * Running on http://127.0.0.1:5001
 * Running on http://172.17.0.2:5001
```

You can try out the APIs in a browser at http://<server-ip>:5001/apidocs (e.g. http://127.0.0.1:5001/apidocs). To start, try the "GenerateChaos" API with a kubeconfig file and the application URLs to test.
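For example, the `GenerateChaos` endpoint can also be driven programmatically. A minimal sketch using Python `requests`, assuming the server runs locally on port 5001; the kubeconfig path and URLs below are illustrative placeholders matching the form fields defined in `chaosGen.yml`:

```python
import requests

# POST the kubeconfig plus test parameters to the GenerateChaos endpoint
resp = requests.post(
    "http://127.0.0.1:5001/GenerateChaos/",
    files={"file": open("/path/to/kubeconfig", "rb")},
    data={
        "namespace": "robot-shop",
        "podlabels": "service=cart,service=payment",
        "urls": "http://<application-url>:8097/api/cart/health,http://<application-url>:8097/api/payment/health",
    },
)
print(resp.text)  # e.g. "Chaos ID: 0"
```

The returned chaos ID can then be passed to the `GetStatus`, `GetQTable`, `GetEpisodes`, and `GetLog` endpoints (e.g. `GET /GetStatus/<chaosid>`).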
0 utils/chaos_ai/config/experiments/.gitkeep Normal file
21 utils/chaos_ai/docker/Dockerfile Normal file
@@ -0,0 +1,21 @@
FROM bitnami/kubectl:1.20.9 as kubectl
FROM python:3.9
WORKDIR /app
RUN pip3 install --upgrade pip
COPY config config/
COPY requirements.txt .
RUN mkdir -p /app/logs
RUN pip3 install -r requirements.txt

COPY --from=kubectl /opt/bitnami/kubectl/bin/kubectl /usr/local/bin/

COPY swagger_api.py .
ENV PYTHONUNBUFFERED=1

RUN curl -fsSLO https://get.docker.com/builds/Linux/x86_64/docker-17.03.1-ce.tgz && tar --strip-components=1 -xvzf docker-17.03.1-ce.tgz -C /usr/local/bin

RUN apt-get update && apt-get install -y podman

COPY aichaos-0.0.1-py3-none-any.whl .
RUN pip3 install aichaos-0.0.1-py3-none-any.whl
CMD ["python3", "swagger_api.py"]
7 utils/chaos_ai/docker/aichaos-config.json Normal file
@@ -0,0 +1,7 @@
{
    "command": "podman",
    "chaosengine": "kraken",
    "faults": "pod-delete",
    "iterations": 1,
    "maxfaults": 5
}
15 utils/chaos_ai/docker/config/experiments/log.yml Executable file
@@ -0,0 +1,15 @@
Get Log from the Chaos ID.
---
tags:
  - ChaosAI API Results
parameters:
  - name: chaosid
    in: path
    type: string
    required: true
    description: Chaos-ID
responses:
  500:
    description: Error!
  200:
    description: Results for the given Chaos ID.
36 utils/chaos_ai/docker/config/pod-delete.json Normal file
@@ -0,0 +1,36 @@
{
    "apiVersion": "1.0",
    "kind": "ChaosEngine",
    "metadata": {
        "name": "engine-cartns3"
    },
    "spec": {
        "engineState": "active",
        "annotationCheck": "false",
        "appinfo": {
            "appns": "robot-shop",
            "applabel": "service=payment",
            "appkind": "deployment"
        },
        "chaosServiceAccount": "pod-delete-sa",
        "experiments": [
            {
                "name": "pod-delete",
                "spec": {
                    "components": {
                        "env": [
                            {
                                "name": "FORCE",
                                "value": "true"
                            },
                            {
                                "name": "TOTAL_CHAOS_DURATION",
                                "value": "120"
                            }
                        ]
                    }
                }
            }
        ]
    }
}
40 utils/chaos_ai/docker/config/yml/chaosGen.yml Executable file
@@ -0,0 +1,40 @@
Generate chaos on an application deployed on a cluster.
---
tags:
  - ChaosAI API
parameters:
  - name: file
    in: formData
    type: file
    required: true
    description: Kube-config file
  - name: namespace
    in: formData
    type: string
    default: robot-shop
    required: true
    description: Namespace to test
  - name: podlabels
    in: formData
    type: string
    default: service=cart,service=payment
    required: true
    description: Pod labels to test
  - name: nodelabels
    in: formData
    type: string
    required: false
    description: Node labels to test
  - name: urls
    in: formData
    type: string
    default: http://<application-url>:8097/api/cart/health,http://<application-url>:8097/api/payment/health
    required: true
    description: Application URLs to test

responses:
  500:
    description: Error!
  200:
    description: Chaos ID for the initiated chaos.
15 utils/chaos_ai/docker/config/yml/episodes.yml Executable file
@@ -0,0 +1,15 @@
Get Episodes from the Chaos ID.
---
tags:
  - ChaosAI API Results
parameters:
  - name: chaosid
    in: path
    type: string
    required: true
    description: Chaos-ID
responses:
  500:
    description: Error!
  200:
    description: Results for the given Chaos ID.
15 utils/chaos_ai/docker/config/yml/log.yml Executable file
@@ -0,0 +1,15 @@
Get Log from the Chaos ID.
---
tags:
  - ChaosAI API Results
parameters:
  - name: chaosid
    in: path
    type: string
    required: true
    description: Chaos-ID
responses:
  500:
    description: Error!
  200:
    description: Results for the given Chaos ID.
15 utils/chaos_ai/docker/config/yml/qtable.yml Executable file
@@ -0,0 +1,15 @@
Get QTable from the Chaos ID.
---
tags:
  - ChaosAI API Results
parameters:
  - name: chaosid
    in: path
    type: string
    required: true
    description: Chaos-ID
responses:
  500:
    description: Error!
  200:
    description: Results for the given Chaos ID.
15 utils/chaos_ai/docker/config/yml/status.yml Executable file
@@ -0,0 +1,15 @@
Get status of the Chaos ID.
---
tags:
  - ChaosAI API
parameters:
  - name: chaosid
    in: path
    type: string
    required: true
    description: Chaos-ID
responses:
  500:
    description: Error!
  200:
    description: Chaos for the given ID.
6 utils/chaos_ai/docker/requirements.txt Normal file
@@ -0,0 +1,6 @@
numpy
pandas
requests
Flask==2.1.0
Werkzeug==2.2.2
flasgger==0.9.5
186 utils/chaos_ai/docker/swagger_api.py Normal file
@@ -0,0 +1,186 @@
import json, os
import logging
# import numpy as np
# import pandas as pd
import threading
from datetime import datetime
from flask import Flask, request
from flasgger import Swagger
from flasgger.utils import swag_from
# import zipfile
import sys

# sys.path.append("..")
from src.aichaos_main import AIChaos

app = Flask(__name__)
Swagger(app)
flaskdir = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "app", "logs") + '/'


class AIChaosSwagger:
    def __init__(self, flaskdir=''):
        self.flaskdir = flaskdir

    @app.route("/")
    def empty(params=''):
        return "AI Chaos Repository!"

    def startchaos(self, kubeconfigfile, file_id, params):
        print('[StartChaos]', file_id, kubeconfigfile)
        dir = flaskdir
        outfile = ''.join([dir, 'out-', file_id])
        initfile = ''.join([dir, 'init-', file_id])
        with open(initfile, 'w'):
            pass
        if os.path.exists(outfile):
            os.remove(outfile)
        # kubeconfigfile = params['file']
        os.environ["KUBECONFIG"] = kubeconfigfile
        os.system("export KUBECONFIG=" + kubeconfigfile)
        os.system("echo $KUBECONFIG")
        print('setting kubeconfig')
        # defaults, overridden below by the mounted config file if present
        params['command'] = 'podman'
        params['chaosengine'] = 'kraken'
        params['faults'] = 'pod-delete'
        params['iterations'] = 1
        params['maxfaults'] = 5
        if os.path.isfile('/config/aichaos-config.json'):
            with open('/config/aichaos-config.json') as f:
                config_params = json.load(f)
            params['command'] = config_params['command']
            params['chaosengine'] = config_params['chaosengine']
            params['faults'] = config_params['faults']
            params['iterations'] = config_params['iterations']
            params['maxfaults'] = config_params['maxfaults']
        # faults = [f + ':' + p for f in params['faults'].split(',') for p in params['podlabels'].split(',')]
        faults = []
        for f in params['faults'].split(','):
            if f in ['pod-delete']:
                for p in params['podlabels'].split(','):
                    faults.append(f + ':' + p)
            elif f in ['network-chaos', 'node-memory-hog', 'node-cpu-hog']:
                for p in params['nodelabels'].split(','):
                    faults.append(f + ':' + p)
            else:
                pass

        print('#faults:', len(faults), faults)
        states = {'200': 0, '500': 1, '501': 2, '502': 3, '503': 4, '504': 5,
                  '401': 6, '403': 7, '404': 8, '429': 9,
                  'Timeout': 10, 'Other': 11}
        rewards = {'200': -1, '500': 0.8, '501': 0.8, '502': 0.8, '503': 0.8, '504': 0.8,
                   '401': 1, '403': 1, '404': 1, '429': 1,
                   'Timeout': 1, 'Other': 1}
        logfile = self.flaskdir + 'log_' + str(file_id)
        qfile = self.flaskdir + 'qfile_' + str(file_id) + '.csv'
        efile = self.flaskdir + 'efile_' + str(file_id)
        epfile = self.flaskdir + 'episodes_' + str(file_id) + '.json'
        # probe_url = params['probeurl']
        cexp = {'pod-delete': 'pod-delete.json', 'cpu-hog': 'pod-cpu-hog.json',
                'disk-fill': 'disk-fill.json', 'network-loss': 'network-loss.json',
                'network-corruption': 'network-corruption.json', 'io-stress': 'io-stress.json'}
        aichaos = AIChaos(states=states, faults=faults, rewards=rewards,
                          logfile=logfile, qfile=qfile, efile=efile, epfile=epfile,
                          urls=params['urls'].split(','), namespace=params['namespace'],
                          max_faults=int(params['maxfaults']),
                          num_requests=10, timeout=2,
                          chaos_engine=params['chaosengine'],
                          chaos_dir='config/', kubeconfig=kubeconfigfile,
                          loglevel=logging.DEBUG, chaos_experiment=cexp, iterations=int(params['iterations']),
                          command=params['command'])
        print('checking kubeconfig')
        os.system("echo $KUBECONFIG")
        aichaos.start_chaos()

        file = open(outfile, "w")
        file.write('done')
        file.close()
        os.remove(initfile)
        # os.remove(csvfile)
        # ConstraintsInference().remove_temp_files(dir, file_id)
        return 'WRITE'


@app.route('/GenerateChaos/', methods=['POST'])
@swag_from('config/yml/chaosGen.yml')
def chaos_gen():
    dir = flaskdir
    sw = AIChaosSwagger(flaskdir=dir)
    f = request.files['file']
    list = os.listdir(dir)
    # pick the first unused kubeconfig-<i> name as the chaos ID
    for i in range(10000):
        fname = 'kubeconfig-' + str(i)
        if fname not in list:
            break
    kubeconfigfile = ''.join([dir, 'kubeconfig-', str(i)])
    f.save(kubeconfigfile)
    # creating empty file
    open(kubeconfigfile, 'a').close()
    # print('HEADER:', f.headers)
    print('[GenerateChaos] reqs:', request.form.to_dict())
    # print('[GenerateChaos]', f.filename, datetime.now())
    thread = threading.Thread(target=sw.startchaos, args=(kubeconfigfile, str(i), request.form.to_dict()))
    thread.daemon = True
    print(thread.getName())
    thread.start()
    return 'Chaos ID: ' + str(i)


@app.route('/GetStatus/<chaosid>', methods=['GET'])
@swag_from('config/yml/status.yml')
def get_status(chaosid):
    print('[GetStatus]', chaosid, flaskdir)
    epfile = flaskdir + 'episodes_' + str(chaosid) + '.json'
    initfile = ''.join([flaskdir, 'init-', chaosid])
    if os.path.exists(epfile):
        return 'Completed'
    elif os.path.exists(initfile):
        return 'Running'
    else:
        return 'Does not exist'


@app.route('/GetQTable/<chaosid>', methods=['GET'])
@swag_from('config/yml/qtable.yml')
def get_qtable(chaosid):
    print('[GetQTable]', chaosid)
    qfile = flaskdir + 'qfile_' + str(chaosid) + '.csv'
    initfile = ''.join([flaskdir, 'init-', chaosid])
    if os.path.exists(qfile):
        f = open(qfile, "r")
        return f.read()
    elif os.path.exists(initfile):
        return 'Running'
    else:
        return 'Invalid Chaos ID: ' + chaosid


@app.route('/GetEpisodes/<chaosid>', methods=['GET'])
@swag_from('config/yml/episodes.yml')
def get_episodes(chaosid):
    print('[GetEpisodes]', chaosid)
    epfile = flaskdir + 'episodes_' + str(chaosid) + '.json'
    initfile = ''.join([flaskdir, 'init-', chaosid])
    if os.path.exists(epfile):
        f = open(epfile, "r")
        return f.read()
    elif os.path.exists(initfile):
        return 'Running'
    else:
        return 'Invalid Chaos ID: ' + chaosid


@app.route('/GetLog/<chaosid>', methods=['GET'])
@swag_from('config/yml/log.yml')
def get_log(chaosid):
    print('[GetLog]', chaosid)
    epfile = flaskdir + 'log_' + str(chaosid)
    initfile = ''.join([flaskdir, 'init-', chaosid])
    if os.path.exists(epfile):
        f = open(epfile, "r")
        return f.read()
    elif os.path.exists(initfile):
        return 'Running'
    else:
        return 'Invalid Chaos ID: ' + chaosid


if __name__ == '__main__':
    app.run(debug=True, host='0.0.0.0', port='5001')
21 utils/chaos_ai/generate_wheel_package.py Normal file
@@ -0,0 +1,21 @@
import setuptools
# from setuptools_cythonize import get_cmdclass

setuptools.setup(
    # cmdclass=get_cmdclass(),
    name="aichaos",
    version="0.0.1",
    author="Sandeep Hans",
    author_email="shans001@in.ibm.com",
    description="Chaos AI",
    long_description="Chaos Engineering using AI",
    long_description_content_type="text/markdown",
    url="",
    packages=setuptools.find_packages(),
    classifiers=[
        "Programming Language :: Python :: 3",
        "License :: OSI Approved :: MIT License",
        "Operating System :: OS Independent",
    ],
    python_requires='>=3.9',
)
10 utils/chaos_ai/requirements.txt Normal file
@@ -0,0 +1,10 @@
numpy
pandas
notebook
jupyterlab
jupyter
seaborn
requests
wheel
Flask==2.1.0
flasgger==0.9.5
0 utils/chaos_ai/src/__init__.py Normal file
213 utils/chaos_ai/src/aichaos.py Normal file
@@ -0,0 +1,213 @@
import json
import os
import random
import sys

import numpy as np
import logging


class AIChaos:
    def __init__(self, states=None, faults=None, rewards=None, pod_names=[], chaos_dir=None,
                 chaos_experiment='experiment.json',
                 chaos_journal='journal.json', iterations=1000, static_run=False):
        self.faults = faults
        self.pod_names = pod_names
        self.states = states
        self.rewards = rewards
        self.episodes = []

        self.chaos_dir = chaos_dir
        self.chaos_experiment = chaos_experiment
        self.chaos_journal = chaos_journal

        self.iterations = iterations
        # Initialize parameters
        self.gamma = 0.75  # Discount factor
        self.alpha = 0.9   # Learning rate

        # Initializing Q-Values
        # self.Q = np.array(np.zeros([9, 9]))
        # self.Q = np.array(np.zeros([len(faults), len(faults)]))
        # currently action is a single fault, later on we will do multiple faults together
        # For multiple faults, the no of cols in q-matrix will be all combinations of faults (infinite)
        # eg. {f1,f2},f3,f4,{f4,f5} - f1,f2 in parallel, then f3, then f4, then f4,f5 in parallel produces end state
        # self.Q = np.array(np.zeros([len(states), len(states)]))
        self.Q = np.array(np.zeros([len(states), len(faults)]))
        self.state_matrix = np.array(np.zeros([len(states), len(states)]))

        # may be Q is a dictionary of dictionaries, for each state there is a dictionary of faults
        # Q = {'500' = {'f1f2f4': 0.3, 'f1': 0.5}, '404' = {'f2': 0.22}}

        self.logger = logging.getLogger()
        # run from old static experiment and journal files
        self.static_run = static_run

    # End state is reached when system is down or return error code like '500','404'
    def get_next_state(self):
        self.logger.info('[GET_NEXT_STATE]')
        f = open(self.chaos_dir + self.chaos_journal)
        data = json.load(f)

        # before the experiment (if before steady state is false, after is null?)
        for probe in data['steady_states']['before']['probes']:
            if not probe['tolerance_met']:
                # start_state = probe['activity']['tolerance']
                # end_state = probe['status']
                start_state, end_state = None, None
                return start_state, end_state

        # after the experiment
        for probe in data['steady_states']['after']['probes']:
            # if probe['output']['status'] == probe['activity']['tolerance']:
            if not probe['tolerance_met']:
                # print(probe)
                start_state = probe['activity']['tolerance']
                end_state = probe['output']['status']
                # end_state = probe['status']
                return start_state, end_state
        # if tolerances for all probes are met
        start_state = probe['activity']['tolerance']
        end_state = probe['activity']['tolerance']
        return start_state, end_state

    def inject_faults(self, fault, pod_name):
        self.logger.info('[INJECT_FAULT] ' + fault)
        f = open(self.chaos_dir + self.chaos_experiment)
        data = json.load(f)
        for m in data['method']:
            if 'provider' in m:
                if fault == 'kill_microservice':
                    m['name'] = 'kill-microservice'
                    m['provider']['module'] = 'chaosk8s.actions'
                    m['provider']['arguments']['name'] = pod_name
                else:
                    m['provider']['arguments']['name_pattern'] = pod_name
                    m['provider']['func'] = fault

                print('[INJECT_FAULT] method:', m)
                # self.logger.info('[INJECT_FAULT] ' + m['provider']['arguments']['name_pattern'])
                # self.logger.info('[INJECT_FAULT] ' + str(m))

        exp_file = self.chaos_dir + 'experiment_' + str(random.randint(1, 10)) + '.json'
        with open(exp_file, 'w') as f:
            json.dump(data, f)
        exp_file = self.chaos_dir + 'experiment.json'
        # execute faults
        # cmd = 'cd ' + self.chaos_dir + ';chaos run ' + self.chaos_experiment
        cmd = 'cd ' + self.chaos_dir + ';chaos run ' + exp_file
        if not self.static_run:
            os.system(cmd)

    def create_episode(self):
        self.logger.info('[CREATE_EPISODE]')
        episode = []
        while True:
            # inject more faults
            # TODO: model - choose faults based on q-learning ...
            fault_pod = random.choice(self.faults)
            fault = fault_pod.split(':')[0]
            pod_name = fault_pod.split(':')[1]
            # fault = random.choice(self.faults)
            # pod_name = random.choice(self.pod_names)
            # fault = lstm_model.get_next_fault(episode)
            # fault = get_max_prob_fault(episode)

            self.inject_faults(fault, pod_name)
            start_state, next_state = self.get_next_state()
            print('[CREATE EPISODE]', start_state, next_state)
            # if before state tolerance is not met
            if start_state is None and next_state is None:
                continue

            episode.append({'fault': fault, 'pod_name': pod_name})
            self.update_q_fault(fault_pod, episode, start_state, next_state)
            # self.update_q_fault(fault, episode, start_state, next_state)
            # if an end_state is reached
            # if next_state is not None:
            if start_state != next_state:
                self.logger.info('[CREATE_EPISODE] EPISODE CREATED:' + str(episode))
                self.logger.info('[CREATE_EPISODE] END STATE:' + str(next_state))
                return episode, start_state, next_state

    def update_q_fault(self, fault, episode, start_state, end_state):
        self.logger.info('[UPDATE_Q]')
        print('[UPDATE_Q] ', str(start_state), str(end_state))
        if end_state is None:
            end_state = start_state

        # reward is dependent on the error response (eg. '404') and length of episode
        reward = self.rewards[str(end_state)] / len(episode)
        current_state = self.states[str(start_state)]
        next_state = self.states[str(end_state)]
        fault_index = self.faults.index(fault)

        TD = reward + \
             self.gamma * self.Q[next_state, np.argmax(self.Q[next_state,])] - \
             self.Q[current_state, fault_index]
        self.Q[current_state, fault_index] += self.alpha * TD

        # update state matrix
        TD_state = reward + \
                   self.gamma * self.state_matrix[next_state, np.argmax(self.state_matrix[next_state,])] - \
                   self.state_matrix[current_state, next_state]
        self.state_matrix[current_state, next_state] += self.alpha * TD_state

    # def update_q(self, episode, start_state, end_state):
    #     self.logger.info('[UPDATE_Q]')
    #     if end_state is None:
    #         end_state = start_state
    #
    #     # reward is dependent on the error response (eg. '404') and length of episode
    #     reward = self.rewards[str(end_state)] / len(episode)
    #     current_state = self.states[str(start_state)]
    #     next_state = self.states[str(end_state)]
    #     TD = reward + \
    #          self.gamma * self.Q[next_state, np.argmax(self.Q[next_state,])] - \
    #          self.Q[current_state, next_state]
    #     self.Q[current_state, next_state] += self.alpha * TD

    def start_chaos(self):
        for i in range(self.iterations):
            episode, start_state, end_state = self.create_episode()
            # update Q matrix
            # will do it with each fault injection
            # self.update_q(episode, start_state, end_state)
            print(self.Q)
            print(self.state_matrix)


def test_chaos():
    svc_list = ['cart', 'catalogue', 'dispatch', 'mongodb', 'mysql', 'payment', 'rabbitmq', 'ratings', 'redis',
                'shipping', 'user', 'web']
    # Define faults
    # faults = ['terminate_pods']
    # faults = ['terminate_pods:' + x for x in pod_names]
    faults = ['kill_microservice:' + x for x in svc_list]
    # Define the states
    states = {
        '200': 0,
        '500': 1,
        '404': 2
    }
    # Define rewards, currently not used
    rewards = {
        '200': 0,
        '500': 0.8,
        '404': 1
    }

    # cdir = '/Users/sandeephans/Downloads/chaos/chaostoolkit-samples-master/service-down-not-visible-to-users/'
    cdir = '/Users/sandeephans/Downloads/openshift/'
    cexp = 'experiment.json'
    cjournal = 'journal.json'

    aichaos = AIChaos(states=states, faults=faults, rewards=rewards,
                      chaos_dir=cdir, chaos_experiment=cexp, chaos_journal=cjournal,
                      static_run=False)
    aichaos.start_chaos()


if __name__ == '__main__':
    logging.basicConfig(stream=sys.stdout, level=logging.INFO)
    test_chaos()
248 utils/chaos_ai/src/aichaos_main.py Normal file
@@ -0,0 +1,248 @@
import json
import os
import random

import numpy as np
import pandas as pd
import logging

# sys.path.insert(1, os.path.join(sys.path[0], '..'))
import src.utils as utils
from src.kraken_utils import KrakenUtils
from src.qlearning import QLearning
from src.test_application import TestApplication


class AIChaos:
    def __init__(self, namespace='robot-shop', states=None, faults=None, rewards=None, urls=[], max_faults=5,
                 service_weights=None, ctd_subsets=None, pod_names=[], chaos_dir='../config/', kubeconfig='~/.kube/config',
                 chaos_experiment='experiment.json', logfile='log', qfile='qfile.csv', efile='efile', epfile='episodes.json',
                 loglevel=logging.INFO,
                 chaos_journal='journal.json', iterations=10, alpha=0.9, gamma=0.2, epsilon=0.3,
                 num_requests=10, sleep_time=1, timeout=2, chaos_engine='kraken', dstk_probes=None,
                 static_run=False, all_faults=False, command='podman'):
        self.namespace = namespace
        self.faults = faults
        self.unused_faults = faults.copy()
        self.all_faults = all_faults
        self.pod_names = pod_names
        self.states = states
        self.rewards = rewards
        self.urls = urls
        self.max_faults = max_faults
        self.episodes = []
        self.service_weights = service_weights
        self.ctd_subsets = ctd_subsets

        self.kubeconfig = kubeconfig
        self.chaos_dir = chaos_dir
        self.chaos_experiment = chaos_experiment
        self.chaos_journal = chaos_journal
        self.command = command

        if chaos_engine == 'kraken':
            self.chaos_engine = KrakenUtils(namespace, kubeconfig=kubeconfig, chaos_dir=chaos_dir, chaos_experiment=chaos_experiment, command=self.command)
        else:
            self.chaos_engine = None

        self.iterations = iterations
        # Initialize RL parameters
        self.epsilon = epsilon  # epsilon decay policy
        # self.epsdecay = 0

        # log files
        self.logfile = logfile
        self.qfile = qfile
        self.efile = efile
        self.epfile = epfile
        open(efile, 'w+').close()
        open(logfile, 'w+').close()
        open(logfile, 'r+').truncate(0)
        logging.getLogger("requests").setLevel(logging.WARNING)
        logging.getLogger("urllib3").setLevel(logging.WARNING)
        logging.basicConfig(filename=logfile, filemode='w+', level=loglevel)
        self.logger = logging.getLogger(logfile.replace('/', ''))
        self.logger.addHandler(logging.FileHandler(logfile))

        self.testapp = TestApplication(num_requests, timeout, sleep_time)
        self.ql = QLearning(gamma, alpha, faults, states, rewards, urls)

        # run from old static experiment and journal files
        self.static_run = static_run

    def realistic(self, faults_pods):
        self.logger.debug('[Realistic] ' + str(faults_pods))
        fp = faults_pods.copy()
        for f1 in faults_pods:
            for f2 in faults_pods:
                if f1 == f2:
                    continue
                if f1 in fp and f2 in fp:
                    f1_fault, load_1 = utils.get_load(f1.split(':')[0])
                    f1_pod = f1.split(':')[1]
                    f2_fault, load_2 = utils.get_load(f2.split(':')[0])
                    f2_pod = f2.split(':')[1]
                    if f1_pod == f2_pod:
                        if f1_fault == 'pod-delete':
                            fp.remove(f2)
                        if f1_fault == f2_fault:
                            # if int(load_1) > int(load_2):
                            # randomly remove one fault from same faults with different params
                            fp.remove(f2)
        if self.service_weights is None:
            return fp

        fp_copy = fp.copy()
        for f in fp:
            f_fault = f.split(':')[0]
            f_pod = f.split(':')[1].replace('service=', '')
            self.logger.debug('[ServiceWeights] ' + f + ' ' + str(self.service_weights[f_pod][f_fault]))
            if self.service_weights[f_pod][f_fault] == 0:
                fp_copy.remove(f)

        self.logger.debug('[Realistic] ' + str(fp_copy))
        return fp_copy

    def select_faults(self):
        max_faults = min(self.max_faults, len(self.unused_faults))
        num_faults = random.randint(1, max_faults)
        if self.all_faults:
            num_faults = len(self.unused_faults)
        if random.random() > self.epsilon:
            self.logger.info('[Exploration]')
            # faults_pods = random.sample(self.faults, k=num_faults)
            # using used faults list to avoid starvation
            faults_pods = random.sample(self.unused_faults, k=num_faults)
            faults_pods = self.realistic(faults_pods)
            for f in faults_pods:
                self.unused_faults.remove(f)
            if len(self.unused_faults) == 0:
                self.unused_faults = self.faults.copy()
        else:
            self.logger.info('[Exploitation]')
            first_row = self.ql.Q[:, 0, :][0]
            top_k_indices = np.argpartition(first_row, -num_faults)[-num_faults:]
            faults_pods = [self.faults[i] for i in top_k_indices]
            faults_pods = self.realistic(faults_pods)

        return faults_pods

    def create_episode(self, ctd_subset=None):
        self.logger.debug('[CREATE_EPISODE]')
        episode = []

        if ctd_subset is None:
            faults_pods = self.select_faults()
        else:
            faults_pods = ctd_subset
            self.logger.info('CTD Subset: ' + str(faults_pods))

        # faults_pods = self.realistic(faults_pods)
        if len(faults_pods) == 0:
            return [], 200, 200

        engines = []
        for fp in faults_pods:
            fault = fp.split(':')[0]
            pod_name = fp.split(':')[1]
            engine = self.chaos_engine.inject_faults(fault, pod_name)
            engines.append(engine)
            episode.append({'fault': fault, 'pod_name': pod_name})
        self.logger.info('[create_episode]' + str(faults_pods))
        engines_running = self.chaos_engine.wait_engines(engines)
        self.logger.info('[create_episode] engines_running' + str(engines_running))
        if not engines_running:
            return None, None, None

        # randomly shuffling urls
        urls = random.sample(self.urls, len(self.urls))
        ep_json = []
        for url in urls:
            start_state, next_state = self.testapp.test_load(url)
            self.logger.info('[CREATE EPISODE]' + str(start_state) + ',' + str(next_state))
            # if before state tolerance is not met
            if start_state is None and next_state is None:
                # self.cleanup()
                self.chaos_engine.stop_engines()
                continue

            ### episode.append({'fault': fault, 'pod_name': pod_name})
            # self.update_q_fault(fault_pod, episode, start_state, next_state)
            url_index = self.urls.index(url)
            self.logger.info('[CREATEEPISODE]' + str(url) + ':' + str(url_index))
            for fp in faults_pods:
                self.ql.update_q_fault(fp, episode, start_state, next_state, self.urls.index(url))
            ep_json.append({'start_state': start_state, 'next_state': next_state, 'url': url, 'faults': episode})

        self.logger.debug('[CREATE_EPISODE] EPISODE CREATED:' + str(episode))
        self.logger.debug('[CREATE_EPISODE] END STATE:' + str(next_state))

        self.chaos_engine.print_result(engines)
        self.chaos_engine.stop_engines(episode=episode)
        # ep_json = {'start_state': start_state, 'next_state': next_state, 'faults': episode}

        return ep_json, start_state, next_state

    def start_chaos(self):
        self.logger.info('[INITIALIZING]')
        self.logger.info('Logfile: ' + self.logfile)
        self.logger.info('Loggerfile: ' + self.logger.handlers[0].stream.name)
        self.logger.info('Chaos Engine: ' + self.chaos_engine.get_name())
        self.logger.debug('Faults:' + str(self.faults))

        self.chaos_engine.cleanup()
        if self.ctd_subsets is None:
            for i in range(self.iterations):
                episode, start_state, end_state = self.create_episode()
                self.logger.debug('[start_chaos]' + str(i) + ' ' + str(episode))
                if episode is None:
                    continue
                # update Q matrix
                # will do it with each fault injection
                # self.update_q(episode, start_state, end_state)
                # if episode['next_state'] != '200':
                self.episodes.extend(episode)
                self.logger.info(str(i) + ' ' + str(self.ql.Q[:, 0]))
                # print(i, self.state_matrix)
                self.write_q()
                self.write_episode(episode)
        else:
            for i, subset in enumerate(self.ctd_subsets):
                episode, start_state, end_state = self.create_episode(subset)
                self.logger.debug('[start_chaos]' + str(episode))
                if episode is None:
                    continue
                self.episodes.append(episode)
                self.logger.info(str(i) + ' ' + str(self.ql.Q[:, 0]))
                self.write_q()
                self.write_episode(episode)

        self.chaos_engine.cleanup()
        # self.remove_temp_file()
        with open(self.epfile, 'w', encoding='utf-8') as f:
            json.dump(self.episodes, f, ensure_ascii=False, indent=4)
        self.logger.info('COMPLETE!!!')

    def write_q(self):
        df = pd.DataFrame(self.ql.Q[:, 0, :], index=self.urls, columns=self.faults)
        df.to_csv(self.qfile)
        return df

    def write_episode(self, episode):
        for ep in episode:
            with open(self.efile, "a") as outfile:
                x = [e['fault'] + ':' + e['pod_name'] for e in ep['faults']]
                x.append(ep['url'])
                x.append(str(ep['next_state']))
                outfile.write(','.join(x) + '\n')

    def remove_temp_file(self):
        mydir = self.chaos_dir + 'experiments'
        print('Removing temp files from: ' + mydir)
        self.logger.debug('Removing temp files: ' + mydir)
        if not os.path.exists(mydir):  # nothing to remove
            return
        filelist = [f for f in os.listdir(mydir) if f.endswith(".json")]
        for f in filelist:
            print(f)
            os.remove(os.path.join(mydir, f))
56 utils/chaos_ai/src/experiments.py Normal file
@@ -0,0 +1,56 @@
import random


class Experiments:
    def __init__(self):
        self.k = 0

    def monotonic(self, aichaos, num_sets=3):
        for i in range(num_sets):
            faults_pods = random.sample(aichaos.faults, k=2)
            faults_set = [[faults_pods[0]], [faults_pods[1]], [faults_pods[0], faults_pods[1]]]

            resp1, resp2, resp_both = 0, 0, 0
            for fl in faults_set:
                engines = []
                for fp in fl:
                    fault = fp.split(':')[0]
                    pod_name = fp.split(':')[1]
                    engine = aichaos.inject_faults_litmus(fault, pod_name)
                    engines.append(engine)
                aichaos.litmus.wait_engines(engines)

                for index, url in enumerate(aichaos.urls):
                    start_state, next_state = aichaos.test_load(url)
                    print(i, fl, next_state)
                    # self.write(str(fl), next_state)
                if resp1 == 0:
                    resp1 = next_state
                elif resp2 == 0:
                    resp2 = next_state
                else:
                    resp_both = next_state

                aichaos.litmus.stop_engines()
            self.write_resp(str(faults_set[2]), resp1, resp2, resp_both)
        print('Experiment Complete!!!')

    @staticmethod
    def write(fault, next_state):
        with open("experiment", "a") as outfile:
            outfile.write(fault + ',' + str(next_state) + ',' + '\n')

    @staticmethod
    def write_resp(faults, resp1, resp2, resp3):
        monotonic = True
        if resp3 == 200:
            if resp1 != 200 or resp2 != 200:
                monotonic = False
        else:
            if resp1 == 200 and resp2 == 200:
                monotonic = False

        with open("experiment", "a") as outfile:
            # outfile.write(faults + ',' + str(resp1) + ',' + '\n')
            outfile.write(faults + ',' + str(resp1) + ',' + str(resp2) + ',' + str(resp3) + ',' + str(monotonic) + '\n')
99 utils/chaos_ai/src/kraken_utils.py Normal file
@@ -0,0 +1,99 @@
import json
import os
import time
import logging

import src.utils as utils


class KrakenUtils:
    def __init__(self, namespace='robot-shop', chaos_dir='../config/',
                 chaos_experiment='experiment.json', kubeconfig='~/.kube/config', wait_checks=60, command='podman'):
        self.chaos_dir = chaos_dir
        self.chaos_experiment = chaos_experiment
        self.namespace = namespace
        self.kubeconfig = kubeconfig
        self.logger = logging.getLogger()
        self.engines = []
        self.wait_checks = wait_checks
        self.command = command

    def exp_status(self, engine='engine-cartns3'):
        substring_list = ['Waiting for the specified duration', 'Waiting for wait_duration', 'Step workload started, waiting for response']
        substr = '|'.join(substring_list)
        # cmd = "docker logs "+engine+" 2>&1 | grep Waiting"
        # cmd = "docker logs "+engine+" 2>&1 | grep -E '"+substr+"'"
        cmd = self.command + " logs " + engine + " 2>&1 | grep -E '" + substr + "'"
        line = os.popen(cmd).read()
        self.logger.debug('[exp_status]' + line)
        # if 'Waiting for the specified duration' in line:
        # if 'Waiting for' in line or 'waiting for' in line:
        # if 'Waiting for the specified duration' in line or 'Waiting for wait_duration' in line or 'Step workload started, waiting for response' in line:
        if any(map(line.__contains__, substring_list)):
            return 'Running'
        return 'Not Running'

    # print chaos result, check if litmus showed any error
    def print_result(self, engines):
        # self.logger.debug('')
        for e in engines:
            # cmd = 'kubectl describe chaosresult ' + e + ' -n ' + self.namespace + ' | grep "Fail Step:"'
            # line = os.popen(cmd).read()
            # self.logger.debug('[Chaos Result] '+e+' : '+line)
            self.logger.debug('[KRAKEN][Chaos Result] ' + e)

    def wait_engines(self, engines=[]):
        status = 'Completed'
        max_checks = self.wait_checks
        for e in engines:
            self.logger.info('[Wait Engines] ' + e)
            for i in range(max_checks):
                status = self.exp_status(e)
                if status == 'Running':
                    break
                time.sleep(1)
            # return False, if even one engine is not running
            if status != 'Running':
                return False

        self.engines = engines
        # return True if all engines are running
        return True

    def cleanup(self):
        self.logger.debug('Removing previous engines')
        # cmd = "docker rm $(docker ps -q -f 'status=exited')"
        if len(self.engines) > 0:
            cmd = self.command + " stop " + " ".join(self.engines) + " >> temp"
            os.system(cmd)
            self.engines = []

        cmd = self.command + " container prune -f >> temp"
        os.system(cmd)
        self.logger.debug('Engines removed')

    def stop_engines(self, episode=[]):
        self.cleanup()

    def get_name(self):
        return 'kraken'

    def inject_faults(self, fault, pod_name):
        self.logger.debug('[KRAKEN][INJECT_FAULT] ' + fault + ':' + pod_name)
        fault, load = utils.get_load(fault)
        engine = 'engine-' + pod_name.replace('=', '-').replace('/', '-') + '-' + fault
        if fault == 'pod-delete':
            cmd = self.command + ' run -d -e NAMESPACE=' + self.namespace + ' -e POD_LABEL=' + pod_name + ' --name=' + engine + ' --net=host -v ' + self.kubeconfig + ':/root/.kube/config:Z quay.io/redhat-chaos/krkn-hub:pod-scenarios >> temp'
        elif fault == 'network-chaos':
            # 'docker run -e NODE_NAME=minikube-m03 -e DURATION=10 --name=knetwork --net=host -v /home/chaos/.kube/kube-config-raw:/root/.kube/config:Z -d quay.io/redhat-chaos/krkn-hub:network-chaos >> temp'
            cmd = self.command + ' run -d -e NODE_NAME=' + pod_name + ' -e DURATION=120 --name=' + engine + ' --net=host -v ' + self.kubeconfig + ':/root/.kube/config:Z -d quay.io/redhat-chaos/krkn-hub:network-chaos >> temp'
        elif fault == 'node-memory-hog':
            cmd = self.command + ' run -d -e NODE_NAME=' + pod_name + ' -e DURATION=120 -e NODES_AFFECTED_PERC=100 --name=' + engine + ' --net=host -v ' + self.kubeconfig + ':/root/.kube/config:Z -d quay.io/redhat-chaos/krkn-hub:node-memory-hog >> temp'
        elif fault == 'node-cpu-hog':
            cmd = self.command + ' run -e NODE_SELECTORS=' + pod_name + ' -e NODE_CPU_PERCENTAGE=100 -e NAMESPACE=' + self.namespace + ' -e TOTAL_CHAOS_DURATION=120 -e NODE_CPU_CORE=100 --name=' + engine + ' --net=host -env-host=true -v ' + self.kubeconfig + ':/root/.kube/config:Z -d quay.io/redhat-chaos/krkn-hub:node-cpu-hog'
        else:
            cmd = 'echo'
        self.logger.debug('[KRAKEN][INJECT_FAULT] ' + cmd)
        os.system(cmd)
        return engine
62 utils/chaos_ai/src/qlearning.py Normal file
@@ -0,0 +1,62 @@
import logging

import numpy as np


class QLearning:
    def __init__(self, gamma=None, alpha=None, faults=None, states=None, rewards=None, urls=None):
        self.gamma = gamma  # Discount factor
        self.alpha = alpha  # Learning rate
        self.faults = faults
        self.states = states
        self.rewards = rewards

        # Initializing Q-Values
        # self.Q = np.array(np.zeros([len(states), len(states)]))
        self.Q = np.array(np.zeros([len(urls), len(states), len(faults)]))
        self.state_matrix = np.array(np.zeros([len(states), len(states)]))

        self.logger = logging.getLogger()

    def update_q_fault(self, fault, episode, start_state, end_state, url_index):
        self.logger.info('[UPDATE_Q] ' + str(url_index) + ' ' + fault + ' ' + str(start_state) + '->' + str(end_state))
        if end_state is None:
            end_state = start_state
        if end_state not in self.states:
            end_state = 'Other'
        # reward is dependent on the error response (eg. '404') and length of episode
        reward = self.rewards[str(end_state)] / len(episode)
        current_state = self.states[str(start_state)]
        next_state = self.states[str(end_state)]
        fault_index = self.faults.index(fault)
        # self.logger.debug('[update_q]' + fault + ' ' + str(fault_index) + ' ' + str(reward))
        # self.logger.debug('reward, gamma: ' + str(reward) + ' ' + str(self.gamma))
        # self.logger.debug(
        #     'gamma*val' + str(self.gamma * self.Q[url_index, next_state, np.argmax(self.Q[url_index, next_state,])]))
        # self.logger.debug('current state val:' + str(self.Q[url_index, current_state, fault_index]))

        TD = reward + \
             self.gamma * self.Q[url_index, next_state, np.argmax(self.Q[url_index, next_state,])] - \
             self.Q[url_index, current_state, fault_index]
        self.Q[url_index, current_state, fault_index] += self.alpha * TD

        # update state matrix
        TD_state = reward + \
                   self.gamma * self.state_matrix[next_state, np.argmax(self.state_matrix[next_state,])] - \
                   self.state_matrix[current_state, next_state]
        self.state_matrix[current_state, next_state] += self.alpha * TD_state
        # self.logger.debug('updated Q' + str(self.Q[url_index, current_state, fault_index]))

    # def update_q(self, episode, start_state, end_state):
    #     self.logger.info('[UPDATE_Q]')
    #     if end_state is None:
    #         end_state = start_state
    #
    #     # reward is dependent on the error response (eg. '404') and length of episode
    #     reward = self.rewards[str(end_state)] / len(episode)
    #     current_state = self.states[str(start_state)]
    #     next_state = self.states[str(end_state)]
    #     TD = reward + \
    #          self.gamma * self.Q[next_state, np.argmax(self.Q[next_state,])] - \
    #          self.Q[current_state, next_state]
    #     self.Q[current_state, next_state] += self.alpha * TD
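For reference, `update_q_fault` above is the standard temporal-difference (Q-learning) rule applied per URL, with the reward discounted by episode length. In the code's own terms, with r = rewards[end_state] / len(episode):

Q[url, s, fault] += alpha * (r + gamma * max_f Q[url, s', f] - Q[url, s, fault])

where s is the start state and s' the observed end state; the state-to-state matrix receives the analogous update.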
171 utils/chaos_ai/src/swagger_api.py Normal file
@@ -0,0 +1,171 @@
import json, os
import logging
# import numpy as np
# import pandas as pd
import threading
from datetime import datetime
from flask import Flask, request
from flasgger import Swagger
from flasgger.utils import swag_from
# import zipfile
import sys

sys.path.append("..")
from aichaos_main import AIChaos

app = Flask(__name__)
Swagger(app)
flaskdir = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), "config", "experiments",
                        "flask") + '/'


class AIChaosSwagger:
    def __init__(self, flaskdir=''):
        self.flaskdir = flaskdir

    @app.route("/")
    def empty(params=''):
        return "AI Chaos Repository!"

    def startchaos(self, kubeconfigfile, file_id, params):
        print('[StartChaos]', file_id, kubeconfigfile)
        dir = flaskdir
        outfile = ''.join([dir, 'out-', file_id])
        initfile = ''.join([dir, 'init-', file_id])
        with open(initfile, 'w'):
            pass
        if os.path.exists(outfile):
            os.remove(outfile)
        # cons = ConstraintsInference(outdir=dir).get_constraints(csvfile, file_id, params, verbose=False,
        #                                                         write_local=False)
        os.environ["KUBECONFIG"] = kubeconfigfile
        params['command'] = 'podman'
        params['chaos_engine'] = 'kraken'
        params['faults'] = 'pod-delete'
        params['iterations'] = 1
        params['maxfaults'] = 5
        if os.path.isfile('/config/aichaos-config.json'):
            with open('/config/aichaos-config.json') as f:
                config_params = json.load(f)
            params['command'] = config_params['command']
            params['chaos_engine'] = config_params['chaos_engine']
            params['faults'] = config_params['faults']
            params['iterations'] = config_params['iterations']
            params['maxfaults'] = config_params['maxfaults']
        faults = [f + ':' + p for f in params['faults'].split(',') for p in params['podlabels'].split(',')]
        print('#faults:', len(faults), faults)
        states = {'200': 0, '500': 1, '502': 2, '503': 3, '404': 4, 'Timeout': 5}
        rewards = {'200': -1, '500': 0.8, '502': 0.8, '503': 0.8, '404': 1, 'Timeout': 1}
        logfile = self.flaskdir + 'log_' + str(file_id)
        qfile = self.flaskdir + 'qfile_' + str(file_id) + '.csv'
        efile = self.flaskdir + 'efile_' + str(file_id)
        epfile = self.flaskdir + 'episodes_' + str(file_id) + '.json'
        probe_url = params['probeurl']
        probes = {'pod-delete': 'executeprobe', 'cpu-hog': 'wolffi/cpu_load', 'disk-fill': 'wolffi/memory_load',
                  'io_load': 'wolffi/io_load', 'http_delay': 'wolffi/http_delay', 'packet_delay': 'wolffi/packet_delay',
                  'packet_duplication': 'wolffi/packet_duplication', 'packet_loss': 'wolffi/packet_loss',
                  'packet_corruption': 'wolffi/packet_corruption',
                  'packet_reordering': 'wolffi/packet_reordering', 'network_load': 'wolffi/network_load',
                  'http_bad_request': 'wolffi/http_bad_request',
                  'http_unauthorized': 'wolffi/http_unauthorized', 'http_forbidden': 'wolffi/http_forbidden',
                  'http_not_found': 'wolffi/http_not_found',
                  'http_method_not_allowed': 'wolffi/http_method_not_allowed',
                  'http_not_acceptable': 'wolffi/http_not_acceptable',
                  'http_request_timeout': 'wolffi/http_request_timeout',
                  'http_unprocessable_entity': 'wolffi/http_unprocessable_entity',
                  'http_internal_server_error': 'wolffi/http_internal_server_error',
                  'http_not_implemented': 'wolffi/http_not_implemented',
                  'http_bad_gateway': 'wolffi/http_bad_gateway',
                  'http_service_unavailable': 'wolffi/http_service_unavailable',
                  'bandwidth_restrict': 'wolffi/bandwidth_restrict',
                  'pod_cpu_load': 'wolffi/pod_cpu_load', 'pod_memory_load': 'wolffi/pod_memory_load',
                  'pod_io_load': 'wolffi/pod_io_load',
                  'pod_network_load': 'wolffi/pod_network_load'
                  }
        dstk_probes = {k: probe_url + v for k, v in probes.items()}
        cexp = {'pod-delete': 'pod-delete.json', 'cpu-hog': 'pod-cpu-hog.json',
                'disk-fill': 'disk-fill.json', 'network-loss': 'network-loss.json',
                'network-corruption': 'network-corruption.json', 'io-stress': 'io-stress.json'}
        aichaos = AIChaos(states=states, faults=faults, rewards=rewards,
                          logfile=logfile, qfile=qfile, efile=efile, epfile=epfile,
                          urls=params['urls'].split(','), namespace=params['namespace'],
                          max_faults=params['maxfaults'],
                          num_requests=10, timeout=2,
                          chaos_engine=params['chaos_engine'], dstk_probes=dstk_probes, command=params['command'],
                          loglevel=logging.DEBUG, chaos_experiment=cexp, iterations=params['iterations'])
        aichaos.start_chaos()

        file = open(outfile, "w")
        file.write('done')
        file.close()
        os.remove(initfile)
        # os.remove(csvfile)
        # ConstraintsInference().remove_temp_files(dir, file_id)
        return 'WRITE'


@app.route('/GenerateChaos/', methods=['POST'])
@swag_from('../config/yml/chaosGen.yml')
def chaos_gen():
    dir = flaskdir
    sw = AIChaosSwagger(flaskdir=dir)
    f = request.files['file']
    list = os.listdir(dir)
    for i in range(10000):
        if str(i) not in list:
            break
    kubeconfigfile = ''.join([dir, str(i)])
    f.save(kubeconfigfile)
    print('HEADER:', f.headers)
    print('[GenerateChaos] reqs:', request.form.to_dict())
    print('[GenerateChaos]', f.filename, datetime.now())
    # thread = threading.Thread(target=sw.write_constraints, args=(csvfile, str(i), parameters))
    thread = threading.Thread(target=sw.startchaos, args=(kubeconfigfile, str(i), request.form.to_dict()))
    thread.daemon = True
    print(thread.getName())
    thread.start()
    return 'Chaos ID: ' + str(i)


@app.route('/GetStatus/<chaosid>', methods=['GET'])
@swag_from('../config/yml/status.yml')
def get_status(chaosid):
    print('[GetStatus]', chaosid, flaskdir)
    epfile = flaskdir + 'episodes_' + str(chaosid) + '.json'
    initfile = ''.join([flaskdir, 'init-', chaosid])
    if os.path.exists(epfile):
        return 'Completed'
    elif os.path.exists(initfile):
        return 'Running'
    else:
        return 'Does not exist'


@app.route('/GetQTable/<chaosid>', methods=['GET'])
@swag_from('../config/yml/qtable.yml')
def get_qtable(chaosid):
    print('[GetQTable]', chaosid)
    qfile = flaskdir + 'qfile_' + str(chaosid) + '.csv'
    initfile = ''.join([flaskdir, 'init-', chaosid])
    if os.path.exists(qfile):
        f = open(qfile, "r")
        return f.read()
    elif os.path.exists(initfile):
        return 'Running'
    else:
        return 'Invalid Chaos ID: ' + chaosid


@app.route('/GetEpisodes/<chaosid>', methods=['GET'])
@swag_from('../config/yml/episodes.yml')
def get_episodes(chaosid):
    print('[GetEpisodes]', chaosid)
    epfile = flaskdir + 'episodes_' + str(chaosid) + '.json'
    initfile = ''.join([flaskdir, 'init-', chaosid])
    if os.path.exists(epfile):
        f = open(epfile, "r")
        return f.read()
    elif os.path.exists(initfile):
        return 'Running'
    else:
        return 'Invalid Chaos ID: ' + chaosid


if __name__ == '__main__':
    app.run(debug=True, host='0.0.0.0', port='5001')
83 utils/chaos_ai/src/test_application.py Normal file
@@ -0,0 +1,83 @@
import json
import logging
import time
import requests


class TestApplication:
    def __init__(self, num_requests=10, timeout=2, sleep_time=1):
        self.num_requests = num_requests
        self.timeout = timeout
        self.sleep_time = sleep_time
        self.logger = logging.getLogger()

    def test_load(self, url=''):
        # url = 'http://192.168.49.2:31902/api/cart/health'
        timeout_count = 0
        avg_lat = 0
        for i in range(self.num_requests):
            try:
                r = requests.get(url, verify=False, timeout=self.timeout)
                avg_lat += r.elapsed.total_seconds()
                self.logger.info(
                    url + ' ' + str(i) + ':' + str(r.status_code) + " {:.2f}".format(r.elapsed.total_seconds())
                    + " {:.2f}".format(avg_lat))
                if r.status_code != 200:
                    return '200', r.status_code
            # except requests.exceptions.Timeout as toe:
            except Exception as toe:
                self.logger.info(url + ' ' + str(i) + ':' + 'Timeout Exception!')
                timeout_count += 1
                if timeout_count > 3:
                    return '200', 'Timeout'
            # except Exception as e:
            #     self.logger.debug('Connection refused!'+str(e))
            time.sleep(self.sleep_time)
        self.logger.info(url + "Avg: {:.2f}".format(avg_lat / self.num_requests))
        return '200', '200'

    # def test_load_hey(self):
    #     cmd = 'hey -c 2 -z 20s http://192.168.49.2:31902/api/cart/health > temp'
    #     os.system(cmd)
    #     with open('temp') as f:
    #         datafile = f.readlines()
    #     found = False
    #     for line in datafile:
    #         if 'Status code distribution:' in line:
    #             found = True
    #         if found:
    #             print('[test_load]', line)
    #             m = re.search(r"\[([A-Za-z0-9_]+)\]", line)
    #             if m is not None:
    #                 resp_code = m.group(1)
    #                 if resp_code != 200:
    #                     return '200', resp_code
    #     return '200', '200'

    # # End state is reached when system is down or return error code like '500','404'
    # def get_next_state(self):
    #     self.logger.info('[GET_NEXT_STATE]')
    #     f = open(self.chaos_dir + self.chaos_journal)
    #     data = json.load(f)
    #
    #     # before the experiment (if before steady state is false, after is null?)
    #     for probe in data['steady_states']['before']['probes']:
    #         if not probe['tolerance_met']:
    #             # start_state = probe['activity']['tolerance']
    #             # end_state = probe['status']
    #             start_state, end_state = None, None
    #             return start_state, end_state
    #
    #     # after the experiment
    #     for probe in data['steady_states']['after']['probes']:
    #         # if probe['output']['status'] == probe['activity']['tolerance']:
    #         if not probe['tolerance_met']:
    #             # print(probe)
    #             start_state = probe['activity']['tolerance']
    #             end_state = probe['output']['status']
    #             # end_state = probe['status']
    #             return start_state, end_state
    #     # if tolerances for all probes are met
    #     start_state = probe['activity']['tolerance']
    #     end_state = probe['activity']['tolerance']
    #     return start_state, end_state
10 utils/chaos_ai/src/utils.py Normal file
@@ -0,0 +1,10 @@
import re


def get_load(fault):
    params = re.findall(r'\(.*?\)', fault)
    load = 100
    if len(params) > 0:
        load = params[0].strip('()')
        fault = fault.strip(params[0])
    return fault, load
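A quick sketch of how `get_load` behaves (hypothetical inputs, not part of the committed file): the parenthesized suffix, when present, is returned as the load (a string); otherwise the default load of 100 is used.

```python
# Hypothetical usage of get_load; values are illustrative only
fault, load = get_load('node-cpu-hog(50)')  # -> ('node-cpu-hog', '50')
fault, load = get_load('pod-delete')        # -> ('pod-delete', 100)
```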