Mirror of https://github.com/krkn-chaos/krkn.git, synced 2026-02-14 09:59:59 +00:00
Install pre-commit and use GitHub Actions (#94)
* added pre-commit and code-cleaning
* removed tox and TravisCI
.github/workflows/ci.yml (vendored, new file, 22 lines added)
@@ -0,0 +1,22 @@
name: Code Quality Check

on:
  - push
  - pull_request

jobs:
  lint-ci:
    runs-on: ubuntu-latest
    name: Run pre-commit and install test
    steps:
      - name: Check out source repository
        uses: actions/checkout@v2
      - name: Set up Python environment
        uses: actions/setup-python@v1
        with:
          python-version: "3.8"
      - name: Run pre-commit
        uses: pre-commit/action@v2.0.3
      - name: Install Kraken
        run: |
          python setup.py develop
.gitignore (vendored, 1 line removed)
@@ -31,7 +31,6 @@ tags
# Unittest and coverage
htmlcov/*
.coverage
.tox
junit.xml
coverage.xml
.pytest_cache/
.pre-commit-config.yaml (new file, 30 lines added)
@@ -0,0 +1,30 @@
---
repos:
  - repo: git://github.com/Lucas-C/pre-commit-hooks
    rev: v1.1.1
    hooks:
      - id: remove-tabs

  - repo: git://github.com/pre-commit/pre-commit-hooks
    rev: v2.0.0
    hooks:
      - id: trailing-whitespace
      - id: check-merge-conflict
      - id: end-of-file-fixer
      - id: check-case-conflict
      - id: detect-private-key
      - id: check-ast

  - repo: https://github.com/psf/black
    rev: 19.10b0
    hooks:
      - id: black
        args: ['--line-length', '120']

  - repo: https://gitlab.com/PyCQA/flake8
    rev: '3.7.8'
    hooks:
      - id: flake8
        additional_dependencies: ['pep8-naming']
        # Ignore all format-related checks as Black takes care of those.
        args: ['--ignore', 'E123,E125', '--select', 'E,W,F', '--max-line-length=120']
.travis.yml (15 lines removed)
@@ -1,15 +0,0 @@
dist: xenial

langauge: python

python: "3.6.8"

before_install:
  - sudo apt-get update

install:
  - sudo apt install python3-pip
  - sudo pip3 install -r test-requirements.txt
  - sudo pip3 install tox

script: tox .
@@ -18,4 +18,4 @@ scenarios:
# The actions will be executed in the order specified
actions:
- checkPodCount:
count: 3
count: 3
@@ -5,9 +5,9 @@ import logging


def run(cmd):
    try:
        output = subprocess.Popen(cmd, shell=True,
                                  universal_newlines=True, stdout=subprocess.PIPE,
                                  stderr=subprocess.STDOUT)
        output = subprocess.Popen(
            cmd, shell=True, universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT
        )
        (out, err) = output.communicate()
        logging.info("out " + str(out))
    except Exception as e:
@@ -19,4 +19,3 @@ scenarios:
actions:
- checkPodCount:
count: 2
@@ -15,8 +15,11 @@ def list_namespaces():
        cli = client.CoreV1Api()
        ret = cli.list_namespace(pretty=True)
    except ApiException as e:
        logging.error("Exception when calling \
            CoreV1Api->list_namespaced_pod: %s\n" % e)
        logging.error(
            "Exception when calling \
            CoreV1Api->list_namespaced_pod: %s\n"
            % e
        )
    for namespace in ret.items:
        namespaces.append(namespace.metadata.name)
    return namespaces
@@ -47,9 +50,9 @@ def check_namespaces(namespaces):


def run(cmd):
    try:
        output = subprocess.Popen(cmd, shell=True,
                                  universal_newlines=True, stdout=subprocess.PIPE,
                                  stderr=subprocess.STDOUT)
        output = subprocess.Popen(
            cmd, shell=True, universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT
        )
        (out, err) = output.communicate()
    except Exception as e:
        logging.error("Failed to run %s, error: %s" % (cmd, e))
@@ -42,7 +42,7 @@ Monitoring the Kubernetes/OpenShift cluster to observe the impact of Kraken chao
- Blog post emphasizing the importance of making Chaos part of Performance and Scale runs to mimic the production environments: https://www.openshift.com/blog/making-chaos-part-of-kubernetes/openshift-performance-and-scalability-tests

### Contributions
We are always looking for more enhancements, fixes to make it better, any contributions are most welcome. Feel free to report or work on the issues filed on github.
We are always looking for more enhancements, fixes to make it better, any contributions are most welcome. Feel free to report or work on the issues filed on github.

[More information on how to Contribute](docs/contribute.md)
@@ -25,7 +25,7 @@ scenarios: "{{ lookup('env', 'SCENARIOS')|default('[[scenarios/etcd.yml, scenari

exit_on_failure: "{{ lookup('env', 'EXIT_ON_FAILURE')|default(false, true) }}"

# Cerberus enabled by user
# Cerberus enabled by user
cerberus_enabled: "{{ lookup('env', 'CERBERUS_ENABLED')|default(false, true) }}"
cerberus_url: "{{ lookup('env', 'CERBERUS_URL')|default('', true) }}"
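For context, these env-templated defaults feed the Cerberus settings Kraken reads at run time. The sketch below is an illustration only; the key names and nesting are assumptions and are not part of this commit.

```
cerberus:
    cerberus_enabled: False    # enable checks against a running Cerberus instance (CERBERUS_ENABLED)
    cerberus_url:              # http url of the Cerberus instance (CERBERUS_URL)
```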
@@ -1,12 +1,12 @@
### Kraken image

Container image gets automatically built by quay.io at [Kraken image](https://quay.io/repository/openshift-scale/kraken).
Container image gets automatically built by quay.io at [Kraken image](https://quay.io/repository/openshift-scale/kraken).

### Run containerized version
Refer [instructions](https://github.com/cloud-bulldozer/kraken/blob/master/docs/installation.md#run-containerized-version) for information on how to run the containerized version of kraken.


### Run Custom Kraken Image
### Run Custom Kraken Image
Refer to [instructions](https://github.com/cloud-bulldozer/kraken/blob/master/containers/build_own_image-README.md) for information on how to run a custom containerized version of kraken using podman
@@ -25,4 +25,4 @@ To run containerized Kraken as a Kubernetes/OpenShift Deployment, follow these s
8. In Openshift, add privileges to service account and execute `oc adm policy add-scc-to-user privileged -z useroot`.
9. Create a Deployment and a NodePort Service using `kubectl apply -f kraken.yml`

NOTE: It is not recommended to run Kraken internal to the cluster as the pod which is running Kraken might get disrupted.
NOTE: It is not recommended to run Kraken internal to the cluster as the pod which is running Kraken might get disrupted.
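Step 9 applies a `kraken.yml` manifest. A minimal sketch of what such a Deployment plus NodePort Service could look like is shown below; the names, labels, image tag, and port are illustrative assumptions rather than the manifest shipped in the repository.

```
apiVersion: apps/v1
kind: Deployment
metadata:
  name: kraken
  labels:
    app: kraken
spec:
  replicas: 1
  selector:
    matchLabels:
      app: kraken
  template:
    metadata:
      labels:
        app: kraken
    spec:
      serviceAccountName: useroot   # service account granted privileges in step 8
      containers:
        - name: kraken
          image: quay.io/openshift-scale/kraken:latest   # image referenced earlier in the docs
          ports:
            - containerPort: 8081   # assumed port; adjust to the real manifest
---
apiVersion: v1
kind: Service
metadata:
  name: kraken
spec:
  type: NodePort
  selector:
    app: kraken
  ports:
    - port: 8081        # assumed to match the container port above
      targetPort: 8081
```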
@@ -10,4 +10,4 @@
1. Git clone the Kraken repository using `git clone https://github.com/cloud-bulldozer/kraken.git` on an IBM Power Systems server.
2. Modify the python code and yaml files to address your needs.
3. Execute `podman build -t <new_image_name>:latest -f Dockerfile-ppc64le` in the containers directory within kraken to build an image from the Dockerfile for Power.
4. Execute `podman run --detach --name <container_name> <new_image_name>:latest` to start a container based on your new image.
4. Execute `podman run --detach --name <container_name> <new_image_name>:latest` to start a container based on your new image.
@@ -22,15 +22,17 @@ $ git push
```

## Fix Formatting
You can do this before your first commit but please take a look at the formatting outlined using tox.
Kraken uses the [pre-commit](https://pre-commit.com) framework to maintain code linting and python code styling.
The CI would run the pre-commit check on each pull request.
We encourage our contributors to follow the same pattern while contributing to the code.

To run:
The pre-commit configuration file is present in the repository `.pre-commit-config.yaml`
It contains the different code styling and linting guides which we use for the application.

```pip install tox ```(if not already installed)
The following command can be used to run pre-commit:
`pre-commit run --all-files`

```tox```

Fix all spacing, import issues and other formatting issues
If pre-commit is not installed on your system, it can be installed with: `pip install pre-commit`

## Squash Commits
If there are multiple commits, please rebase/squash multiple commits
@@ -50,6 +52,3 @@ Push your rebased commits (you may need to force), then issue your PR.
```
$ git push origin <my-working-branch> --force
```
@@ -1,6 +1,6 @@
### Litmus Scenarios
Kraken consumes [Litmus](https://github.com/litmuschaos/litmus) under the hood for some infrastructure, pod, and node scenarios


Official Litmus documentation and to read more information on specifics of Litmus resources can be found [here](https://docs.litmuschaos.io/docs/next/getstarted/)
@@ -10,32 +10,29 @@ There are 3 custom resources that are created during each Litmus scenario. Below
* ChaosExperiment: A resource to group the configuration parameters of a chaos experiment. ChaosExperiment CRs are created by the operator when experiments are invoked by ChaosEngine.
* ChaosResult : A resource to hold the results of a chaos-experiment. The Chaos-exporter reads the results and exports the metrics into a configured Prometheus server.

### Understanding Litmus Scenarios
### Understanding Litmus Scenarios

To run Litmus scenarios we need to apply 3 different resources/yaml files to our cluster
1. **Chaos experiments** contain the actual chaos details of a scenario

   i. This is installed automatically by Kraken (does not need to be specified in kraken scenario configuration)

2. **Service Account**: should be created to allow chaosengine to run experiments in your application namespace. Usually sets just enough permissions to a specific namespace to be able to run the experiment properly

2. **Service Account**: should be created to allow chaosengine to run experiments in your application namespace. Usually sets just enough permissions to a specific namespace to be able to run the experiment properly

   i. This can be defined using either a link to a yaml file or a downloaded file in the scenarios folder

3. **Chaos Engine** connects the application instance to a Chaos Experiment. This is where you define the specifics of your scenario; ie: the node or pod name you want to cause chaos within

3. **Chaos Engine** connects the application instance to a Chaos Experiment. This is where you define the specifics of your scenario; ie: the node or pod name you want to cause chaos within

   i. This is a downloaded yaml file in the scenarios folder, full list of scenarios can be found [here](https://hub.litmuschaos.io/)

**NOTE**: By default all chaos experiments will be installed based on the version you give in the config file.
**NOTE**: By default all chaos experiments will be installed based on the version you give in the config file.

Adding a new Litmus based scenario is as simple as adding references to 2 new yaml files (the Service Account and Chaos engine files for your scenario ) in the Kraken config.

### Current Scenarios

Following are the start of scenarios for which a chaos scenario config exists today.
Following are the start of scenarios for which a chaos scenario config exists today.

Component | Description | Working
------------------------ | ---------------------------------------------------------------------------------------------------| ------------------------- |
Node CPU Hog | Chaos scenario that hogs up the CPU on a defined node for a specific amount of time | :heavy_check_mark: |
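To make point 3 concrete, a minimal ChaosEngine for the Node CPU Hog scenario in the table above might look roughly like the sketch below; the namespace, service account name, and environment variables are illustrative assumptions, and the authoritative examples live on the Litmus hub linked above.

```
apiVersion: litmuschaos.io/v1alpha1
kind: ChaosEngine
metadata:
  name: node-cpu-hog          # illustrative name
  namespace: default          # assumed namespace
spec:
  engineState: active
  chaosServiceAccount: node-cpu-hog-sa   # the Service Account from point 2
  experiments:
    - name: node-cpu-hog
      spec:
        components:
          env:
            - name: TOTAL_CHAOS_DURATION   # assumed experiment variables
              value: "60"
            - name: TARGET_NODES
              value: "worker-node-1"
```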
@@ -16,7 +16,7 @@ Following node chaos scenarios are supported:

**NOTE**: node_start_scenario, node_stop_scenario, node_stop_start_scenario, node_termination_scenario, node_reboot_scenario and stop_start_kubelet_scenario are supported only on AWS and GCP as of now.

#### AWS
#### AWS

**NOTE**: For clusters with AWS make sure [AWS CLI](https://docs.aws.amazon.com/cli/latest/userguide/install-cliv2.html) is installed and properly [configured](https://docs.aws.amazon.com/cli/latest/userguide/cli-configure-quickstart.html) using an AWS account
@@ -24,9 +24,9 @@ Following node chaos scenarios are supported:
**NOTE**: For clusters with GCP make sure [GCP CLI](https://cloud.google.com/sdk/docs/install#linux) is installed.

A google service account is required to give proper authentication to GCP for node actions. See [here](https://cloud.google.com/docs/authentication/getting-started) for how to create a service account.

**NOTE**: A user with 'resourcemanager.projects.setIamPolicy' permission is required to grant project-level permissions to the service account.

After creating the service account you'll need to enable the account using the following: ```export GOOGLE_APPLICATION_CREDENTIALS="<serviceaccount.json>"```

#### OPENSTACK
@@ -47,7 +47,7 @@ You will also need to create a service principal and give it the correct access,

To properly run the service principal requires “Azure Active Directory Graph/Application.ReadWrite.OwnedBy” api permission granted and “User Access Administrator”

Before running you'll need to set the following:
Before running you'll need to set the following:
1. Login using ```az login```

2. ```export AZURE_TENANT_ID=<tenant_id>```
@@ -87,15 +87,15 @@ node_scenarios:
label_selector: node-role.kubernetes.io/infra
instance_kill_count: 1
timeout: 120
- actions:
- actions:
- stop_start_helper_node_scenario # node chaos scenario for helper node
instance_kill_count: 1
timeout: 120
instance_kill_count: 1
timeout: 120
helper_node_ip: # ip address of the helper node
service: # check status of the services on the helper node
- haproxy
- dhcpd
- named
ssh_private_key: /root/.ssh/id_rsa # ssh key to access the helper node
cloud_type: openstack
cloud_type: openstack
```
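The excerpt above shows the helper-node entry for OpenStack. For comparison, a minimal AWS entry might look like the sketch below; the action, label selector, and counts are illustrative and simply follow the keys used in the example above.

```
node_scenarios:
  - actions:                  # node chaos scenarios to be injected
      - node_stop_start_scenario
    node_name:                # leave empty to pick nodes by label instead
    label_selector: node-role.kubernetes.io/worker   # assumed label
    instance_kill_count: 1
    timeout: 120
    cloud_type: aws
```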
@@ -1,5 +1,5 @@
### Pod Scenarios
Kraken consumes [Powerfulseal](https://github.com/powerfulseal/powerfulseal) under the hood to run the pod scenarios.
Kraken consumes [Powerfulseal](https://github.com/powerfulseal/powerfulseal) under the hood to run the pod scenarios.


#### Pod chaos scenarios
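Since the pod scenarios are PowerfulSeal policy files, a minimal policy sketch is shown below for orientation; the namespace, selector, and sample size are placeholders, and the scenario files in the repository should be treated as the reference.

```
config:
  runStrategy:
    runs: 1
scenarios:
  - name: "kill a random application pod"
    steps:
      - podAction:
          matches:
            - labels:
                namespace: "my-app"        # placeholder namespace
                selector: "app=my-app"     # placeholder label selector
          filters:
            - randomSample:
                size: 1
          actions:
            - kill:
                probability: 1
                force: true
```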
@@ -27,4 +27,4 @@ time_scenarios:
- action: skew_date
object_type: node
label_selector: node-role.kubernetes.io/worker
```
```
@@ -5,9 +5,9 @@ import logging
# Invokes a given command and returns the stdout
def invoke(command):
    try:
        output = subprocess.Popen(command, shell=True,
                                  universal_newlines=True, stdout=subprocess.PIPE,
                                  stderr=subprocess.STDOUT)
        output = subprocess.Popen(
            command, shell=True, universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT
        )
        (out, err) = output.communicate()
    except Exception as e:
        logging.error("Failed to run %s, error: %s" % (command, e))
@@ -55,8 +55,11 @@ def list_pods(namespace):
    try:
        ret = cli.list_namespaced_pod(namespace, pretty=True)
    except ApiException as e:
        logging.error("Exception when calling \
            CoreV1Api->list_namespaced_pod: %s\n" % e)
        logging.error(
            "Exception when calling \
            CoreV1Api->list_namespaced_pod: %s\n"
            % e
        )
    for pod in ret.items:
        pods.append(pod.metadata.name)
    return pods
@@ -76,11 +79,18 @@ def get_all_pods(label_selector=None):
# Execute command in pod
def exec_cmd_in_pod(command, pod_name, namespace):

    exec_command = ['bash', '-c', command]
    exec_command = ["bash", "-c", command]
    try:
        ret = stream(cli.connect_get_namespaced_pod_exec, pod_name, namespace,
                     command=exec_command, stderr=True, stdin=False,
                     stdout=True, tty=False)
        ret = stream(
            cli.connect_get_namespaced_pod_exec,
            pod_name,
            namespace,
            command=exec_command,
            stderr=True,
            stdin=False,
            stdout=True,
            tty=False,
        )
    except Exception:
        return False
    return ret
@@ -91,8 +101,11 @@ def get_node_status(node):
|
||||
try:
|
||||
node_info = cli.read_node_status(node, pretty=True)
|
||||
except ApiException as e:
|
||||
logging.error("Exception when calling \
|
||||
CoreV1Api->read_node_status: %s\n" % e)
|
||||
logging.error(
|
||||
"Exception when calling \
|
||||
CoreV1Api->read_node_status: %s\n"
|
||||
% e
|
||||
)
|
||||
for condition in node_info.status.conditions:
|
||||
if condition.type == "Ready":
|
||||
return condition.status
|
||||
@@ -107,8 +120,11 @@ def monitor_nodes():
|
||||
try:
|
||||
node_info = cli.read_node_status(node, pretty=True)
|
||||
except ApiException as e:
|
||||
logging.error("Exception when calling \
|
||||
CoreV1Api->read_node_status: %s\n" % e)
|
||||
logging.error(
|
||||
"Exception when calling \
|
||||
CoreV1Api->read_node_status: %s\n"
|
||||
% e
|
||||
)
|
||||
for condition in node_info.status.conditions:
|
||||
if condition.type == "KernelDeadlock":
|
||||
node_kerneldeadlock_status = condition.status
|
||||
@@ -116,10 +132,7 @@ def monitor_nodes():
|
||||
node_ready_status = condition.status
|
||||
else:
|
||||
continue
|
||||
if (
|
||||
node_kerneldeadlock_status != "False" # noqa
|
||||
or node_ready_status != "True" # noqa
|
||||
):
|
||||
if node_kerneldeadlock_status != "False" or node_ready_status != "True": # noqa # noqa
|
||||
notready_nodes.append(node)
|
||||
if len(notready_nodes) != 0:
|
||||
status = False
|
||||
@@ -135,15 +148,15 @@ def monitor_namespace(namespace):
|
||||
notready_pods = []
|
||||
for pod in pods:
|
||||
try:
|
||||
pod_info = cli.read_namespaced_pod_status(pod, namespace,
|
||||
pretty=True)
|
||||
pod_info = cli.read_namespaced_pod_status(pod, namespace, pretty=True)
|
||||
except ApiException as e:
|
||||
logging.error("Exception when calling \
|
||||
CoreV1Api->read_namespaced_pod_status: %s\n" % e)
|
||||
logging.error(
|
||||
"Exception when calling \
|
||||
CoreV1Api->read_namespaced_pod_status: %s\n"
|
||||
% e
|
||||
)
|
||||
pod_status = pod_info.status.phase
|
||||
if pod_status != "Running" \
|
||||
and pod_status != "Completed" \
|
||||
and pod_status != "Succeeded":
|
||||
if pod_status != "Running" and pod_status != "Completed" and pod_status != "Succeeded":
|
||||
notready_pods.append(pod)
|
||||
if len(notready_pods) != 0:
|
||||
status = False
|
||||
@@ -154,10 +167,8 @@ def monitor_namespace(namespace):
|
||||
|
||||
# Monitor component namespace
|
||||
def monitor_component(iteration, component_namespace):
|
||||
watch_component_status, failed_component_pods = \
|
||||
monitor_namespace(component_namespace)
|
||||
logging.info("Iteration %s: %s: %s"
|
||||
% (iteration, component_namespace, watch_component_status))
|
||||
watch_component_status, failed_component_pods = monitor_namespace(component_namespace)
|
||||
logging.info("Iteration %s: %s: %s" % (iteration, component_namespace, watch_component_status))
|
||||
return watch_component_status, failed_component_pods
|
||||
|
||||
|
||||
@@ -178,7 +189,7 @@ def find_kraken_node():
|
||||
runcommand.invoke("kubectl config set-context --current --namespace=" + str(kraken_project))
|
||||
pod_json_str = runcommand.invoke("kubectl get pods/" + str(kraken_pod_name) + " -o json")
|
||||
pod_json = json.loads(pod_json_str)
|
||||
node_name = pod_json['spec']['nodeName']
|
||||
node_name = pod_json["spec"]["nodeName"]
|
||||
|
||||
# Reset to the default project
|
||||
runcommand.invoke("kubectl config set-context --current --namespace=default")
|
||||
|
||||
@@ -6,12 +6,13 @@ import sys
|
||||
|
||||
# Install litmus and wait until pod is running
|
||||
def install_litmus(version):
|
||||
runcommand.invoke("kubectl apply -f "
|
||||
"https://litmuschaos.github.io/litmus/litmus-operator-%s.yaml" % version)
|
||||
runcommand.invoke("kubectl apply -f " "https://litmuschaos.github.io/litmus/litmus-operator-%s.yaml" % version)
|
||||
|
||||
runcommand.invoke("oc patch -n litmus deployment.apps/chaos-operator-ce --type=json --patch ' "
|
||||
"[ { \"op\": \"add\", \"path\": \"/spec/template/spec/containers/0/env/-\", "
|
||||
"\"value\": { \"name\": \"ANALYTICS\", \"value\": \"FALSE\" } } ]'")
|
||||
runcommand.invoke(
|
||||
"oc patch -n litmus deployment.apps/chaos-operator-ce --type=json --patch ' "
|
||||
'[ { "op": "add", "path": "/spec/template/spec/containers/0/env/-", '
|
||||
'"value": { "name": "ANALYTICS", "value": "FALSE" } } ]\''
|
||||
)
|
||||
|
||||
runcommand.invoke("oc wait deploy -n litmus chaos-operator-ce --for=condition=Available")
|
||||
|
||||
@@ -19,14 +20,13 @@ def install_litmus(version):
|
||||
def deploy_all_experiments(version_string):
|
||||
|
||||
if not version_string.startswith("v"):
|
||||
logging.error("Incorrect version string for litmus, needs to start with 'v' "
|
||||
"followed by a number")
|
||||
logging.error("Incorrect version string for litmus, needs to start with 'v' " "followed by a number")
|
||||
sys.exit(1)
|
||||
version = version_string[1:]
|
||||
|
||||
runcommand.invoke("kubectl apply -f "
|
||||
"https://hub.litmuschaos.io/api/chaos/%s?file=charts/generic/experiments.yaml"
|
||||
% version)
|
||||
runcommand.invoke(
|
||||
"kubectl apply -f " "https://hub.litmuschaos.io/api/chaos/%s?file=charts/generic/experiments.yaml" % version
|
||||
)
|
||||
|
||||
|
||||
def delete_experiments():
|
||||
@@ -35,16 +35,18 @@ def delete_experiments():
|
||||
|
||||
# Check status of experiment
|
||||
def check_experiment(engine_name, experiment_name, namespace):
|
||||
chaos_engine = runcommand.invoke("kubectl get chaosengines/%s -n %s -o jsonpath="
|
||||
"'{.status.engineStatus}'" % (engine_name, namespace))
|
||||
chaos_engine = runcommand.invoke(
|
||||
"kubectl get chaosengines/%s -n %s -o jsonpath=" "'{.status.engineStatus}'" % (engine_name, namespace)
|
||||
)
|
||||
engine_status = chaos_engine.strip()
|
||||
max_tries = 30
|
||||
engine_counter = 0
|
||||
while engine_status.lower() != "running" and engine_status.lower() != "completed":
|
||||
time.sleep(10)
|
||||
logging.info("Waiting for engine to start running.")
|
||||
chaos_engine = runcommand.invoke("kubectl get chaosengines/%s -n %s -o jsonpath="
|
||||
"'{.status.engineStatus}'" % (engine_name, namespace))
|
||||
chaos_engine = runcommand.invoke(
|
||||
"kubectl get chaosengines/%s -n %s -o jsonpath=" "'{.status.engineStatus}'" % (engine_name, namespace)
|
||||
)
|
||||
engine_status = chaos_engine.strip()
|
||||
if engine_counter >= max_tries:
|
||||
logging.error("Chaos engine took longer than 5 minutes to be running or complete")
|
||||
@@ -57,19 +59,21 @@ def check_experiment(engine_name, experiment_name, namespace):
|
||||
|
||||
if not chaos_engine:
|
||||
return False
|
||||
chaos_result = runcommand.invoke("kubectl get chaosresult %s"
|
||||
"-%s -n %s -o "
|
||||
"jsonpath='{.status.experimentstatus.verdict}'"
|
||||
% (engine_name, experiment_name, namespace))
|
||||
chaos_result = runcommand.invoke(
|
||||
"kubectl get chaosresult %s"
|
||||
"-%s -n %s -o "
|
||||
"jsonpath='{.status.experimentstatus.verdict}'" % (engine_name, experiment_name, namespace)
|
||||
)
|
||||
result_counter = 0
|
||||
status = chaos_result.strip()
|
||||
while status == "Awaited":
|
||||
logging.info("Waiting for chaos result to finish, sleeping 10 seconds")
|
||||
time.sleep(10)
|
||||
chaos_result = runcommand.invoke("kubectl get chaosresult %s"
|
||||
"-%s -n %s -o "
|
||||
"jsonpath='{.status.experimentstatus.verdict}'"
|
||||
% (engine_name, experiment_name, namespace))
|
||||
chaos_result = runcommand.invoke(
|
||||
"kubectl get chaosresult %s"
|
||||
"-%s -n %s -o "
|
||||
"jsonpath='{.status.experimentstatus.verdict}'" % (engine_name, experiment_name, namespace)
|
||||
)
|
||||
status = chaos_result.strip()
|
||||
if result_counter >= max_tries:
|
||||
logging.error("Chaos results took longer than 5 minutes to get a final result")
|
||||
@@ -82,10 +86,11 @@ def check_experiment(engine_name, experiment_name, namespace):
|
||||
if status == "Pass":
|
||||
return True
|
||||
else:
|
||||
chaos_result = runcommand.invoke("kubectl get chaosresult %s"
|
||||
"-%s -n %s -o jsonpath="
|
||||
"'{.status.experimentstatus.failStep}'" %
|
||||
(engine_name, experiment_name, namespace))
|
||||
chaos_result = runcommand.invoke(
|
||||
"kubectl get chaosresult %s"
|
||||
"-%s -n %s -o jsonpath="
|
||||
"'{.status.experimentstatus.failStep}'" % (engine_name, experiment_name, namespace)
|
||||
)
|
||||
logging.info("Chaos result failed information: " + str(chaos_result))
|
||||
return False
|
||||
|
||||
@@ -99,5 +104,4 @@ def delete_chaos(namespace):
|
||||
|
||||
# Uninstall litmus operator
|
||||
def uninstall_litmus(version):
|
||||
runcommand.invoke("kubectl delete -f "
|
||||
"https://litmuschaos.github.io/litmus/litmus-operator-%s.yaml" % version)
|
||||
runcommand.invoke("kubectl delete -f " "https://litmuschaos.github.io/litmus/litmus-operator-%s.yaml" % version)
|
||||
|
||||
@@ -46,8 +46,9 @@ class abstract_node_scenarios:
|
||||
logging.info("The kubelet of the node %s has been stopped" % (node))
|
||||
logging.info("stop_kubelet_scenario has been successfuly injected!")
|
||||
except Exception as e:
|
||||
logging.error("Failed to stop the kubelet of the node. Encountered following "
|
||||
"exception: %s. Test Failed" % (e))
|
||||
logging.error(
|
||||
"Failed to stop the kubelet of the node. Encountered following " "exception: %s. Test Failed" % (e)
|
||||
)
|
||||
logging.error("stop_kubelet_scenario injection failed!")
|
||||
sys.exit(1)
|
||||
|
||||
@@ -64,12 +65,12 @@ class abstract_node_scenarios:
|
||||
try:
|
||||
logging.info("Starting node_crash_scenario injection")
|
||||
logging.info("Crashing the node %s" % (node))
|
||||
runcommand.invoke("oc debug node/" + node + " -- chroot /host "
|
||||
"dd if=/dev/urandom of=/proc/sysrq-trigger")
|
||||
runcommand.invoke(
|
||||
"oc debug node/" + node + " -- chroot /host " "dd if=/dev/urandom of=/proc/sysrq-trigger"
|
||||
)
|
||||
logging.info("node_crash_scenario has been successfuly injected!")
|
||||
except Exception as e:
|
||||
logging.error("Failed to crash the node. Encountered following exception: %s. "
|
||||
"Test Failed" % (e))
|
||||
logging.error("Failed to crash the node. Encountered following exception: %s. " "Test Failed" % (e))
|
||||
logging.error("node_crash_scenario injection failed!")
|
||||
sys.exit(1)
|
||||
|
||||
|
||||
@@ -9,56 +9,42 @@ from kraken.node_actions.abstract_node_scenarios import abstract_node_scenarios
|
||||
|
||||
class AWS:
|
||||
def __init__(self):
|
||||
self.boto_client = boto3.client('ec2')
|
||||
self.boto_instance = boto3.resource('ec2').Instance('id')
|
||||
self.boto_client = boto3.client("ec2")
|
||||
self.boto_instance = boto3.resource("ec2").Instance("id")
|
||||
|
||||
# Get the instance ID of the node
|
||||
def get_instance_id(self, node):
|
||||
return self.boto_client.describe_instances(
|
||||
Filters=[{'Name': 'private-dns-name', 'Values': [node]}]
|
||||
)['Reservations'][0]['Instances'][0]['InstanceId']
|
||||
return self.boto_client.describe_instances(Filters=[{"Name": "private-dns-name", "Values": [node]}])[
|
||||
"Reservations"
|
||||
][0]["Instances"][0]["InstanceId"]
|
||||
|
||||
# Start the node instance
|
||||
def start_instances(self, instance_id):
|
||||
self.boto_client.start_instances(
|
||||
InstanceIds=[instance_id]
|
||||
)
|
||||
self.boto_client.start_instances(InstanceIds=[instance_id])
|
||||
|
||||
# Stop the node instance
|
||||
def stop_instances(self, instance_id):
|
||||
self.boto_client.stop_instances(
|
||||
InstanceIds=[instance_id]
|
||||
)
|
||||
self.boto_client.stop_instances(InstanceIds=[instance_id])
|
||||
|
||||
# Terminate the node instance
|
||||
def terminate_instances(self, instance_id):
|
||||
self.boto_client.terminate_instances(
|
||||
InstanceIds=[instance_id]
|
||||
)
|
||||
self.boto_client.terminate_instances(InstanceIds=[instance_id])
|
||||
|
||||
# Reboot the node instance
|
||||
def reboot_instances(self, instance_id):
|
||||
self.boto_client.reboot_instances(
|
||||
InstanceIds=[instance_id]
|
||||
)
|
||||
self.boto_client.reboot_instances(InstanceIds=[instance_id])
|
||||
|
||||
# Wait until the node instance is running
|
||||
def wait_until_running(self, instance_id):
|
||||
self.boto_instance.wait_until_running(
|
||||
InstanceIds=[instance_id]
|
||||
)
|
||||
self.boto_instance.wait_until_running(InstanceIds=[instance_id])
|
||||
|
||||
# Wait until the node instance is stopped
|
||||
def wait_until_stopped(self, instance_id):
|
||||
self.boto_instance.wait_until_stopped(
|
||||
InstanceIds=[instance_id]
|
||||
)
|
||||
self.boto_instance.wait_until_stopped(InstanceIds=[instance_id])
|
||||
|
||||
# Wait until the node instance is terminated
|
||||
def wait_until_terminated(self, instance_id):
|
||||
self.boto_instance.wait_until_terminated(
|
||||
InstanceIds=[instance_id]
|
||||
)
|
||||
self.boto_instance.wait_until_terminated(InstanceIds=[instance_id])
|
||||
|
||||
|
||||
class aws_node_scenarios(abstract_node_scenarios):
|
||||
@@ -78,8 +64,9 @@ class aws_node_scenarios(abstract_node_scenarios):
|
||||
logging.info("Node with instance ID: %s is in running state" % (instance_id))
|
||||
logging.info("node_start_scenario has been successfully injected!")
|
||||
except Exception as e:
|
||||
logging.error("Failed to start node instance. Encountered following "
|
||||
"exception: %s. Test Failed" % (e))
|
||||
logging.error(
|
||||
"Failed to start node instance. Encountered following " "exception: %s. Test Failed" % (e)
|
||||
)
|
||||
logging.error("node_start_scenario injection failed!")
|
||||
sys.exit(1)
|
||||
|
||||
@@ -95,8 +82,7 @@ class aws_node_scenarios(abstract_node_scenarios):
|
||||
logging.info("Node with instance ID: %s is in stopped state" % (instance_id))
|
||||
nodeaction.wait_for_unknown_status(node, timeout)
|
||||
except Exception as e:
|
||||
logging.error("Failed to stop node instance. Encountered following exception: %s. "
|
||||
"Test Failed" % (e))
|
||||
logging.error("Failed to stop node instance. Encountered following exception: %s. " "Test Failed" % (e))
|
||||
logging.error("node_stop_scenario injection failed!")
|
||||
sys.exit(1)
|
||||
|
||||
@@ -118,8 +104,9 @@ class aws_node_scenarios(abstract_node_scenarios):
|
||||
logging.info("Node with instance ID: %s has been terminated" % (instance_id))
|
||||
logging.info("node_termination_scenario has been successfuly injected!")
|
||||
except Exception as e:
|
||||
logging.error("Failed to terminate node instance. Encountered following exception:"
|
||||
" %s. Test Failed" % (e))
|
||||
logging.error(
|
||||
"Failed to terminate node instance. Encountered following exception:" " %s. Test Failed" % (e)
|
||||
)
|
||||
logging.error("node_termination_scenario injection failed!")
|
||||
sys.exit(1)
|
||||
|
||||
@@ -136,7 +123,8 @@ class aws_node_scenarios(abstract_node_scenarios):
|
||||
logging.info("Node with instance ID: %s has been rebooted" % (instance_id))
|
||||
logging.info("node_reboot_scenario has been successfuly injected!")
|
||||
except Exception as e:
|
||||
logging.error("Failed to reboot node instance. Encountered following exception:"
|
||||
" %s. Test Failed" % (e))
|
||||
logging.error(
|
||||
"Failed to reboot node instance. Encountered following exception:" " %s. Test Failed" % (e)
|
||||
)
|
||||
logging.error("node_reboot_scenario injection failed!")
|
||||
sys.exit(1)
|
||||
|
||||
@@ -12,13 +12,13 @@ import yaml
|
||||
|
||||
class Azure:
|
||||
def __init__(self):
|
||||
logging.info('azure ' + str(self))
|
||||
logging.info("azure " + str(self))
|
||||
# Acquire a credential object using CLI-based authentication.
|
||||
credentials = DefaultAzureCredential()
|
||||
logging.info("credential " + str(credentials))
|
||||
az_account = runcommand.invoke("az account list -o yaml")
|
||||
az_account_yaml = yaml.load(az_account, Loader=yaml.FullLoader)
|
||||
subscription_id = az_account_yaml[0]['id']
|
||||
subscription_id = az_account_yaml[0]["id"]
|
||||
self.compute_client = ComputeManagementClient(credentials, subscription_id)
|
||||
|
||||
# Get the instance ID of the node
|
||||
@@ -49,8 +49,7 @@ class Azure:
|
||||
self.compute_client.virtual_machines.begin_restart(group_name, vm_name)
|
||||
|
||||
def get_vm_status(self, resource_group, vm_name):
|
||||
statuses = self.compute_client.virtual_machines.instance_view(resource_group, vm_name)\
|
||||
.statuses
|
||||
statuses = self.compute_client.virtual_machines.instance_view(resource_group, vm_name).statuses
|
||||
status = len(statuses) >= 2 and statuses[1]
|
||||
return status
|
||||
|
||||
@@ -58,7 +57,7 @@ class Azure:
|
||||
def wait_until_running(self, resource_group, vm_name, timeout):
|
||||
time_counter = 0
|
||||
status = self.get_vm_status(resource_group, vm_name)
|
||||
while status and status.code != 'PowerState/running':
|
||||
while status and status.code != "PowerState/running":
|
||||
status = self.get_vm_status(resource_group, vm_name)
|
||||
logging.info("Vm %s is still not running, sleeping for 5 seconds" % vm_name)
|
||||
time.sleep(5)
|
||||
@@ -71,7 +70,7 @@ class Azure:
|
||||
def wait_until_stopped(self, resource_group, vm_name, timeout):
|
||||
time_counter = 0
|
||||
status = self.get_vm_status(resource_group, vm_name)
|
||||
while status and status.code != 'PowerState/stopped':
|
||||
while status and status.code != "PowerState/stopped":
|
||||
status = self.get_vm_status(resource_group, vm_name)
|
||||
logging.info("Vm %s is still stopping, sleeping for 5 seconds" % vm_name)
|
||||
time.sleep(5)
|
||||
@@ -82,13 +81,11 @@ class Azure:
|
||||
|
||||
# Wait until the node instance is terminated
|
||||
def wait_until_terminated(self, resource_group, vm_name):
|
||||
statuses = self.compute_client.virtual_machines.instance_view(resource_group,
|
||||
vm_name).statuses[0]
|
||||
statuses = self.compute_client.virtual_machines.instance_view(resource_group, vm_name).statuses[0]
|
||||
logging.info("vm status " + str(statuses))
|
||||
while statuses.code == "ProvisioningState/deleting":
|
||||
try:
|
||||
statuses = self.compute_client.virtual_machines.instance_view(resource_group,
|
||||
vm_name).statuses[0]
|
||||
statuses = self.compute_client.virtual_machines.instance_view(resource_group, vm_name).statuses[0]
|
||||
logging.info("Vm %s is still deleting, waiting 10 seconds" % vm_name)
|
||||
time.sleep(10)
|
||||
except Exception:
|
||||
@@ -114,8 +111,9 @@ class azure_node_scenarios(abstract_node_scenarios):
|
||||
logging.info("Node with instance ID: %s is in running state" % node)
|
||||
logging.info("node_start_scenario has been successfully injected!")
|
||||
except Exception as e:
|
||||
logging.error("Failed to start node instance. Encountered following "
|
||||
"exception: %s. Test Failed" % (e))
|
||||
logging.error(
|
||||
"Failed to start node instance. Encountered following " "exception: %s. Test Failed" % (e)
|
||||
)
|
||||
logging.error("node_start_scenario injection failed!")
|
||||
sys.exit(1)
|
||||
|
||||
@@ -131,8 +129,7 @@ class azure_node_scenarios(abstract_node_scenarios):
|
||||
logging.info("Node with instance ID: %s is in stopped state" % node)
|
||||
nodeaction.wait_for_unknown_status(node, timeout)
|
||||
except Exception as e:
|
||||
logging.error("Failed to stop node instance. Encountered following exception: %s. "
|
||||
"Test Failed" % e)
|
||||
logging.error("Failed to stop node instance. Encountered following exception: %s. " "Test Failed" % e)
|
||||
logging.error("node_stop_scenario injection failed!")
|
||||
sys.exit(1)
|
||||
|
||||
@@ -142,8 +139,7 @@ class azure_node_scenarios(abstract_node_scenarios):
|
||||
try:
|
||||
logging.info("Starting node_termination_scenario injection")
|
||||
resource_group = self.azure.get_instance_id(node)
|
||||
logging.info("Terminating the node %s with instance ID: %s "
|
||||
% (node, resource_group))
|
||||
logging.info("Terminating the node %s with instance ID: %s " % (node, resource_group))
|
||||
self.azure.terminate_instances(resource_group, node)
|
||||
self.azure.wait_until_terminated(resource_group, node)
|
||||
for _ in range(timeout):
|
||||
@@ -155,8 +151,9 @@ class azure_node_scenarios(abstract_node_scenarios):
|
||||
logging.info("Node with instance ID: %s has been terminated" % node)
|
||||
logging.info("node_termination_scenario has been successfully injected!")
|
||||
except Exception as e:
|
||||
logging.error("Failed to terminate node instance. Encountered following exception:"
|
||||
" %s. Test Failed" % (e))
|
||||
logging.error(
|
||||
"Failed to terminate node instance. Encountered following exception:" " %s. Test Failed" % (e)
|
||||
)
|
||||
logging.error("node_termination_scenario injection failed!")
|
||||
sys.exit(1)
|
||||
|
||||
@@ -173,7 +170,8 @@ class azure_node_scenarios(abstract_node_scenarios):
|
||||
logging.info("Node with instance ID: %s has been rebooted" % (node))
|
||||
logging.info("node_reboot_scenario has been successfully injected!")
|
||||
except Exception as e:
|
||||
logging.error("Failed to reboot node instance. Encountered following exception:"
|
||||
" %s. Test Failed" % (e))
|
||||
logging.error(
|
||||
"Failed to reboot node instance. Encountered following exception:" " %s. Test Failed" % (e)
|
||||
)
|
||||
logging.error("node_reboot_scenario injection failed!")
|
||||
sys.exit(1)
|
||||
|
||||
@@ -11,8 +11,7 @@ def get_node(node_name, label_selector):
|
||||
if node_name in kubecli.list_killable_nodes():
|
||||
return node_name
|
||||
else:
|
||||
logging.info("Node with provided node_name does not exist or the node might "
|
||||
"be in NotReady state.")
|
||||
logging.info("Node with provided node_name does not exist or the node might " "be in NotReady state.")
|
||||
nodes = kubecli.list_killable_nodes(label_selector)
|
||||
if not nodes:
|
||||
raise Exception("Ready nodes with the provided label selector do not exist")
|
||||
@@ -24,8 +23,7 @@ def get_node(node_name, label_selector):
|
||||
|
||||
# Wait till node status becomes Ready
|
||||
def wait_for_ready_status(node, timeout):
|
||||
runcommand.invoke("kubectl wait --for=condition=Ready "
|
||||
"node/" + node + " --timeout=" + str(timeout) + "s")
|
||||
runcommand.invoke("kubectl wait --for=condition=Ready " "node/" + node + " --timeout=" + str(timeout) + "s")
|
||||
|
||||
|
||||
# Wait till node status becomes NotReady
|
||||
@@ -40,9 +38,9 @@ def wait_for_unknown_status(node, timeout):
|
||||
|
||||
# Get the ip of the cluster node
|
||||
def get_node_ip(node):
|
||||
return runcommand.invoke("kubectl get node %s -o "
|
||||
"jsonpath='{.status.addresses[?(@.type==\"InternalIP\")].address}'"
|
||||
% (node))
|
||||
return runcommand.invoke(
|
||||
"kubectl get node %s -o " "jsonpath='{.status.addresses[?(@.type==\"InternalIP\")].address}'" % (node)
|
||||
)
|
||||
|
||||
|
||||
def check_service_status(node, service, ssh_private_key, timeout):
|
||||
@@ -55,18 +53,20 @@ def check_service_status(node, service, ssh_private_key, timeout):
|
||||
time.sleep(sleeper)
|
||||
i += sleeper
|
||||
logging.info("Trying to ssh to instance: %s" % (node))
|
||||
connection = ssh.connect(node, username='root', key_filename=ssh_private_key,
|
||||
timeout=800, banner_timeout=400)
|
||||
connection = ssh.connect(
|
||||
node, username="root", key_filename=ssh_private_key, timeout=800, banner_timeout=400
|
||||
)
|
||||
if connection is None:
|
||||
break
|
||||
except Exception:
|
||||
pass
|
||||
for service_name in service:
|
||||
logging.info("Checking status of Service: %s" % (service_name))
|
||||
stdin, stdout, stderr = ssh.exec_command("systemctl status %s | grep '^ Active' "
|
||||
"| awk '{print $2}'" % (service_name))
|
||||
stdin, stdout, stderr = ssh.exec_command(
|
||||
"systemctl status %s | grep '^ Active' " "| awk '{print $2}'" % (service_name)
|
||||
)
|
||||
service_status = stdout.readlines()[0]
|
||||
logging.info("Status of service %s is %s \n" % (service_name, service_status.strip()))
|
||||
if(service_status.strip() != "active"):
|
||||
if service_status.strip() != "active":
|
||||
logging.error("Service %s is in %s state" % (service_name, service_status.strip()))
|
||||
ssh.close()
|
||||
|
||||
@@ -12,58 +12,50 @@ import kraken.invoke.command as runcommand
|
||||
class GCP:
|
||||
def __init__(self):
|
||||
|
||||
self.project = runcommand.invoke('gcloud config get-value project').split('/n')[0].strip()
|
||||
self.project = runcommand.invoke("gcloud config get-value project").split("/n")[0].strip()
|
||||
logging.info("project " + str(self.project) + "!")
|
||||
credentials = GoogleCredentials.get_application_default()
|
||||
self.client = discovery.build('compute', 'v1', credentials=credentials,
|
||||
cache_discovery=False)
|
||||
self.client = discovery.build("compute", "v1", credentials=credentials, cache_discovery=False)
|
||||
|
||||
# Get the instance ID of the node
|
||||
def get_instance_id(self, node):
|
||||
zone_request = self.client.zones().list(project=self.project)
|
||||
while zone_request is not None:
|
||||
zone_response = zone_request.execute()
|
||||
for zone in zone_response['items']:
|
||||
instances_request = self.client.instances().list(project=self.project,
|
||||
zone=zone['name'])
|
||||
for zone in zone_response["items"]:
|
||||
instances_request = self.client.instances().list(project=self.project, zone=zone["name"])
|
||||
while instances_request is not None:
|
||||
instance_response = instances_request.execute()
|
||||
if "items" in instance_response.keys():
|
||||
for instance in instance_response['items']:
|
||||
if instance['name'] in node:
|
||||
return instance['name'], zone['name']
|
||||
for instance in instance_response["items"]:
|
||||
if instance["name"] in node:
|
||||
return instance["name"], zone["name"]
|
||||
instances_request = self.client.zones().list_next(
|
||||
previous_request=instances_request,
|
||||
previous_response=instance_response)
|
||||
zone_request = self.client.zones().list_next(previous_request=zone_request,
|
||||
previous_response=zone_response)
|
||||
logging.info('no instances ')
|
||||
previous_request=instances_request, previous_response=instance_response
|
||||
)
|
||||
zone_request = self.client.zones().list_next(previous_request=zone_request, previous_response=zone_response)
|
||||
logging.info("no instances ")
|
||||
|
||||
# Start the node instance
|
||||
def start_instances(self, zone, instance_id):
|
||||
self.client.instances().start(project=self.project, zone=zone, instance=instance_id)\
|
||||
.execute()
|
||||
self.client.instances().start(project=self.project, zone=zone, instance=instance_id).execute()
|
||||
|
||||
# Stop the node instance
|
||||
def stop_instances(self, zone, instance_id):
|
||||
self.client.instances().stop(project=self.project, zone=zone, instance=instance_id)\
|
||||
.execute()
|
||||
self.client.instances().stop(project=self.project, zone=zone, instance=instance_id).execute()
|
||||
|
||||
# Start the node instance
|
||||
def suspend_instances(self, zone, instance_id):
|
||||
self.client.instances().suspend(project=self.project, zone=zone, instance=instance_id)\
|
||||
.execute()
|
||||
self.client.instances().suspend(project=self.project, zone=zone, instance=instance_id).execute()
|
||||
|
||||
# Terminate the node instance
|
||||
def terminate_instances(self, zone, instance_id):
|
||||
self.client.instances().delete(project=self.project, zone=zone, instance=instance_id)\
|
||||
.execute()
|
||||
self.client.instances().delete(project=self.project, zone=zone, instance=instance_id).execute()
|
||||
|
||||
# Reboot the node instance
|
||||
def reboot_instances(self, zone, instance_id):
|
||||
response = self.client.instances().reset(project=self.project, zone=zone,
|
||||
instance=instance_id).execute()
|
||||
logging.info('response reboot ' + str(response))
|
||||
response = self.client.instances().reset(project=self.project, zone=zone, instance=instance_id).execute()
|
||||
logging.info("response reboot " + str(response))
|
||||
|
||||
# Get instance status
|
||||
def get_instance_status(self, zone, instance_id, expected_status, timeout):
|
||||
@@ -72,10 +64,9 @@ class GCP:
|
||||
i = 0
|
||||
sleeper = 5
|
||||
while i <= timeout:
|
||||
instStatus = self.client.instances().get(project=self.project, zone=zone,
|
||||
instance=instance_id).execute()
|
||||
logging.info("Status of vm " + str(instStatus['status']))
|
||||
if instStatus['status'] == expected_status:
|
||||
instStatus = self.client.instances().get(project=self.project, zone=zone, instance=instance_id).execute()
|
||||
logging.info("Status of vm " + str(instStatus["status"]))
|
||||
if instStatus["status"] == expected_status:
|
||||
return True
|
||||
time.sleep(sleeper)
|
||||
i += sleeper
|
||||
@@ -83,15 +74,15 @@ class GCP:
|
||||
|
||||
# Wait until the node instance is suspended
|
||||
def wait_until_suspended(self, zone, instance_id, timeout):
|
||||
self.get_instance_status(zone, instance_id, 'SUSPENDED', timeout)
|
||||
self.get_instance_status(zone, instance_id, "SUSPENDED", timeout)
|
||||
|
||||
# Wait until the node instance is running
|
||||
def wait_until_running(self, zone, instance_id, timeout):
|
||||
self.get_instance_status(zone, instance_id, 'RUNNING', timeout)
|
||||
self.get_instance_status(zone, instance_id, "RUNNING", timeout)
|
||||
|
||||
# Wait until the node instance is stopped
|
||||
def wait_until_stopped(self, zone, instance_id, timeout):
|
||||
self.get_instance_status(zone, instance_id, 'TERMINATED', timeout)
|
||||
self.get_instance_status(zone, instance_id, "TERMINATED", timeout)
|
||||
|
||||
# Wait until the node instance is terminated
|
||||
def wait_until_terminated(self, zone, instance_id, timeout):
|
||||
@@ -99,9 +90,10 @@ class GCP:
|
||||
i = 0
|
||||
sleeper = 5
|
||||
while i <= timeout:
|
||||
instStatus = self.client.instances().get(project=self.project, zone=zone,
|
||||
instance=instance_id).execute()
|
||||
logging.info("Status of vm " + str(instStatus['status']))
|
||||
instStatus = (
|
||||
self.client.instances().get(project=self.project, zone=zone, instance=instance_id).execute()
|
||||
)
|
||||
logging.info("Status of vm " + str(instStatus["status"]))
|
||||
time.sleep(sleeper)
|
||||
except Exception as e:
|
||||
logging.info("here " + str(e))
|
||||
@@ -125,14 +117,15 @@ class gcp_node_scenarios(abstract_node_scenarios):
|
||||
logging.info("Node with instance ID: %s is in running state" % instance_id)
|
||||
logging.info("node_start_scenario has been successfully injected!")
|
||||
except Exception as e:
|
||||
logging.error("Failed to start node instance. Encountered following "
|
||||
"exception: %s. Test Failed" % (e))
|
||||
logging.error(
|
||||
"Failed to start node instance. Encountered following " "exception: %s. Test Failed" % (e)
|
||||
)
|
||||
logging.error("node_start_scenario injection failed!")
|
||||
sys.exit(1)
|
||||
|
||||
# Node scenario to stop the node
|
||||
def node_stop_scenario(self, instance_kill_count, node, timeout):
|
||||
logging.info('stop scenario')
|
||||
logging.info("stop scenario")
|
||||
for _ in range(instance_kill_count):
|
||||
try:
|
||||
logging.info("Starting node_stop_scenario injection")
|
||||
@@ -143,8 +136,7 @@ class gcp_node_scenarios(abstract_node_scenarios):
|
||||
logging.info("Node with instance ID: %s is in stopped state" % instance_id)
|
||||
nodeaction.wait_for_unknown_status(node, timeout)
|
||||
except Exception as e:
|
||||
logging.error("Failed to stop node instance. Encountered following exception: %s. "
|
||||
"Test Failed" % (e))
|
||||
logging.error("Failed to stop node instance. Encountered following exception: %s. " "Test Failed" % (e))
|
||||
logging.error("node_stop_scenario injection failed!")
|
||||
sys.exit(1)
|
||||
|
||||
@@ -166,8 +158,9 @@ class gcp_node_scenarios(abstract_node_scenarios):
|
||||
logging.info("Node with instance ID: %s has been terminated" % instance_id)
|
||||
logging.info("node_termination_scenario has been successfuly injected!")
|
||||
except Exception as e:
|
||||
logging.error("Failed to terminate node instance. Encountered following exception:"
|
||||
" %s. Test Failed" % e)
|
||||
logging.error(
|
||||
"Failed to terminate node instance. Encountered following exception:" " %s. Test Failed" % e
|
||||
)
|
||||
logging.error("node_termination_scenario injection failed!")
|
||||
sys.exit(1)
|
||||
|
||||
@@ -183,7 +176,8 @@ class gcp_node_scenarios(abstract_node_scenarios):
|
||||
logging.info("Node with instance ID: %s has been rebooted" % instance_id)
|
||||
logging.info("node_reboot_scenario has been successfuly injected!")
|
||||
except Exception as e:
|
||||
logging.error("Failed to reboot node instance. Encountered following exception:"
|
||||
" %s. Test Failed" % (e))
|
||||
logging.error(
|
||||
"Failed to reboot node instance. Encountered following exception:" " %s. Test Failed" % (e)
|
||||
)
|
||||
logging.error("node_reboot_scenario injection failed!")
|
||||
sys.exit(1)
|
||||
|
||||
@@ -13,20 +13,16 @@ class general_node_scenarios(abstract_node_scenarios):
|
||||
|
||||
# Node scenario to start the node
|
||||
def node_start_scenario(self, instance_kill_count, node, timeout):
|
||||
logging.info("Node start is not set up yet for this cloud type, "
|
||||
"no action is going to be taken")
|
||||
logging.info("Node start is not set up yet for this cloud type, " "no action is going to be taken")
|
||||
|
||||
# Node scenario to stop the node
|
||||
def node_stop_scenario(self, instance_kill_count, node, timeout):
|
||||
logging.info("Node stop is not set up yet for this cloud type,"
|
||||
" no action is going to be taken")
|
||||
logging.info("Node stop is not set up yet for this cloud type," " no action is going to be taken")
|
||||
|
||||
# Node scenario to terminate the node
|
||||
def node_termination_scenario(self, instance_kill_count, node, timeout):
|
||||
logging.info("Node termination is not set up yet for this cloud type, "
|
||||
"no action is going to be taken")
|
||||
logging.info("Node termination is not set up yet for this cloud type, " "no action is going to be taken")
|
||||
|
||||
# Node scenario to reboot the node
|
||||
def node_reboot_scenario(self, instance_kill_count, node, timeout):
|
||||
logging.info("Node reboot is not set up yet for this cloud type,"
|
||||
" no action is going to be taken")
|
||||
logging.info("Node reboot is not set up yet for this cloud type," " no action is going to be taken")
|
||||
|
||||
@@ -39,12 +39,12 @@ class OPENSTACKCLOUD:
|
||||
i = 0
|
||||
sleeper = 1
|
||||
while i <= timeout:
|
||||
instStatus = runcommand.invoke("openstack server show %s | tr -d ' ' |"
|
||||
"grep '^|status' |"
|
||||
"cut -d '|' -f3 | tr -d '\n'" % (node))
|
||||
instStatus = runcommand.invoke(
|
||||
"openstack server show %s | tr -d ' ' |" "grep '^|status' |" "cut -d '|' -f3 | tr -d '\n'" % (node)
|
||||
)
|
||||
logging.info("instance status is %s" % (instStatus))
|
||||
logging.info("expected status is %s" % (expected_status))
|
||||
if (instStatus.strip() == expected_status):
|
||||
if instStatus.strip() == expected_status:
|
||||
logging.info("instance status has reached desired status %s" % (instStatus))
|
||||
return True
|
||||
time.sleep(sleeper)
|
||||
@@ -53,7 +53,7 @@ class OPENSTACKCLOUD:
|
||||
# Get the openstack instance name
|
||||
def get_openstack_nodename(self, os_node_ip):
|
||||
server_list = runcommand.invoke("openstack server list | grep %s" % (os_node_ip))
|
||||
list_of_servers = server_list.split('\n')
|
||||
list_of_servers = server_list.split("\n")
|
||||
for item in list_of_servers:
|
||||
items = item.split("|")
|
||||
counter = 0
|
||||
@@ -63,7 +63,7 @@ class OPENSTACKCLOUD:
|
||||
logging.info("Openstack node name is %s " % (node_name))
|
||||
counter += 1
|
||||
continue
|
||||
item_list = i.split('=')
|
||||
item_list = i.split("=")
|
||||
if len(item_list) == 2 and item_list[-1].strip() == os_node_ip:
|
||||
return node_name
|
||||
counter += 1
|
||||
@@ -87,8 +87,9 @@ class openstack_node_scenarios(abstract_node_scenarios):
|
||||
logging.info("Node with instance ID: %s is in running state" % (node))
|
||||
logging.info("node_start_scenario has been successfully injected!")
|
||||
except Exception as e:
|
||||
logging.error("Failed to start node instance. Encountered following "
|
||||
"exception: %s. Test Failed" % (e))
|
||||
logging.error(
|
||||
"Failed to start node instance. Encountered following " "exception: %s. Test Failed" % (e)
|
||||
)
|
||||
logging.error("node_start_scenario injection failed!")
|
||||
sys.exit(1)
|
||||
|
||||
@@ -105,8 +106,7 @@ class openstack_node_scenarios(abstract_node_scenarios):
|
||||
logging.info("Node with instance name: %s is in stopped state" % (node))
|
||||
nodeaction.wait_for_ready_status(node, timeout)
|
||||
except Exception as e:
|
||||
logging.error("Failed to stop node instance. Encountered following exception: %s. "
|
||||
"Test Failed" % (e))
|
||||
logging.error("Failed to stop node instance. Encountered following exception: %s. " "Test Failed" % (e))
|
||||
logging.error("node_stop_scenario injection failed!")
|
||||
sys.exit(1)
|
||||
|
||||
@@ -124,8 +124,9 @@ class openstack_node_scenarios(abstract_node_scenarios):
|
||||
logging.info("Node with instance name: %s has been rebooted" % (node))
|
||||
logging.info("node_reboot_scenario has been successfuly injected!")
|
||||
except Exception as e:
|
||||
logging.error("Failed to reboot node instance. Encountered following exception:"
|
||||
" %s. Test Failed" % (e))
|
||||
logging.error(
|
||||
"Failed to reboot node instance. Encountered following exception:" " %s. Test Failed" % (e)
|
||||
)
|
||||
logging.error("node_reboot_scenario injection failed!")
|
||||
sys.exit(1)
|
||||
|
||||
@@ -141,8 +142,9 @@ class openstack_node_scenarios(abstract_node_scenarios):
|
||||
logging.info("Helper node with IP: %s is in running state" % (node_ip))
|
||||
logging.info("node_start_scenario has been successfully injected!")
|
||||
except Exception as e:
|
||||
logging.error("Failed to start node instance. Encountered following "
|
||||
"exception: %s. Test Failed" % (e))
|
||||
logging.error(
|
||||
"Failed to start node instance. Encountered following " "exception: %s. Test Failed" % (e)
|
||||
)
|
||||
logging.error("helper_node_start_scenario injection failed!")
|
||||
sys.exit(1)
|
||||
|
||||
@@ -157,8 +159,7 @@ class openstack_node_scenarios(abstract_node_scenarios):
|
||||
self.openstackcloud.wait_until_stopped(openstack_node_name)
|
||||
logging.info("Helper node with IP: %s is in stopped state" % (node_ip))
|
||||
except Exception as e:
|
||||
logging.error("Failed to stop node instance. Encountered following exception: %s. "
|
||||
"Test Failed" % (e))
|
||||
logging.error("Failed to stop node instance. Encountered following exception: %s. " "Test Failed" % (e))
|
||||
logging.error("helper_node_stop_scenario injection failed!")
|
||||
sys.exit(1)
|
||||
|
||||
@@ -169,7 +170,6 @@ class openstack_node_scenarios(abstract_node_scenarios):
|
||||
logging.info("Service status checked on %s" % (node_ip))
|
||||
logging.info("Check service status is successfuly injected!")
|
||||
except Exception as e:
|
||||
logging.error("Failed to check service status. Encountered following exception:"
|
||||
" %s. Test Failed" % (e))
|
||||
logging.error("Failed to check service status. Encountered following exception:" " %s. Test Failed" % (e))
|
||||
logging.error("helper_node_service_status injection failed!")
|
||||
sys.exit(1)
|
||||
|
||||
@@ -12,7 +12,7 @@ def setup(repo):
# delete repo to clone the latest copy if exists
subprocess.run(delete_repo, shell=True, universal_newlines=True, timeout=45)
# clone the repo
git.Repo.clone_from(repo, '/tmp/performance-dashboards')
git.Repo.clone_from(repo, "/tmp/performance-dashboards")
# deploy performance dashboards
subprocess.run(command, shell=True, universal_newlines=True)
except Exception as e:
@@ -23,52 +23,52 @@ def pod_exec(pod_name, command, namespace):


def node_debug(node_name, command):
response = runcommand.invoke("oc debug node/" + node_name + ' -- chroot /host ' + command)
response = runcommand.invoke("oc debug node/" + node_name + " -- chroot /host " + command)
return response


def skew_time(scenario):
skew_command = "date --set "
if scenario['action'] == "skew_date":
if scenario["action"] == "skew_date":
skewed_date = "00-01-01"
skew_command += skewed_date
elif scenario['action'] == "skew_time":
elif scenario["action"] == "skew_time":
skewed_time = "01:01:01"
skew_command += skewed_time
if "node" in scenario["object_type"]:
node_names = []
if "object_name" in scenario.keys() and scenario['object_name']:
node_names = scenario['object_name']
elif "label_selector" in scenario.keys() and scenario['label_selector']:
node_names = kubecli.list_nodes(scenario['label_selector'])
if "object_name" in scenario.keys() and scenario["object_name"]:
node_names = scenario["object_name"]
elif "label_selector" in scenario.keys() and scenario["label_selector"]:
node_names = kubecli.list_nodes(scenario["label_selector"])

for node in node_names:
node_debug(node, skew_command)
logging.info("Reset date/time on node " + str(node))
return "node", node_names

elif "pod" in scenario['object_type']:
elif "pod" in scenario["object_type"]:
pod_names = []
if "object_name" in scenario.keys() and scenario['object_name']:
for name in scenario['object_name']:
if "object_name" in scenario.keys() and scenario["object_name"]:
for name in scenario["object_name"]:
if "namespace" not in scenario.keys():
logging.error("Need to set namespace when using pod name")
sys.exit(1)
pod_names.append([name, scenario['namespace']])
elif "label_selector" in scenario.keys() and scenario['label_selector']:
pod_names = kubecli.get_all_pods(scenario['label_selector'])
elif "namespace" in scenario.keys() and scenario['namespace']:
pod_names = kubecli.list_pods(scenario['namespace'])
pod_names.append([name, scenario["namespace"]])
elif "label_selector" in scenario.keys() and scenario["label_selector"]:
pod_names = kubecli.get_all_pods(scenario["label_selector"])
elif "namespace" in scenario.keys() and scenario["namespace"]:
pod_names = kubecli.list_pods(scenario["namespace"])
counter = 0
for pod_name in pod_names:
pod_names[counter] = [pod_name, scenario['namespace']]
pod_names[counter] = [pod_name, scenario["namespace"]]
counter += 1

for pod in pod_names:
if len(pod) > 1:
pod_exec(pod[0], skew_command, pod[1])
else:
pod_exec(pod, skew_command, scenario['namespace'])
pod_exec(pod, skew_command, scenario["namespace"])
logging.info("Reset date/time on pod " + str(pod[0]))
return "pod", pod_names
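
skew_time above only assembles a `date --set` string and hands it to node_debug (oc debug plus chroot) or pod_exec. A rough sketch of what gets built for a hypothetical scenario entry (the node name and keys shown are illustrative, not taken from a real scenario file):

# Hypothetical scenario entry mirroring the keys read by skew_time.
scenario = {"action": "skew_date", "object_type": "node", "object_name": ["worker-0"]}

skew_command = "date --set "
if scenario["action"] == "skew_date":
    skew_command += "00-01-01"
elif scenario["action"] == "skew_time":
    skew_command += "01:01:01"

# For the node target above this would roughly run:
#   oc debug node/worker-0 -- chroot /host date --set 00-01-01
print(skew_command)
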
@@ -76,8 +76,7 @@ def skew_time(scenario):
# From kubectl/oc command get time output
def parse_string_date(obj_datetime):
try:
date_line = re.search(r'[a-zA-Z0-9_() .]*\w{3} \w{3} \d{2} \d{2}:\d{2}:\d{2} \w{3} '
r'\d{4}\W*', obj_datetime)
date_line = re.search(r"[a-zA-Z0-9_() .]*\w{3} \w{3} \d{2} \d{2}:\d{2}:\d{2} \w{3} " r"\d{4}\W*", obj_datetime)
return date_line.group().strip()
except Exception:
return ""
@@ -87,7 +86,7 @@ def parse_string_date(obj_datetime):
def string_to_date(obj_datetime):
obj_datetime = parse_string_date(obj_datetime)
try:
date_time_obj = datetime.datetime.strptime(obj_datetime, '%a %b %d %H:%M:%S %Z %Y')
date_time_obj = datetime.datetime.strptime(obj_datetime, "%a %b %d %H:%M:%S %Z %Y")
return date_time_obj
except Exception:
return datetime.datetime(datetime.MINYEAR, 1, 1)
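
parse_string_date and string_to_date together turn the raw `date` output captured from a node or pod into a datetime object, falling back to MINYEAR when nothing parses so the comparison in check_date_time simply keeps retrying. A self-contained sketch with a made-up timestamp:

import datetime
import re

sample = "Mon Feb 01 01:01:01 UTC 2021\n"  # invented date output
match = re.search(r"[a-zA-Z0-9_() .]*\w{3} \w{3} \d{2} \d{2}:\d{2}:\d{2} \w{3} \d{4}\W*", sample)
parsed = match.group().strip() if match else ""
try:
    date_time_obj = datetime.datetime.strptime(parsed, "%a %b %d %H:%M:%S %Z %Y")
except Exception:
    date_time_obj = datetime.datetime(datetime.MINYEAR, 1, 1)
print(date_time_obj)  # 2021-02-01 01:01:01
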
@@ -105,14 +104,12 @@ def check_date_time(object_type, names):
counter = 0
while not first_date_time < node_datetime < datetime.datetime.utcnow():
time.sleep(10)
logging.info("Date/time on node %s still not reset, waiting 10 seconds and retrying"
% node_name)
logging.info("Date/time on node %s still not reset, waiting 10 seconds and retrying" % node_name)
node_datetime_string = node_debug(node_name, skew_command)
node_datetime = string_to_date(node_datetime_string)
counter += 1
if counter > max_retries:
logging.error("Date and time in node %s didn't reset properly"
% node_name)
logging.error("Date and time in node %s didn't reset properly" % node_name)
not_reset.append(node_name)
break
if counter < max_retries:
@@ -125,15 +122,13 @@ def check_date_time(object_type, names):
pod_datetime = string_to_date(pod_datetime_string)
while not first_date_time < pod_datetime < datetime.datetime.utcnow():
time.sleep(10)
logging.info("Date/time on pod %s still not reset, waiting 10 seconds and retrying"
% pod_name[0])
logging.info("Date/time on pod %s still not reset, waiting 10 seconds and retrying" % pod_name[0])
first_date_time = datetime.datetime.utcnow()
pod_datetime = pod_exec(pod_name[0], skew_command, pod_name[1])
pod_datetime = string_to_date(pod_datetime)
counter += 1
if counter > max_retries:
logging.error("Date and time in pod %s didn't reset properly"
% pod_name[0])
logging.error("Date and time in pod %s didn't reset properly" % pod_name[0])
not_reset.append(pod_name[0])
break
if counter < max_retries:
@@ -14,4 +14,3 @@ python-openstackclient
gitpython
paramiko
setuptools
tox
194
run_kraken.py
@@ -25,22 +25,24 @@ node_general = False

# Get the node scenarios object of specified cloud type
def get_node_scenario_object(node_scenario):
if "cloud_type" not in node_scenario.keys() or node_scenario['cloud_type'] == "generic":
if "cloud_type" not in node_scenario.keys() or node_scenario["cloud_type"] == "generic":
global node_general
node_general = True
return general_node_scenarios()
if node_scenario['cloud_type'] == 'aws':
if node_scenario["cloud_type"] == "aws":
return aws_node_scenarios()
elif node_scenario['cloud_type'] == 'gcp':
elif node_scenario["cloud_type"] == "gcp":
return gcp_node_scenarios()
elif node_scenario['cloud_type'] == 'openstack':
elif node_scenario["cloud_type"] == "openstack":
return openstack_node_scenarios()
elif node_scenario['cloud_type'] == 'azure' or node_scenario['cloud_type'] == 'az':
elif node_scenario["cloud_type"] == "azure" or node_scenario["cloud_type"] == "az":
return azure_node_scenarios()
else:
logging.error("Cloud type " + node_scenario['cloud_type'] + " is not currently supported; "
"try using 'generic' if wanting to stop/start kubelet or fork bomb on any "
"cluster")
logging.error(
"Cloud type " + node_scenario["cloud_type"] + " is not currently supported; "
"try using 'generic' if wanting to stop/start kubelet or fork bomb on any "
"cluster"
)
sys.exit(1)
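
get_node_scenario_object above is a plain dispatch on the scenario's cloud_type key, with everything unrecognized exiting through the error branch. A hypothetical scenario entry that would take the generic path (keys and values are illustrative only, not from a shipped scenario file):

# Hypothetical node scenario entry; only cloud_type matters for the dispatch above.
node_scenario = {"cloud_type": "generic", "actions": ["stop_kubelet_scenario"]}

cloud_type = node_scenario.get("cloud_type", "generic")
if cloud_type in ("generic", "aws", "gcp", "openstack", "azure", "az"):
    print("supported cloud_type:", cloud_type)
else:
    print("unsupported cloud_type:", cloud_type)
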
@@ -77,19 +79,23 @@ def inject_node_scenario(action, node_scenario, node_scenario_object):
elif action == "node_crash_scenario":
node_scenario_object.node_crash_scenario(instance_kill_count, node, timeout)
elif action == "stop_start_helper_node_scenario":
if node_scenario['cloud_type'] != "openstack":
logging.error("Scenario: " + action + " is not supported for "
"cloud type " + node_scenario['cloud_type'] + ", skipping action")
if node_scenario["cloud_type"] != "openstack":
logging.error(
"Scenario: " + action + " is not supported for "
"cloud type " + node_scenario["cloud_type"] + ", skipping action"
)
else:
if not node_scenario['helper_node_ip']:
if not node_scenario["helper_node_ip"]:
logging.error("Helper node IP address is not provided")
sys.exit(1)
node_scenario_object.helper_node_stop_start_scenario(
instance_kill_count, node_scenario['helper_node_ip'], timeout)
instance_kill_count, node_scenario["helper_node_ip"], timeout
)
node_scenario_object.helper_node_service_status(
node_scenario['helper_node_ip'], service, ssh_private_key, timeout)
node_scenario["helper_node_ip"], service, ssh_private_key, timeout
)
else:
logging.info('There is no node action that matches %s, skipping scenario' % action)
logging.info("There is no node action that matches %s, skipping scenario" % action)


# Get cerberus status
@@ -101,15 +107,16 @@ def cerberus_integration(config):
logging.error("url where Cerberus publishes True/False signal is not provided.")
sys.exit(1)
cerberus_status = requests.get(cerberus_url).content
cerberus_status = True if cerberus_status == b'True' else False
cerberus_status = True if cerberus_status == b"True" else False
if not cerberus_status:
logging.error("Received a no-go signal from Cerberus, looks like "
"the cluster is unhealthy. Please check the Cerberus "
"report for more details. Test failed.")
logging.error(
"Received a no-go signal from Cerberus, looks like "
"the cluster is unhealthy. Please check the Cerberus "
"report for more details. Test failed."
)
sys.exit(1)
else:
logging.info("Received a go signal from Cerberus, the cluster is healthy. "
"Test passed.")
logging.info("Received a go signal from Cerberus, the cluster is healthy. " "Test passed.")
return cerberus_status
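
cerberus_integration above treats the body of a plain HTTP GET as a literal b"True"/b"False" go/no-go flag. A minimal sketch of that check; the URL is an assumption and a reachable Cerberus instance is presumed:

import requests

cerberus_url = "http://0.0.0.0:8080"  # hypothetical Cerberus endpoint, normally taken from the config
cerberus_status = requests.get(cerberus_url).content
cerberus_status = True if cerberus_status == b"True" else False
print("go" if cerberus_status else "no-go")
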
@@ -118,35 +125,36 @@ def publish_kraken_status(config, failed_post_scenarios):
cerberus_status = cerberus_integration(config)
if not cerberus_status:
if failed_post_scenarios:
if config['kraken']['exit_on_failure']:
logging.info("Cerberus status is not healthy and post action scenarios "
"are still failing, exiting kraken run")
if config["kraken"]["exit_on_failure"]:
logging.info(
"Cerberus status is not healthy and post action scenarios " "are still failing, exiting kraken run"
)
sys.exit(1)
else:
logging.info("Cerberus status is not healthy and post action scenarios "
"are still failing")
logging.info("Cerberus status is not healthy and post action scenarios " "are still failing")
else:
if failed_post_scenarios:
if config['kraken']['exit_on_failure']:
logging.info("Cerberus status is healthy but post action scenarios "
"are still failing, exiting kraken run")
if config["kraken"]["exit_on_failure"]:
logging.info(
"Cerberus status is healthy but post action scenarios " "are still failing, exiting kraken run"
)
sys.exit(1)
else:
logging.info("Cerberus status is healthy but post action scenarios "
"are still failing")
logging.info("Cerberus status is healthy but post action scenarios " "are still failing")


def run_post_action(kubeconfig_path, scenario, pre_action_output=""):

if scenario.endswith(".yaml") or scenario.endswith(".yml"):
action_output = runcommand.invoke("powerfulseal autonomous "
"--use-pod-delete-instead-of-ssh-kill"
" --policy-file %s --kubeconfig %s --no-cloud"
" --inventory-kubernetes --headless"
% (scenario, kubeconfig_path))
action_output = runcommand.invoke(
"powerfulseal autonomous "
"--use-pod-delete-instead-of-ssh-kill"
" --policy-file %s --kubeconfig %s --no-cloud"
" --inventory-kubernetes --headless" % (scenario, kubeconfig_path)
)
# read output to make sure no error
if "ERROR" in action_output:
action_output.split("ERROR")[1].split('\n')[0]
action_output.split("ERROR")[1].split("\n")[0]
if not pre_action_output:
logging.info("Powerful seal pre action check failed for " + str(scenario))
return False
@@ -159,7 +167,7 @@ def run_post_action(kubeconfig_path, scenario, pre_action_output=""):
if pre_action_output == action_output:
logging.info(scenario + " post action checks passed")
else:
logging.info(scenario + ' post action response did not match pre check output')
logging.info(scenario + " post action response did not match pre check output")
return False
elif scenario != "":
# invoke custom bash script
@@ -168,7 +176,7 @@ def run_post_action(kubeconfig_path, scenario, pre_action_output=""):
if pre_action_output == action_output:
logging.info(scenario + " post action checks passed")
else:
logging.info(scenario + ' post action response did not match pre check output')
logging.info(scenario + " post action response did not match pre check output")
return False

return action_output
@@ -178,12 +186,11 @@ def run_post_action(kubeconfig_path, scenario, pre_action_output=""):
def post_actions(kubeconfig_path, scenario, failed_post_scenarios, pre_action_output):

for failed_scenario in failed_post_scenarios:
post_action_output = run_post_action(kubeconfig_path,
failed_scenario[0], failed_scenario[1])
post_action_output = run_post_action(kubeconfig_path, failed_scenario[0], failed_scenario[1])
if post_action_output is not False:
failed_post_scenarios.remove(failed_scenario)
else:
logging.info('Post action scenario ' + str(failed_scenario) + "is still failing")
logging.info("Post action scenario " + str(failed_scenario) + "is still failing")

# check post actions
if len(scenario) > 1:
@@ -201,11 +208,12 @@ def pod_scenarios(scenarios_list, config, failed_post_scenarios):
if len(pod_scenario) > 1:
pre_action_output = run_post_action(kubeconfig_path, pod_scenario[1])
else:
pre_action_output = ''
scenario_logs = runcommand.invoke("powerfulseal autonomous --use-pod-delete-instead-"
"of-ssh-kill --policy-file %s --kubeconfig %s "
"--no-cloud --inventory-kubernetes --headless"
% (pod_scenario[0], kubeconfig_path))
pre_action_output = ""
scenario_logs = runcommand.invoke(
"powerfulseal autonomous --use-pod-delete-instead-"
"of-ssh-kill --policy-file %s --kubeconfig %s "
"--no-cloud --inventory-kubernetes --headless" % (pod_scenario[0], kubeconfig_path)
)

# Display pod scenario logs/actions
print(scenario_logs)
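
The runcommand.invoke call above only builds a shell command from concatenated literals plus % substitution, so the wrapped and folded forms produce the same string. Roughly what it expands to for a hypothetical policy file and kubeconfig path:

pod_scenario = ["scenarios/etcd.yml"]   # hypothetical policy file
kubeconfig_path = "/root/.kube/config"  # hypothetical kubeconfig location

cmd = (
    "powerfulseal autonomous --use-pod-delete-instead-"
    "of-ssh-kill --policy-file %s --kubeconfig %s "
    "--no-cloud --inventory-kubernetes --headless" % (pod_scenario[0], kubeconfig_path)
)
print(cmd)
# powerfulseal autonomous --use-pod-delete-instead-of-ssh-kill --policy-file scenarios/etcd.yml --kubeconfig /root/.kube/config --no-cloud --inventory-kubernetes --headless
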
@@ -214,23 +222,23 @@
logging.info("Waiting for the specified duration: %s" % (wait_duration))
time.sleep(wait_duration)

failed_post_scenarios = post_actions(kubeconfig_path, pod_scenario,
failed_post_scenarios, pre_action_output)
failed_post_scenarios = post_actions(
kubeconfig_path, pod_scenario, failed_post_scenarios, pre_action_output
)
publish_kraken_status(config, failed_post_scenarios)
except Exception as e:
logging.error("Failed to run scenario: %s. Encountered the following "
"exception: %s" % (pod_scenario[0], e))
logging.error("Failed to run scenario: %s. Encountered the following " "exception: %s" % (pod_scenario[0], e))
return failed_post_scenarios


def node_scenarios(scenarios_list, config):
for node_scenario_config in scenarios_list:
with open(node_scenario_config, 'r') as f:
with open(node_scenario_config, "r") as f:
node_scenario_config = yaml.full_load(f)
for node_scenario in node_scenario_config['node_scenarios']:
for node_scenario in node_scenario_config["node_scenarios"]:
node_scenario_object = get_node_scenario_object(node_scenario)
if node_scenario['actions']:
for action in node_scenario['actions']:
if node_scenario["actions"]:
for action in node_scenario["actions"]:
inject_node_scenario(action, node_scenario, node_scenario_object)
logging.info("Waiting for the specified duration: %s" % (wait_duration))
time.sleep(wait_duration)
@@ -240,13 +248,13 @@ def node_scenarios(scenarios_list, config):

def time_scenarios(scenarios_list, config):
for time_scenario_config in scenarios_list:
with open(time_scenario_config, 'r') as f:
with open(time_scenario_config, "r") as f:
scenario_config = yaml.full_load(f)
for time_scenario in scenario_config['time_scenarios']:
for time_scenario in scenario_config["time_scenarios"]:
object_type, object_names = time_actions.skew_time(time_scenario)
not_reset = time_actions.check_date_time(object_type, object_names)
if len(not_reset) > 0:
logging.info('Object times were not reset')
logging.info("Object times were not reset")
logging.info("Waiting for the specified duration: %s" % (wait_duration))
time.sleep(wait_duration)
publish_kraken_status(config, not_reset)
@@ -266,36 +274,31 @@ def litmus_scenarios(scenarios_list, config, litmus_namespaces, litmus_uninstall
logging.info("opened yaml" + str(item))
yaml_item = list(yaml.safe_load_all(f))[0]

if yaml_item['kind'] == "ChaosEngine":
engine_name = yaml_item['metadata']['name']
namespace = yaml_item['metadata']['namespace']
if yaml_item["kind"] == "ChaosEngine":
engine_name = yaml_item["metadata"]["name"]
namespace = yaml_item["metadata"]["namespace"]
litmus_namespaces.append(namespace)
experiment_names = yaml_item['spec']['experiments']
experiment_names = yaml_item["spec"]["experiments"]
for expr in experiment_names:
expr_name = expr['name']
experiment_result = common_litmus.check_experiment(engine_name,
expr_name,
namespace)
expr_name = expr["name"]
experiment_result = common_litmus.check_experiment(engine_name, expr_name, namespace)
if experiment_result:
logging.info("Scenario: %s has been successfully injected!"
% item)
logging.info("Scenario: %s has been successfully injected!" % item)
else:
logging.info("Scenario: %s was not successfully injected!"
% item)
logging.info("Scenario: %s was not successfully injected!" % item)
if litmus_uninstall:
for l_item in l_scenario:
logging.info('item ' + str(l_item))
logging.info("item " + str(l_item))
runcommand.invoke("kubectl delete -f %s" % l_item)
if litmus_uninstall:
for item in l_scenario:
logging.info('item ' + str(item))
logging.info("item " + str(item))
runcommand.invoke("kubectl delete -f %s" % item)
logging.info("Waiting for the specified duration: %s" % wait_duration)
time.sleep(wait_duration)
cerberus_integration(config)
except Exception as e:
logging.error("Failed to run litmus scenario: %s. Encountered "
"the following exception: %s" % (item, e))
logging.error("Failed to run litmus scenario: %s. Encountered " "the following exception: %s" % (item, e))
return litmus_namespaces
@@ -307,18 +310,20 @@ def main(cfg):

# Parse and read the config
if os.path.isfile(cfg):
with open(cfg, 'r') as f:
with open(cfg, "r") as f:
config = yaml.full_load(f)
global kubeconfig_path, wait_duration
kubeconfig_path = config["kraken"].get("kubeconfig_path", "")
chaos_scenarios = config["kraken"].get("chaos_scenarios", [])
litmus_version = config['kraken'].get("litmus_version", 'v1.9.1')
litmus_uninstall = config['kraken'].get("litmus_uninstall", False)
litmus_version = config["kraken"].get("litmus_version", "v1.9.1")
litmus_uninstall = config["kraken"].get("litmus_uninstall", False)
wait_duration = config["tunings"].get("wait_duration", 60)
iterations = config["tunings"].get("iterations", 1)
daemon_mode = config["tunings"].get("daemon_mode", False)
deploy_performance_dashboards = config["performance_monitoring"].get("deploy_dashboards", False)
dashboard_repo = config["performance_monitoring"].get("repo", "https://github.com/cloud-bulldozer/performance-dashboards.git")  # noqa
dashboard_repo = config["performance_monitoring"].get(
"repo", "https://github.com/cloud-bulldozer/performance-dashboards.git"
)  # noqa

# Initialize clients
if not os.path.isfile(kubeconfig_path):
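
Every value in main() above is read with .get and a default, so a config only needs the keys it overrides. A hypothetical minimal config, written here as the Python dict yaml.full_load would return (the file paths and the chaos_scenarios shape are assumptions for illustration):

config = {
    "kraken": {
        "kubeconfig_path": "/root/.kube/config",                         # invented path
        "chaos_scenarios": [{"pod_scenarios": ["scenarios/etcd.yml"]}],  # invented scenario file
        "exit_on_failure": False,
    },
    "tunings": {"wait_duration": 60, "iterations": 1, "daemon_mode": False},
    "performance_monitoring": {"deploy_dashboards": False},
}

wait_duration = config["tunings"].get("wait_duration", 60)
litmus_version = config["kraken"].get("litmus_version", "v1.9.1")  # falls back to the default
print(wait_duration, litmus_version)
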
@@ -332,8 +337,9 @@ def main(cfg):
# Cluster info
logging.info("Fetching cluster info")
cluster_version = runcommand.invoke("kubectl get clusterversion")
cluster_info = runcommand.invoke("kubectl cluster-info | awk 'NR==1' | sed -r "
"'s/\x1B\[([0-9]{1,3}(;[0-9]{1,2})?)?[mGK]//g'")  # noqa
cluster_info = runcommand.invoke(
"kubectl cluster-info | awk 'NR==1' | sed -r " "'s/\x1B\[([0-9]{1,3}(;[0-9]{1,2})?)?[mGK]//g'"
)  # noqa
logging.info("\n%s%s" % (cluster_version, cluster_info))

# Deploy performance dashboards
@@ -348,17 +354,16 @@ def main(cfg):
if daemon_mode:
logging.info("Daemon mode enabled, kraken will cause chaos forever\n")
logging.info("Ignoring the iterations set")
iterations = float('inf')
iterations = float("inf")
else:
logging.info("Daemon mode not enabled, will run through %s iterations\n"
% str(iterations))
logging.info("Daemon mode not enabled, will run through %s iterations\n" % str(iterations))
iterations = int(iterations)

failed_post_scenarios = []
litmus_namespaces = []
litmus_installed = False
# Loop to run the chaos starts here
while (int(iteration) < iterations):
while int(iteration) < iterations:
# Inject chaos scenarios specified in the config
logging.info("Executing scenarios for iteration " + str(iteration))
if chaos_scenarios:
@@ -368,8 +373,7 @@ def main(cfg):
if scenarios_list:
# Inject pod chaos scenarios specified in the config
if scenario_type == "pod_scenarios":
failed_post_scenarios = pod_scenarios(scenarios_list, config,
failed_post_scenarios)
failed_post_scenarios = pod_scenarios(scenarios_list, config, failed_post_scenarios)

# Inject node chaos scenarios specified in the config
elif scenario_type == "node_scenarios":
@@ -383,9 +387,9 @@ def main(cfg):
common_litmus.install_litmus(litmus_version)
common_litmus.deploy_all_experiments(litmus_version)
litmus_installed = True
litmus_namespaces = litmus_scenarios(scenarios_list, config,
litmus_namespaces,
litmus_uninstall)
litmus_namespaces = litmus_scenarios(
scenarios_list, config, litmus_namespaces, litmus_uninstall
)

iteration += 1
logging.info("")
@@ -407,21 +411,15 @@ if __name__ == "__main__":
# Initialize the parser to read the config
parser = optparse.OptionParser()
parser.add_option(
"-c", "--config",
dest="cfg",
help="config location",
default="config/config.yaml",
"-c", "--config", dest="cfg", help="config location", default="config/config.yaml",
)
(options, args) = parser.parse_args()
logging.basicConfig(
level=logging.INFO,
format="%(asctime)s [%(levelname)s] %(message)s",
handlers=[
logging.FileHandler("kraken.report", mode='w'),
logging.StreamHandler()
]
handlers=[logging.FileHandler("kraken.report", mode="w"), logging.StreamHandler()],
)
if (options.cfg is None):
if options.cfg is None:
logging.error("Please check if you have passed the config")
sys.exit(1)
else:
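
The entry point above parses a single --config option (default config/config.yaml, so a run is typically just `python run_kraken.py --config config/config.yaml`) and sends every log record to both kraken.report and the console. A small standalone sketch of that logging setup (the message is invented):

import logging

logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s",
    handlers=[logging.FileHandler("kraken.report", mode="w"), logging.StreamHandler()],
)
logging.info("kraken run starting")  # written to kraken.report and echoed to the terminal
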
@@ -29,4 +29,4 @@ scenarios:

actions:
- checkPodCount:
count: 3
count: 3
@@ -22,4 +22,4 @@ spec:
value: '60'
# ENTER THE COMMA SEPARATED TARGET NODES NAME
- name: TARGET_NODES
value: ''
value: ''
@@ -32,4 +32,4 @@ scenarios:

actions:
- checkPodCount:
count: 3
count: 3
@@ -30,4 +30,4 @@ scenarios:
timeout: 180
actions:
- checkPodCount:
count: 3
count: 3
@@ -5,9 +5,9 @@ import logging

def run(cmd):
try:
output = subprocess.Popen(cmd, shell=True,
universal_newlines=True, stdout=subprocess.PIPE,
stderr=subprocess.STDOUT)
output = subprocess.Popen(
cmd, shell=True, universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT
)
(out, err) = output.communicate()
logging.info("out " + str(out))
except Exception as e:
@@ -19,4 +19,3 @@ scenarios:
actions:
- checkPodCount:
count: 2
@@ -15,8 +15,11 @@ def list_namespaces():
cli = client.CoreV1Api()
ret = cli.list_namespace(pretty=True)
except ApiException as e:
logging.error("Exception when calling \
CoreV1Api->list_namespaced_pod: %s\n" % e)
logging.error(
"Exception when calling \
CoreV1Api->list_namespaced_pod: %s\n"
% e
)
for namespace in ret.items:
namespaces.append(namespace.metadata.name)
return namespaces
@@ -47,9 +50,9 @@ def check_namespaces(namespaces):

def run(cmd):
try:
output = subprocess.Popen(cmd, shell=True,
universal_newlines=True, stdout=subprocess.PIPE,
stderr=subprocess.STDOUT)
output = subprocess.Popen(
cmd, shell=True, universal_newlines=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT
)
(out, err) = output.communicate()
except Exception as e:
logging.error("Failed to run %s, error: %s" % (cmd, e))
@@ -32,4 +32,4 @@ scenarios:

actions:
- checkPodCount:
count: 2
count: 2
@@ -36,11 +36,3 @@ dists = bdist_wheel
[bdist_wheel]
# Use this option if your package is pure-python
universal = 1

[flake8]
# Some sane defaults for the code style checker flake8
exclude =
.tox
build
dist
.eggs
2
setup.py
@@ -5,7 +5,7 @@ from pkg_resources import VersionConflict, require
from setuptools import setup

try:
require('setuptools>=38.3')
require("setuptools>=38.3")
except VersionConflict:
print("Error: version of setuptools is too old (<38.3)!")
sys.exit(1)
@@ -1,2 +0,0 @@
setuptools
flake8
24
tox.ini
@@ -1,24 +0,0 @@
[tox]
envlist = pep8

[testenv]
usedevelop = True
setenv =
VIRTUAL_ENV={envdir}
deps = -r{toxinidir}/test-requirements.txt
commands = python setup.py develop

[testenv:pep8]
basepython = python
commands = flake8 {posargs}

[testenv:venv]
commands = {posargs}

[flake8]
# E123, E125 skipped as they are invalid PEP-8.
show-source = True
max-line-length = 120
ignore = E123,E125
builtins = _
exclude=.venv,.git,.tox,dist,doc,*lib/python*,*egg,build
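
With tox.ini removed, the pep8 env above is gone; the same style check can still be run directly with the options the deleted [flake8] section carried. A sketch of an equivalent invocation, wrapped in subprocess the way this repo shells out elsewhere (not a script that ships with the project):

import subprocess

# Mirrors max-line-length and the E123/E125 ignores from the removed [flake8] section.
subprocess.run("flake8 --max-line-length=120 --ignore=E123,E125 .", shell=True, check=False)
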