Fixes #265: Replace powerfulseal

This commit is contained in:
Janos Bonic
2022-06-02 21:35:11 +02:00
parent 9810649c62
commit d8c4c27a7d
21 changed files with 555 additions and 5 deletions

View File

@@ -11,16 +11,18 @@ jobs:
steps:
- name: Check out code
uses: actions/checkout@v3
- name: Build the Docker images
run: docker build --no-cache -t quay.io/chaos-kubox/krkn containers/
- name: Create multi-node KinD cluster
uses: chaos-kubox/actions/kind@main
- name: Install environment
run: |
sudo apt-get install build-essential python3-dev
pip install -r requirements.txt
- name: Run CI
- name: Run unit tests
run: python -m unittest discover
- name: Run e2e tests
run: ./CI/run.sh
- name: Build the Docker images
run: docker build --no-cache -t quay.io/chaos-kubox/krkn containers/
- name: Login in quay
if: github.ref == 'refs/heads/main' && github.event_name == 'push'
run: docker login quay.io -u ${QUAY_USER} -p ${QUAY_TOKEN}

View File

View File

View File

@@ -0,0 +1,6 @@
from dataclasses import dataclass
@dataclass
class CerberusConfig:
cerberus_url: str

View File

@@ -0,0 +1,13 @@
import requests as requests
from kraken.health.cerberus.config import CerberusConfig
from kraken.health.health import HealthChecker, HealthCheckDecision
class CerberusHealthChecker(HealthChecker):
def __init__(self, config: CerberusConfig):
self._config = config
def check(self) -> HealthCheckDecision:
cerberus_status = requests.get(self._config.cerberus_url, timeout=60).content
return HealthCheckDecision.GO if cerberus_status == b"True" else HealthCheckDecision.STOP

14
kraken/health/health.py Normal file
View File

@@ -0,0 +1,14 @@
from abc import ABC, abstractmethod
from enum import Enum
class HealthCheckDecision(Enum):
GO = "GO"
PAUSE = "PAUSE"
STOP = "STOP"
class HealthChecker(ABC):
@abstractmethod
def check(self) -> HealthCheckDecision:
pass

125
kraken/kubernetes/kube.py Normal file
View File

@@ -0,0 +1,125 @@
import unittest
from dataclasses import dataclass
from typing import Dict, List
from kubernetes import config, client
from kubernetes.client.models import V1Pod, V1PodSpec, V1ObjectMeta, V1Container
from kubernetes.client.exceptions import ApiException
@dataclass
class Pod:
"""
A pod is a simplified representation of a Kubernetes pod. We only extract the data we need in krkn.
"""
name: str
namespace: str
labels: Dict[str, str]
class Client:
"""
This is the implementation of all Kubernetes API calls used in Krkn.
"""
def __init__(self, kubeconfig_path: str = None):
# Note: this function replicates much of the functionality already represented in the Kubernetes Python client,
# but in an object-oriented manner. This allows for creating multiple clients and accessing multiple clusters
# with minimal effort if needed, which the procedural implementation doesn't allow.
if kubeconfig_path is None:
kubeconfig_path = config.KUBE_CONFIG_DEFAULT_LOCATION
kubeconfig = config.kube_config.KubeConfigMerger(kubeconfig_path)
if kubeconfig.config is None:
raise config.ConfigException(
'Invalid kube-config file: %s. '
'No configuration found.' % kubeconfig_path)
loader = config.kube_config.KubeConfigLoader(
config_dict=kubeconfig.config,
)
client_config = client.Configuration()
loader.load_and_set(client_config)
self.client = client.ApiClient(configuration=client_config)
self.core_v1 = client.CoreV1Api(self.client)
@staticmethod
def _convert_pod(pod: V1Pod) -> Pod:
return Pod(
name=pod.metadata.name,
namespace=pod.metadata.namespace,
labels=pod.metadata.labels
)
def create_test_pod(self) -> Pod:
"""
create_test_pod creates a test pod in the default namespace that can be safely killed.
"""
return self._convert_pod(self.core_v1.create_namespaced_pod(
"default",
V1Pod(
metadata=V1ObjectMeta(
generate_name="test-",
),
spec=V1PodSpec(
containers=[
V1Container(
name="test",
image="alpine",
tty=True,
)
]
),
)
))
def list_all_pods(self, label_selector: str = None) -> List[Pod]:
"""
list_all_pods lists all pods in all namespaces, possibly with a label selector applied.
"""
try:
pod_response = self.core_v1.list_pod_for_all_namespaces(watch=False, label_selector=label_selector)
pod_list: List[client.models.V1Pod] = pod_response.items
result: List[Pod] = []
for pod in pod_list:
result.append(self._convert_pod(pod))
return result
except ApiException as e:
if e.status == 404:
raise NotFoundException(e)
raise
def get_pod(self, name: str, namespace: str = "default") -> Pod:
"""
get_pod returns a pod based on the name and a namespace.
"""
try:
return self._convert_pod(self.core_v1.read_namespaced_pod(name, namespace))
except ApiException as e:
if e.status == 404:
raise NotFoundException(e)
raise
def remove_pod(self, name: str, namespace: str = "default"):
"""
remove_pod removes a pod based on the name and namespace. A NotFoundException is raised if the pod doesn't
exist.
"""
try:
self.core_v1.delete_namespaced_pod(name, namespace)
except ApiException as e:
if e.status == 404:
raise NotFoundException(e)
raise
class NotFoundException(Exception):
"""
NotFoundException is an exception specific to the scenario Kubernetes abstraction and is thrown when a specific
resource (e.g. a pod) cannot be found.
"""
def __init__(self, cause: Exception):
self.__cause__ = cause
if __name__ == '__main__':
unittest.main()

View File

@@ -0,0 +1,42 @@
import unittest
from kraken.scenarios import kube
class TestClient(unittest.TestCase):
def test_list_all_pods(self):
c = kube.Client()
pod = c.create_test_pod()
self.addCleanup(lambda: self._remove_pod(c, pod.name, pod.namespace))
pods = c.list_all_pods()
for pod in pods:
if pod.name == pod.name and pod.namespace == pod.namespace:
return
self.fail("The created pod %s was not in the pod list." % pod.name)
def test_get_pod(self):
c = kube.Client()
pod = c.create_test_pod()
self.addCleanup(lambda: c.remove_pod(pod.name, pod.namespace))
pod2 = c.get_pod(pod.name, pod.namespace)
assert pod2.name == pod.name
assert pod2.namespace == pod.namespace
def test_get_pod_notfound(self):
c = kube.Client()
try:
c.get_pod("non-existent-pod")
self.fail("Fetching a non-existent pod did not result in a NotFoundException.")
except kube.NotFoundException:
pass
@staticmethod
def _remove_pod(c: kube.Client, pod_name: str, pod_namespace: str):
try:
c.remove_pod(pod_name, pod_namespace)
except kube.NotFoundException:
pass
if __name__ == '__main__':
unittest.main()

View File

16
kraken/runner/loader.py Normal file
View File

@@ -0,0 +1,16 @@
from typing import List, Dict
from kraken.scenarios.base import Scenario
from kraken.scenarios.runner import ScenarioRunnerConfig
class Loader:
def __init__(self, scenarios: List[Scenario]):
self.scenarios = scenarios
def load(self, data: Dict) -> ScenarioRunnerConfig:
"""
This function loads data from a dictionary and produces a scenario runner config. It uses the scenarios provided
when instantiating the loader.
"""

28
kraken/runner/runner.py Normal file
View File

@@ -0,0 +1,28 @@
from dataclasses import dataclass
from typing import List
from kraken.scenarios import base
from kraken.scenarios.health import HealthChecker
@dataclass
class ScenarioRunnerConfig:
iterations: int
steps: List[base.ScenarioConfig]
class ScenarioRunner:
"""
This class provides the services to load a scenario configuration and iterate over the scenarios, while
observing the health checks.
"""
def __init__(self, scenarios: List[base.Scenario], health_checker: HealthChecker):
self._scenarios = scenarios
self._health_checker = health_checker
def run(self, config: ScenarioRunnerConfig):
"""
This function runs a list of scenarios described in the configuration.
"""

View File

61
kraken/scenarios/base.py Normal file
View File

@@ -0,0 +1,61 @@
from typing import TypeVar, Generic, Dict
from kraken.scenarios.kube import Client
from abc import ABC, abstractmethod
from dataclasses import dataclass
@dataclass
class ScenarioConfig(ABC):
"""
ScenarioConfig is a generic base class for configurations for individual scenarios. Each scenario should define
its own configuration classes.
"""
@abstractmethod
def from_dict(self, data: Dict) -> None:
"""
from_dict loads the configuration from a dict. It is mainly used to load JSON data into the scenario
configuration.
"""
@abstractmethod
def validate(self) -> None:
"""
validate is a function that validates all data on the scenario configuration. If the scenario configuration
is invalid an Exception should be thrown.
"""
pass
T = TypeVar('T', bound=ScenarioConfig)
class Scenario(Generic[T]):
"""
Scenario is a generic base class that provides a uniform run function to call in a loop. Scenario implementations
should extend this class and accept their configuration via their initializer.
"""
@staticmethod
def create_config(self) -> T:
"""
create_config creates a new copy of the configuration structure that allows loading data from a dictionary
and validating it.
"""
pass
def run(self, kube: Client, config: T) -> None:
"""
run is a function that is called when the scenario should be run. A Kubernetes client implementation will be
passed. The scenario should execute and return immediately. If the scenario fails, an Exception should be
thrown.
"""
pass
class TimeoutException(Exception):
"""
TimeoutException is an exception thrown when a scenario has a timeout waiting for a condition to happen.
"""
pass

96
kraken/scenarios/pod.py Normal file
View File

@@ -0,0 +1,96 @@
import logging
import random
import re
import time
from dataclasses import dataclass
from typing import Dict, List
from kraken.scenarios import base
from kraken.scenarios.base import ScenarioConfig, Scenario
from kraken.scenarios.kube import Client, Pod, NotFoundException
@dataclass
class PodScenarioConfig(ScenarioConfig):
"""
PodScenarioConfig is a configuration structure specific to pod scenarios. It describes which pod from which
namespace(s) to select for killing and how many pods to kill.
"""
name_pattern: str
namespace_pattern: str
label_selector: str
kill: int
def from_dict(self, data: Dict) -> None:
self.name_pattern = data.get("name_pattern")
self.namespace_pattern = data.get("namespace_pattern")
self.label_selector = data.get("label_selector")
self.kill = data.get("kill")
def validate(self) -> None:
re.compile(self.name_pattern)
re.compile(self.namespace_pattern)
if self.kill < 1:
raise Exception("Invalid value for 'kill': %d" % self.kill)
def namespace_regexp(self) -> re.Pattern:
return re.compile(self.namespace_pattern)
def name_regexp(self) -> re.Pattern:
return re.compile(self.name_pattern)
class PodScenario(Scenario[PodScenarioConfig]):
"""
PodScenario is a scenario that tests the stability of a Kubernetes cluster by killing one or more pods based on the
PodScenarioConfig.
"""
def __init__(self, logger: logging.Logger):
self.logger = logger
def create_config(self) -> PodScenarioConfig:
return PodScenarioConfig(
name_pattern=".*",
namespace_pattern=".*",
label_selector="",
kill=1,
)
def run(self, kube: Client, config: PodScenarioConfig):
pod_candidates: List[Pod] = []
namespace_re = config.namespace_regexp()
name_re = config.name_regexp()
self.logger.info("Listing all pods to determine viable pods to kill...")
for pod in kube.list_all_pods(label_selector=config.label_selector):
if namespace_re.match(pod.namespace) and name_re.match(pod.name):
pod_candidates.append(pod)
random.shuffle(pod_candidates)
removed_pod: List[Pod] = []
pods_to_kill = min(config.kill, len(pod_candidates))
self.logger.info("Killing %d pods...", pods_to_kill)
for i in range(pods_to_kill):
pod = pod_candidates[i]
self.logger.info("Killing pod %s...", pod.name)
removed_pod.append(pod)
kube.remove_pod(pod.name, pod.namespace)
self.logger.info("Waiting for pods to be removed...")
for i in range(60):
time.sleep(1)
for pod in removed_pod:
try:
kube.get_pod(pod.name, pod.namespace)
self.logger.info("Pod %s still exists...", pod.name)
except NotFoundException:
self.logger.info("Pod %s is now removed.", pod.name)
removed_pod.remove(pod)
if len(removed_pod) == 0:
self.logger.info("All pods removed, pod scenario complete.")
return
self.logger.warning("Timeout waiting for pods to be removed.")
raise base.TimeoutException("Timeout while waiting for pods to be removed.")

View File

@@ -0,0 +1,43 @@
import logging
import sys
import unittest
from kraken.scenarios import kube
from kraken.scenarios.kube import Client, NotFoundException
from kraken.scenarios.pod import PodScenario
class TestPodScenario(unittest.TestCase):
def test_run(self):
"""
This test creates a test pod and then runs the pod scenario restricting the run to that specific pod.
"""
logging.basicConfig(stream=sys.stdout, level=logging.INFO)
c = Client()
test_pod = c.create_test_pod()
self.addCleanup(lambda: self._remove_test_pod(c, test_pod.name, test_pod.namespace))
scenario = PodScenario(logging.getLogger(__name__))
config = scenario.create_config()
config.kill = 1
config.name_pattern = test_pod.name
config.namespace_pattern = test_pod.namespace
scenario.run(c, config)
try:
c.get_pod(test_pod.name)
self.fail("Getting the pod after a pod scenario run should result in a NotFoundException.")
except NotFoundException:
return
@staticmethod
def _remove_test_pod(c: kube.Client, pod_name: str, pod_namespace: str):
try:
c.remove_pod(pod_name, pod_namespace)
except NotFoundException:
pass
if __name__ == '__main__':
unittest.main()

View File

@@ -1,14 +1,13 @@
datetime
pyfiglet
PyYAML>=5.1
git+https://github.com/powerfulseal/powerfulseal.git@3.3.0
requests
boto3
google-api-python-client
azure-mgmt-compute
azure-keyvault
azure-identity
kubernetes==18.20.0
kubernetes
oauth2client>=4.1.3
python-openstackclient
gitpython

View File

@@ -2,6 +2,8 @@
import os
import sys
from typing import List
import yaml
import logging
import optparse
@@ -22,6 +24,9 @@ import kraken.application_outage.actions as application_outage
import kraken.pvc.pvc_scenario as pvc_scenario
import kraken.network_chaos.actions as network_chaos
import server as server
from kraken.scenarios.base import Scenario
from kraken.scenarios.pod import PodScenario
from kraken.scenarios.runner import ScenarioRunner
def publish_kraken_status(status):
@@ -115,6 +120,12 @@ def main(cfg):
run_uuid = str(uuid.uuid4())
logging.info("Generated a uuid for the run: %s" % run_uuid)
logger = logging.getLogger(__name__)
scenarios: List[Scenario] = [
PodScenario(logger),
]
health_checker = CerberusHealthChecker(config)
runner = ScenarioRunner(scenarios, health_checker)
# Initialize the start iteration to 0
iteration = 0

View File

@@ -0,0 +1,89 @@
{
"$schema": "https://json-schema.org/draft/2019-09/schema",
"$id": "https://github.com/chaos-kubox/krkn/",
"type": "object",
"default": {},
"title": "Composite scenario for Krkn",
"required": [
"steps"
],
"properties": {
"iterations": {
"type": "integer",
"default": 1,
"title": "How many iterations to execute",
"examples": [
3
]
},
"steps": {
"type": "array",
"default": [],
"title": "The steps Schema",
"items": {
"type": "object",
"default": {},
"title": "A Schema",
"required": [
"pod"
],
"properties": {
"pod": {
"type": "object",
"default": {},
"title": "The pod Schema",
"required": [
"name_pattern",
"namespace_pattern"
],
"properties": {
"name_pattern": {
"type": "string",
"default": "",
"title": "The name_pattern Schema",
"examples": [
""
]
},
"namespace_pattern": {
"type": "string",
"default": "",
"title": "The namespace_pattern Schema",
"examples": [
""
]
}
},
"examples": [{
"name_pattern": "test-.*",
"namespace_pattern": "default"
}]
}
},
"examples": [{
"pod": {
"name_pattern": "test-.*",
"namespace_pattern": "default"
}
}]
},
"examples": [
[{
"pod": {
"name_pattern": "test-.*",
"namespace_pattern": "default"
}
}]
]
}
},
"examples": [{
"iterations": 1,
"steps": [{
"pod": {
"name_pattern": "test-.*",
"namespace_pattern": "default"
}
}]
}]
}

View File

@@ -0,0 +1,5 @@
iterations: 1
steps:
- pod:
name_pattern:
namespace_pattern: