Mirror of https://github.com/krkn-chaos/krkn.git (synced 2026-02-14)

Integration of the new pod recovery monitoring strategy implemented in krkn-lib (#609)

* pod monitoring integration in plugin scenario
* pod monitoring integration in container scenario
* removed wait-for-pods step from plugin scenario config files
* introduced global pod recovery time
* introduced krkn_pod_recovery_time in plugin scenario and removed all the references to wait-for-pods
* functional test fix
* main branch functional test fix
* increased recovery times

Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>
Committed by GitHub. Parent: 19ad2d1a3d · Commit: ab98e416a6
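At a high level, this commit replaces the synchronous `wait-for-pods` plugin step with an asynchronous `PodsMonitorPool` from krkn-lib: monitoring starts before the chaos is injected, and `join()` afterwards reports which pods failed to recover within `krkn_pod_recovery_time`. A minimal sketch of that flow, assuming the krkn-lib API shown in the diff below (the kubeconfig path, selectors, and `run_chaos_scenario` stand-in are illustrative, not part of the commit):

```python
from krkn_lib.k8s import KrknKubernetes
from krkn_lib.k8s.pods_monitor_pool import PodsMonitorPool


def run_chaos_scenario():
    """Hypothetical stand-in for PLUGINS.run(scenario, kubeconfig_path, kraken_config)."""
    pass


kubecli = KrknKubernetes(kubeconfig_path="~/.kube/config")  # illustrative path
pool = PodsMonitorPool(kubecli)

# Start watching the pods the scenario is about to kill, *before* the chaos runs.
pool.select_and_monitor_by_namespace_pattern_and_label(
    namespace_pattern="^default$",   # illustrative selector
    label_selector="app=nginx",      # illustrative selector
    max_timeout=120)                 # krkn_pod_recovery_time

try:
    run_chaos_scenario()
    result = pool.join()  # block until the pods recover or max_timeout expires
    if result.error:
        raise Exception(f"unrecovered pods: {result.error}")
except Exception:
    pool.cancel()  # stop any monitors still running, as the plugin code below does
    raise
```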
@@ -1,7 +1,7 @@
 ERRORED=false

 function finish {
-  if [ $? -eq 1 ] && [ $ERRORED != "true" ]
+  if [ $? != 0 ] && [ $ERRORED != "true" ]
   then
     error
   fi
@@ -13,8 +13,10 @@ function error {
   then
     echo "Error caught."
     ERRORED=true
-  else
-    echo "Exit code greater than zero detected: $exit_code"
+  elif [ $exit_code == 2 ]
+  then
+    echo "Run with exit code 2 detected, it is expected, wrapping the exit code with 0 to avoid pipeline failure"
+    exit 0
   fi
 }
@@ -8,11 +8,11 @@ trap finish EXIT
 pod_file="CI/scenarios/hello_pod.yaml"

 function functional_test_container_crash {
-  yq -i '.scenarios[0].namespace="default"' scenarios/openshift/app_outage.yaml
-  yq -i '.scenarios[0].label_selector="scenario=container"' scenarios/openshift/app_outage.yaml
-  yq -i '.scenarios[0].container_name="fedtools"' scenarios/openshift/app_outage.yaml
+  yq -i '.scenarios[0].namespace="default"' scenarios/openshift/container_etcd.yml
+  yq -i '.scenarios[0].label_selector="scenario=container"' scenarios/openshift/container_etcd.yml
+  yq -i '.scenarios[0].container_name="fedtools"' scenarios/openshift/container_etcd.yml
   export scenario_type="container_scenarios"
-  export scenario_file="- scenarios/openshift/app_outage.yaml"
+  export scenario_file="- scenarios/openshift/container_etcd.yml"
   export post_config=""
   envsubst < CI/config/common_test_config.yaml > CI/config/container_config.yaml
@@ -22,14 +22,14 @@ function functional_test_telemetry {
   export scenario_file="scenarios/arcaflow/cpu-hog/input.yaml"
   export post_config=""
   envsubst < CI/config/common_test_config.yaml > CI/config/telemetry.yaml
-  python3 -m coverage run -a run_kraken.py -c CI/config/telemetry.yaml
+  retval=$(python3 -m coverage run -a run_kraken.py -c CI/config/telemetry.yaml)
   RUN_FOLDER=`cat CI/out/test_telemetry.out | grep amazonaws.com | sed -rn "s#.*https:\/\/.*\/files/(.*)#\1#p"`
   $AWS_CLI s3 ls "s3://$AWS_BUCKET/$RUN_FOLDER/" | awk '{ print $4 }' > s3_remote_files
   echo "checking if telemetry files are uploaded on s3"
   cat s3_remote_files | grep events-00.json || ( echo "FAILED: events-00.json not uploaded" && exit 1 )
   cat s3_remote_files | grep critical-alerts-00.json || ( echo "FAILED: critical-alerts-00.json not uploaded" && exit 1 )
-  cat s3_remote_files | grep prometheus-00.tar || ( echo "FAILED: prometheus backup not uploaded" && exit 1 )
-  cat s3_remote_files | grep telemetry.json || ( echo "FAILED: telemetry.json not uploaded" && exit 1 )
+  cat s3_remote_files | grep critical-alerts-00.log || ( echo "FAILED: critical-alerts-00.log not uploaded" && exit 1 )
+  cat s3_remote_files | grep prometheus-00.tar || ( echo "FAILED: prometheus backup not uploaded" && exit 1 )
+  cat s3_remote_files | grep telemetry.json || ( echo "FAILED: telemetry.json not uploaded" && exit 1 )
-  echo "all files uploaded!"
+  echo "Telemetry Collection: Success"
 }
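The same upload check, sketched in Python for clarity. A minimal sketch assuming the `s3_remote_files` listing produced by the `aws s3 ls` redirect above; the file names are the ones the test asserts on:

```python
# Verify the expected telemetry artifacts appear in the S3 listing.
REQUIRED = [
    "events-00.json",
    "critical-alerts-00.json",
    "critical-alerts-00.log",
    "prometheus-00.tar",
    "telemetry.json",
]

with open("s3_remote_files") as f:
    listing = f.read()

missing = [name for name in REQUIRED if name not in listing]
if missing:
    raise SystemExit(f"FAILED: {missing} not uploaded")
print("Telemetry Collection: Success")
```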
@@ -14,11 +14,7 @@ For example, for adding a pod level scenario for a new application, refer to the
   namespace_pattern: ^<namespace>$
   label_selector: <pod label>
   kill: <number of pods to kill>
-- id: wait-for-pods
-  config:
-    namespace_pattern: ^<namespace>$
-    label_selector: <pod label>
-    count: <expected number of pods that match namespace and label>
+  krkn_pod_recovery_time: <expected time for the pod to become ready>
 ```

 #### Node Scenario Yaml Template
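For context, krkn reads `krkn_pod_recovery_time` from each `kill-pods` step's `config` block; per the schema change later in this commit it defaults to 30 seconds when omitted. A sketch of that lookup (the step dict itself is illustrative):

```python
# Illustrative kill-pods step as parsed from a scenario YAML file.
kill_scenario = {
    "id": "kill-pods",
    "config": {
        "namespace_pattern": "^default$",
        "label_selector": "app=nginx",
        "kill": 1,
        "krkn_pod_recovery_time": 120,
    },
}

# Fall back to the schema default (30s) when the key is omitted.
recovery_time = kill_scenario["config"].get("krkn_pod_recovery_time", 30)
```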
@@ -17,11 +17,8 @@ You can then create the scenario file with the following contents:
   config:
     namespace_pattern: ^kube-system$
     label_selector: k8s-app=kube-scheduler
-- id: wait-for-pods
-  config:
-    namespace_pattern: ^kube-system$
-    label_selector: k8s-app=kube-scheduler
-    count: 3
+    krkn_pod_recovery_time: 120

 ```

 Please adjust the schema reference to point to the [schema file](../scenarios/plugin.schema.json). This file will give you code completion and documentation for the available options in your IDE.
@@ -2,11 +2,14 @@ import dataclasses
 import json
 import logging
 from os.path import abspath
-from typing import List, Dict
+from typing import List, Dict, Any
 import time

 from arcaflow_plugin_sdk import schema, serialization, jsonschema
 from arcaflow_plugin_kill_pod import kill_pods, wait_for_pods
+from krkn_lib.k8s import KrknKubernetes
+from krkn_lib.k8s.pods_monitor_pool import PodsMonitorPool

 import kraken.plugins.node_scenarios.vmware_plugin as vmware_plugin
 import kraken.plugins.node_scenarios.ibmcloud_plugin as ibmcloud_plugin
 from kraken.plugins.run_python_plugin import run_python_file
@@ -47,11 +50,14 @@ class Plugins:
         )
         self.steps_by_id[step.schema.id] = step

+    def unserialize_scenario(self, file: str) -> Any:
+        return serialization.load_from_file(abspath(file))
+
     def run(self, file: str, kubeconfig_path: str, kraken_config: str):
         """
         Run executes a series of steps
         """
-        data = serialization.load_from_file(abspath(file))
+        data = self.unserialize_scenario(abspath(file))
         if not isinstance(data, list):
             raise Exception(
                 "Invalid scenario configuration file: {} expected list, found {}".format(file, type(data).__name__)
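Exposing `unserialize_scenario` lets callers pre-scan a scenario file before running it; the `run()` function further down uses exactly this to collect the `kill-pods` steps that need monitoring. A condensed sketch (the file path is illustrative):

```python
# Pre-scan a scenario file for the steps that need pod monitoring.
data = PLUGINS.unserialize_scenario("scenarios/kill-pods.yml")  # illustrative path
kill_scenarios = [step for step in data if step["id"] == "kill-pods"]
```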
@@ -241,7 +247,15 @@ PLUGINS = Plugins(
 )


-def run(scenarios: List[str], kubeconfig_path: str, kraken_config: str, failed_post_scenarios: List[str], wait_duration: int, telemetry: KrknTelemetryKubernetes) -> (List[str], list[ScenarioTelemetry]):
+def run(scenarios: List[str],
+        kubeconfig_path: str,
+        kraken_config: str,
+        failed_post_scenarios: List[str],
+        wait_duration: int,
+        telemetry: KrknTelemetryKubernetes,
+        kubecli: KrknKubernetes
+        ) -> (List[str], list[ScenarioTelemetry]):

     scenario_telemetries: list[ScenarioTelemetry] = []
     for scenario in scenarios:
         scenario_telemetry = ScenarioTelemetry()
@@ -249,10 +263,21 @@ def run(scenarios: List[str], kubeconfig_path: str, kraken_config: str, failed_p
         scenario_telemetry.startTimeStamp = time.time()
         telemetry.set_parameters_base64(scenario_telemetry, scenario)
         logging.info('scenario ' + str(scenario))
+        pool = PodsMonitorPool(kubecli)
+        kill_scenarios = [kill_scenario for kill_scenario in PLUGINS.unserialize_scenario(scenario) if kill_scenario["id"] == "kill-pods"]
+
         try:
+            start_monitoring(pool, kill_scenarios)
             PLUGINS.run(scenario, kubeconfig_path, kraken_config)
+            result = pool.join()
+            scenario_telemetry.affected_pods = result
+            if result.error:
+                raise Exception(f"unrecovered pods: {result.error}")
+
         except Exception as e:
+            logging.error(f"scenario exception: {str(e)}")
             scenario_telemetry.exitStatus = 1
+            pool.cancel()
             failed_post_scenarios.append(scenario)
             log_exception(scenario)
         else:
@@ -263,3 +288,31 @@
         scenario_telemetry.endTimeStamp = time.time()

     return failed_post_scenarios, scenario_telemetries
+
+
+def start_monitoring(pool: PodsMonitorPool, scenarios: list[Any]):
+    for kill_scenario in scenarios:
+        recovery_time = kill_scenario["config"]["krkn_pod_recovery_time"]
+        if ("namespace_pattern" in kill_scenario["config"] and
+                "label_selector" in kill_scenario["config"]):
+            namespace_pattern = kill_scenario["config"]["namespace_pattern"]
+            label_selector = kill_scenario["config"]["label_selector"]
+            pool.select_and_monitor_by_namespace_pattern_and_label(
+                namespace_pattern=namespace_pattern,
+                label_selector=label_selector,
+                max_timeout=recovery_time)
+            logging.info(
+                f"waiting {recovery_time} seconds for pod recovery, "
+                f"pod label selector: {label_selector} namespace pattern: {namespace_pattern}")
+
+        elif ("namespace_pattern" in kill_scenario["config"] and
+                "name_pattern" in kill_scenario["config"]):
+            namespace_pattern = kill_scenario["config"]["namespace_pattern"]
+            name_pattern = kill_scenario["config"]["name_pattern"]
+            pool.select_and_monitor_by_name_pattern_and_namespace_pattern(
+                pod_name_pattern=name_pattern,
+                namespace_pattern=namespace_pattern,
+                max_timeout=recovery_time)
+            logging.info(f"waiting {recovery_time} seconds for pod recovery, "
+                         f"pod name pattern: {name_pattern} namespace pattern: {namespace_pattern}")
+        else:
+            raise Exception(f"impossible to determine monitor parameters, check {kill_scenario} configuration")
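`start_monitoring` picks the krkn-lib selector based on which keys the step config carries: a label-based step goes through `select_and_monitor_by_namespace_pattern_and_label`, a name-based one through `select_and_monitor_by_name_pattern_and_namespace_pattern`. Illustrative inputs for the two branches, assuming a `pool` like the one built in `run()` above (the selector values echo the scenario files changed later in this commit):

```python
# Branch 1: namespace_pattern + label_selector
label_step = {"id": "kill-pods", "config": {
    "namespace_pattern": "^kube-system$",
    "label_selector": "k8s-app=kube-scheduler",
    "krkn_pod_recovery_time": 120,
}}

# Branch 2: namespace_pattern + name_pattern
name_step = {"id": "kill-pods", "config": {
    "namespace_pattern": "^default$",
    "name_pattern": "^nginx-.*$",
    "krkn_pod_recovery_time": 120,
}}

start_monitoring(pool, [label_step, name_step])
```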
@@ -1,9 +1,13 @@
 import logging
 import time
+from typing import Any

 import yaml
 import sys
 import random
 import arcaflow_plugin_kill_pod
+from krkn_lib.k8s.pods_monitor_pool import PodsMonitorPool

 import kraken.cerberus.setup as cerberus
 import kraken.post_actions.actions as post_actions
 from krkn_lib.k8s import KrknKubernetes
@@ -79,6 +83,7 @@ def container_run(kubeconfig_path,

     failed_scenarios = []
     scenario_telemetries: list[ScenarioTelemetry] = []
+    pool = PodsMonitorPool(kubecli)

     for container_scenario_config in scenarios_list:
         scenario_telemetry = ScenarioTelemetry()
@@ -91,23 +96,17 @@ def container_run(kubeconfig_path,
         pre_action_output = ""
         with open(container_scenario_config[0], "r") as f:
             cont_scenario_config = yaml.full_load(f)
+            start_monitoring(kill_scenarios=cont_scenario_config["scenarios"], pool=pool)
             for cont_scenario in cont_scenario_config["scenarios"]:
                 # capture start time
                 start_time = int(time.time())
                 try:
                     killed_containers = container_killing_in_pod(cont_scenario, kubecli)
                     if len(container_scenario_config) > 1:
                         failed_post_scenarios = post_actions.check_recovery(
                             kubeconfig_path,
                             container_scenario_config,
                             failed_post_scenarios,
                             pre_action_output
                         )
                     else:
                         failed_post_scenarios = check_failed_containers(
                             killed_containers, cont_scenario.get("retry_wait", 120), kubecli
                         )

                     logging.info(f"killed containers: {str(killed_containers)}")
+                    result = pool.join()
+                    if result.error:
+                        raise Exception(f"pods failed to recovery: {result.error}")
+                    scenario_telemetry.affected_pods = result
                     logging.info("Waiting for the specified duration: %s" % (wait_duration))
                     time.sleep(wait_duration)
@@ -117,6 +116,7 @@ def container_run(kubeconfig_path,
             # publish cerberus status
             cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time)
         except (RuntimeError, Exception):
+            pool.cancel()
             failed_scenarios.append(container_scenario_config[0])
             log_exception(container_scenario_config[0])
             scenario_telemetry.exitStatus = 1
@@ -129,6 +129,16 @@ def container_run(kubeconfig_path,

     return failed_scenarios, scenario_telemetries

+def start_monitoring(kill_scenarios: list[Any], pool: PodsMonitorPool):
+    for kill_scenario in kill_scenarios:
+        namespace_pattern = f"^{kill_scenario['namespace']}$"
+        label_selector = kill_scenario["label_selector"]
+        recovery_time = kill_scenario["expected_recovery_time"]
+        pool.select_and_monitor_by_namespace_pattern_and_label(
+            namespace_pattern=namespace_pattern,
+            label_selector=label_selector,
+            max_timeout=recovery_time)
+

 def container_killing_in_pod(cont_scenario, kubecli: KrknKubernetes):
     scenario_name = get_yaml_item_value(cont_scenario, "name", "")
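Unlike the plugin variant, the container-scenario `start_monitoring` builds an exact-match namespace regex from the scenario's plain `namespace` field and reads `expected_recovery_time` rather than `krkn_pod_recovery_time` (see the `container_etcd.yml` change below). An illustrative input, assuming a `pool` created as in `container_run` above:

```python
# Matches the container scenario schema exercised by the CI changes above.
kill_scenario = {
    "namespace": "default",                  # becomes the regex ^default$
    "label_selector": "scenario=container",
    "expected_recovery_time": 120,
}
start_monitoring(kill_scenarios=[kill_scenario], pool=pool)
```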
@@ -15,7 +15,7 @@ google-api-python-client==2.116.0
 ibm_cloud_sdk_core==3.18.0
 ibm_vpc==0.20.0
 jinja2==3.1.3
-krkn-lib==2.1.1
+krkn-lib==2.1.2
 lxml==5.1.0
 kubernetes==26.1.0
 oauth2client==4.1.3
@@ -264,7 +264,8 @@ def main(cfg):
                 kraken_config,
                 failed_post_scenarios,
                 wait_duration,
-                telemetry_k8s
+                telemetry_k8s,
+                kubecli
             )
             chaos_telemetry.scenarios.extend(scenario_telemetries)
             # krkn_lib
@@ -3,8 +3,4 @@
   config:
     namespace_pattern: ^kube-system$
     label_selector: component=kube-scheduler
-- id: wait-for-pods
-  config:
-    namespace_pattern: ^kube-system$
-    label_selector: component=kube-scheduler
-    count: 3
+    krkn_pod_recovery_time: 120
@@ -4,3 +4,4 @@
     name_pattern: ^nginx-.*$
     namespace_pattern: ^default$
     kill: 1
+    krkn_pod_recovery_time: 120
@@ -3,8 +3,4 @@
   config:
     namespace_pattern: ^kube-system$
     label_selector: k8s-app=kube-scheduler
-- id: wait-for-pods
-  config:
-    namespace_pattern: ^kube-system$
-    label_selector: k8s-app=kube-scheduler
-    count: 3
+    krkn_pod_recovery_time: 120
@@ -5,4 +5,4 @@ scenarios:
     container_name: "etcd"
     action: 1
     count: 1
-    expected_recovery_time: 60
+    expected_recovery_time: 120
@@ -3,8 +3,4 @@
   config:
     namespace_pattern: ^acme-air$
    name_pattern: .*
-- id: wait-for-pods
-  config:
-    namespace_pattern: ^acme-air$
-    name_pattern: .*
-    count: 8
+    krkn_pod_recovery_time: 120
@@ -3,8 +3,4 @@
   config:
     namespace_pattern: ^openshift-etcd$
     label_selector: k8s-app=etcd
-- id: wait-for-pods
-  config:
-    namespace_pattern: ^openshift-etcd$
-    label_selector: k8s-app=etcd
-    count: 3
+    krkn_pod_recovery_time: 120
@@ -3,8 +3,5 @@
   config:
     namespace_pattern: ^openshift-apiserver$
     label_selector: app=openshift-apiserver-a
-- id: wait-for-pods
-  config:
-    namespace_pattern: ^openshift-apiserver$
-    label_selector: app=openshift-apiserver-a
-    count: 3
+    krkn_pod_recovery_time: 120

@@ -3,8 +3,5 @@
   config:
     namespace_pattern: ^openshift-kube-apiserver$
     label_selector: app=openshift-kube-apiserver
-- id: wait-for-pods
-  config:
-    namespace_pattern: ^openshift-kube-apiserver$
-    label_selector: app=openshift-kube-apiserver
-    count: 3
+    krkn_pod_recovery_time: 120

@@ -3,8 +3,4 @@
   config:
     namespace_pattern: ^openshift-monitoring$
     label_selector: app=prometheus
-- id: wait-for-pods
-  config:
-    namespace_pattern: ^openshift-monitoring$
-    label_selector: app=prometheus
-    count: 2
+    krkn_pod_recovery_time: 120
@@ -2,8 +2,4 @@
   config:
     namespace_pattern: ^openshift-monitoring$
     label_selector: statefulset.kubernetes.io/pod-name=prometheus-k8s-0
-- id: wait-for-pods
-  config:
-    namespace_pattern: ^openshift-monitoring$
-    label_selector: statefulset.kubernetes.io/pod-name=prometheus-k8s-0
-    count: 1
+    krkn_pod_recovery_time: 120
@@ -3,9 +3,4 @@
   config:
     namespace_pattern: ^openshift-monitoring$
     label_selector: app=prometheus
-- id: wait-for-pods
-  config:
-    namespace_pattern: ^openshift-monitoring$
-    label_selector: app=prometheus
-    count: 2
-    timeout: 180
+    krkn_pod_recovery_time: 120
@@ -4,3 +4,4 @@
     namespace_pattern: ^openshift-.*$
     name_pattern: .*
     kill: 3
+    krkn_pod_recovery_time: 120
@@ -60,7 +60,14 @@
       "default": 1,
       "title": "Backoff",
       "description": "How many seconds to wait between checks for the target pod status."
     },
+    "krkn_pod_recovery_time": {
+      "type": "integer",
+      "default": 30,
+      "title": "Recovery Time",
+      "description": "The expected recovery time of the pod (used by Krkn to monitor the pod lifecycle)."
+    }
+
   },
   "required": [
     "namespace_pattern"
@@ -112,6 +119,12 @@
       "default": 1,
       "title": "Backoff",
       "description": "How many seconds to wait between checks for the target pod status."
     },
+    "krkn_pod_recovery_time": {
+      "type": "integer",
+      "default": 30,
+      "title": "Recovery Time",
+      "description": "The expected recovery time of the pod (used by Krkn to monitor the pod lifecycle)."
+    }
   },
   "required": [
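The docs above point scenario files at `plugin.schema.json`; one way to check a step against it offline is the standalone `jsonschema` package. A sketch under that assumption — krkn itself validates scenarios through the arcaflow SDK rather than this call, and whether the schema root accepts a full step list is an assumption here:

```python
import json

from jsonschema import validate  # assumption: jsonschema package installed

with open("scenarios/plugin.schema.json") as f:
    schema = json.load(f)

# Illustrative kill-pods step using the new field added by this commit.
scenario = [{"id": "kill-pods", "config": {
    "namespace_pattern": "^default$",
    "name_pattern": ".*",
    "kill": 1,
    "krkn_pod_recovery_time": 120,
}}]

validate(instance=scenario, schema=schema)  # raises ValidationError on a bad config
```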