Integration of the new pod recovery monitoring strategy implemented in krkn-lib (#609)

* pod monitoring integration in plugin scenario

Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>

* pod monitoring integration in container scenario

Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>

* removed wait-for-pod step from plugin scenario config files

Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>

* introduced global pod recovery time

Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>

nit

Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>

* introduced krkn_pod_recovery_time in plugin scenario and removed all the references to wait-for-pods

Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>

fix

Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>

* functional test fix

Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>

* main branch functional test fix

Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>

* increased recovery times

Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>

---------

Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>
Author: Tullio Sebastiani
Date: 2024-04-23 10:49:01 +02:00
Committed by: GitHub
parent 19ad2d1a3d
commit ab98e416a6
22 changed files with 124 additions and 85 deletions
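In practice, the change replaces the separate wait-for-pods step in plugin scenario files with a single krkn_pod_recovery_time field on the kill step, delegating recovery monitoring to krkn-lib. A before/after sketch of that migration (the namespace and label values are illustrative, taken from the scheduler example in the documentation changes below):

```yaml
# Before: recovery tracked by an explicit wait-for-pods step
- id: kill-pods
  config:
    namespace_pattern: ^kube-system$
    label_selector: k8s-app=kube-scheduler
- id: wait-for-pods
  config:
    namespace_pattern: ^kube-system$
    label_selector: k8s-app=kube-scheduler
    count: 3

# After: recovery monitored by krkn-lib via a single field on the kill step
- id: kill-pods
  config:
    namespace_pattern: ^kube-system$
    label_selector: k8s-app=kube-scheduler
    krkn_pod_recovery_time: 120
```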

@@ -1,7 +1,7 @@
ERRORED=false
function finish {
if [ $? -eq 1 ] && [ $ERRORED != "true" ]
if [ $? != 0 ] && [ $ERRORED != "true" ]
then
error
fi
@@ -13,8 +13,10 @@ function error {
then
echo "Error caught."
ERRORED=true
else
echo "Exit code greater than zero detected: $exit_code"
elif [ $exit_code == 2 ]
then
echo "Run with exit code 2 detected, it is expected, wrapping the exit code with 0 to avoid pipeline failure"
exit 0
fi
}

@@ -8,11 +8,11 @@ trap finish EXIT
pod_file="CI/scenarios/hello_pod.yaml"
function functional_test_container_crash {
yq -i '.scenarios[0].namespace="default"' scenarios/openshift/app_outage.yaml
yq -i '.scenarios[0].label_selector="scenario=container"' scenarios/openshift/app_outage.yaml
yq -i '.scenarios[0].container_name="fedtools"' scenarios/openshift/app_outage.yaml
yq -i '.scenarios[0].namespace="default"' scenarios/openshift/container_etcd.yml
yq -i '.scenarios[0].label_selector="scenario=container"' scenarios/openshift/container_etcd.yml
yq -i '.scenarios[0].container_name="fedtools"' scenarios/openshift/container_etcd.yml
export scenario_type="container_scenarios"
export scenario_file="- scenarios/openshift/app_outage.yaml"
export scenario_file="- scenarios/openshift/container_etcd.yml"
export post_config=""
envsubst < CI/config/common_test_config.yaml > CI/config/container_config.yaml

@@ -22,14 +22,14 @@ function functional_test_telemetry {
export scenario_file="scenarios/arcaflow/cpu-hog/input.yaml"
export post_config=""
envsubst < CI/config/common_test_config.yaml > CI/config/telemetry.yaml
python3 -m coverage run -a run_kraken.py -c CI/config/telemetry.yaml
retval=$(python3 -m coverage run -a run_kraken.py -c CI/config/telemetry.yaml)
RUN_FOLDER=`cat CI/out/test_telemetry.out | grep amazonaws.com | sed -rn "s#.*https:\/\/.*\/files/(.*)#\1#p"`
$AWS_CLI s3 ls "s3://$AWS_BUCKET/$RUN_FOLDER/" | awk '{ print $4 }' > s3_remote_files
echo "checking if telemetry files are uploaded on s3"
cat s3_remote_files | grep events-00.json || ( echo "FAILED: events-00.json not uploaded" && exit 1 )
cat s3_remote_files | grep critical-alerts-00.json || ( echo "FAILED: critical-alerts-00.json not uploaded" && exit 1 )
cat s3_remote_files | grep prometheus-00.tar || ( echo "FAILED: prometheus backup not uploaded" && exit 1 )
cat s3_remote_files | grep telemetry.json || ( echo "FAILED: telemetry.json not uploaded" && exit 1 )
cat s3_remote_files | grep critical-alerts-00.log || ( echo "FAILED: critical-alerts-00.log not uploaded" && exit 1 )
cat s3_remote_files | grep prometheus-00.tar || ( echo "FAILED: prometheus backup not uploaded" && exit 1 )
cat s3_remote_files | grep telemetry.json || ( echo "FAILED: telemetry.json not uploaded" && exit 1 )
echo "all files uploaded!"
echo "Telemetry Collection: Success"
}

@@ -14,11 +14,7 @@ For example, for adding a pod level scenario for a new application, refer to the
namespace_pattern: ^<namespace>$
label_selector: <pod label>
kill: <number of pods to kill>
- id: wait-for-pods
config:
namespace_pattern: ^<namespace>$
label_selector: <pod label>
count: <expected number of pods that match namespace and label>
krkn_pod_recovery_time: <expected time for the pod to become ready>
```
#### Node Scenario Yaml Template

@@ -17,11 +17,8 @@ You can then create the scenario file with the following contents:
config:
namespace_pattern: ^kube-system$
label_selector: k8s-app=kube-scheduler
- id: wait-for-pods
config:
namespace_pattern: ^kube-system$
label_selector: k8s-app=kube-scheduler
count: 3
krkn_pod_recovery_time: 120
```
Please adjust the schema reference to point to the [schema file](../scenarios/plugin.schema.json). This file will give you code completion and documentation for the available options in your IDE.

@@ -2,11 +2,14 @@ import dataclasses
import json
import logging
from os.path import abspath
from typing import List, Dict
from typing import List, Dict, Any
import time
from arcaflow_plugin_sdk import schema, serialization, jsonschema
from arcaflow_plugin_kill_pod import kill_pods, wait_for_pods
from krkn_lib.k8s import KrknKubernetes
from krkn_lib.k8s.pods_monitor_pool import PodsMonitorPool
import kraken.plugins.node_scenarios.vmware_plugin as vmware_plugin
import kraken.plugins.node_scenarios.ibmcloud_plugin as ibmcloud_plugin
from kraken.plugins.run_python_plugin import run_python_file
@@ -47,11 +50,14 @@ class Plugins:
)
self.steps_by_id[step.schema.id] = step
def unserialize_scenario(self, file: str) -> Any:
return serialization.load_from_file(abspath(file))
def run(self, file: str, kubeconfig_path: str, kraken_config: str):
"""
Run executes a series of steps
"""
data = serialization.load_from_file(abspath(file))
data = self.unserialize_scenario(abspath(file))
if not isinstance(data, list):
raise Exception(
"Invalid scenario configuration file: {} expected list, found {}".format(file, type(data).__name__)
@@ -241,7 +247,15 @@ PLUGINS = Plugins(
)
def run(scenarios: List[str], kubeconfig_path: str, kraken_config: str, failed_post_scenarios: List[str], wait_duration: int, telemetry: KrknTelemetryKubernetes) -> (List[str], list[ScenarioTelemetry]):
def run(scenarios: List[str],
kubeconfig_path: str,
kraken_config: str,
failed_post_scenarios: List[str],
wait_duration: int,
telemetry: KrknTelemetryKubernetes,
kubecli: KrknKubernetes
) -> (List[str], list[ScenarioTelemetry]):
scenario_telemetries: list[ScenarioTelemetry] = []
for scenario in scenarios:
scenario_telemetry = ScenarioTelemetry()
@@ -249,10 +263,21 @@ def run(scenarios: List[str], kubeconfig_path: str, kraken_config: str, failed_p
scenario_telemetry.startTimeStamp = time.time()
telemetry.set_parameters_base64(scenario_telemetry, scenario)
logging.info('scenario ' + str(scenario))
pool = PodsMonitorPool(kubecli)
kill_scenarios = [kill_scenario for kill_scenario in PLUGINS.unserialize_scenario(scenario) if kill_scenario["id"] == "kill-pods"]
try:
start_monitoring(pool, kill_scenarios)
PLUGINS.run(scenario, kubeconfig_path, kraken_config)
result = pool.join()
scenario_telemetry.affected_pods = result
if result.error:
raise Exception(f"unrecovered pods: {result.error}")
except Exception as e:
logging.error(f"scenario exception: {str(e)}")
scenario_telemetry.exitStatus = 1
pool.cancel()
failed_post_scenarios.append(scenario)
log_exception(scenario)
else:
@@ -263,3 +288,31 @@ def run(scenarios: List[str], kubeconfig_path: str, kraken_config: str, failed_p
scenario_telemetry.endTimeStamp = time.time()
return failed_post_scenarios, scenario_telemetries
def start_monitoring(pool: PodsMonitorPool, scenarios: list[Any]):
for kill_scenario in scenarios:
recovery_time = kill_scenario["config"]["krkn_pod_recovery_time"]
if ("namespace_pattern" in kill_scenario["config"] and
"label_selector" in kill_scenario["config"]):
namespace_pattern = kill_scenario["config"]["namespace_pattern"]
label_selector = kill_scenario["config"]["label_selector"]
pool.select_and_monitor_by_namespace_pattern_and_label(
namespace_pattern=namespace_pattern,
label_selector=label_selector,
max_timeout=recovery_time)
logging.info(
f"waiting {recovery_time} seconds for pod recovery, "
f"pod label selector: {label_selector} namespace pattern: {namespace_pattern}")
elif ("namespace_pattern" in kill_scenario["config"] and
"name_pattern" in kill_scenario["config"]):
namespace_pattern = kill_scenario["config"]["namespace_pattern"]
name_pattern = kill_scenario["config"]["name_pattern"]
pool.select_and_monitor_by_name_pattern_and_namespace_pattern(pod_name_pattern=name_pattern,
namespace_pattern=namespace_pattern,
max_timeout=recovery_time)
logging.info(f"waiting {recovery_time} seconds for pod recovery, "
f"pod name pattern: {name_pattern} namespace pattern: {namespace_pattern}")
else:
raise Exception(f"impossible to determine monitor parameters, check {kill_scenario} configuration")

@@ -1,9 +1,13 @@
import logging
import time
from typing import Any
import yaml
import sys
import random
import arcaflow_plugin_kill_pod
from krkn_lib.k8s.pods_monitor_pool import PodsMonitorPool
import kraken.cerberus.setup as cerberus
import kraken.post_actions.actions as post_actions
from krkn_lib.k8s import KrknKubernetes
@@ -79,6 +83,7 @@ def container_run(kubeconfig_path,
failed_scenarios = []
scenario_telemetries: list[ScenarioTelemetry] = []
pool = PodsMonitorPool(kubecli)
for container_scenario_config in scenarios_list:
scenario_telemetry = ScenarioTelemetry()
@@ -91,23 +96,17 @@ def container_run(kubeconfig_path,
pre_action_output = ""
with open(container_scenario_config[0], "r") as f:
cont_scenario_config = yaml.full_load(f)
start_monitoring(kill_scenarios=cont_scenario_config["scenarios"], pool=pool)
for cont_scenario in cont_scenario_config["scenarios"]:
# capture start time
start_time = int(time.time())
try:
killed_containers = container_killing_in_pod(cont_scenario, kubecli)
if len(container_scenario_config) > 1:
failed_post_scenarios = post_actions.check_recovery(
kubeconfig_path,
container_scenario_config,
failed_post_scenarios,
pre_action_output
)
else:
failed_post_scenarios = check_failed_containers(
killed_containers, cont_scenario.get("retry_wait", 120), kubecli
)
logging.info(f"killed containers: {str(killed_containers)}")
result = pool.join()
if result.error:
raise Exception(f"pods failed to recovery: {result.error}")
scenario_telemetry.affected_pods = result
logging.info("Waiting for the specified duration: %s" % (wait_duration))
time.sleep(wait_duration)
@@ -117,6 +116,7 @@ def container_run(kubeconfig_path,
# publish cerberus status
cerberus.publish_kraken_status(config, failed_post_scenarios, start_time, end_time)
except (RuntimeError, Exception):
pool.cancel()
failed_scenarios.append(container_scenario_config[0])
log_exception(container_scenario_config[0])
scenario_telemetry.exitStatus = 1
@@ -129,6 +129,16 @@ def container_run(kubeconfig_path,
return failed_scenarios, scenario_telemetries
def start_monitoring(kill_scenarios: list[Any], pool: PodsMonitorPool):
for kill_scenario in kill_scenarios:
namespace_pattern = f"^{kill_scenario['namespace']}$"
label_selector = kill_scenario["label_selector"]
recovery_time = kill_scenario["expected_recovery_time"]
pool.select_and_monitor_by_namespace_pattern_and_label(
namespace_pattern=namespace_pattern,
label_selector=label_selector,
max_timeout=recovery_time)
def container_killing_in_pod(cont_scenario, kubecli: KrknKubernetes):
scenario_name = get_yaml_item_value(cont_scenario, "name", "")

@@ -15,7 +15,7 @@ google-api-python-client==2.116.0
ibm_cloud_sdk_core==3.18.0
ibm_vpc==0.20.0
jinja2==3.1.3
krkn-lib==2.1.1
krkn-lib==2.1.2
lxml==5.1.0
kubernetes==26.1.0
oauth2client==4.1.3

@@ -264,7 +264,8 @@ def main(cfg):
kraken_config,
failed_post_scenarios,
wait_duration,
telemetry_k8s
telemetry_k8s,
kubecli
)
chaos_telemetry.scenarios.extend(scenario_telemetries)
# krkn_lib

@@ -3,8 +3,4 @@
config:
namespace_pattern: ^kube-system$
label_selector: component=kube-scheduler
- id: wait-for-pods
config:
namespace_pattern: ^kube-system$
label_selector: component=kube-scheduler
count: 3
krkn_pod_recovery_time: 120

@@ -4,3 +4,4 @@
name_pattern: ^nginx-.*$
namespace_pattern: ^default$
kill: 1
krkn_pod_recovery_time: 120

@@ -3,8 +3,4 @@
config:
namespace_pattern: ^kube-system$
label_selector: k8s-app=kube-scheduler
- id: wait-for-pods
config:
namespace_pattern: ^kube-system$
label_selector: k8s-app=kube-scheduler
count: 3
krkn_pod_recovery_time: 120

@@ -5,4 +5,4 @@ scenarios:
container_name: "etcd"
action: 1
count: 1
expected_recovery_time: 60
expected_recovery_time: 120
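For reference, a hedged sketch of a complete container scenario entry using the new field; the keys match what container_run/start_monitoring read (name, namespace, label_selector, expected_recovery_time) and what the CI script sets above, while the concrete values are illustrative:

```yaml
scenarios:
  - name: "kill etcd container"        # illustrative scenario name
    namespace: "default"               # monitored by start_monitoring as ^default$
    label_selector: "scenario=container"
    container_name: "fedtools"
    action: 1                          # kill action, as in the container_etcd scenario
    count: 1
    expected_recovery_time: 120        # max seconds to wait for the pod to recover
```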

@@ -3,8 +3,4 @@
config:
namespace_pattern: ^acme-air$
name_pattern: .*
- id: wait-for-pods
config:
namespace_pattern: ^acme-air$
name_pattern: .*
count: 8
krkn_pod_recovery_time: 120

@@ -3,8 +3,4 @@
config:
namespace_pattern: ^openshift-etcd$
label_selector: k8s-app=etcd
- id: wait-for-pods
config:
namespace_pattern: ^openshift-etcd$
label_selector: k8s-app=etcd
count: 3
krkn_pod_recovery_time: 120

@@ -3,8 +3,5 @@
config:
namespace_pattern: ^openshift-apiserver$
label_selector: app=openshift-apiserver-a
- id: wait-for-pods
config:
namespace_pattern: ^openshift-apiserver$
label_selector: app=openshift-apiserver-a
count: 3
krkn_pod_recovery_time: 120

@@ -3,8 +3,5 @@
config:
namespace_pattern: ^openshift-kube-apiserver$
label_selector: app=openshift-kube-apiserver
- id: wait-for-pods
config:
namespace_pattern: ^openshift-kube-apiserver$
label_selector: app=openshift-kube-apiserver
count: 3
krkn_pod_recovery_time: 120

@@ -3,8 +3,4 @@
config:
namespace_pattern: ^openshift-monitoring$
label_selector: app=prometheus
- id: wait-for-pods
config:
namespace_pattern: ^openshift-monitoring$
label_selector: app=prometheus
count: 2
krkn_pod_recovery_time: 120

@@ -2,8 +2,4 @@
config:
namespace_pattern: ^openshift-monitoring$
label_selector: statefulset.kubernetes.io/pod-name=prometheus-k8s-0
- id: wait-for-pods
config:
namespace_pattern: ^openshift-monitoring$
label_selector: statefulset.kubernetes.io/pod-name=prometheus-k8s-0
count: 1
krkn_pod_recovery_time: 120

@@ -3,9 +3,4 @@
config:
namespace_pattern: ^openshift-monitoring$
label_selector: app=prometheus
- id: wait-for-pods
config:
namespace_pattern: ^openshift-monitoring$
label_selector: app=prometheus
count: 2
timeout: 180
krkn_pod_recovery_time: 120

@@ -4,3 +4,4 @@
namespace_pattern: ^openshift-.*$
name_pattern: .*
kill: 3
krkn_pod_recovery_time: 120

@@ -60,7 +60,14 @@
"default": 1,
"title": "Backoff",
"description": "How many seconds to wait between checks for the target pod status."
},
"krkn_pod_recovery_time": {
"type": "integer",
"default": 30,
"title": "Recovery Time",
"description": "The Expected Recovery time fo the pod (used by Krkn to monitor the pod lifecycle)."
}
},
"required": [
"namespace_pattern"
@@ -112,6 +119,12 @@
"default": 1,
"title": "Backoff",
"description": "How many seconds to wait between checks for the target pod status."
},
"krkn_pod_recovery_time": {
"type": "integer",
"default": 30,
"title": "Recovery Time",
"description": "The Expected Recovery time fo the pod (used by Krkn to monitor the pod lifecycle)."
}
},
"required": [