Adding node interface down/up scenario (#1192)

* Adding node interface down/up scenario

Signed-off-by: Paige Patton <prubenda@redhat.com>

* Trigger CI

---------

Signed-off-by: Paige Patton <prubenda@redhat.com>
This commit is contained in:
Paige Patton
2026-03-31 12:59:41 -05:00
committed by GitHub
parent 35ee9d7bae
commit 357889196a
9 changed files with 527 additions and 9 deletions

View File

@@ -52,6 +52,7 @@ kraken:
- scenarios/kube/node-network-filter.yml
- scenarios/kube/node-network-chaos.yml
- scenarios/kube/pod-network-chaos.yml
- scenarios/kube/node_interface_down.yaml
- kubevirt_vm_outage:
- scenarios/kubevirt/kubevirt-vm-outage.yaml

View File

@@ -62,6 +62,19 @@ class NetworkFilterConfig(BaseNetworkChaosConfig):
return errors
@dataclass
class InterfaceDownConfig(BaseNetworkChaosConfig):
    """Configuration for the ``node_interface_down`` scenario.

    Extends the shared network-chaos settings with direction flags and an
    optional post-recovery stabilization wait.
    """

    # Direction flags; presumably informational for this scenario — they are
    # not read by NodeInterfaceDownModule. TODO confirm against other callers.
    ingress: bool = True
    egress: bool = True
    # Seconds to wait after the interface is restored, so the node can
    # stabilize before the workload pod is removed. 0 disables the wait.
    recovery_time: int = 0

    def validate(self) -> list[str]:
        """Validate the config.

        :return: list of human-readable error messages (empty when valid)
        """
        errors = super().validate()
        # bool is a subclass of int, so reject it explicitly: True/False are
        # not meaningful second counts and would otherwise pass as 1/0.
        if (
            isinstance(self.recovery_time, bool)
            or not isinstance(self.recovery_time, int)
            or self.recovery_time < 0
        ):
            errors.append("recovery_time must be a non-negative integer (seconds)")
        return errors
@dataclass
class NetworkChaosConfig(BaseNetworkChaosConfig):
latency: Optional[str] = None

View File

@@ -44,7 +44,7 @@ class AbstractNetworkChaosModule(abc.ABC):
def get_node_targets(self, config: BaseNetworkChaosConfig):
if self.base_network_config.label_selector:
return self.kubecli.get_lib_kubernetes().list_nodes(
return self.kubecli.get_lib_kubernetes().list_ready_nodes(
self.base_network_config.label_selector
)
else:
@@ -52,9 +52,9 @@ class AbstractNetworkChaosModule(abc.ABC):
raise Exception(
"neither node selector nor node_name (target) specified, aborting."
)
node_info = self.kubecli.get_lib_kubernetes().list_nodes()
if config.target not in node_info:
raise Exception(f"node {config.target} not found, aborting")
ready_nodes = self.kubecli.get_lib_kubernetes().list_ready_nodes()
if config.target not in ready_nodes:
raise Exception(f"node {config.target} not found or not Ready, aborting")
return [config.target]

View File

@@ -0,0 +1,155 @@
import queue
import time
from typing import Tuple
from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift
from krkn_lib.utils import get_random_string
from krkn.scenario_plugins.network_chaos_ng.models import (
NetworkChaosScenarioType,
BaseNetworkChaosConfig,
InterfaceDownConfig,
)
from krkn.scenario_plugins.network_chaos_ng.modules.abstract_network_chaos_module import (
AbstractNetworkChaosModule,
)
from krkn.scenario_plugins.network_chaos_ng.modules.utils import (
log_info,
log_error,
deploy_network_chaos_ng_pod,
get_pod_default_interface,
)
class NodeInterfaceDownModule(AbstractNetworkChaosModule):
    """Network chaos module that brings node network interface(s) down for
    ``test_duration`` seconds and then restores them.

    Recovery is pre-scheduled as a background process on the node itself,
    because once the interface goes down the node loses connectivity to the
    control plane and the workload pod can no longer be reached via exec.
    """

    config: InterfaceDownConfig
    kubecli: KrknTelemetryOpenshift

    def __init__(self, config: InterfaceDownConfig, kubecli: KrknTelemetryOpenshift):
        super().__init__(config, kubecli)
        self.config = config

    def run(self, target: str, error_queue: queue.Queue = None):
        """Execute the scenario against a single node.

        :param target: name of the node whose interface(s) are brought down
        :param error_queue: when provided (parallel execution) errors are
            reported through the queue instead of being raised
        """
        parallel = error_queue is not None
        pod_name = f"node-iface-down-{get_random_string(5)}"
        # Tracks whether the workload pod exists, so the exception path can
        # clean it up instead of leaking it (fix: the pod was previously only
        # deleted on the success path).
        pod_deployed = False
        try:
            log_info(
                f"creating workload pod on node {target} to bring interface(s) down",
                parallel,
                target,
            )
            deploy_network_chaos_ng_pod(
                self.config,
                target,
                pod_name,
                self.kubecli.get_lib_kubernetes(),
            )
            pod_deployed = True
            if len(self.config.interfaces) == 0:
                # No interfaces configured: auto-detect the node's default one.
                interfaces = [
                    get_pod_default_interface(
                        pod_name,
                        self.config.namespace,
                        self.kubecli.get_lib_kubernetes(),
                    )
                ]
                if not interfaces[0]:
                    log_error(
                        "could not detect default network interface, aborting",
                        parallel,
                        target,
                    )
                    self.kubecli.get_lib_kubernetes().delete_pod(
                        pod_name, self.config.namespace
                    )
                    pod_deployed = False
                    return
                log_info(
                    f"detected default interface: {interfaces[0]}", parallel, target
                )
            else:
                interfaces = self.config.interfaces
            log_info(
                f"scheduling recovery and bringing down interface(s): {', '.join(interfaces)} on node {target}",
                parallel,
                target,
            )
            # Pre-schedule recovery as a background process on the node before
            # bringing the interface down. Once the interface is down the node
            # loses connectivity to the control plane, so exec_cmd_in_pod can
            # no longer reach the pod. The background process runs entirely on
            # the node and fires regardless of control-plane connectivity.
            recovery_cmds = " && ".join(
                [f"ip link set {iface} up" for iface in interfaces]
            )
            down_cmds = " && ".join(
                [f"ip link set {iface} down" for iface in interfaces]
            )
            cmd = f"(sleep {self.config.test_duration} && {recovery_cmds}) & {down_cmds}"
            self.kubecli.get_lib_kubernetes().exec_cmd_in_pod(
                [cmd], pod_name, self.config.namespace
            )
            log_info(
                f"interface(s) {', '.join(interfaces)} are down on node {target}, "
                f"recovery scheduled in {self.config.test_duration}s",
                parallel,
                target,
            )
            log_info(
                f"waiting {self.config.test_duration} seconds for interface(s) to recover",
                parallel,
                target,
            )
            time.sleep(self.config.test_duration)
            log_info(
                f"waiting for node {target} to become Ready after interface recovery",
                parallel,
                target,
            )
            # Poll readiness for up to 5 minutes (60 attempts x 5s).
            node_ready = False
            for _ in range(60):
                time.sleep(5)
                ready_nodes = self.kubecli.get_lib_kubernetes().list_ready_nodes()
                if target in ready_nodes:
                    node_ready = True
                    break
            if not node_ready:
                log_error(
                    f"node {target} did not become Ready within 5 minutes after interface recovery",
                    parallel,
                    target,
                )
            else:
                log_info(f"node {target} is Ready", parallel, target)
            if self.config.recovery_time > 0:
                log_info(
                    f"waiting {self.config.recovery_time} seconds for node to stabilize",
                    parallel,
                    target,
                )
                time.sleep(self.config.recovery_time)
            self.kubecli.get_lib_kubernetes().delete_pod(
                pod_name, self.config.namespace
            )
            pod_deployed = False
        except Exception as e:
            # Best-effort cleanup so a failure after deployment does not leak
            # the workload pod on the target node.
            if pod_deployed:
                try:
                    self.kubecli.get_lib_kubernetes().delete_pod(
                        pod_name, self.config.namespace
                    )
                except Exception:
                    pass
            if error_queue is None:
                raise e
            else:
                error_queue.put(str(e))

    def get_config(self) -> Tuple[NetworkChaosScenarioType, BaseNetworkChaosConfig]:
        """Return the scenario type (Node-level) and its configuration."""
        return NetworkChaosScenarioType.Node, self.config

    def get_targets(self) -> list[str]:
        """Resolve the node names this module will act on."""
        return self.get_node_targets(self.config)

View File

@@ -3,10 +3,14 @@ from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift
from krkn.scenario_plugins.network_chaos_ng.models import (
NetworkFilterConfig,
NetworkChaosConfig,
InterfaceDownConfig,
)
from krkn.scenario_plugins.network_chaos_ng.modules.abstract_network_chaos_module import (
AbstractNetworkChaosModule,
)
from krkn.scenario_plugins.network_chaos_ng.modules.node_interface_down import (
NodeInterfaceDownModule,
)
from krkn.scenario_plugins.network_chaos_ng.modules.node_network_chaos import (
NodeNetworkChaosModule,
)
@@ -25,6 +29,7 @@ supported_modules = [
"pod_network_filter",
"pod_network_chaos",
"node_network_chaos",
"node_interface_down",
]
@@ -63,5 +68,11 @@ class NetworkChaosFactory:
if len(errors) > 0:
raise Exception(f"config validation errors: [{';'.join(errors)}]")
return NodeNetworkChaosModule(scenario_config, kubecli)
if config["id"] == "node_interface_down":
scenario_config = InterfaceDownConfig(**config)
errors = scenario_config.validate()
if len(errors) > 0:
raise Exception(f"config validation errors: [{';'.join(errors)}]")
return NodeInterfaceDownModule(scenario_config, kubecli)
else:
raise Exception(f"invalid network chaos id {config['id']}")

View File

@@ -53,7 +53,7 @@ class NetworkChaosNgScenarioPlugin(AbstractScenarioPlugin):
if (
network_chaos_config.instance_count != 0
and network_chaos_config.instance_count > len(targets)
and network_chaos_config.instance_count < len(targets)
):
targets = random.sample(
targets, network_chaos_config.instance_count

View File

@@ -0,0 +1,27 @@
- id: node_interface_down
# Container image used to run the chaos workload pod on the target node
image: quay.io/krkn-chaos/krkn-network-chaos:latest
# Kubernetes namespace where the chaos workload pod is deployed
namespace: default
# Service account to use for the chaos workload pod (leave empty for default)
service_account: ""
# Node label selector - selects worker nodes to target.
# Leave label_selector empty and set target to a specific node name instead.
label_selector: "node-role.kubernetes.io/worker="
# Specific node name to target. Used when label_selector is not set.
target: ""
# Number of nodes to target (0 means all matched nodes)
instance_count: 1
# Execution mode: serial (one node at a time) or parallel (all nodes simultaneously)
execution: serial
# Network interfaces to bring down. Leave empty to auto-detect the default interface.
interfaces: []
# Duration in seconds the interface(s) remain down before being restored
test_duration: 60
# Optional wait time in seconds after the interface is brought back up,
# to allow the node to fully recover before the workload pod is removed
recovery_time: 30
# Time in seconds to wait before running the next module in the list
wait_duration: 0
# Node taints to tolerate (format: "key=value:Effect" or "key:Effect")
taints: []

View File

@@ -0,0 +1,311 @@
#!/usr/bin/env python3
"""
Test suite for NodeInterfaceDownModule class
Usage:
python -m coverage run -a -m unittest tests/test_node_interface_down.py -v
"""
import unittest
import queue
from unittest.mock import MagicMock, patch
from krkn.scenario_plugins.network_chaos_ng.modules.node_interface_down import (
NodeInterfaceDownModule,
)
from krkn.scenario_plugins.network_chaos_ng.models import (
InterfaceDownConfig,
NetworkChaosScenarioType,
)
def make_config(**overrides) -> InterfaceDownConfig:
    """Build an InterfaceDownConfig pre-filled with sane test defaults.

    Any keyword argument overrides the corresponding field.
    """
    params = {
        "id": "node_interface_down",
        "image": "test-image",
        "wait_duration": 0,
        "test_duration": 60,
        "label_selector": "node-role.kubernetes.io/worker=",
        "service_account": "",
        "taints": [],
        "namespace": "default",
        "instance_count": 1,
        "target": "",
        "execution": "serial",
        "interfaces": ["eth0"],
        "ingress": False,
        "egress": False,
        "recovery_time": 0,
    }
    params.update(overrides)
    return InterfaceDownConfig(**params)
class TestInterfaceDownConfig(unittest.TestCase):
    """Validation tests for InterfaceDownConfig."""

    def _assert_error_mentions(self, errors, field_name):
        """Assert that at least one validation error references *field_name*."""
        matching = [err for err in errors if field_name in err]
        self.assertTrue(matching)

    def test_valid_config(self):
        """A fully populated default config validates cleanly."""
        self.assertEqual(make_config().validate(), [])

    def test_invalid_recovery_time_negative(self):
        """Negative recovery_time is rejected."""
        errors = make_config(recovery_time=-1).validate()
        self._assert_error_mentions(errors, "recovery_time")

    def test_invalid_recovery_time_not_int(self):
        """Non-integer recovery_time (e.g. '30s') is rejected."""
        errors = make_config(recovery_time="30s").validate()
        self._assert_error_mentions(errors, "recovery_time")

    def test_zero_recovery_time_is_valid(self):
        """recovery_time == 0 (no stabilization wait) is allowed."""
        self.assertEqual(make_config(recovery_time=0).validate(), [])

    def test_invalid_execution(self):
        """Unknown execution mode is rejected."""
        errors = make_config(execution="random").validate()
        self._assert_error_mentions(errors, "execution")

    def test_invalid_wait_duration(self):
        """Non-numeric wait_duration is rejected."""
        errors = make_config(wait_duration="ten").validate()
        self._assert_error_mentions(errors, "wait_duration")

    def test_invalid_test_duration(self):
        """Non-numeric test_duration is rejected."""
        errors = make_config(test_duration="sixty").validate()
        self._assert_error_mentions(errors, "test_duration")
class TestNodeInterfaceDownModule(unittest.TestCase):
    """Behavioral tests for NodeInterfaceDownModule.

    NOTE: mock arguments are injected bottom-up — the decorator closest to
    the method supplies the first mock parameter after ``self``.
    """

    def setUp(self):
        # Fake the telemetry client and its kubernetes library handle.
        self.mock_kubecli = MagicMock()
        self.mock_kubernetes = MagicMock()
        self.mock_kubecli.get_lib_kubernetes.return_value = self.mock_kubernetes
        # Default: target node is immediately Ready after recovery
        self.mock_kubernetes.list_ready_nodes.return_value = ["worker-1"]
        self.config = make_config()
        self.module = NodeInterfaceDownModule(self.config, self.mock_kubecli)

    def test_initialization(self):
        """Constructor stores config on the module and its base class."""
        self.assertEqual(self.module.config, self.config)
        self.assertEqual(self.module.kubecli, self.mock_kubecli)
        self.assertEqual(self.module.base_network_config, self.config)

    def test_get_config(self):
        """get_config reports a Node-level scenario with the same config."""
        scenario_type, config = self.module.get_config()
        self.assertEqual(scenario_type, NetworkChaosScenarioType.Node)
        self.assertEqual(config, self.config)

    def test_get_targets_with_label_selector(self):
        """Label selector resolves to all matching Ready nodes."""
        self.mock_kubernetes.list_ready_nodes.return_value = ["worker-1", "worker-2"]
        targets = self.module.get_targets()
        self.assertEqual(targets, ["worker-1", "worker-2"])
        self.mock_kubernetes.list_ready_nodes.assert_called_once_with(
            "node-role.kubernetes.io/worker="
        )

    def test_get_targets_with_target_name(self):
        """An explicit target name resolves to just that node."""
        self.config.label_selector = ""
        self.config.target = "worker-1"
        self.mock_kubernetes.list_ready_nodes.return_value = ["worker-1", "worker-2"]
        targets = self.module.get_targets()
        self.assertEqual(targets, ["worker-1"])

    def test_get_targets_node_not_found(self):
        """A target absent from the Ready node list raises."""
        self.config.label_selector = ""
        self.config.target = "non-existent"
        self.mock_kubernetes.list_ready_nodes.return_value = ["worker-1", "worker-2"]
        with self.assertRaises(Exception) as ctx:
            self.module.get_targets()
        self.assertIn("not found", str(ctx.exception))

    def test_get_targets_no_target_or_selector(self):
        """Neither selector nor target configured raises."""
        self.config.label_selector = ""
        self.config.target = ""
        with self.assertRaises(Exception) as ctx:
            self.module.get_targets()
        self.assertIn("neither", str(ctx.exception))

    @patch("krkn.scenario_plugins.network_chaos_ng.modules.node_interface_down.time.sleep")
    @patch("krkn.scenario_plugins.network_chaos_ng.modules.node_interface_down.deploy_network_chaos_ng_pod")
    @patch("krkn.scenario_plugins.network_chaos_ng.modules.node_interface_down.log_info")
    def test_run_brings_interface_down_and_up_in_single_command(self, _mock_log, _mock_deploy, _mock_sleep):
        """A single exec carries both the down and the scheduled up command."""
        self.config.interfaces = ["eth0"]
        self.module.run("worker-1")
        exec_calls = [str(c) for c in self.mock_kubernetes.exec_cmd_in_pod.call_args_list]
        self.assertEqual(len(exec_calls), 1)
        cmd = exec_calls[0]
        self.assertIn("ip link set eth0 down", cmd)
        self.assertIn("ip link set eth0 up", cmd)

    @patch("krkn.scenario_plugins.network_chaos_ng.modules.node_interface_down.time.sleep")
    @patch("krkn.scenario_plugins.network_chaos_ng.modules.node_interface_down.deploy_network_chaos_ng_pod")
    @patch("krkn.scenario_plugins.network_chaos_ng.modules.node_interface_down.log_info")
    def test_run_recovery_is_scheduled_before_interface_goes_down(self, _mock_log, _mock_deploy, mock_sleep):
        """The recovery subshell must precede the down command in the exec string."""
        self.config.interfaces = ["eth0"]
        self.config.test_duration = 30
        self.module.run("worker-1")
        exec_calls = [str(c) for c in self.mock_kubernetes.exec_cmd_in_pod.call_args_list]
        cmd = exec_calls[0]
        # Background recovery (sleep + up) must appear before the down command
        self.assertIn("sleep 30", cmd)
        up_pos = cmd.index("ip link set eth0 up")
        down_pos = cmd.index("ip link set eth0 down")
        self.assertLess(up_pos, down_pos)

    @patch("krkn.scenario_plugins.network_chaos_ng.modules.node_interface_down.time.sleep")
    @patch("krkn.scenario_plugins.network_chaos_ng.modules.node_interface_down.deploy_network_chaos_ng_pod")
    @patch("krkn.scenario_plugins.network_chaos_ng.modules.node_interface_down.log_info")
    def test_run_sleeps_test_duration(self, mock_log, mock_deploy, mock_sleep):
        """run() sleeps for the configured test_duration."""
        self.config.test_duration = 45
        self.config.recovery_time = 0
        self.module.run("worker-1")
        sleep_values = [c[0][0] for c in mock_sleep.call_args_list]
        self.assertIn(45, sleep_values)

    @patch("krkn.scenario_plugins.network_chaos_ng.modules.node_interface_down.time.sleep")
    @patch("krkn.scenario_plugins.network_chaos_ng.modules.node_interface_down.deploy_network_chaos_ng_pod")
    @patch("krkn.scenario_plugins.network_chaos_ng.modules.node_interface_down.log_info")
    def test_run_sleeps_recovery_time_when_set(self, mock_log, mock_deploy, mock_sleep):
        """A positive recovery_time triggers an extra stabilization sleep."""
        self.config.test_duration = 30
        self.config.recovery_time = 15
        self.module.run("worker-1")
        sleep_values = [c[0][0] for c in mock_sleep.call_args_list]
        self.assertIn(30, sleep_values)
        self.assertIn(15, sleep_values)

    @patch("krkn.scenario_plugins.network_chaos_ng.modules.node_interface_down.time.sleep")
    @patch("krkn.scenario_plugins.network_chaos_ng.modules.node_interface_down.deploy_network_chaos_ng_pod")
    @patch("krkn.scenario_plugins.network_chaos_ng.modules.node_interface_down.log_info")
    def test_run_no_recovery_sleep_when_zero(self, mock_log, mock_deploy, mock_sleep):
        """recovery_time == 0 must not produce a zero-length sleep."""
        self.config.test_duration = 30
        self.config.recovery_time = 0
        self.module.run("worker-1")
        sleep_values = [c[0][0] for c in mock_sleep.call_args_list]
        self.assertIn(30, sleep_values)
        self.assertNotIn(0, sleep_values)

    @patch("krkn.scenario_plugins.network_chaos_ng.modules.node_interface_down.time.sleep")
    @patch("krkn.scenario_plugins.network_chaos_ng.modules.node_interface_down.deploy_network_chaos_ng_pod")
    @patch("krkn.scenario_plugins.network_chaos_ng.modules.node_interface_down.log_info")
    def test_run_polls_node_readiness_after_sleep(self, mock_log, mock_deploy, mock_sleep):
        """run() checks node readiness via list_ready_nodes."""
        self.module.run("worker-1")
        self.mock_kubernetes.list_ready_nodes.assert_called()

    @patch("krkn.scenario_plugins.network_chaos_ng.modules.node_interface_down.time.sleep")
    @patch("krkn.scenario_plugins.network_chaos_ng.modules.node_interface_down.deploy_network_chaos_ng_pod")
    @patch("krkn.scenario_plugins.network_chaos_ng.modules.node_interface_down.log_info")
    @patch("krkn.scenario_plugins.network_chaos_ng.modules.node_interface_down.log_error")
    def test_run_logs_error_when_node_does_not_recover(self, mock_log_error, mock_log, mock_deploy, mock_sleep):
        """A node that never returns to Ready logs an error mentioning Ready."""
        self.mock_kubernetes.list_ready_nodes.return_value = []
        self.module.run("worker-1")
        mock_log_error.assert_called()
        self.assertIn("Ready", str(mock_log_error.call_args))

    @patch("krkn.scenario_plugins.network_chaos_ng.modules.node_interface_down.time.sleep")
    @patch("krkn.scenario_plugins.network_chaos_ng.modules.node_interface_down.deploy_network_chaos_ng_pod")
    @patch("krkn.scenario_plugins.network_chaos_ng.modules.node_interface_down.log_info")
    def test_run_deletes_pod_on_success(self, mock_log, mock_deploy, mock_sleep):
        """The workload pod is removed after a successful run."""
        self.module.run("worker-1")
        self.mock_kubernetes.delete_pod.assert_called_once()

    @patch("krkn.scenario_plugins.network_chaos_ng.modules.node_interface_down.time.sleep")
    @patch("krkn.scenario_plugins.network_chaos_ng.modules.node_interface_down.deploy_network_chaos_ng_pod")
    @patch("krkn.scenario_plugins.network_chaos_ng.modules.node_interface_down.log_info")
    def test_run_deletes_pod_even_when_node_does_not_recover(self, mock_log, mock_deploy, mock_sleep):
        """Pod cleanup also happens when readiness is never reached."""
        self.mock_kubernetes.list_ready_nodes.return_value = []
        self.module.run("worker-1")
        self.mock_kubernetes.delete_pod.assert_called_once()

    @patch("krkn.scenario_plugins.network_chaos_ng.modules.node_interface_down.time.sleep")
    @patch("krkn.scenario_plugins.network_chaos_ng.modules.node_interface_down.deploy_network_chaos_ng_pod")
    @patch("krkn.scenario_plugins.network_chaos_ng.modules.node_interface_down.log_info")
    def test_run_multiple_interfaces(self, mock_log, mock_deploy, mock_sleep):
        """Every configured interface appears in both down and up commands."""
        self.config.interfaces = ["eth0", "eth1", "bond0"]
        self.module.run("worker-1")
        exec_calls = [str(c) for c in self.mock_kubernetes.exec_cmd_in_pod.call_args_list]
        self.assertEqual(len(exec_calls), 1)
        cmd = exec_calls[0]
        for iface in ["eth0", "eth1", "bond0"]:
            self.assertIn(f"ip link set {iface} down", cmd)
            self.assertIn(f"ip link set {iface} up", cmd)

    @patch("krkn.scenario_plugins.network_chaos_ng.modules.node_interface_down.time.sleep")
    @patch("krkn.scenario_plugins.network_chaos_ng.modules.node_interface_down.deploy_network_chaos_ng_pod")
    @patch("krkn.scenario_plugins.network_chaos_ng.modules.node_interface_down.get_pod_default_interface")
    @patch("krkn.scenario_plugins.network_chaos_ng.modules.node_interface_down.log_info")
    def test_run_auto_detects_default_interface(self, mock_log, mock_get_iface, mock_deploy, mock_sleep):
        """An empty interfaces list falls back to the detected default interface."""
        self.config.interfaces = []
        mock_get_iface.return_value = "ens3"
        self.module.run("worker-1")
        mock_get_iface.assert_called_once()
        exec_calls = [str(c) for c in self.mock_kubernetes.exec_cmd_in_pod.call_args_list]
        cmd = exec_calls[0]
        self.assertIn("ip link set ens3 down", cmd)
        self.assertIn("ip link set ens3 up", cmd)

    @patch("krkn.scenario_plugins.network_chaos_ng.modules.node_interface_down.deploy_network_chaos_ng_pod")
    @patch("krkn.scenario_plugins.network_chaos_ng.modules.node_interface_down.get_pod_default_interface")
    @patch("krkn.scenario_plugins.network_chaos_ng.modules.node_interface_down.log_error")
    @patch("krkn.scenario_plugins.network_chaos_ng.modules.node_interface_down.log_info")
    def test_run_aborts_when_no_interface_detected(self, mock_log, mock_log_error, mock_get_iface, mock_deploy):
        """Detection failure aborts without exec'ing anything and cleans up the pod."""
        self.config.interfaces = []
        mock_get_iface.return_value = ""
        self.module.run("worker-1")
        mock_log_error.assert_called()
        self.assertIn("could not detect", str(mock_log_error.call_args).lower())
        self.mock_kubernetes.delete_pod.assert_called_once()
        self.mock_kubernetes.exec_cmd_in_pod.assert_not_called()

    @patch("krkn.scenario_plugins.network_chaos_ng.modules.node_interface_down.deploy_network_chaos_ng_pod")
    @patch("krkn.scenario_plugins.network_chaos_ng.modules.node_interface_down.log_info")
    def test_run_raises_exception_without_error_queue(self, mock_log, mock_deploy):
        """Serial mode (no queue): errors propagate as exceptions."""
        mock_deploy.side_effect = Exception("deploy failed")
        with self.assertRaises(Exception) as ctx:
            self.module.run("worker-1")
        self.assertIn("deploy failed", str(ctx.exception))

    @patch("krkn.scenario_plugins.network_chaos_ng.modules.node_interface_down.deploy_network_chaos_ng_pod")
    @patch("krkn.scenario_plugins.network_chaos_ng.modules.node_interface_down.log_info")
    def test_run_puts_error_in_queue_for_parallel(self, mock_log, mock_deploy):
        """Parallel mode (queue provided): errors are reported via the queue."""
        mock_deploy.side_effect = Exception("deploy failed")
        error_queue = queue.Queue()
        self.module.run("worker-1", error_queue)
        self.assertFalse(error_queue.empty())
        self.assertEqual(error_queue.get(), "deploy failed")
# Allow running this test module directly (outside the coverage/unittest runner).
if __name__ == "__main__":
    unittest.main()

View File

@@ -78,7 +78,7 @@ class TestNodeNetworkChaosModule(unittest.TestCase):
"""
self.config.label_selector = ""
self.config.target = "worker-1"
self.mock_kubernetes.list_nodes.return_value = ["worker-1", "worker-2"]
self.mock_kubernetes.list_ready_nodes.return_value = ["worker-1", "worker-2"]
targets = self.module.get_targets()
@@ -89,12 +89,12 @@ class TestNodeNetworkChaosModule(unittest.TestCase):
Test get_targets with label selector
"""
self.config.label_selector = "node-role.kubernetes.io/worker="
self.mock_kubernetes.list_nodes.return_value = ["worker-1", "worker-2"]
self.mock_kubernetes.list_ready_nodes.return_value = ["worker-1", "worker-2"]
targets = self.module.get_targets()
self.assertEqual(targets, ["worker-1", "worker-2"])
self.mock_kubernetes.list_nodes.assert_called_once_with(
self.mock_kubernetes.list_ready_nodes.assert_called_once_with(
"node-role.kubernetes.io/worker="
)
@@ -104,7 +104,7 @@ class TestNodeNetworkChaosModule(unittest.TestCase):
"""
self.config.label_selector = ""
self.config.target = "non-existent-node"
self.mock_kubernetes.list_nodes.return_value = ["worker-1", "worker-2"]
self.mock_kubernetes.list_ready_nodes.return_value = ["worker-1", "worker-2"]
with self.assertRaises(Exception) as context:
self.module.get_targets()