From 197ce43f9a88b72bf05f6ccf941b202af10b5cec Mon Sep 17 00:00:00 2001 From: Paige Patton <64206430+paigerube14@users.noreply.github.com> Date: Tue, 2 Dec 2025 14:10:05 -0500 Subject: [PATCH 01/11] adding test server (#982) Signed-off-by: Paige Patton --- .github/workflows/tests.yml | 6 +- CI/config/common_test_config.yaml | 4 + CI/tests/test_pod_server.sh | 35 +++ tests/test_server.py | 385 ++++++++++++++++++++++++++++++ 4 files changed, 426 insertions(+), 4 deletions(-) create mode 100755 CI/tests/test_pod_server.sh create mode 100644 tests/test_server.py diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 5eb3e90c..d774b666 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -79,8 +79,6 @@ jobs: if: | github.event_name == 'pull_request' run: | - yq -i '.kraken.port="8081"' CI/config/common_test_config.yaml - yq -i '.kraken.signal_address="0.0.0.0"' CI/config/common_test_config.yaml yq -i '.kraken.performance_monitoring="localhost:9090"' CI/config/common_test_config.yaml yq -i '.elastic.elastic_port=9200' CI/config/common_test_config.yaml yq -i '.elastic.elastic_url="https://localhost"' CI/config/common_test_config.yaml @@ -100,6 +98,7 @@ jobs: echo "test_memory_hog" >> ./CI/tests/functional_tests echo "test_io_hog" >> ./CI/tests/functional_tests echo "test_pod_network_filter" >> ./CI/tests/functional_tests + echo "test_pod_server" >> ./CI/tests/functional_tests # Push on main only steps + all other functional to collect coverage # for the badge @@ -113,8 +112,6 @@ jobs: - name: Setup Post Merge Request Functional Tests if: github.ref == 'refs/heads/main' && github.event_name == 'push' run: | - yq -i '.kraken.port="8081"' CI/config/common_test_config.yaml - yq -i '.kraken.signal_address="0.0.0.0"' CI/config/common_test_config.yaml yq -i '.kraken.performance_monitoring="localhost:9090"' CI/config/common_test_config.yaml yq -i '.elastic.enable_elastic=False' CI/config/common_test_config.yaml yq -i '.elastic.password="${{env.ELASTIC_PASSWORD}}"' CI/config/common_test_config.yaml @@ -137,6 +134,7 @@ jobs: echo "test_memory_hog" >> ./CI/tests/functional_tests echo "test_io_hog" >> ./CI/tests/functional_tests echo "test_pod_network_filter" >> ./CI/tests/functional_tests + echo "test_pod_server" >> ./CI/tests/functional_tests # Final common steps - name: Run Functional tests diff --git a/CI/config/common_test_config.yaml b/CI/config/common_test_config.yaml index c12c9f30..ef3a4f80 100644 --- a/CI/config/common_test_config.yaml +++ b/CI/config/common_test_config.yaml @@ -2,6 +2,10 @@ kraken: distribution: kubernetes # Distribution can be kubernetes or openshift. kubeconfig_path: ~/.kube/config # Path to kubeconfig. exit_on_failure: False # Exit when a post action scenario fails. + publish_kraken_status: True # Can be accessed at http://0.0.0.0:8081 + signal_state: RUN # Will wait for the RUN signal when set to PAUSE before running the scenarios, refer docs/signal.md for more details + signal_address: 0.0.0.0 # Signal listening address + port: 8081 # Signal port auto_rollback: True # Enable auto rollback for scenarios. rollback_versions_directory: /tmp/kraken-rollback # Directory to store rollback version files. chaos_scenarios: # List of policies/chaos scenarios to load. 
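The four keys added above wire up Kraken's signal server. A minimal sketch of the resulting control API, assuming the signal_address/port defaults shown here (GET / returns the current state, POST switches it; the new CI test below drives it the same way):

    curl http://0.0.0.0:8081/                # prints the current state, e.g. RUN
    curl -X POST http://0.0.0.0:8081/PAUSE   # hold scenarios until a RUN signal
    curl -X POST http://0.0.0.0:8081/RUN     # resume a paused run
    curl -X POST http://0.0.0.0:8081/STOP    # stop a daemon-mode run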
diff --git a/CI/tests/test_pod_server.sh b/CI/tests/test_pod_server.sh
new file mode 100755
index 00000000..0f9b2c36
--- /dev/null
+++ b/CI/tests/test_pod_server.sh
@@ -0,0 +1,35 @@
+set -xeEo pipefail
+
+source CI/tests/common.sh
+
+trap error ERR
+trap finish EXIT
+
+function functional_test_pod_server {
+  export scenario_type="pod_disruption_scenarios"
+  export scenario_file="scenarios/kind/pod_etcd.yml"
+  export post_config=""
+
+  envsubst < CI/config/common_test_config.yaml > CI/config/pod_config.yaml
+  yq -i '.[0].config.kill=1' scenarios/kind/pod_etcd.yml
+
+  yq -i '.tunings.daemon_mode=True' CI/config/pod_config.yaml
+  cat CI/config/pod_config.yaml
+  python3 -m coverage run -a run_kraken.py -c CI/config/pod_config.yaml &
+  sleep 15
+  curl -X POST http://0.0.0.0:8081/STOP
+
+  wait
+
+  yq -i '.kraken.signal_state="PAUSE"' CI/config/pod_config.yaml
+  yq -i '.tunings.daemon_mode=False' CI/config/pod_config.yaml
+  cat CI/config/pod_config.yaml
+  python3 -m coverage run -a run_kraken.py -c CI/config/pod_config.yaml &
+  sleep 5
+  curl -X POST http://0.0.0.0:8081/RUN
+  wait
+
+  echo "Pod disruption with server scenario test: Success"
+}
+
+functional_test_pod_server
diff --git a/tests/test_server.py b/tests/test_server.py
new file mode 100644
index 00000000..b970bffe
--- /dev/null
+++ b/tests/test_server.py
@@ -0,0 +1,385 @@
+#!/usr/bin/env python3
+
+"""
+Test suite for SimpleHTTPRequestHandler class
+
+Usage:
+    python -m coverage run -a -m unittest tests/test_server.py -v
+
+Assisted By: Claude Code
+"""
+
+import unittest
+from unittest.mock import Mock, patch, MagicMock
+from io import BytesIO
+
+import server
+from server import SimpleHTTPRequestHandler
+
+
+class TestSimpleHTTPRequestHandler(unittest.TestCase):
+
+    def setUp(self):
+        """
+        Set up test fixtures for SimpleHTTPRequestHandler
+        """
+        # Reset the global server_status before each test
+        server.server_status = ""
+        # Reset the requests_served counter
+        SimpleHTTPRequestHandler.requests_served = 0
+
+        # Create a mock request
+        self.mock_request = MagicMock()
+        self.mock_client_address = ('127.0.0.1', 12345)
+        self.mock_server = MagicMock()
+
+    def _create_handler(self, method='GET', path='/'):
+        """
+        Helper method to create a handler instance with mocked request
+        """
+        # Create a mock request with proper attributes
+        mock_request = MagicMock()
+        mock_request.makefile.return_value = BytesIO(
+            f"{method} {path} HTTP/1.1\r\n\r\n".encode('utf-8')
+        )
+
+        # Create handler
+        handler = SimpleHTTPRequestHandler(
+            mock_request,
+            self.mock_client_address,
+            self.mock_server
+        )
+
+        # Mock the wfile (write file) for response
+        handler.wfile = BytesIO()
+
+        return handler
+
+    def test_do_GET_root_path_calls_do_status(self):
+        """
+        Test do_GET with root path calls do_status
+        """
+        handler = self._create_handler('GET', '/')
+
+        with patch.object(handler, 'do_status') as mock_do_status:
+            handler.do_GET()
+            mock_do_status.assert_called_once()
+
+    def test_do_GET_non_root_path_does_nothing(self):
+        """
+        Test do_GET with non-root path does not call do_status
+        """
+        handler = self._create_handler('GET', '/other')
+
+        with patch.object(handler, 'do_status') as mock_do_status:
+            handler.do_GET()
+            mock_do_status.assert_not_called()
+
+    def test_do_status_sends_200_response(self):
+        """
+        Test do_status sends 200 status code
+        """
+        server.server_status = "TEST_STATUS"
+        handler = self._create_handler()
+
+        with patch.object(handler, 'send_response') as mock_send_response:
+            with patch.object(handler, 'end_headers'):
+                handler.do_status()
+ mock_send_response.assert_called_once_with(200) + + def test_do_status_writes_server_status(self): + """ + Test do_status writes server_status to response + """ + server.server_status = "RUNNING" + handler = self._create_handler() + + with patch.object(handler, 'send_response'): + with patch.object(handler, 'end_headers'): + handler.do_status() + + # Check that the status was written to wfile + response_content = handler.wfile.getvalue().decode('utf-8') + self.assertEqual(response_content, "RUNNING") + + def test_do_status_increments_requests_served(self): + """ + Test do_status increments requests_served counter + """ + # Note: Creating a handler increments the counter by 1 + # Then do_status increments it again + SimpleHTTPRequestHandler.requests_served = 0 + handler = self._create_handler() + initial_count = SimpleHTTPRequestHandler.requests_served + + with patch.object(handler, 'send_response'): + with patch.object(handler, 'end_headers'): + handler.do_status() + + self.assertEqual( + SimpleHTTPRequestHandler.requests_served, + initial_count + 1 + ) + + def test_do_status_multiple_requests_increment_counter(self): + """ + Test multiple do_status calls increment counter correctly + """ + SimpleHTTPRequestHandler.requests_served = 0 + + for i in range(5): + handler = self._create_handler() + with patch.object(handler, 'send_response'): + with patch.object(handler, 'end_headers'): + handler.do_status() + + # Each iteration: handler creation increments by 1, do_status increments by 1 + # Total: 5 * 2 = 10 + self.assertEqual(SimpleHTTPRequestHandler.requests_served, 10) + + def test_do_POST_STOP_path_calls_set_stop(self): + """ + Test do_POST with /STOP path calls set_stop + """ + handler = self._create_handler('POST', '/STOP') + + with patch.object(handler, 'set_stop') as mock_set_stop: + handler.do_POST() + mock_set_stop.assert_called_once() + + def test_do_POST_RUN_path_calls_set_run(self): + """ + Test do_POST with /RUN path calls set_run + """ + handler = self._create_handler('POST', '/RUN') + + with patch.object(handler, 'set_run') as mock_set_run: + handler.do_POST() + mock_set_run.assert_called_once() + + def test_do_POST_PAUSE_path_calls_set_pause(self): + """ + Test do_POST with /PAUSE path calls set_pause + """ + handler = self._create_handler('POST', '/PAUSE') + + with patch.object(handler, 'set_pause') as mock_set_pause: + handler.do_POST() + mock_set_pause.assert_called_once() + + def test_do_POST_unknown_path_does_nothing(self): + """ + Test do_POST with unknown path does not call any setter + """ + handler = self._create_handler('POST', '/UNKNOWN') + + with patch.object(handler, 'set_stop') as mock_set_stop: + with patch.object(handler, 'set_run') as mock_set_run: + with patch.object(handler, 'set_pause') as mock_set_pause: + handler.do_POST() + mock_set_stop.assert_not_called() + mock_set_run.assert_not_called() + mock_set_pause.assert_not_called() + + def test_set_run_sets_status_to_RUN(self): + """ + Test set_run sets global server_status to 'RUN' + """ + handler = self._create_handler() + + with patch.object(handler, 'send_response'): + with patch.object(handler, 'end_headers'): + handler.set_run() + + self.assertEqual(server.server_status, 'RUN') + + def test_set_run_sends_200_response(self): + """ + Test set_run sends 200 status code + """ + handler = self._create_handler() + + with patch.object(handler, 'send_response') as mock_send_response: + with patch.object(handler, 'end_headers'): + handler.set_run() + mock_send_response.assert_called_once_with(200) + + def 
test_set_stop_sets_status_to_STOP(self): + """ + Test set_stop sets global server_status to 'STOP' + """ + handler = self._create_handler() + + with patch.object(handler, 'send_response'): + with patch.object(handler, 'end_headers'): + handler.set_stop() + + self.assertEqual(server.server_status, 'STOP') + + def test_set_stop_sends_200_response(self): + """ + Test set_stop sends 200 status code + """ + handler = self._create_handler() + + with patch.object(handler, 'send_response') as mock_send_response: + with patch.object(handler, 'end_headers'): + handler.set_stop() + mock_send_response.assert_called_once_with(200) + + def test_set_pause_sets_status_to_PAUSE(self): + """ + Test set_pause sets global server_status to 'PAUSE' + """ + handler = self._create_handler() + + with patch.object(handler, 'send_response'): + with patch.object(handler, 'end_headers'): + handler.set_pause() + + self.assertEqual(server.server_status, 'PAUSE') + + def test_set_pause_sends_200_response(self): + """ + Test set_pause sends 200 status code + """ + handler = self._create_handler() + + with patch.object(handler, 'send_response') as mock_send_response: + with patch.object(handler, 'end_headers'): + handler.set_pause() + mock_send_response.assert_called_once_with(200) + + def test_requests_served_is_class_variable(self): + """ + Test requests_served is shared across all instances + """ + SimpleHTTPRequestHandler.requests_served = 0 + + handler1 = self._create_handler() # Increments to 1 + handler2 = self._create_handler() # Increments to 2 + + with patch.object(handler1, 'send_response'): + with patch.object(handler1, 'end_headers'): + handler1.do_status() # Increments to 3 + + with patch.object(handler2, 'send_response'): + with patch.object(handler2, 'end_headers'): + handler2.do_status() # Increments to 4 + + # Both handlers should see the same counter + # 2 handler creations + 2 do_status calls = 4 + self.assertEqual(handler1.requests_served, 4) + self.assertEqual(handler2.requests_served, 4) + self.assertEqual(SimpleHTTPRequestHandler.requests_served, 4) + + +class TestServerModuleFunctions(unittest.TestCase): + + def setUp(self): + """ + Set up test fixtures for server module functions + """ + server.server_status = "" + + def test_publish_kraken_status_sets_server_status(self): + """ + Test publish_kraken_status sets global server_status + """ + server.publish_kraken_status("NEW_STATUS") + self.assertEqual(server.server_status, "NEW_STATUS") + + def test_publish_kraken_status_overwrites_existing_status(self): + """ + Test publish_kraken_status overwrites existing status + """ + server.server_status = "OLD_STATUS" + server.publish_kraken_status("NEW_STATUS") + self.assertEqual(server.server_status, "NEW_STATUS") + + @patch('server.HTTPServer') + @patch('server._thread') + def test_start_server_creates_http_server(self, mock_thread, mock_http_server): + """ + Test start_server creates HTTPServer with correct address + """ + address = ("localhost", 8080) + mock_server_instance = MagicMock() + mock_http_server.return_value = mock_server_instance + + server.start_server(address, "RUNNING") + + mock_http_server.assert_called_once_with( + address, + SimpleHTTPRequestHandler + ) + + @patch('server.HTTPServer') + @patch('server._thread') + def test_start_server_starts_thread(self, mock_thread, mock_http_server): + """ + Test start_server starts a new thread for serve_forever + """ + address = ("localhost", 8080) + mock_server_instance = MagicMock() + mock_http_server.return_value = mock_server_instance + + 
server.start_server(address, "RUNNING") + + mock_thread.start_new_thread.assert_called_once() + # Check that serve_forever was passed to the thread + args = mock_thread.start_new_thread.call_args[0] + self.assertEqual(args[0], mock_server_instance.serve_forever) + + @patch('server.HTTPServer') + @patch('server._thread') + def test_start_server_publishes_status(self, mock_thread, mock_http_server): + """ + Test start_server publishes the provided status + """ + address = ("localhost", 8080) + mock_server_instance = MagicMock() + mock_http_server.return_value = mock_server_instance + + server.start_server(address, "INITIAL_STATUS") + + self.assertEqual(server.server_status, "INITIAL_STATUS") + + @patch('server.HTTPConnection') + def test_get_status_makes_http_request(self, mock_http_connection): + """ + Test get_status makes HTTP GET request to root path + """ + address = ("localhost", 8080) + mock_connection = MagicMock() + mock_response = MagicMock() + mock_response.read.return_value = b"TEST_STATUS" + mock_connection.getresponse.return_value = mock_response + mock_http_connection.return_value = mock_connection + + result = server.get_status(address) + + mock_http_connection.assert_called_once_with("localhost", 8080) + mock_connection.request.assert_called_once_with("GET", "/") + self.assertEqual(result, "TEST_STATUS") + + @patch('server.HTTPConnection') + def test_get_status_returns_decoded_response(self, mock_http_connection): + """ + Test get_status returns decoded response string + """ + address = ("localhost", 8080) + mock_connection = MagicMock() + mock_response = MagicMock() + mock_response.read.return_value = b"RUNNING" + mock_connection.getresponse.return_value = mock_response + mock_http_connection.return_value = mock_connection + + result = server.get_status(address) + + self.assertEqual(result, "RUNNING") + self.assertIsInstance(result, str) + + +if __name__ == "__main__": + unittest.main() From eb7a1e243cc434e0dc838d0c840bb583820c6c6f Mon Sep 17 00:00:00 2001 From: Paige Patton <64206430+paigerube14@users.noreply.github.com> Date: Mon, 15 Dec 2025 09:38:56 -0500 Subject: [PATCH 02/11] adding aws tests for node scenarios (#996) Assisted By: Claude Code Signed-off-by: Paige Patton --- tests/test_aws_node_scenarios.py | 984 +++++++++++++++++++++++++++++++ 1 file changed, 984 insertions(+) create mode 100644 tests/test_aws_node_scenarios.py diff --git a/tests/test_aws_node_scenarios.py b/tests/test_aws_node_scenarios.py new file mode 100644 index 00000000..be71dda0 --- /dev/null +++ b/tests/test_aws_node_scenarios.py @@ -0,0 +1,984 @@ +#!/usr/bin/env python3 + +""" +Test suite for AWS node scenarios + +This test suite covers both the AWS class and aws_node_scenarios class +using mocks to avoid actual AWS API calls. 
+ +Usage: + python -m coverage run -a -m unittest tests/test_aws_node_scenarios.py -v + +Assisted By: Claude Code +""" + +import unittest +import sys +from unittest.mock import MagicMock, patch + +# Mock external dependencies before any imports that use them +sys.modules['boto3'] = MagicMock() +sys.modules['paramiko'] = MagicMock() + +from krkn_lib.k8s import KrknKubernetes +from krkn_lib.models.k8s import AffectedNode, AffectedNodeStatus +from krkn.scenario_plugins.node_actions.aws_node_scenarios import AWS, aws_node_scenarios + + +class TestAWS(unittest.TestCase): + """Test cases for AWS class""" + + def setUp(self): + """Set up test fixtures""" + # Mock boto3 to avoid actual AWS calls + self.boto_client_patcher = patch('boto3.client') + self.boto_resource_patcher = patch('boto3.resource') + + self.mock_client = self.boto_client_patcher.start() + self.mock_resource = self.boto_resource_patcher.start() + + # Create AWS instance with mocked boto3 + self.aws = AWS() + + def tearDown(self): + """Clean up after tests""" + self.boto_client_patcher.stop() + self.boto_resource_patcher.stop() + + def test_aws_init(self): + """Test AWS class initialization""" + self.assertIsNotNone(self.aws.boto_client) + self.assertIsNotNone(self.aws.boto_resource) + self.assertIsNotNone(self.aws.boto_instance) + + def test_get_instance_id_by_dns_name(self): + """Test getting instance ID by DNS name""" + mock_response = { + 'Reservations': [{ + 'Instances': [{ + 'InstanceId': 'i-1234567890abcdef0' + }] + }] + } + self.aws.boto_client.describe_instances = MagicMock(return_value=mock_response) + + instance_id = self.aws.get_instance_id('ip-10-0-1-100.ec2.internal') + + self.assertEqual(instance_id, 'i-1234567890abcdef0') + self.aws.boto_client.describe_instances.assert_called_once() + + def test_get_instance_id_by_ip_address(self): + """Test getting instance ID by IP address when DNS name fails""" + # First call returns empty, second call returns the instance + mock_response_empty = {'Reservations': []} + mock_response_with_instance = { + 'Reservations': [{ + 'Instances': [{ + 'InstanceId': 'i-1234567890abcdef0' + }] + }] + } + self.aws.boto_client.describe_instances = MagicMock( + side_effect=[mock_response_empty, mock_response_with_instance] + ) + + instance_id = self.aws.get_instance_id('ip-10-0-1-100') + + self.assertEqual(instance_id, 'i-1234567890abcdef0') + self.assertEqual(self.aws.boto_client.describe_instances.call_count, 2) + + def test_start_instances_success(self): + """Test starting instances successfully""" + instance_id = 'i-1234567890abcdef0' + self.aws.boto_client.start_instances = MagicMock() + + self.aws.start_instances(instance_id) + + self.aws.boto_client.start_instances.assert_called_once_with( + InstanceIds=[instance_id] + ) + + def test_start_instances_failure(self): + """Test starting instances with failure""" + instance_id = 'i-1234567890abcdef0' + self.aws.boto_client.start_instances = MagicMock( + side_effect=Exception("AWS error") + ) + + with self.assertRaises(RuntimeError): + self.aws.start_instances(instance_id) + + def test_stop_instances_success(self): + """Test stopping instances successfully""" + instance_id = 'i-1234567890abcdef0' + self.aws.boto_client.stop_instances = MagicMock() + + self.aws.stop_instances(instance_id) + + self.aws.boto_client.stop_instances.assert_called_once_with( + InstanceIds=[instance_id] + ) + + def test_stop_instances_failure(self): + """Test stopping instances with failure""" + instance_id = 'i-1234567890abcdef0' + self.aws.boto_client.stop_instances 
= MagicMock( + side_effect=Exception("AWS error") + ) + + with self.assertRaises(RuntimeError): + self.aws.stop_instances(instance_id) + + def test_terminate_instances_success(self): + """Test terminating instances successfully""" + instance_id = 'i-1234567890abcdef0' + self.aws.boto_client.terminate_instances = MagicMock() + + self.aws.terminate_instances(instance_id) + + self.aws.boto_client.terminate_instances.assert_called_once_with( + InstanceIds=[instance_id] + ) + + def test_terminate_instances_failure(self): + """Test terminating instances with failure""" + instance_id = 'i-1234567890abcdef0' + self.aws.boto_client.terminate_instances = MagicMock( + side_effect=Exception("AWS error") + ) + + with self.assertRaises(RuntimeError): + self.aws.terminate_instances(instance_id) + + def test_reboot_instances_success(self): + """Test rebooting instances successfully""" + instance_id = 'i-1234567890abcdef0' + self.aws.boto_client.reboot_instances = MagicMock() + + self.aws.reboot_instances(instance_id) + + self.aws.boto_client.reboot_instances.assert_called_once_with( + InstanceIds=[instance_id] + ) + + def test_reboot_instances_failure(self): + """Test rebooting instances with failure""" + instance_id = 'i-1234567890abcdef0' + self.aws.boto_client.reboot_instances = MagicMock( + side_effect=Exception("AWS error") + ) + + with self.assertRaises(RuntimeError): + self.aws.reboot_instances(instance_id) + + def test_wait_until_running_success(self): + """Test waiting until instance is running successfully""" + instance_id = 'i-1234567890abcdef0' + self.aws.boto_instance.wait_until_running = MagicMock() + + result = self.aws.wait_until_running(instance_id, timeout=600, poll_interval=15) + + self.assertTrue(result) + self.aws.boto_instance.wait_until_running.assert_called_once() + + def test_wait_until_running_with_affected_node(self): + """Test waiting until running with affected node tracking""" + instance_id = 'i-1234567890abcdef0' + affected_node = MagicMock(spec=AffectedNode) + self.aws.boto_instance.wait_until_running = MagicMock() + + with patch('time.time', side_effect=[100, 110]): + result = self.aws.wait_until_running( + instance_id, + timeout=600, + affected_node=affected_node, + poll_interval=15 + ) + + self.assertTrue(result) + affected_node.set_affected_node_status.assert_called_once_with("running", 10) + + def test_wait_until_running_failure(self): + """Test waiting until running with failure""" + instance_id = 'i-1234567890abcdef0' + self.aws.boto_instance.wait_until_running = MagicMock( + side_effect=Exception("Timeout") + ) + + result = self.aws.wait_until_running(instance_id) + + self.assertFalse(result) + + def test_wait_until_stopped_success(self): + """Test waiting until instance is stopped successfully""" + instance_id = 'i-1234567890abcdef0' + self.aws.boto_instance.wait_until_stopped = MagicMock() + + result = self.aws.wait_until_stopped(instance_id, timeout=600, poll_interval=15) + + self.assertTrue(result) + self.aws.boto_instance.wait_until_stopped.assert_called_once() + + def test_wait_until_stopped_with_affected_node(self): + """Test waiting until stopped with affected node tracking""" + instance_id = 'i-1234567890abcdef0' + affected_node = MagicMock(spec=AffectedNode) + self.aws.boto_instance.wait_until_stopped = MagicMock() + + with patch('time.time', side_effect=[100, 115]): + result = self.aws.wait_until_stopped( + instance_id, + timeout=600, + affected_node=affected_node, + poll_interval=15 + ) + + self.assertTrue(result) + 
affected_node.set_affected_node_status.assert_called_once_with("stopped", 15) + + def test_wait_until_stopped_failure(self): + """Test waiting until stopped with failure""" + instance_id = 'i-1234567890abcdef0' + self.aws.boto_instance.wait_until_stopped = MagicMock( + side_effect=Exception("Timeout") + ) + + result = self.aws.wait_until_stopped(instance_id) + + self.assertFalse(result) + + def test_wait_until_terminated_success(self): + """Test waiting until instance is terminated successfully""" + instance_id = 'i-1234567890abcdef0' + self.aws.boto_instance.wait_until_terminated = MagicMock() + + result = self.aws.wait_until_terminated(instance_id, timeout=600, poll_interval=15) + + self.assertTrue(result) + self.aws.boto_instance.wait_until_terminated.assert_called_once() + + def test_wait_until_terminated_with_affected_node(self): + """Test waiting until terminated with affected node tracking""" + instance_id = 'i-1234567890abcdef0' + affected_node = MagicMock(spec=AffectedNode) + self.aws.boto_instance.wait_until_terminated = MagicMock() + + with patch('time.time', side_effect=[100, 120]): + result = self.aws.wait_until_terminated( + instance_id, + timeout=600, + affected_node=affected_node, + poll_interval=15 + ) + + self.assertTrue(result) + affected_node.set_affected_node_status.assert_called_once_with("terminated", 20) + + def test_wait_until_terminated_failure(self): + """Test waiting until terminated with failure""" + instance_id = 'i-1234567890abcdef0' + self.aws.boto_instance.wait_until_terminated = MagicMock( + side_effect=Exception("Timeout") + ) + + result = self.aws.wait_until_terminated(instance_id) + + self.assertFalse(result) + + def test_create_default_network_acl_success(self): + """Test creating default network ACL successfully""" + vpc_id = 'vpc-12345678' + acl_id = 'acl-12345678' + mock_response = { + 'NetworkAcl': { + 'NetworkAclId': acl_id + } + } + self.aws.boto_client.create_network_acl = MagicMock(return_value=mock_response) + + result = self.aws.create_default_network_acl(vpc_id) + + self.assertEqual(result, acl_id) + self.aws.boto_client.create_network_acl.assert_called_once_with(VpcId=vpc_id) + + def test_create_default_network_acl_failure(self): + """Test creating default network ACL with failure""" + vpc_id = 'vpc-12345678' + self.aws.boto_client.create_network_acl = MagicMock( + side_effect=Exception("AWS error") + ) + + with self.assertRaises(RuntimeError): + self.aws.create_default_network_acl(vpc_id) + + def test_replace_network_acl_association_success(self): + """Test replacing network ACL association successfully""" + association_id = 'aclassoc-12345678' + acl_id = 'acl-12345678' + new_association_id = 'aclassoc-87654321' + mock_response = { + 'NewAssociationId': new_association_id + } + self.aws.boto_client.replace_network_acl_association = MagicMock( + return_value=mock_response + ) + + result = self.aws.replace_network_acl_association(association_id, acl_id) + + self.assertEqual(result, new_association_id) + self.aws.boto_client.replace_network_acl_association.assert_called_once_with( + AssociationId=association_id, NetworkAclId=acl_id + ) + + def test_replace_network_acl_association_failure(self): + """Test replacing network ACL association with failure""" + association_id = 'aclassoc-12345678' + acl_id = 'acl-12345678' + self.aws.boto_client.replace_network_acl_association = MagicMock( + side_effect=Exception("AWS error") + ) + + with self.assertRaises(RuntimeError): + self.aws.replace_network_acl_association(association_id, acl_id) + + def 
test_describe_network_acls_success(self): + """Test describing network ACLs successfully""" + vpc_id = 'vpc-12345678' + subnet_id = 'subnet-12345678' + acl_id = 'acl-12345678' + associations = [{'NetworkAclId': acl_id, 'SubnetId': subnet_id}] + mock_response = { + 'NetworkAcls': [{ + 'Associations': associations + }] + } + self.aws.boto_client.describe_network_acls = MagicMock(return_value=mock_response) + + result_associations, result_acl_id = self.aws.describe_network_acls(vpc_id, subnet_id) + + self.assertEqual(result_associations, associations) + self.assertEqual(result_acl_id, acl_id) + + def test_describe_network_acls_failure(self): + """Test describing network ACLs with failure""" + vpc_id = 'vpc-12345678' + subnet_id = 'subnet-12345678' + self.aws.boto_client.describe_network_acls = MagicMock( + side_effect=Exception("AWS error") + ) + + with self.assertRaises(RuntimeError): + self.aws.describe_network_acls(vpc_id, subnet_id) + + def test_delete_network_acl_success(self): + """Test deleting network ACL successfully""" + acl_id = 'acl-12345678' + self.aws.boto_client.delete_network_acl = MagicMock() + + self.aws.delete_network_acl(acl_id) + + self.aws.boto_client.delete_network_acl.assert_called_once_with(NetworkAclId=acl_id) + + def test_delete_network_acl_failure(self): + """Test deleting network ACL with failure""" + acl_id = 'acl-12345678' + self.aws.boto_client.delete_network_acl = MagicMock( + side_effect=Exception("AWS error") + ) + + with self.assertRaises(RuntimeError): + self.aws.delete_network_acl(acl_id) + + def test_detach_volumes_success(self): + """Test detaching volumes successfully""" + volume_ids = ['vol-12345678', 'vol-87654321'] + self.aws.boto_client.detach_volume = MagicMock() + + self.aws.detach_volumes(volume_ids) + + self.assertEqual(self.aws.boto_client.detach_volume.call_count, 2) + self.aws.boto_client.detach_volume.assert_any_call(VolumeId='vol-12345678', Force=True) + self.aws.boto_client.detach_volume.assert_any_call(VolumeId='vol-87654321', Force=True) + + def test_detach_volumes_partial_failure(self): + """Test detaching volumes with partial failure""" + volume_ids = ['vol-12345678', 'vol-87654321'] + # First call succeeds, second fails - should not raise exception + self.aws.boto_client.detach_volume = MagicMock( + side_effect=[None, Exception("AWS error")] + ) + + # Should not raise exception, just log error + self.aws.detach_volumes(volume_ids) + + self.assertEqual(self.aws.boto_client.detach_volume.call_count, 2) + + def test_attach_volume_success(self): + """Test attaching volume successfully""" + attachment = { + 'VolumeId': 'vol-12345678', + 'InstanceId': 'i-1234567890abcdef0', + 'Device': '/dev/sdf' + } + mock_volume = MagicMock() + mock_volume.state = 'available' + self.aws.boto_resource.Volume = MagicMock(return_value=mock_volume) + self.aws.boto_client.attach_volume = MagicMock() + + self.aws.attach_volume(attachment) + + self.aws.boto_client.attach_volume.assert_called_once_with( + InstanceId=attachment['InstanceId'], + Device=attachment['Device'], + VolumeId=attachment['VolumeId'] + ) + + def test_attach_volume_already_in_use(self): + """Test attaching volume that is already in use""" + attachment = { + 'VolumeId': 'vol-12345678', + 'InstanceId': 'i-1234567890abcdef0', + 'Device': '/dev/sdf' + } + mock_volume = MagicMock() + mock_volume.state = 'in-use' + self.aws.boto_resource.Volume = MagicMock(return_value=mock_volume) + self.aws.boto_client.attach_volume = MagicMock() + + self.aws.attach_volume(attachment) + + # Should not attempt to 
attach + self.aws.boto_client.attach_volume.assert_not_called() + + def test_attach_volume_failure(self): + """Test attaching volume with failure""" + attachment = { + 'VolumeId': 'vol-12345678', + 'InstanceId': 'i-1234567890abcdef0', + 'Device': '/dev/sdf' + } + mock_volume = MagicMock() + mock_volume.state = 'available' + self.aws.boto_resource.Volume = MagicMock(return_value=mock_volume) + self.aws.boto_client.attach_volume = MagicMock( + side_effect=Exception("AWS error") + ) + + with self.assertRaises(RuntimeError): + self.aws.attach_volume(attachment) + + def test_get_volumes_ids(self): + """Test getting volume IDs from instance""" + instance_id = ['i-1234567890abcdef0'] + mock_response = { + 'Reservations': [{ + 'Instances': [{ + 'BlockDeviceMappings': [ + {'DeviceName': '/dev/sda1', 'Ebs': {'VolumeId': 'vol-root'}}, + {'DeviceName': '/dev/sdf', 'Ebs': {'VolumeId': 'vol-12345678'}}, + {'DeviceName': '/dev/sdg', 'Ebs': {'VolumeId': 'vol-87654321'}} + ] + }] + }] + } + mock_instance = MagicMock() + mock_instance.root_device_name = '/dev/sda1' + self.aws.boto_resource.Instance = MagicMock(return_value=mock_instance) + self.aws.boto_client.describe_instances = MagicMock(return_value=mock_response) + + volume_ids = self.aws.get_volumes_ids(instance_id) + + self.assertEqual(len(volume_ids), 2) + self.assertIn('vol-12345678', volume_ids) + self.assertIn('vol-87654321', volume_ids) + self.assertNotIn('vol-root', volume_ids) + + def test_get_volume_attachment_details(self): + """Test getting volume attachment details""" + volume_ids = ['vol-12345678', 'vol-87654321'] + mock_response = { + 'Volumes': [ + {'VolumeId': 'vol-12345678', 'State': 'in-use'}, + {'VolumeId': 'vol-87654321', 'State': 'available'} + ] + } + self.aws.boto_client.describe_volumes = MagicMock(return_value=mock_response) + + details = self.aws.get_volume_attachment_details(volume_ids) + + self.assertEqual(len(details), 2) + self.assertEqual(details[0]['VolumeId'], 'vol-12345678') + self.assertEqual(details[1]['VolumeId'], 'vol-87654321') + + def test_get_root_volume_id(self): + """Test getting root volume ID""" + instance_id = ['i-1234567890abcdef0'] + mock_instance = MagicMock() + mock_instance.root_device_name = '/dev/sda1' + self.aws.boto_resource.Instance = MagicMock(return_value=mock_instance) + + root_volume = self.aws.get_root_volume_id(instance_id) + + self.assertEqual(root_volume, '/dev/sda1') + + def test_get_volume_state(self): + """Test getting volume state""" + volume_id = 'vol-12345678' + mock_volume = MagicMock() + mock_volume.state = 'available' + self.aws.boto_resource.Volume = MagicMock(return_value=mock_volume) + + state = self.aws.get_volume_state(volume_id) + + self.assertEqual(state, 'available') + + +class TestAWSNodeScenarios(unittest.TestCase): + """Test cases for aws_node_scenarios class""" + + def setUp(self): + """Set up test fixtures""" + self.kubecli = MagicMock(spec=KrknKubernetes) + self.affected_nodes_status = AffectedNodeStatus() + + # Mock the AWS class + with patch('krkn.scenario_plugins.node_actions.aws_node_scenarios.AWS') as mock_aws_class: + self.mock_aws = MagicMock() + mock_aws_class.return_value = self.mock_aws + self.scenario = aws_node_scenarios( + kubecli=self.kubecli, + node_action_kube_check=True, + affected_nodes_status=self.affected_nodes_status + ) + + @patch('krkn.scenario_plugins.node_actions.common_node_functions.wait_for_ready_status') + def test_node_start_scenario_success(self, mock_wait_ready): + """Test node start scenario successfully""" + node = 
'ip-10-0-1-100.ec2.internal' + instance_id = 'i-1234567890abcdef0' + + self.mock_aws.get_instance_id.return_value = instance_id + self.mock_aws.start_instances.return_value = None + self.mock_aws.wait_until_running.return_value = True + + self.scenario.node_start_scenario( + instance_kill_count=1, + node=node, + timeout=600, + poll_interval=15 + ) + + self.mock_aws.get_instance_id.assert_called_once_with(node) + self.mock_aws.start_instances.assert_called_once_with(instance_id) + self.mock_aws.wait_until_running.assert_called_once() + mock_wait_ready.assert_called_once() + self.assertEqual(len(self.affected_nodes_status.affected_nodes), 1) + self.assertEqual(self.affected_nodes_status.affected_nodes[0].node_name, node) + + @patch('krkn.scenario_plugins.node_actions.common_node_functions.wait_for_ready_status') + def test_node_start_scenario_no_kube_check(self, mock_wait_ready): + """Test node start scenario without kube check""" + node = 'ip-10-0-1-100.ec2.internal' + instance_id = 'i-1234567890abcdef0' + + # Create scenario with node_action_kube_check=False + with patch('krkn.scenario_plugins.node_actions.aws_node_scenarios.AWS') as mock_aws_class: + mock_aws = MagicMock() + mock_aws_class.return_value = mock_aws + scenario = aws_node_scenarios( + kubecli=self.kubecli, + node_action_kube_check=False, + affected_nodes_status=AffectedNodeStatus() + ) + + mock_aws.get_instance_id.return_value = instance_id + mock_aws.start_instances.return_value = None + mock_aws.wait_until_running.return_value = True + + scenario.node_start_scenario( + instance_kill_count=1, + node=node, + timeout=600, + poll_interval=15 + ) + + # Should not call wait_for_ready_status + mock_wait_ready.assert_not_called() + + def test_node_start_scenario_failure(self): + """Test node start scenario with failure""" + node = 'ip-10-0-1-100.ec2.internal' + + self.mock_aws.get_instance_id.side_effect = Exception("AWS error") + + with self.assertRaises(RuntimeError): + self.scenario.node_start_scenario( + instance_kill_count=1, + node=node, + timeout=600, + poll_interval=15 + ) + + @patch('krkn.scenario_plugins.node_actions.common_node_functions.wait_for_unknown_status') + def test_node_stop_scenario_success(self, mock_wait_unknown): + """Test node stop scenario successfully""" + node = 'ip-10-0-1-100.ec2.internal' + instance_id = 'i-1234567890abcdef0' + + self.mock_aws.get_instance_id.return_value = instance_id + self.mock_aws.stop_instances.return_value = None + self.mock_aws.wait_until_stopped.return_value = True + + self.scenario.node_stop_scenario( + instance_kill_count=1, + node=node, + timeout=600, + poll_interval=15 + ) + + self.mock_aws.get_instance_id.assert_called_once_with(node) + self.mock_aws.stop_instances.assert_called_once_with(instance_id) + self.mock_aws.wait_until_stopped.assert_called_once() + mock_wait_unknown.assert_called_once() + self.assertEqual(len(self.affected_nodes_status.affected_nodes), 1) + + @patch('krkn.scenario_plugins.node_actions.common_node_functions.wait_for_unknown_status') + def test_node_stop_scenario_no_kube_check(self, mock_wait_unknown): + """Test node stop scenario without kube check""" + node = 'ip-10-0-1-100.ec2.internal' + instance_id = 'i-1234567890abcdef0' + + # Create scenario with node_action_kube_check=False + with patch('krkn.scenario_plugins.node_actions.aws_node_scenarios.AWS') as mock_aws_class: + mock_aws = MagicMock() + mock_aws_class.return_value = mock_aws + scenario = aws_node_scenarios( + kubecli=self.kubecli, + node_action_kube_check=False, + 
affected_nodes_status=AffectedNodeStatus() + ) + + mock_aws.get_instance_id.return_value = instance_id + mock_aws.stop_instances.return_value = None + mock_aws.wait_until_stopped.return_value = True + + scenario.node_stop_scenario( + instance_kill_count=1, + node=node, + timeout=600, + poll_interval=15 + ) + + # Should not call wait_for_unknown_status + mock_wait_unknown.assert_not_called() + + def test_node_stop_scenario_failure(self): + """Test node stop scenario with failure""" + node = 'ip-10-0-1-100.ec2.internal' + + self.mock_aws.get_instance_id.side_effect = Exception("AWS error") + + with self.assertRaises(RuntimeError): + self.scenario.node_stop_scenario( + instance_kill_count=1, + node=node, + timeout=600, + poll_interval=15 + ) + + @patch('time.sleep') + def test_node_termination_scenario_success(self, _mock_sleep): + """Test node termination scenario successfully""" + node = 'ip-10-0-1-100.ec2.internal' + instance_id = 'i-1234567890abcdef0' + + self.mock_aws.get_instance_id.return_value = instance_id + self.mock_aws.terminate_instances.return_value = None + self.mock_aws.wait_until_terminated.return_value = True + self.kubecli.list_nodes.return_value = [] + + self.scenario.node_termination_scenario( + instance_kill_count=1, + node=node, + timeout=600, + poll_interval=15 + ) + + self.mock_aws.get_instance_id.assert_called_once_with(node) + self.mock_aws.terminate_instances.assert_called_once_with(instance_id) + self.mock_aws.wait_until_terminated.assert_called_once() + self.assertEqual(len(self.affected_nodes_status.affected_nodes), 1) + + @patch('time.sleep') + def test_node_termination_scenario_node_still_exists(self, _mock_sleep): + """Test node termination scenario when node still exists""" + node = 'ip-10-0-1-100.ec2.internal' + instance_id = 'i-1234567890abcdef0' + + self.mock_aws.get_instance_id.return_value = instance_id + self.mock_aws.terminate_instances.return_value = None + self.mock_aws.wait_until_terminated.return_value = True + # Node still in list after timeout + self.kubecli.list_nodes.return_value = [node] + + with self.assertRaises(RuntimeError): + self.scenario.node_termination_scenario( + instance_kill_count=1, + node=node, + timeout=2, + poll_interval=15 + ) + + def test_node_termination_scenario_failure(self): + """Test node termination scenario with failure""" + node = 'ip-10-0-1-100.ec2.internal' + + self.mock_aws.get_instance_id.side_effect = Exception("AWS error") + + with self.assertRaises(RuntimeError): + self.scenario.node_termination_scenario( + instance_kill_count=1, + node=node, + timeout=600, + poll_interval=15 + ) + + @patch('krkn.scenario_plugins.node_actions.common_node_functions.wait_for_unknown_status') + @patch('krkn.scenario_plugins.node_actions.common_node_functions.wait_for_ready_status') + def test_node_reboot_scenario_success(self, mock_wait_ready, mock_wait_unknown): + """Test node reboot scenario successfully""" + node = 'ip-10-0-1-100.ec2.internal' + instance_id = 'i-1234567890abcdef0' + + self.mock_aws.get_instance_id.return_value = instance_id + self.mock_aws.reboot_instances.return_value = None + + self.scenario.node_reboot_scenario( + instance_kill_count=1, + node=node, + timeout=600 + ) + + self.mock_aws.get_instance_id.assert_called_once_with(node) + self.mock_aws.reboot_instances.assert_called_once_with(instance_id) + mock_wait_unknown.assert_called_once() + mock_wait_ready.assert_called_once() + self.assertEqual(len(self.affected_nodes_status.affected_nodes), 1) + + 
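+    # Note: unlike start/stop/terminate, reboot exposes no cloud-side
+    # wait_until_* call for these tests to assert on, so the reboot tests
+    # check the kube-side waiters (wait_for_unknown_status, then
+    # wait_for_ready_status) instead, or their absence when
+    # node_action_kube_check is False.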
@patch('krkn.scenario_plugins.node_actions.common_node_functions.wait_for_unknown_status') + @patch('krkn.scenario_plugins.node_actions.common_node_functions.wait_for_ready_status') + def test_node_reboot_scenario_no_kube_check(self, mock_wait_ready, mock_wait_unknown): + """Test node reboot scenario without kube check""" + node = 'ip-10-0-1-100.ec2.internal' + instance_id = 'i-1234567890abcdef0' + + # Create scenario with node_action_kube_check=False + with patch('krkn.scenario_plugins.node_actions.aws_node_scenarios.AWS') as mock_aws_class: + mock_aws = MagicMock() + mock_aws_class.return_value = mock_aws + scenario = aws_node_scenarios( + kubecli=self.kubecli, + node_action_kube_check=False, + affected_nodes_status=AffectedNodeStatus() + ) + + mock_aws.get_instance_id.return_value = instance_id + mock_aws.reboot_instances.return_value = None + + scenario.node_reboot_scenario( + instance_kill_count=1, + node=node, + timeout=600 + ) + + # Should not call wait functions + mock_wait_unknown.assert_not_called() + mock_wait_ready.assert_not_called() + + def test_node_reboot_scenario_failure(self): + """Test node reboot scenario with failure""" + node = 'ip-10-0-1-100.ec2.internal' + + self.mock_aws.get_instance_id.side_effect = Exception("AWS error") + + with self.assertRaises(RuntimeError): + self.scenario.node_reboot_scenario( + instance_kill_count=1, + node=node, + timeout=600 + ) + + def test_node_reboot_scenario_multiple_kills(self): + """Test node reboot scenario with multiple kill counts""" + node = 'ip-10-0-1-100.ec2.internal' + instance_id = 'i-1234567890abcdef0' + + with patch('krkn.scenario_plugins.node_actions.aws_node_scenarios.AWS') as mock_aws_class: + mock_aws = MagicMock() + mock_aws_class.return_value = mock_aws + scenario = aws_node_scenarios( + kubecli=self.kubecli, + node_action_kube_check=False, + affected_nodes_status=AffectedNodeStatus() + ) + + mock_aws.get_instance_id.return_value = instance_id + mock_aws.reboot_instances.return_value = None + + scenario.node_reboot_scenario( + instance_kill_count=3, + node=node, + timeout=600 + ) + + self.assertEqual(mock_aws.reboot_instances.call_count, 3) + self.assertEqual(len(scenario.affected_nodes_status.affected_nodes), 3) + + def test_get_disk_attachment_info_success(self): + """Test getting disk attachment info successfully""" + node = 'ip-10-0-1-100.ec2.internal' + instance_id = 'i-1234567890abcdef0' + volume_ids = ['vol-12345678'] + attachment_details = [ + { + 'VolumeId': 'vol-12345678', + 'Attachments': [{ + 'InstanceId': instance_id, + 'Device': '/dev/sdf' + }] + } + ] + + self.mock_aws.get_instance_id.return_value = instance_id + self.mock_aws.get_volumes_ids.return_value = volume_ids + self.mock_aws.get_volume_attachment_details.return_value = attachment_details + + result = self.scenario.get_disk_attachment_info( + instance_kill_count=1, + node=node + ) + + self.assertEqual(result, attachment_details) + self.mock_aws.get_instance_id.assert_called_once_with(node) + self.mock_aws.get_volumes_ids.assert_called_once() + self.mock_aws.get_volume_attachment_details.assert_called_once_with(volume_ids) + + def test_get_disk_attachment_info_no_volumes(self): + """Test getting disk attachment info when no volumes exist""" + node = 'ip-10-0-1-100.ec2.internal' + instance_id = 'i-1234567890abcdef0' + + self.mock_aws.get_instance_id.return_value = instance_id + self.mock_aws.get_volumes_ids.return_value = [] + + result = self.scenario.get_disk_attachment_info( + instance_kill_count=1, + node=node + ) + + 
self.assertIsNone(result) + self.mock_aws.get_volume_attachment_details.assert_not_called() + + def test_get_disk_attachment_info_failure(self): + """Test getting disk attachment info with failure""" + node = 'ip-10-0-1-100.ec2.internal' + + self.mock_aws.get_instance_id.side_effect = Exception("AWS error") + + with self.assertRaises(RuntimeError): + self.scenario.get_disk_attachment_info( + instance_kill_count=1, + node=node + ) + + def test_disk_detach_scenario_success(self): + """Test disk detach scenario successfully""" + node = 'ip-10-0-1-100.ec2.internal' + instance_id = 'i-1234567890abcdef0' + volume_ids = ['vol-12345678', 'vol-87654321'] + + self.mock_aws.get_instance_id.return_value = instance_id + self.mock_aws.get_volumes_ids.return_value = volume_ids + self.mock_aws.detach_volumes.return_value = None + + self.scenario.disk_detach_scenario( + instance_kill_count=1, + node=node, + timeout=600 + ) + + self.mock_aws.get_instance_id.assert_called_once_with(node) + self.mock_aws.get_volumes_ids.assert_called_once() + self.mock_aws.detach_volumes.assert_called_once_with(volume_ids) + + def test_disk_detach_scenario_failure(self): + """Test disk detach scenario with failure""" + node = 'ip-10-0-1-100.ec2.internal' + + self.mock_aws.get_instance_id.side_effect = Exception("AWS error") + + with self.assertRaises(RuntimeError): + self.scenario.disk_detach_scenario( + instance_kill_count=1, + node=node, + timeout=600 + ) + + def test_disk_attach_scenario_success(self): + """Test disk attach scenario successfully""" + attachment_details = [ + { + 'VolumeId': 'vol-12345678', + 'Attachments': [{ + 'InstanceId': 'i-1234567890abcdef0', + 'Device': '/dev/sdf', + 'VolumeId': 'vol-12345678' + }] + }, + { + 'VolumeId': 'vol-87654321', + 'Attachments': [{ + 'InstanceId': 'i-1234567890abcdef0', + 'Device': '/dev/sdg', + 'VolumeId': 'vol-87654321' + }] + } + ] + + self.mock_aws.attach_volume.return_value = None + + self.scenario.disk_attach_scenario( + instance_kill_count=1, + attachment_details=attachment_details, + timeout=600 + ) + + self.assertEqual(self.mock_aws.attach_volume.call_count, 2) + + def test_disk_attach_scenario_multiple_kills(self): + """Test disk attach scenario with multiple kill counts""" + attachment_details = [ + { + 'VolumeId': 'vol-12345678', + 'Attachments': [{ + 'InstanceId': 'i-1234567890abcdef0', + 'Device': '/dev/sdf', + 'VolumeId': 'vol-12345678' + }] + } + ] + + self.mock_aws.attach_volume.return_value = None + + self.scenario.disk_attach_scenario( + instance_kill_count=3, + attachment_details=attachment_details, + timeout=600 + ) + + # Should call attach_volume 3 times (once per kill count) + self.assertEqual(self.mock_aws.attach_volume.call_count, 3) + + +if __name__ == "__main__": + unittest.main() From 4b3617bd8a031630d4183628254d81453a48e35e Mon Sep 17 00:00:00 2001 From: Paige Patton <64206430+paigerube14@users.noreply.github.com> Date: Mon, 15 Dec 2025 09:39:16 -0500 Subject: [PATCH 03/11] adding gcp tests for node actions (#997) Assisted By: Claude Code Signed-off-by: Paige Patton --- tests/test_gcp_node_scenarios.py | 781 +++++++++++++++++++++++++++++++ 1 file changed, 781 insertions(+) create mode 100644 tests/test_gcp_node_scenarios.py diff --git a/tests/test_gcp_node_scenarios.py b/tests/test_gcp_node_scenarios.py new file mode 100644 index 00000000..5d4a9c6e --- /dev/null +++ b/tests/test_gcp_node_scenarios.py @@ -0,0 +1,781 @@ +#!/usr/bin/env python3 + +""" +Test suite for GCP node scenarios + +This test suite covers both the GCP class and 
gcp_node_scenarios class +using mocks to avoid actual GCP API calls. + +Usage: + python -m coverage run -a -m unittest tests/test_gcp_node_scenarios.py -v + +Assisted By: Claude Code +""" + +import unittest +import sys +from unittest.mock import MagicMock, patch + +# Mock external dependencies before any imports that use them +# Create proper nested mock structure for google modules +mock_google = MagicMock() +mock_google_auth = MagicMock() +mock_google_auth_transport = MagicMock() +mock_google_cloud = MagicMock() +mock_google_cloud_compute = MagicMock() + +sys.modules['google'] = mock_google +sys.modules['google.auth'] = mock_google_auth +sys.modules['google.auth.transport'] = mock_google_auth_transport +sys.modules['google.auth.transport.requests'] = MagicMock() +sys.modules['google.cloud'] = mock_google_cloud +sys.modules['google.cloud.compute_v1'] = mock_google_cloud_compute +sys.modules['paramiko'] = MagicMock() + +from krkn_lib.k8s import KrknKubernetes +from krkn_lib.models.k8s import AffectedNode, AffectedNodeStatus +from krkn.scenario_plugins.node_actions.gcp_node_scenarios import GCP, gcp_node_scenarios + + +class TestGCP(unittest.TestCase): + """Test cases for GCP class""" + + def setUp(self): + """Set up test fixtures""" + # Mock google.auth before creating GCP instance + self.auth_patcher = patch('google.auth.default') + self.compute_patcher = patch('google.cloud.compute_v1.InstancesClient') + + self.mock_auth = self.auth_patcher.start() + self.mock_compute_client = self.compute_patcher.start() + + # Configure auth mock to return credentials and project_id + self.mock_auth.return_value = (MagicMock(), 'test-project-123') + + # Create GCP instance with mocked dependencies + self.gcp = GCP() + + def tearDown(self): + """Clean up after tests""" + self.auth_patcher.stop() + self.compute_patcher.stop() + + def test_gcp_init_success(self): + """Test GCP class initialization success""" + self.assertEqual(self.gcp.project_id, 'test-project-123') + self.assertIsNotNone(self.gcp.instance_client) + + def test_gcp_init_failure(self): + """Test GCP class initialization failure""" + with patch('google.auth.default', side_effect=Exception("Auth error")): + with self.assertRaises(Exception): + GCP() + + def test_get_node_instance_success(self): + """Test getting node instance successfully""" + # Create mock instance + mock_instance = MagicMock() + mock_instance.name = 'gke-cluster-node-1' + + # Create mock response + mock_response = MagicMock() + mock_response.instances = [mock_instance] + + # Mock aggregated_list to return our mock data + self.gcp.instance_client.aggregated_list = MagicMock( + return_value=[('zones/us-central1-a', mock_response)] + ) + + result = self.gcp.get_node_instance('gke-cluster-node-1') + + self.assertEqual(result, mock_instance) + self.assertEqual(result.name, 'gke-cluster-node-1') + + def test_get_node_instance_partial_match(self): + """Test getting node instance with partial name match""" + mock_instance = MagicMock() + mock_instance.name = 'node-1' + + mock_response = MagicMock() + mock_response.instances = [mock_instance] + + self.gcp.instance_client.aggregated_list = MagicMock( + return_value=[('zones/us-central1-a', mock_response)] + ) + + # instance.name ('node-1') in node ('gke-cluster-node-1-abc') == True + result = self.gcp.get_node_instance('gke-cluster-node-1-abc') + + self.assertIsNotNone(result) + self.assertEqual(result.name, 'node-1') + + def test_get_node_instance_not_found(self): + """Test getting node instance when not found""" + mock_response = 
MagicMock() + mock_response.instances = None + + self.gcp.instance_client.aggregated_list = MagicMock( + return_value=[('zones/us-central1-a', mock_response)] + ) + + result = self.gcp.get_node_instance('non-existent-node') + + self.assertIsNone(result) + + def test_get_node_instance_failure(self): + """Test getting node instance with failure""" + self.gcp.instance_client.aggregated_list = MagicMock( + side_effect=Exception("GCP error") + ) + + with self.assertRaises(Exception): + self.gcp.get_node_instance('node-1') + + def test_get_instance_name(self): + """Test getting instance name""" + mock_instance = MagicMock() + mock_instance.name = 'gke-cluster-node-1' + + result = self.gcp.get_instance_name(mock_instance) + + self.assertEqual(result, 'gke-cluster-node-1') + + def test_get_instance_name_none(self): + """Test getting instance name when name is None""" + mock_instance = MagicMock() + mock_instance.name = None + + result = self.gcp.get_instance_name(mock_instance) + + self.assertIsNone(result) + + def test_get_instance_zone(self): + """Test getting instance zone""" + mock_instance = MagicMock() + mock_instance.zone = 'https://www.googleapis.com/compute/v1/projects/test-project/zones/us-central1-a' + + result = self.gcp.get_instance_zone(mock_instance) + + self.assertEqual(result, 'us-central1-a') + + def test_get_instance_zone_none(self): + """Test getting instance zone when zone is None""" + mock_instance = MagicMock() + mock_instance.zone = None + + result = self.gcp.get_instance_zone(mock_instance) + + self.assertIsNone(result) + + def test_get_node_instance_zone(self): + """Test getting node instance zone""" + mock_instance = MagicMock() + mock_instance.name = 'gke-cluster-node-1' + mock_instance.zone = 'https://www.googleapis.com/compute/v1/projects/test-project/zones/us-west1-b' + + # Patch get_node_instance to return our mock directly + with patch.object(self.gcp, 'get_node_instance', return_value=mock_instance): + result = self.gcp.get_node_instance_zone('node-1') + self.assertEqual(result, 'us-west1-b') + + def test_get_node_instance_name(self): + """Test getting node instance name""" + mock_instance = MagicMock() + mock_instance.name = 'gke-cluster-node-1' + + # Patch get_node_instance to return our mock directly + with patch.object(self.gcp, 'get_node_instance', return_value=mock_instance): + result = self.gcp.get_node_instance_name('node-1') + self.assertEqual(result, 'gke-cluster-node-1') + + def test_get_instance_id(self): + """Test getting instance ID (alias for get_node_instance_name)""" + # Patch get_node_instance_name since get_instance_id just calls it + with patch.object(self.gcp, 'get_node_instance_name', return_value='gke-cluster-node-1'): + result = self.gcp.get_instance_id('node-1') + self.assertEqual(result, 'gke-cluster-node-1') + + def test_start_instances_success(self): + """Test starting instances successfully""" + instance_id = 'gke-cluster-node-1' + + # Mock get_node_instance_zone + with patch.object(self.gcp, 'get_node_instance_zone', return_value='us-central1-a'): + self.gcp.instance_client.start = MagicMock() + + self.gcp.start_instances(instance_id) + + self.gcp.instance_client.start.assert_called_once() + + def test_start_instances_failure(self): + """Test starting instances with failure""" + instance_id = 'gke-cluster-node-1' + + with patch.object(self.gcp, 'get_node_instance_zone', return_value='us-central1-a'): + self.gcp.instance_client.start = MagicMock( + side_effect=Exception("GCP error") + ) + + with self.assertRaises(RuntimeError): + 
self.gcp.start_instances(instance_id) + + def test_stop_instances_success(self): + """Test stopping instances successfully""" + instance_id = 'gke-cluster-node-1' + + with patch.object(self.gcp, 'get_node_instance_zone', return_value='us-central1-a'): + self.gcp.instance_client.stop = MagicMock() + + self.gcp.stop_instances(instance_id) + + self.gcp.instance_client.stop.assert_called_once() + + def test_stop_instances_failure(self): + """Test stopping instances with failure""" + instance_id = 'gke-cluster-node-1' + + with patch.object(self.gcp, 'get_node_instance_zone', return_value='us-central1-a'): + self.gcp.instance_client.stop = MagicMock( + side_effect=Exception("GCP error") + ) + + with self.assertRaises(RuntimeError): + self.gcp.stop_instances(instance_id) + + def test_suspend_instances_success(self): + """Test suspending instances successfully""" + instance_id = 'gke-cluster-node-1' + + with patch.object(self.gcp, 'get_node_instance_zone', return_value='us-central1-a'): + self.gcp.instance_client.suspend = MagicMock() + + self.gcp.suspend_instances(instance_id) + + self.gcp.instance_client.suspend.assert_called_once() + + def test_suspend_instances_failure(self): + """Test suspending instances with failure""" + instance_id = 'gke-cluster-node-1' + + with patch.object(self.gcp, 'get_node_instance_zone', return_value='us-central1-a'): + self.gcp.instance_client.suspend = MagicMock( + side_effect=Exception("GCP error") + ) + + with self.assertRaises(RuntimeError): + self.gcp.suspend_instances(instance_id) + + def test_terminate_instances_success(self): + """Test terminating instances successfully""" + instance_id = 'gke-cluster-node-1' + + with patch.object(self.gcp, 'get_node_instance_zone', return_value='us-central1-a'): + self.gcp.instance_client.delete = MagicMock() + + self.gcp.terminate_instances(instance_id) + + self.gcp.instance_client.delete.assert_called_once() + + def test_terminate_instances_failure(self): + """Test terminating instances with failure""" + instance_id = 'gke-cluster-node-1' + + with patch.object(self.gcp, 'get_node_instance_zone', return_value='us-central1-a'): + self.gcp.instance_client.delete = MagicMock( + side_effect=Exception("GCP error") + ) + + with self.assertRaises(RuntimeError): + self.gcp.terminate_instances(instance_id) + + def test_reboot_instances_success(self): + """Test rebooting instances successfully""" + instance_id = 'gke-cluster-node-1' + + with patch.object(self.gcp, 'get_node_instance_zone', return_value='us-central1-a'): + self.gcp.instance_client.reset = MagicMock() + + self.gcp.reboot_instances(instance_id) + + self.gcp.instance_client.reset.assert_called_once() + + def test_reboot_instances_failure(self): + """Test rebooting instances with failure""" + instance_id = 'gke-cluster-node-1' + + with patch.object(self.gcp, 'get_node_instance_zone', return_value='us-central1-a'): + self.gcp.instance_client.reset = MagicMock( + side_effect=Exception("GCP error") + ) + + with self.assertRaises(RuntimeError): + self.gcp.reboot_instances(instance_id) + + @patch('time.sleep') + def test_get_instance_status_success(self, _mock_sleep): + """Test getting instance status successfully""" + instance_id = 'gke-cluster-node-1' + + mock_instance = MagicMock() + mock_instance.status = 'RUNNING' + + with patch.object(self.gcp, 'get_node_instance_zone', return_value='us-central1-a'): + self.gcp.instance_client.get = MagicMock(return_value=mock_instance) + + result = self.gcp.get_instance_status(instance_id, 'RUNNING', 60) + + self.assertTrue(result) + 
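+    # time.sleep is patched in these status tests so the polling loop inside
+    # get_instance_status runs without real delays; otherwise the timeout
+    # case below would block for the full timeout value.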
+ @patch('time.sleep') + def test_get_instance_status_timeout(self, _mock_sleep): + """Test getting instance status with timeout""" + instance_id = 'gke-cluster-node-1' + + mock_instance = MagicMock() + mock_instance.status = 'PROVISIONING' + + with patch.object(self.gcp, 'get_node_instance_zone', return_value='us-central1-a'): + self.gcp.instance_client.get = MagicMock(return_value=mock_instance) + + result = self.gcp.get_instance_status(instance_id, 'RUNNING', 5) + + self.assertFalse(result) + + @patch('time.sleep') + def test_get_instance_status_failure(self, _mock_sleep): + """Test getting instance status with failure""" + instance_id = 'gke-cluster-node-1' + + with patch.object(self.gcp, 'get_node_instance_zone', return_value='us-central1-a'): + self.gcp.instance_client.get = MagicMock( + side_effect=Exception("GCP error") + ) + + with self.assertRaises(RuntimeError): + self.gcp.get_instance_status(instance_id, 'RUNNING', 60) + + def test_wait_until_suspended_success(self): + """Test waiting until instance is suspended""" + instance_id = 'gke-cluster-node-1' + + with patch.object(self.gcp, 'get_instance_status', return_value=True) as mock_get_status: + result = self.gcp.wait_until_suspended(instance_id, 60) + + self.assertTrue(result) + mock_get_status.assert_called_once_with(instance_id, 'SUSPENDED', 60) + + def test_wait_until_suspended_failure(self): + """Test waiting until instance is suspended with failure""" + instance_id = 'gke-cluster-node-1' + + with patch.object(self.gcp, 'get_instance_status', return_value=False): + result = self.gcp.wait_until_suspended(instance_id, 60) + + self.assertFalse(result) + + def test_wait_until_running_success(self): + """Test waiting until instance is running successfully""" + instance_id = 'gke-cluster-node-1' + affected_node = MagicMock(spec=AffectedNode) + + with patch('time.time', side_effect=[100, 110]): + with patch.object(self.gcp, 'get_instance_status', return_value=True): + result = self.gcp.wait_until_running(instance_id, 60, affected_node) + + self.assertTrue(result) + affected_node.set_affected_node_status.assert_called_once_with('running', 10) + + def test_wait_until_running_without_affected_node(self): + """Test waiting until running without affected node tracking""" + instance_id = 'gke-cluster-node-1' + + with patch.object(self.gcp, 'get_instance_status', return_value=True): + result = self.gcp.wait_until_running(instance_id, 60, None) + + self.assertTrue(result) + + def test_wait_until_stopped_success(self): + """Test waiting until instance is stopped successfully""" + instance_id = 'gke-cluster-node-1' + affected_node = MagicMock(spec=AffectedNode) + + with patch('time.time', side_effect=[100, 115]): + with patch.object(self.gcp, 'get_instance_status', return_value=True): + result = self.gcp.wait_until_stopped(instance_id, 60, affected_node) + + self.assertTrue(result) + affected_node.set_affected_node_status.assert_called_once_with('stopped', 15) + + def test_wait_until_stopped_without_affected_node(self): + """Test waiting until stopped without affected node tracking""" + instance_id = 'gke-cluster-node-1' + + with patch.object(self.gcp, 'get_instance_status', return_value=True): + result = self.gcp.wait_until_stopped(instance_id, 60, None) + + self.assertTrue(result) + + def test_wait_until_terminated_success(self): + """Test waiting until instance is terminated successfully""" + instance_id = 'gke-cluster-node-1' + affected_node = MagicMock(spec=AffectedNode) + + with patch('time.time', side_effect=[100, 120]): + with 
patch.object(self.gcp, 'get_instance_status', return_value=True): + result = self.gcp.wait_until_terminated(instance_id, 60, affected_node) + + self.assertTrue(result) + affected_node.set_affected_node_status.assert_called_once_with('terminated', 20) + + def test_wait_until_terminated_without_affected_node(self): + """Test waiting until terminated without affected node tracking""" + instance_id = 'gke-cluster-node-1' + + with patch.object(self.gcp, 'get_instance_status', return_value=True): + result = self.gcp.wait_until_terminated(instance_id, 60, None) + + self.assertTrue(result) + + +class TestGCPNodeScenarios(unittest.TestCase): + """Test cases for gcp_node_scenarios class""" + + def setUp(self): + """Set up test fixtures""" + self.kubecli = MagicMock(spec=KrknKubernetes) + self.affected_nodes_status = AffectedNodeStatus() + + # Mock the GCP class + with patch('krkn.scenario_plugins.node_actions.gcp_node_scenarios.GCP') as mock_gcp_class: + self.mock_gcp = MagicMock() + mock_gcp_class.return_value = self.mock_gcp + self.scenario = gcp_node_scenarios( + kubecli=self.kubecli, + node_action_kube_check=True, + affected_nodes_status=self.affected_nodes_status + ) + + @patch('krkn.scenario_plugins.node_actions.common_node_functions.wait_for_ready_status') + def test_node_start_scenario_success(self, mock_wait_ready): + """Test node start scenario successfully""" + node = 'gke-cluster-node-1' + instance_id = 'gke-cluster-node-1' + + mock_instance = MagicMock() + mock_instance.name = instance_id + + self.mock_gcp.get_node_instance.return_value = mock_instance + self.mock_gcp.get_instance_name.return_value = instance_id + self.mock_gcp.start_instances.return_value = None + self.mock_gcp.wait_until_running.return_value = True + + self.scenario.node_start_scenario( + instance_kill_count=1, + node=node, + timeout=600, + poll_interval=15 + ) + + self.mock_gcp.get_node_instance.assert_called_once_with(node) + self.mock_gcp.get_instance_name.assert_called_once_with(mock_instance) + self.mock_gcp.start_instances.assert_called_once_with(instance_id) + self.mock_gcp.wait_until_running.assert_called_once() + mock_wait_ready.assert_called_once() + self.assertEqual(len(self.affected_nodes_status.affected_nodes), 1) + self.assertEqual(self.affected_nodes_status.affected_nodes[0].node_name, node) + + @patch('krkn.scenario_plugins.node_actions.common_node_functions.wait_for_ready_status') + def test_node_start_scenario_no_kube_check(self, mock_wait_ready): + """Test node start scenario without kube check""" + node = 'gke-cluster-node-1' + instance_id = 'gke-cluster-node-1' + + # Create scenario with node_action_kube_check=False + with patch('krkn.scenario_plugins.node_actions.gcp_node_scenarios.GCP') as mock_gcp_class: + mock_gcp = MagicMock() + mock_gcp_class.return_value = mock_gcp + scenario = gcp_node_scenarios( + kubecli=self.kubecli, + node_action_kube_check=False, + affected_nodes_status=AffectedNodeStatus() + ) + + mock_instance = MagicMock() + mock_instance.name = instance_id + + mock_gcp.get_node_instance.return_value = mock_instance + mock_gcp.get_instance_name.return_value = instance_id + mock_gcp.start_instances.return_value = None + mock_gcp.wait_until_running.return_value = True + + scenario.node_start_scenario( + instance_kill_count=1, + node=node, + timeout=600, + poll_interval=15 + ) + + # Should not call wait_for_ready_status + mock_wait_ready.assert_not_called() + + def test_node_start_scenario_failure(self): + """Test node start scenario with failure""" + node = 'gke-cluster-node-1' + + 
self.mock_gcp.get_node_instance.side_effect = Exception("GCP error") + + with self.assertRaises(RuntimeError): + self.scenario.node_start_scenario( + instance_kill_count=1, + node=node, + timeout=600, + poll_interval=15 + ) + + @patch('krkn.scenario_plugins.node_actions.common_node_functions.wait_for_unknown_status') + def test_node_stop_scenario_success(self, mock_wait_unknown): + """Test node stop scenario successfully""" + node = 'gke-cluster-node-1' + instance_id = 'gke-cluster-node-1' + + mock_instance = MagicMock() + mock_instance.name = instance_id + + self.mock_gcp.get_node_instance.return_value = mock_instance + self.mock_gcp.get_instance_name.return_value = instance_id + self.mock_gcp.stop_instances.return_value = None + self.mock_gcp.wait_until_stopped.return_value = True + + self.scenario.node_stop_scenario( + instance_kill_count=1, + node=node, + timeout=600, + poll_interval=15 + ) + + self.mock_gcp.get_node_instance.assert_called_once_with(node) + self.mock_gcp.get_instance_name.assert_called_once_with(mock_instance) + self.mock_gcp.stop_instances.assert_called_once_with(instance_id) + self.mock_gcp.wait_until_stopped.assert_called_once() + mock_wait_unknown.assert_called_once() + self.assertEqual(len(self.affected_nodes_status.affected_nodes), 1) + + @patch('krkn.scenario_plugins.node_actions.common_node_functions.wait_for_unknown_status') + def test_node_stop_scenario_no_kube_check(self, mock_wait_unknown): + """Test node stop scenario without kube check""" + node = 'gke-cluster-node-1' + instance_id = 'gke-cluster-node-1' + + # Create scenario with node_action_kube_check=False + with patch('krkn.scenario_plugins.node_actions.gcp_node_scenarios.GCP') as mock_gcp_class: + mock_gcp = MagicMock() + mock_gcp_class.return_value = mock_gcp + scenario = gcp_node_scenarios( + kubecli=self.kubecli, + node_action_kube_check=False, + affected_nodes_status=AffectedNodeStatus() + ) + + mock_instance = MagicMock() + mock_instance.name = instance_id + + mock_gcp.get_node_instance.return_value = mock_instance + mock_gcp.get_instance_name.return_value = instance_id + mock_gcp.stop_instances.return_value = None + mock_gcp.wait_until_stopped.return_value = True + + scenario.node_stop_scenario( + instance_kill_count=1, + node=node, + timeout=600, + poll_interval=15 + ) + + # Should not call wait_for_unknown_status + mock_wait_unknown.assert_not_called() + + def test_node_stop_scenario_failure(self): + """Test node stop scenario with failure""" + node = 'gke-cluster-node-1' + + self.mock_gcp.get_node_instance.side_effect = Exception("GCP error") + + with self.assertRaises(RuntimeError): + self.scenario.node_stop_scenario( + instance_kill_count=1, + node=node, + timeout=600, + poll_interval=15 + ) + + @patch('time.sleep') + def test_node_termination_scenario_success(self, _mock_sleep): + """Test node termination scenario successfully""" + node = 'gke-cluster-node-1' + instance_id = 'gke-cluster-node-1' + + mock_instance = MagicMock() + mock_instance.name = instance_id + + self.mock_gcp.get_node_instance.return_value = mock_instance + self.mock_gcp.get_instance_name.return_value = instance_id + self.mock_gcp.terminate_instances.return_value = None + self.mock_gcp.wait_until_terminated.return_value = True + self.kubecli.list_nodes.return_value = [] + + self.scenario.node_termination_scenario( + instance_kill_count=1, + node=node, + timeout=600, + poll_interval=15 + ) + + self.mock_gcp.get_node_instance.assert_called_once_with(node) + 
self.mock_gcp.get_instance_name.assert_called_once_with(mock_instance)
+        self.mock_gcp.terminate_instances.assert_called_once_with(instance_id)
+        self.mock_gcp.wait_until_terminated.assert_called_once()
+        self.assertEqual(len(self.affected_nodes_status.affected_nodes), 1)
+
+    @patch('time.sleep')
+    def test_node_termination_scenario_node_still_exists(self, _mock_sleep):
+        """Test node termination scenario when node still exists"""
+        node = 'gke-cluster-node-1'
+        instance_id = 'gke-cluster-node-1'
+
+        mock_instance = MagicMock()
+        mock_instance.name = instance_id
+
+        self.mock_gcp.get_node_instance.return_value = mock_instance
+        self.mock_gcp.get_instance_name.return_value = instance_id
+        self.mock_gcp.terminate_instances.return_value = None
+        self.mock_gcp.wait_until_terminated.return_value = True
+        # Node still in list after timeout
+        self.kubecli.list_nodes.return_value = [node]
+
+        with self.assertRaises(RuntimeError):
+            self.scenario.node_termination_scenario(
+                instance_kill_count=1,
+                node=node,
+                timeout=2,
+                poll_interval=15
+            )
+
+    def test_node_termination_scenario_failure(self):
+        """Test node termination scenario with failure"""
+        node = 'gke-cluster-node-1'
+
+        self.mock_gcp.get_node_instance.side_effect = Exception("GCP error")
+
+        with self.assertRaises(RuntimeError):
+            self.scenario.node_termination_scenario(
+                instance_kill_count=1,
+                node=node,
+                timeout=600,
+                poll_interval=15
+            )
+
+    @patch('krkn.scenario_plugins.node_actions.common_node_functions.wait_for_unknown_status')
+    @patch('krkn.scenario_plugins.node_actions.common_node_functions.wait_for_ready_status')
+    def test_node_reboot_scenario_success(self, mock_wait_ready, mock_wait_unknown):
+        """Test node reboot scenario successfully"""
+        node = 'gke-cluster-node-1'
+        instance_id = 'gke-cluster-node-1'
+
+        mock_instance = MagicMock()
+        mock_instance.name = instance_id
+
+        self.mock_gcp.get_node_instance.return_value = mock_instance
+        self.mock_gcp.get_instance_name.return_value = instance_id
+        self.mock_gcp.reboot_instances.return_value = None
+        self.mock_gcp.wait_until_running.return_value = True
+
+        self.scenario.node_reboot_scenario(
+            instance_kill_count=1,
+            node=node,
+            timeout=600
+        )
+
+        self.mock_gcp.get_node_instance.assert_called_once_with(node)
+        self.mock_gcp.get_instance_name.assert_called_once_with(mock_instance)
+        self.mock_gcp.reboot_instances.assert_called_once_with(instance_id)
+        self.mock_gcp.wait_until_running.assert_called_once()
+        # Each wait helper should be called exactly once per reboot
+        self.assertEqual(mock_wait_unknown.call_count, 1)
+        self.assertEqual(mock_wait_ready.call_count, 1)
+        self.assertEqual(len(self.affected_nodes_status.affected_nodes), 1)
+
+    @patch('krkn.scenario_plugins.node_actions.common_node_functions.wait_for_unknown_status')
+    @patch('krkn.scenario_plugins.node_actions.common_node_functions.wait_for_ready_status')
+    def test_node_reboot_scenario_no_kube_check(self, mock_wait_ready, mock_wait_unknown):
+        """Test node reboot scenario without kube check"""
+        node = 'gke-cluster-node-1'
+        instance_id = 'gke-cluster-node-1'
+
+        # Create scenario with node_action_kube_check=False
+        with patch('krkn.scenario_plugins.node_actions.gcp_node_scenarios.GCP') as mock_gcp_class:
+            mock_gcp = MagicMock()
+            mock_gcp_class.return_value = mock_gcp
+            scenario = gcp_node_scenarios(
+                kubecli=self.kubecli,
+                node_action_kube_check=False,
+                affected_nodes_status=AffectedNodeStatus()
+            )
+
+            mock_instance = MagicMock()
+            mock_instance.name = instance_id
+
+            mock_gcp.get_node_instance.return_value =
mock_instance + mock_gcp.get_instance_name.return_value = instance_id + mock_gcp.reboot_instances.return_value = None + mock_gcp.wait_until_running.return_value = True + + scenario.node_reboot_scenario( + instance_kill_count=1, + node=node, + timeout=600 + ) + + # Should not call wait functions + mock_wait_unknown.assert_not_called() + mock_wait_ready.assert_not_called() + + def test_node_reboot_scenario_failure(self): + """Test node reboot scenario with failure""" + node = 'gke-cluster-node-1' + + self.mock_gcp.get_node_instance.side_effect = Exception("GCP error") + + with self.assertRaises(RuntimeError): + self.scenario.node_reboot_scenario( + instance_kill_count=1, + node=node, + timeout=600 + ) + + @patch('krkn.scenario_plugins.node_actions.common_node_functions.wait_for_ready_status') + def test_node_start_scenario_multiple_kills(self, mock_wait_ready): + """Test node start scenario with multiple kill counts""" + node = 'gke-cluster-node-1' + instance_id = 'gke-cluster-node-1' + + mock_instance = MagicMock() + mock_instance.name = instance_id + + self.mock_gcp.get_node_instance.return_value = mock_instance + self.mock_gcp.get_instance_name.return_value = instance_id + self.mock_gcp.start_instances.return_value = None + self.mock_gcp.wait_until_running.return_value = True + + self.scenario.node_start_scenario( + instance_kill_count=3, + node=node, + timeout=600, + poll_interval=15 + ) + + self.assertEqual(self.mock_gcp.start_instances.call_count, 3) + self.assertEqual(len(self.affected_nodes_status.affected_nodes), 3) + + +if __name__ == "__main__": + unittest.main() From 42d18a8e047add12a1ba256dd749884ed6439afd Mon Sep 17 00:00:00 2001 From: Paige Patton <64206430+paigerube14@users.noreply.github.com> Date: Mon, 15 Dec 2025 10:04:35 -0500 Subject: [PATCH 04/11] adding fail scenario if unrecovered kubevirt vm killing (#994) Signed-off-by: Paige Patton --- .../kubevirt_vm_outage/kubevirt_vm_outage_scenario_plugin.py | 3 ++- tests/test_kubevirt_vm_outage.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/krkn/scenario_plugins/kubevirt_vm_outage/kubevirt_vm_outage_scenario_plugin.py b/krkn/scenario_plugins/kubevirt_vm_outage/kubevirt_vm_outage_scenario_plugin.py index 58a31364..f9a289f8 100644 --- a/krkn/scenario_plugins/kubevirt_vm_outage/kubevirt_vm_outage_scenario_plugin.py +++ b/krkn/scenario_plugins/kubevirt_vm_outage/kubevirt_vm_outage_scenario_plugin.py @@ -55,7 +55,8 @@ class KubevirtVmOutageScenarioPlugin(AbstractScenarioPlugin): pods_status.merge(single_pods_status) scenario_telemetry.affected_pods = pods_status - + if len(scenario_telemetry.affected_pods.unrecovered) > 0: + return 1 return 0 except Exception as e: logging.error(f"KubeVirt VM Outage scenario failed: {e}") diff --git a/tests/test_kubevirt_vm_outage.py b/tests/test_kubevirt_vm_outage.py index cded8b51..4d884a14 100644 --- a/tests/test_kubevirt_vm_outage.py +++ b/tests/test_kubevirt_vm_outage.py @@ -147,7 +147,7 @@ class TestKubevirtVmOutageScenarioPlugin(unittest.TestCase): with patch("builtins.open", unittest.mock.mock_open(read_data=yaml.dump(self.config))): result = self.plugin.run("test-uuid", self.scenario_file, {}, self.telemetry, self.scenario_telemetry) - self.assertEqual(result, 0) + self.assertEqual(result, 1) mock_delete.assert_called_once_with("test-vm", "default", False) mock_wait.assert_not_called() From ba3fdea40387ddcf2bbacc4793877c364c9a0fc4 Mon Sep 17 00:00:00 2001 From: Paige Patton <64206430+paigerube14@users.noreply.github.com> Date: Mon, 15 Dec 2025 11:46:48 -0500 
Subject: [PATCH 05/11] adding pvc tests (#1000)

Signed-off-by: Paige Patton
---
 .github/workflows/tests.yml              |  5 +++-
 CI/legacy/scenarios/volume_scenario.yaml | 38 +++++++++++++++++++++---
 CI/tests/test_pvc.sh                     | 18 +++++++++++
 scenarios/kind/pvc_scenario.yaml         |  7 +++++
 4 files changed, 63 insertions(+), 5 deletions(-)
 create mode 100755 CI/tests/test_pvc.sh
 create mode 100644 scenarios/kind/pvc_scenario.yaml

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index d774b666..c943deb0 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -67,6 +67,8 @@ jobs:
          kubectl wait --for=condition=ready pod -l scenario=time-skew --timeout=300s
          kubectl apply -f CI/templates/service_hijacking.yaml
          kubectl wait --for=condition=ready pod -l "app.kubernetes.io/name=proxy" --timeout=300s
+         kubectl apply -f CI/legacy/scenarios/volume_scenario.yaml
+         kubectl wait --for=condition=ready pod kraken-test-pod -n kraken --timeout=300s
      - name: Get Kind nodes
        run: |
          kubectl get nodes --show-labels=true
@@ -99,6 +101,7 @@ jobs:
          echo "test_io_hog" >> ./CI/tests/functional_tests
          echo "test_pod_network_filter" >> ./CI/tests/functional_tests
          echo "test_pod_server" >> ./CI/tests/functional_tests
+         echo "test_pvc" >> ./CI/tests/functional_tests

      # Push on main only steps + all other functional to collect coverage
      # for the badge
@@ -135,7 +138,7 @@ jobs:
          echo "test_io_hog" >> ./CI/tests/functional_tests
          echo "test_pod_network_filter" >> ./CI/tests/functional_tests
          echo "test_pod_server" >> ./CI/tests/functional_tests
-
+         echo "test_pvc" >> ./CI/tests/functional_tests
      # Final common steps
      - name: Run Functional tests
        env:
diff --git a/CI/legacy/scenarios/volume_scenario.yaml b/CI/legacy/scenarios/volume_scenario.yaml
index 2ea0b476..3b261bcb 100644
--- a/CI/legacy/scenarios/volume_scenario.yaml
+++ b/CI/legacy/scenarios/volume_scenario.yaml
@@ -45,6 +45,31 @@ metadata:
   name: kraken-test-pod
   namespace: kraken
 spec:
+  securityContext:
+    fsGroup: 1001
+  # initContainer to fix permissions on the mounted volume
+  initContainers:
+  - name: fix-permissions
+    image: 'quay.io/centos7/httpd-24-centos7:centos7'
+    command:
+    - sh
+    - -c
+    - |
+      echo "Setting up permissions for /home/kraken..."
+      # Create the directory if it doesn't exist
+      mkdir -p /home/kraken
+      # Set ownership to user 1001 and group 1001
+      chown -R 1001:1001 /home/kraken
+      # Set permissions to allow read/write
+      chmod -R 755 /home/kraken
+      rm -rf /home/kraken/*
+      echo "Permissions fixed. 
Current state:" + ls -la /home/kraken + volumeMounts: + - mountPath: "/home/kraken" + name: kraken-test-pv + securityContext: + runAsUser: 0 # Run as root to fix permissions volumes: - name: kraken-test-pv persistentVolumeClaim: @@ -52,8 +77,13 @@ spec: containers: - name: kraken-test-container image: 'quay.io/centos7/httpd-24-centos7:centos7' - volumeMounts: - - mountPath: "/home/krake-dir/" - name: kraken-test-pv securityContext: - privileged: true + runAsUser: 1001 + runAsNonRoot: true + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + volumeMounts: + - mountPath: "/home/kraken" + name: kraken-test-pv diff --git a/CI/tests/test_pvc.sh b/CI/tests/test_pvc.sh new file mode 100755 index 00000000..c115716d --- /dev/null +++ b/CI/tests/test_pvc.sh @@ -0,0 +1,18 @@ +set -xeEo pipefail + +source CI/tests/common.sh + +trap error ERR +trap finish EXIT + +function functional_test_pvc_fill { + export scenario_type="pvc_scenarios" + export scenario_file="scenarios/kind/pvc_scenario.yaml" + export post_config="" + envsubst < CI/config/common_test_config.yaml > CI/config/pvc_config.yaml + cat CI/config/pvc_config.yaml + python3 -m coverage run -a run_kraken.py -c CI/config/pvc_config.yaml --debug True + echo "PVC Fill scenario test: Success" +} + +functional_test_pvc_fill diff --git a/scenarios/kind/pvc_scenario.yaml b/scenarios/kind/pvc_scenario.yaml new file mode 100644 index 00000000..9385ced9 --- /dev/null +++ b/scenarios/kind/pvc_scenario.yaml @@ -0,0 +1,7 @@ +pvc_scenario: + pvc_name: kraken-test-pvc # Name of the target PVC + pod_name: kraken-test-pod # Name of the pod where the PVC is mounted, it will be ignored if the pvc_name is defined + namespace: kraken # Namespace where the PVC is + fill_percentage: 38 # Target percentage to fill up the cluster, value must be higher than current percentage, valid values are between 0 and 99 + duration: 10 # Duration in seconds for the fault + block_size: 102400 # used only by dd if fallocate not present in the container From f2ba8b85af6d4504ded60bbbf9b75d1b23f46817 Mon Sep 17 00:00:00 2001 From: Paige Patton <64206430+paigerube14@users.noreply.github.com> Date: Mon, 15 Dec 2025 11:52:30 -0500 Subject: [PATCH 06/11] adding podman support in docker configuration (#999) Signed-off-by: Paige Patton --- .github/workflows/tests.yml | 2 + CI/tests/test_node.sh | 18 +++ .../node_actions/docker_node_scenarios.py | 129 +++++++++++++++++- requirements.txt | 5 +- scenarios/kind/node_scenarios_example.yml | 10 +- 5 files changed, 157 insertions(+), 7 deletions(-) create mode 100755 CI/tests/test_node.sh diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index c943deb0..a8498bb1 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -101,6 +101,7 @@ jobs: echo "test_io_hog" >> ./CI/tests/functional_tests echo "test_pod_network_filter" >> ./CI/tests/functional_tests echo "test_pod_server" >> ./CI/tests/functional_tests + echo "test_node" >> ./CI/tests/functional_tests echo "test_pvc" >> ./CI/tests/functional_tests # Push on main only steps + all other functional to collect coverage @@ -138,6 +139,7 @@ jobs: echo "test_io_hog" >> ./CI/tests/functional_tests echo "test_pod_network_filter" >> ./CI/tests/functional_tests echo "test_pod_server" >> ./CI/tests/functional_tests + echo "test_node" >> ./CI/tests/functional_tests echo "test_pvc" >> ./CI/tests/functional_tests # Final common steps - name: Run Functional tests diff --git a/CI/tests/test_node.sh b/CI/tests/test_node.sh new file mode 100755 index 
00000000..b0057ed0
--- /dev/null
+++ b/CI/tests/test_node.sh
@@ -0,0 +1,18 @@
+set -xeEo pipefail
+
+source CI/tests/common.sh
+
+trap error ERR
+trap finish EXIT
+
+function functional_test_node_stop_start {
+    export scenario_type="node_scenarios"
+    export scenario_file="scenarios/kind/node_scenarios_example.yml"
+    export post_config=""
+    envsubst < CI/config/common_test_config.yaml > CI/config/node_config.yaml
+    cat CI/config/node_config.yaml
+    python3 -m coverage run -a run_kraken.py -c CI/config/node_config.yaml
+    echo "Node Stop/Start scenario test: Success"
+}
+
+functional_test_node_stop_start
diff --git a/krkn/scenario_plugins/node_actions/docker_node_scenarios.py b/krkn/scenario_plugins/node_actions/docker_node_scenarios.py
index 04ad20fc..37dd0a0e 100644
--- a/krkn/scenario_plugins/node_actions/docker_node_scenarios.py
+++ b/krkn/scenario_plugins/node_actions/docker_node_scenarios.py
@@ -2,46 +2,173 @@ import krkn.scenario_plugins.node_actions.common_node_functions as nodeaction
 from krkn.scenario_plugins.node_actions.abstract_node_scenarios import (
     abstract_node_scenarios,
 )
+import os
+import platform
 import logging
 import docker
 from krkn_lib.k8s import KrknKubernetes
 from krkn_lib.models.k8s import AffectedNode, AffectedNodeStatus


 class Docker:
+    """
+    Container runtime client wrapper supporting both Docker and Podman.
+
+    This class automatically detects and connects to either Docker or Podman
+    container runtimes using the Docker-compatible API. It tries multiple
+    connection methods in order of preference:
+
+    1. Docker Unix socket (unix:///var/run/docker.sock)
+    2. Platform-specific Podman sockets:
+       - macOS: ~/.local/share/containers/podman/machine/podman.sock
+       - Linux rootful: unix:///run/podman/podman.sock
+       - Linux rootless: unix:///run/user/<uid>/podman/podman.sock
+    3. Environment variables (DOCKER_HOST or CONTAINER_HOST)
+
+    The runtime type (docker/podman) is auto-detected and logged for debugging.
+    Supports Kind clusters running on Podman.
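+
+    A minimal usage sketch (illustrative only; assumes a Kind node container
+    named "kind-worker" exists on the local runtime):
+
+        runtime = Docker()
+        print(runtime.runtime)            # "docker" or "podman"
+        runtime.reboot_instances("kind-worker")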
+
+    Assisted By: Claude Code
+    """

     def __init__(self):
-        self.client = docker.from_env()
+        self.client = None
+        self.runtime = 'unknown'
+
+        # Try multiple connection methods in order of preference
+        # Supports both Docker and Podman
+        connection_methods = [
+            ('unix:///var/run/docker.sock', 'Docker Unix socket'),
+        ]
+
+        # Add platform-specific Podman sockets
+        if platform.system() == 'Darwin':  # macOS
+            # On macOS, Podman uses podman-machine with socket typically at:
+            # ~/.local/share/containers/podman/machine/podman.sock
+            # This is often symlinked to /var/run/docker.sock
+            podman_machine_sock = os.path.expanduser('~/.local/share/containers/podman/machine/podman.sock')
+            if os.path.exists(podman_machine_sock):
+                connection_methods.append((f'unix://{podman_machine_sock}', 'Podman machine socket (macOS)'))
+        else:  # Linux
+            connection_methods.extend([
+                ('unix:///run/podman/podman.sock', 'Podman Unix socket (rootful)'),
+                ('unix:///run/user/{uid}/podman/podman.sock', 'Podman Unix socket (rootless)'),
+            ])
+
+        # Always try from_env as last resort
+        connection_methods.append(('from_env', 'Environment variables (DOCKER_HOST/CONTAINER_HOST)'))
+
+        for method, description in connection_methods:
+            try:
+                # Handle rootless Podman socket path with {uid} placeholder
+                if '{uid}' in method:
+                    uid = os.getuid()
+                    method = method.format(uid=uid)
+
+                if method == 'from_env':
+                    logging.info(f'Attempting to connect using {description}')
+                    self.client = docker.from_env()
+                else:
+                    logging.info(f'Attempting to connect using {description}: {method}')
+                    self.client = docker.DockerClient(base_url=method)
+
+                # Test the connection
+                self.client.ping()
+
+                # Detect runtime type
+                try:
+                    version_info = self.client.version()
+                    version_str = version_info.get('Version', '')
+                    if 'podman' in version_str.lower():
+                        self.runtime = 'podman'
+                    else:
+                        self.runtime = 'docker'
+                    logging.debug(f'Runtime version info: {version_str}')
+                except Exception as version_err:
+                    logging.warning(f'Could not detect runtime version: {version_err}')
+                    self.runtime = 'unknown'
+
+                logging.info(f'Successfully connected to {self.runtime} using {description}')
+
+                # Log available containers for debugging
+                try:
+                    containers = self.client.containers.list(all=True)
+                    logging.info(f'Found {len(containers)} total containers')
+                    for container in containers[:5]:  # Log first 5
+                        logging.debug(f'  Container: {container.name} ({container.status})')
+                except Exception as list_err:
+                    logging.warning(f'Could not list containers: {list_err}')
+
+                break
+
+            except Exception as e:
+                logging.warning(f'Failed to connect using {description}: {e}')
+                continue
+
+        if self.client is None:
+            error_msg = 'Failed to initialize container runtime client (Docker/Podman) with any connection method'
+            logging.error(error_msg)
+            logging.error('Attempted connection methods:')
+            for method, desc in connection_methods:
+                logging.error(f'  - {desc}: {method}')
+            raise RuntimeError(error_msg)
+
+        logging.info(f'Container runtime client initialized successfully: {self.runtime}')

     def get_container_id(self, node_name):
+        """Get the container ID for a given node name."""
         container = self.client.containers.get(node_name)
+        logging.info(f'Found {self.runtime} container for node {node_name}: {container.id}')
         return container.id

     # Start the node instance
     def start_instances(self, node_name):
+        """Start a container instance (works with both Docker and Podman)."""
+        logging.info(f'Starting {self.runtime} 
container for node: {node_name}') container = self.client.containers.get(node_name) container.start() + logging.info(f'Container {container.id} started successfully') # Stop the node instance def stop_instances(self, node_name): + """Stop a container instance (works with both Docker and Podman).""" + logging.info(f'Stopping {self.runtime} container for node: {node_name}') container = self.client.containers.get(node_name) container.stop() + logging.info(f'Container {container.id} stopped successfully') # Reboot the node instance def reboot_instances(self, node_name): + """Restart a container instance (works with both Docker and Podman).""" + logging.info(f'Restarting {self.runtime} container for node: {node_name}') container = self.client.containers.get(node_name) container.restart() + logging.info(f'Container {container.id} restarted successfully') # Terminate the node instance def terminate_instances(self, node_name): + """Stop and remove a container instance (works with both Docker and Podman).""" + logging.info(f'Terminating {self.runtime} container for node: {node_name}') container = self.client.containers.get(node_name) container.stop() container.remove() + logging.info(f'Container {container.id} terminated and removed successfully') class docker_node_scenarios(abstract_node_scenarios): + """ + Node chaos scenarios for containerized Kubernetes nodes. + + Supports both Docker and Podman container runtimes. This class provides + methods to inject chaos into Kubernetes nodes running as containers + (e.g., Kind clusters, Podman-based clusters). + """ def __init__(self, kubecli: KrknKubernetes, node_action_kube_check: bool, affected_nodes_status: AffectedNodeStatus): + logging.info('Initializing docker_node_scenarios (supports Docker and Podman)') super().__init__(kubecli, node_action_kube_check, affected_nodes_status) self.docker = Docker() self.node_action_kube_check = node_action_kube_check + logging.info(f'Node scenarios initialized successfully using {self.docker.runtime} runtime') # Node scenario to start the node def node_start_scenario(self, instance_kill_count, node, timeout, poll_interval): diff --git a/requirements.txt b/requirements.txt index 60653ff5..2b0f577f 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,7 +9,7 @@ azure-mgmt-network==27.0.0 itsdangerous==2.0.1 coverage==7.6.12 datetime==5.4 -docker==7.0.0 +docker>=6.0,<7.0 # docker 7.0+ has breaking changes with Unix sockets gitpython==3.1.41 google-auth==2.37.0 google-cloud-compute==1.22.0 @@ -28,7 +28,8 @@ pyfiglet==1.0.2 pytest==8.0.0 python-ipmi==0.5.4 python-openstackclient==6.5.0 -requests==2.32.4 +requests<2.32 # requests 2.32+ breaks Unix socket support (http+docker scheme) +requests-unixsocket>=0.4.0 # Required for Docker Unix socket support service_identity==24.1.0 PyYAML==6.0.1 setuptools==78.1.1 diff --git a/scenarios/kind/node_scenarios_example.yml b/scenarios/kind/node_scenarios_example.yml index e955795d..07f2b255 100644 --- a/scenarios/kind/node_scenarios_example.yml +++ b/scenarios/kind/node_scenarios_example.yml @@ -1,16 +1,18 @@ node_scenarios: - actions: # node chaos scenarios to be injected - node_stop_start_scenario - node_name: kind-worker # node on which scenario has to be injected; can set multiple names separated by comma - # label_selector: node-role.kubernetes.io/worker # when node_name is not specified, a node with matching label_selector is selected for node chaos scenario injection + # node_name: kind-control-plane # node on which scenario has to be injected; can set multiple names 
separated by comma + label_selector: kubernetes.io/hostname=kind-worker # when node_name is not specified, a node with matching label_selector is selected for node chaos scenario injection instance_count: 1 # Number of nodes to perform action/select that match the label selector runs: 1 # number of times to inject each scenario under actions (will perform on same node each time) timeout: 120 # duration to wait for completion of node scenario injection cloud_type: docker # cloud type on which Kubernetes/OpenShift runs + duration: 10 - actions: - node_reboot_scenario - node_name: kind-worker - # label_selector: node-role.kubernetes.io/infra + node_name: kind-control-plane + # label_selector: kubernetes.io/hostname=kind-worker instance_count: 1 timeout: 120 cloud_type: docker + kube_check: false From c3f6b1a7ff21a7835fcd381c4a7dd37f672d483c Mon Sep 17 00:00:00 2001 From: Paige Patton <64206430+paigerube14@users.noreply.github.com> Date: Tue, 16 Dec 2025 10:27:24 -0500 Subject: [PATCH 07/11] updating return code (#1001) Signed-off-by: Paige Patton --- run_kraken.py | 29 ++++++++++++++++------------- 1 file changed, 16 insertions(+), 13 deletions(-) diff --git a/run_kraken.py b/run_kraken.py index 5a86de33..b2a485e3 100644 --- a/run_kraken.py +++ b/run_kraken.py @@ -141,7 +141,7 @@ def main(options, command: Optional[str]) -> int: logging.error( "Cannot read the kubeconfig file at %s, please check" % kubeconfig_path ) - return 1 + return -1 logging.info("Initializing client to talk to the Kubernetes cluster") # Generate uuid for the run @@ -184,10 +184,10 @@ def main(options, command: Optional[str]) -> int: # Set up kraken url to track signal if not 0 <= int(port) <= 65535: logging.error("%s isn't a valid port number, please check" % (port)) - return 1 + return -1 if not signal_address: logging.error("Please set the signal address in the config") - return 1 + return -1 address = (signal_address, port) # If publish_running_status is False this should keep us going @@ -220,7 +220,7 @@ def main(options, command: Optional[str]) -> int: "invalid distribution selected, running openshift scenarios against kubernetes cluster." "Please set 'kubernetes' in config.yaml krkn.platform and try again" ) - return 1 + return -1 if cv != "": logging.info(cv) else: @@ -361,7 +361,7 @@ def main(options, command: Optional[str]) -> int: logging.error( f"impossible to find scenario {scenario_type}, plugin not found. 
Exiting" ) - sys.exit(1) + sys.exit(-1) failed_post_scenarios, scenario_telemetries = ( scenario_plugin.run_scenarios( @@ -522,7 +522,7 @@ def main(options, command: Optional[str]) -> int: else: logging.error("Alert profile is not defined") - return 1 + return -1 # sys.exit(1) if enable_metrics: logging.info(f'Capturing metrics using file {metrics_profile}') @@ -537,17 +537,20 @@ def main(options, command: Optional[str]) -> int: telemetry_json ) + # want to exit with 1 first to show failure of scenario + # even if alerts failing + if failed_post_scenarios: + logging.error( + "Post scenarios are still failing at the end of all iterations" + ) + # sys.exit(1) + return 1 + if post_critical_alerts > 0: logging.error("Critical alerts are firing, please check; exiting") # sys.exit(2) return 2 - if failed_post_scenarios: - logging.error( - "Post scenarios are still failing at the end of all iterations" - ) - # sys.exit(2) - return 2 if health_checker.ret_value != 0: logging.error("Health check failed for the applications, Please check; exiting") return health_checker.ret_value @@ -563,7 +566,7 @@ def main(options, command: Optional[str]) -> int: else: logging.error("Cannot find a config at %s, please check" % (cfg)) # sys.exit(1) - return 2 + return -1 return 0 From e7fa6bdebcb19054e6d80208a40e72b2bc915c9b Mon Sep 17 00:00:00 2001 From: Paige Patton <64206430+paigerube14@users.noreply.github.com> Date: Wed, 17 Dec 2025 15:09:15 -0500 Subject: [PATCH 08/11] checking chunk error in ci tests (#937) Signed-off-by: Paige Patton --- .github/workflows/tests.yml | 4 +- CI/tests/test_container.sh | 4 +- CI/tests/test_customapp_pod.sh | 2 +- CI/tests/test_pod.sh | 4 +- .../container/container_scenario_plugin.py | 4 +- .../pod_disruption_scenario_plugin.py | 87 +++++++++---------- requirements.txt | 2 +- scenarios/kind/pvc_scenario.yaml | 2 +- 8 files changed, 57 insertions(+), 52 deletions(-) diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index a8498bb1..7053c282 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -102,7 +102,7 @@ jobs: echo "test_pod_network_filter" >> ./CI/tests/functional_tests echo "test_pod_server" >> ./CI/tests/functional_tests echo "test_node" >> ./CI/tests/functional_tests - echo "test_pvc" >> ./CI/tests/functional_tests + # echo "test_pvc" >> ./CI/tests/functional_tests # Push on main only steps + all other functional to collect coverage # for the badge @@ -140,7 +140,7 @@ jobs: echo "test_pod_network_filter" >> ./CI/tests/functional_tests echo "test_pod_server" >> ./CI/tests/functional_tests echo "test_node" >> ./CI/tests/functional_tests - echo "test_pvc" >> ./CI/tests/functional_tests + # echo "test_pvc" >> ./CI/tests/functional_tests # Final common steps - name: Run Functional tests env: diff --git a/CI/tests/test_container.sh b/CI/tests/test_container.sh index 9042b021..271b43fa 100755 --- a/CI/tests/test_container.sh +++ b/CI/tests/test_container.sh @@ -16,8 +16,10 @@ function functional_test_container_crash { export post_config="" envsubst < CI/config/common_test_config.yaml > CI/config/container_config.yaml - python3 -m coverage run -a run_kraken.py -c CI/config/container_config.yaml + python3 -m coverage run -a run_kraken.py -c CI/config/container_config.yaml -d True echo "Container scenario test: Success" + + kubectl get pods -n kube-system -l component=etcd } functional_test_container_crash diff --git a/CI/tests/test_customapp_pod.sh b/CI/tests/test_customapp_pod.sh index c07869c8..6ae39230 100755 --- 
a/CI/tests/test_customapp_pod.sh +++ b/CI/tests/test_customapp_pod.sh @@ -11,7 +11,7 @@ function functional_test_customapp_pod_node_selector { export post_config="" envsubst < CI/config/common_test_config.yaml > CI/config/customapp_pod_config.yaml - python3 -m coverage run -a run_kraken.py -c CI/config/customapp_pod_config.yaml + python3 -m coverage run -a run_kraken.py -c CI/config/customapp_pod_config.yaml -d True echo "Pod disruption with node_label_selector test: Success" } diff --git a/CI/tests/test_pod.sh b/CI/tests/test_pod.sh index 97df491d..e09356ef 100755 --- a/CI/tests/test_pod.sh +++ b/CI/tests/test_pod.sh @@ -10,9 +10,11 @@ function functional_test_pod_crash { export scenario_file="scenarios/kind/pod_etcd.yml" export post_config="" envsubst < CI/config/common_test_config.yaml > CI/config/pod_config.yaml - cat CI/config/pod_config.yaml + python3 -m coverage run -a run_kraken.py -c CI/config/pod_config.yaml echo "Pod disruption scenario test: Success" + date + kubectl get pods -n kube-system -l component=etcd -o yaml } functional_test_pod_crash diff --git a/krkn/scenario_plugins/container/container_scenario_plugin.py b/krkn/scenario_plugins/container/container_scenario_plugin.py index 1c8e6ec2..21d67dcb 100644 --- a/krkn/scenario_plugins/container/container_scenario_plugin.py +++ b/krkn/scenario_plugins/container/container_scenario_plugin.py @@ -1,6 +1,7 @@ import logging import random import time +import traceback from asyncio import Future import yaml from krkn_lib.k8s import KrknKubernetes @@ -41,6 +42,7 @@ class ContainerScenarioPlugin(AbstractScenarioPlugin): logging.info("ContainerScenarioPlugin failed with unrecovered containers") return 1 except (RuntimeError, Exception) as e: + logging.error("Stack trace:\n%s", traceback.format_exc()) logging.error("ContainerScenarioPlugin exiting due to Exception %s" % e) return 1 else: @@ -50,7 +52,6 @@ class ContainerScenarioPlugin(AbstractScenarioPlugin): return ["container_scenarios"] def start_monitoring(self, kill_scenario: dict, lib_telemetry: KrknTelemetryOpenshift) -> Future: - namespace_pattern = f"^{kill_scenario['namespace']}$" label_selector = kill_scenario["label_selector"] recovery_time = kill_scenario["expected_recovery_time"] @@ -232,4 +233,5 @@ class ContainerScenarioPlugin(AbstractScenarioPlugin): timer += 5 logging.info("Waiting 5 seconds for containers to become ready") time.sleep(5) + return killed_container_list diff --git a/krkn/scenario_plugins/pod_disruption/pod_disruption_scenario_plugin.py b/krkn/scenario_plugins/pod_disruption/pod_disruption_scenario_plugin.py index 736f6d5f..df309cc9 100644 --- a/krkn/scenario_plugins/pod_disruption/pod_disruption_scenario_plugin.py +++ b/krkn/scenario_plugins/pod_disruption/pod_disruption_scenario_plugin.py @@ -2,7 +2,7 @@ import logging import random import time from asyncio import Future - +import traceback import yaml from krkn_lib.k8s import KrknKubernetes from krkn_lib.k8s.pod_monitor import select_and_monitor_by_namespace_pattern_and_label, \ @@ -74,6 +74,7 @@ class PodDisruptionScenarioPlugin(AbstractScenarioPlugin): return 1 except (RuntimeError, Exception) as e: + logging.error("Stack trace:\n%s", traceback.format_exc()) logging.error("PodDisruptionScenariosPlugin exiting due to Exception %s" % e) return 1 else: @@ -150,7 +151,7 @@ class PodDisruptionScenarioPlugin(AbstractScenarioPlugin): field_selector=combined_field_selector ) - def get_pods(self, name_pattern, label_selector, namespace, kubecli: KrknKubernetes, field_selector: str = None, node_label_selector: 
str = None, node_names: list = None, quiet: bool = False): + def get_pods(self, name_pattern, label_selector, namespace, kubecli: KrknKubernetes, field_selector: str = None, node_label_selector: str = None, node_names: list = None): if label_selector and name_pattern: logging.error('Only, one of name pattern or label pattern can be specified') return [] @@ -161,8 +162,7 @@ class PodDisruptionScenarioPlugin(AbstractScenarioPlugin): # If specific node names are provided, make multiple calls with field selector if node_names: - if not quiet: - logging.info(f"Targeting pods on {len(node_names)} specific nodes") + logging.debug(f"Targeting pods on {len(node_names)} specific nodes") all_pods = [] for node_name in node_names: pods = self._select_pods_with_field_selector( @@ -172,8 +172,7 @@ class PodDisruptionScenarioPlugin(AbstractScenarioPlugin): if pods: all_pods.extend(pods) - if not quiet: - logging.info(f"Found {len(all_pods)} target pods across {len(node_names)} nodes") + logging.debug(f"Found {len(all_pods)} target pods across {len(node_names)} nodes") return all_pods # Node label selector approach - use field selectors @@ -181,11 +180,10 @@ class PodDisruptionScenarioPlugin(AbstractScenarioPlugin): # Get nodes matching the label selector first nodes_with_label = kubecli.list_nodes(label_selector=node_label_selector) if not nodes_with_label: - logging.info(f"No nodes found with label selector: {node_label_selector}") + logging.debug(f"No nodes found with label selector: {node_label_selector}") return [] - if not quiet: - logging.info(f"Targeting pods on {len(nodes_with_label)} nodes with label: {node_label_selector}") + logging.debug(f"Targeting pods on {len(nodes_with_label)} nodes with label: {node_label_selector}") # Use field selector for each node all_pods = [] for node_name in nodes_with_label: @@ -196,8 +194,7 @@ class PodDisruptionScenarioPlugin(AbstractScenarioPlugin): if pods: all_pods.extend(pods) - if not quiet: - logging.info(f"Found {len(all_pods)} target pods across {len(nodes_with_label)} nodes") + logging.debug(f"Found {len(all_pods)} target pods across {len(nodes_with_label)} nodes") return all_pods # Standard pod selection (no node targeting) @@ -207,37 +204,40 @@ class PodDisruptionScenarioPlugin(AbstractScenarioPlugin): def killing_pods(self, config: InputParams, kubecli: KrknKubernetes): # region Select target pods + try: + namespace = config.namespace_pattern + if not namespace: + logging.error('Namespace pattern must be specified') + + pods = self.get_pods(config.name_pattern,config.label_selector,config.namespace_pattern, kubecli, field_selector="status.phase=Running", node_label_selector=config.node_label_selector, node_names=config.node_names) + exclude_pods = set() + if config.exclude_label: + _exclude_pods = self.get_pods("",config.exclude_label,config.namespace_pattern, kubecli, field_selector="status.phase=Running", node_label_selector=config.node_label_selector, node_names=config.node_names) + for pod in _exclude_pods: + exclude_pods.add(pod[0]) + + + pods_count = len(pods) + if len(pods) < config.kill: + logging.error("Not enough pods match the criteria, expected {} but found only {} pods".format( + config.kill, len(pods))) + return 1 - namespace = config.namespace_pattern - if not namespace: - logging.error('Namespace pattern must be specified') - return 2 + random.shuffle(pods) + for i in range(config.kill): + pod = pods[i] + logging.info(pod) + if pod[0] in exclude_pods: + logging.info(f"Excluding {pod[0]} from chaos") + else: + logging.info(f'Deleting 
pod {pod[0]}')
+                    kubecli.delete_pod(pod[0], pod[1])
+
+            return_val = self.wait_for_pods(config.label_selector,config.name_pattern,config.namespace_pattern, pods_count, config.duration, config.timeout, kubecli, config.node_label_selector, config.node_names)
+        except Exception:
+            raise

-        pods = self.get_pods(config.name_pattern,config.label_selector,config.namespace_pattern, kubecli, field_selector="status.phase=Running", node_label_selector=config.node_label_selector, node_names=config.node_names)
-        exclude_pods = set()
-        if config.exclude_label:
-            _exclude_pods = self.get_pods("",config.exclude_label,config.namespace_pattern, kubecli, field_selector="status.phase=Running", node_label_selector=config.node_label_selector, node_names=config.node_names)
-            for pod in _exclude_pods:
-                exclude_pods.add(pod[0])
-
-        pods_count = len(pods)
-        if len(pods) < config.kill:
-            logging.error("Not enough pods match the criteria, expected {} but found only {} pods".format(
-                config.kill, len(pods)))
-            return 2
-
-        random.shuffle(pods)
-        for i in range(config.kill):
-            pod = pods[i]
-            logging.info(pod)
-            if pod[0] in exclude_pods:
-                logging.info(f"Excluding {pod[0]} from chaos")
-            else:
-                logging.info(f'Deleting pod {pod[0]}')
-                kubecli.delete_pod(pod[0], pod[1])
-
-        ret = self.wait_for_pods(config.label_selector,config.name_pattern,config.namespace_pattern, pods_count, config.duration, config.timeout, kubecli, config.node_label_selector, config.node_names)
-        return ret
+        return return_val

     def wait_for_pods(
         self, label_selector, pod_name, namespace, pod_count, duration, wait_timeout, kubecli: KrknKubernetes, node_label_selector, node_names
@@ -246,10 +246,10 @@
         start_time = datetime.now()

         while not timeout:
-            pods = self.get_pods(name_pattern=pod_name, label_selector=label_selector,namespace=namespace, field_selector="status.phase=Running", kubecli=kubecli, node_label_selector=node_label_selector, node_names=node_names, quiet=True)
+            pods = self.get_pods(name_pattern=pod_name, label_selector=label_selector,namespace=namespace, field_selector="status.phase=Running", kubecli=kubecli, node_label_selector=node_label_selector, node_names=node_names)
             if pod_count == len(pods):
                 return 0
-
+
             time.sleep(duration)
             now_time = datetime.now()
@@ -258,6 +258,5 @@
         if time_diff.seconds > wait_timeout:
             logging.error("timeout while waiting for pods to come up")
             return 1
-
-        # should never get to this return
+
         return 0
diff --git a/requirements.txt b/requirements.txt
index 2b0f577f..494eb303 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -16,7 +16,7 @@ google-cloud-compute==1.22.0
 ibm_cloud_sdk_core==3.18.0
 ibm_vpc==0.20.0
 jinja2==3.1.6
-krkn-lib==5.1.12
+krkn-lib==5.1.13
 lxml==5.1.0
 kubernetes==34.1.0
 numpy==1.26.4
diff --git a/scenarios/kind/pvc_scenario.yaml b/scenarios/kind/pvc_scenario.yaml
index 9385ced9..28c8d272 100644
--- a/scenarios/kind/pvc_scenario.yaml
+++ b/scenarios/kind/pvc_scenario.yaml
@@ -2,6 +2,6 @@ pvc_scenario:
   pvc_name: kraken-test-pvc # Name of the target PVC
   pod_name: kraken-test-pod # Name of the pod where the PVC is mounted, it will be ignored if the pvc_name is defined
   namespace: kraken # Namespace where the PVC is
-  fill_percentage: 38 # Target percentage to fill up the cluster, value must be higher than current percentage, valid values are between 0 and 99
+  fill_percentage: 98 # Target percentage to fill up the cluster, value must be higher than current percentage, valid values are between 0 and 99
   duration: 10 # Duration in seconds for the fault
   block_size: 102400 # used only by dd if fallocate not present in the container

From 18385cba2b2b864ff562b6ef67e5ea3bf44d7feb Mon Sep 17 00:00:00 2001
From: Paige Patton <64206430+paigerube14@users.noreply.github.com>
Date: Wed, 17 Dec 2025 15:09:47 -0500
Subject: [PATCH 09/11] adding run unit tests on main (#1004)

Signed-off-by: Paige Patton
---
 .github/workflows/tests.yml | 1 -
 1 file changed, 1 deletion(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 7053c282..154e8e06 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -74,7 +74,6 @@ jobs:
          kubectl get nodes --show-labels=true
      # Pull request only steps
      - name: Run unit tests
-       if: github.event_name == 'pull_request'
        run: python -m coverage run -a -m unittest discover -s tests -v

      - name: Setup Pull Request Functional Tests

From 2458022248a5e76a8ca3b266c87a10d1f52ff5b3 Mon Sep 17 00:00:00 2001
From: Paige Patton <64206430+paigerube14@users.noreply.github.com>
Date: Thu, 18 Dec 2025 14:59:37 -0500
Subject: [PATCH 10/11] moving telemetry (#1008)

Signed-off-by: Paige Patton
---
 .github/workflows/tests.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml
index 154e8e06..718ed7de 100644
--- a/.github/workflows/tests.yml
+++ b/.github/workflows/tests.yml
@@ -123,11 +123,11 @@ jobs:
          yq -i '.performance_monitoring.prometheus_url="http://localhost:9090"' CI/config/common_test_config.yaml
          yq -i '.telemetry.username="${{secrets.TELEMETRY_USERNAME}}"' CI/config/common_test_config.yaml
          yq -i '.telemetry.password="${{secrets.TELEMETRY_PASSWORD}}"' CI/config/common_test_config.yaml
-         echo "test_telemetry" > ./CI/tests/functional_tests
          echo "test_service_hijacking" >> ./CI/tests/functional_tests
          echo "test_app_outages" >> ./CI/tests/functional_tests
          echo "test_container" >> ./CI/tests/functional_tests
          echo "test_pod" >> ./CI/tests/functional_tests
+         echo "test_telemetry" >> ./CI/tests/functional_tests
          echo "test_pod_error" >> ./CI/tests/functional_tests
          echo "test_customapp_pod" >> ./CI/tests/functional_tests
          echo "test_namespace" >> ./CI/tests/functional_tests

From b60b66de43330230855f4dc6c3a2f37c562aa896 Mon Sep 17 00:00:00 2001
From: Parag Kamble
Date: Fri, 19 Dec 2025 20:36:17 +0530
Subject: [PATCH 11/11] Fixed IBM node_reboot_scenario failure (#1007)

Signed-off-by: Parag Kamble
Co-authored-by: Paige Patton <64206430+paigerube14@users.noreply.github.com>
---
 .../node_actions/ibmcloud_node_scenarios.py | 58 ++++++++++++-------
 1 file changed, 36 insertions(+), 22 deletions(-)

diff --git a/krkn/scenario_plugins/node_actions/ibmcloud_node_scenarios.py b/krkn/scenario_plugins/node_actions/ibmcloud_node_scenarios.py
index 30bca718..039d766a 100644
--- a/krkn/scenario_plugins/node_actions/ibmcloud_node_scenarios.py
+++ b/krkn/scenario_plugins/node_actions/ibmcloud_node_scenarios.py
@@ -327,14 +327,20 @@ class ibm_node_scenarios(abstract_node_scenarios):
                 vm_stopped = self.ibmcloud.stop_instances(instance_id)
                 if vm_stopped:
                     self.ibmcloud.wait_until_stopped(instance_id, timeout, affected_node)
-            logging.info(
-                "Node with instance ID: %s is in stopped state" % node
-            )
-            logging.info(
-                "node_stop_scenario has been successfully injected!"
-            )
+                    logging.info(
+                        "Node with instance ID: %s is in stopped state" % node
+                    )
+                    logging.info(
+                        "node_stop_scenario has been successfully injected!"
+ ) + else: + logging.error( + "Failed to stop node instance %s. Stop command failed." % instance_id + ) + raise Exception("Stop command failed for instance %s" % instance_id) + self.affected_nodes_status.affected_nodes.append(affected_node) except Exception as e: - logging.error("Failed to stop node instance. Test Failed") + logging.error("Failed to stop node instance. Test Failed: %s" % str(e)) logging.error("node_stop_scenario injection failed!") @@ -345,24 +351,31 @@ class ibm_node_scenarios(abstract_node_scenarios): affected_node = AffectedNode(node, node_id=instance_id) logging.info("Starting node_reboot_scenario injection") logging.info("Rebooting the node %s " % (node)) - self.ibmcloud.reboot_instances(instance_id) - self.ibmcloud.wait_until_rebooted(instance_id, timeout, affected_node) - if self.node_action_kube_check: - nodeaction.wait_for_unknown_status( - node, timeout, affected_node + vm_rebooted = self.ibmcloud.reboot_instances(instance_id) + if vm_rebooted: + self.ibmcloud.wait_until_rebooted(instance_id, timeout, affected_node) + if self.node_action_kube_check: + nodeaction.wait_for_unknown_status( + node, timeout, self.kubecli, affected_node + ) + nodeaction.wait_for_ready_status( + node, timeout, self.kubecli, affected_node + ) + logging.info( + "Node with instance ID: %s has rebooted successfully" % node ) - nodeaction.wait_for_ready_status( - node, timeout, affected_node + logging.info( + "node_reboot_scenario has been successfully injected!" ) - logging.info( - "Node with instance ID: %s has rebooted successfully" % node - ) - logging.info( - "node_reboot_scenario has been successfully injected!" - ) + else: + logging.error( + "Failed to reboot node instance %s. Reboot command failed." % instance_id + ) + raise Exception("Reboot command failed for instance %s" % instance_id) + self.affected_nodes_status.affected_nodes.append(affected_node) except Exception as e: - logging.error("Failed to reboot node instance. Test Failed") + logging.error("Failed to reboot node instance. Test Failed: %s" % str(e)) logging.error("node_reboot_scenario injection failed!") @@ -383,7 +396,8 @@ class ibm_node_scenarios(abstract_node_scenarios): logging.info( "node_terminate_scenario has been successfully injected!" ) + self.affected_nodes_status.affected_nodes.append(affected_node) except Exception as e: - logging.error("Failed to terminate node instance. Test Failed") + logging.error("Failed to terminate node instance. Test Failed: %s" % str(e)) logging.error("node_terminate_scenario injection failed!")
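
The IBM fix in PATCH 11 settles on a command-then-verify shape for each node action: issue the cloud call, trust only its boolean result, raise when the command itself fails, verify the state transition, and register the affected node only after the action is confirmed. A minimal sketch of that shape (illustrative only; `client` stands in for the plugin's `self.ibmcloud` helper, and the method names mirror the diff above):

    import logging

    from krkn_lib.models.k8s import AffectedNode


    def stop_node(client, node, instance_id, timeout):
        """Illustrative command-then-verify sketch, not the plugin itself."""
        affected_node = AffectedNode(node, node_id=instance_id)
        # 1. Issue the cloud command and trust only its boolean result.
        if not client.stop_instances(instance_id):
            raise Exception("Stop command failed for instance %s" % instance_id)
        # 2. Verify the state transition before declaring the injection a success.
        client.wait_until_stopped(instance_id, timeout, affected_node)
        logging.info("node_stop_scenario has been successfully injected!")
        # 3. Record the affected node only once the action is confirmed.
        return affected_node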