diff --git a/config/config.yaml b/config/config.yaml
index adf22c35..cc69b6ab 100644
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -125,4 +125,5 @@ kubevirt_checks: # Utilizing virt che
     namespace: # Namespace where to find VMI's
     name: # Regex Name style of VMI's to watch, optional, will watch all VMI names in the namespace if left blank
     only_failures: False # Boolean of whether to show all VMI's failures and successful ssh connection (False), or only failure status' (True)
-    disconnected: False # Boolean of how to try to connect to the VMIs; if True will use the ip_address to try ssh from within a node, if false will use the name and uses virtctl to try to connect; Default is False
\ No newline at end of file
+    disconnected: False # Boolean of how to try to connect to the VMIs; if True will use the ip_address to try ssh from within a node, if false will use the name and uses virtctl to try to connect; Default is False
+    ssh_node: "" # If set, will be a backup way to ssh to a node. Will want to set to a node that isn't targeted in chaos
\ No newline at end of file
diff --git a/containers/Dockerfile.template b/containers/Dockerfile.template
index 15ee1c1b..e699e838 100644
--- a/containers/Dockerfile.template
+++ b/containers/Dockerfile.template
@@ -43,6 +43,9 @@ COPY --from=oc-build /tmp/oc/oc /usr/bin/oc
 RUN git clone https://github.com/krkn-chaos/krkn.git /home/krkn/kraken && \
     mkdir -p /home/krkn/.kube
 
+RUN mkdir -p /home/krkn/.ssh && \
+    chmod 700 /home/krkn/.ssh
+
 WORKDIR /home/krkn/kraken
 
 # default behaviour will be to build main
@@ -60,8 +63,14 @@ LABEL krknctl.title.global="Krkn Base Image"
 LABEL krknctl.description.global="This is the krkn base image."
 LABEL krknctl.input_fields.global='$KRKNCTL_INPUT'
 
+# SSH setup script
+RUN chmod +x /home/krkn/kraken/containers/setup-ssh.sh
+
+# Main entrypoint script
+RUN chmod +x /home/krkn/kraken/containers/entrypoint.sh
 RUN chown -R krkn:krkn /home/krkn && chmod 755 /home/krkn
 
 USER krkn
-ENTRYPOINT ["python3.9", "run_kraken.py"]
+
+ENTRYPOINT ["/bin/bash", "/home/krkn/kraken/containers/entrypoint.sh"]
 CMD ["--config=config/config.yaml"]
diff --git a/containers/entrypoint.sh b/containers/entrypoint.sh
new file mode 100644
index 00000000..eaadf7fc
--- /dev/null
+++ b/containers/entrypoint.sh
@@ -0,0 +1,7 @@
+#!/bin/bash
+# Run SSH setup
+./containers/setup-ssh.sh
+# Change to kraken directory
+
+# Execute the main command
+exec python3.9 run_kraken.py "$@"
diff --git a/containers/krknctl-input.json b/containers/krknctl-input.json
index 18d074b9..8e6f1211 100644
--- a/containers/krknctl-input.json
+++ b/containers/krknctl-input.json
@@ -31,6 +31,24 @@
     "separator": ",",
     "required": "false"
   },
+  {
+    "name": "ssh-public-key",
+    "short_description": "Krkn ssh public key path",
+    "description": "Sets the path where krkn will search for ssh public key (in container)",
+    "variable": "KRKN_SSH_PUBLIC",
+    "type": "string",
+    "default": "",
+    "required": "false"
+  },
+  {
+    "name": "ssh-private-key",
+    "short_description": "Krkn ssh private key path",
+    "description": "Sets the path where krkn will search for ssh private key (in container)",
+    "variable": "KRKN_SSH_PRIVATE",
+    "type": "string",
+    "default": "",
+    "required": "false"
+  },
   {
     "name": "krkn-kubeconfig",
     "short_description": "Krkn kubeconfig path",
@@ -474,6 +492,15 @@
     "default": "False",
     "required": "false"
   },
+  {
+    "name": "kubevirt-ssh-node",
+    "short_description": "KubeVirt node to ssh from",
+    "description": "KubeVirt node to ssh from, should be available whole chaos run",
+    "variable": "KUBE_VIRT_SSH_NODE",
+    "type": "string",
+    "default": "",
+    "required": "false"
+  },
   {
     "name": "krkn-debug",
     "short_description": "Krkn debug mode",
diff --git a/containers/setup-ssh.sh b/containers/setup-ssh.sh
new file mode 100644
index 00000000..e6dda629
--- /dev/null
+++ b/containers/setup-ssh.sh
@@ -0,0 +1,73 @@
+#!/bin/bash
+# Setup SSH key if mounted
+# Support multiple mount locations
+MOUNTED_PRIVATE_KEY_ALT="/secrets/id_rsa"
+MOUNTED_PRIVATE_KEY="/home/krkn/.ssh/id_rsa"
+MOUNTED_PUBLIC_KEY="/home/krkn/.ssh/id_rsa.pub"
+WORKING_KEY="/home/krkn/.ssh/id_rsa.key"
+
+# Determine which source to use
+SOURCE_KEY=""
+if [ -f "$MOUNTED_PRIVATE_KEY_ALT" ]; then
+    SOURCE_KEY="$MOUNTED_PRIVATE_KEY_ALT"
+    echo "Found SSH key at alternative location: $SOURCE_KEY"
+elif [ -f "$MOUNTED_PRIVATE_KEY" ]; then
+    SOURCE_KEY="$MOUNTED_PRIVATE_KEY"
+    echo "Found SSH key at default location: $SOURCE_KEY"
+fi
+
+# Setup SSH private key and create config for outbound connections
+if [ -n "$SOURCE_KEY" ]; then
+    echo "Setting up SSH private key from: $SOURCE_KEY"
+
+    # Check current permissions and ownership
+    ls -la "$SOURCE_KEY"
+
+    # Since the mounted key might be owned by root and we run as krkn user,
+    # we cannot modify it directly. Copy to a new location we can control.
+    echo "Copying SSH key to working location: $WORKING_KEY"
+
+    # Try to copy - if readable by anyone, this will work
+    if cp "$SOURCE_KEY" "$WORKING_KEY" 2>/dev/null || cat "$SOURCE_KEY" > "$WORKING_KEY" 2>/dev/null; then
+        chmod 600 "$WORKING_KEY"
+        echo "SSH key copied successfully"
+        ls -la "$WORKING_KEY"
+
+        # Verify the key is readable
+        if ssh-keygen -y -f "$WORKING_KEY" > /dev/null 2>&1; then
+            echo "SSH private key verified successfully"
+        else
+            echo "Warning: SSH key verification failed, but continuing anyway"
+        fi
+
+        # Create SSH config to use the working key
+        cat > /home/krkn/.ssh/config </dev/null || stat -f '%Su:%Sg' "$SOURCE_KEY" 2>/dev/null)"
+        echo ""
+        echo "Solutions:"
+        echo "1. Mount with world-readable permissions (less secure): chmod 644 /path/to/key"
+        echo "2. Mount to /secrets/id_rsa instead of /home/krkn/.ssh/id_rsa"
+        echo "3. Change ownership on host: chown \$(id -u):\$(id -g) /path/to/key"
+        exit 1
+    fi
+fi
+
+# Setup SSH public key if mounted (for inbound server access)
+if [ -f "$MOUNTED_PUBLIC_KEY" ]; then
+    echo "SSH public key already present at $MOUNTED_PUBLIC_KEY"
+    # Try to fix permissions (will fail silently if file is mounted read-only or owned by another user)
+    chmod 600 "$MOUNTED_PUBLIC_KEY" 2>/dev/null
+    if [ ! -f "/home/krkn/.ssh/authorized_keys" ]; then
+        cp "$MOUNTED_PUBLIC_KEY" /home/krkn/.ssh/authorized_keys
+        chmod 600 /home/krkn/.ssh/authorized_keys
+    fi
+fi
diff --git a/krkn/utils/VirtChecker.py b/krkn/utils/VirtChecker.py
index 09358d47..9594d13a 100644
--- a/krkn/utils/VirtChecker.py
+++ b/krkn/utils/VirtChecker.py
@@ -14,12 +14,12 @@ from krkn_lib.utils.functions import get_yaml_item_value
 class VirtChecker:
     current_iterations: int = 0
     ret_value = 0
-    def __init__(self, kubevirt_check_config, iterations, krkn_lib: KrknKubernetes, threads_limt=20):
+    def __init__(self, kubevirt_check_config, iterations, krkn_lib: KrknKubernetes, threads_limit=20):
         self.iterations = iterations
         self.namespace = get_yaml_item_value(kubevirt_check_config, "namespace", "")
         self.vm_list = []
         self.threads = []
-        self.threads_limit = threads_limt
+        self.threads_limit = threads_limit
         if self.namespace == "":
             logging.info("kube virt checks config is not defined, skipping them")
             return
@@ -28,6 +28,7 @@ class VirtChecker:
         self.disconnected = get_yaml_item_value(kubevirt_check_config, "disconnected", False)
         self.only_failures = get_yaml_item_value(kubevirt_check_config, "only_failures", False)
         self.interval = get_yaml_item_value(kubevirt_check_config, "interval", 2)
+        self.ssh_node = get_yaml_item_value(kubevirt_check_config, "ssh_node", "")
         try:
             self.kube_vm_plugin = KubevirtVmOutageScenarioPlugin()
             self.kube_vm_plugin.init_clients(k8s_client=krkn_lib)
@@ -40,15 +41,51 @@ class VirtChecker:
             node_name = vmi.get("status",{}).get("nodeName")
             vmi_name = vmi.get("metadata",{}).get("name")
             ip_address = vmi.get("status",{}).get("interfaces",[])[0].get("ipAddress")
-            self.vm_list.append(VirtCheck({'vm_name':vmi_name, 'ip_address': ip_address, 'namespace':self.namespace, 'node_name':node_name}))
+            self.vm_list.append(VirtCheck({'vm_name':vmi_name, 'ip_address': ip_address, 'namespace':self.namespace, 'node_name':node_name, "new_ip_address":""}))
 
-    def check_disconnected_access(self, ip_address: str, worker_name:str = ''):
-        
-        virtctl_vm_cmd = f"ssh core@{worker_name} 'ssh -o BatchMode=yes -o ConnectTimeout=2 -o StrictHostKeyChecking=no root@{ip_address} 2>&1 | grep Permission' && echo 'True' || echo 'False'"
-        if 'True' in invoke_no_exit(virtctl_vm_cmd):
-            return True
+    def check_disconnected_access(self, ip_address: str, worker_name:str = '', vmi_name: str = ''):
+        
+        virtctl_vm_cmd = f"ssh core@{worker_name} 'ssh -o BatchMode=yes -o ConnectTimeout=5 -o StrictHostKeyChecking=no root@{ip_address}'"
+
+        all_out = invoke_no_exit(virtctl_vm_cmd)
+        logging.debug(f"Checking disconnected access for {ip_address} on {worker_name} output: {all_out}")
+        virtctl_vm_cmd = f"ssh core@{worker_name} 'ssh -o BatchMode=yes -o ConnectTimeout=5 -o StrictHostKeyChecking=no root@{ip_address} 2>&1 | grep Permission' && echo 'True' || echo 'False'"
+        logging.debug(f"Checking disconnected access for {ip_address} on {worker_name} with command: {virtctl_vm_cmd}")
+        output = invoke_no_exit(virtctl_vm_cmd)
+        if 'True' in output:
+            logging.debug(f"Disconnected access for {ip_address} on {worker_name} is successful: {output}")
+            return True, None, None
         else:
-            return False
+            logging.debug(f"Disconnected access for {ip_address} on {worker_name} is failed: {output}")
+            vmi = self.kube_vm_plugin.get_vmi(vmi_name,self.namespace)
+            new_ip_address = vmi.get("status",{}).get("interfaces",[])[0].get("ipAddress")
+            new_node_name = vmi.get("status",{}).get("nodeName")
+            # if vm gets deleted, it'll start up with a new ip address
+            if new_ip_address != ip_address:
+                virtctl_vm_cmd = f"ssh core@{worker_name} 'ssh -o BatchMode=yes -o ConnectTimeout=5 -o StrictHostKeyChecking=no root@{new_ip_address} 2>&1 | grep Permission' && echo 'True' || echo 'False'"
+                logging.debug(f"Checking disconnected access for {new_ip_address} on {worker_name} with command: {virtctl_vm_cmd}")
+                new_output = invoke_no_exit(virtctl_vm_cmd)
+                logging.debug(f"Disconnected access for {ip_address} on {worker_name}: {new_output}")
+                if 'True' in new_output:
+                    return True, new_ip_address, None
+            # if node gets stopped, vmis will start up with a new node (and with new ip)
+            if new_node_name != worker_name:
+                virtctl_vm_cmd = f"ssh core@{new_node_name} 'ssh -o BatchMode=yes -o ConnectTimeout=5 -o StrictHostKeyChecking=no root@{new_ip_address} 2>&1 | grep Permission' && echo 'True' || echo 'False'"
+                logging.debug(f"Checking disconnected access for {new_ip_address} on {new_node_name} with command: {virtctl_vm_cmd}")
+                new_output = invoke_no_exit(virtctl_vm_cmd)
+                logging.debug(f"Disconnected access for {ip_address} on {new_node_name}: {new_output}")
+                if 'True' in new_output:
+                    return True, new_ip_address, new_node_name
+            # try to connect with a common "up" node as last resort
+            if self.ssh_node:
+                # using new_ip_address here since if it hasn't changed it'll match ip_address
+                virtctl_vm_cmd = f"ssh core@{self.ssh_node} 'ssh -o BatchMode=yes -o ConnectTimeout=5 -o StrictHostKeyChecking=no root@{new_ip_address} 2>&1 | grep Permission' && echo 'True' || echo 'False'"
+                logging.debug(f"Checking disconnected access for {new_ip_address} on {self.ssh_node} with command: {virtctl_vm_cmd}")
+                new_output = invoke_no_exit(virtctl_vm_cmd)
+                logging.debug(f"Disconnected access for {new_ip_address} on {self.ssh_node}: {new_output}")
+                if 'True' in new_output:
+                    return True, new_ip_address, None
+            return False, None, None
 
     def get_vm_access(self, vm_name: str = '', namespace: str = ''):
         """
@@ -57,8 +94,8 @@ class VirtChecker:
         :param namespace:
        :return: virtctl_status 'True' if successful, or an error message if it fails.
         """
-        virtctl_vm_cmd = f"virtctl ssh --local-ssh-opts='-o BatchMode=yes' --local-ssh-opts='-o PasswordAuthentication=no' --local-ssh-opts='-o ConnectTimeout=2' root@vmi/{vm_name} -n {namespace} 2>&1 |egrep 'denied|verification failed' && echo 'True' || echo 'False'"
-        check_virtctl_vm_cmd = f"virtctl ssh --local-ssh-opts='-o BatchMode=yes' --local-ssh-opts='-o PasswordAuthentication=no' --local-ssh-opts='-o ConnectTimeout=2' root@{vm_name} -n {namespace} 2>&1 |egrep 'denied|verification failed' && echo 'True' || echo 'False'"
+        virtctl_vm_cmd = f"virtctl ssh --local-ssh-opts='-o BatchMode=yes' --local-ssh-opts='-o PasswordAuthentication=no' --local-ssh-opts='-o ConnectTimeout=5' root@vmi/{vm_name} -n {namespace} 2>&1 |egrep 'denied|verification failed' && echo 'True' || echo 'False'"
+        check_virtctl_vm_cmd = f"virtctl ssh --local-ssh-opts='-o BatchMode=yes' --local-ssh-opts='-o PasswordAuthentication=no' --local-ssh-opts='-o ConnectTimeout=5' root@{vm_name} -n {namespace} 2>&1 |egrep 'denied|verification failed' && echo 'True' || echo 'False'"
         if 'True' in invoke_no_exit(check_virtctl_vm_cmd):
             return True
         else:
@@ -91,7 +128,16 @@ class VirtChecker:
                 if not self.disconnected:
                     vm_status = self.get_vm_access(vm.vm_name, vm.namespace)
                 else:
-                    vm_status = self.check_disconnected_access(vm.ip_address, vm.node_name)
+                    # if new ip address exists use it
+                    if vm.new_ip_address:
+                        vm_status, new_ip_address, new_node_name = self.check_disconnected_access(vm.new_ip_address, vm.node_name, vm.vm_name)
+                    # since we already set the new ip address, we don't want to reset to none each time
+                    else:
+                        vm_status, new_ip_address, new_node_name = self.check_disconnected_access(vm.ip_address, vm.node_name, vm.vm_name)
+                    if new_ip_address and vm.ip_address != new_ip_address:
+                        vm.new_ip_address = new_ip_address
+                    if new_node_name and vm.node_name != new_node_name:
+                        vm.node_name = new_node_name
             except Exception:
                 vm_status = False
 
@@ -103,7 +149,8 @@ class VirtChecker:
                         "namespace": vm.namespace,
                         "node_name": vm.node_name,
                         "status": vm_status,
-                        "start_timestamp": start_timestamp
+                        "start_timestamp": start_timestamp,
+                        "new_ip_address": vm.new_ip_address
                     }
                 else:
                     if vm_status != virt_check_tracker[vm.vm_name]["status"]:
@@ -113,6 +160,8 @@ class VirtChecker:
                         virt_check_tracker[vm.vm_name]["end_timestamp"] = end_timestamp.isoformat()
                         virt_check_tracker[vm.vm_name]["duration"] = duration
                        virt_check_tracker[vm.vm_name]["start_timestamp"] = start_timestamp.isoformat()
+                        if vm.new_ip_address:
+                            virt_check_tracker[vm.vm_name]["new_ip_address"] = vm.new_ip_address
             if self.only_failures:
                 if not virt_check_tracker[vm.vm_name]["status"]:
                     virt_check_telemetry.append(VirtCheck(virt_check_tracker[vm.vm_name]))
diff --git a/requirements.txt b/requirements.txt
index 5d074aa9..a986f165 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -16,9 +16,9 @@ google-cloud-compute==1.22.0
 ibm_cloud_sdk_core==3.18.0
 ibm_vpc==0.20.0
 jinja2==3.1.6
-krkn-lib==5.1.9
+krkn-lib==5.1.11
 lxml==5.1.0
-kubernetes==28.1.0
+kubernetes==34.1.0
 numpy==1.26.4
 pandas==2.2.0
 openshift-client==1.0.21