Fix: Resolve ingress network chaos plugin issue

Added network_chaos to the plugin steps, made the job wait time depend on the test duration, and set the default wait_duration to 30s. Signed-off-by: yogananth subramanian <ysubrama@redhat.com>
Copy oc and kubectl clients to additional paths
2026-02-16 19:09:53 +00:00 · 2024-03-22 14:48:17 -04:00 · 2024-03-21 11:29:50 -04:00 · 2024-03-19 15:33:25 -04:00 · 2024-03-19 14:44:47 -04:00 · 2024-03-17 23:07:46 -04:00
23 changed files with 339 additions and 131 deletions
--- a/.gitignore
+++ b/.gitignore
@@ -16,6 +16,7 @@ __pycache__/*
 *.out
 kube-burner*
 kube_burner*
+recommender_*.json

 # Project files
 .ropeproject
--- a/CI/config/common_test_config.yaml
+++ b/CI/config/common_test_config.yaml
@@ -29,7 +29,7 @@ tunings:
    daemon_mode: False                                     # Iterations are set to infinity which means that the kraken will cause chaos forever.
 telemetry:
    enabled: False                                           # enable/disables the telemetry collection feature
-    api_url: https://ulnmf9xv7j.execute-api.us-west-2.amazonaws.com/production #telemetry service endpoint
+    api_url: https://yvnn4rfoi7.execute-api.us-west-2.amazonaws.com/test #telemetry service endpoint
    username: $TELEMETRY_USERNAME                                      # telemetry service username
    password: $TELEMETRY_PASSWORD                                      # telemetry service password
    prometheus_namespace: 'prometheus-k8s'                                # prometheus namespace
@@ -49,3 +49,4 @@ telemetry:
        - "(\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}\\.\\d+Z).+"      # 2023-09-15T11:20:36.123425532Z log
    oc_cli_path: /usr/bin/oc                                # optional, if not specified will be search in $PATH
    events_backup: True                                     # enables/disables cluster events collection
+    telemetry_group: "funtests"
--- a/CI/tests/common.sh
+++ b/CI/tests/common.sh
@@ -8,8 +8,14 @@ function finish {
 }

 function error {
-    echo "Error caught."
-    ERRORED=true
+    exit_code=$?
+    if [ $exit_code == 1 ]
+    then
+      echo "Error caught."
+      ERRORED=true
+    else
+      echo "Exit code greater than zero detected: $exit_code"
+    fi
 }

 function get_node {
--- a/CI/tests/test_telemetry.sh
+++ b/CI/tests/test_telemetry.sh
@@ -14,16 +14,20 @@ function functional_test_telemetry {
  export RUN_TAG="funtest-telemetry"
  yq -i '.telemetry.enabled=True' CI/config/common_test_config.yaml
  yq -i '.telemetry.full_prometheus_backup=True' CI/config/common_test_config.yaml
+  yq -i '.performance_monitoring.check_critical_alerts=True' CI/config/common_test_config.yaml
+  yq -i '.performance_monitoring.prometheus_url="http://localhost:9090"' CI/config/common_test_config.yaml
  yq -i '.telemetry.run_tag=env(RUN_TAG)' CI/config/common_test_config.yaml
+
  export scenario_type="arcaflow_scenarios"
  export scenario_file="scenarios/arcaflow/cpu-hog/input.yaml"
  export post_config=""
  envsubst < CI/config/common_test_config.yaml > CI/config/telemetry.yaml
  python3 -m coverage run -a run_kraken.py -c CI/config/telemetry.yaml
-  RUN_FOLDER=`cat CI/out/test_telemetry.out | grep amazonaws.com | sed -rn "s#.*https:\/\/.*\/download/(.*)#\1#p"`
+  RUN_FOLDER=`cat CI/out/test_telemetry.out | grep amazonaws.com | sed -rn "s#.*https:\/\/.*\/files/(.*)#\1#p"`
  $AWS_CLI s3 ls "s3://$AWS_BUCKET/$RUN_FOLDER/" | awk '{ print $4 }' > s3_remote_files
  echo "checking if telemetry files are uploaded on s3"
  cat s3_remote_files | grep events-00.json || ( echo "FAILED: events-00.json not uploaded" && exit 1 )
+  cat s3_remote_files | grep critical-alerts-00.json || ( echo "FAILED: critical-alerts-00.json not uploaded" && exit 1 )
  cat s3_remote_files | grep prometheus-00.tar || ( echo "FAILED: prometheus backup not uploaded" && exit 1 )
  cat s3_remote_files | grep telemetry.json || ( echo "FAILED: telemetry.json not uploaded" && exit 1 )
  echo "all files uploaded!"
--- a/README.md
+++ b/README.md
@@ -1,5 +1,4 @@
 # Krkn aka Kraken
-[![Docker Repository on Quay](https://quay.io/repository/krkn-chaos/krkn/status "Docker Repository on Quay")](https://quay.io/repository/krkn-chaos/krkn?tab=tags&tag=latest)
 ![Workflow-Status](https://github.com/krkn-chaos/krkn/actions/workflows/docker-image.yml/badge.svg)

 ![Krkn logo](media/logo.png)
--- a/config/config.yaml
+++ b/config/config.yaml
@@ -1,5 +1,5 @@
 kraken:
-    distribution: openshift                                # Distribution can be kubernetes or openshift
+    distribution: kubernetes                                # Distribution can be kubernetes or openshift
    kubeconfig_path: ~/.kube/config                        # Path to kubeconfig
    exit_on_failure: False                                 # Exit when a post action scenario fails
    publish_kraken_status: True                            # Can be accessed at http://0.0.0.0:8081
@@ -15,7 +15,7 @@ kraken:
        - application_outages:
            - scenarios/openshift/app_outage.yaml
        - container_scenarios:                             # List of chaos pod scenarios to load
-            - -    scenarios/openshift/container_etcd.yml
+            - - scenarios/openshift/container_etcd.yml
        - plugin_scenarios:
            - scenarios/openshift/etcd.yml
            - scenarios/openshift/regex_openshift_pod_kill.yml
@@ -23,7 +23,7 @@ kraken:
            - scenarios/openshift/network_chaos_ingress.yml
            - scenarios/openshift/prom_kill.yml
        - node_scenarios:                                  # List of chaos node scenarios to load
-            -   scenarios/openshift/node_scenarios_example.yml
+            - scenarios/openshift/node_scenarios_example.yml
        - plugin_scenarios:
            - scenarios/openshift/openshift-apiserver.yml
            - scenarios/openshift/openshift-kube-apiserver.yml
@@ -51,7 +51,7 @@ cerberus:
 performance_monitoring:
    deploy_dashboards: False                              # Install a mutable grafana and load the performance dashboards. Enable this only when running on OpenShift
    repo: "https://github.com/cloud-bulldozer/performance-dashboards.git"
-    prometheus_url:                                       # The prometheus url/route is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes.
+    prometheus_url:                                      # The prometheus url/route is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes.
    prometheus_bearer_token:                              # The bearer token is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. This is needed to authenticate with prometheus.
    uuid:                                                 # uuid for the run is generated by default if not set
    enable_alerts: False                                  # Runs the queries specified in the alert profile and displays the info or exits 1 when severity=error
@@ -65,14 +65,19 @@ telemetry:
    enabled: False                                           # enable/disables the telemetry collection feature
    api_url: https://ulnmf9xv7j.execute-api.us-west-2.amazonaws.com/production #telemetry service endpoint
    username: username                                      # telemetry service username
-    password: password                                      # telemetry service password
+    password: password                                    # telemetry service password
    prometheus_backup: True                                 # enables/disables prometheus data collection
+    prometheus_namespace: ""                                # namespace where prometheus is deployed (if distribution is kubernetes)
+    prometheus_container_name: ""                           # name of the prometheus container name (if distribution is kubernetes)
+    prometheus_pod_name: ""                                 # name of the prometheus pod (if distribution is kubernetes)
    full_prometheus_backup: False                           # if is set to False only the /prometheus/wal folder will be downloaded.
    backup_threads: 5                                       # number of telemetry download/upload threads
    archive_path: /tmp                                      # local path where the archive files will be temporarly stored
    max_retries: 0                                          # maximum number of upload retries (if 0 will retry forever)
    run_tag: ''                                             # if set, this will be appended to the run folder in the bucket (useful to group the runs)
-    archive_size: 500000                                     # the size of the prometheus data archive size in KB. The lower the size of archive is
+    archive_size: 500000
+    telemetry_group: ''                                     # if set will archive the telemetry in the S3 bucket on a folder named after the value, otherwise will use "default"
+    # the size of the prometheus data archive size in KB. The lower the size of archive is
                                                            # the higher the number of archive files will be produced and uploaded (and processed by backup_threads
                                                            # simultaneously).
                                                            # For unstable/slow connection is better to keep this value low
@@ -85,6 +90,9 @@ telemetry:
     - "(\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}\\.\\d+Z).+"      # 2023-09-15T11:20:36.123425532Z log
    oc_cli_path: /usr/bin/oc                                # optional, if not specified will be search in $PATH
    events_backup: True                                     # enables/disables cluster events collection
+elastic: 
+    elastic_url: ""                                         # To track results in elasticsearch, give url to server here; will post telemetry details when url and index not blank
+    elastic_index: ""                                       # Elastic search index pattern to post results to



--- a/config/config_performance.yaml
+++ b/config/config_performance.yaml
@@ -77,3 +77,8 @@ telemetry:
     - "kinit (\\d+/\\d+/\\d+\\s\\d{2}:\\d{2}:\\d{2})\\s+"          # kinit 2023/09/15 11:20:36 log
     - "(\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}\\.\\d+Z).+"      # 2023-09-15T11:20:36.123425532Z log
    oc_cli_path: /usr/bin/oc                                # optional, if not specified will be search in $PATH
+elastic: 
+    elastic_url: ""                                         # To track results in elasticsearch, give url to server here; will post telemetry details when url and index not blank
+    elastic_index: ""                                       # Elastic search index pattern to post results to
+
+
--- a/config/recommender_config.yaml
+++ b/config/recommender_config.yaml
@@ -7,6 +7,8 @@ auth_token: <Auth_Token>
 scrape_duration: 10m
 chaos_library: "kraken"
 log_level: INFO
+json_output_file: False
+json_output_folder_path:

 # for output purpose only do not change if not needed
 chaos_tests:
@@ -26,4 +28,8 @@ chaos_tests:
    - pod_network_chaos
  MEM:
    - node_memory_hog
-    - pvc_disk_fill
+    - pvc_disk_fill
+
+threshold: .7
+cpu_threshold: .5
+mem_threshold: .5
--- a/containers/Dockerfile
+++ b/containers/Dockerfile
@@ -12,7 +12,7 @@ COPY --from=azure-cli /usr/local/bin/az /usr/bin/az
 # Install dependencies
 RUN yum install -y git python39 python3-pip jq gettext wget && \
    python3.9 -m pip install -U pip && \
-    git clone https://github.com/krkn-chaos/krkn.git --branch v1.5.7 /root/kraken && \
+    git clone https://github.com/krkn-chaos/krkn.git --branch v1.5.10 /root/kraken && \
    mkdir -p /root/.kube && cd /root/kraken && \
    pip3.9 install -r requirements.txt && \
    pip3.9 install virtualenv && \
@@ -20,7 +20,7 @@ RUN yum install -y git python39 python3-pip jq gettext wget && \

 # Get Kubernetes and OpenShift clients from stable releases
 WORKDIR /tmp
-RUN wget https://mirror.openshift.com/pub/openshift-v4/clients/ocp/stable/openshift-client-linux.tar.gz && tar -xvf openshift-client-linux.tar.gz && cp oc /usr/local/bin/oc && cp kubectl /usr/local/bin/kubectl
+RUN wget https://mirror.openshift.com/pub/openshift-v4/clients/ocp/stable/openshift-client-linux.tar.gz && tar -xvf openshift-client-linux.tar.gz && cp oc /usr/local/bin/oc && cp oc /usr/bin/oc && cp kubectl /usr/local/bin/kubectl && cp kubectl /usr/bin/kubectl

 WORKDIR /root/kraken

--- a/containers/Dockerfile-ppc64le
+++ b/containers/Dockerfile-ppc64le
@@ -14,7 +14,7 @@ COPY --from=azure-cli /usr/local/bin/az /usr/bin/az
 # Install dependencies
 RUN yum install -y git python39 python3-pip jq gettext wget && \
    python3.9 -m pip install -U pip && \
-    git clone https://github.com/redhat-chaos/krkn.git --branch v1.5.7 /root/kraken && \
+    git clone https://github.com/redhat-chaos/krkn.git --branch v1.5.10 /root/kraken && \
    mkdir -p /root/.kube && cd /root/kraken && \
    pip3.9 install -r requirements.txt && \
    pip3.9 install virtualenv && \
@@ -22,7 +22,7 @@ RUN yum install -y git python39 python3-pip jq gettext wget && \

 # Get Kubernetes and OpenShift clients from stable releases
 WORKDIR /tmp
-RUN wget https://mirror.openshift.com/pub/openshift-v4/clients/ocp/stable/openshift-client-linux.tar.gz && tar -xvf openshift-client-linux.tar.gz && cp oc /usr/local/bin/oc && cp kubectl /usr/local/bin/kubectl
+RUN wget https://mirror.openshift.com/pub/openshift-v4/clients/ocp/stable/openshift-client-linux.tar.gz && tar -xvf openshift-client-linux.tar.gz && cp oc /usr/local/bin/oc && cp oc /usr/bin/oc && cp kubectl /usr/local/bin/kubectl && cp kubectl /usr/bin/kubectl

 WORKDIR /root/kraken

--- a/docs/installation.md
+++ b/docs/installation.md
@@ -22,7 +22,7 @@ the capabilities of the current supported scenarios.
 Pick the latest stable release to install [here](https://github.com/krkn-chaos/krkn/releases).
 ```
 $ git clone https://github.com/krkn-chaos/krkn.git --branch <release version>
-$ cd kraken
+$ cd krkn
 ```

 #### Install the dependencies
--- a/kraken/chaos_recommender/analysis.py
+++ b/kraken/chaos_recommender/analysis.py
@@ -4,13 +4,10 @@ import pandas as pd
 import kraken.chaos_recommender.kraken_tests as kraken_tests
 import time

-threshold = .7  # Adjust the threshold as needed
-heatmap_cpu_threshold = .5
-heatmap_mem_threshold = .5
-
 KRAKEN_TESTS_PATH = "./kraken_chaos_tests.txt"

-#Placeholder, this should be done with topology
+
+# Placeholder, this should be done with topology
 def return_critical_services():
    return ["web", "cart"]

@@ -19,6 +16,7 @@ def load_telemetry_data(file_path):
    data = pd.read_csv(file_path, delimiter=r"\s+")
    return data

+
 def calculate_zscores(data):
    zscores = pd.DataFrame()
    zscores["Service"] = data["service"]
@@ -27,7 +25,8 @@ def calculate_zscores(data):
    zscores["Network"] = (data["NETWORK"] - data["NETWORK"].mean()) / data["NETWORK"].std()
    return zscores

-def identify_outliers(data):
+
+def identify_outliers(data, threshold):
    outliers_cpu = data[data["CPU"] > threshold]["Service"].tolist()
    outliers_memory = data[data["Memory"] > threshold]["Service"].tolist()
    outliers_network = data[data["Network"] > threshold]["Service"].tolist()
@@ -47,44 +46,64 @@ def get_services_above_heatmap_threshold(dataframe, cpu_threshold, mem_threshold
    return cpu_services, mem_services


-def analysis(file_path, chaos_tests_config):
+def analysis(file_path, chaos_tests_config, threshold, heatmap_cpu_threshold, heatmap_mem_threshold):
    # Load the telemetry data from file
+    logging.info("Fetching the Telemetry data")
    data = load_telemetry_data(file_path)

    # Calculate Z-scores for CPU, Memory, and Network columns
    zscores = calculate_zscores(data)

    # Identify outliers
-    outliers_cpu, outliers_memory, outliers_network = identify_outliers(zscores)
+    logging.info("Identifying outliers")
+    outliers_cpu, outliers_memory, outliers_network = identify_outliers(zscores, threshold)
    cpu_services, mem_services = get_services_above_heatmap_threshold(data, heatmap_cpu_threshold, heatmap_mem_threshold)

-    # Display the identified outliers
-    logging.info("======================== Profiling ==================================")
-    logging.info(f"CPU outliers: {outliers_cpu}")
-    logging.info(f"Memory outliers: {outliers_memory}")
-    logging.info(f"Network outliers: {outliers_network}")
-    logging.info("===================== HeatMap Analysis ==============================")
+    analysis_data = analysis_json(outliers_cpu, outliers_memory,
+                                  outliers_network, cpu_services,
+                                  mem_services, chaos_tests_config)
+
+    if not cpu_services:
+        logging.info("There are no services that are using significant CPU compared to their assigned limits (infinite in case no limits are set).")
+    if not mem_services:
+        logging.info("There are no services that are using significant MEMORY compared to their assigned limits (infinite in case no limits are set).")
+    time.sleep(2)
+
+    logging.info("Please check data in utilisation.txt for further analysis")
+
+    return analysis_data
+
+
+def analysis_json(outliers_cpu, outliers_memory, outliers_network,
+                  cpu_services, mem_services, chaos_tests_config):
+
+    profiling = {
+        "cpu_outliers": outliers_cpu,
+        "memory_outliers": outliers_memory,
+        "network_outliers": outliers_network
+    }
+
+    heatmap = {
+        "services_with_cpu_heatmap_above_threshold": cpu_services,
+        "services_with_mem_heatmap_above_threshold": mem_services
+    }
+
+    recommendations = {}

    if cpu_services:
-        logging.info("Services with CPU_HEATMAP above threshold:", cpu_services)
-    else:
-        logging.info("There are no services that are using siginificant CPU compared to their assigned limits (infinite in case no limits are set).")
+        cpu_recommend = {"services": cpu_services,
+                         "tests": chaos_tests_config['CPU']}
+        recommendations["cpu_services_recommendations"] = cpu_recommend
+
    if mem_services:
-        logging.info("Services with MEM_HEATMAP above threshold:", mem_services)
-    else:
-        logging.info("There are no services that are using siginificant MEMORY compared to their assigned limits (infinite in case no limits are set).")
-    time.sleep(2)
-    logging.info("======================= Recommendations =============================")
-    if cpu_services:
-        logging.info(f"Recommended tests for {str(cpu_services)}  :\n {chaos_tests_config['CPU']}")
-        logging.info("\n")
-    if mem_services:
-        logging.info(f"Recommended tests for {str(mem_services)}  :\n {chaos_tests_config['MEM']}")
-        logging.info("\n")
+        mem_recommend = {"services": mem_services,
+                         "tests": chaos_tests_config['MEM']}
+        recommendations["mem_services_recommendations"] = mem_recommend

    if outliers_network:
-        logging.info(f"Recommended tests for  str(outliers_network)  :\n {chaos_tests_config['NETWORK']}")
-        logging.info("\n")
+        outliers_network_recommend = {"outliers_networks": outliers_network,
+                                      "tests": chaos_tests_config['NETWORK']}
+        recommendations["outliers_network_recommendations"] = (
+            outliers_network_recommend)

-    logging.info("\n")
-    logging.info("Please check data in utilisation.txt for further analysis")
+    return [profiling, heatmap, recommendations]
--- a/kraken/chaos_recommender/prometheus.py
+++ b/kraken/chaos_recommender/prometheus.py
@@ -1,6 +1,5 @@
 import logging

-import pandas
 from prometheus_api_client import PrometheusConnect
 import pandas as pd
 import urllib3
@@ -8,6 +7,7 @@ import urllib3

 saved_metrics_path = "./utilisation.txt"

+
 def convert_data_to_dataframe(data, label):
    df = pd.DataFrame()
    df['service'] = [item['metric']['pod'] for item in data]
@@ -25,6 +25,7 @@ def convert_data(data, service):
        result[pod_name] = value
    return result.get(service, '100000000000') # for those pods whose limits are not defined they can take as much resources, there assigning a very high value

+
 def save_utilization_to_file(cpu_data, cpu_limits_result, mem_data, mem_limits_result, network_data, filename):
    df_cpu = convert_data_to_dataframe(cpu_data, "CPU")
    merged_df = pd.DataFrame(columns=['service','CPU','CPU_LIMITS','MEM','MEM_LIMITS','NETWORK'])
@@ -39,8 +40,6 @@ def save_utilization_to_file(cpu_data, cpu_limits_result, mem_data, mem_limits_r
                    "NETWORK" : convert_data(network_data, s)}, index=[0])
        merged_df = pd.concat([merged_df, new_row_df], ignore_index=True)

-
-
    # Convert columns to string
    merged_df['CPU'] = merged_df['CPU'].astype(str)
    merged_df['MEM'] = merged_df['MEM'].astype(str)
@@ -57,40 +56,39 @@ def save_utilization_to_file(cpu_data, cpu_limits_result, mem_data, mem_limits_r

    merged_df.to_csv(filename, sep='\t', index=False)

+
 def fetch_utilization_from_prometheus(prometheus_endpoint, auth_token, namespace, scrape_duration):
    urllib3.disable_warnings()
    prometheus = PrometheusConnect(url=prometheus_endpoint, headers={'Authorization':'Bearer {}'.format(auth_token)}, disable_ssl=True)

    # Fetch CPU utilization
+    logging.info("Fetching utilization")
    cpu_query = 'sum (rate (container_cpu_usage_seconds_total{image!="", namespace="%s"}[%s])) by (pod) *1000' % (namespace,scrape_duration)
-    logging.info(cpu_query)
    cpu_result = prometheus.custom_query(cpu_query)
-    cpu_data = cpu_result
-

    cpu_limits_query = '(sum by (pod) (kube_pod_container_resource_limits{resource="cpu", namespace="%s"}))*1000' %(namespace)
-    logging.info(cpu_limits_query)
    cpu_limits_result = prometheus.custom_query(cpu_limits_query)

-
    mem_query = 'sum by (pod) (avg_over_time(container_memory_usage_bytes{image!="", namespace="%s"}[%s]))' % (namespace, scrape_duration)
-    logging.info(mem_query)
    mem_result = prometheus.custom_query(mem_query)
-    mem_data = mem_result

    mem_limits_query = 'sum by (pod) (kube_pod_container_resource_limits{resource="memory", namespace="%s"})  ' %(namespace)
-    logging.info(mem_limits_query)
    mem_limits_result = prometheus.custom_query(mem_limits_query)

-
    network_query = 'sum by (pod) ((avg_over_time(container_network_transmit_bytes_total{namespace="%s"}[%s])) + \
    (avg_over_time(container_network_receive_bytes_total{namespace="%s"}[%s])))' % (namespace, scrape_duration, namespace, scrape_duration)
    network_result = prometheus.custom_query(network_query)
-    logging.info(network_query)
-    network_data = network_result
-
-
-    save_utilization_to_file(cpu_data, cpu_limits_result, mem_data, mem_limits_result, network_data, saved_metrics_path)
-    return saved_metrics_path
+
+    save_utilization_to_file(cpu_result, cpu_limits_result, mem_result, mem_limits_result, network_result, saved_metrics_path)
+    queries = json_queries(cpu_query, cpu_limits_query, mem_query, mem_limits_query)
+    return saved_metrics_path, queries


+def json_queries(cpu_query, cpu_limits_query, mem_query, mem_limits_query):
+    queries = {
+        "cpu_query": cpu_query,
+        "cpu_limit_query": cpu_limits_query,
+        "memory_query": mem_query,
+        "memory_limit_query": mem_limits_query
+    }
+    return queries
--- a/kraken/plugins/init.py
+++ b/kraken/plugins/init.py
@@ -213,6 +213,12 @@ PLUGINS = Plugins(
                "error"
            ]
        ),
+        PluginStep(
+            network_chaos,
+            [
+                "error"
+            ]
+        ),        
        PluginStep(
            pod_outage,
            [
--- a/kraken/plugins/network/ingress_shaping.py
+++ b/kraken/plugins/network/ingress_shaping.py
@@ -62,7 +62,7 @@ class NetworkScenarioConfig:
        typing.Optional[int],
        validation.min(1)
    ] = field(
-        default=300,
+        default=30,
        metadata={
            "name": "Wait Duration",
            "description":
@@ -864,7 +864,7 @@ def network_chaos(cfg: NetworkScenarioConfig) -> typing.Tuple[
                )
            logging.info("Waiting for parallel job to finish")
            start_time = int(time.time())
-            wait_for_job(batch_cli, job_list[:], cfg.wait_duration)
+            wait_for_job(batch_cli, job_list[:], cfg.test_duration+100)
            end_time = int(time.time())
            if publish:
                cerberus.publish_kraken_status(
@@ -893,7 +893,7 @@ def network_chaos(cfg: NetworkScenarioConfig) -> typing.Tuple[
                    )
                logging.info("Waiting for serial job to finish")
                start_time = int(time.time())
-                wait_for_job(batch_cli, job_list[:], cfg.wait_duration)
+                wait_for_job(batch_cli, job_list[:], cfg.test_duration+100)
                logging.info("Deleting jobs")
                delete_jobs(cli, batch_cli, job_list[:])
                job_list = []
--- a/kraken/prometheus/client.py
+++ b/kraken/prometheus/client.py
@@ -1,10 +1,13 @@
 import datetime
 import os.path
+from typing import Optional
+
 import urllib3
 import logging
 import sys

 import yaml
+from krkn_lib.models.krkn import ChaosRunAlertSummary, ChaosRunAlert
 from krkn_lib.prometheus.krkn_prometheus import KrknPrometheus
 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
 def alerts(prom_cli: KrknPrometheus, start_time, end_time, alert_profile):
@@ -27,4 +30,59 @@ def alerts(prom_cli: KrknPrometheus, start_time, end_time, alert_profile):

            prom_cli.process_alert(alert,
                                   datetime.datetime.fromtimestamp(start_time),
-                                   datetime.datetime.fromtimestamp(end_time))
+                                   datetime.datetime.fromtimestamp(end_time))
+
+
+def critical_alerts(prom_cli: KrknPrometheus,
+                    summary: ChaosRunAlertSummary,
+                    run_id,
+                    scenario,
+                    start_time,
+                    end_time):
+    summary.scenario = scenario
+    summary.run_id = run_id
+    query = r"""ALERTS{severity="critical"}"""
+    logging.info("Checking for critical alerts firing post chaos")
+
+    during_critical_alerts = prom_cli.process_prom_query_in_range(
+        query,
+        start_time=datetime.datetime.fromtimestamp(start_time),
+        end_time=end_time
+
+    )
+
+    for alert in during_critical_alerts:
+        if "metric" in alert:
+            alertname = alert["metric"]["alertname"] if "alertname" in alert["metric"] else "none"
+            alertstate = alert["metric"]["alertstate"] if "alertstate" in alert["metric"] else "none"
+            namespace = alert["metric"]["namespace"] if "namespace" in alert["metric"] else "none"
+            severity = alert["metric"]["severity"] if "severity" in alert["metric"] else "none"
+            alert = ChaosRunAlert(alertname, alertstate, namespace, severity)
+            summary.chaos_alerts.append(alert)
+
+
+    post_critical_alerts = prom_cli.process_query(
+        query
+    )
+
+    for alert in post_critical_alerts:
+        if "metric" in alert:
+            alertname = alert["metric"]["alertname"] if "alertname" in alert["metric"] else "none"
+            alertstate = alert["metric"]["alertstate"] if "alertstate" in alert["metric"] else "none"
+            namespace = alert["metric"]["namespace"] if "namespace" in alert["metric"] else "none"
+            severity = alert["metric"]["severity"] if "severity" in alert["metric"] else "none"
+            alert = ChaosRunAlert(alertname, alertstate, namespace, severity)
+            summary.post_chaos_alerts.append(alert)
+
+    during_critical_alerts_count = len(during_critical_alerts)
+    post_critical_alerts_count = len(post_critical_alerts)
+    firing_alerts = False
+
+    if during_critical_alerts_count > 0:
+        firing_alerts = True
+
+    if post_critical_alerts_count > 0:
+        firing_alerts = True
+
+    if not firing_alerts:
+        logging.info("No critical alerts are firing!!")
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,13 +10,12 @@ itsdangerous==2.0.1
 coverage==7.4.1
 datetime==5.4
 docker==7.0.0
-docker-compose==1.29.2
 gitpython==3.1.41
 google-api-python-client==2.116.0
 ibm_cloud_sdk_core==3.18.0
 ibm_vpc==0.20.0
 jinja2==3.1.3
-krkn-lib==1.4.9
+krkn-lib==2.1.0
 lxml==5.1.0
 kubernetes==26.1.0
 oauth2client==4.1.3
@@ -31,11 +30,12 @@ python-ipmi==0.5.4
 python-openstackclient==6.5.0
 requests==2.31.0
 service_identity==24.1.0
-PyYAML==5.4.1
+PyYAML==6.0
 setuptools==65.5.1
 werkzeug==3.0.1
 wheel==0.42.0
 zope.interface==5.4.0

 git+https://github.com/krkn-chaos/arcaflow-plugin-kill-pod.git
-git+https://github.com/vmware/vsphere-automation-sdk-python.git@v8.0.0.0
+git+https://github.com/vmware/vsphere-automation-sdk-python.git@v8.0.0.0
+cryptography>=42.0.4 # not directly required, pinned by Snyk to avoid a vulnerability
--- a/run_kraken.py
+++ b/run_kraken.py
@@ -9,6 +9,8 @@ import optparse
 import pyfiglet
 import uuid
 import time
+
+from krkn_lib.models.krkn import ChaosRunOutput, ChaosRunAlertSummary
 from krkn_lib.prometheus.krkn_prometheus import KrknPrometheus
 import kraken.time_actions.common_time_functions as time_actions
 import kraken.performance_dashboards.setup as performance_dashboards
@@ -27,6 +29,7 @@ import server as server
 from kraken import plugins
 from krkn_lib.k8s import KrknKubernetes
 from krkn_lib.ocp import KrknOpenshift
+from krkn_lib.telemetry.elastic import KrknElastic
 from krkn_lib.telemetry.k8s import KrknTelemetryKubernetes
 from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift
 from krkn_lib.models.telemetry import ChaosRunTelemetry
@@ -94,6 +97,9 @@ def main(cfg):
            config["performance_monitoring"], "check_critical_alerts", False
        )
        telemetry_api_url = config["telemetry"].get("api_url")
+        elastic_config = get_yaml_item_value(config,"elastic",{})
+        elastic_url = get_yaml_item_value(elastic_config,"elastic_url","")
+        elastic_index = get_yaml_item_value(elastic_config,"elastic_index","")
        
        # Initialize clients
        if (not os.path.isfile(kubeconfig_path) and
@@ -129,8 +135,6 @@ def main(cfg):
        except:
            kubecli.initialize_clients(None)

-
-
        # find node kraken might be running on
        kubecli.find_kraken_node()

@@ -161,8 +165,13 @@ def main(cfg):
            if prometheus_url is None:
                try:
                    connection_data = ocpcli.get_prometheus_api_connection_data()
-                    prometheus_url = connection_data.endpoint
-                    prometheus_bearer_token = connection_data.token
+                    if connection_data:
+                        prometheus_url = connection_data.endpoint
+                        prometheus_bearer_token = connection_data.token
+                    else: 
+                        # If can't make a connection, set alerts to false
+                        enable_alerts = False
+                        critical_alerts = False
                except Exception:
                    logging.error("invalid distribution selected, running openshift scenarios against kubernetes cluster."
                                  "Please set 'kubernetes' in config.yaml krkn.platform and try again")
@@ -175,9 +184,9 @@ def main(cfg):
        # KrknTelemetry init
        telemetry_k8s = KrknTelemetryKubernetes(safe_logger, kubecli)
        telemetry_ocp = KrknTelemetryOpenshift(safe_logger, ocpcli)
-
-
-        if enable_alerts:
+        telemetry_elastic = KrknElastic(safe_logger,elastic_url)
+        summary = ChaosRunAlertSummary()
+        if enable_alerts or check_critical_alerts:
            prometheus = KrknPrometheus(prometheus_url, prometheus_bearer_token)

        logging.info("Server URL: %s" % kubecli.get_host())
@@ -208,7 +217,8 @@ def main(cfg):

        # Capture the start time
        start_time = int(time.time())
-
+        post_critical_alerts = 0
+        chaos_output = ChaosRunOutput()
        chaos_telemetry = ChaosRunTelemetry()
        chaos_telemetry.run_uuid = run_uuid
        # Loop to run the chaos starts here
@@ -339,25 +349,21 @@ def main(cfg):
                            failed_post_scenarios, scenario_telemetries = network_chaos.run(scenarios_list, config, wait_duration, kubecli, telemetry_k8s)

                        # Check for critical alerts when enabled
-                        if enable_alerts and check_critical_alerts :
-                            logging.info("Checking for critical alerts firing post choas")
+                        post_critical_alerts = 0
+                        if check_critical_alerts:
+                            prometheus_plugin.critical_alerts(prometheus,
+                                                              summary,
+                                                              run_uuid,
+                                                              scenario_type,
+                                                              start_time,
+                                                              datetime.datetime.now())

-                            ##PROM
-                            query = r"""ALERTS{severity="critical"}"""
-                            end_time = datetime.datetime.now()
-                            critical_alerts = prometheus.process_prom_query_in_range(
-                                query,
-                                start_time = datetime.datetime.fromtimestamp(start_time),
-                                end_time = end_time
+                            chaos_output.critical_alerts = summary
+                            post_critical_alerts = len(summary.post_chaos_alerts)
+                            if post_critical_alerts > 0:
+                                logging.error("Post chaos critical alerts firing please check, exiting")
+                                break

-                            )
-                            critical_alerts_count = len(critical_alerts)
-                            if critical_alerts_count > 0:
-                                logging.error("Critical alerts are firing: %s", critical_alerts)
-                                logging.error("Please check, exiting")
-                                sys.exit(1)
-                            else:
-                                logging.info("No critical alerts are firing!!")

            iteration += 1
            logging.info("")
@@ -377,14 +383,18 @@ def main(cfg):
            telemetry_k8s.collect_cluster_metadata(chaos_telemetry)

        decoded_chaos_run_telemetry = ChaosRunTelemetry(json.loads(chaos_telemetry.to_json()))
-        logging.info(f"Telemetry data:\n{decoded_chaos_run_telemetry.to_json()}")
-
+        chaos_output.telemetry = decoded_chaos_run_telemetry
+        logging.info(f"Chaos data:\n{chaos_output.to_json()}")
+        telemetry_elastic.upload_data_to_elasticsearch(decoded_chaos_run_telemetry.to_json(), elastic_index)
        if config["telemetry"]["enabled"]:
-            logging.info(f"telemetry data will be stored on s3 bucket folder: {telemetry_api_url}/download/{telemetry_request_id}")
+            logging.info(f'telemetry data will be stored on s3 bucket folder: {telemetry_api_url}/files/'
+                         f'{(config["telemetry"]["telemetry_group"] if config["telemetry"]["telemetry_group"] else "default")}/'
+                         f'{telemetry_request_id}')
            logging.info(f"telemetry upload log: {safe_logger.log_file_name}")
            try:
                telemetry_k8s.send_telemetry(config["telemetry"], telemetry_request_id, chaos_telemetry)
                telemetry_k8s.put_cluster_events(telemetry_request_id, config["telemetry"], start_time, end_time)
+                telemetry_k8s.put_critical_alerts(telemetry_request_id, config["telemetry"], summary)
                # prometheus data collection is available only on Openshift
                if config["telemetry"]["prometheus_backup"]:
                    prometheus_archive_files = ''
@@ -434,11 +444,15 @@ def main(cfg):
                logging.error("Alert profile is not defined")
                sys.exit(1)

+        if post_critical_alerts > 0:
+            logging.error("Critical alerts are firing, please check; exiting")
+            sys.exit(2)
+
        if failed_post_scenarios:
            logging.error(
                "Post scenarios are still failing at the end of all iterations"
            )
-            sys.exit(1)
+            sys.exit(2)

        logging.info(
            "Successfully finished running Kraken. UUID for the run: "
--- a/scenarios/arcaflow/cpu-hog/config.yaml
+++ b/scenarios/arcaflow/cpu-hog/config.yaml
@@ -4,7 +4,7 @@ deployers:
    connection: {}
    deployer_name: kubernetes
 log:
-  level: debug
+  level: error
 logged_outputs:
  error:
    level: error
--- a/scenarios/arcaflow/io-hog/config.yaml
+++ b/scenarios/arcaflow/io-hog/config.yaml
@@ -3,7 +3,7 @@ deployers:
    connection: {}
    deployer_name: kubernetes
 log:
-  level: debug
+  level: error
 logged_outputs:
  error:
    level: error
--- a/scenarios/arcaflow/memory-hog/config.yaml
+++ b/scenarios/arcaflow/memory-hog/config.yaml
@@ -4,7 +4,7 @@ deployers:
    connection: {}
    deployer_name: kubernetes
 log:
-  level: debug
+  level: error
 logged_outputs:
  error:
    level: error
--- a/utils/chaos_recommender/README.md
+++ b/utils/chaos_recommender/README.md
@@ -20,6 +20,8 @@ This tool profiles an application and gathers telemetry data such as CPU, Memory
    $ git clone https://github.com/krkn-chaos/krkn.git 
    $ cd krkn
    $ pip3 install -r requirements.txt
+    Edit configuration file:
+    $ vi config/recommender_config.yaml 
    $ python3.9 utils/chaos_recommender/chaos_recommender.py
    ```

@@ -37,11 +39,16 @@ You can customize the default values by editing the `krkn/config/recommender_con
  - `auth_token`: Auth token to connect to prometheus endpoint (must).
  - `scrape_duration`: For how long data should be fetched, e.g., '1m' (must).
  - `chaos_library`: "kraken" (currently it only supports kraken).
+  - `json_output_file`: True or False (by default False).
+  - `json_output_folder_path`: Specify folder path where output should be saved. If empty the default path is used.
  - `chaos_tests`: (for output purpose only do not change if not needed)
    - `GENERAL`: list of general purpose tests available in Krkn
    - `MEM`: list of memory related tests available in Krkn
    - `NETWORK`: list of network related tests available in Krkn
    - `CPU`: list of cpu related tests available in Krkn
+  - `threshold`: Specify the threshold to use for comparison and identifying outliers
+  - `cpu_threshold`: Specify the cpu threshold to compare with the cpu limits set on the pods and identify outliers
+  - `mem_threshold`: Specify the memory threshold to compare with the memory limits set on the pods and identify outliers

 *TIP:* to collect prometheus endpoint and token from your OpenShift cluster you can run the following commands:
        ```
@@ -74,6 +81,8 @@ You can also provide the input values through command-line arguments launching t
                        Chaos library
  -L LOG_LEVEL, --log-level LOG_LEVEL
                        log level (DEBUG, INFO, WARNING, ERROR, CRITICAL
+  -J [FOLDER_PATH], --json-output-file [FOLDER_PATH]
+                        Create output file, the path to the folder can be specified, if not specified the default folder is used.
  -M MEM [MEM ...], --MEM MEM [MEM ...]
                        Memory related chaos tests (space separated list)
  -C CPU [CPU ...], --CPU CPU [CPU ...]
@@ -82,7 +91,12 @@ You can also provide the input values through command-line arguments launching t
                        Network related chaos tests (space separated list)
  -G GENERIC [GENERIC ...], --GENERIC GENERIC [GENERIC ...]
                        Memory related chaos tests (space separated list)
-
+  --threshold THRESHOLD
+                        Threshold
+  --cpu-threshold CPU_THRESHOLD
+                        CPU threshold to compare with the cpu limits
+  --mem-threshold MEM_THRESHOLD
+                        Memory threshold to compare with the memory limits
 ```

 If you provide the input values through command-line arguments, the corresponding config file inputs would be ignored.
@@ -97,7 +111,7 @@ After obtaining telemetry data, sourced either locally or from Prometheus, the t

 ## Customizing Thresholds and Options

-You can customize the thresholds and options used for data analysis by modifying the `krkn/kraken/chaos_recommender/analysis.py` file. For example, you can adjust the threshold for identifying outliers by changing the value of the `threshold` variable in the `identify_outliers` function.
+You can customize the thresholds and options used for data analysis and for identifying outliers by setting the `threshold`, `cpu_threshold` and `mem_threshold` parameters in the `config/recommender_config.yaml` file.

 ## Additional Files

--- a/utils/chaos_recommender/chaos_recommender.py
+++ b/utils/chaos_recommender/chaos_recommender.py
@@ -1,7 +1,9 @@
 import argparse
+import json
 import logging
 import os.path
 import sys
+import time
 import yaml
 # kraken module import for running the recommender
 # both from the root directory and the recommender
@@ -9,12 +11,13 @@ import yaml
 sys.path.insert(0, './')
 sys.path.insert(0, '../../')

+from krkn_lib.utils import get_yaml_item_value
+
 import kraken.chaos_recommender.analysis as analysis
 import kraken.chaos_recommender.prometheus as prometheus
 from kubernetes import config as kube_config


-
 def parse_arguments(parser):

    # command line options
@@ -27,6 +30,9 @@ def parse_arguments(parser):
    parser.add_argument("-s", "--scrape-duration", action="store", default="10m", help="Prometheus scrape duration")
    parser.add_argument("-L", "--log-level", action="store", default="INFO", help="log level (DEBUG, INFO, WARNING, ERROR, CRITICAL")

+    parser.add_argument("-J", "--json-output-file", default=False, nargs="?", action="store",
+                        help="Create output file, the path to the folder can be specified, if not specified the default folder is used")
+
    parser.add_argument("-M", "--MEM", nargs='+', action="store", default=[],
                        help="Memory related chaos tests (space separated list)")
    parser.add_argument("-C", "--CPU", nargs='+', action="store", default=[],
@@ -35,10 +41,13 @@ def parse_arguments(parser):
                        help="Network related chaos tests (space separated list)")
    parser.add_argument("-G", "--GENERIC", nargs='+', action="store", default=[],
                        help="Memory related chaos tests (space separated list)")
-
+    parser.add_argument("--threshold", action="store", default="", help="Threshold")
+    parser.add_argument("--cpu-threshold", action="store", default="", help="CPU threshold")
+    parser.add_argument("--mem-threshold", action="store", default="", help="Memory threshold")

    return parser.parse_args()

+
 def read_configuration(config_file_path):
    if not os.path.exists(config_file_path):
        logging.error(f"Config file not found: {config_file_path}")
@@ -48,15 +57,25 @@ def read_configuration(config_file_path):
        config = yaml.safe_load(config_file)

    log_level = config.get("log level", "INFO")
-    namespace = config.get("namespace", "")
-    kubeconfig = config.get("kubeconfig", kube_config.KUBE_CONFIG_DEFAULT_LOCATION)
+    namespace = config.get("namespace")
+    kubeconfig = get_yaml_item_value(config, "kubeconfig", kube_config.KUBE_CONFIG_DEFAULT_LOCATION)

-    prometheus_endpoint = config.get("prometheus_endpoint", "")
-    auth_token = config.get("auth_token", "")
-    scrape_duration = config.get("scrape_duration", "10m")
-    chaos_tests = config.get("chaos_tests" , {})
+    prometheus_endpoint = config.get("prometheus_endpoint")
+    auth_token = config.get("auth_token")
+    scrape_duration = get_yaml_item_value(config, "scrape_duration", "10m")
+    threshold = get_yaml_item_value(config, "threshold", ".7")
+    heatmap_cpu_threshold = get_yaml_item_value(config, "cpu_threshold", ".5")
+    heatmap_mem_threshold = get_yaml_item_value(config, "mem_threshold", ".3")
+    output_file = config.get("json_output_file", False)
+    if output_file is True:
+        output_path = config.get("json_output_folder_path")
+    else:
+        output_path = False
+    chaos_tests = config.get("chaos_tests", {})
    return (namespace, kubeconfig, prometheus_endpoint, auth_token, scrape_duration,
-            chaos_tests, log_level)
+            chaos_tests, log_level, threshold, heatmap_cpu_threshold,
+            heatmap_mem_threshold, output_path)
+

 def prompt_input(prompt, default_value):
    user_input = input(f"{prompt} [{default_value}]: ")
@@ -64,6 +83,44 @@ def prompt_input(prompt, default_value):
        return user_input
    return default_value

+
+def make_json_output(inputs, queries, analysis_data, output_path):
+    time_str = time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime())
+
+    data = {
+        "inputs": inputs,
+        "queries": queries,
+        "profiling": analysis_data[0],
+        "heatmap_analysis": analysis_data[1],
+        "recommendations": analysis_data[2]
+    }
+
+    logging.info(f"Summary\n{json.dumps(data, indent=4)}")
+
+    if output_path is not False:
+        file = f"recommender_{inputs['namespace']}_{time_str}.json"
+        path = f"{os.path.expanduser(output_path)}/{file}"
+
+        with open(path, "w") as json_output:
+            logging.info(f"Saving output file in {output_path} folder...")
+            json_output.write(json.dumps(data, indent=4))
+            logging.info(f"Recommendation output saved in {file}.")
+
+
+def json_inputs(namespace, kubeconfig, prometheus_endpoint, scrape_duration, chaos_tests, threshold, heatmap_cpu_threshold, heatmap_mem_threshold):
+    inputs = {
+        "namespace": namespace,
+        "kubeconfig": kubeconfig,
+        "prometheus_endpoint": prometheus_endpoint,
+        "scrape_duration": scrape_duration,
+        "chaos_tests": chaos_tests,
+        "threshold": threshold,
+        "heatmap_cpu_threshold": heatmap_cpu_threshold,
+        "heatmap_mem_threshold": heatmap_mem_threshold
+    }
+    return inputs
+
+
 def main():
    parser = argparse.ArgumentParser(description="Krkn Chaos Recommender Command-Line tool")
    args = parse_arguments(parser)
@@ -81,7 +138,11 @@ def main():
         auth_token,
         scrape_duration,
         chaos_tests,
-         log_level
+         log_level,
+         threshold,
+         heatmap_cpu_threshold,
+         heatmap_mem_threshold,
+         output_path
         ) = read_configuration(args.config_file)

    if args.options:
@@ -91,27 +152,35 @@ def main():
        scrape_duration = args.scrape_duration
        log_level = args.log_level
        prometheus_endpoint = args.prometheus_endpoint
+        output_path = args.json_output_file
        chaos_tests = {"MEM": args.MEM, "GENERIC": args.GENERIC, "CPU": args.CPU, "NETWORK": args.NETWORK}
+        threshold = args.threshold
+        heatmap_mem_threshold = args.mem_threshold
+        heatmap_cpu_threshold = args.cpu_threshold

-    if log_level not in ["DEBUG","INFO", "WARNING", "ERROR","CRITICAL"]:
+    if log_level not in ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]:
        logging.error(f"{log_level} not a valid log level")
        sys.exit(1)

    logging.basicConfig(level=log_level)

-    logging.info("============================INPUTS===================================")
-    logging.info(f"Namespace: {namespace}")
-    logging.info(f"Kubeconfig: {kubeconfig}")
-    logging.info(f"Prometheus endpoint: {prometheus_endpoint}")
-    logging.info(f"Scrape duration: {scrape_duration}")
-    for test in chaos_tests.keys():
-        logging.info(f"Chaos tests {test}: {chaos_tests[test]}")
-    logging.info("=====================================================================")
+    if output_path is not False:
+        if output_path is None:
+            output_path = "./recommender_output"
+            logging.info(f"Path for output file not specified. "
+                         f"Using default folder {output_path}")
+        if not os.path.exists(os.path.expanduser(output_path)):
+            logging.error(f"Folder {output_path} for output not found.")
+            sys.exit(1)
+    logging.info("Loading inputs...")
+    inputs = json_inputs(namespace, kubeconfig, prometheus_endpoint, scrape_duration, chaos_tests, threshold, heatmap_cpu_threshold, heatmap_mem_threshold)
    logging.info("Starting Analysis ...")
-    logging.info("Fetching the Telemetry data")

-    file_path = prometheus.fetch_utilization_from_prometheus(prometheus_endpoint, auth_token, namespace, scrape_duration)
-    analysis(file_path, chaos_tests)
+    file_path, queries = prometheus.fetch_utilization_from_prometheus(prometheus_endpoint, auth_token, namespace, scrape_duration)
+    analysis_data = analysis(file_path, chaos_tests, threshold, heatmap_cpu_threshold, heatmap_mem_threshold)
+
+    make_json_output(inputs, queries, analysis_data, output_path)
+

 if __name__ == "__main__":
    main()
Author	SHA1	Message	Date
yogananth	a1b81bd382	Fix: Reslove ingress network chaos plugin issue Added network_chaos to plugin step and job wait time to be based on the test duration and set the default wait_time to 30s Signed-off-by: yogananth subramanian <ysubrama@redhat.com>	2024-03-22 14:48:17 -04:00
Naga Ravi Chaitanya Elluri	782440c8c4	Copy oc and kubectl clients to additional paths This will make sure oc and kubectl clients are accessible for users with both /usr/bin and /usr/local/bin paths set on the host. Signed-off-by: Naga Ravi Chaitanya Elluri <nelluri@redhat.com>	2024-03-21 11:29:50 -04:00
Naga Ravi Chaitanya Elluri	7e2755cbb7	Remove container status badge Quay is no longer exposing it correctly: https://quay.io/repository/krkn-chaos/krkn/status Signed-off-by: Naga Ravi Chaitanya Elluri <nelluri@redhat.com>	2024-03-19 15:33:25 -04:00
Naga Ravi Chaitanya Elluri	2babb53d6e	Bump cryptography version This is need to fix the security vulnerability: https://nvd.nist.gov/vuln/detail/CVE-2024-26130. Note: Reported by FOSSA. Signed-off-by: Naga Ravi Chaitanya Elluri <nelluri@redhat.com>	2024-03-19 14:44:47 -04:00
Tullio Sebastiani	85f76e9193	do not consider exit code 2 as an error in funtests Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>	2024-03-17 23:07:46 -04:00
Liangquan Li	8bf21392f1	fix doc's nit Signed-off-by: Liangquan Li <liangli@redhat.com>	2024-03-13 15:21:57 -04:00
Tullio Sebastiani	606fb60811	changed exit codes on post chaos alerts and post_scenario failure (#592 ) Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>	2024-03-07 16:31:55 +01:00
Tullio Sebastiani	fac7c3c6fb	lowered arcaflow log level to error (#591 ) Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>	2024-03-07 15:32:53 +01:00
Paige Rubendall	8dd9b30030	updating tag (#589 ) Signed-off-by: Paige Rubendall <prubenda@redhat.com>	2024-03-06 13:11:44 -05:00
Naga Ravi Chaitanya Elluri	2d99f17aaf	fix: requirements.txt to reduce vulnerabilities (#587 ) The following vulnerabilities are fixed by pinning transitive dependencies: - https://snyk.io/vuln/SNYK-PYTHON-CRYPTOGRAPHY-3172287 - https://snyk.io/vuln/SNYK-PYTHON-CRYPTOGRAPHY-3314966 - https://snyk.io/vuln/SNYK-PYTHON-CRYPTOGRAPHY-3315324 - https://snyk.io/vuln/SNYK-PYTHON-CRYPTOGRAPHY-3315328 - https://snyk.io/vuln/SNYK-PYTHON-CRYPTOGRAPHY-3315331 - https://snyk.io/vuln/SNYK-PYTHON-CRYPTOGRAPHY-3315452 - https://snyk.io/vuln/SNYK-PYTHON-CRYPTOGRAPHY-3315972 - https://snyk.io/vuln/SNYK-PYTHON-CRYPTOGRAPHY-3315975 - https://snyk.io/vuln/SNYK-PYTHON-CRYPTOGRAPHY-3316038 - https://snyk.io/vuln/SNYK-PYTHON-CRYPTOGRAPHY-3316211 - https://snyk.io/vuln/SNYK-PYTHON-CRYPTOGRAPHY-5663682 - https://snyk.io/vuln/SNYK-PYTHON-CRYPTOGRAPHY-5777683 - https://snyk.io/vuln/SNYK-PYTHON-CRYPTOGRAPHY-5813745 - https://snyk.io/vuln/SNYK-PYTHON-CRYPTOGRAPHY-5813746 - https://snyk.io/vuln/SNYK-PYTHON-CRYPTOGRAPHY-5813750 - https://snyk.io/vuln/SNYK-PYTHON-CRYPTOGRAPHY-5914629 - https://snyk.io/vuln/SNYK-PYTHON-CRYPTOGRAPHY-6036192 - https://snyk.io/vuln/SNYK-PYTHON-CRYPTOGRAPHY-6050294 - https://snyk.io/vuln/SNYK-PYTHON-CRYPTOGRAPHY-6092044 - https://snyk.io/vuln/SNYK-PYTHON-CRYPTOGRAPHY-6126975 - https://snyk.io/vuln/SNYK-PYTHON-CRYPTOGRAPHY-6210214 - https://snyk.io/vuln/SNYK-PYTHON-SETUPTOOLS-3180412 - https://snyk.io/vuln/SNYK-PYTHON-WHEEL-3180413 Co-authored-by: snyk-bot <snyk-bot@snyk.io>	2024-03-06 12:54:30 -05:00
Tullio Sebastiani	50742a793c	updated krkn-lib to 2.1.0 (#588 ) Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>	2024-03-06 11:30:01 -05:00
Naga Ravi Chaitanya Elluri	ba6a844544	Add /usr/local/bin to the path for krkn images This is needed to ensure oc and kubectl binaries under /usr/local/bin are accessible. Signed-off-by: Naga Ravi Chaitanya Elluri <nelluri@redhat.com>	2024-03-04 16:03:40 -05:00
Tullio Sebastiani	7e7a917dba	dockerfiles update (#585 ) Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>	2024-03-04 15:59:53 +01:00
Tullio Sebastiani	b9c0bb39c7	checking post run alerts properties presence (#584 ) added metric check Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>	2024-03-01 18:30:54 +01:00
Tullio Sebastiani	706a886151	checking alert properties presence (#583 ) typo fix Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>	2024-03-01 17:58:21 +01:00
Tullio Sebastiani	a1cf9e2c00	fixed typo on funtests (#582 ) Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>	2024-03-01 17:09:19 +01:00
Tullio Sebastiani	0f5dfcb823	fixed the telemetry funtest according to the new telemetry API Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>	2024-03-01 09:48:56 -05:00
Tullio Sebastiani	1e1015e6e7	added new WS configuration to funtests Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>	2024-02-29 11:35:00 -05:00
Tullio Sebastiani	c71ce31779	integrated new telemetry library for WS 2.0 Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com> updated krkn-lib version Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>	2024-02-28 22:58:54 -05:00
Tullio Sebastiani	1298f220a6	Critical alerts collection and upload (#577 ) * added prometheus client method for critical alerts Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com> * adapted run_kraken to the new plugin method for critical_alerts collection + telemetry upload Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com> * requirements.txt pointing temporarly to git Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com> * fixed severity level Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com> * added functional tests Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com> * exit on post chaos critical alerts Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com> log moved Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com> * removed noisy log Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com> fixed log Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com> * updated requirements.txt to krkn-lib 1.4.13 Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com> * krkn lib * added check on variable that makes kraken return 1 whether post critical alerts are > 0 Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com> --------- Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>	2024-02-28 09:48:29 -05:00
jtydlcak	24059fb731	Add json output file option for recommender (#511 ) Output in terminal changed to use json structure. The json output file names are in format recommender_namespace_YYYY-MM-DD_HH-MM-SS. The path to the json file can be specified. Default path is in kraken/utils/chaos_recommender/recommender_output. Signed-off-by: jtydlcak <139967002+jtydlack@users.noreply.github.com>	2024-02-27 11:09:00 -05:00
Naga Ravi Chaitanya Elluri	ab951adb78	Expose thresholds config options (#574 ) This commit allows users to edit the thresholds in the chaos-recommender config to be able to identify outliers based on their use case. Fixes https://github.com/krkn-chaos/krkn/issues/509 Signed-off-by: Naga Ravi Chaitanya Elluri <nelluri@redhat.com>	2024-02-26 09:43:34 -05:00
Paige Rubendall	a9a7fb7e51	updating release version in dockerfiles (#578 ) Signed-off-by: Paige Rubendall <prubenda@redhat.com>	2024-02-21 10:17:02 -05:00
Naga Ravi Chaitanya Elluri	5a8d5b0fe1	Allow critical alerts check when enable_alerts is disabled This covers use case where user wants to just check for critical alerts post chaos without having to enable the alerts evaluation feature which evaluates prom queries specified in an alerts file. Signed-off-by: Naga Ravi Chaitanya Elluri <nelluri@redhat.com>	2024-02-19 23:15:47 -05:00
Paige Rubendall	c440dc4b51	Taking out start and end time for critical alerts (#572 ) * taking out start and end time" Signed-off-by: Paige Rubendall <prubenda@redhat.com> * adding only break when alert fires Signed-off-by: Paige Rubendall <prubenda@redhat.com> * fail at end if alert had fired Signed-off-by: Paige Rubendall <prubenda@redhat.com> * adding new krkn-lib function with no range Signed-off-by: Paige Rubendall <prubenda@redhat.com> * updating requirements to new krkn-lib Signed-off-by: Paige Rubendall <prubenda@redhat.com> --------- Signed-off-by: Paige Rubendall <prubenda@redhat.com>	2024-02-19 09:28:13 -05:00
Paige Rubendall	b174c51ee0	adding check if connection was properly set Signed-off-by: Paige Rubendall <prubenda@redhat.com>	2024-02-15 17:28:20 -05:00
Paige Rubendall	fec0434ce1	adding upload to elastic search Signed-off-by: Paige Rubendall <prubenda@redhat.com>	2024-02-13 12:01:40 -05:00
Tullio Sebastiani	1067d5ec8d	changed telemetry endpoint for funtests (#571 ) Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>	2024-02-13 17:06:20 +01:00