Compare commits


11 Commits

Author SHA1 Message Date
Tullio Sebastiani
7e7a917dba dockerfiles update (#585)
Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>
2024-03-04 15:59:53 +01:00
Tullio Sebastiani
b9c0bb39c7 checking post run alerts properties presence (#584)
added metric check

Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>
2024-03-01 18:30:54 +01:00
Tullio Sebastiani
706a886151 checking alert properties presence (#583)
typo fix

Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>
2024-03-01 17:58:21 +01:00
Tullio Sebastiani
a1cf9e2c00 fixed typo on funtests (#582)
Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>
2024-03-01 17:09:19 +01:00
Tullio Sebastiani
0f5dfcb823 fixed the telemetry funtest according to the new telemetry API
Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>
2024-03-01 09:48:56 -05:00
Tullio Sebastiani
1e1015e6e7 added new WS configuration to funtests
Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>
2024-02-29 11:35:00 -05:00
Tullio Sebastiani
c71ce31779 integrated new telemetry library for WS 2.0
Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>

updated krkn-lib version

Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>
2024-02-28 22:58:54 -05:00
Tullio Sebastiani
1298f220a6 Critical alerts collection and upload (#577)
* added prometheus client method for critical alerts

Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>

* adapted run_kraken to the new plugin method for critical_alerts collection + telemetry upload

Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>

* requirements.txt pointing temporarly to git

Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>

* fixed severity level

Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>

* added functional tests

Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>

* exit on post chaos critical alerts

Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>

log moved

Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>

* removed noisy log

Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>

fixed log

Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>

* updated requirements.txt to krkn-lib 1.4.13

Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>

* krkn lib

* added check on variable that makes kraken return 1 whether post critical alerts are > 0

Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>

---------

Signed-off-by: Tullio Sebastiani <tsebasti@redhat.com>
2024-02-28 09:48:29 -05:00
jtydlcak
24059fb731 Add json output file option for recommender (#511)
Output in terminal changed to use json structure.

The json output file names are in format
recommender_namespace_YYYY-MM-DD_HH-MM-SS.

The path to the json file can be specified. Default path is in
kraken/utils/chaos_recommender/recommender_output.

Signed-off-by: jtydlcak <139967002+jtydlack@users.noreply.github.com>
2024-02-27 11:09:00 -05:00
Naga Ravi Chaitanya Elluri
ab951adb78 Expose thresholds config options (#574)
This commit allows users to edit the thresholds in the chaos-recommender
config to be able to identify outliers based on their use case.

Fixes https://github.com/krkn-chaos/krkn/issues/509
Signed-off-by: Naga Ravi Chaitanya Elluri <nelluri@redhat.com>
2024-02-26 09:43:34 -05:00
Paige Rubendall
a9a7fb7e51 updating release version in dockerfiles (#578)
Signed-off-by: Paige Rubendall <prubenda@redhat.com>
2024-02-21 10:17:02 -05:00
14 changed files with 283 additions and 103 deletions

.gitignore vendored
View File

@@ -16,6 +16,7 @@ __pycache__/*
*.out
kube-burner*
kube_burner*
recommender_*.json
# Project files
.ropeproject

View File

@@ -49,3 +49,4 @@ telemetry:
- "(\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}\\.\\d+Z).+" # 2023-09-15T11:20:36.123425532Z log
oc_cli_path: /usr/bin/oc # optional, if not specified it will be searched for in $PATH
events_backup: True # enables/disables cluster events collection
telemetry_group: "funtests"

View File

@@ -14,16 +14,20 @@ function functional_test_telemetry {
export RUN_TAG="funtest-telemetry"
yq -i '.telemetry.enabled=True' CI/config/common_test_config.yaml
yq -i '.telemetry.full_prometheus_backup=True' CI/config/common_test_config.yaml
yq -i '.performance_monitoring.check_critical_alerts=True' CI/config/common_test_config.yaml
yq -i '.performance_monitoring.prometheus_url="http://localhost:9090"' CI/config/common_test_config.yaml
yq -i '.telemetry.run_tag=env(RUN_TAG)' CI/config/common_test_config.yaml
export scenario_type="arcaflow_scenarios"
export scenario_file="scenarios/arcaflow/cpu-hog/input.yaml"
export post_config=""
envsubst < CI/config/common_test_config.yaml > CI/config/telemetry.yaml
python3 -m coverage run -a run_kraken.py -c CI/config/telemetry.yaml
RUN_FOLDER=`cat CI/out/test_telemetry.out | grep amazonaws.com | sed -rn "s#.*https:\/\/.*\/download/(.*)#\1#p"`
RUN_FOLDER=`cat CI/out/test_telemetry.out | grep amazonaws.com | sed -rn "s#.*https:\/\/.*\/files/(.*)#\1#p"`
$AWS_CLI s3 ls "s3://$AWS_BUCKET/$RUN_FOLDER/" | awk '{ print $4 }' > s3_remote_files
echo "checking if telemetry files are uploaded on s3"
cat s3_remote_files | grep events-00.json || ( echo "FAILED: events-00.json not uploaded" && exit 1 )
cat s3_remote_files | grep critical-alerts-00.json || ( echo "FAILED: critical-alerts-00.json not uploaded" && exit 1 )
cat s3_remote_files | grep prometheus-00.tar || ( echo "FAILED: prometheus backup not uploaded" && exit 1 )
cat s3_remote_files | grep telemetry.json || ( echo "FAILED: telemetry.json not uploaded" && exit 1 )
echo "all files uploaded!"

View File

@@ -1,5 +1,5 @@
kraken:
distribution: openshift # Distribution can be kubernetes or openshift
distribution: kubernetes # Distribution can be kubernetes or openshift
kubeconfig_path: ~/.kube/config # Path to kubeconfig
exit_on_failure: False # Exit when a post action scenario fails
publish_kraken_status: True # Can be accessed at http://0.0.0.0:8081
@@ -51,7 +51,7 @@ cerberus:
performance_monitoring:
deploy_dashboards: False # Install a mutable grafana and load the performance dashboards. Enable this only when running on OpenShift
repo: "https://github.com/cloud-bulldozer/performance-dashboards.git"
prometheus_url: # The prometheus url/route is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes.
prometheus_url: # The prometheus url/route is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes.
prometheus_bearer_token: # The bearer token is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. This is needed to authenticate with prometheus.
uuid: # uuid for the run is generated by default if not set
enable_alerts: False # Runs the queries specified in the alert profile and displays the info or exits 1 when severity=error
@@ -65,14 +65,19 @@ telemetry:
enabled: False # enables/disables the telemetry collection feature
api_url: https://ulnmf9xv7j.execute-api.us-west-2.amazonaws.com/production #telemetry service endpoint
username: username # telemetry service username
password: password # telemetry service password
password: password # telemetry service password
prometheus_backup: True # enables/disables prometheus data collection
prometheus_namespace: "" # namespace where prometheus is deployed (if distribution is kubernetes)
prometheus_container_name: "" # name of the prometheus container name (if distribution is kubernetes)
prometheus_pod_name: "" # name of the prometheus pod (if distribution is kubernetes)
full_prometheus_backup: False # if is set to False only the /prometheus/wal folder will be downloaded.
backup_threads: 5 # number of telemetry download/upload threads
archive_path: /tmp # local path where the archive files will be temporarily stored
max_retries: 0 # maximum number of upload retries (if 0 will retry forever)
run_tag: '' # if set, this will be appended to the run folder in the bucket (useful to group the runs)
archive_size: 500000 # the size of the prometheus data archive size in KB. The lower the size of archive is
archive_size: 500000
telemetry_group: '' # if set will archive the telemetry in the S3 bucket on a folder named after the value, otherwise will use "default"
# the size of the prometheus data archive size in KB. The lower the size of archive is
# the higher the number of archive files will be produced and uploaded (and processed by backup_threads
# simultaneously).
# For unstable/slow connection is better to keep this value low
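
The new telemetry_group setting decides which folder a run lands in inside the telemetry S3 bucket. A minimal sketch, not part of this diff, reproducing the upload-folder URL that run_kraken.py logs further down in this compare view (the request id is a placeholder):

```
# Sketch only: mirrors the upload-folder URL logged by run_kraken.py in this
# compare view; the request id below is a placeholder generated per run.
api_url = "https://ulnmf9xv7j.execute-api.us-west-2.amazonaws.com/production"
telemetry_group = "funtests"          # falls back to "default" when left empty
telemetry_request_id = "<request-id>"

group = telemetry_group if telemetry_group else "default"
print(f"{api_url}/files/{group}/{telemetry_request_id}")
# .../production/files/funtests/<request-id>
```

The funtest config above exercises exactly this path by setting telemetry_group: "funtests".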

View File

@@ -7,6 +7,8 @@ auth_token: <Auth_Token>
scrape_duration: 10m
chaos_library: "kraken"
log_level: INFO
json_output_file: False
json_output_folder_path:
# for output purpose only do not change if not needed
chaos_tests:
@@ -26,4 +28,8 @@ chaos_tests:
- pod_network_chaos
MEM:
- node_memory_hog
- pvc_disk_fill
- pvc_disk_fill
threshold: .7
cpu_threshold: .5
mem_threshold: .5

View File

@@ -12,7 +12,7 @@ COPY --from=azure-cli /usr/local/bin/az /usr/bin/az
# Install dependencies
RUN yum install -y git python39 python3-pip jq gettext wget && \
python3.9 -m pip install -U pip && \
git clone https://github.com/krkn-chaos/krkn.git --branch v1.5.7 /root/kraken && \
git clone https://github.com/krkn-chaos/krkn.git --branch v1.5.9 /root/kraken && \
mkdir -p /root/.kube && cd /root/kraken && \
pip3.9 install -r requirements.txt && \
pip3.9 install virtualenv && \

View File

@@ -14,7 +14,7 @@ COPY --from=azure-cli /usr/local/bin/az /usr/bin/az
# Install dependencies
RUN yum install -y git python39 python3-pip jq gettext wget && \
python3.9 -m pip install -U pip && \
git clone https://github.com/redhat-chaos/krkn.git --branch v1.5.7 /root/kraken && \
git clone https://github.com/redhat-chaos/krkn.git --branch v1.5.9 /root/kraken && \
mkdir -p /root/.kube && cd /root/kraken && \
pip3.9 install -r requirements.txt && \
pip3.9 install virtualenv && \

View File

@@ -4,13 +4,10 @@ import pandas as pd
import kraken.chaos_recommender.kraken_tests as kraken_tests
import time
threshold = .7 # Adjust the threshold as needed
heatmap_cpu_threshold = .5
heatmap_mem_threshold = .5
KRAKEN_TESTS_PATH = "./kraken_chaos_tests.txt"
#Placeholder, this should be done with topology
# Placeholder, this should be done with topology
def return_critical_services():
return ["web", "cart"]
@@ -19,6 +16,7 @@ def load_telemetry_data(file_path):
data = pd.read_csv(file_path, delimiter=r"\s+")
return data
def calculate_zscores(data):
zscores = pd.DataFrame()
zscores["Service"] = data["service"]
@@ -27,7 +25,8 @@ def calculate_zscores(data):
zscores["Network"] = (data["NETWORK"] - data["NETWORK"].mean()) / data["NETWORK"].std()
return zscores
def identify_outliers(data):
def identify_outliers(data, threshold):
outliers_cpu = data[data["CPU"] > threshold]["Service"].tolist()
outliers_memory = data[data["Memory"] > threshold]["Service"].tolist()
outliers_network = data[data["Network"] > threshold]["Service"].tolist()
@@ -47,44 +46,64 @@ def get_services_above_heatmap_threshold(dataframe, cpu_threshold, mem_threshold
return cpu_services, mem_services
def analysis(file_path, chaos_tests_config):
def analysis(file_path, chaos_tests_config, threshold, heatmap_cpu_threshold, heatmap_mem_threshold):
# Load the telemetry data from file
logging.info("Fetching the Telemetry data")
data = load_telemetry_data(file_path)
# Calculate Z-scores for CPU, Memory, and Network columns
zscores = calculate_zscores(data)
# Identify outliers
outliers_cpu, outliers_memory, outliers_network = identify_outliers(zscores)
logging.info("Identifying outliers")
outliers_cpu, outliers_memory, outliers_network = identify_outliers(zscores, threshold)
cpu_services, mem_services = get_services_above_heatmap_threshold(data, heatmap_cpu_threshold, heatmap_mem_threshold)
# Display the identified outliers
logging.info("======================== Profiling ==================================")
logging.info(f"CPU outliers: {outliers_cpu}")
logging.info(f"Memory outliers: {outliers_memory}")
logging.info(f"Network outliers: {outliers_network}")
logging.info("===================== HeatMap Analysis ==============================")
analysis_data = analysis_json(outliers_cpu, outliers_memory,
outliers_network, cpu_services,
mem_services, chaos_tests_config)
if not cpu_services:
logging.info("There are no services that are using significant CPU compared to their assigned limits (infinite in case no limits are set).")
if not mem_services:
logging.info("There are no services that are using significant MEMORY compared to their assigned limits (infinite in case no limits are set).")
time.sleep(2)
logging.info("Please check data in utilisation.txt for further analysis")
return analysis_data
def analysis_json(outliers_cpu, outliers_memory, outliers_network,
cpu_services, mem_services, chaos_tests_config):
profiling = {
"cpu_outliers": outliers_cpu,
"memory_outliers": outliers_memory,
"network_outliers": outliers_network
}
heatmap = {
"services_with_cpu_heatmap_above_threshold": cpu_services,
"services_with_mem_heatmap_above_threshold": mem_services
}
recommendations = {}
if cpu_services:
logging.info("Services with CPU_HEATMAP above threshold:", cpu_services)
else:
logging.info("There are no services that are using siginificant CPU compared to their assigned limits (infinite in case no limits are set).")
cpu_recommend = {"services": cpu_services,
"tests": chaos_tests_config['CPU']}
recommendations["cpu_services_recommendations"] = cpu_recommend
if mem_services:
logging.info("Services with MEM_HEATMAP above threshold:", mem_services)
else:
logging.info("There are no services that are using siginificant MEMORY compared to their assigned limits (infinite in case no limits are set).")
time.sleep(2)
logging.info("======================= Recommendations =============================")
if cpu_services:
logging.info(f"Recommended tests for {str(cpu_services)} :\n {chaos_tests_config['CPU']}")
logging.info("\n")
if mem_services:
logging.info(f"Recommended tests for {str(mem_services)} :\n {chaos_tests_config['MEM']}")
logging.info("\n")
mem_recommend = {"services": mem_services,
"tests": chaos_tests_config['MEM']}
recommendations["mem_services_recommendations"] = mem_recommend
if outliers_network:
logging.info(f"Recommended tests for str(outliers_network) :\n {chaos_tests_config['NETWORK']}")
logging.info("\n")
outliers_network_recommend = {"outliers_networks": outliers_network,
"tests": chaos_tests_config['NETWORK']}
recommendations["outliers_network_recommendations"] = (
outliers_network_recommend)
logging.info("\n")
logging.info("Please check data in utilisation.txt for further analysis")
return [profiling, heatmap, recommendations]
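
For orientation, a minimal sketch, not part of the diff, of how the reworked entry point above is now called, with the thresholds passed in instead of hard-coded; the file path and chaos test lists are illustrative:

```
# Hypothetical caller for the new analysis() signature above; the chaos test
# lists are illustrative values taken from the recommender config in this diff.
chaos_tests_config = {
    "CPU": [],
    "MEM": ["node_memory_hog", "pvc_disk_fill"],
    "NETWORK": ["pod_network_chaos"],
}

profiling, heatmap, recommendations = analysis(
    "./utilisation.txt",        # telemetry file written by the prometheus module
    chaos_tests_config,
    threshold=0.7,              # z-score cutoff for outliers
    heatmap_cpu_threshold=0.5,
    heatmap_mem_threshold=0.5,
)
print(recommendations.get("mem_services_recommendations"))
```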

View File

@@ -1,6 +1,5 @@
import logging
import pandas
from prometheus_api_client import PrometheusConnect
import pandas as pd
import urllib3
@@ -8,6 +7,7 @@ import urllib3
saved_metrics_path = "./utilisation.txt"
def convert_data_to_dataframe(data, label):
df = pd.DataFrame()
df['service'] = [item['metric']['pod'] for item in data]
@@ -25,6 +25,7 @@ def convert_data(data, service):
result[pod_name] = value
return result.get(service, '100000000000') # pods whose limits are not defined can take as many resources as they want, so a very high value is assigned
def save_utilization_to_file(cpu_data, cpu_limits_result, mem_data, mem_limits_result, network_data, filename):
df_cpu = convert_data_to_dataframe(cpu_data, "CPU")
merged_df = pd.DataFrame(columns=['service','CPU','CPU_LIMITS','MEM','MEM_LIMITS','NETWORK'])
@@ -39,8 +40,6 @@ def save_utilization_to_file(cpu_data, cpu_limits_result, mem_data, mem_limits_r
"NETWORK" : convert_data(network_data, s)}, index=[0])
merged_df = pd.concat([merged_df, new_row_df], ignore_index=True)
# Convert columns to string
merged_df['CPU'] = merged_df['CPU'].astype(str)
merged_df['MEM'] = merged_df['MEM'].astype(str)
@@ -57,40 +56,39 @@ def save_utilization_to_file(cpu_data, cpu_limits_result, mem_data, mem_limits_r
merged_df.to_csv(filename, sep='\t', index=False)
def fetch_utilization_from_prometheus(prometheus_endpoint, auth_token, namespace, scrape_duration):
urllib3.disable_warnings()
prometheus = PrometheusConnect(url=prometheus_endpoint, headers={'Authorization':'Bearer {}'.format(auth_token)}, disable_ssl=True)
# Fetch CPU utilization
logging.info("Fetching utilization")
cpu_query = 'sum (rate (container_cpu_usage_seconds_total{image!="", namespace="%s"}[%s])) by (pod) *1000' % (namespace,scrape_duration)
logging.info(cpu_query)
cpu_result = prometheus.custom_query(cpu_query)
cpu_data = cpu_result
cpu_limits_query = '(sum by (pod) (kube_pod_container_resource_limits{resource="cpu", namespace="%s"}))*1000' %(namespace)
logging.info(cpu_limits_query)
cpu_limits_result = prometheus.custom_query(cpu_limits_query)
mem_query = 'sum by (pod) (avg_over_time(container_memory_usage_bytes{image!="", namespace="%s"}[%s]))' % (namespace, scrape_duration)
logging.info(mem_query)
mem_result = prometheus.custom_query(mem_query)
mem_data = mem_result
mem_limits_query = 'sum by (pod) (kube_pod_container_resource_limits{resource="memory", namespace="%s"}) ' %(namespace)
logging.info(mem_limits_query)
mem_limits_result = prometheus.custom_query(mem_limits_query)
network_query = 'sum by (pod) ((avg_over_time(container_network_transmit_bytes_total{namespace="%s"}[%s])) + \
(avg_over_time(container_network_receive_bytes_total{namespace="%s"}[%s])))' % (namespace, scrape_duration, namespace, scrape_duration)
network_result = prometheus.custom_query(network_query)
logging.info(network_query)
network_data = network_result
save_utilization_to_file(cpu_data, cpu_limits_result, mem_data, mem_limits_result, network_data, saved_metrics_path)
return saved_metrics_path
save_utilization_to_file(cpu_result, cpu_limits_result, mem_result, mem_limits_result, network_result, saved_metrics_path)
queries = json_queries(cpu_query, cpu_limits_query, mem_query, mem_limits_query)
return saved_metrics_path, queries
def json_queries(cpu_query, cpu_limits_query, mem_query, mem_limits_query):
queries = {
"cpu_query": cpu_query,
"cpu_limit_query": cpu_limits_query,
"memory_query": mem_query,
"memory_limit_query": mem_limits_query
}
return queries

View File

@@ -1,10 +1,13 @@
import datetime
import os.path
from typing import Optional
import urllib3
import logging
import sys
import yaml
from krkn_lib.models.krkn import ChaosRunAlertSummary, ChaosRunAlert
from krkn_lib.prometheus.krkn_prometheus import KrknPrometheus
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
def alerts(prom_cli: KrknPrometheus, start_time, end_time, alert_profile):
@@ -27,4 +30,59 @@ def alerts(prom_cli: KrknPrometheus, start_time, end_time, alert_profile):
prom_cli.process_alert(alert,
datetime.datetime.fromtimestamp(start_time),
datetime.datetime.fromtimestamp(end_time))
datetime.datetime.fromtimestamp(end_time))
def critical_alerts(prom_cli: KrknPrometheus,
summary: ChaosRunAlertSummary,
run_id,
scenario,
start_time,
end_time):
summary.scenario = scenario
summary.run_id = run_id
query = r"""ALERTS{severity="critical"}"""
logging.info("Checking for critical alerts firing post chaos")
during_critical_alerts = prom_cli.process_prom_query_in_range(
query,
start_time=datetime.datetime.fromtimestamp(start_time),
end_time=end_time
)
for alert in during_critical_alerts:
if "metric" in alert:
alertname = alert["metric"]["alertname"] if "alertname" in alert["metric"] else "none"
alertstate = alert["metric"]["alertstate"] if "alertstate" in alert["metric"] else "none"
namespace = alert["metric"]["namespace"] if "namespace" in alert["metric"] else "none"
severity = alert["metric"]["severity"] if "severity" in alert["metric"] else "none"
alert = ChaosRunAlert(alertname, alertstate, namespace, severity)
summary.chaos_alerts.append(alert)
post_critical_alerts = prom_cli.process_query(
query
)
for alert in post_critical_alerts:
if "metric" in alert:
alertname = alert["metric"]["alertname"] if "alertname" in alert["metric"] else "none"
alertstate = alert["metric"]["alertstate"] if "alertstate" in alert["metric"] else "none"
namespace = alert["metric"]["namespace"] if "namespace" in alert["metric"] else "none"
severity = alert["metric"]["severity"] if "severity" in alert["metric"] else "none"
alert = ChaosRunAlert(alertname, alertstate, namespace, severity)
summary.post_chaos_alerts.append(alert)
during_critical_alerts_count = len(during_critical_alerts)
post_critical_alerts_count = len(post_critical_alerts)
firing_alerts = False
if during_critical_alerts_count > 0:
firing_alerts = True
if post_critical_alerts_count > 0:
firing_alerts = True
if not firing_alerts:
logging.info("No critical alerts are firing!!")

View File

@@ -15,7 +15,7 @@ google-api-python-client==2.116.0
ibm_cloud_sdk_core==3.18.0
ibm_vpc==0.20.0
jinja2==3.1.3
krkn-lib==1.4.12
krkn-lib==2.0.0
lxml==5.1.0
kubernetes==26.1.0
oauth2client==4.1.3

View File

@@ -9,6 +9,8 @@ import optparse
import pyfiglet
import uuid
import time
from krkn_lib.models.krkn import ChaosRunOutput, ChaosRunAlertSummary
from krkn_lib.prometheus.krkn_prometheus import KrknPrometheus
import kraken.time_actions.common_time_functions as time_actions
import kraken.performance_dashboards.setup as performance_dashboards
@@ -183,7 +185,7 @@ def main(cfg):
telemetry_k8s = KrknTelemetryKubernetes(safe_logger, kubecli)
telemetry_ocp = KrknTelemetryOpenshift(safe_logger, ocpcli)
telemetry_elastic = KrknElastic(safe_logger,elastic_url)
summary = ChaosRunAlertSummary()
if enable_alerts or check_critical_alerts:
prometheus = KrknPrometheus(prometheus_url, prometheus_bearer_token)
@@ -215,8 +217,8 @@ def main(cfg):
# Capture the start time
start_time = int(time.time())
critical_alerts_count = 0
post_critical_alerts = 0
chaos_output = ChaosRunOutput()
chaos_telemetry = ChaosRunTelemetry()
chaos_telemetry.run_uuid = run_uuid
# Loop to run the chaos starts here
@@ -347,22 +349,21 @@ def main(cfg):
failed_post_scenarios, scenario_telemetries = network_chaos.run(scenarios_list, config, wait_duration, kubecli, telemetry_k8s)
# Check for critical alerts when enabled
post_critical_alerts = 0
if check_critical_alerts:
logging.info("Checking for critical alerts firing post choas")
prometheus_plugin.critical_alerts(prometheus,
summary,
run_uuid,
scenario_type,
start_time,
datetime.datetime.now())
##PROM
query = r"""ALERTS{severity="critical"}"""
end_time = datetime.datetime.now()
critical_alerts = prometheus.process_query(
query
)
critical_alerts_count = len(critical_alerts)
if critical_alerts_count > 0:
logging.error("Critical alerts are firing: %s", critical_alerts)
logging.error("Please check, exiting")
chaos_output.critical_alerts = summary
post_critical_alerts = len(summary.post_chaos_alerts)
if post_critical_alerts > 0:
logging.error("Post chaos critical alerts firing please check, exiting")
break
else:
logging.info("No critical alerts are firing!!")
iteration += 1
logging.info("")
@@ -382,14 +383,18 @@ def main(cfg):
telemetry_k8s.collect_cluster_metadata(chaos_telemetry)
decoded_chaos_run_telemetry = ChaosRunTelemetry(json.loads(chaos_telemetry.to_json()))
logging.info(f"Telemetry data:\n{decoded_chaos_run_telemetry.to_json()}")
chaos_output.telemetry = decoded_chaos_run_telemetry
logging.info(f"Chaos data:\n{chaos_output.to_json()}")
telemetry_elastic.upload_data_to_elasticsearch(decoded_chaos_run_telemetry.to_json(), elastic_index)
if config["telemetry"]["enabled"]:
logging.info(f"telemetry data will be stored on s3 bucket folder: {telemetry_api_url}/download/{telemetry_request_id}")
logging.info(f'telemetry data will be stored on s3 bucket folder: {telemetry_api_url}/files/'
f'{(config["telemetry"]["telemetry_group"] if config["telemetry"]["telemetry_group"] else "default")}/'
f'{telemetry_request_id}')
logging.info(f"telemetry upload log: {safe_logger.log_file_name}")
try:
telemetry_k8s.send_telemetry(config["telemetry"], telemetry_request_id, chaos_telemetry)
telemetry_k8s.put_cluster_events(telemetry_request_id, config["telemetry"], start_time, end_time)
telemetry_k8s.put_critical_alerts(telemetry_request_id, config["telemetry"], summary)
# prometheus data collection is available only on Openshift
if config["telemetry"]["prometheus_backup"]:
prometheus_archive_files = ''
@@ -439,7 +444,7 @@ def main(cfg):
logging.error("Alert profile is not defined")
sys.exit(1)
if critical_alerts_count > 0:
if post_critical_alerts > 0:
logging.error("Critical alerts are firing, please check; exiting")
sys.exit(1)

View File

@@ -20,6 +20,8 @@ This tool profiles an application and gathers telemetry data such as CPU, Memory
$ git clone https://github.com/krkn-chaos/krkn.git
$ cd krkn
$ pip3 install -r requirements.txt
Edit configuration file:
$ vi config/recommender_config.yaml
$ python3.9 utils/chaos_recommender/chaos_recommender.py
```
@@ -37,11 +39,16 @@ You can customize the default values by editing the `krkn/config/recommender_con
- `auth_token`: Auth token to connect to prometheus endpoint (must).
- `scrape_duration`: For how long data should be fetched, e.g., '1m' (must).
- `chaos_library`: "kraken" (currently it only supports kraken).
- `json_output_file`: True or False (False by default); see the sketch after this list for the resulting file name.
- `json_output_folder_path`: Folder path where the output file should be saved. If empty, the default path is used.
- `chaos_tests`: (for output purposes only; do not change unless needed)
- `GENERAL`: list of general purpose tests available in Krkn
- `MEM`: list of memory related tests available in Krkn
- `NETWORK`: list of network related tests available in Krkn
- `CPU`: list of CPU related tests available in Krkn
- `threshold`: Specify the threshold to use for comparison and identifying outliers
- `cpu_threshold`: Specify the cpu threshold to compare with the cpu limits set on the pods and identify outliers
- `mem_threshold`: Specify the memory threshold to compare with the memory limits set on the pods and identify outliers
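
As a rough illustration (assuming the file-naming behaviour of `make_json_output` shown in the `chaos_recommender.py` hunk further down), enabling `json_output_file` produces a file named after the namespace and a timestamp:

```
# Illustrative only: mirrors the file-name logic of make_json_output() in
# utils/chaos_recommender/chaos_recommender.py; the namespace is a placeholder.
import time

namespace = "my-namespace"
time_str = time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime())
print(f"recommender_{namespace}_{time_str}.json")
# e.g. recommender_my-namespace_2024-03-01_12-00-00.json
```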
*TIP:* to collect prometheus endpoint and token from your OpenShift cluster you can run the following commands:
```
@@ -74,6 +81,8 @@ You can also provide the input values through command-line arguments launching t
Chaos library
-L LOG_LEVEL, --log-level LOG_LEVEL
log level (DEBUG, INFO, WARNING, ERROR, CRITICAL)
-J [FOLDER_PATH], --json-output-file [FOLDER_PATH]
Create an output file; a folder path can be specified, otherwise the default folder is used.
-M MEM [MEM ...], --MEM MEM [MEM ...]
Memory related chaos tests (space separated list)
-C CPU [CPU ...], --CPU CPU [CPU ...]
@@ -82,7 +91,12 @@ You can also provide the input values through command-line arguments launching t
Network related chaos tests (space separated list)
-G GENERIC [GENERIC ...], --GENERIC GENERIC [GENERIC ...]
Generic chaos tests (space separated list)
--threshold THRESHOLD
Threshold
--cpu_threshold CPU_THRESHOLD
CPU threshold to compare with the cpu limits
--mem_threshold MEM_THRESHOLD
Memory threshold to compare with the memory limits
```
If you provide the input values through command-line arguments, the corresponding config file inputs would be ignored.
@@ -97,7 +111,7 @@ After obtaining telemetry data, sourced either locally or from Prometheus, the t
## Customizing Thresholds and Options
You can customize the thresholds and options used for data analysis by modifying the `krkn/kraken/chaos_recommender/analysis.py` file. For example, you can adjust the threshold for identifying outliers by changing the value of the `threshold` variable in the `identify_outliers` function.
You can customize the thresholds and options used for data analysis and identifying the outliers by setting the threshold, cpu_threshold and mem_threshold parameters in the config.
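
To make the `threshold` concrete: it is a z-score cutoff, so with the default of .7 a service is flagged as an outlier when its usage sits more than 0.7 standard deviations above the namespace mean. A minimal sketch with made-up numbers, mirroring `calculate_zscores`/`identify_outliers` from the `analysis.py` hunk above:

```
# Made-up numbers to show what the z-score threshold selects; this mirrors
# calculate_zscores()/identify_outliers() from analysis.py in this compare view.
import pandas as pd

data = pd.DataFrame({"service": ["web", "cart", "catalogue"],
                     "CPU": [900.0, 120.0, 110.0]})   # e.g. millicores

zscores = pd.DataFrame()
zscores["Service"] = data["service"]
zscores["CPU"] = (data["CPU"] - data["CPU"].mean()) / data["CPU"].std()

threshold = 0.7
print(zscores[zscores["CPU"] > threshold]["Service"].tolist())  # ['web']
```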
## Additional Files

View File

@@ -1,7 +1,9 @@
import argparse
import json
import logging
import os.path
import sys
import time
import yaml
# kraken module import for running the recommender
# both from the root directory and the recommender
@@ -9,12 +11,13 @@ import yaml
sys.path.insert(0, './')
sys.path.insert(0, '../../')
from krkn_lib.utils import get_yaml_item_value
import kraken.chaos_recommender.analysis as analysis
import kraken.chaos_recommender.prometheus as prometheus
from kubernetes import config as kube_config
def parse_arguments(parser):
# command line options
@@ -27,6 +30,9 @@ def parse_arguments(parser):
parser.add_argument("-s", "--scrape-duration", action="store", default="10m", help="Prometheus scrape duration")
parser.add_argument("-L", "--log-level", action="store", default="INFO", help="log level (DEBUG, INFO, WARNING, ERROR, CRITICAL")
parser.add_argument("-J", "--json-output-file", default=False, nargs="?", action="store",
help="Create output file, the path to the folder can be specified, if not specified the default folder is used")
parser.add_argument("-M", "--MEM", nargs='+', action="store", default=[],
help="Memory related chaos tests (space separated list)")
parser.add_argument("-C", "--CPU", nargs='+', action="store", default=[],
@@ -35,10 +41,13 @@ def parse_arguments(parser):
help="Network related chaos tests (space separated list)")
parser.add_argument("-G", "--GENERIC", nargs='+', action="store", default=[],
help="Memory related chaos tests (space separated list)")
parser.add_argument("--threshold", action="store", default="", help="Threshold")
parser.add_argument("--cpu-threshold", action="store", default="", help="CPU threshold")
parser.add_argument("--mem-threshold", action="store", default="", help="Memory threshold")
return parser.parse_args()
def read_configuration(config_file_path):
if not os.path.exists(config_file_path):
logging.error(f"Config file not found: {config_file_path}")
@@ -48,15 +57,25 @@ def read_configuration(config_file_path):
config = yaml.safe_load(config_file)
log_level = config.get("log level", "INFO")
namespace = config.get("namespace", "")
kubeconfig = config.get("kubeconfig", kube_config.KUBE_CONFIG_DEFAULT_LOCATION)
namespace = config.get("namespace")
kubeconfig = get_yaml_item_value(config, "kubeconfig", kube_config.KUBE_CONFIG_DEFAULT_LOCATION)
prometheus_endpoint = config.get("prometheus_endpoint", "")
auth_token = config.get("auth_token", "")
scrape_duration = config.get("scrape_duration", "10m")
chaos_tests = config.get("chaos_tests" , {})
prometheus_endpoint = config.get("prometheus_endpoint")
auth_token = config.get("auth_token")
scrape_duration = get_yaml_item_value(config, "scrape_duration", "10m")
threshold = get_yaml_item_value(config, "threshold", ".7")
heatmap_cpu_threshold = get_yaml_item_value(config, "cpu_threshold", ".5")
heatmap_mem_threshold = get_yaml_item_value(config, "mem_threshold", ".3")
output_file = config.get("json_output_file", False)
if output_file is True:
output_path = config.get("json_output_folder_path")
else:
output_path = False
chaos_tests = config.get("chaos_tests", {})
return (namespace, kubeconfig, prometheus_endpoint, auth_token, scrape_duration,
chaos_tests, log_level)
chaos_tests, log_level, threshold, heatmap_cpu_threshold,
heatmap_mem_threshold, output_path)
def prompt_input(prompt, default_value):
user_input = input(f"{prompt} [{default_value}]: ")
@@ -64,6 +83,44 @@ def prompt_input(prompt, default_value):
return user_input
return default_value
def make_json_output(inputs, queries, analysis_data, output_path):
time_str = time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime())
data = {
"inputs": inputs,
"queries": queries,
"profiling": analysis_data[0],
"heatmap_analysis": analysis_data[1],
"recommendations": analysis_data[2]
}
logging.info(f"Summary\n{json.dumps(data, indent=4)}")
if output_path is not False:
file = f"recommender_{inputs['namespace']}_{time_str}.json"
path = f"{os.path.expanduser(output_path)}/{file}"
with open(path, "w") as json_output:
logging.info(f"Saving output file in {output_path} folder...")
json_output.write(json.dumps(data, indent=4))
logging.info(f"Recommendation output saved in {file}.")
def json_inputs(namespace, kubeconfig, prometheus_endpoint, scrape_duration, chaos_tests, threshold, heatmap_cpu_threshold, heatmap_mem_threshold):
inputs = {
"namespace": namespace,
"kubeconfig": kubeconfig,
"prometheus_endpoint": prometheus_endpoint,
"scrape_duration": scrape_duration,
"chaos_tests": chaos_tests,
"threshold": threshold,
"heatmap_cpu_threshold": heatmap_cpu_threshold,
"heatmap_mem_threshold": heatmap_mem_threshold
}
return inputs
def main():
parser = argparse.ArgumentParser(description="Krkn Chaos Recommender Command-Line tool")
args = parse_arguments(parser)
@@ -81,7 +138,11 @@ def main():
auth_token,
scrape_duration,
chaos_tests,
log_level
log_level,
threshold,
heatmap_cpu_threshold,
heatmap_mem_threshold,
output_path
) = read_configuration(args.config_file)
if args.options:
@@ -91,27 +152,35 @@ def main():
scrape_duration = args.scrape_duration
log_level = args.log_level
prometheus_endpoint = args.prometheus_endpoint
output_path = args.json_output_file
chaos_tests = {"MEM": args.MEM, "GENERIC": args.GENERIC, "CPU": args.CPU, "NETWORK": args.NETWORK}
threshold = args.threshold
heatmap_mem_threshold = args.mem_threshold
heatmap_cpu_threshold = args.cpu_threshold
if log_level not in ["DEBUG","INFO", "WARNING", "ERROR","CRITICAL"]:
if log_level not in ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]:
logging.error(f"{log_level} not a valid log level")
sys.exit(1)
logging.basicConfig(level=log_level)
logging.info("============================INPUTS===================================")
logging.info(f"Namespace: {namespace}")
logging.info(f"Kubeconfig: {kubeconfig}")
logging.info(f"Prometheus endpoint: {prometheus_endpoint}")
logging.info(f"Scrape duration: {scrape_duration}")
for test in chaos_tests.keys():
logging.info(f"Chaos tests {test}: {chaos_tests[test]}")
logging.info("=====================================================================")
if output_path is not False:
if output_path is None:
output_path = "./recommender_output"
logging.info(f"Path for output file not specified. "
f"Using default folder {output_path}")
if not os.path.exists(os.path.expanduser(output_path)):
logging.error(f"Folder {output_path} for output not found.")
sys.exit(1)
logging.info("Loading inputs...")
inputs = json_inputs(namespace, kubeconfig, prometheus_endpoint, scrape_duration, chaos_tests, threshold, heatmap_cpu_threshold, heatmap_mem_threshold)
logging.info("Starting Analysis ...")
logging.info("Fetching the Telemetry data")
file_path = prometheus.fetch_utilization_from_prometheus(prometheus_endpoint, auth_token, namespace, scrape_duration)
analysis(file_path, chaos_tests)
file_path, queries = prometheus.fetch_utilization_from_prometheus(prometheus_endpoint, auth_token, namespace, scrape_duration)
analysis_data = analysis(file_path, chaos_tests, threshold, heatmap_cpu_threshold, heatmap_mem_threshold)
make_json_output(inputs, queries, analysis_data, output_path)
if __name__ == "__main__":
main()