Mirror of https://github.com/krkn-chaos/krkn.git, synced 2026-04-15 06:57:28 +00:00.
Compare commits (13 commits):

- 50742a793c
- ba6a844544
- 7e7a917dba
- b9c0bb39c7
- 706a886151
- a1cf9e2c00
- 0f5dfcb823
- 1e1015e6e7
- c71ce31779
- 1298f220a6
- 24059fb731
- ab951adb78
- a9a7fb7e51
.gitignore (vendored), 1 addition:

@@ -16,6 +16,7 @@ __pycache__/*
 *.out
 kube-burner*
 kube_burner*
+recommender_*.json
 
 # Project files
 .ropeproject
CI test configuration (telemetry section):

@@ -49,3 +49,4 @@ telemetry:
 - "(\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}\\.\\d+Z).+" # 2023-09-15T11:20:36.123425532Z log
 oc_cli_path: /usr/bin/oc # optional, if not specified will be search in $PATH
 events_backup: True # enables/disables cluster events collection
+telemetry_group: "funtests"
CI functional test script:

@@ -14,16 +14,20 @@ function functional_test_telemetry {
   export RUN_TAG="funtest-telemetry"
   yq -i '.telemetry.enabled=True' CI/config/common_test_config.yaml
   yq -i '.telemetry.full_prometheus_backup=True' CI/config/common_test_config.yaml
+  yq -i '.performance_monitoring.check_critical_alerts=True' CI/config/common_test_config.yaml
+  yq -i '.performance_monitoring.prometheus_url="http://localhost:9090"' CI/config/common_test_config.yaml
   yq -i '.telemetry.run_tag=env(RUN_TAG)' CI/config/common_test_config.yaml
 
   export scenario_type="arcaflow_scenarios"
   export scenario_file="scenarios/arcaflow/cpu-hog/input.yaml"
   export post_config=""
   envsubst < CI/config/common_test_config.yaml > CI/config/telemetry.yaml
   python3 -m coverage run -a run_kraken.py -c CI/config/telemetry.yaml
-  RUN_FOLDER=`cat CI/out/test_telemetry.out | grep amazonaws.com | sed -rn "s#.*https:\/\/.*\/download/(.*)#\1#p"`
+  RUN_FOLDER=`cat CI/out/test_telemetry.out | grep amazonaws.com | sed -rn "s#.*https:\/\/.*\/files/(.*)#\1#p"`
   $AWS_CLI s3 ls "s3://$AWS_BUCKET/$RUN_FOLDER/" | awk '{ print $4 }' > s3_remote_files
   echo "checking if telemetry files are uploaded on s3"
   cat s3_remote_files | grep events-00.json || ( echo "FAILED: events-00.json not uploaded" && exit 1 )
+  cat s3_remote_files | grep critical-alerts-00.json || ( echo "FAILED: critical-alerts-00.json not uploaded" && exit 1 )
   cat s3_remote_files | grep prometheus-00.tar || ( echo "FAILED: prometheus backup not uploaded" && exit 1 )
   cat s3_remote_files | grep telemetry.json || ( echo "FAILED: telemetry.json not uploaded" && exit 1 )
   echo "all files uploaded!"
Main Krkn config:

@@ -1,5 +1,5 @@
 kraken:
-    distribution: openshift # Distribution can be kubernetes or openshift
+    distribution: kubernetes # Distribution can be kubernetes or openshift
     kubeconfig_path: ~/.kube/config # Path to kubeconfig
     exit_on_failure: False # Exit when a post action scenario fails
     publish_kraken_status: True # Can be accessed at http://0.0.0.0:8081

@@ -51,7 +51,7 @@ cerberus:
 performance_monitoring:
     deploy_dashboards: False # Install a mutable grafana and load the performance dashboards. Enable this only when running on OpenShift
     repo: "https://github.com/cloud-bulldozer/performance-dashboards.git"
     prometheus_url: # The prometheus url/route is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes.
     prometheus_bearer_token: # The bearer token is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. This is needed to authenticate with prometheus.
     uuid: # uuid for the run is generated by default if not set
     enable_alerts: False # Runs the queries specified in the alert profile and displays the info or exits 1 when severity=error
@@ -65,14 +65,19 @@ telemetry:
     enabled: False # enable/disables the telemetry collection feature
     api_url: https://ulnmf9xv7j.execute-api.us-west-2.amazonaws.com/production #telemetry service endpoint
     username: username # telemetry service username
     password: password # telemetry service password
     prometheus_backup: True # enables/disables prometheus data collection
+    prometheus_namespace: "" # namespace where prometheus is deployed (if distribution is kubernetes)
+    prometheus_container_name: "" # name of the prometheus container name (if distribution is kubernetes)
+    prometheus_pod_name: "" # name of the prometheus pod (if distribution is kubernetes)
     full_prometheus_backup: False # if is set to False only the /prometheus/wal folder will be downloaded.
     backup_threads: 5 # number of telemetry download/upload threads
     archive_path: /tmp # local path where the archive files will be temporarly stored
     max_retries: 0 # maximum number of upload retries (if 0 will retry forever)
     run_tag: '' # if set, this will be appended to the run folder in the bucket (useful to group the runs)
-    archive_size: 500000 # the size of the prometheus data archive size in KB. The lower the size of archive is
+    archive_size: 500000
+    telemetry_group: '' # if set will archive the telemetry in the S3 bucket on a folder named after the value, otherwise will use "default"
+    # the size of the prometheus data archive size in KB. The lower the size of archive is
     # the higher the number of archive files will be produced and uploaded (and processed by backup_threads
     # simultaneously).
     # For unstable/slow connection is better to keep this value low
Chaos recommender config (config/recommender_config.yaml):

@@ -7,6 +7,8 @@ auth_token: <Auth_Token>
 scrape_duration: 10m
 chaos_library: "kraken"
 log_level: INFO
+json_output_file: False
+json_output_folder_path:
 
 # for output purpose only do not change if not needed
 chaos_tests:

@@ -26,4 +28,8 @@ chaos_tests:
   - pod_network_chaos
   MEM:
   - node_memory_hog
   - pvc_disk_fill
+
+threshold: .7
+cpu_threshold: .5
+mem_threshold: .5
Container build Dockerfiles (two variants, pinned release bumped from v1.5.7 to v1.5.9):

@@ -5,6 +5,7 @@ FROM mcr.microsoft.com/azure-cli:latest as azure-cli
 FROM registry.access.redhat.com/ubi8/ubi:latest
 
 ENV KUBECONFIG /root/.kube/config
+ENV PATH=$PATH:/usr/local/bin
 
 # Copy azure client binary from azure-cli image
 COPY --from=azure-cli /usr/local/bin/az /usr/bin/az

@@ -12,7 +13,7 @@ COPY --from=azure-cli /usr/local/bin/az /usr/bin/az
 # Install dependencies
 RUN yum install -y git python39 python3-pip jq gettext wget && \
     python3.9 -m pip install -U pip && \
-    git clone https://github.com/krkn-chaos/krkn.git --branch v1.5.7 /root/kraken && \
+    git clone https://github.com/krkn-chaos/krkn.git --branch v1.5.9 /root/kraken && \
     mkdir -p /root/.kube && cd /root/kraken && \
     pip3.9 install -r requirements.txt && \
     pip3.9 install virtualenv && \

@@ -7,6 +7,8 @@ FROM mcr.microsoft.com/azure-cli:latest as azure-cli
 LABEL org.opencontainers.image.authors="Red Hat OpenShift Chaos Engineering"
 
 ENV KUBECONFIG /root/.kube/config
+ENV PATH=$PATH:/usr/local/bin
+
 
 # Copy azure client binary from azure-cli image
 COPY --from=azure-cli /usr/local/bin/az /usr/bin/az

@@ -14,7 +15,7 @@ COPY --from=azure-cli /usr/local/bin/az /usr/bin/az
 # Install dependencies
 RUN yum install -y git python39 python3-pip jq gettext wget && \
     python3.9 -m pip install -U pip && \
-    git clone https://github.com/redhat-chaos/krkn.git --branch v1.5.7 /root/kraken && \
+    git clone https://github.com/redhat-chaos/krkn.git --branch v1.5.9 /root/kraken && \
     mkdir -p /root/.kube && cd /root/kraken && \
     pip3.9 install -r requirements.txt && \
     pip3.9 install virtualenv && \
Chaos recommender analysis module (krkn/kraken/chaos_recommender/analysis.py):

@@ -4,13 +4,10 @@ import pandas as pd
 import kraken.chaos_recommender.kraken_tests as kraken_tests
 import time
 
-threshold = .7 # Adjust the threshold as needed
-heatmap_cpu_threshold = .5
-heatmap_mem_threshold = .5
 
 KRAKEN_TESTS_PATH = "./kraken_chaos_tests.txt"
 
-#Placeholder, this should be done with topology
+# Placeholder, this should be done with topology
 def return_critical_services():
     return ["web", "cart"]
 

@@ -19,6 +16,7 @@ def load_telemetry_data(file_path):
     data = pd.read_csv(file_path, delimiter=r"\s+")
     return data
 
+
 def calculate_zscores(data):
     zscores = pd.DataFrame()
     zscores["Service"] = data["service"]

@@ -27,7 +25,8 @@ def calculate_zscores(data):
     zscores["Network"] = (data["NETWORK"] - data["NETWORK"].mean()) / data["NETWORK"].std()
     return zscores
 
-def identify_outliers(data):
+
+def identify_outliers(data, threshold):
     outliers_cpu = data[data["CPU"] > threshold]["Service"].tolist()
     outliers_memory = data[data["Memory"] > threshold]["Service"].tolist()
     outliers_network = data[data["Network"] > threshold]["Service"].tolist()

@@ -47,44 +46,64 @@ def get_services_above_heatmap_threshold(dataframe, cpu_threshold, mem_threshold
     return cpu_services, mem_services
 
 
-def analysis(file_path, chaos_tests_config):
+def analysis(file_path, chaos_tests_config, threshold, heatmap_cpu_threshold, heatmap_mem_threshold):
     # Load the telemetry data from file
+    logging.info("Fetching the Telemetry data")
     data = load_telemetry_data(file_path)
 
     # Calculate Z-scores for CPU, Memory, and Network columns
     zscores = calculate_zscores(data)
 
     # Identify outliers
-    outliers_cpu, outliers_memory, outliers_network = identify_outliers(zscores)
+    logging.info("Identifying outliers")
+    outliers_cpu, outliers_memory, outliers_network = identify_outliers(zscores, threshold)
     cpu_services, mem_services = get_services_above_heatmap_threshold(data, heatmap_cpu_threshold, heatmap_mem_threshold)
 
-    # Display the identified outliers
-    logging.info("======================== Profiling ==================================")
-    logging.info(f"CPU outliers: {outliers_cpu}")
-    logging.info(f"Memory outliers: {outliers_memory}")
-    logging.info(f"Network outliers: {outliers_network}")
-    logging.info("===================== HeatMap Analysis ==============================")
+    analysis_data = analysis_json(outliers_cpu, outliers_memory,
+                                  outliers_network, cpu_services,
+                                  mem_services, chaos_tests_config)
+
+    if not cpu_services:
+        logging.info("There are no services that are using significant CPU compared to their assigned limits (infinite in case no limits are set).")
+    if not mem_services:
+        logging.info("There are no services that are using significant MEMORY compared to their assigned limits (infinite in case no limits are set).")
+    time.sleep(2)
+
+    logging.info("Please check data in utilisation.txt for further analysis")
+
+    return analysis_data
 
 
+def analysis_json(outliers_cpu, outliers_memory, outliers_network,
+                  cpu_services, mem_services, chaos_tests_config):
+
+    profiling = {
+        "cpu_outliers": outliers_cpu,
+        "memory_outliers": outliers_memory,
+        "network_outliers": outliers_network
+    }
+
+    heatmap = {
+        "services_with_cpu_heatmap_above_threshold": cpu_services,
+        "services_with_mem_heatmap_above_threshold": mem_services
+    }
+
+    recommendations = {}
+
     if cpu_services:
-        logging.info("Services with CPU_HEATMAP above threshold:", cpu_services)
-    else:
-        logging.info("There are no services that are using siginificant CPU compared to their assigned limits (infinite in case no limits are set).")
+        cpu_recommend = {"services": cpu_services,
+                         "tests": chaos_tests_config['CPU']}
+        recommendations["cpu_services_recommendations"] = cpu_recommend
 
     if mem_services:
-        logging.info("Services with MEM_HEATMAP above threshold:", mem_services)
-    else:
-        logging.info("There are no services that are using siginificant MEMORY compared to their assigned limits (infinite in case no limits are set).")
-    time.sleep(2)
-    logging.info("======================= Recommendations =============================")
-    if cpu_services:
-        logging.info(f"Recommended tests for {str(cpu_services)} :\n {chaos_tests_config['CPU']}")
-        logging.info("\n")
-    if mem_services:
-        logging.info(f"Recommended tests for {str(mem_services)} :\n {chaos_tests_config['MEM']}")
-        logging.info("\n")
+        mem_recommend = {"services": mem_services,
+                         "tests": chaos_tests_config['MEM']}
+        recommendations["mem_services_recommendations"] = mem_recommend
 
     if outliers_network:
-        logging.info(f"Recommended tests for str(outliers_network) :\n {chaos_tests_config['NETWORK']}")
-        logging.info("\n")
+        outliers_network_recommend = {"outliers_networks": outliers_network,
+                                      "tests": chaos_tests_config['NETWORK']}
+        recommendations["outliers_network_recommendations"] = (
+            outliers_network_recommend)
 
-    logging.info("\n")
-    logging.info("Please check data in utilisation.txt for further analysis")
+    return [profiling, heatmap, recommendations]
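Taken together, the hunks above turn the analysis step into a pure function: thresholds are passed in rather than hardcoded at module level, and results come back as structured data instead of only being logged. A minimal sketch of how the refactored entry point can be driven (the import path follows the `kraken.chaos_recommender.analysis` module seen in these diffs; the telemetry file contents and threshold values are illustrative):

```python
# Illustrative sketch, not repository code. The file below mimics the
# whitespace-separated utilisation.txt written by the prometheus module.
from kraken.chaos_recommender.analysis import analysis

with open("utilisation.txt", "w") as f:
    f.write("service CPU CPU_LIMITS MEM MEM_LIMITS NETWORK\n")
    f.write("web 900 1000 600 1000 120\n")
    f.write("cart 100 1000 100 1000 80\n")

chaos_tests_config = {"CPU": ["node_cpu_hog"],
                      "MEM": ["node_memory_hog"],
                      "NETWORK": ["pod_network_chaos"]}

# Thresholds are now explicit parameters instead of module-level constants.
profiling, heatmap, recommendations = analysis(
    "utilisation.txt",
    chaos_tests_config,
    0.7,   # z-score threshold for outlier detection
    0.5,   # heatmap CPU threshold (usage compared to limits)
    0.5)   # heatmap memory threshold (usage compared to limits)
print(recommendations)
```

Because `analysis()` now returns `[profiling, heatmap, recommendations]`, callers such as the recommender CLI can serialize the result instead of scraping log output.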
Chaos recommender Prometheus module (kraken/chaos_recommender/prometheus.py):

@@ -1,6 +1,5 @@
 import logging
 
-import pandas
 from prometheus_api_client import PrometheusConnect
 import pandas as pd
 import urllib3

@@ -8,6 +7,7 @@ import urllib3
 
 saved_metrics_path = "./utilisation.txt"
 
+
 def convert_data_to_dataframe(data, label):
     df = pd.DataFrame()
     df['service'] = [item['metric']['pod'] for item in data]

@@ -25,6 +25,7 @@ def convert_data(data, service):
         result[pod_name] = value
     return result.get(service, '100000000000') # for those pods whose limits are not defined they can take as much resources, there assigning a very high value
 
+
 def save_utilization_to_file(cpu_data, cpu_limits_result, mem_data, mem_limits_result, network_data, filename):
     df_cpu = convert_data_to_dataframe(cpu_data, "CPU")
     merged_df = pd.DataFrame(columns=['service','CPU','CPU_LIMITS','MEM','MEM_LIMITS','NETWORK'])

@@ -39,8 +40,6 @@ def save_utilization_to_file(cpu_data, cpu_limits_result, mem_data, mem_limits_r
                                    "NETWORK" : convert_data(network_data, s)}, index=[0])
         merged_df = pd.concat([merged_df, new_row_df], ignore_index=True)
-
-
 
     # Convert columns to string
     merged_df['CPU'] = merged_df['CPU'].astype(str)
     merged_df['MEM'] = merged_df['MEM'].astype(str)

@@ -57,40 +56,39 @@ def save_utilization_to_file(cpu_data, cpu_limits_result, mem_data, mem_limits_r
 
     merged_df.to_csv(filename, sep='\t', index=False)
 
 
 def fetch_utilization_from_prometheus(prometheus_endpoint, auth_token, namespace, scrape_duration):
     urllib3.disable_warnings()
     prometheus = PrometheusConnect(url=prometheus_endpoint, headers={'Authorization':'Bearer {}'.format(auth_token)}, disable_ssl=True)
 
     # Fetch CPU utilization
+    logging.info("Fetching utilization")
     cpu_query = 'sum (rate (container_cpu_usage_seconds_total{image!="", namespace="%s"}[%s])) by (pod) *1000' % (namespace,scrape_duration)
-    logging.info(cpu_query)
     cpu_result = prometheus.custom_query(cpu_query)
-    cpu_data = cpu_result
 
 
     cpu_limits_query = '(sum by (pod) (kube_pod_container_resource_limits{resource="cpu", namespace="%s"}))*1000' %(namespace)
-    logging.info(cpu_limits_query)
     cpu_limits_result = prometheus.custom_query(cpu_limits_query)
 
 
     mem_query = 'sum by (pod) (avg_over_time(container_memory_usage_bytes{image!="", namespace="%s"}[%s]))' % (namespace, scrape_duration)
-    logging.info(mem_query)
     mem_result = prometheus.custom_query(mem_query)
-    mem_data = mem_result
 
     mem_limits_query = 'sum by (pod) (kube_pod_container_resource_limits{resource="memory", namespace="%s"}) ' %(namespace)
-    logging.info(mem_limits_query)
     mem_limits_result = prometheus.custom_query(mem_limits_query)
 
 
     network_query = 'sum by (pod) ((avg_over_time(container_network_transmit_bytes_total{namespace="%s"}[%s])) + \
                     (avg_over_time(container_network_receive_bytes_total{namespace="%s"}[%s])))' % (namespace, scrape_duration, namespace, scrape_duration)
     network_result = prometheus.custom_query(network_query)
-    logging.info(network_query)
-    network_data = network_result
-    save_utilization_to_file(cpu_data, cpu_limits_result, mem_data, mem_limits_result, network_data, saved_metrics_path)
-    return saved_metrics_path
+    save_utilization_to_file(cpu_result, cpu_limits_result, mem_result, mem_limits_result, network_result, saved_metrics_path)
+    queries = json_queries(cpu_query, cpu_limits_query, mem_query, mem_limits_query)
+    return saved_metrics_path, queries
 
 
+def json_queries(cpu_query, cpu_limits_query, mem_query, mem_limits_query):
+    queries = {
+        "cpu_query": cpu_query,
+        "cpu_limit_query": cpu_limits_query,
+        "memory_query": mem_query,
+        "memory_limit_query": mem_limits_query
+    }
+    return queries
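With this change `fetch_utilization_from_prometheus()` returns the metrics file path together with the PromQL queries it executed, so the caller can embed those queries in its JSON report. A hedged usage sketch (endpoint, token, and namespace are placeholders):

```python
# Illustrative sketch, not repository code.
import kraken.chaos_recommender.prometheus as prometheus

file_path, queries = prometheus.fetch_utilization_from_prometheus(
    "https://prometheus.example.com",  # placeholder endpoint
    "example-bearer-token",            # placeholder auth token
    "robot-shop",                      # placeholder namespace
    "10m")                             # scrape_duration

# queries maps the four query names to the PromQL strings built above:
# {"cpu_query": ..., "cpu_limit_query": ...,
#  "memory_query": ..., "memory_limit_query": ...}
print(file_path)  # ./utilisation.txt
```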
Prometheus alerts plugin:

@@ -1,10 +1,13 @@
 import datetime
 import os.path
+from typing import Optional
+
 import urllib3
 import logging
 import sys
 
 import yaml
+from krkn_lib.models.krkn import ChaosRunAlertSummary, ChaosRunAlert
 from krkn_lib.prometheus.krkn_prometheus import KrknPrometheus
 urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
 def alerts(prom_cli: KrknPrometheus, start_time, end_time, alert_profile):

@@ -27,4 +30,59 @@ def alerts(prom_cli: KrknPrometheus, start_time, end_time, alert_profile):
 
         prom_cli.process_alert(alert,
                                datetime.datetime.fromtimestamp(start_time),
                                datetime.datetime.fromtimestamp(end_time))
+
+
+def critical_alerts(prom_cli: KrknPrometheus,
+                    summary: ChaosRunAlertSummary,
+                    run_id,
+                    scenario,
+                    start_time,
+                    end_time):
+    summary.scenario = scenario
+    summary.run_id = run_id
+    query = r"""ALERTS{severity="critical"}"""
+    logging.info("Checking for critical alerts firing post chaos")
+
+    during_critical_alerts = prom_cli.process_prom_query_in_range(
+        query,
+        start_time=datetime.datetime.fromtimestamp(start_time),
+        end_time=end_time
+
+    )
+
+    for alert in during_critical_alerts:
+        if "metric" in alert:
+            alertname = alert["metric"]["alertname"] if "alertname" in alert["metric"] else "none"
+            alertstate = alert["metric"]["alertstate"] if "alertstate" in alert["metric"] else "none"
+            namespace = alert["metric"]["namespace"] if "namespace" in alert["metric"] else "none"
+            severity = alert["metric"]["severity"] if "severity" in alert["metric"] else "none"
+            alert = ChaosRunAlert(alertname, alertstate, namespace, severity)
+            summary.chaos_alerts.append(alert)
+
+
+    post_critical_alerts = prom_cli.process_query(
+        query
+    )
+
+    for alert in post_critical_alerts:
+        if "metric" in alert:
+            alertname = alert["metric"]["alertname"] if "alertname" in alert["metric"] else "none"
+            alertstate = alert["metric"]["alertstate"] if "alertstate" in alert["metric"] else "none"
+            namespace = alert["metric"]["namespace"] if "namespace" in alert["metric"] else "none"
+            severity = alert["metric"]["severity"] if "severity" in alert["metric"] else "none"
+            alert = ChaosRunAlert(alertname, alertstate, namespace, severity)
+            summary.post_chaos_alerts.append(alert)
+
+    during_critical_alerts_count = len(during_critical_alerts)
+    post_critical_alerts_count = len(post_critical_alerts)
+    firing_alerts = False
+
+    if during_critical_alerts_count > 0:
+        firing_alerts = True
+
+    if post_critical_alerts_count > 0:
+        firing_alerts = True
+
+    if not firing_alerts:
+        logging.info("No critical alerts are firing!!")
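The new `critical_alerts()` helper is driven from run_kraken.py later in this comparison; it fills a `ChaosRunAlertSummary` with alerts seen while the scenario ran (`chaos_alerts`) and alerts still firing afterwards (`post_chaos_alerts`). A minimal sketch, assuming the plugin's import path (the run id and scenario name are placeholders):

```python
# Illustrative sketch, not repository code.
import datetime
import time

from krkn_lib.models.krkn import ChaosRunAlertSummary
from krkn_lib.prometheus.krkn_prometheus import KrknPrometheus

import kraken.prometheus as prometheus_plugin  # import path assumed

prom_cli = KrknPrometheus("http://localhost:9090", None)  # url, bearer token
summary = ChaosRunAlertSummary()
start_time = int(time.time())
# ... chaos scenario runs here ...
prometheus_plugin.critical_alerts(prom_cli,
                                  summary,
                                  "0000-run-uuid",   # placeholder run_id
                                  "pod_scenarios",   # placeholder scenario
                                  start_time,
                                  datetime.datetime.now())

if summary.post_chaos_alerts:
    raise SystemExit("post-chaos critical alerts are firing")
```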
requirements.txt:

@@ -15,7 +15,7 @@ google-api-python-client==2.116.0
 ibm_cloud_sdk_core==3.18.0
 ibm_vpc==0.20.0
 jinja2==3.1.3
-krkn-lib==1.4.12
+krkn-lib==2.1.0
 lxml==5.1.0
 kubernetes==26.1.0
 oauth2client==4.1.3
run_kraken.py:

@@ -9,6 +9,8 @@ import optparse
 import pyfiglet
 import uuid
 import time
+
+from krkn_lib.models.krkn import ChaosRunOutput, ChaosRunAlertSummary
 from krkn_lib.prometheus.krkn_prometheus import KrknPrometheus
 import kraken.time_actions.common_time_functions as time_actions
 import kraken.performance_dashboards.setup as performance_dashboards

@@ -183,7 +185,7 @@ def main(cfg):
     telemetry_k8s = KrknTelemetryKubernetes(safe_logger, kubecli)
     telemetry_ocp = KrknTelemetryOpenshift(safe_logger, ocpcli)
     telemetry_elastic = KrknElastic(safe_logger,elastic_url)
+    summary = ChaosRunAlertSummary()
     if enable_alerts or check_critical_alerts:
         prometheus = KrknPrometheus(prometheus_url, prometheus_bearer_token)
 

@@ -215,8 +217,8 @@ def main(cfg):
 
     # Capture the start time
     start_time = int(time.time())
-    critical_alerts_count = 0
+    post_critical_alerts = 0
+    chaos_output = ChaosRunOutput()
     chaos_telemetry = ChaosRunTelemetry()
     chaos_telemetry.run_uuid = run_uuid
     # Loop to run the chaos starts here

@@ -347,22 +349,21 @@ def main(cfg):
             failed_post_scenarios, scenario_telemetries = network_chaos.run(scenarios_list, config, wait_duration, kubecli, telemetry_k8s)
 
         # Check for critical alerts when enabled
+        post_critical_alerts = 0
         if check_critical_alerts:
-            logging.info("Checking for critical alerts firing post choas")
-            ##PROM
-            query = r"""ALERTS{severity="critical"}"""
-            end_time = datetime.datetime.now()
-            critical_alerts = prometheus.process_query(
-                query
-            )
-            critical_alerts_count = len(critical_alerts)
-            if critical_alerts_count > 0:
-                logging.error("Critical alerts are firing: %s", critical_alerts)
-                logging.error("Please check, exiting")
+            prometheus_plugin.critical_alerts(prometheus,
+                                              summary,
+                                              run_uuid,
+                                              scenario_type,
+                                              start_time,
+                                              datetime.datetime.now())
+
+            chaos_output.critical_alerts = summary
+            post_critical_alerts = len(summary.post_chaos_alerts)
+            if post_critical_alerts > 0:
+                logging.error("Post chaos critical alerts firing please check, exiting")
                 break
-            else:
-                logging.info("No critical alerts are firing!!")
 
         iteration += 1
         logging.info("")

@@ -382,14 +383,18 @@ def main(cfg):
     telemetry_k8s.collect_cluster_metadata(chaos_telemetry)
 
     decoded_chaos_run_telemetry = ChaosRunTelemetry(json.loads(chaos_telemetry.to_json()))
-    logging.info(f"Telemetry data:\n{decoded_chaos_run_telemetry.to_json()}")
+    chaos_output.telemetry = decoded_chaos_run_telemetry
+    logging.info(f"Chaos data:\n{chaos_output.to_json()}")
     telemetry_elastic.upload_data_to_elasticsearch(decoded_chaos_run_telemetry.to_json(), elastic_index)
     if config["telemetry"]["enabled"]:
-        logging.info(f"telemetry data will be stored on s3 bucket folder: {telemetry_api_url}/download/{telemetry_request_id}")
+        logging.info(f'telemetry data will be stored on s3 bucket folder: {telemetry_api_url}/files/'
+                     f'{(config["telemetry"]["telemetry_group"] if config["telemetry"]["telemetry_group"] else "default")}/'
+                     f'{telemetry_request_id}')
         logging.info(f"telemetry upload log: {safe_logger.log_file_name}")
         try:
             telemetry_k8s.send_telemetry(config["telemetry"], telemetry_request_id, chaos_telemetry)
             telemetry_k8s.put_cluster_events(telemetry_request_id, config["telemetry"], start_time, end_time)
+            telemetry_k8s.put_critical_alerts(telemetry_request_id, config["telemetry"], summary)
             # prometheus data collection is available only on Openshift
             if config["telemetry"]["prometheus_backup"]:
                 prometheus_archive_files = ''

@@ -439,7 +444,7 @@ def main(cfg):
             logging.error("Alert profile is not defined")
             sys.exit(1)
 
-    if critical_alerts_count > 0:
+    if post_critical_alerts > 0:
         logging.error("Critical alerts are firing, please check; exiting")
         sys.exit(1)
 
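One behavioral consequence of the run_kraken.py hunks above: uploaded telemetry now lands under a `files/<telemetry_group>/` prefix instead of `download/`, with the group falling back to "default" when `telemetry_group` is unset. A small sketch of the resulting folder URL, mirroring the f-string added above (values are placeholders):

```python
# Illustrative sketch of the new S3 folder layout, not repository code.
telemetry_api_url = "https://ulnmf9xv7j.execute-api.us-west-2.amazonaws.com/production"
telemetry_group = ""                # config["telemetry"]["telemetry_group"]
telemetry_request_id = "run-uuid"   # placeholder request id

folder = (f"{telemetry_api_url}/files/"
          f"{telemetry_group if telemetry_group else 'default'}/"
          f"{telemetry_request_id}")
print(folder)  # .../files/default/run-uuid (previously .../download/run-uuid)
```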
Chaos recommender README:

@@ -20,6 +20,8 @@ This tool profiles an application and gathers telemetry data such as CPU, Memory
 $ git clone https://github.com/krkn-chaos/krkn.git
 $ cd krkn
 $ pip3 install -r requirements.txt
+Edit configuration file:
+$ vi config/recommender_config.yaml
 $ python3.9 utils/chaos_recommender/chaos_recommender.py
 ```
 

@@ -37,11 +39,16 @@ You can customize the default values by editing the `krkn/config/recommender_con
 - `auth_token`: Auth token to connect to prometheus endpoint (must).
 - `scrape_duration`: For how long data should be fetched, e.g., '1m' (must).
 - `chaos_library`: "kraken" (currently it only supports kraken).
+- `json_output_file`: True or False (by default False).
+- `json_output_folder_path`: Specify folder path where output should be saved. If empty the default path is used.
 - `chaos_tests`: (for output purpose only do not change if not needed)
   - `GENERAL`: list of general purpose tests available in Krkn
   - `MEM`: list of memory related tests available in Krkn
   - `NETWORK`: list of network related tests available in Krkn
   - `CPU`: list of memory related tests available in Krkn
+- `threshold`: Specify the threshold to use for comparison and identifying outliers
+- `cpu_threshold`: Specify the cpu threshold to compare with the cpu limits set on the pods and identify outliers
+- `mem_threshold`: Specify the memory threshold to compare with the memory limits set on the pods and identify outliers
 
 *TIP:* to collect prometheus endpoint and token from your OpenShift cluster you can run the following commands:
 ```

@@ -74,6 +81,8 @@ You can also provide the input values through command-line arguments launching t
                         Chaos library
   -L LOG_LEVEL, --log-level LOG_LEVEL
                         log level (DEBUG, INFO, WARNING, ERROR, CRITICAL
+  -J [FOLDER_PATH], --json-output-file [FOLDER_PATH]
+                        Create output file, the path to the folder can be specified, if not specified the default folder is used.
   -M MEM [MEM ...], --MEM MEM [MEM ...]
                         Memory related chaos tests (space separated list)
   -C CPU [CPU ...], --CPU CPU [CPU ...]

@@ -82,7 +91,12 @@ You can also provide the input values through command-line arguments launching t
                         Network related chaos tests (space separated list)
   -G GENERIC [GENERIC ...], --GENERIC GENERIC [GENERIC ...]
                         Memory related chaos tests (space separated list)
+  --threshold THRESHOLD
+                        Threshold
+  --cpu_threshold CPU_THRESHOLD
+                        CPU threshold to compare with the cpu limits
+  --mem_threshold MEM_THRESHOLD
+                        Memory threshold to compare with the memory limits
 ```
 
 If you provide the input values through command-line arguments, the corresponding config file inputs would be ignored.

@@ -97,7 +111,7 @@ After obtaining telemetry data, sourced either locally or from Prometheus, the t
 
 ## Customizing Thresholds and Options
 
-You can customize the thresholds and options used for data analysis by modifying the `krkn/kraken/chaos_recommender/analysis.py` file. For example, you can adjust the threshold for identifying outliers by changing the value of the `threshold` variable in the `identify_outliers` function.
+You can customize the thresholds and options used for data analysis and identifying the outliers by setting the threshold, cpu_threshold and mem_threshold parameters in the config.
 
 ## Additional Files
 
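The `json_output_file` option documented above produces a report whose shape is fixed by `analysis_json()` in the analysis module above and `make_json_output()` in chaos_recommender.py below. A sketch of the resulting `recommender_<namespace>_<timestamp>.json` (only the keys come from the code; every value here is made up):

```python
# Illustrative sketch of the report structure, not real output.
report = {
    "inputs": {
        "namespace": "robot-shop",
        "kubeconfig": "~/.kube/config",
        "prometheus_endpoint": "https://prometheus.example.com",
        "scrape_duration": "10m",
        "chaos_tests": {"CPU": ["node_cpu_hog"], "MEM": ["node_memory_hog"]},
        "threshold": ".7",
        "heatmap_cpu_threshold": ".5",
        "heatmap_mem_threshold": ".5",
    },
    "queries": {
        "cpu_query": "sum (rate (container_cpu_usage_seconds_total{...}[10m])) by (pod) *1000",
        "cpu_limit_query": "(sum by (pod) (kube_pod_container_resource_limits{...}))*1000",
        "memory_query": "sum by (pod) (avg_over_time(container_memory_usage_bytes{...}[10m]))",
        "memory_limit_query": "sum by (pod) (kube_pod_container_resource_limits{...})",
    },
    "profiling": {
        "cpu_outliers": ["web"],
        "memory_outliers": [],
        "network_outliers": ["cart"],
    },
    "heatmap_analysis": {
        "services_with_cpu_heatmap_above_threshold": ["web"],
        "services_with_mem_heatmap_above_threshold": [],
    },
    "recommendations": {
        "cpu_services_recommendations": {
            "services": ["web"],
            "tests": ["node_cpu_hog"],
        },
    },
}
```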
utils/chaos_recommender/chaos_recommender.py:

@@ -1,7 +1,9 @@
 import argparse
+import json
 import logging
 import os.path
 import sys
+import time
 import yaml
 # kraken module import for running the recommender
 # both from the root directory and the recommender

@@ -9,12 +11,13 @@ import yaml
 sys.path.insert(0, './')
 sys.path.insert(0, '../../')
 
+from krkn_lib.utils import get_yaml_item_value
 
 import kraken.chaos_recommender.analysis as analysis
 import kraken.chaos_recommender.prometheus as prometheus
 from kubernetes import config as kube_config
 
 
 
 def parse_arguments(parser):
 
     # command line options

@@ -27,6 +30,9 @@ def parse_arguments(parser):
     parser.add_argument("-s", "--scrape-duration", action="store", default="10m", help="Prometheus scrape duration")
     parser.add_argument("-L", "--log-level", action="store", default="INFO", help="log level (DEBUG, INFO, WARNING, ERROR, CRITICAL")
 
+    parser.add_argument("-J", "--json-output-file", default=False, nargs="?", action="store",
+                        help="Create output file, the path to the folder can be specified, if not specified the default folder is used")
+
     parser.add_argument("-M", "--MEM", nargs='+', action="store", default=[],
                         help="Memory related chaos tests (space separated list)")
     parser.add_argument("-C", "--CPU", nargs='+', action="store", default=[],

@@ -35,10 +41,13 @@ def parse_arguments(parser):
                         help="Network related chaos tests (space separated list)")
     parser.add_argument("-G", "--GENERIC", nargs='+', action="store", default=[],
                         help="Memory related chaos tests (space separated list)")
+    parser.add_argument("--threshold", action="store", default="", help="Threshold")
+    parser.add_argument("--cpu-threshold", action="store", default="", help="CPU threshold")
+    parser.add_argument("--mem-threshold", action="store", default="", help="Memory threshold")
 
     return parser.parse_args()
 
 
 def read_configuration(config_file_path):
     if not os.path.exists(config_file_path):
         logging.error(f"Config file not found: {config_file_path}")

@@ -48,15 +57,25 @@ def read_configuration(config_file_path):
         config = yaml.safe_load(config_file)
 
     log_level = config.get("log level", "INFO")
-    namespace = config.get("namespace", "")
-    kubeconfig = config.get("kubeconfig", kube_config.KUBE_CONFIG_DEFAULT_LOCATION)
-    prometheus_endpoint = config.get("prometheus_endpoint", "")
-    auth_token = config.get("auth_token", "")
-    scrape_duration = config.get("scrape_duration", "10m")
-    chaos_tests = config.get("chaos_tests" , {})
+    namespace = config.get("namespace")
+    kubeconfig = get_yaml_item_value(config, "kubeconfig", kube_config.KUBE_CONFIG_DEFAULT_LOCATION)
+
+    prometheus_endpoint = config.get("prometheus_endpoint")
+    auth_token = config.get("auth_token")
+    scrape_duration = get_yaml_item_value(config, "scrape_duration", "10m")
+    threshold = get_yaml_item_value(config, "threshold", ".7")
+    heatmap_cpu_threshold = get_yaml_item_value(config, "cpu_threshold", ".5")
+    heatmap_mem_threshold = get_yaml_item_value(config, "mem_threshold", ".3")
+    output_file = config.get("json_output_file", False)
+    if output_file is True:
+        output_path = config.get("json_output_folder_path")
+    else:
+        output_path = False
+    chaos_tests = config.get("chaos_tests", {})
     return (namespace, kubeconfig, prometheus_endpoint, auth_token, scrape_duration,
-            chaos_tests, log_level)
+            chaos_tests, log_level, threshold, heatmap_cpu_threshold,
+            heatmap_mem_threshold, output_path)
 
 
 def prompt_input(prompt, default_value):
     user_input = input(f"{prompt} [{default_value}]: ")

@@ -64,6 +83,44 @@ def prompt_input(prompt, default_value):
         return user_input
     return default_value
 
+
+def make_json_output(inputs, queries, analysis_data, output_path):
+    time_str = time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime())
+
+    data = {
+        "inputs": inputs,
+        "queries": queries,
+        "profiling": analysis_data[0],
+        "heatmap_analysis": analysis_data[1],
+        "recommendations": analysis_data[2]
+    }
+
+    logging.info(f"Summary\n{json.dumps(data, indent=4)}")
+
+    if output_path is not False:
+        file = f"recommender_{inputs['namespace']}_{time_str}.json"
+        path = f"{os.path.expanduser(output_path)}/{file}"
+
+        with open(path, "w") as json_output:
+            logging.info(f"Saving output file in {output_path} folder...")
+            json_output.write(json.dumps(data, indent=4))
+            logging.info(f"Recommendation output saved in {file}.")
+
+
+def json_inputs(namespace, kubeconfig, prometheus_endpoint, scrape_duration, chaos_tests, threshold, heatmap_cpu_threshold, heatmap_mem_threshold):
+    inputs = {
+        "namespace": namespace,
+        "kubeconfig": kubeconfig,
+        "prometheus_endpoint": prometheus_endpoint,
+        "scrape_duration": scrape_duration,
+        "chaos_tests": chaos_tests,
+        "threshold": threshold,
+        "heatmap_cpu_threshold": heatmap_cpu_threshold,
+        "heatmap_mem_threshold": heatmap_mem_threshold
+    }
+    return inputs
+
+
 def main():
     parser = argparse.ArgumentParser(description="Krkn Chaos Recommender Command-Line tool")
     args = parse_arguments(parser)

@@ -81,7 +138,11 @@ def main():
         auth_token,
         scrape_duration,
         chaos_tests,
-        log_level
+        log_level,
+        threshold,
+        heatmap_cpu_threshold,
+        heatmap_mem_threshold,
+        output_path
     ) = read_configuration(args.config_file)
 
     if args.options:

@@ -91,27 +152,35 @@ def main():
         scrape_duration = args.scrape_duration
         log_level = args.log_level
         prometheus_endpoint = args.prometheus_endpoint
+        output_path = args.json_output_file
         chaos_tests = {"MEM": args.MEM, "GENERIC": args.GENERIC, "CPU": args.CPU, "NETWORK": args.NETWORK}
+        threshold = args.threshold
+        heatmap_mem_threshold = args.mem_threshold
+        heatmap_cpu_threshold = args.cpu_threshold
 
-    if log_level not in ["DEBUG","INFO", "WARNING", "ERROR","CRITICAL"]:
+    if log_level not in ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]:
         logging.error(f"{log_level} not a valid log level")
         sys.exit(1)
 
     logging.basicConfig(level=log_level)
 
-    logging.info("============================INPUTS===================================")
-    logging.info(f"Namespace: {namespace}")
-    logging.info(f"Kubeconfig: {kubeconfig}")
-    logging.info(f"Prometheus endpoint: {prometheus_endpoint}")
-    logging.info(f"Scrape duration: {scrape_duration}")
-    for test in chaos_tests.keys():
-        logging.info(f"Chaos tests {test}: {chaos_tests[test]}")
-    logging.info("=====================================================================")
+    if output_path is not False:
+        if output_path is None:
+            output_path = "./recommender_output"
+            logging.info(f"Path for output file not specified. "
+                         f"Using default folder {output_path}")
+        if not os.path.exists(os.path.expanduser(output_path)):
+            logging.error(f"Folder {output_path} for output not found.")
+            sys.exit(1)
+    logging.info("Loading inputs...")
+    inputs = json_inputs(namespace, kubeconfig, prometheus_endpoint, scrape_duration, chaos_tests, threshold, heatmap_cpu_threshold, heatmap_mem_threshold)
     logging.info("Starting Analysis ...")
-    logging.info("Fetching the Telemetry data")
 
-    file_path = prometheus.fetch_utilization_from_prometheus(prometheus_endpoint, auth_token, namespace, scrape_duration)
-    analysis(file_path, chaos_tests)
+    file_path, queries = prometheus.fetch_utilization_from_prometheus(prometheus_endpoint, auth_token, namespace, scrape_duration)
+    analysis_data = analysis(file_path, chaos_tests, threshold, heatmap_cpu_threshold, heatmap_mem_threshold)
+
+    make_json_output(inputs, queries, analysis_data, output_path)
 
 
 if __name__ == "__main__":
     main()