mirror of https://github.com/krkn-chaos/krkn.git (synced 2026-02-16 19:09:53 +00:00)

Compare commits: 28 commits
| Author | SHA1 | Date |
|---|---|---|
|  | a1b81bd382 |  |
|  | 782440c8c4 |  |
|  | 7e2755cbb7 |  |
|  | 2babb53d6e |  |
|  | 85f76e9193 |  |
|  | 8bf21392f1 |  |
|  | 606fb60811 |  |
|  | fac7c3c6fb |  |
|  | 8dd9b30030 |  |
|  | 2d99f17aaf |  |
|  | 50742a793c |  |
|  | ba6a844544 |  |
|  | 7e7a917dba |  |
|  | b9c0bb39c7 |  |
|  | 706a886151 |  |
|  | a1cf9e2c00 |  |
|  | 0f5dfcb823 |  |
|  | 1e1015e6e7 |  |
|  | c71ce31779 |  |
|  | 1298f220a6 |  |
|  | 24059fb731 |  |
|  | ab951adb78 |  |
|  | a9a7fb7e51 |  |
|  | 5a8d5b0fe1 |  |
|  | c440dc4b51 |  |
|  | b174c51ee0 |  |
|  | fec0434ce1 |  |
|  | 1067d5ec8d |  |
**.gitignore** (vendored): 1 addition
```diff
@@ -16,6 +16,7 @@ __pycache__/*
 *.out
 kube-burner*
 kube_burner*
+recommender_*.json
 
 # Project files
 .ropeproject
```
```diff
@@ -29,7 +29,7 @@ tunings:
     daemon_mode: False # Iterations are set to infinity which means that the kraken will cause chaos forever.
 telemetry:
     enabled: False # enable/disables the telemetry collection feature
-    api_url: https://ulnmf9xv7j.execute-api.us-west-2.amazonaws.com/production #telemetry service endpoint
+    api_url: https://yvnn4rfoi7.execute-api.us-west-2.amazonaws.com/test #telemetry service endpoint
     username: $TELEMETRY_USERNAME # telemetry service username
     password: $TELEMETRY_PASSWORD # telemetry service password
     prometheus_namespace: 'prometheus-k8s' # prometheus namespace
@@ -49,3 +49,4 @@ telemetry:
     - "(\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}\\.\\d+Z).+" # 2023-09-15T11:20:36.123425532Z log
     oc_cli_path: /usr/bin/oc # optional, if not specified will be search in $PATH
     events_backup: True # enables/disables cluster events collection
+    telemetry_group: "funtests"
```
```diff
@@ -8,8 +8,14 @@ function finish {
 }
 
 function error {
-  echo "Error caught."
-  ERRORED=true
+  exit_code=$?
+  if [ $exit_code == 1 ]
+  then
+    echo "Error caught."
+    ERRORED=true
+  else
+    echo "Exit code greater than zero detected: $exit_code"
+  fi
 }
 
 function get_node {
```
```diff
@@ -14,16 +14,20 @@ function functional_test_telemetry {
   export RUN_TAG="funtest-telemetry"
   yq -i '.telemetry.enabled=True' CI/config/common_test_config.yaml
   yq -i '.telemetry.full_prometheus_backup=True' CI/config/common_test_config.yaml
+  yq -i '.performance_monitoring.check_critical_alerts=True' CI/config/common_test_config.yaml
+  yq -i '.performance_monitoring.prometheus_url="http://localhost:9090"' CI/config/common_test_config.yaml
   yq -i '.telemetry.run_tag=env(RUN_TAG)' CI/config/common_test_config.yaml
 
   export scenario_type="arcaflow_scenarios"
   export scenario_file="scenarios/arcaflow/cpu-hog/input.yaml"
   export post_config=""
   envsubst < CI/config/common_test_config.yaml > CI/config/telemetry.yaml
   python3 -m coverage run -a run_kraken.py -c CI/config/telemetry.yaml
-  RUN_FOLDER=`cat CI/out/test_telemetry.out | grep amazonaws.com | sed -rn "s#.*https:\/\/.*\/download/(.*)#\1#p"`
+  RUN_FOLDER=`cat CI/out/test_telemetry.out | grep amazonaws.com | sed -rn "s#.*https:\/\/.*\/files/(.*)#\1#p"`
   $AWS_CLI s3 ls "s3://$AWS_BUCKET/$RUN_FOLDER/" | awk '{ print $4 }' > s3_remote_files
   echo "checking if telemetry files are uploaded on s3"
   cat s3_remote_files | grep events-00.json || ( echo "FAILED: events-00.json not uploaded" && exit 1 )
+  cat s3_remote_files | grep critical-alerts-00.json || ( echo "FAILED: critical-alerts-00.json not uploaded" && exit 1 )
   cat s3_remote_files | grep prometheus-00.tar || ( echo "FAILED: prometheus backup not uploaded" && exit 1 )
   cat s3_remote_files | grep telemetry.json || ( echo "FAILED: telemetry.json not uploaded" && exit 1 )
   echo "all files uploaded!"
```
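The updated test greps the S3 listing for each expected artifact and fails fast when one is missing. A minimal Python sketch of the same check, assuming `s3_remote_files` was already produced by `aws s3 ls` as in the script above:

```python
import sys

# Artifacts the CI script greps for after a telemetry run.
REQUIRED = ["events-00.json", "critical-alerts-00.json", "prometheus-00.tar", "telemetry.json"]

def check_uploads(listing_path: str = "s3_remote_files") -> None:
    with open(listing_path) as f:
        uploaded = f.read()
    missing = [name for name in REQUIRED if name not in uploaded]
    if missing:
        print(f"FAILED: {', '.join(missing)} not uploaded")
        sys.exit(1)
    print("all files uploaded!")

if __name__ == "__main__":
    check_uploads()
```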
```diff
@@ -1,5 +1,4 @@
 # Krkn aka Kraken
 [](https://quay.io/repository/krkn-chaos/krkn?tab=tags&tag=latest)
-![](...)
 ![](...)
```
```diff
@@ -1,5 +1,5 @@
 kraken:
-    distribution: openshift # Distribution can be kubernetes or openshift
+    distribution: kubernetes # Distribution can be kubernetes or openshift
     kubeconfig_path: ~/.kube/config # Path to kubeconfig
     exit_on_failure: False # Exit when a post action scenario fails
     publish_kraken_status: True # Can be accessed at http://0.0.0.0:8081
```
```diff
@@ -15,7 +15,7 @@ kraken:
         - application_outages:
             - scenarios/openshift/app_outage.yaml
         - container_scenarios: # List of chaos pod scenarios to load
-            - - scenarios/openshift/container_etcd.yml
+            - - scenarios/openshift/container_etcd.yml
         - plugin_scenarios:
             - scenarios/openshift/etcd.yml
             - scenarios/openshift/regex_openshift_pod_kill.yml
```
```diff
@@ -23,7 +23,7 @@ kraken:
             - scenarios/openshift/network_chaos_ingress.yml
             - scenarios/openshift/prom_kill.yml
         - node_scenarios: # List of chaos node scenarios to load
-            - scenarios/openshift/node_scenarios_example.yml
+            - scenarios/openshift/node_scenarios_example.yml
         - plugin_scenarios:
             - scenarios/openshift/openshift-apiserver.yml
             - scenarios/openshift/openshift-kube-apiserver.yml
```
```diff
@@ -51,7 +51,7 @@ cerberus:
 performance_monitoring:
     deploy_dashboards: False # Install a mutable grafana and load the performance dashboards. Enable this only when running on OpenShift
     repo: "https://github.com/cloud-bulldozer/performance-dashboards.git"
-    prometheus_url: # The prometheus url/route is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes.
+    prometheus_url: # The prometheus url/route is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes.
     prometheus_bearer_token: # The bearer token is automatically obtained in case of OpenShift, please set it when the distribution is Kubernetes. This is needed to authenticate with prometheus.
     uuid: # uuid for the run is generated by default if not set
     enable_alerts: False # Runs the queries specified in the alert profile and displays the info or exits 1 when severity=error
```
```diff
@@ -65,14 +65,19 @@ telemetry:
     enabled: False # enable/disables the telemetry collection feature
     api_url: https://ulnmf9xv7j.execute-api.us-west-2.amazonaws.com/production #telemetry service endpoint
     username: username # telemetry service username
-    password: password # telemetry service password
+    password: password # telemetry service password
     prometheus_backup: True # enables/disables prometheus data collection
     prometheus_namespace: "" # namespace where prometheus is deployed (if distribution is kubernetes)
     prometheus_container_name: "" # name of the prometheus container name (if distribution is kubernetes)
     prometheus_pod_name: "" # name of the prometheus pod (if distribution is kubernetes)
     full_prometheus_backup: False # if is set to False only the /prometheus/wal folder will be downloaded.
     backup_threads: 5 # number of telemetry download/upload threads
     archive_path: /tmp # local path where the archive files will be temporarly stored
     max_retries: 0 # maximum number of upload retries (if 0 will retry forever)
     run_tag: '' # if set, this will be appended to the run folder in the bucket (useful to group the runs)
-    archive_size: 500000 # the size of the prometheus data archive size in KB. The lower the size of archive is
+    archive_size: 500000
+    telemetry_group: '' # if set will archive the telemetry in the S3 bucket on a folder named after the value, otherwise will use "default"
+    # the size of the prometheus data archive size in KB. The lower the size of archive is
+    # the higher the number of archive files will be produced and uploaded (and processed by backup_threads
+    # simultaneously).
+    # For unstable/slow connection is better to keep this value low
```
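The relocated comment spells out the `archive_size` tradeoff: a smaller archive size produces more chunk files, which are uploaded `backup_threads` at a time. A quick back-of-the-envelope sketch (sizes in KB; the helper name is ours, not krkn's):

```python
import math

def archive_plan(total_backup_kb: int, archive_size_kb: int = 500_000, backup_threads: int = 5):
    # Number of archive chunks the prometheus backup is split into.
    chunks = math.ceil(total_backup_kb / archive_size_kb)
    # Uploads proceed in waves of backup_threads parallel workers.
    waves = math.ceil(chunks / backup_threads)
    return chunks, waves

# A 10 GB prometheus folder with the defaults: 21 chunks, 5 upload waves.
print(archive_plan(10 * 1024 * 1024))
```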
```diff
@@ -85,6 +90,9 @@ telemetry:
     - "(\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}\\.\\d+Z).+" # 2023-09-15T11:20:36.123425532Z log
     oc_cli_path: /usr/bin/oc # optional, if not specified will be search in $PATH
     events_backup: True # enables/disables cluster events collection
+elastic:
+    elastic_url: "" # To track results in elasticsearch, give url to server here; will post telemetry details when url and index not blank
+    elastic_index: "" # Elastic search index pattern to post results to
```
```diff
@@ -77,3 +77,8 @@ telemetry:
     - "kinit (\\d+/\\d+/\\d+\\s\\d{2}:\\d{2}:\\d{2})\\s+" # kinit 2023/09/15 11:20:36 log
     - "(\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}\\.\\d+Z).+" # 2023-09-15T11:20:36.123425532Z log
     oc_cli_path: /usr/bin/oc # optional, if not specified will be search in $PATH
+
+elastic:
+    elastic_url: "" # To track results in elasticsearch, give url to server here; will post telemetry details when url and index not blank
+    elastic_index: "" # Elastic search index pattern to post results to
+
```
```diff
@@ -7,6 +7,8 @@ auth_token: <Auth_Token>
 scrape_duration: 10m
 chaos_library: "kraken"
 log_level: INFO
+json_output_file: False
+json_output_folder_path:
 
 # for output purpose only do not change if not needed
 chaos_tests:
@@ -26,4 +28,8 @@ chaos_tests:
     - pod_network_chaos
   MEM:
     - node_memory_hog
     - pvc_disk_fill
+
+threshold: .7
+cpu_threshold: .5
+mem_threshold: .5
```
```diff
@@ -12,7 +12,7 @@ COPY --from=azure-cli /usr/local/bin/az /usr/bin/az
 # Install dependencies
 RUN yum install -y git python39 python3-pip jq gettext wget && \
     python3.9 -m pip install -U pip && \
-    git clone https://github.com/krkn-chaos/krkn.git --branch v1.5.7 /root/kraken && \
+    git clone https://github.com/krkn-chaos/krkn.git --branch v1.5.10 /root/kraken && \
     mkdir -p /root/.kube && cd /root/kraken && \
     pip3.9 install -r requirements.txt && \
     pip3.9 install virtualenv && \
@@ -20,7 +20,7 @@ RUN yum install -y git python39 python3-pip jq gettext wget && \
 
 # Get Kubernetes and OpenShift clients from stable releases
 WORKDIR /tmp
-RUN wget https://mirror.openshift.com/pub/openshift-v4/clients/ocp/stable/openshift-client-linux.tar.gz && tar -xvf openshift-client-linux.tar.gz && cp oc /usr/local/bin/oc && cp kubectl /usr/local/bin/kubectl
+RUN wget https://mirror.openshift.com/pub/openshift-v4/clients/ocp/stable/openshift-client-linux.tar.gz && tar -xvf openshift-client-linux.tar.gz && cp oc /usr/local/bin/oc && cp oc /usr/bin/oc && cp kubectl /usr/local/bin/kubectl && cp kubectl /usr/bin/kubectl
 
 WORKDIR /root/kraken
```
```diff
@@ -14,7 +14,7 @@ COPY --from=azure-cli /usr/local/bin/az /usr/bin/az
 # Install dependencies
 RUN yum install -y git python39 python3-pip jq gettext wget && \
     python3.9 -m pip install -U pip && \
-    git clone https://github.com/redhat-chaos/krkn.git --branch v1.5.7 /root/kraken && \
+    git clone https://github.com/redhat-chaos/krkn.git --branch v1.5.10 /root/kraken && \
     mkdir -p /root/.kube && cd /root/kraken && \
     pip3.9 install -r requirements.txt && \
     pip3.9 install virtualenv && \
@@ -22,7 +22,7 @@ RUN yum install -y git python39 python3-pip jq gettext wget && \
 
 # Get Kubernetes and OpenShift clients from stable releases
 WORKDIR /tmp
-RUN wget https://mirror.openshift.com/pub/openshift-v4/clients/ocp/stable/openshift-client-linux.tar.gz && tar -xvf openshift-client-linux.tar.gz && cp oc /usr/local/bin/oc && cp kubectl /usr/local/bin/kubectl
+RUN wget https://mirror.openshift.com/pub/openshift-v4/clients/ocp/stable/openshift-client-linux.tar.gz && tar -xvf openshift-client-linux.tar.gz && cp oc /usr/local/bin/oc && cp oc /usr/bin/oc && cp kubectl /usr/local/bin/kubectl && cp kubectl /usr/bin/kubectl
 
 WORKDIR /root/kraken
```
````diff
@@ -22,7 +22,7 @@ the capabilities of the current supported scenarios.
 Pick the latest stable release to install [here](https://github.com/krkn-chaos/krkn/releases).
 ```
 $ git clone https://github.com/krkn-chaos/krkn.git --branch <release version>
-$ cd kraken
+$ cd krkn
 ```
 
 #### Install the dependencies
````
```diff
@@ -4,13 +4,10 @@ import pandas as pd
 import kraken.chaos_recommender.kraken_tests as kraken_tests
 import time
 
-threshold = .7  # Adjust the threshold as needed
-heatmap_cpu_threshold = .5
-heatmap_mem_threshold = .5
-
 KRAKEN_TESTS_PATH = "./kraken_chaos_tests.txt"
 
-#Placeholder, this should be done with topology
+
+# Placeholder, this should be done with topology
 def return_critical_services():
     return ["web", "cart"]
```
```diff
@@ -19,6 +16,7 @@ def load_telemetry_data(file_path):
     data = pd.read_csv(file_path, delimiter=r"\s+")
     return data
 
+
 def calculate_zscores(data):
     zscores = pd.DataFrame()
     zscores["Service"] = data["service"]
```
```diff
@@ -27,7 +25,8 @@ def calculate_zscores(data):
     zscores["Network"] = (data["NETWORK"] - data["NETWORK"].mean()) / data["NETWORK"].std()
     return zscores
 
-def identify_outliers(data):
+
+def identify_outliers(data, threshold):
     outliers_cpu = data[data["CPU"] > threshold]["Service"].tolist()
     outliers_memory = data[data["Memory"] > threshold]["Service"].tolist()
     outliers_network = data[data["Network"] > threshold]["Service"].tolist()
```
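`identify_outliers` now takes the threshold as a parameter instead of reading a module-level constant. For context, the z-score idea it implements, as a self-contained sketch with toy numbers (column names follow `calculate_zscores` above):

```python
import pandas as pd

data = pd.DataFrame({
    "service": ["web", "cart", "db", "auth"],
    "CPU": [950.0, 120.0, 110.0, 100.0],  # toy utilization values
})

zscores = pd.DataFrame()
zscores["Service"] = data["service"]
# Standardize: how many standard deviations each service sits from the mean.
zscores["CPU"] = (data["CPU"] - data["CPU"].mean()) / data["CPU"].std()

threshold = 0.7
outliers_cpu = zscores[zscores["CPU"] > threshold]["Service"].tolist()
print(outliers_cpu)  # ['web'], the only service far above the mean
```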
```diff
@@ -47,44 +46,64 @@ def get_services_above_heatmap_threshold(dataframe, cpu_threshold, mem_threshold
     return cpu_services, mem_services
 
 
-def analysis(file_path, chaos_tests_config):
+def analysis(file_path, chaos_tests_config, threshold, heatmap_cpu_threshold, heatmap_mem_threshold):
     # Load the telemetry data from file
     logging.info("Fetching the Telemetry data")
     data = load_telemetry_data(file_path)
 
     # Calculate Z-scores for CPU, Memory, and Network columns
     zscores = calculate_zscores(data)
 
     # Identify outliers
-    outliers_cpu, outliers_memory, outliers_network = identify_outliers(zscores)
+    logging.info("Identifying outliers")
+    outliers_cpu, outliers_memory, outliers_network = identify_outliers(zscores, threshold)
     cpu_services, mem_services = get_services_above_heatmap_threshold(data, heatmap_cpu_threshold, heatmap_mem_threshold)
 
     # Display the identified outliers
     logging.info("======================== Profiling ==================================")
     logging.info(f"CPU outliers: {outliers_cpu}")
     logging.info(f"Memory outliers: {outliers_memory}")
     logging.info(f"Network outliers: {outliers_network}")
     logging.info("===================== HeatMap Analysis ==============================")
+    analysis_data = analysis_json(outliers_cpu, outliers_memory,
+                                  outliers_network, cpu_services,
+                                  mem_services, chaos_tests_config)
+
+    if not cpu_services:
+        logging.info("There are no services that are using significant CPU compared to their assigned limits (infinite in case no limits are set).")
+    if not mem_services:
+        logging.info("There are no services that are using significant MEMORY compared to their assigned limits (infinite in case no limits are set).")
+    time.sleep(2)
+
+    logging.info("Please check data in utilisation.txt for further analysis")
+
+    return analysis_data
+
+
+def analysis_json(outliers_cpu, outliers_memory, outliers_network,
+                  cpu_services, mem_services, chaos_tests_config):
+
+    profiling = {
+        "cpu_outliers": outliers_cpu,
+        "memory_outliers": outliers_memory,
+        "network_outliers": outliers_network
+    }
+
+    heatmap = {
+        "services_with_cpu_heatmap_above_threshold": cpu_services,
+        "services_with_mem_heatmap_above_threshold": mem_services
+    }
+
+    recommendations = {}
+
     if cpu_services:
-        logging.info("Services with CPU_HEATMAP above threshold:", cpu_services)
-    else:
-        logging.info("There are no services that are using siginificant CPU compared to their assigned limits (infinite in case no limits are set).")
+        cpu_recommend = {"services": cpu_services,
+                         "tests": chaos_tests_config['CPU']}
+        recommendations["cpu_services_recommendations"] = cpu_recommend
 
     if mem_services:
-        logging.info("Services with MEM_HEATMAP above threshold:", mem_services)
-    else:
-        logging.info("There are no services that are using siginificant MEMORY compared to their assigned limits (infinite in case no limits are set).")
-    time.sleep(2)
-    logging.info("======================= Recommendations =============================")
-    if cpu_services:
-        logging.info(f"Recommended tests for {str(cpu_services)} :\n {chaos_tests_config['CPU']}")
-        logging.info("\n")
-    if mem_services:
-        logging.info(f"Recommended tests for {str(mem_services)} :\n {chaos_tests_config['MEM']}")
-        logging.info("\n")
+        mem_recommend = {"services": mem_services,
+                         "tests": chaos_tests_config['MEM']}
+        recommendations["mem_services_recommendations"] = mem_recommend
 
     if outliers_network:
-        logging.info(f"Recommended tests for str(outliers_network) :\n {chaos_tests_config['NETWORK']}")
-        logging.info("\n")
+        outliers_network_recommend = {"outliers_networks": outliers_network,
+                                      "tests": chaos_tests_config['NETWORK']}
+        recommendations["outliers_network_recommendations"] = (
+            outliers_network_recommend)
 
-    logging.info("\n")
-    logging.info("Please check data in utilisation.txt for further analysis")
+    return [profiling, heatmap, recommendations]
```
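`analysis` now returns `[profiling, heatmap, recommendations]` rather than only logging. A small sketch of how a caller can index that list, mirroring what `make_json_output` does further down (the values here are toy data):

```python
import json

# Shape returned by the new analysis(): profiling, heatmap, recommendations.
analysis_data = [
    {"cpu_outliers": ["web"], "memory_outliers": [], "network_outliers": []},
    {"services_with_cpu_heatmap_above_threshold": ["web"],
     "services_with_mem_heatmap_above_threshold": []},
    {"cpu_services_recommendations": {"services": ["web"], "tests": ["node_cpu_hog"]}},
]

report = {
    "profiling": analysis_data[0],
    "heatmap_analysis": analysis_data[1],
    "recommendations": analysis_data[2],
}
print(json.dumps(report, indent=4))
```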
```diff
@@ -1,6 +1,5 @@
 import logging
-
 import pandas
 from prometheus_api_client import PrometheusConnect
 import pandas as pd
 import urllib3
```
```diff
@@ -8,6 +7,7 @@ import urllib3
 
 saved_metrics_path = "./utilisation.txt"
 
+
 def convert_data_to_dataframe(data, label):
     df = pd.DataFrame()
     df['service'] = [item['metric']['pod'] for item in data]
```
```diff
@@ -25,6 +25,7 @@ def convert_data(data, service):
             result[pod_name] = value
     return result.get(service, '100000000000') # for those pods whose limits are not defined they can take as much resources, there assigning a very high value
 
+
 def save_utilization_to_file(cpu_data, cpu_limits_result, mem_data, mem_limits_result, network_data, filename):
     df_cpu = convert_data_to_dataframe(cpu_data, "CPU")
     merged_df = pd.DataFrame(columns=['service','CPU','CPU_LIMITS','MEM','MEM_LIMITS','NETWORK'])
```
```diff
@@ -39,8 +40,6 @@ def save_utilization_to_file(cpu_data, cpu_limits_result, mem_data, mem_limits_r
                                    "NETWORK" : convert_data(network_data, s)}, index=[0])
         merged_df = pd.concat([merged_df, new_row_df], ignore_index=True)
 
-
-
     # Convert columns to string
     merged_df['CPU'] = merged_df['CPU'].astype(str)
     merged_df['MEM'] = merged_df['MEM'].astype(str)
```
```diff
@@ -57,40 +56,39 @@ def save_utilization_to_file(cpu_data, cpu_limits_result, mem_data, mem_limits_r
 
     merged_df.to_csv(filename, sep='\t', index=False)
 
 
 def fetch_utilization_from_prometheus(prometheus_endpoint, auth_token, namespace, scrape_duration):
     urllib3.disable_warnings()
     prometheus = PrometheusConnect(url=prometheus_endpoint, headers={'Authorization':'Bearer {}'.format(auth_token)}, disable_ssl=True)
 
     # Fetch CPU utilization
     logging.info("Fetching utilization")
     cpu_query = 'sum (rate (container_cpu_usage_seconds_total{image!="", namespace="%s"}[%s])) by (pod) *1000' % (namespace,scrape_duration)
     logging.info(cpu_query)
     cpu_result = prometheus.custom_query(cpu_query)
-    cpu_data = cpu_result
 
     cpu_limits_query = '(sum by (pod) (kube_pod_container_resource_limits{resource="cpu", namespace="%s"}))*1000' %(namespace)
     logging.info(cpu_limits_query)
     cpu_limits_result = prometheus.custom_query(cpu_limits_query)
 
     mem_query = 'sum by (pod) (avg_over_time(container_memory_usage_bytes{image!="", namespace="%s"}[%s]))' % (namespace, scrape_duration)
     logging.info(mem_query)
     mem_result = prometheus.custom_query(mem_query)
-    mem_data = mem_result
 
     mem_limits_query = 'sum by (pod) (kube_pod_container_resource_limits{resource="memory", namespace="%s"}) ' %(namespace)
     logging.info(mem_limits_query)
     mem_limits_result = prometheus.custom_query(mem_limits_query)
 
     network_query = 'sum by (pod) ((avg_over_time(container_network_transmit_bytes_total{namespace="%s"}[%s])) + \
                     (avg_over_time(container_network_receive_bytes_total{namespace="%s"}[%s])))' % (namespace, scrape_duration, namespace, scrape_duration)
     network_result = prometheus.custom_query(network_query)
     logging.info(network_query)
-    network_data = network_result
 
-    save_utilization_to_file(cpu_data, cpu_limits_result, mem_data, mem_limits_result, network_data, saved_metrics_path)
-    return saved_metrics_path
+    save_utilization_to_file(cpu_result, cpu_limits_result, mem_result, mem_limits_result, network_result, saved_metrics_path)
+    queries = json_queries(cpu_query, cpu_limits_query, mem_query, mem_limits_query)
+    return saved_metrics_path, queries
+
+
+def json_queries(cpu_query, cpu_limits_query, mem_query, mem_limits_query):
+    queries = {
+        "cpu_query": cpu_query,
+        "cpu_limit_query": cpu_limits_query,
+        "memory_query": mem_query,
+        "memory_limit_query": mem_limits_query
+    }
+    return queries
```
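The module drives all of these queries through `prometheus_api_client`, which it already imports. The same pattern in isolation, as a sketch (the endpoint, token, and namespace are placeholders):

```python
from prometheus_api_client import PrometheusConnect

endpoint = "https://prometheus.example.com"  # placeholder
auth_token = "<token>"                       # placeholder
namespace = "my-namespace"                   # placeholder

prometheus = PrometheusConnect(
    url=endpoint,
    headers={"Authorization": f"Bearer {auth_token}"},
    disable_ssl=True,
)

# Same shape as the recommender's CPU query: per-pod CPU usage in millicores.
cpu_query = 'sum (rate (container_cpu_usage_seconds_total{image!="", namespace="%s"}[10m])) by (pod) * 1000' % namespace
for sample in prometheus.custom_query(cpu_query):
    # Instant vectors come back as {"metric": {...}, "value": [timestamp, value]}.
    print(sample["metric"]["pod"], sample["value"][1])
```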
```diff
@@ -213,6 +213,12 @@ PLUGINS = Plugins(
             "error"
         ]
     ),
+    PluginStep(
+        network_chaos,
+        [
+            "error"
+        ]
+    ),
     PluginStep(
         pod_outage,
         [
```
```diff
@@ -62,7 +62,7 @@ class NetworkScenarioConfig:
         typing.Optional[int],
         validation.min(1)
     ] = field(
-        default=300,
+        default=30,
         metadata={
             "name": "Wait Duration",
             "description":
```
```diff
@@ -864,7 +864,7 @@ def network_chaos(cfg: NetworkScenarioConfig) -> typing.Tuple[
         )
         logging.info("Waiting for parallel job to finish")
         start_time = int(time.time())
-        wait_for_job(batch_cli, job_list[:], cfg.wait_duration)
+        wait_for_job(batch_cli, job_list[:], cfg.test_duration+100)
         end_time = int(time.time())
         if publish:
             cerberus.publish_kraken_status(
@@ -893,7 +893,7 @@ def network_chaos(cfg: NetworkScenarioConfig) -> typing.Tuple[
             )
             logging.info("Waiting for serial job to finish")
             start_time = int(time.time())
-            wait_for_job(batch_cli, job_list[:], cfg.wait_duration)
+            wait_for_job(batch_cli, job_list[:], cfg.test_duration+100)
             logging.info("Deleting jobs")
             delete_jobs(cli, batch_cli, job_list[:])
             job_list = []
```
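Both call sites now bound the wait by the scenario's own `test_duration` plus a 100-second grace period instead of the generic `wait_duration`. The polling pattern in the abstract, as a sketch (the `is_complete` callable stands in for a Kubernetes batch API status check; krkn's `wait_for_job` has its own signature):

```python
import time
from typing import Callable, Iterable

def wait_for_jobs(job_names: Iterable[str],
                  timeout_seconds: float,
                  is_complete: Callable[[str], bool],
                  poll_interval: float = 5.0) -> None:
    """Poll until every job reports complete or the deadline passes."""
    deadline = time.time() + timeout_seconds
    pending = set(job_names)
    while pending and time.time() < deadline:
        pending = {name for name in pending if not is_complete(name)}
        if pending:
            time.sleep(poll_interval)
    if pending:
        raise TimeoutError(f"jobs still running after {timeout_seconds}s: {pending}")

# Usage sketch: the timeout mirrors the new bound, test_duration + 100.
wait_for_jobs(["job-a"], timeout_seconds=1, is_complete=lambda name: True)
```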
```diff
@@ -1,10 +1,13 @@
 import datetime
 import os.path
 from typing import Optional
 
+import urllib3
 import logging
 import sys
 
 import yaml
+from krkn_lib.models.krkn import ChaosRunAlertSummary, ChaosRunAlert
 from krkn_lib.prometheus.krkn_prometheus import KrknPrometheus
+urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
 
 def alerts(prom_cli: KrknPrometheus, start_time, end_time, alert_profile):
```
```diff
@@ -27,4 +30,59 @@ def alerts(prom_cli: KrknPrometheus, start_time, end_time, alert_profile):
 
         prom_cli.process_alert(alert,
                                datetime.datetime.fromtimestamp(start_time),
-                               datetime.datetime.fromtimestamp(end_time))
+                               datetime.datetime.fromtimestamp(end_time))
+
+
+def critical_alerts(prom_cli: KrknPrometheus,
+                    summary: ChaosRunAlertSummary,
+                    run_id,
+                    scenario,
+                    start_time,
+                    end_time):
+    summary.scenario = scenario
+    summary.run_id = run_id
+    query = r"""ALERTS{severity="critical"}"""
+    logging.info("Checking for critical alerts firing post chaos")
+
+    during_critical_alerts = prom_cli.process_prom_query_in_range(
+        query,
+        start_time=datetime.datetime.fromtimestamp(start_time),
+        end_time=end_time
+    )
+
+    for alert in during_critical_alerts:
+        if "metric" in alert:
+            alertname = alert["metric"]["alertname"] if "alertname" in alert["metric"] else "none"
+            alertstate = alert["metric"]["alertstate"] if "alertstate" in alert["metric"] else "none"
+            namespace = alert["metric"]["namespace"] if "namespace" in alert["metric"] else "none"
+            severity = alert["metric"]["severity"] if "severity" in alert["metric"] else "none"
+            alert = ChaosRunAlert(alertname, alertstate, namespace, severity)
+            summary.chaos_alerts.append(alert)
+
+    post_critical_alerts = prom_cli.process_query(
+        query
+    )
+
+    for alert in post_critical_alerts:
+        if "metric" in alert:
+            alertname = alert["metric"]["alertname"] if "alertname" in alert["metric"] else "none"
+            alertstate = alert["metric"]["alertstate"] if "alertstate" in alert["metric"] else "none"
+            namespace = alert["metric"]["namespace"] if "namespace" in alert["metric"] else "none"
+            severity = alert["metric"]["severity"] if "severity" in alert["metric"] else "none"
+            alert = ChaosRunAlert(alertname, alertstate, namespace, severity)
+            summary.post_chaos_alerts.append(alert)
+
+    during_critical_alerts_count = len(during_critical_alerts)
+    post_critical_alerts_count = len(post_critical_alerts)
+    firing_alerts = False
+
+    if during_critical_alerts_count > 0:
+        firing_alerts = True
+
+    if post_critical_alerts_count > 0:
+        firing_alerts = True
+
+    if not firing_alerts:
+        logging.info("No critical alerts are firing!!")
```
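Each alert series comes back from the Prometheus client as a dict with a `metric` label map, which the chained `if`s above flatten field by field. The shape of one such sample, and the equivalent lookups with `dict.get` defaults (a sketch with made-up label values, not the module's code):

```python
# One series from ALERTS{severity="critical"}, as returned by a range query.
sample = {
    "metric": {
        "alertname": "KubeAPIDown",
        "alertstate": "firing",
        "namespace": "openshift-kube-apiserver",
        "severity": "critical",
    },
    "values": [[1694775636, "1"]],
}

metric = sample.get("metric", {})
alertname = metric.get("alertname", "none")
alertstate = metric.get("alertstate", "none")
namespace = metric.get("namespace", "none")
severity = metric.get("severity", "none")
print(alertname, alertstate, namespace, severity)
```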
```diff
@@ -10,13 +10,12 @@ itsdangerous==2.0.1
 coverage==7.4.1
 datetime==5.4
 docker==7.0.0
-docker-compose==1.29.2
 gitpython==3.1.41
 google-api-python-client==2.116.0
 ibm_cloud_sdk_core==3.18.0
 ibm_vpc==0.20.0
 jinja2==3.1.3
-krkn-lib==1.4.9
+krkn-lib==2.1.0
 lxml==5.1.0
 kubernetes==26.1.0
 oauth2client==4.1.3
@@ -31,11 +30,12 @@ python-ipmi==0.5.4
 python-openstackclient==6.5.0
 requests==2.31.0
 service_identity==24.1.0
-PyYAML==5.4.1
+PyYAML==6.0
 setuptools==65.5.1
 werkzeug==3.0.1
 wheel==0.42.0
 zope.interface==5.4.0
 
 git+https://github.com/krkn-chaos/arcaflow-plugin-kill-pod.git
-git+https://github.com/vmware/vsphere-automation-sdk-python.git@v8.0.0.0
+git+https://github.com/vmware/vsphere-automation-sdk-python.git@v8.0.0.0
+cryptography>=42.0.4 # not directly required, pinned by Snyk to avoid a vulnerability
```
```diff
@@ -9,6 +9,8 @@ import optparse
 import pyfiglet
 import uuid
 import time
+from krkn_lib.models.krkn import ChaosRunOutput, ChaosRunAlertSummary
+from krkn_lib.prometheus.krkn_prometheus import KrknPrometheus
 import kraken.time_actions.common_time_functions as time_actions
 import kraken.performance_dashboards.setup as performance_dashboards
@@ -27,6 +29,7 @@ import server as server
 from kraken import plugins
 from krkn_lib.k8s import KrknKubernetes
 from krkn_lib.ocp import KrknOpenshift
+from krkn_lib.telemetry.elastic import KrknElastic
 from krkn_lib.telemetry.k8s import KrknTelemetryKubernetes
 from krkn_lib.telemetry.ocp import KrknTelemetryOpenshift
 from krkn_lib.models.telemetry import ChaosRunTelemetry
@@ -94,6 +97,9 @@ def main(cfg):
         config["performance_monitoring"], "check_critical_alerts", False
     )
     telemetry_api_url = config["telemetry"].get("api_url")
+    elastic_config = get_yaml_item_value(config,"elastic",{})
+    elastic_url = get_yaml_item_value(elastic_config,"elastic_url","")
+    elastic_index = get_yaml_item_value(elastic_config,"elastic_index","")
 
     # Initialize clients
     if (not os.path.isfile(kubeconfig_path) and
@@ -129,8 +135,6 @@ def main(cfg):
         except:
             kubecli.initialize_clients(None)
 
-
-
     # find node kraken might be running on
     kubecli.find_kraken_node()
```
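`get_yaml_item_value` comes from `krkn_lib.utils` and is used here to fall back to a default when a key is absent. A local stand-in with the same spirit (a sketch, not the library's exact semantics):

```python
def get_item_value(config: dict, key: str, default):
    """Return config[key] unless it is missing or empty, else the default."""
    value = config.get(key)
    return value if value not in (None, "") else default

config = {"elastic": {"elastic_url": ""}}
elastic_config = get_item_value(config, "elastic", {})
elastic_url = get_item_value(elastic_config, "elastic_url", "")
elastic_index = get_item_value(elastic_config, "elastic_index", "")
print(repr(elastic_url), repr(elastic_index))  # both fall back to ""
```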
```diff
@@ -161,8 +165,13 @@ def main(cfg):
         if prometheus_url is None:
             try:
                 connection_data = ocpcli.get_prometheus_api_connection_data()
-                prometheus_url = connection_data.endpoint
-                prometheus_bearer_token = connection_data.token
+                if connection_data:
+                    prometheus_url = connection_data.endpoint
+                    prometheus_bearer_token = connection_data.token
+                else:
+                    # If can't make a connection, set alerts to false
+                    enable_alerts = False
+                    critical_alerts = False
             except Exception:
                 logging.error("invalid distribution selected, running openshift scenarios against kubernetes cluster."
                               "Please set 'kubernetes' in config.yaml krkn.platform and try again")
```
```diff
@@ -175,9 +184,9 @@ def main(cfg):
     # KrknTelemetry init
     telemetry_k8s = KrknTelemetryKubernetes(safe_logger, kubecli)
     telemetry_ocp = KrknTelemetryOpenshift(safe_logger, ocpcli)
-
-
-    if enable_alerts:
+    telemetry_elastic = KrknElastic(safe_logger,elastic_url)
+    summary = ChaosRunAlertSummary()
+    if enable_alerts or check_critical_alerts:
         prometheus = KrknPrometheus(prometheus_url, prometheus_bearer_token)
 
     logging.info("Server URL: %s" % kubecli.get_host())
```
```diff
@@ -208,7 +217,8 @@ def main(cfg):
 
     # Capture the start time
     start_time = int(time.time())
-
+    post_critical_alerts = 0
+
     chaos_output = ChaosRunOutput()
     chaos_telemetry = ChaosRunTelemetry()
     chaos_telemetry.run_uuid = run_uuid
     # Loop to run the chaos starts here
```
```diff
@@ -339,25 +349,21 @@ def main(cfg):
                 failed_post_scenarios, scenario_telemetries = network_chaos.run(scenarios_list, config, wait_duration, kubecli, telemetry_k8s)
 
             # Check for critical alerts when enabled
-            if enable_alerts and check_critical_alerts :
-                logging.info("Checking for critical alerts firing post choas")
-                ##PROM
-                query = r"""ALERTS{severity="critical"}"""
-                end_time = datetime.datetime.now()
-                critical_alerts = prometheus.process_prom_query_in_range(
-                    query,
-                    start_time = datetime.datetime.fromtimestamp(start_time),
-                    end_time = end_time
-                )
-                critical_alerts_count = len(critical_alerts)
-                if critical_alerts_count > 0:
-                    logging.error("Critical alerts are firing: %s", critical_alerts)
-                    logging.error("Please check, exiting")
-                    sys.exit(1)
-                else:
-                    logging.info("No critical alerts are firing!!")
+            post_critical_alerts = 0
+            if check_critical_alerts:
+                prometheus_plugin.critical_alerts(prometheus,
+                                                  summary,
+                                                  run_uuid,
+                                                  scenario_type,
+                                                  start_time,
+                                                  datetime.datetime.now())
+                chaos_output.critical_alerts = summary
+                post_critical_alerts = len(summary.post_chaos_alerts)
+                if post_critical_alerts > 0:
+                    logging.error("Post chaos critical alerts firing please check, exiting")
+                    break
 
             iteration += 1
             logging.info("")
```
```diff
@@ -377,14 +383,18 @@ def main(cfg):
         telemetry_k8s.collect_cluster_metadata(chaos_telemetry)
 
         decoded_chaos_run_telemetry = ChaosRunTelemetry(json.loads(chaos_telemetry.to_json()))
-        logging.info(f"Telemetry data:\n{decoded_chaos_run_telemetry.to_json()}")
+        chaos_output.telemetry = decoded_chaos_run_telemetry
+        logging.info(f"Chaos data:\n{chaos_output.to_json()}")
+        telemetry_elastic.upload_data_to_elasticsearch(decoded_chaos_run_telemetry.to_json(), elastic_index)
         if config["telemetry"]["enabled"]:
-            logging.info(f"telemetry data will be stored on s3 bucket folder: {telemetry_api_url}/download/{telemetry_request_id}")
+            logging.info(f'telemetry data will be stored on s3 bucket folder: {telemetry_api_url}/files/'
+                         f'{(config["telemetry"]["telemetry_group"] if config["telemetry"]["telemetry_group"] else "default")}/'
+                         f'{telemetry_request_id}')
             logging.info(f"telemetry upload log: {safe_logger.log_file_name}")
             try:
                 telemetry_k8s.send_telemetry(config["telemetry"], telemetry_request_id, chaos_telemetry)
                 telemetry_k8s.put_cluster_events(telemetry_request_id, config["telemetry"], start_time, end_time)
+                telemetry_k8s.put_critical_alerts(telemetry_request_id, config["telemetry"], summary)
                 # prometheus data collection is available only on Openshift
                 if config["telemetry"]["prometheus_backup"]:
                     prometheus_archive_files = ''
@@ -434,11 +444,15 @@ def main(cfg):
             logging.error("Alert profile is not defined")
             sys.exit(1)
 
+    if post_critical_alerts > 0:
+        logging.error("Critical alerts are firing, please check; exiting")
+        sys.exit(2)
+
     if failed_post_scenarios:
         logging.error(
             "Post scenarios are still failing at the end of all iterations"
         )
-        sys.exit(1)
+        sys.exit(2)
 
     logging.info(
         "Successfully finished running Kraken. UUID for the run: "
```
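The reworked log line derives the bucket folder from the optional `telemetry_group`, defaulting to `default`. The same path logic in isolation (the URL and run id are placeholders):

```python
def telemetry_folder(api_url: str, telemetry_group: str, request_id: str) -> str:
    # An empty or unset group falls back to the "default" folder, as in the new log line.
    group = telemetry_group if telemetry_group else "default"
    return f"{api_url}/files/{group}/{request_id}"

print(telemetry_folder("https://example.execute-api.us-west-2.amazonaws.com/production", "", "run-1234"))
# https://example.execute-api.us-west-2.amazonaws.com/production/files/default/run-1234
```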
```diff
@@ -4,7 +4,7 @@ deployers:
   connection: {}
   deployer_name: kubernetes
 log:
-  level: debug
+  level: error
 logged_outputs:
   error:
     level: error
```

```diff
@@ -3,7 +3,7 @@ deployers:
   connection: {}
   deployer_name: kubernetes
 log:
-  level: debug
+  level: error
 logged_outputs:
   error:
     level: error
```

```diff
@@ -4,7 +4,7 @@ deployers:
   connection: {}
   deployer_name: kubernetes
 log:
-  level: debug
+  level: error
 logged_outputs:
   error:
     level: error
```
````diff
@@ -20,6 +20,8 @@ This tool profiles an application and gathers telemetry data such as CPU, Memory
 $ git clone https://github.com/krkn-chaos/krkn.git
 $ cd krkn
 $ pip3 install -r requirements.txt
+Edit configuration file:
+$ vi config/recommender_config.yaml
 $ python3.9 utils/chaos_recommender/chaos_recommender.py
 ```
````
````diff
@@ -37,11 +39,16 @@ You can customize the default values by editing the `krkn/config/recommender_config.yaml`
 - `auth_token`: Auth token to connect to prometheus endpoint (must).
 - `scrape_duration`: For how long data should be fetched, e.g., '1m' (must).
 - `chaos_library`: "kraken" (currently it only supports kraken).
+- `json_output_file`: True or False (by default False).
+- `json_output_folder_path`: Specify folder path where output should be saved. If empty the default path is used.
 - `chaos_tests`: (for output purpose only do not change if not needed)
   - `GENERAL`: list of general purpose tests available in Krkn
   - `MEM`: list of memory related tests available in Krkn
   - `NETWORK`: list of network related tests available in Krkn
   - `CPU`: list of memory related tests available in Krkn
+- `threshold`: Specify the threshold to use for comparison and identifying outliers
+- `cpu_threshold`: Specify the cpu threshold to compare with the cpu limits set on the pods and identify outliers
+- `mem_threshold`: Specify the memory threshold to compare with the memory limits set on the pods and identify outliers
 
 *TIP:* to collect prometheus endpoint and token from your OpenShift cluster you can run the following commands:
 ```
````
````diff
@@ -74,6 +81,8 @@ You can also provide the input values through command-line arguments launching the
                         Chaos library
   -L LOG_LEVEL, --log-level LOG_LEVEL
                         log level (DEBUG, INFO, WARNING, ERROR, CRITICAL
+  -J [FOLDER_PATH], --json-output-file [FOLDER_PATH]
+                        Create output file, the path to the folder can be specified, if not specified the default folder is used.
   -M MEM [MEM ...], --MEM MEM [MEM ...]
                         Memory related chaos tests (space separated list)
   -C CPU [CPU ...], --CPU CPU [CPU ...]
@@ -82,7 +91,12 @@ You can also provide the input values through command-line arguments launching the
                         Network related chaos tests (space separated list)
   -G GENERIC [GENERIC ...], --GENERIC GENERIC [GENERIC ...]
                         Memory related chaos tests (space separated list)
-
+  --threshold THRESHOLD
+                        Threshold
+  --cpu_threshold CPU_THRESHOLD
+                        CPU threshold to compare with the cpu limits
+  --mem_threshold MEM_THRESHOLD
+                        Memory threshold to compare with the memory limits
 ```
 
 If you provide the input values through command-line arguments, the corresponding config file inputs would be ignored.
````
```diff
@@ -97,7 +111,7 @@ After obtaining telemetry data, sourced either locally or from Prometheus, the tool
 
 ## Customizing Thresholds and Options
 
-You can customize the thresholds and options used for data analysis by modifying the `krkn/kraken/chaos_recommender/analysis.py` file. For example, you can adjust the threshold for identifying outliers by changing the value of the `threshold` variable in the `identify_outliers` function.
+You can customize the thresholds and options used for data analysis and identifying the outliers by setting the threshold, cpu_threshold and mem_threshold parameters in the config.
 
 ## Additional Files
```
```diff
@@ -1,7 +1,9 @@
 import argparse
+import json
 import logging
 import os.path
 import sys
+import time
 import yaml
 # kraken module import for running the recommender
 # both from the root directory and the recommender
@@ -9,12 +11,13 @@ import yaml
 sys.path.insert(0, './')
 sys.path.insert(0, '../../')
 
+from krkn_lib.utils import get_yaml_item_value
+
 import kraken.chaos_recommender.analysis as analysis
 import kraken.chaos_recommender.prometheus as prometheus
 from kubernetes import config as kube_config
 
-
 def parse_arguments(parser):
     # command line options
@@ -27,6 +30,9 @@ def parse_arguments(parser):
     parser.add_argument("-s", "--scrape-duration", action="store", default="10m", help="Prometheus scrape duration")
     parser.add_argument("-L", "--log-level", action="store", default="INFO", help="log level (DEBUG, INFO, WARNING, ERROR, CRITICAL")
 
+    parser.add_argument("-J", "--json-output-file", default=False, nargs="?", action="store",
+                        help="Create output file, the path to the folder can be specified, if not specified the default folder is used")
+
     parser.add_argument("-M", "--MEM", nargs='+', action="store", default=[],
                         help="Memory related chaos tests (space separated list)")
     parser.add_argument("-C", "--CPU", nargs='+', action="store", default=[],
@@ -35,10 +41,13 @@ def parse_arguments(parser):
                         help="Network related chaos tests (space separated list)")
     parser.add_argument("-G", "--GENERIC", nargs='+', action="store", default=[],
                         help="Memory related chaos tests (space separated list)")
+    parser.add_argument("--threshold", action="store", default="", help="Threshold")
+    parser.add_argument("--cpu-threshold", action="store", default="", help="CPU threshold")
+    parser.add_argument("--mem-threshold", action="store", default="", help="Memory threshold")
 
     return parser.parse_args()
 
 
 def read_configuration(config_file_path):
     if not os.path.exists(config_file_path):
         logging.error(f"Config file not found: {config_file_path}")
```
```diff
@@ -48,15 +57,25 @@ def read_configuration(config_file_path):
         config = yaml.safe_load(config_file)
 
     log_level = config.get("log level", "INFO")
-    namespace = config.get("namespace", "")
-    kubeconfig = config.get("kubeconfig", kube_config.KUBE_CONFIG_DEFAULT_LOCATION)
+    namespace = config.get("namespace")
+    kubeconfig = get_yaml_item_value(config, "kubeconfig", kube_config.KUBE_CONFIG_DEFAULT_LOCATION)
 
-    prometheus_endpoint = config.get("prometheus_endpoint", "")
-    auth_token = config.get("auth_token", "")
-    scrape_duration = config.get("scrape_duration", "10m")
-    chaos_tests = config.get("chaos_tests" , {})
+    prometheus_endpoint = config.get("prometheus_endpoint")
+    auth_token = config.get("auth_token")
+    scrape_duration = get_yaml_item_value(config, "scrape_duration", "10m")
+    threshold = get_yaml_item_value(config, "threshold", ".7")
+    heatmap_cpu_threshold = get_yaml_item_value(config, "cpu_threshold", ".5")
+    heatmap_mem_threshold = get_yaml_item_value(config, "mem_threshold", ".3")
+    output_file = config.get("json_output_file", False)
+    if output_file is True:
+        output_path = config.get("json_output_folder_path")
+    else:
+        output_path = False
+    chaos_tests = config.get("chaos_tests", {})
     return (namespace, kubeconfig, prometheus_endpoint, auth_token, scrape_duration,
-            chaos_tests, log_level)
+            chaos_tests, log_level, threshold, heatmap_cpu_threshold,
+            heatmap_mem_threshold, output_path)
```
```diff
@@ -64,6 +83,44 @@ def prompt_input(prompt, default_value):
     user_input = input(f"{prompt} [{default_value}]: ")
     if user_input:
         return user_input
     return default_value
 
+
+def make_json_output(inputs, queries, analysis_data, output_path):
+    time_str = time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime())
+
+    data = {
+        "inputs": inputs,
+        "queries": queries,
+        "profiling": analysis_data[0],
+        "heatmap_analysis": analysis_data[1],
+        "recommendations": analysis_data[2]
+    }
+
+    logging.info(f"Summary\n{json.dumps(data, indent=4)}")
+
+    if output_path is not False:
+        file = f"recommender_{inputs['namespace']}_{time_str}.json"
+        path = f"{os.path.expanduser(output_path)}/{file}"
+
+        with open(path, "w") as json_output:
+            logging.info(f"Saving output file in {output_path} folder...")
+            json_output.write(json.dumps(data, indent=4))
+        logging.info(f"Recommendation output saved in {file}.")
+
+
+def json_inputs(namespace, kubeconfig, prometheus_endpoint, scrape_duration, chaos_tests, threshold, heatmap_cpu_threshold, heatmap_mem_threshold):
+    inputs = {
+        "namespace": namespace,
+        "kubeconfig": kubeconfig,
+        "prometheus_endpoint": prometheus_endpoint,
+        "scrape_duration": scrape_duration,
+        "chaos_tests": chaos_tests,
+        "threshold": threshold,
+        "heatmap_cpu_threshold": heatmap_cpu_threshold,
+        "heatmap_mem_threshold": heatmap_mem_threshold
+    }
+    return inputs
+
+
 def main():
     parser = argparse.ArgumentParser(description="Krkn Chaos Recommender Command-Line tool")
     args = parse_arguments(parser)
```
```diff
@@ -81,7 +138,11 @@ def main():
         auth_token,
         scrape_duration,
         chaos_tests,
-        log_level
+        log_level,
+        threshold,
+        heatmap_cpu_threshold,
+        heatmap_mem_threshold,
+        output_path
     ) = read_configuration(args.config_file)
 
     if args.options:
```
```diff
@@ -91,27 +152,35 @@ def main():
         scrape_duration = args.scrape_duration
         log_level = args.log_level
         prometheus_endpoint = args.prometheus_endpoint
+        output_path = args.json_output_file
         chaos_tests = {"MEM": args.MEM, "GENERIC": args.GENERIC, "CPU": args.CPU, "NETWORK": args.NETWORK}
+        threshold = args.threshold
+        heatmap_mem_threshold = args.mem_threshold
+        heatmap_cpu_threshold = args.cpu_threshold
 
-    if log_level not in ["DEBUG","INFO", "WARNING", "ERROR","CRITICAL"]:
+    if log_level not in ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"]:
         logging.error(f"{log_level} not a valid log level")
         sys.exit(1)
 
     logging.basicConfig(level=log_level)
 
     logging.info("============================INPUTS===================================")
     logging.info(f"Namespace: {namespace}")
     logging.info(f"Kubeconfig: {kubeconfig}")
     logging.info(f"Prometheus endpoint: {prometheus_endpoint}")
     logging.info(f"Scrape duration: {scrape_duration}")
     for test in chaos_tests.keys():
         logging.info(f"Chaos tests {test}: {chaos_tests[test]}")
     logging.info("=====================================================================")
+    if output_path is not False:
+        if output_path is None:
+            output_path = "./recommender_output"
+            logging.info(f"Path for output file not specified. "
+                         f"Using default folder {output_path}")
+        if not os.path.exists(os.path.expanduser(output_path)):
+            logging.error(f"Folder {output_path} for output not found.")
+            sys.exit(1)
+    logging.info("Loading inputs...")
+    inputs = json_inputs(namespace, kubeconfig, prometheus_endpoint, scrape_duration, chaos_tests, threshold, heatmap_cpu_threshold, heatmap_mem_threshold)
     logging.info("Starting Analysis ...")
+    logging.info("Fetching the Telemetry data")
 
-    file_path = prometheus.fetch_utilization_from_prometheus(prometheus_endpoint, auth_token, namespace, scrape_duration)
-    analysis(file_path, chaos_tests)
+    file_path, queries = prometheus.fetch_utilization_from_prometheus(prometheus_endpoint, auth_token, namespace, scrape_duration)
+    analysis_data = analysis(file_path, chaos_tests, threshold, heatmap_cpu_threshold, heatmap_mem_threshold)
+
+    make_json_output(inputs, queries, analysis_data, output_path)
 
 
 if __name__ == "__main__":
     main()
```