Mirror of https://github.com/krkn-chaos/krkn.git, synced 2026-02-14 09:59:59 +00:00

Accept list of namespaces in chaos recommender

Signed-off-by: jtydlack <139967002+jtydlack@users.noreply.github.com>
Committed by: Naga Ravi Chaitanya Elluri
Parent: 54af2fc6ff
Commit: 804d7cbf58
@@ -1,5 +1,5 @@
 application: openshift-etcd
-namespace: openshift-etcd
+namespaces: openshift-etcd
 labels: app=openshift-etcd
 kubeconfig: ~/.kube/config.yaml
 prometheus_endpoint: <Prometheus_Endpoint>
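Since `namespaces` may now hold several names, a hypothetical config (the second namespace name is invented for illustration) could read:

    application: openshift-etcd
    namespaces: openshift-etcd, openshift-monitoring
    labels: app=openshift-etcd
    kubeconfig: ~/.kube/config.yaml
    prometheus_endpoint: <Prometheus_Endpoint>

The value stays a plain string; the recommender splits it on commas and/or whitespace when reading the configuration (see the `read_configuration` hunk below).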
@@ -19,6 +19,7 @@ def load_telemetry_data(file_path):

 def calculate_zscores(data):
     zscores = pd.DataFrame()
+    zscores["Namespace"] = data["namespace"]
     zscores["Service"] = data["service"]
     zscores["CPU"] = (data["CPU"] - data["CPU"].mean()) / data["CPU"].std()
     zscores["Memory"] = (data["MEM"] - data["MEM"].mean()) / data["MEM"].std()
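The new "Namespace" column is what lets the analysis slice scores per namespace later on. A minimal standalone sketch of that idea, with a toy telemetry frame (all values invented):

    import pandas as pd

    data = pd.DataFrame({
        "namespace": ["ns-a", "ns-a", "ns-b"],
        "service": ["etcd", "apiserver", "dns"],
        "CPU": [900.0, 150.0, 120.0],
        "MEM": [2.1e9, 4.0e8, 3.5e8],
    })

    zscores = pd.DataFrame()
    zscores["Namespace"] = data["namespace"]  # carried through so callers can filter
    zscores["Service"] = data["service"]
    zscores["CPU"] = (data["CPU"] - data["CPU"].mean()) / data["CPU"].std()

    # Per-namespace slice, as analysis() now does:
    print(zscores.loc[zscores["Namespace"] == "ns-a"])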
@@ -46,28 +47,49 @@ def get_services_above_heatmap_threshold(dataframe, cpu_threshold, mem_threshold
     return cpu_services, mem_services


-def analysis(file_path, chaos_tests_config, threshold, heatmap_cpu_threshold, heatmap_mem_threshold):
+def analysis(file_path, namespaces, chaos_tests_config, threshold,
+             heatmap_cpu_threshold, heatmap_mem_threshold):
     # Load the telemetry data from file
-    logging.info("Fetching the Telemetry data")
+    logging.info("Fetching the Telemetry data...")
     data = load_telemetry_data(file_path)

     # Calculate Z-scores for CPU, Memory, and Network columns
     zscores = calculate_zscores(data)
+    # Dict for saving analysis data -- key is the namespace
+    analysis_data = {}

-    # Identify outliers
-    logging.info("Identifying outliers")
-    outliers_cpu, outliers_memory, outliers_network = identify_outliers(zscores, threshold)
-    cpu_services, mem_services = get_services_above_heatmap_threshold(data, heatmap_cpu_threshold, heatmap_mem_threshold)
-
-    analysis_data = analysis_json(outliers_cpu, outliers_memory,
-                                  outliers_network, cpu_services,
-                                  mem_services, chaos_tests_config)
-
-    if not cpu_services:
-        logging.info("There are no services that are using significant CPU compared to their assigned limits (infinite in case no limits are set).")
-    if not mem_services:
-        logging.info("There are no services that are using significant MEMORY compared to their assigned limits (infinite in case no limits are set).")
-    time.sleep(2)
+    # Identify outliers for each namespace
+    for namespace in namespaces:
+        logging.info(f"Identifying outliers for namespace {namespace}...")
+
+        namespace_zscores = zscores.loc[zscores["Namespace"] == namespace]
+        namespace_data = data.loc[data["namespace"] == namespace]
+        outliers_cpu, outliers_memory, outliers_network = identify_outliers(
+            namespace_zscores, threshold)
+        cpu_services, mem_services = get_services_above_heatmap_threshold(
+            namespace_data, heatmap_cpu_threshold, heatmap_mem_threshold)
+
+        analysis_data[namespace] = analysis_json(outliers_cpu, outliers_memory,
+                                                 outliers_network,
+                                                 cpu_services, mem_services,
+                                                 chaos_tests_config)
+
+        if cpu_services:
+            logging.info(f"These services use significant CPU compared to "
+                         f"their assigned limits: {cpu_services}")
+        else:
+            logging.info("There are no services that are using significant "
+                         "CPU compared to their assigned limits "
+                         "(infinite in case no limits are set).")
+        if mem_services:
+            logging.info(f"These services use significant MEMORY compared to "
+                         f"their assigned limits: {mem_services}")
+        else:
+            logging.info("There are no services that are using significant "
+                         "MEMORY compared to their assigned limits "
+                         "(infinite in case no limits are set).")
+        time.sleep(2)

     logging.info("Please check data in utilisation.txt for further analysis")
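With this change analysis() returns a mapping keyed by namespace instead of a single result. A hypothetical sketch of the shape (namespace names and placeholder strings invented; the three slots mirror how json_namespace() later indexes the value):

    # Hypothetical shape of the dict analysis() now builds: one entry per
    # namespace, each holding what analysis_json() produced for that slice.
    analysis_data = {
        "openshift-etcd": ["<profiling>", "<heatmap_analysis>", "<recommendations>"],
        "openshift-monitoring": ["<profiling>", "<heatmap_analysis>", "<recommendations>"],
    }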
@@ -26,19 +26,26 @@ def convert_data(data, service):
     return result.get(service, '100000000000') # for those pods whose limits are not defined they can take as much resources, there assigning a very high value


-def save_utilization_to_file(cpu_data, cpu_limits_result, mem_data, mem_limits_result, network_data, filename):
-    df_cpu = convert_data_to_dataframe(cpu_data, "CPU")
-    merged_df = pd.DataFrame(columns=['service','CPU','CPU_LIMITS','MEM','MEM_LIMITS','NETWORK'])
-    services = df_cpu.service.unique()
-    logging.info(services)
+def save_utilization_to_file(utilization, filename):
+    merged_df = pd.DataFrame(columns=['namespace', 'service', 'CPU', 'CPU_LIMITS', 'MEM', 'MEM_LIMITS', 'NETWORK'])
+    for namespace in utilization:
+        # Loading utilization_data[] for namespace
+        # indexes -- 0 CPU, 1 CPU limits, 2 mem, 3 mem limits, 4 network
+        utilization_data = utilization[namespace]
+        df_cpu = convert_data_to_dataframe(utilization_data[0], "CPU")
+        services = df_cpu.service.unique()
+        logging.info(f"Services for namespace {namespace}: {services}")

-    for s in services:
-
-        new_row_df = pd.DataFrame( {"service": s, "CPU" : convert_data(cpu_data, s),
-                       "CPU_LIMITS" : convert_data(cpu_limits_result, s),
-                       "MEM" : convert_data(mem_data, s), "MEM_LIMITS" : convert_data(mem_limits_result, s),
-                       "NETWORK" : convert_data(network_data, s)}, index=[0])
-        merged_df = pd.concat([merged_df, new_row_df], ignore_index=True)
+        for s in services:
+            new_row_df = pd.DataFrame({
+                "namespace": namespace, "service": s,
+                "CPU": convert_data(utilization_data[0], s),
+                "CPU_LIMITS": convert_data(utilization_data[1], s),
+                "MEM": convert_data(utilization_data[2], s),
+                "MEM_LIMITS": convert_data(utilization_data[3], s),
+                "NETWORK": convert_data(utilization_data[4], s)}, index=[0])
+            merged_df = pd.concat([merged_df, new_row_df], ignore_index=True)

     # Convert columns to string
     merged_df['CPU'] = merged_df['CPU'].astype(str)
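The reworked save_utilization_to_file takes a single dict keyed by namespace, each value a five-item list in the index order the comment above gives. A hypothetical call, with the Prometheus query results stubbed out rather than fetched (a sketch of the shape, not an executed example):

    # Hypothetical input; in the real flow each slot is a raw query result.
    cpu_result = cpu_limits_result = mem_result = mem_limits_result = network_result = []
    utilization = {
        "openshift-etcd": [cpu_result, cpu_limits_result, mem_result,
                           mem_limits_result, network_result],  # indexes 0-4
    }
    save_utilization_to_file(utilization, "utilisation.txt")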
@@ -57,38 +64,49 @@ def save_utilization_to_file(cpu_data, cpu_limits_result, mem_data, mem_limits_r
     merged_df.to_csv(filename, sep='\t', index=False)


-def fetch_utilization_from_prometheus(prometheus_endpoint, auth_token, namespace, scrape_duration):
+def fetch_utilization_from_prometheus(prometheus_endpoint, auth_token,
+                                      namespaces, scrape_duration):
     urllib3.disable_warnings()
-    prometheus = PrometheusConnect(url=prometheus_endpoint, headers={'Authorization':'Bearer {}'.format(auth_token)}, disable_ssl=True)
+    prometheus = PrometheusConnect(url=prometheus_endpoint, headers={
+        'Authorization':'Bearer {}'.format(auth_token)}, disable_ssl=True)

-    # Fetch CPU utilization
-    logging.info("Fetching utilization")
-    cpu_query = 'sum (rate (container_cpu_usage_seconds_total{image!="", namespace="%s"}[%s])) by (pod) *1000' % (namespace,scrape_duration)
-    cpu_result = prometheus.custom_query(cpu_query)
-
-    cpu_limits_query = '(sum by (pod) (kube_pod_container_resource_limits{resource="cpu", namespace="%s"}))*1000' %(namespace)
-    cpu_limits_result = prometheus.custom_query(cpu_limits_query)
-
-    mem_query = 'sum by (pod) (avg_over_time(container_memory_usage_bytes{image!="", namespace="%s"}[%s]))' % (namespace, scrape_duration)
-    mem_result = prometheus.custom_query(mem_query)
-
-    mem_limits_query = 'sum by (pod) (kube_pod_container_resource_limits{resource="memory", namespace="%s"}) ' %(namespace)
-    mem_limits_result = prometheus.custom_query(mem_limits_query)
-
-    network_query = 'sum by (pod) ((avg_over_time(container_network_transmit_bytes_total{namespace="%s"}[%s])) + \
-                    (avg_over_time(container_network_receive_bytes_total{namespace="%s"}[%s])))' % (namespace, scrape_duration, namespace, scrape_duration)
-    network_result = prometheus.custom_query(network_query)
-
-    save_utilization_to_file(cpu_result, cpu_limits_result, mem_result, mem_limits_result, network_result, saved_metrics_path)
-    queries = json_queries(cpu_query, cpu_limits_query, mem_query, mem_limits_query)
+    # Dicts for saving utilisation and queries -- key is namespace
+    utilization = {}
+    queries = {}
+
+    logging.info("Fetching utilization...")
+    for namespace in namespaces:
+        # Fetch CPU utilization
+        cpu_query = 'sum (rate (container_cpu_usage_seconds_total{image!="", namespace="%s"}[%s])) by (pod) *1000' % (namespace,scrape_duration)
+        cpu_result = prometheus.custom_query(cpu_query)
+
+        cpu_limits_query = '(sum by (pod) (kube_pod_container_resource_limits{resource="cpu", namespace="%s"}))*1000' %(namespace)
+        cpu_limits_result = prometheus.custom_query(cpu_limits_query)
+
+        mem_query = 'sum by (pod) (avg_over_time(container_memory_usage_bytes{image!="", namespace="%s"}[%s]))' % (namespace, scrape_duration)
+        mem_result = prometheus.custom_query(mem_query)
+
+        mem_limits_query = 'sum by (pod) (kube_pod_container_resource_limits{resource="memory", namespace="%s"}) ' %(namespace)
+        mem_limits_result = prometheus.custom_query(mem_limits_query)
+
+        network_query = 'sum by (pod) ((avg_over_time(container_network_transmit_bytes_total{namespace="%s"}[%s])) + \
+                        (avg_over_time(container_network_receive_bytes_total{namespace="%s"}[%s])))' % (namespace, scrape_duration, namespace, scrape_duration)
+        network_result = prometheus.custom_query(network_query)
+
+        utilization[namespace] = [cpu_result, cpu_limits_result, mem_result, mem_limits_result, network_result]
+        queries[namespace] = json_queries(cpu_query, cpu_limits_query, mem_query, mem_limits_query, network_query)
+
+    save_utilization_to_file(utilization, saved_metrics_path)
     return saved_metrics_path, queries


-def json_queries(cpu_query, cpu_limits_query, mem_query, mem_limits_query):
+def json_queries(cpu_query, cpu_limits_query, mem_query, mem_limits_query, network_query):
     queries = {
         "cpu_query": cpu_query,
         "cpu_limit_query": cpu_limits_query,
         "memory_query": mem_query,
-        "memory_limit_query": mem_limits_query
+        "memory_limit_query": mem_limits_query,
+        "network_query": network_query
     }
     return queries
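Each namespace now keeps its own rendered PromQL strings. A quick standalone sketch of how one of them is formatted per loop iteration (namespace and duration values invented):

    namespace, scrape_duration = "openshift-etcd", "10m"
    cpu_query = ('sum (rate (container_cpu_usage_seconds_total{image!="", '
                 'namespace="%s"}[%s])) by (pod) *1000' % (namespace, scrape_duration))
    print(cpu_query)
    # sum (rate (container_cpu_usage_seconds_total{image!="", namespace="openshift-etcd"}[10m])) by (pod) *1000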
@@ -32,7 +32,7 @@ To run the recommender with a config file specify the config file path with the
 You can customize the default values by editing the `krkn/config/recommender_config.yaml` file. The configuration file contains the following options:

 - `application`: Specify the application name.
-- `namespace`: Specify the namespace name. If you want to profile
+- `namespaces`: Specify the namespace names (separated by comma or space). If you want to profile
 - `labels`: Specify the labels (not used).
 - `kubeconfig`: Specify the location of the kubeconfig file (not used).
 - `prometheus_endpoint`: Specify the prometheus endpoint (must).
@@ -65,8 +65,8 @@ You can also provide the input values through command-line arguments launching t
 -o, --options         Evaluate command line options
 -a APPLICATION, --application APPLICATION
                       Kubernetes application name
--n NAMESPACE, --namespace NAMESPACE
-                      Kubernetes application namespace
+-n NAMESPACES, --namespaces NAMESPACES
+                      Kubernetes application namespaces separated by space
 -l LABELS, --labels LABELS
                       Kubernetes application labels
 -p PROMETHEUS_ENDPOINT, --prometheus-endpoint PROMETHEUS_ENDPOINT
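A hypothetical invocation passing several namespaces on the command line (the script path and endpoint are assumptions for illustration, not taken from this page):

    python3 chaos_recommender.py -o -n openshift-etcd openshift-monitoring \
        -p http://prometheus.example.com:9090 -t $TOKEN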
@@ -2,6 +2,7 @@ import argparse
 import json
 import logging
 import os.path
+import re
 import sys
 import time
 import yaml
@@ -23,7 +24,7 @@ def parse_arguments(parser):
     # command line options
     parser.add_argument("-c", "--config-file", action="store", help="Config file path")
     parser.add_argument("-o", "--options", action="store_true", help="Evaluate command line options")
-    parser.add_argument("-n", "--namespace", action="store", default="", help="Kubernetes application namespace")
+    parser.add_argument("-n", "--namespaces", action="store", default="", nargs="+", help="Kubernetes application namespaces separated by space")
     parser.add_argument("-p", "--prometheus-endpoint", action="store", default="", help="Prometheus endpoint URI")
    parser.add_argument("-k", "--kubeconfig", action="store", default=kube_config.KUBE_CONFIG_DEFAULT_LOCATION, help="Kubeconfig path")
     parser.add_argument("-t", "--token", action="store", default="", help="Kubernetes authentication token")
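With nargs="+", argparse collects every following token into a list, which is what lets args.namespaces feed straight into the per-namespace loop. A minimal standalone check:

    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument("-n", "--namespaces", action="store", default="", nargs="+")
    args = parser.parse_args(["-n", "openshift-etcd", "openshift-monitoring"])
    print(args.namespaces)  # ['openshift-etcd', 'openshift-monitoring']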
@@ -57,7 +58,8 @@ def read_configuration(config_file_path):
         config = yaml.safe_load(config_file)

     log_level = config.get("log level", "INFO")
-    namespace = config.get("namespace")
+    namespaces = config.get("namespaces")
+    namespaces = re.split(r",+\s+|,+|\s+", namespaces)
     kubeconfig = get_yaml_item_value(config, "kubeconfig", kube_config.KUBE_CONFIG_DEFAULT_LOCATION)

     prometheus_endpoint = config.get("prometheus_endpoint")
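The regex accepts comma-plus-space, bare commas, or bare whitespace as separators, so the config string formats mentioned in the README all split cleanly. A quick standalone check (sample value invented):

    import re

    value = "openshift-etcd, openshift-monitoring openshift-dns"
    print(re.split(r",+\s+|,+|\s+", value))
    # ['openshift-etcd', 'openshift-monitoring', 'openshift-dns']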
@@ -72,9 +74,9 @@ def read_configuration(config_file_path):
     else:
         output_path = False
     chaos_tests = config.get("chaos_tests", {})
-    return (namespace, kubeconfig, prometheus_endpoint, auth_token, scrape_duration,
-            chaos_tests, log_level, threshold, heatmap_cpu_threshold,
-            heatmap_mem_threshold, output_path)
+    return (namespaces, kubeconfig, prometheus_endpoint, auth_token,
+            scrape_duration, chaos_tests, log_level, threshold,
+            heatmap_cpu_threshold, heatmap_mem_threshold, output_path)


 def prompt_input(prompt, default_value):
@@ -84,21 +86,18 @@ def prompt_input(prompt, default_value):
     return default_value


-def make_json_output(inputs, queries, analysis_data, output_path):
+def make_json_output(inputs, namespace_data, output_path):
     time_str = time.strftime("%Y-%m-%d_%H-%M-%S", time.localtime())

     data = {
         "inputs": inputs,
-        "queries": queries,
-        "profiling": analysis_data[0],
-        "heatmap_analysis": analysis_data[1],
-        "recommendations": analysis_data[2]
+        "analysis_outputs": namespace_data
     }

     logging.info(f"Summary\n{json.dumps(data, indent=4)}")

     if output_path is not False:
-        file = f"recommender_{inputs['namespace']}_{time_str}.json"
+        file = f"recommender_{time_str}.json"
         path = f"{os.path.expanduser(output_path)}/{file}"

         with open(path, "w") as json_output:
@@ -107,9 +106,11 @@ def make_json_output(inputs, queries, analysis_data, output_path):
     logging.info(f"Recommendation output saved in {file}.")


-def json_inputs(namespace, kubeconfig, prometheus_endpoint, scrape_duration, chaos_tests, threshold, heatmap_cpu_threshold, heatmap_mem_threshold):
+def json_inputs(namespaces, kubeconfig, prometheus_endpoint, scrape_duration,
+                chaos_tests, threshold, heatmap_cpu_threshold,
+                heatmap_mem_threshold):
     inputs = {
-        "namespace": namespace,
+        "namespaces": namespaces,
         "kubeconfig": kubeconfig,
         "prometheus_endpoint": prometheus_endpoint,
         "scrape_duration": scrape_duration,
@@ -121,6 +122,17 @@ def json_inputs(namespace, kubeconfig, prometheus_endpoint, scrape_duration, cha
     return inputs


+def json_namespace(namespace, queries, analysis_data):
+    data = {
+        "namespace": namespace,
+        "queries": queries,
+        "profiling": analysis_data[0],
+        "heatmap_analysis": analysis_data[1],
+        "recommendations": analysis_data[2]
+    }
+    return data
+
+
 def main():
     parser = argparse.ArgumentParser(description="Krkn Chaos Recommender Command-Line tool")
     args = parse_arguments(parser)
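Put together, make_json_output now nests one json_namespace block per namespace under "analysis_outputs". A hypothetical trimmed output (all values stubbed for illustration):

    {
        "inputs": { "namespaces": ["openshift-etcd"], ... },
        "analysis_outputs": [
            {
                "namespace": "openshift-etcd",
                "queries": { "cpu_query": "...", "network_query": "..." },
                "profiling": "...",
                "heatmap_analysis": "...",
                "recommendations": "..."
            }
        ]
    }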
@@ -132,7 +144,7 @@ def main():

     if args.config_file is not None:
         (
-            namespace,
+            namespaces,
             kubeconfig,
             prometheus_endpoint,
             auth_token,
@@ -146,7 +158,7 @@ def main():
         ) = read_configuration(args.config_file)

     if args.options:
-        namespace = args.namespace
+        namespaces = args.namespaces
         kubeconfig = args.kubeconfig
         auth_token = args.token
         scrape_duration = args.scrape_duration
@@ -172,14 +184,26 @@ def main():
         if not os.path.exists(os.path.expanduser(output_path)):
             logging.error(f"Folder {output_path} for output not found.")
             sys.exit(1)

     logging.info("Loading inputs...")
-    inputs = json_inputs(namespace, kubeconfig, prometheus_endpoint, scrape_duration, chaos_tests, threshold, heatmap_cpu_threshold, heatmap_mem_threshold)
-    logging.info("Starting Analysis ...")
+    inputs = json_inputs(namespaces, kubeconfig, prometheus_endpoint,
+                         scrape_duration, chaos_tests, threshold,
+                         heatmap_cpu_threshold, heatmap_mem_threshold)
+    namespaces_data = []

-    file_path, queries = prometheus.fetch_utilization_from_prometheus(prometheus_endpoint, auth_token, namespace, scrape_duration)
-    analysis_data = analysis(file_path, chaos_tests, threshold, heatmap_cpu_threshold, heatmap_mem_threshold)
+    logging.info("Starting Analysis...")

-    make_json_output(inputs, queries, analysis_data, output_path)
+    file_path, queries = prometheus.fetch_utilization_from_prometheus(
+        prometheus_endpoint, auth_token, namespaces, scrape_duration)
+
+    analysis_data = analysis(file_path, namespaces, chaos_tests, threshold,
+                             heatmap_cpu_threshold, heatmap_mem_threshold)
+
+    for namespace in namespaces:
+        namespace_data = json_namespace(namespace, queries[namespace],
+                                        analysis_data[namespace])
+        namespaces_data.append(namespace_data)
+    make_json_output(inputs, namespaces_data, output_path)


 if __name__ == "__main__":