# Mirror of https://github.com/krkn-chaos/krkn.git
# Synced 2026-02-14 18:10:00 +00:00 (131 lines, 10 KiB, YAML)
# Core Kraken runtime settings and the list of chaos scenarios to load.
kraken:
  kubeconfig_path: ~/.kube/config  # Path to kubeconfig
  exit_on_failure: False  # Exit when a post action scenario fails
  auto_rollback: True  # Enable auto rollback for scenarios
  # Directory to store rollback version files
  rollback_versions_directory: /tmp/kraken-rollback
  publish_kraken_status: True  # Can be accessed at http://0.0.0.0:8081
  # When set to PAUSE, waits for the RUN signal before running the scenarios;
  # refer to docs/signal.md for more details.
  signal_state: RUN
  signal_address: 0.0.0.0  # Signal listening address
  port: 8081  # Signal port
  # List of policies/chaos scenarios to load.
  # NOTE(review): nesting of chaos_scenarios under kraken was reconstructed
  # from a listing that had lost its indentation - confirm against the consumer.
  chaos_scenarios:
    - hog_scenarios:
        - scenarios/kube/cpu-hog.yml
        - scenarios/kube/memory-hog.yml
        - scenarios/kube/io-hog.yml
    - application_outages_scenarios:
        - scenarios/openshift/app_outage.yaml
    - container_scenarios:  # List of chaos pod scenarios to load
        - scenarios/openshift/container_etcd.yml
    - pod_network_scenarios:
        - scenarios/openshift/network_chaos_ingress.yml
        - scenarios/openshift/pod_network_outage.yml
    - pod_disruption_scenarios:
        - scenarios/openshift/etcd.yml
        - scenarios/openshift/regex_openshift_pod_kill.yml
        - scenarios/openshift/prom_kill.yml
        - scenarios/openshift/openshift-apiserver.yml
        - scenarios/openshift/openshift-kube-apiserver.yml
    - node_scenarios:  # List of chaos node scenarios to load
        - scenarios/openshift/aws_node_scenarios.yml
        - scenarios/openshift/vmware_node_scenarios.yml
        - scenarios/openshift/ibmcloud_node_scenarios.yml
    - time_scenarios:  # List of chaos time scenarios to load
        - scenarios/openshift/time_scenarios_example.yml
    - cluster_shut_down_scenarios:
        - scenarios/openshift/cluster_shut_down_scenario.yml
    - service_disruption_scenarios:
        - scenarios/openshift/regex_namespace.yaml
        - scenarios/openshift/ingress_namespace.yaml
    - zone_outages_scenarios:
        - scenarios/openshift/zone_outage.yaml
    - pvc_scenarios:
        - scenarios/openshift/pvc_scenario.yaml
    - network_chaos_scenarios:
        - scenarios/openshift/network_chaos.yaml
    - service_hijacking_scenarios:
        - scenarios/kube/service_hijacking.yaml
    - syn_flood_scenarios:
        - scenarios/kube/syn_flood.yaml
    - network_chaos_ng_scenarios:
        - scenarios/kube/pod-network-filter.yml
        - scenarios/kube/node-network-filter.yml
    - kubevirt_vm_outage:
        - scenarios/kubevirt/kubevirt-vm-outage.yaml

# Settings for consuming the cerberus go/no-go signal.
cerberus:
  cerberus_enabled: False  # Enable it when cerberus is previously installed
  # When cerberus_enabled is set to True, provide the url where cerberus
  # publishes the go/no-go signal.
  cerberus_url:
  # When enabled, looks for application unavailability using the routes
  # specified in the cerberus config and fails the run.
  check_application_routes: False

# Prometheus access plus the alert and metrics profiles evaluated for the run.
performance_monitoring:
  # The prometheus url/route is automatically obtained in case of OpenShift;
  # please set it when the distribution is Kubernetes.
  prometheus_url: ''
  # The bearer token is automatically obtained in case of OpenShift; please
  # set it when the distribution is Kubernetes. This is needed to authenticate
  # with prometheus.
  prometheus_bearer_token:
  uuid:  # uuid for the run is generated by default if not set
  # Runs the queries specified in the alert profile and displays the info or
  # exits 1 when severity=error.
  enable_alerts: False
  enable_metrics: False
  # Path or URL to alert profile with the prometheus queries
  alert_profile: config/alerts.yaml
  metrics_profile: config/metrics-report.yaml
  # When enabled, checks prometheus for critical alerts firing post chaos
  check_critical_alerts: False
# Elasticsearch connection settings and the index names results are posted to.
elastic:
  enable_elastic: False
  verify_certs: False
  # To track results in elasticsearch, give url to server here; will post
  # telemetry details when url and index are not blank.
  elastic_url: ""
  elastic_port: 32766
  # NOTE(review): username/password look like placeholder credentials -
  # confirm real values are never committed to version control.
  username: "elastic"
  password: "test"
  metrics_index: "krkn-metrics"
  alerts_index: "krkn-alerts"
  telemetry_index: "krkn-telemetry"

# Run pacing: wait time between scenarios, iteration count and daemon mode.
tunings:
  wait_duration: 1  # Duration to wait between each chaos scenario
  iterations: 1  # Number of times to execute the scenarios
  # Iterations are set to infinity, which means that kraken will cause chaos
  # forever.
  daemon_mode: False
# Telemetry collection: service endpoint, prometheus/log/event backups and
# archive upload tuning.
telemetry:
  enabled: False  # enables/disables the telemetry collection feature
  # telemetry service endpoint
  api_url: https://ulnmf9xv7j.execute-api.us-west-2.amazonaws.com/production
  username: username  # telemetry service username
  password: password  # telemetry service password
  prometheus_backup: True  # enables/disables prometheus data collection
  # namespace where prometheus is deployed (if distribution is kubernetes)
  prometheus_namespace: ""
  # name of the prometheus container (if distribution is kubernetes)
  prometheus_container_name: ""
  # name of the prometheus pod (if distribution is kubernetes)
  prometheus_pod_name: ""
  # if set to False, only the /prometheus/wal folder will be downloaded
  full_prometheus_backup: False
  backup_threads: 5  # number of telemetry download/upload threads
  # local path where the archive files will be temporarily stored
  archive_path: /tmp
  max_retries: 0  # maximum number of upload retries (if 0 will retry forever)
  # if set, this will be appended to the run folder in the bucket (useful to
  # group the runs)
  run_tag: ''
  # Size of each prometheus data archive in KB. The smaller the archive size,
  # the more archive files are produced and uploaded (and processed by
  # backup_threads simultaneously). For unstable/slow connections it is better
  # to keep this value low and increase backup_threads; that way, on upload
  # failure, only the failed chunk is retried without affecting the whole
  # upload.
  archive_size: 500000
  # if set, will archive the telemetry in the S3 bucket in a folder named
  # after the value; otherwise will use "default"
  telemetry_group: ''
  logs_backup: True
  logs_filter_patterns:
    - "(\\w{3}\\s\\d{1,2}\\s\\d{2}:\\d{2}:\\d{2}\\.\\d+).+"  # Sep 9 11:20:36.123425532
    - "kinit (\\d+/\\d+/\\d+\\s\\d{2}:\\d{2}:\\d{2})\\s+"  # kinit 2023/09/15 11:20:36 log
    - "(\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}\\.\\d+Z).+"  # 2023-09-15T11:20:36.123425532Z log
  # optional; if not specified, it will be searched for in $PATH
  oc_cli_path: /usr/bin/oc
  events_backup: True  # enables/disables cluster events collection

# Utilizing health check endpoints to observe application behavior during
# chaos injection.
health_checks:
  # Interval in seconds to perform health checks, default value is 2 seconds
  interval:
  # Provide list of health check configurations for applications
  config:
    - url:  # Provide application endpoint
      bearer_token:  # Bearer token for authentication if any
      # Provide authentication credentials (username, password) in tuple
      # format if any, ex: ("admin","secretpassword")
      auth:
      # If value is True, exits when the health check failed for the
      # application; values can be True/False
      exit_on_failure:

# Utilizing virt check endpoints to observe ssh ability to VMIs during chaos
# injection.
kubevirt_checks:
  # Interval in seconds to perform virt checks, default value is 2 seconds
  interval: 2
  namespace:  # Namespace where to find VMIs
  # Regex name style of VMIs to watch; optional, will watch all VMI names in
  # the namespace if left blank
  name:
  # Whether to show all VMI failures and successful ssh connections (False),
  # or only failure statuses (True)
  only_failures: False
  # How to try to connect to the VMIs: if True, uses the ip_address to try ssh
  # from within a node; if False, uses the name and virtctl to try to connect.
  # Default is False.
  disconnected: False
  # If set, will be a backup way to ssh to a node. Should be set to a node
  # that isn't targeted in chaos.
  ssh_node: ""
  node_names: ""
  # If value is True and VMIs are failing post chaos, returns failure;
  # values can be True/False
  exit_on_failure: