# krkn/config/config.yaml

kraken:
    kubeconfig_path: ~/.kube/config                        # Location of the kubeconfig file
    exit_on_failure: False                                 # Exit when a post action scenario fails
    auto_rollback: True                                    # Turn on auto rollback for scenarios
    rollback_versions_directory: /tmp/kraken-rollback      # Directory where rollback version files are stored
    publish_kraken_status: True                            # Status can be accessed at http://0.0.0.0:8081
    signal_state: RUN                                      # When set to PAUSE, waits for the RUN signal before running the scenarios; see docs/signal.md for details
    signal_address: 0.0.0.0                                # Signal listening address
    port: 8081                                             # Signal port
    chaos_scenarios:                                       # Policies/chaos scenarios to load, grouped by scenario type
        - hog_scenarios:
            - scenarios/kube/cpu-hog.yml
            - scenarios/kube/memory-hog.yml
            - scenarios/kube/io-hog.yml
        - application_outages_scenarios:
            - scenarios/openshift/app_outage.yaml
        - container_scenarios:                             # Chaos container scenarios to load
            - scenarios/openshift/container_etcd.yml
        - pod_network_scenarios:
            - scenarios/openshift/network_chaos_ingress.yml
            - scenarios/openshift/pod_network_outage.yml
        - pod_disruption_scenarios:
            - scenarios/openshift/etcd.yml
            - scenarios/openshift/regex_openshift_pod_kill.yml
            - scenarios/openshift/prom_kill.yml
            - scenarios/openshift/openshift-apiserver.yml
            - scenarios/openshift/openshift-kube-apiserver.yml
        - node_scenarios:                                  # Chaos node scenarios to load
            - scenarios/openshift/aws_node_scenarios.yml
            - scenarios/openshift/vmware_node_scenarios.yml
            - scenarios/openshift/ibmcloud_node_scenarios.yml
        - time_scenarios:                                  # Chaos time scenarios to load
            - scenarios/openshift/time_scenarios_example.yml
        - cluster_shut_down_scenarios:
            - scenarios/openshift/cluster_shut_down_scenario.yml
        - service_disruption_scenarios:
            - scenarios/openshift/regex_namespace.yaml
            - scenarios/openshift/ingress_namespace.yaml
        - zone_outages_scenarios:
            - scenarios/openshift/zone_outage.yaml
        - pvc_scenarios:
            - scenarios/openshift/pvc_scenario.yaml
        - network_chaos_scenarios:
            - scenarios/openshift/network_chaos.yaml
        - service_hijacking_scenarios:
            - scenarios/kube/service_hijacking.yaml
        - syn_flood_scenarios:
            - scenarios/kube/syn_flood.yaml
        - network_chaos_ng_scenarios:
            - scenarios/kube/pod-network-filter.yml
            - scenarios/kube/node-network-filter.yml
        - kubevirt_vm_outage:
            - scenarios/kubevirt/kubevirt-vm-outage.yaml

cerberus:
    cerberus_enabled: False                                # Set to True only when cerberus is already installed
    cerberus_url:                                          # URL where cerberus publishes the go/no-go signal; provide it when cerberus_enabled is True
    check_application_routes: False                        # When enabled, looks for application unavailability using the routes specified in the cerberus config and fails the run

performance_monitoring:
    prometheus_url: ''                                     # Prometheus url/route; obtained automatically on OpenShift, set it when the distribution is Kubernetes
    prometheus_bearer_token:                               # Needed to authenticate with prometheus; obtained automatically on OpenShift, set it when the distribution is Kubernetes
    uuid:                                                  # uuid for the run; generated by default if not set
    enable_alerts: False                                   # Runs the queries from the alert profile and displays the info, or exits 1 when severity=error
    enable_metrics: False                                  # NOTE(review): presumably enables the queries in metrics_profile - confirm
    alert_profile: config/alerts.yaml                      # Path or URL to the alert profile with the prometheus queries
    metrics_profile: config/metrics-report.yaml            # NOTE(review): presumably the metrics counterpart of alert_profile - confirm
    check_critical_alerts: False                           # When enabled, checks prometheus for critical alerts firing post chaos
elastic:
    enable_elastic: False
    verify_certs: False
    elastic_url: ""                                        # Elasticsearch server url for tracking results; telemetry details are posted when url and index are not blank
    elastic_port: 32766
    username: "elastic"                                    # NOTE(review): looks like placeholder credentials - avoid committing real secrets here
    password: "test"
    metrics_index: "krkn-metrics"                          # NOTE(review): presumably the index names used for metrics, alerts and telemetry documents - confirm
    alerts_index: "krkn-alerts"
    telemetry_index: "krkn-telemetry"

tunings:
    wait_duration: 1                                       # How long to wait between each chaos scenario
    iterations: 1                                          # How many times the scenarios are executed
    daemon_mode: False                                     # Sets iterations to infinity, i.e. kraken will cause chaos forever
telemetry:
    enabled: False                                          # enables/disables the telemetry collection feature
    api_url: https://ulnmf9xv7j.execute-api.us-west-2.amazonaws.com/production  # telemetry service endpoint
    username: username                                      # telemetry service username
    password: password                                      # telemetry service password
    prometheus_backup: True                                 # enables/disables prometheus data collection
    prometheus_namespace: ""                                # namespace where prometheus is deployed (if distribution is kubernetes)
    prometheus_container_name: ""                           # name of the prometheus container (if distribution is kubernetes)
    prometheus_pod_name: ""                                 # name of the prometheus pod (if distribution is kubernetes)
    full_prometheus_backup: False                           # if set to False, only the /prometheus/wal folder will be downloaded
    backup_threads: 5                                       # number of telemetry download/upload threads
    archive_path: /tmp                                      # local path where the archive files will be temporarily stored
    max_retries: 0                                          # maximum number of upload retries (if 0 will retry forever)
    run_tag: ''                                             # if set, this will be appended to the run folder in the bucket (useful to group the runs)
    # archive_size: the size of each prometheus data archive in KB. The lower the
    # archive size, the higher the number of archive files that will be produced
    # and uploaded (and processed by backup_threads simultaneously).
    # For an unstable/slow connection it is better to keep this value low and
    # increase the number of backup_threads; in this way, on upload failure, the
    # retry will happen only on the failed chunk without affecting the whole upload.
    archive_size: 500000
    telemetry_group: ''                                     # if set, will archive the telemetry in the S3 bucket in a folder named after the value, otherwise will use "default"
    logs_backup: True                                       # NOTE(review): presumably enables/disables logs collection, like the other *_backup flags - confirm
    # NOTE(review): the example on each pattern shows the log timestamp format it matches
    logs_filter_patterns:
     - "(\\w{3}\\s\\d{1,2}\\s\\d{2}:\\d{2}:\\d{2}\\.\\d+).+"         # Sep 9 11:20:36.123425532
     - "kinit (\\d+/\\d+/\\d+\\s\\d{2}:\\d{2}:\\d{2})\\s+"          # kinit 2023/09/15 11:20:36 log
     - "(\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}\\.\\d+Z).+"      # 2023-09-15T11:20:36.123425532Z log
    oc_cli_path: /usr/bin/oc                                # optional, if not specified it will be searched for in $PATH
    events_backup: True                                     # enables/disables cluster events collection

health_checks:                                              # Uses health check endpoints to observe application behavior during chaos injection
    interval:                                               # Seconds between health checks; default value is 2 seconds
    config:                                                 # List of health check configurations for applications
        - url:                                              # Application endpoint
          bearer_token:                                     # Bearer token for authentication, if any
          auth:                                             # Authentication credentials (username, password) in tuple format, if any, e.g. ("admin","secretpassword")
          exit_on_failure:                                  # True/False; when True, exits when the health check fails for the application

kubevirt_checks:                                            # Uses virt check endpoints to observe ssh ability to VMIs during chaos injection
    interval: 2                                             # Seconds between virt checks; default value is 2 seconds
    namespace:                                              # Namespace in which to look for VMIs
    name:                                                   # Regex-style name of the VMIs to watch; optional, all VMI names in the namespace are watched if left blank
    only_failures: False                                    # False shows both VMI failures and successful ssh connections; True shows only failure statuses
    disconnected: False                                     # How to connect to the VMIs: True uses the ip_address to try ssh from within a node, False uses the name and virtctl; default is False
    ssh_node: ""                                            # If set, used as a backup way to ssh to a node; pick a node that isn't targeted in chaos
    node_names: ""                                          # NOTE(review): undocumented in this file; presumably restricts checks to VMIs on the named nodes - confirm
    exit_on_failure:                                        # True/False; when True and VMIs are failing post chaos, returns failure