---
# Baskerville example configuration (excerpt) — web-page chrome removed from scrape
maintenance:  # Optional, for data partitioning and archiving
  template_folder: '/path/to/template/folder/'  # Optional: by default the data folder, can be omitted
  partition_table: 'request_sets'  # default value
  partition_by: week  # partition by week or month, default value is week
  partition_field: created_at  # which field to use for the partitioning, this is the default value, can be omitted
  # Canonical lowercase boolean; YAML 1.1 parsers read 'False' the same way,
  # but 'false' is unambiguous across parsers and linters.
  strict: false  # if false, then for the week partition the start and end date will be changed to the start and end of the respective weeks. If true, then the dates will remain unchanged. Be careful to be consistent with this.
  data_partition:  # Optional: Define the period to create partitions for
    # Dates quoted so they parse as strings, consistent with the quoted
    # 'until' timestamp below (unquoted, PyYAML would yield a datetime.date).
    # NOTE(review): confirm the consumer expects strings, not date objects.
    since: '2018-01-01'  # when to start partitioning
    until: '2018-12-31 23:59:59'  # when to stop partitioning
    index_by:  # which fields to index in the partitions that will be created (only one index is supported currently), default value, can be omitted
      - target
      - ip
    template: 'data_partitioning.jinja2'  # Optional: the template name, default value, can be omitted
  data_archive:  # Optional: define the period to archive
    since: '2017-02-01'  # which dates to archive - in a non-strict mode, the start date will be modified to the start date of the week
    until: '2017-12-31'  # this is also true for the end date. If a strict mode is requested then the end date will be modified to the end of the week the until date belongs to.
    template: 'data_archiving.jinja2'  # Optional: the template name, default value, can be omitted
+
# Optional: used only by the Elastic pipeline
elastic:
  user: 'elastic'
  # NOTE(review): placeholder credential — never commit a real password to VCS;
  # inject it via an environment variable or secret store instead.
  password: 'changeme'
  host: 'url to ES instance'
  base_index: 'some.log'
  index_type: 'some_type'
+
engine:
  # Two spaces before '#' are required: without the space, YAML treats
  # '120# seconds' as the plain-scalar STRING "120# seconds", not the int 120.
  time_bucket: 120  # seconds: NOTE: this is the default value, model training is dependent upon this, this should not be set under normal circumstances
  # load_test: 10  # multiply the dataset x times and add random ips - only used for load testing, default false, can be omitted.
  es_log:
    host: somehost  # Optional
    # Timestamps quoted so they parse as strings rather than datetime objects.
    # NOTE(review): confirm the consumer expects strings — unquoted values
    # would be loaded as datetime by PyYAML.
    start: '2018-01-01 00:00:00'  # Optional
    stop: '2018-01-02 00:00:00'  # Optional
    batch_length: 30  # minutes - split start and stop in batch_length periods to avoid overloading the es cluster
# NOTE(review): the original file's line numbering jumps here (42 -> 78), and the
# keys below look like the interior of a 'spark:' section whose header falls
# outside this excerpt — confirm nesting/indentation against the full file.
app_name: 'Baskerville'  # the application name - can be changed for two different runs - used by the spark UI
master: 'local'  # the ip:port of the master node, e.g. spark://someip:7077 to submit to a cluster
parallelism: -1  # controls the number of tasks, -1 means use all cores - used for local master
log_level: 'INFO'  # spark logs level
storage_level: 'OFF_HEAP'  # which strategy to use for storing dfs - valid values are the ones found here: https://spark.apache.org/docs/2.4.0/api/python/_modules/pyspark/storagelevel.html default: OFF_HEAP
jars: '/path/to/jars/postgresql-42.2.4.jar,/path/to/spark-iforest-2.4.0.jar,/path/to/elasticsearch-spark-20_2.11-5.6.5.jar'  # or /path/to/jars/mysql-connector-java-8.0.11.jar
session_timezone: 'UTC'
shuffle_partitions: 14  # depends on your dataset and your hardware, usually ~ 2 * number of cores is a good choice
executor_instances: 4  # omitted when running locally
executor_cores: 4  # omitted when running locally
spark_driver_memory: '6G'  # depends on your dataset and the available ram you have. If running locally 6 - 8 GB should be a good choice, depending on the amount of data you need to process
db_driver: 'org.postgresql.Driver'  # or for mysql: 'com.mysql.cj.jdbc.Driver'
metrics_conf: /path/to/data/spark.metrics  # Optional: required only to export spark metrics
jar_packages: 'com.banzaicloud:spark-metrics_2.11:2.3-2.0.4,io.prometheus:simpleclient:0.3.0,io.prometheus:simpleclient_dropwizard:0.3.0,io.prometheus:simpleclient_pushgateway:0.3.0,io.dropwizard.metrics:metrics-core:3.1.2'  # required to export spark metrics
jar_repositories: 'https://raw.github.com/banzaicloud/spark-metrics/master/maven-repo/releases'  # Optional: Required only to export spark metrics
kryoserializer_buffer_max: '2024m'  # 2024m and 1024k are the max values the KryoSerializer can handle
kryoserializer_buffer: '1024k'  # It is suggested that you omit setting kryoserializer_buffer_max and kryoserializer_buffer and only set them if you get serialization errors.
driver_java_options: '-verbose:gc'  # Optional. When on a local machine with less than 36GB of ram -XX:+UseCompressedOops
executor_extra_java_options: '-verbose:gc'  # Optional. When on a local machine with less than 36GB of ram -XX:+UseCompressedOops
# to connect to the jvm for memory profiling and debugging (remove the -Dcom.sun.management.jmxremote.port=1098 if more than one executors because it will cause the other executors to fail):
# (end of excerpt)