Skip to content

Commit 77235c4

Browse files
committed
Full project commit (#1)
* Baskerville - complete project Set theme jekyll-theme-minimal Update Dockerfile
1 parent f770a4e commit 77235c4

330 files changed

Lines changed: 79118 additions & 0 deletions

File tree

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

DEPLOYMENT.md

Lines changed: 726 additions & 0 deletions
Large diffs are not rendered by default.

README.md

Lines changed: 557 additions & 0 deletions
Large diffs are not rendered by default.

_config.yml

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
theme: jekyll-theme-minimal

alembic/README

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
Generic single-database configuration.

alembic/env.py

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
from __future__ import with_statement
2+
from alembic import context
3+
from sqlalchemy import engine_from_config, pool
4+
from logging.config import fileConfig
5+
6+
import os
7+
8+
# this is the Alembic Config object, which provides
9+
# access to the values within the .ini file in use.
10+
config = context.config
11+
12+
# read and set current database configuration
13+
db_type = os.environ.get('BASKERVILLE_DB_TYPE', 'postgres')
14+
db_user = os.environ.get('DB_USER')
15+
db_pass = os.environ.get('DB_PASS')
16+
db_host = os.environ.get('DB_HOST')
17+
db_name = os.environ.get('BASKERVILLE_DB')
18+
# or parse baskerville config and use get_db_connection_str
19+
# conf = parse_config(path=conf_options['conf_file'])
20+
if all([db_user, db_host, db_pass, db_name]):
21+
config.set_main_option(
22+
'sqlalchemy.url',
23+
f'{db_type}://{db_user}:{db_pass}@{db_host}/{db_name}'
24+
)
25+
26+
# Interpret the config file for Python logging.
27+
# This line sets up loggers basically.
28+
fileConfig(config.config_file_name)
29+
30+
# add your model's MetaData object here
31+
# for 'autogenerate' support
32+
# from myapp import mymodel
33+
# target_metadata = mymodel.Base.metadata
34+
target_metadata = None
35+
36+
# other values from the config, defined by the needs of env.py,
37+
# can be acquired:
38+
# my_important_option = config.get_main_option("my_important_option")
39+
# ... etc.
40+
41+
42+
def run_migrations_offline():
    """Run migrations in 'offline' mode.

    Configures the Alembic context with only a database URL — no Engine —
    so no DBAPI has to be installed. Any context.execute() calls emit the
    generated SQL to the script output instead of a live connection.
    """
    context.configure(
        url=config.get_main_option("sqlalchemy.url"),
        target_metadata=target_metadata,
        literal_binds=True,
    )

    with context.begin_transaction():
        context.run_migrations()
60+
61+
62+
def run_migrations_online():
    """Run migrations in 'online' mode.

    Builds an Engine from the `sqlalchemy.*` keys of the active config
    section and binds a live connection to the Alembic context before
    running the migrations.
    """
    engine = engine_from_config(
        config.get_section(config.config_ini_section),
        prefix='sqlalchemy.',
        poolclass=pool.NullPool,
    )

    with engine.connect() as conn:
        context.configure(
            connection=conn,
            target_metadata=target_metadata,
        )

        with context.begin_transaction():
            context.run_migrations()
82+
83+
84+
# Entry point: Alembic enables offline mode for `--sql` invocations,
# otherwise we migrate against a live database connection.
_run_migrations = (
    run_migrations_offline if context.is_offline_mode()
    else run_migrations_online
)
_run_migrations()

alembic/script.py.mako

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
"""${message}
2+
3+
Revision ID: ${up_revision}
4+
Revises: ${down_revision | comma,n}
5+
Create Date: ${create_date}
6+
7+
"""
8+
from alembic import op
9+
import sqlalchemy as sa
10+
${imports if imports else ""}
11+
12+
# revision identifiers, used by Alembic.
13+
revision = ${repr(up_revision)}
14+
down_revision = ${repr(down_revision)}
15+
branch_labels = ${repr(branch_labels)}
16+
depends_on = ${repr(depends_on)}
17+
18+
19+
def upgrade():
20+
${upgrades if upgrades else "pass"}
21+
22+
23+
def downgrade():
24+
${downgrades if downgrades else "pass"}
Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,24 @@
1+
"""initial revision
2+
3+
Revision ID: 0c5cf09f1fc4
4+
Revises:
5+
Create Date: 2018-05-28 15:01:48.308789
6+
7+
"""
8+
from alembic import op
9+
import sqlalchemy as sa
10+
11+
12+
# revision identifiers, used by Alembic.
13+
revision = '0c5cf09f1fc4'
14+
down_revision = None
15+
branch_labels = None
16+
depends_on = None
17+
18+
19+
def upgrade():
20+
pass
21+
22+
23+
def downgrade():
24+
pass
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
"""remove subsets
2+
3+
Revision ID: 41ff5ba653c6
4+
Revises: 4c5d9065aee2
5+
Create Date: 2019-03-04 14:11:26.543111
6+
7+
"""
8+
from alembic import op
9+
import sqlalchemy as sa
10+
11+
12+
# revision identifiers, used by Alembic.
13+
from baskerville.db.models import utcnow
14+
15+
revision = '41ff5ba653c6'
16+
down_revision = '4c5d9065aee2'
17+
branch_labels = None
18+
depends_on = None
19+
20+
21+
def upgrade():
22+
op.drop_table('subsets')
23+
24+
25+
def downgrade():
26+
op.create_table(
27+
'subsets',
28+
sa.Column('id', sa.Integer, primary_key=True),
29+
sa.Column('target', sa.String(45), nullable=False),
30+
sa.Column('ip', sa.TEXT(), nullable=False),
31+
sa.Column('start', sa.DateTime(timezone=True)),
32+
sa.Column('stop', sa.DateTime(timezone=True)),
33+
sa.Column('num_requests', sa.Integer(), nullable=False),
34+
sa.Column('features', sa.JSON()),
35+
sa.Column('prediction', sa.Integer()),
36+
sa.Column('row_num', sa.Integer()),
37+
sa.Column('r', sa.Float()),
38+
sa.Column('time_bucket', sa.Integer()),
39+
sa.Column(
40+
'created_at', sa.DateTime(timezone=True), server_default=utcnow()
41+
),
42+
)
Lines changed: 32 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,32 @@
1+
"""add banjax bans table
2+
3+
Revision ID: 4c5d9065aee2
4+
Revises:
5+
Create Date: 2019-02-19 13:12:54.127134
6+
7+
"""
8+
from alembic import op
9+
import sqlalchemy as sa
10+
11+
12+
# revision identifiers, used by Alembic.
13+
revision = '4c5d9065aee2'
14+
down_revision = '0c5cf09f1fc4'
15+
branch_labels = None
16+
depends_on = None
17+
18+
19+
def upgrade():
20+
op.create_table(
21+
'banjax_bans',
22+
sa.Column('id', sa.Integer, primary_key=True),
23+
sa.Column('sync_start', sa.DateTime(timezone=True)),
24+
sa.Column('sync_stop', sa.DateTime(timezone=True)),
25+
sa.Column('ip', sa.TEXT(), nullable=False)
26+
)
27+
op.add_column('request_sets', sa.Column('id_banjax', sa.Integer))
28+
29+
30+
def downgrade():
31+
op.drop_table('banjax_bans')
32+
op.drop_column('request_sets', 'id_banjax')
Lines changed: 105 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,105 @@
1+
---
2+
database: # Mandatory configuration
3+
name: baskerville # the database name
4+
user: user
5+
password: 'pass'
6+
type: 'postgres'
7+
host: 127.0.0.1
8+
port: 5432
9+
maintenance: # Optional, for data partitioning and archiving
10+
template_folder: '/path/to/template/folder/' # Optional: by default the data folder, can be omitted
11+
partition_table: 'request_sets' # default value
12+
partition_by: week # partition by week or month, default value is week
13+
partition_field: created_at # which field to use for the partitioning, this is the default value, can be omitted
14+
strict: False # if False, then for the week partition the start and end date will be changed to the start and end of the respective weeks. If true, then the dates will remain unchanged. Be careful to be consistent with this.
15+
data_partition: # Optional: Define the period to create partitions for
16+
since: 2018-01-01 # when to start partitioning
17+
until: "2018-12-31 23:59:59" # when to stop partitioning
18+
index_by: # which fields to index in the partitions that will be created (only one index is supported currently), default value, can be omitted
19+
- target
20+
- ip
21+
template: 'data_partitioning.jinja2' # Optional: the template name, default value, can be omitted
22+
data_archive: # Optional: define the period to archive
23+
since: 2017-02-01 # which dates to archive - in a non-strict mode, the start date will be modified to the start date of the week
24+
until: 2017-12-31 # this is also true for the end date. If a strict mode is requested then the end date will be modified to the end of the week the until date belongs to.
25+
template: 'data_archiving.jinja2' # Optional: the template name, default value, can be omitted
26+
27+
# Optional: used only by the Elastic pipeline
28+
elastic:
29+
user: 'elastic'
30+
password: 'changeme'
31+
host: 'url to ES instance'
32+
base_index: 'some.log'
33+
index_type: 'some_type'
34+
35+
engine:
36+
time_bucket: 120 # seconds: NOTE: this is the default value, model training is dependent upon this, this should not be set under normal circumstances
37+
# load_test: 10 # multiply the dataset x times and add random ips - only used for load testing, default false, can be omitted.
38+
es_log:
39+
host: somehost # Optional
40+
start: 2018-01-01 00:00:00 # Optional
41+
stop: 2018-01-02 00:00:00 # Optional
42+
batch_length: 30 # minutes - split start and stop in batch_length periods to avoid overloading the es cluster
43+
save_logs_dir: path/to/directory/to/save/logs # optional
44+
datetime_format: '%Y-%m-%d %H:%M:%S'
45+
cache_expire_time: 604800 # sec (604800 = 1 week)
46+
cross_reference: False # search MISP for IPs
47+
model_version_id: n # optional
48+
extra_features: # useful when we need to calculate more features than the model requests or when there is no model
49+
- 'example_feature_average'
50+
metrics:
51+
port: 8998
52+
performance:
53+
pipeline: # list the name of the methods you want to time for performance
54+
- 'preprocessing'
55+
- 'group_by'
56+
- 'feature_calculation'
57+
- 'label_or_predict'
58+
- 'save'
59+
request_set_cache: # list the name of the methods you want to time for performance
60+
- 'instantiate_cache'
61+
- '__getitem__'
62+
- '__contains__'
63+
- 'clean'
64+
features: True # add a metric to time the features
65+
progress: True # add a metric to watch the pipeline progress
66+
data_config:
67+
parser: JSONLogSparkParser
68+
schema: '/path/to/data/samples/sample_log_schema.json'
69+
group_by_cols:
70+
- 'client_request_host'
71+
- 'client_ip'
72+
timestamp_column: '@timestamp'
73+
logpath: /where/to/save/logs.log
74+
log_level: 'ERROR'
75+
76+
spark:
77+
app_name: 'Baskerville' # the application name - can be changed for two different runs - used by the spark UI
78+
master: 'local' # the ip:port of the master node, e.g. spark://someip:7077 to submit to a cluster
79+
parallelism: -1 # controls the number of tasks, -1 means use all cores - used for local master
80+
log_level: 'INFO' # spark logs level
81+
storage_level: 'OFF_HEAP' # which strategy to use for storing dfs - valid values are the ones found here: https://spark.apache.org/docs/2.4.0/api/python/_modules/pyspark/storagelevel.html default: OFF_HEAP
82+
jars: '/path/to/jars/postgresql-42.2.4.jar,/path/to/spark-iforest-2.4.0.jar,/path/to/elasticsearch-spark-20_2.11-5.6.5.jar' # or /path/to/jars/mysql-connector-java-8.0.11.jar
83+
session_timezone: 'UTC'
84+
shuffle_partitions: 14 # depends on your dataset and your hardware, usually ~ 2 * number of cores is a good choice
85+
executor_instances: 4 # omitted when running locally
86+
executor_cores: 4 # omitted when running locally
87+
spark_driver_memory: '6G' # depends on your dataset and the available ram you have. If running locally 6 - 8 GB should be a good choice, depending on the amount of data you need to process
88+
db_driver: 'org.postgresql.Driver' # or for mysql: 'com.mysql.cj.jdbc.Driver'
89+
metrics_conf: /path/to/data/spark.metrics # Optional: required only to export spark metrics
90+
jar_packages: 'com.banzaicloud:spark-metrics_2.11:2.3-2.0.4,io.prometheus:simpleclient:0.3.0,io.prometheus:simpleclient_dropwizard:0.3.0,io.prometheus:simpleclient_pushgateway:0.3.0,io.dropwizard.metrics:metrics-core:3.1.2' # required to export spark metrics
91+
jar_repositories: 'https://raw.github.com/banzaicloud/spark-metrics/master/maven-repo/releases' # Optional: Required only to export spark metrics
92+
event_log: True
93+
serializer: 'org.apache.spark.serializer.KryoSerializer'
94+
kryoserializer_buffer_max: '2024m' # 2024m and 1024k are the max values the KryoSerializer can handle
95+
kryoserializer_buffer: '1024k' # It is suggested that you omit setting kryoserializer_buffer_max and kryoserializer_buffer and only set them if you get serialization errors.
96+
driver_java_options: '-verbose:gc' # Optional. When on a local machine with less than 36GB of ram -XX:+UseCompressedOops
97+
executor_extra_java_options: '-verbose:gc' # Optional. When on a local machine with less than 36GB of ram -XX:+UseCompressedOops
98+
# to connect to the jvm for memory profiling and debugging (remove the -Dcom.sun.management.jmxremote.port=1098 if more than one executor because it will cause the other executors to fail):
99+
# -XX:+PrintFlagsFinal -XX:+PrintReferenceGC -verbose:gc -XX:+PrintGCDetails -XX:+PrintGCTimeStamps -XX:+UnlockDiagnosticVMOptions -Dcom.sun.management.jmxremote -Dcom.sun.management.jmxremote.ssl=false -Dcom.sun.management.jmxremote.authenticate=false -Dcom.sun.management.jmxremote.port=1098
100+
# depending on your configuration and resources:
101+
# -Dio.netty.noPreferDirect=true -Dio.netty.allocator.type=unpooled -XX:+UseCompressedOops -XX:G1HeapRegionSize=10 -XX:+UseG1GC -XX:ParallelGCThreads=8 -XX:ConcGCThreads=2 -XX:InitiatingHeapOccupancyPercent=25
102+
# UseG1GC is usually the best option
103+
# number of ParallelGCThreads cannot go above the number of cores
104+
# ConcGCThreads=2 : two per core is a reasonable option that works well on most cases
105+
# InitiatingHeapOccupancyPercent=25: allocate 25% for heap - this has to be tested on your machine to see which percentage works well

0 commit comments

Comments
 (0)