diff --git a/.github/workflows/unit_tests.yml b/.github/workflows/unit_tests.yml index 6304459d..956f0d0e 100644 --- a/.github/workflows/unit_tests.yml +++ b/.github/workflows/unit_tests.yml @@ -12,7 +12,7 @@ on: jobs: build: - runs-on: ubuntu-latest + runs-on: ubuntu-18.04 steps: - uses: actions/checkout@v2 @@ -28,17 +28,12 @@ jobs: git clone -b categorical_features https://github.com/equalitie/spark-iforest.git cd spark-iforest/python python setup.py sdist - pip install dist/pyspark-iforest-2.4.0.tar.gz + pip install dist/pyspark-iforest-2.4.0.99.tar.gz cd ../../ - git clone https://github.com/equalitie/esretriever.git - cd esretriever - sudo pip install -e . - cd .. mkdir ./src/baskerville/logs/ - name: Lint with flake8 run: | - cd ./src - flake8 . --count --ignore=C901,W503,W504,E226 --max-line-length=127 --statistics + ./linting.sh - name: Test with pytest run: | @@ -46,4 +41,4 @@ jobs: pytest ./tests/unit - name: License check run: | - python ./src/baskerville/util/licensing.py \ No newline at end of file + python ./src/baskerville/util/licensing.py diff --git a/Dockerfile b/Dockerfile index 18dd9661..adc4d31f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -15,7 +15,7 @@ RUN rm -r $JAVA_HOME/* RUN mv jdk262/openlogic-openjdk-8u262-b10-linux-64/* $JAVA_HOME/ COPY ./src /usr/local/baskerville/src -COPY ./data/jars /usr/local/baskerville/data/jars +COPY ./data /usr/local/baskerville/data COPY ./requirements.txt /usr/local/baskerville WORKDIR /usr/local/baskerville diff --git a/data/geoip2/GeoLite2-Country.mmdb b/data/geoip2/GeoLite2-Country.mmdb new file mode 100644 index 00000000..035cc01b Binary files /dev/null and b/data/geoip2/GeoLite2-Country.mmdb differ diff --git a/data/samples/ats_log_schema.json b/data/samples/ats_log_schema.json index 72f64f67..dffd544f 100644 --- a/data/samples/ats_log_schema.json +++ b/data/samples/ats_log_schema.json @@ -5,7 +5,7 @@ "type": "string", "format": "date", "pattern": "(\\d\\d\\d\\d-([0-2])?\\d-([0-3])?\\dT?([0-2])?\\d:([0-5])?\\d:([0-5])?\\d\\.\\d?\\d?\\d?Z?)", - "required": true + "required": false }, "ISP": { "type": "string" diff --git a/data/samples/sample_vectors/._SUCCESS.crc b/data/samples/sample_vectors/._SUCCESS.crc deleted file mode 100644 index 3b7b0449..00000000 Binary files a/data/samples/sample_vectors/._SUCCESS.crc and /dev/null differ diff --git a/data/samples/sample_vectors_one_client/._SUCCESS.crc b/data/samples/sample_vectors_one_client/._SUCCESS.crc deleted file mode 100644 index 3b7b0449..00000000 Binary files a/data/samples/sample_vectors_one_client/._SUCCESS.crc and /dev/null differ diff --git a/data/samples/weblog_schema.json b/data/samples/weblog_schema.json new file mode 100644 index 00000000..53df1a8c --- /dev/null +++ b/data/samples/weblog_schema.json @@ -0,0 +1,64 @@ +{ + "name": "Weblogs", + "properties": { + "datestamp": { + "type": "string", + "format": "date", + "pattern": "(\\d\\d\\d\\d-([0-2])?\\d-([0-3])?\\dT?([0-2])?\\d:([0-5])?\\d:([0-5])?\\d\\.\\d?\\d?\\d?Z?)", + "required": true + }, + "cache_result": { + "type": "string" + }, + "client_ip": { + "type": "string", + "pattern": "(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)(.(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)){3}" + }, + "client_request_host": { + "type": "string" + }, + "client_request_method": { + "type": "string", + "default": "" + }, + "client_ua": { + "type": "string" + }, + "client_url": { + "type": "string" + }, + "content_type": { + "type": "string" + }, + "http_request_scheme": { + "type": "string" + }, + "http_response_code": { + "type": "string", + 
"pattern": "[1-5][0-9][0-9]" + }, + "querystring": { + "type": "string" + }, + "reply_length_bytes": { + "type": "string" + }, + "geoip": { + "location": { + "lon": { + "type": "string", + "format": "number" + }, + "lat": { + "type": "string", + "format": "number" + } + }, + "country_name":{ + "type": "string" + } + } + }, + "required": ["datestamp", "client_ip", "client_request_host", "client_ua", "client_url", "content_type", "http_response_code", "querystring", "reply_length_bytes", "geoip"], + "additionalProperties": false +} \ No newline at end of file diff --git a/deployment/README.md b/deployment/README.md index 38b3871c..8b6facfe 100644 --- a/deployment/README.md +++ b/deployment/README.md @@ -43,7 +43,10 @@ cd .. * create Kafka secret: ```commandline -kubectl create secret generic kafka-jks --from-file=./truststore/kafka.truststore.jks --from-file=./kafka-0.keystore.jks --from-file=./kafka-1.keystore.jks --from-file=./kafka-2.keystore.jks +kubectl create secret generic kafka-jks-0 --from-file=./kafka.truststore.jks --from-file=./kafka.keystore.jks +kubectl create secret generic kafka-jks-1 --from-file=./kafka.truststore.jks --from-file=./kafka.keystore.jks +kubectl create secret generic kafka-jks-2 --from-file=./kafka.truststore.jks --from-file=./kafka.keystore.jks + ``` # Kafka @@ -61,7 +64,34 @@ nodeSelector: * deploy kafka ```commandline helm repo add bitnami https://charts.bitnami.com/bitnami -helm install kafka -f deployment/kafka/values-kafka.yaml bitnami/kafka +helm install kafka -f deployment/kafka/values-kafka.yaml ../charts/bitnami/kafka +helm install kafka -f deployment/kafka/values-kafka_new.yaml ../charts/bitnami/kafka + +helm install kafka9 -f deployment/kafka/values-kafka9.yaml ../charts/bitnami/kafka + +helm install kafkab ../charts/bitnami/kafka -f deployment/kafka/values-kafkab.yaml + +helm upgrade kafkab ../charts/bitnami/kafka -f deployment/kafka/values_kafkab_new.yaml + +kubectl apply -f deployment/kafka/kafkab-loadbalancers.yaml +kubectl delete svc kafkab-0-external kafkab-1-external kafkab-2-external + +helm -n default upgrade kafkab ../charts/bitnami/kafka \ + --reuse-values \ + --set image.repository=docker.io/bitnamilegacy/kafka + + + +helm upgrade --install kafkab oci://registry-1.docker.io/bitnamicharts/kafka:32.3.10 \ + -n $NS -f /tmp/kafkab-jks.yaml --wait --timeout 30m + +helm upgrade --install kafkab oci://registry-1.docker.io/bitnamicharts/kafka \ + -f deployment/kafka/values-kafkab.yaml \ + --version 32.3.10 \ + --set existingKraftSecret=kafkab-kraft \ + --wait + + ``` * follow the displayed instruction to get kafka connection string: @@ -124,7 +154,7 @@ rm -r keystore/ ``` * Create a pod for ACL commands ```commandline -kubectl run kafka-client --restart='Never' --image docker.io/bitnami/kafka:2.8.0-debian-10-r43 --namespace default --command -- sleep infinity +kubectl run kafka-client --restart='Never' --image docker.io/bitnami/kafka:2.8.0-debian-10-r43 --env="ALLOW_PLAINTEXT_LISTENER=yes" --namespace default --command -- sleep infinity ``` * login to the `kafka-client` pod ```commandline @@ -270,6 +300,22 @@ Create four argo workflow templates from each file in `deployment/argo'. 
* copy paste the content of the file * save the template + +## TimescaleDB +```commandline +helm install timescaledb -f ./deployment/timescaledb/values.yaml timescale/timescaledb-multinode +``` + +```commandline +kubectl get secret --namespace default timescaledb-data -o jsonpath="{.data.password-superuser}" | base64 --decode +``` + +```commandline +kubectl port-forward service/timescaledb 5432:5432 +``` + +* modify password for `postgres` user manually in `psql` + ## Postgres * deploy postgres pod: ``` @@ -281,6 +327,46 @@ kubectl apply -f deployment/postgres/postgres.yaml kubectl apply -f deployment/postgres/postgres_lb.yaml ``` +* port forwarding +``` +kubectl port-forward service/postgres-db-lb 5432:5432 +``` + +## Install Ingress + +* Repo +```commandline +helm repo add ingress-nginx https://kubernetes.github.io/ingress-nginx +helm repo update +``` + +* Choose version based on k8s cluster version in [https://github.com/kubernetes/ingress-nginx/] +and update the version tag in `./ingress/congroller/nginx/values/yaml` + +* Deploy Ingress-Nginx +```commandline +helm -n ingress-nginx install ingress-nginx \ +ingress-nginx/ingress-nginx --create-namespace \ +--version 4.2.5 \ +-f ./ingress-nginx/values.yaml +``` +## Install Certificate Manager +```commandline +helm repo add jetstack https://charts.jetstack.io +helm repo update +kubectl apply -f https://github.com/cert-manager/cert-manager/releases/download/v1.13.2/cert-manager.crds.yaml + +helm install \ + cert-manager jetstack/cert-manager \ + --namespace cert-manager \ + --create-namespace \ + --version v1.13.2 + +k apply -f deployment/cert/issuer.yaml +``` + + + ## Grafana * set your postgres password in the datasource `deployment/grafana/datasources/postgres.yaml` ```yaml @@ -305,6 +391,11 @@ kubectl create configmap dashboard-trafficlight --from-file=deployment/grafana/d helm install grafana -f deployment/grafana/values-grafana.yaml bitnami/grafana ``` +* deploy grafana ingress: +```commandline +k apply -f deployment/grafana/ingress-grafana.yaml +``` + ## Baskerville images * Spark image (https://levelup.gitconnected.com/spark-on-kubernetes-3d822969f85b) ```commandline @@ -400,3 +491,335 @@ kafka: ssl_keyfile: '/usr/local/baskerville/kafka/admin.key.pem' api_version: '0.11.5' ``` + +## Install KSQL +KSQL is performing 5 minute window aggregation over the two topics: `deflect.log` and `banjax.log` + +* clone the repo `git@github.com:confluentinc/cp-helm-charts.git` +``` +cd baskerville +cd .. +git clone git@github.com:confluentinc/cp-helm-charts.git +``` +* install schema registry +```commandline +helm install ksql-schema-registry -f deployment/ksql/values-ksql-registry.yaml ../cp-helm-charts/charts/cp-schema-registry +``` + +* install `ksql` +```commandline +helm install ksql -f deployment/ksql/values-ksql.yaml ../cp-helm-charts/charts/cp-ksql-server +``` + +* connect to ksql cli to confirm the deployment +```commandline +kubectl run ksql-cli --rm -i --tty --image confluentinc/cp-ksql-cli:5.2.1 http://ksql-cp-ksql-server:8088 +``` +or +```commandline +kubectl attach ksql-cli -c ksql-cli -i -t +``` + +To make sure KSQL is up and running you can list kafka topics inside KSQL: +```commandline +show topics; +``` + +* create the cstat KSQL queries. Copy the content of `./deployment/ksql/create_queries.sql` +and paste it inside KSQL cli pod. +Make sure you don't have any errors. 
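
For illustration only, a minimal sketch of what one of these 5-minute window aggregations can look like. The authoritative statements live in `./deployment/ksql/create_queries.sql`; the stream name, column list, and the `HISTOGRAM` field below are assumptions drawn from the weblog schema and topic names elsewhere in this document:

```
-- Hypothetical sketch, not the contents of create_queries.sql.
-- Source topic and columns are assumed from data/samples/weblog_schema.json.
CREATE STREAM weblogs_raw (client_request_host VARCHAR, client_ip VARCHAR, client_ua VARCHAR)
  WITH (KAFKA_TOPIC='deflect.log', VALUE_FORMAT='JSON');

-- 5-minute tumbling window per host, counting hits and building a user-agent histogram.
CREATE TABLE STATS_WEBLOGS_DICTIONARY_5M AS
  SELECT client_request_host,
         COUNT(*) AS allhits,
         HISTOGRAM(client_ua) AS ua
  FROM weblogs_raw
  WINDOW TUMBLING (SIZE 5 MINUTES)
  GROUP BY client_request_host;
```

The real queries additionally cover the `banjax.log` topic and more fields; use this sketch only to sanity-check that windowed tables are being created without errors.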
+ +* to check KSQL logs: +List the pods: +```commandline +kubectl get pods +``` +Locate one of the ksql pods, for example, ksql-cp-ksql-server-5b7466c57f-89vx5 +Get the logs: +```commandline +kubectl logs ksql-cp-ksql-server-5b7466c57f-89vx5 cp-ksql-server --since=5m +``` + +* To confirm the output of KSQL (in kafka cli pod): +```commandline +kafka-console-consumer.sh --bootstrap-server 'kafka-0.kafka-headless.default.svc.cluster.local:9093,kafka-1.kafka-headless.default.svc.cluster.local:9093,kafka-2.kafka-headless.default.svc.cluster.local:9093' --topic STATS_WEBLOGS_DICTIONARY_5M +kafka-console-consumer.sh --bootstrap-server 'kafka-0.kafka-headless.default.svc.cluster.local:9093,kafka-1.kafka-headless.default.svc.cluster.local:9093,kafka-2.kafka-headless.default.svc.cluster.local:9093' --topic STATS_BANJAX_DICTIONARY_5M +``` + +* To change retention policy of cstats topics to 24 hours: +login go kafka-client pod: +```commandline +kubectl run kafka-client --restart='Never' --image docker.io/bitnami/kafka:2.8.0-debian-10-r43 --namespace default --command -- +or +kubectl exec --tty -i kafka-client --namespace default -- bash +``` +change the retention policy: +```commandline +kafka-configs.sh --bootstrap-server kafka-0.kafka-headless.default.svc.cluster.local:9093 --alter --entity-type topics --entity-name STATS_WEBLOGS_5M --add-config retention.ms=86400000 +kafka-configs.sh --bootstrap-server kafka-0.kafka-headless.default.svc.cluster.local:9093 --alter --entity-type topics --entity-name STATS_BANJAX_5M --add-config retention.ms=86400000 +``` + +* To delete KSQL query or table: +1) get query id from `show queries` +2) terminate query with `terminate query_id` +3) drop query or table with the corresponding topic: +`drop stream query_name delete topic` +or +`drop table table_name delete topic` + + +## Uninstalling KSQL + +* stop the helm charts: +```commandline +helm delete ksql +helm delete ksql-schema-registry +``` + +* delete kafka topics: +```commandline +kubectl run kafka-client --restart='Never' --image docker.io/bitnami/kafka:2.8.0-debian-10-r43 --namespace default --command -- + +kubectl run kafka-client \ + --restart='Never' \ + --image=docker.io/bitnami/kafka:2.8.0-debian-10-r43 \ + --namespace default \ + --env="ALLOW_PLAINTEXT_LISTENER=yes" \ + --command -- sleep infinity + +``` +or +``` +kubectl exec --tty -i kafka-client --namespace default -- bash +``` +then inside kafka-client pod: +``` +kafka-topics.sh --bootstrap-server kafka-0.kafka-headless.default.svc.cluster.local:9093 --delete --topic '_confluent-ksql-.*' +kafka-topics.sh --bootstrap-server kafka-0.kafka-headless.default.svc.cluster.local:9093 --delete --topic 'STATS_.*' +kafka-topics.sh --bootstrap-server kafka-0.kafka-headless.default.svc.cluster.local:9093 --delete --topic _schemas +``` + +* Set the maximum message size to 10M: +```commandline +kafka-configs.sh --bootstrap-server 'kafka-0.kafka-headless.default.svc.cluster.local:9093,kafka-1.kafka-headless.default.svc.cluster.local:9093,kafka-2.kafka-headless.default.svc.cluster.local:9093' --entity-type topics --entity-name STATS_WEBLOGS_5M --alter --add-config max.message.bytes=20000000 +kafka-configs.sh --bootstrap-server 'kafka-0.kafka-headless.default.svc.cluster.local:9093,kafka-1.kafka-headless.default.svc.cluster.local:9093,kafka-2.kafka-headless.default.svc.cluster.local:9093' --entity-type topics --entity-name STATS_LOGSTASH_WEBLOGS_DICTIONARY_5M --alter --add-config max.message.bytes=20000000 +kafka-configs.sh --bootstrap-server 
'kafka-0.kafka-headless.default.svc.cluster.local:9093,kafka-1.kafka-headless.default.svc.cluster.local:9093,kafka-2.kafka-headless.default.svc.cluster.local:9093' --entity-type topics --entity-name STATS_BANJAX_5M --alter --add-config max.message.bytes=10000000 +kafka-configs.sh --bootstrap-server 'kafka-0.kafka-headless.default.svc.cluster.local:9093,kafka-1.kafka-headless.default.svc.cluster.local:9093,kafka-2.kafka-headless.default.svc.cluster.local:9093' --entity-type topics --entity-name STATS_LOGSTASH_BANJAX_DICTIONARY_5M --alter --add-config max.message.bytes=10000000 +``` + +## KStream +KStream transformer is correcting the format of KSQL output topics in order to be compatible +with the logstash which is is processing the output of KStream. +Logstash has a maximum number of fields within a single message. +The workaround is to convert the resulting output of KSQL 'HISTOGRAM' from a map of values to a list of values. +The reference is https://github.com/gwenshap/kafka-streams-stockstats +> + +* To build Java package +``` +cd deployment/kafka_stream +mvn compile jib:build +``` + +* To deploy KStream +``` +kubectl create -f ./deployment/kafka_stream/baskerville-cstats-deployment.yaml +``` + +* To delete KStream +``` + kubectl delete -f ./deployment/kafka_stream/baskerville-cstats-deployment.yaml +``` + +* To increase the maximum message size (in kafka cli): +``` +kafka-configs.sh --bootstrap-server 'kafka-0.kafka-headless.default.svc.cluster.local:9093' --entity-type topics --entity-name STATS_LOGSTASH_WEBLOGS_DICTIONARY_5M --alter --add-config max.message.bytes=30000000 +kafka-configs.sh --bootstrap-server 'kafka-0.kafka-headless.default.svc.cluster.local:9093' --entity-type topics --entity-name STATS_WEBLOGS_5M --alter --add-config max.message.bytes=30000000 +``` + +* To reduce the retention policy of four filebeat topics: +```commandline +filebeat_deflect_access +kafka-configs.sh --bootstrap-server kafka-0.kafka-headless.default.svc.cluster.local:9093 --alter --entity-type topics --entity-name filebeat_deflect_access --add-config retention.ms=1200000 +kafka-configs.sh --bootstrap-server kafka-0.kafka-headless.default.svc.cluster.local:9093 --alter --entity-type topics --entity-name filebeat_deflect_access_temp --add-config retention.ms=1200000 +kafka-configs.sh --bootstrap-server kafka-0.kafka-headless.default.svc.cluster.local:9093 --alter --entity-type topics --entity-name filebeat_banjax --add-config retention.ms=1200000 +kafka-configs.sh --bootstrap-server kafka-0.kafka-headless.default.svc.cluster.local:9093 --alter --entity-type topics --entity-name filebeat_banjax_access_temp --add-config retention.ms=1200000 +kafka-configs.sh --bootstrap-server kafka-0.kafka-headless.default.svc.cluster.local:9093 --alter --entity-type topics --entity-name logstash_deflect.log --add-config retention.ms=1200000 +kafka-configs.sh --bootstrap-server kafka-0.kafka-headless.default.svc.cluster.local:9093 --alter --entity-type topics --entity-name logstash_banjax --add-config retention.ms=1200000 + +``` + +## Logstash + +### download a fresh asn database and put it to +```commandline +/deployment/logstash/GeoLite2-ASN.mmdb +``` +### create database image +```commandline +cd deployment/logstash +docker build -t equalitie/baskerville_geoip:latest . +docker push equalitie/baskerville_geoip:latest +cd ../.. 
+``` + +### create logstash certificates secret +kubectl create secret generic logstash-tls-secret \ + --from-file=caroot.pem \ + --from-file=certificate.pem \ + --from-file=key.pem + +### +```commandline + +helm install logstash -f deployment/logstash/values-logstash.yaml bitnami/logstash --version 5.1.15 +``` + +### Logstash loadbalancer +```commandline +kubectl apply -f ./deployment/logstash/logstash-lb.yaml +``` +## Elastic Search +* Install the basic version +``` +helm install elasticsearch --version 7.17.3 elastic/elasticsearch -f ./deployment/elasticsearch/values.yaml +``` +* Create certificates +``` +kubectl exec -it elasticsearch-master-0 -- /bin/bash +cd /usr/share/elasticsearch/bin/ +elasticsearch-certutil ca +CA password: ca_password +elasticsearch-certutil cert --ca elastic-stack-ca.p12 +certificate password: certificate_password +``` + +* Get the certificates and create secrets +``` +k cp elasticsearch-master-0:/usr/share/elasticsearch/elastic-certificates.p12 ./deployment/elasticsearch/cert/elastic-certificates.p12 +k cp elasticsearch-master-0:/usr/share/elasticsearch/elastic-stack-ca.p12 ./deployment/elasticsearch/cert/elastic-stack-ca.p12 + +kubectl create secret generic elastic-certificates --from-file=./deployment/elasticsearch/cert/elastic-certificates.p12 +kubectl create secret generic elastic-certificates-password --from-literal=password='certificate_password' +``` + +* paste certificate password into ./deployment/elasticsearch/values.yaml +``` +xpack.security.http.ssl.keystore.password: +xpack.security.http.ssl.truststore.password: +xpack.security.transport.ssl.keystore.password: +xpack.security.transport.ssl.truststore.password: +``` + +* restart Elasticsearch +``` +helm delete elasticsearch +helm install elasticsearch --version 7.17.3 elastic/elasticsearch -f ./deployment/elasticsearch/values.yaml +``` + +* create users +``` +kubectl exec -it elasticsearch-master-0 -- /bin/bash +/usr/share/elasticsearch/bin/elasticsearch-setup-passwords auto +``` +Copy the generated passwords. 
+ +* create secret for `elastic` user and restart Elasticsearch +``` +kubectl create secret generic elastic-secret \ + --from-literal=username=elastic \ + --from-literal=password='xxx' + +uncomment extraEnvs section in ./deployment/elsticsearch/values.yaml +helm delete elasticsearch + +helm install elasticsearch elastic/elasticsearch -f ./deployment/elasticsearch/values.yaml +``` + +* Install Kibana +``` +cd ./deployment/kibana/ +mkdir cert +cd cert +openssl req -newkey rsa:2048 -nodes -keyout kibana.key -x509 -days 365 -out kibana.crt + +cd ./deployment/elasticsearch/cert +openssl pkcs12 -in elastic-certificates.p12 -cacerts -nokeys -out elastic-ca.pem + +cd ../../../ + +kubectl create secret generic kibana-certificates \ + --from-file=./deployment/elasticsearch/cert/elastic-ca.pem \ + --from-file=./deployment/kibana/cert/kibana.crt \ + --from-file=./deployment/kibana/cert/kibana.key + +helm install kibana --version 7.17.3 elastic/kibana -f ./deployment/kibana/values.yaml + +kubectl exec -it kibana-kibana-6bcb76b84-dg7qp -- /bin/bash +/usr/share/kibana/bin/kibana-keystore create +/usr/share/kibana/bin/kibana-keystore add elasticsearch.username +enter 'elastic' +/usr/share/kibana/bin/kibana-keystore add elasticsearch.password +enter the password for 'elastic' +exit + +cp kibana-kibana-6bcb76b84-dg7qp:/usr/share/kibana/config/kibana.keystore ./deployment/kibana/cert/kibana.keystore + +kubectl create secret generic kibana-keystore \ + --from-file=./deployment/kibana/cert/kibana.keystore +``` + +uncomment kibana-keystore in ./deployment/kibana/values.yaml + +* delete Kibana +``` +helm delete kibana +``` + +change in ./deployment/kibana/values.yaml +``` +service: + type: LoadBalancer +``` +* install Kibana again +``` +helm install kibana --version 7.17.3 elastic/kibana -f ./deployment/kibana/values.yaml +``` + +* port forwarding for Kibana +``` +kubectl port-forward deployment/kibana-kibana 5601 +``` + +* port forwarding for Elasticsearch +``` +kubectl port-forward service/elasticsearch-master 9200 +``` + +* Test Elasticsearch connection +``` +curl -u "elastic:$ES_PASS" -k "http://localhost:9200" +``` + +* Deleting all the indexes in Elasticsearch: +```commandline +curl -u "elastic:$ES_PASS" -X DELETE 'http://localhost:9200/_all' +``` + +* Install logstash for streaming topics to Elasticsearch +``` +helm install logstash-es elastic/logstash -f ./deployment/logstash_es/values.yaml +``` + +* Install logstash for forwarding to/from dev Kafka +``` +helm install logstash-dev-commands elastic/logstash -f ./deployment/logstash_dev/values_dev_commands.yaml +helm install logstash-dev-reports elastic/logstash -f ./deployment/logstash_dev/values_dev_reports.yaml + +``` + +* Install logstash for forwarding from clearinghouse topics +``` +helm install logstash-ch elastic/logstash -f ./deployment/logstash_ch/values_ch.yaml + +``` \ No newline at end of file diff --git a/deployment/argo/argo-preprocessing-filebeat.yaml b/deployment/argo/argo-preprocessing-filebeat.yaml new file mode 100644 index 00000000..aae5271d --- /dev/null +++ b/deployment/argo/argo-preprocessing-filebeat.yaml @@ -0,0 +1,133 @@ +metadata: + name: preprocessing-filebeat + namespace: default +spec: + templates: + - name: spark-submit + inputs: + parameters: + - name: baskerville_image + - name: config + - name: config-branch + outputs: {} + metadata: {} + container: + name: '' + image: '{{inputs.parameters.baskerville_image}}' + command: + - sh + args: + - /opt/spark/bin/spark-submit + - '--master' + - 
'k8s://https://kubernetes.default.svc' + - '--deploy-mode' + - client + - '--conf' + - spark.kubernetes.namespace=default + - '--conf' + - spark.kubernetes.container.image=sparkimage + - '--conf' + - spark.executor.instances=1 + - '--name' + - preprocessing + - '--conf' + - spark.kubernetes.pyspark.pythonVersion=3 + - '--conf' + - spark.kubernetes.memoryOverheadFactor=0.5 + - '--conf' + - spark.memory.fraction=0.2 + - '--conf' + - spark.kubernetes.executor.request.cores=3 + - '--conf' + - spark.kubernetes.executor.limit.cores=3 + - '--conf' + - spark.executor.memory=4G + - '--conf' + - spark.driver.memory=3G + - '--conf' + - spark.kubernetes.node.selector.nodepool=workers + - '--jars' + - >- + local:///usr/local/baskerville/data/jars/hadoop-aws-2.7.1.jar,local:///usr/local/baskerville/data/jars/aws-java-sdk-1.7.4.jar,local:///usr/local/baskerville/data/jars/spark-iforest-2.4.0.99.jar,local:///usr/local/baskerville/data/jars/postgresql-42.2.4.jar,local:///usr/local/baskerville/data/jars/spark-streaming-kafka-0-8-assembly_2.11-2.4.0.jar,local:///usr/local/baskerville/data/jars/spark-redis_2.11-2.5.0-SNAPSHOT-jar-with-dependencies.jar + - '--conf' + - >- + spark.kubernetes.container.image={{inputs.parameters.baskerville_image}} + - /usr/local/baskerville/src/baskerville/main.py + - preprocessing + - '-c' + - '{{inputs.parameters.config}}' + - '-cb' + - '{{inputs.parameters.config-branch}}' + env: + - name: MY_POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: MY_POD_IP + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: POSTGRES_PASS + valueFrom: + secretKeyRef: + name: baskerville-secrets + key: postgres_password + - name: S3_ACCESS + valueFrom: + secretKeyRef: + name: baskerville-secrets + key: s3_access + - name: S3_SECRET + valueFrom: + secretKeyRef: + name: baskerville-secrets + key: s3_secret + - name: S3_ENDPOINT + valueFrom: + secretKeyRef: + name: baskerville-secrets + key: s3_endpoint + - name: REDIS_PASSWORD + valueFrom: + secretKeyRef: + name: baskerville-secrets + key: redis_password + - name: POSTGRES_HOST + valueFrom: + secretKeyRef: + name: baskerville-secrets + key: postgres_host + - name: POSTGRES_USER + valueFrom: + secretKeyRef: + name: baskerville-secrets + key: postgres_user + - name: KAFKA_HOST + valueFrom: + secretKeyRef: + name: baskerville-secrets + key: kafka_host + resources: {} + volumeMounts: + - name: ssh-secrets + mountPath: /usr/local/baskerville/ssh + imagePullPolicy: Always + retryStrategy: + limit: 3 + retryPolicy: Always + entrypoint: spark-submit + arguments: + parameters: + - name: baskerville_image + value: 'equalitie/baskerville:latest' + - name: config + value: 'git@github.com:equalitie/baskerville_config.git' + - name: config-branch + value: filebeat + volumes: + - name: ssh-secrets + secret: + secretName: ssh-secrets + defaultMode: 384 + nodeSelector: + nodepool: workers diff --git a/deployment/argo/argo_training_classifier.yaml b/deployment/argo/argo_training_classifier.yaml new file mode 100644 index 00000000..fcecf9d3 --- /dev/null +++ b/deployment/argo/argo_training_classifier.yaml @@ -0,0 +1,124 @@ +metadata: + name: training-classifier + namespace: default +spec: + templates: + - name: spark-submit + inputs: + parameters: + - name: baskerville_image + - name: config + - name: config-branch + outputs: {} + metadata: {} + container: + name: '' + image: '{{inputs.parameters.baskerville_image}}' + command: + - sh + args: + - /opt/spark/bin/spark-submit + - '--master' + - 'k8s://https://kubernetes.default.svc' + - 
'--deploy-mode' + - client + - '--conf' + - spark.kubernetes.namespace=default + - '--conf' + - spark.kubernetes.container.image=sparkimage + - '--conf' + - spark.executor.instances=3 + - '--name' + - training_classifier + - '--conf' + - spark.kubernetes.pyspark.pythonVersion=3 + - '--conf' + - spark.kubernetes.memoryOverheadFactor=0.5 + - '--conf' + - spark.memory.fraction=0.2 + - '--conf' + - spark.kubernetes.executor.request.cores=2 + - '--conf' + - spark.kubernetes.executor.limit.cores=2 + - '--conf' + - spark.executor.memory=6G + - '--jars' + - >- + local:///usr/local/baskerville/data/jars/hadoop-aws-2.7.1.jar,local:///usr/local/baskerville/data/jars/aws-java-sdk-1.7.4.jar,local:///usr/local/baskerville/data/jars/spark-iforest-2.4.0.99.jar,local:///usr/local/baskerville/data/jars/postgresql-42.2.4.jar,local:///usr/local/baskerville/data/jars/spark-streaming-kafka-0-8-assembly_2.11-2.4.0.jar,local:///usr/local/baskerville/data/jars/spark-redis_2.11-2.5.0-SNAPSHOT-jar-with-dependencies.jar + - '--conf' + - >- + spark.kubernetes.container.image={{inputs.parameters.baskerville_image}} + - /usr/local/baskerville/src/baskerville/main.py + - training_classifier + - '-c' + - '{{inputs.parameters.config}}' + - '-cb' + - '{{inputs.parameters.config-branch}}' + env: + - name: MY_POD_NAME + valueFrom: + fieldRef: + fieldPath: metadata.name + - name: MY_POD_IP + valueFrom: + fieldRef: + fieldPath: status.podIP + - name: POSTGRES_PASS + valueFrom: + secretKeyRef: + name: baskerville-secrets + key: postgres_password + - name: S3_ACCESS + valueFrom: + secretKeyRef: + name: baskerville-secrets + key: s3_access + - name: S3_SECRET + valueFrom: + secretKeyRef: + name: baskerville-secrets + key: s3_secret + - name: S3_ENDPOINT + valueFrom: + secretKeyRef: + name: baskerville-secrets + key: s3_endpoint + - name: REDIS_PASSWORD + valueFrom: + secretKeyRef: + name: baskerville-secrets + key: redis_password + - name: POSTGRES_HOST + valueFrom: + secretKeyRef: + name: baskerville-secrets + key: postgres_host + - name: POSTGRES_USER + valueFrom: + secretKeyRef: + name: baskerville-secrets + key: postgres_user + - name: KAFKA_HOST + valueFrom: + secretKeyRef: + name: baskerville-secrets + key: kafka_host + resources: {} + volumeMounts: + - name: ssh-secrets + mountPath: /usr/local/baskerville/ssh + imagePullPolicy: Always + entrypoint: spark-submit + arguments: + parameters: + - name: baskerville_image + value: 'equalitie/baskerville:latest' + - name: config + value: 'git@github.com:equalitie/baskerville_config.git' + - name: config-branch + value: main + volumes: + - name: ssh-secrets + secret: + secretName: ssh-secrets + defaultMode: 384 diff --git a/deployment/cert/issuer.yaml b/deployment/cert/issuer.yaml new file mode 100644 index 00000000..fc9b57d7 --- /dev/null +++ b/deployment/cert/issuer.yaml @@ -0,0 +1,18 @@ +apiVersion: cert-manager.io/v1 +kind: Issuer +metadata: + name: letsencrypt-prod +spec: + acme: + # The ACME server URL + server: https://acme-v02.api.letsencrypt.org/directory + # Email address used for ACME registration + email: mazhurin@gmail.com + # Name of a secret used to store the ACME account private key + privateKeySecretRef: + name: letsencrypt-prod + # Enable the HTTP-01 challenge provider + solvers: + - http01: + ingress: + ingressClassName: nginx \ No newline at end of file diff --git a/deployment/elasticsearch/values.yaml b/deployment/elasticsearch/values.yaml new file mode 100644 index 00000000..b9cdc8cf --- /dev/null +++ b/deployment/elasticsearch/values.yaml @@ -0,0 +1,41 @@ 
+replicas: 3 + +image: "docker.elastic.co/elasticsearch/elasticsearch" +imageTag: "7.17.3" + +extraEnvs: + - name: ELASTIC_USERNAME + valueFrom: + secretKeyRef: + name: elastic-secret + key: username + - name: ELASTIC_PASSWORD + valueFrom: + secretKeyRef: + name: elastic-secret + key: password + +secretMounts: + - name: elastic-certificates + secretName: elastic-certificates + path: /usr/share/elasticsearch/config/certs + +volumeClaimTemplate: + accessModes: ["ReadWriteOnce"] + resources: + requests: + storage: 500Gi + +esConfig: + elasticsearch.yml: | + cluster.name: "docker-cluster" + network.host: 0.0.0.0 + xpack.security.enabled: true + xpack.security.transport.ssl.enabled: true + xpack.security.transport.ssl.verification_mode: certificate + xpack.security.transport.ssl.client_authentication: required + xpack.security.transport.ssl.keystore.path: elastic-certificates.p12 + xpack.security.transport.ssl.truststore.path: elastic-certificates.p12 + xpack.security.authc.api_key.enabled: true + xpack.security.transport.ssl.keystore.password: xxx + xpack.security.transport.ssl.truststore.password: xxx diff --git a/deployment/filebeat/filebeat-config.yaml b/deployment/filebeat/filebeat-config.yaml new file mode 100644 index 00000000..e579e474 --- /dev/null +++ b/deployment/filebeat/filebeat-config.yaml @@ -0,0 +1,15 @@ +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: filebeat-test-config +data: + test.log: | + {"time_local": "18/May/2023:21:59:15 +0000","request_id": "867cbb2c7e3be106085c039024a9dca3","client_user": "", + "client_ip": "85.10.195.146","http_request_scheme": "http","client_request_method": "GET","client_request_host" + : "testnewsite2.com","http_response_code": 444,"reply_length_bytes": 0,"cache_result": "","http_request_version + ": "HTTP/1.1","referer": "","client_ua": "python-requests/2.20.0","client_url": "/","querystring": "","proxy_ho + st": "","proxy_port": "","content_type": "","request_time": 0.000,"forwardedfor": "","loc_in": "","loc_out": "" + ,"upstream_addr": "","upstream_status": "","upstream_response_time": "","upstream_header_time": "","upstream_co + nnect_time": "","upstream_bytes_sent": "","upstream_bytes_received": "","banjax_decision": "-","banjax_error": + "-","disable_logging": 0,"cookie_deflect_challenge2": "","cookie_deflect_password2": ""} diff --git a/deployment/filebeat/filebeat.yaml b/deployment/filebeat/filebeat.yaml new file mode 100644 index 00000000..d6c313c8 --- /dev/null +++ b/deployment/filebeat/filebeat.yaml @@ -0,0 +1,235 @@ +apiVersion: v1 +kind: ServiceAccount +metadata: + name: filebeat + namespace: kube-system + labels: + k8s-app: filebeat + +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: test-log + namespace: kube-system +data: + test.log: | + {"time_local": "18/May/2023:21:59:15 +0000","request_id": "867cbb2c7e3be106085c039024a9dca3","client_user": "","client_ip": "85.10.195.146","http_request_scheme": "http","client_request_method": "GET","client_request_host": "testnewsite2.com","http_response_code": 444,"reply_length_bytes": 0,"cache_result": "","http_request_version": "HTTP/1.1","referer": "","client_ua": "python-requests/2.20.0","client_url": "/","querystring": "","proxy_host": "","proxy_port": "","content_type": "","request_time": 0.000,"forwardedfor": "","loc_in": "","loc_out": "","upstream_addr": "","upstream_status": "","upstream_response_time": "","upstream_header_time": "","upstream_connect_time": "","upstream_bytes_sent": "","upstream_bytes_received": "","banjax_decision": 
"-","banjax_error":"-","disable_logging": 0,"cookie_deflect_challenge2": "","cookie_deflect_password2": ""} + +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRole +metadata: + name: filebeat + labels: + k8s-app: filebeat +rules: +- apiGroups: [""] # "" indicates the core API group + resources: + - namespaces + - pods + - nodes + verbs: + - get + - watch + - list +- apiGroups: ["apps"] + resources: + - replicasets + verbs: ["get", "list", "watch"] +- apiGroups: ["batch"] + resources: + - jobs + verbs: ["get", "list", "watch"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: filebeat + # should be the namespace where filebeat is running + namespace: kube-system + labels: + k8s-app: filebeat +rules: + - apiGroups: + - coordination.k8s.io + resources: + - leases + verbs: ["get", "create", "update"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: Role +metadata: + name: filebeat-kubeadm-config + namespace: kube-system + labels: + k8s-app: filebeat +rules: + - apiGroups: [""] + resources: + - configmaps + resourceNames: + - kubeadm-config + verbs: ["get"] +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: ClusterRoleBinding +metadata: + name: filebeat +subjects: +- kind: ServiceAccount + name: filebeat + namespace: kube-system +roleRef: + kind: ClusterRole + name: filebeat + apiGroup: rbac.authorization.k8s.io +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: filebeat + namespace: kube-system +subjects: + - kind: ServiceAccount + name: filebeat + namespace: kube-system +roleRef: + kind: Role + name: filebeat + apiGroup: rbac.authorization.k8s.io +--- +apiVersion: rbac.authorization.k8s.io/v1 +kind: RoleBinding +metadata: + name: filebeat-kubeadm-config + namespace: kube-system +subjects: + - kind: ServiceAccount + name: filebeat + namespace: kube-system +roleRef: + kind: Role + name: filebeat-kubeadm-config + apiGroup: rbac.authorization.k8s.io +--- +apiVersion: v1 +kind: ConfigMap +metadata: + name: filebeat-config + namespace: kube-system + labels: + k8s-app: filebeat +data: + filebeat.yml: |- + filebeat.inputs: + - type: filestream + id: deflect_access + ignore_inactive: since_first_start + parsers: + - ndjson: + keys_under_root: false + paths: + - /usr/share/filebeat/test.log + fields: + dnet: "XXX" + type: deflect_access + log_topic: filebeat_deflect_access + log_topic: filebeat_banjax + + output.kafka: + hosts: ["kafka-0.kafka-headless.default.svc.cluster.local:9093","kafka-1.kafka-headless.default.svc.cluster.local:9093","kafka-2.kafka-headless.default.svc.cluster.local:9093"] + ssl.enabled: false + topic: 'anton7' + key: '%{[json.client_request_host]}' + partition.hash: + hash: [] + required_acks: 1 + reachable_only: true + compression: gzip + max_message_bytes: 1000000 + + + logging.level: "DEBUG" +--- + +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: filebeat + namespace: kube-system + labels: + k8s-app: filebeat +spec: + selector: + matchLabels: + k8s-app: filebeat + template: + metadata: + labels: + k8s-app: filebeat + spec: + serviceAccountName: filebeat + terminationGracePeriodSeconds: 30 + hostNetwork: true + dnsPolicy: ClusterFirstWithHostNet + containers: + - name: filebeat + image: docker.elastic.co/beats/filebeat:8.8.0 + args: [ + "-c", "/etc/filebeat.yml", + "-e", + ] + env: + - name: NODE_NAME + valueFrom: + fieldRef: + fieldPath: spec.nodeName + securityContext: + runAsUser: 0 + # If using Red Hat OpenShift uncomment this: + #privileged: true + resources: + limits: + memory: 200Mi 
+ requests: + cpu: 100m + memory: 100Mi + volumeMounts: + - name: config + mountPath: /etc/filebeat.yml + readOnly: true + subPath: filebeat.yml + - name: test-log + mountPath: /usr/share/filebeat/test.log + readOnly: true + subPath: test.log + - name: data + mountPath: /usr/share/filebeat/data + - name: varlibdockercontainers + mountPath: /var/lib/docker/containers + readOnly: true + - name: varlog + mountPath: /var/log + readOnly: true + volumes: + - name: config + configMap: + defaultMode: 0640 + name: filebeat-config + - name: test-log + configMap: + name: test-log + - name: varlibdockercontainers + hostPath: + path: /var/lib/docker/containers + - name: varlog + hostPath: + path: /var/log + # data folder stores a registry of read status for all files, so we don't send everything again on a Filebeat pod restart + - name: data + hostPath: + # When filebeat runs as non-root user, this directory needs to be writable by group (g+w). + path: /var/lib/filebeat-data + type: DirectoryOrCreate +--- \ No newline at end of file diff --git a/deployment/grafana/ingress-grafana.yaml b/deployment/grafana/ingress-grafana.yaml new file mode 100644 index 00000000..56b796c7 --- /dev/null +++ b/deployment/grafana/ingress-grafana.yaml @@ -0,0 +1,24 @@ +apiVersion: networking.k8s.io/v1 +kind: Ingress +metadata: + name: grafana + annotations: + cert-manager.io/issuer: "letsencrypt-prod" + +spec: + ingressClassName: nginx + tls: + - hosts: + - baskerville-dashboard.deflect.ca + secretName: grafana-tls + rules: + - host: baskerville-dashboard.deflect.ca + http: + paths: + - path: / + pathType: Prefix + backend: + service: + name: grafana + port: + number: 3000 \ No newline at end of file diff --git a/deployment/grafana/values-grafana.yaml b/deployment/grafana/values-grafana.yaml index 5d1a9912..c5371f27 100644 --- a/deployment/grafana/values-grafana.yaml +++ b/deployment/grafana/values-grafana.yaml @@ -17,7 +17,7 @@ dashboardsProvider: enabled: true service: - type: LoadBalancer + type: ClusterIP datasources: secretName: datasource-secret diff --git a/deployment/ingress-baskerville/values.yaml b/deployment/ingress-baskerville/values.yaml new file mode 100644 index 00000000..19daa504 --- /dev/null +++ b/deployment/ingress-baskerville/values.yaml @@ -0,0 +1,4 @@ +controller: + service: + enableHttp: true + enableHttps: true \ No newline at end of file diff --git a/deployment/kafka/kafkab-loadbalancers.yaml b/deployment/kafka/kafkab-loadbalancers.yaml new file mode 100644 index 00000000..e56c3cba --- /dev/null +++ b/deployment/kafka/kafkab-loadbalancers.yaml @@ -0,0 +1,38 @@ +apiVersion: v1 +kind: Service +metadata: + name: kafkab-lb-0 +spec: + type: LoadBalancer + ports: + - name: kafka + port: 9094 + targetPort: kafka + selector: + statefulset.kubernetes.io/pod-name: kafkab-0 +--- +apiVersion: v1 +kind: Service +metadata: + name: kafkab-lb-1 +spec: + type: LoadBalancer + ports: + - name: kafka + port: 9094 + targetPort: kafka + selector: + statefulset.kubernetes.io/pod-name: kafkab-1 +--- +apiVersion: v1 +kind: Service +metadata: + name: kafkab-lb-2 +spec: + type: LoadBalancer + ports: + - name: kafka + port: 9094 + targetPort: kafka + selector: + statefulset.kubernetes.io/pod-name: kafkab-2 diff --git a/deployment/kafka/load_balancer_0.yaml b/deployment/kafka/load_balancer_0.yaml new file mode 100644 index 00000000..88277611 --- /dev/null +++ b/deployment/kafka/load_balancer_0.yaml @@ -0,0 +1,13 @@ +apiVersion: v1 +kind: Service +metadata: + name: kafkab_0 +spec: + type: LoadBalancer + selector: + app: kafka + 
ports: + - name: kafka + port: 9093 + targetPort: 9092 + protocol: diff --git a/deployment/kafka/values-kafka.yaml b/deployment/kafka/values-kafka.yaml index 2ae93410..00a251ab 100644 --- a/deployment/kafka/values-kafka.yaml +++ b/deployment/kafka/values-kafka.yaml @@ -1,38 +1,69 @@ replicaCount: 3 -logRetentionMinutes: 6 +numPartitions: 3 +logRetentionMinutes: 30 +logRetentionBytes: _268435456 logRetentionCheckIntervalMs: 60000 autoCreateTopicsEnable: True +deleteTopicEnable: True -nodeSelector: - nodepool: workers +#nodeSelector: +# nodepool: workers extraEnvVars: - - name: KAFKA_CFG_AUTHORIZER_CLASS_NAME - value: "kafka.security.authorizer.AclAuthorizer" - - name: KAFKA_CFG_SUPER_USERS - value: "User:CN=admin,OU=Unknown,O=Unknown,L=Unknown,ST=Unknown,C=Unknown;User:CN=Unknown,OU=Unknown,O=Unknown,L=Unknown,ST=Unknown,C=Unknown" - - name: KAFKA_CFG_ALLOW_EVERYONE_IF_NO_ACL_FOUND - value: "true" +# - name: KAFKA_CFG_AUTHORIZER_CLASS_NAME +# value: "kafka.security.authorizer.AclAuthorizer" +# - name: KAFKA_CFG_SUPER_USERS +# value: "User:CN=admin,OU=Unknown,O=Unknown,L=Unknown,ST=Unknown,C=Unknown;User:CN=Unknown,OU=Unknown,O=Unknown,L=Unknown,ST=Unknown,C=Unknown" +# - name: KAFKA_CFG_ALLOW_EVERYONE_IF_NO_ACL_FOUND +# value: "true" + - name: KAFKA_CFG_MAX_POLL_RECORDS + value: "100" + - name: KAFKA_CFG_MAX_REQUEST_SIZE + value: "10048576" + - name: KAFKA_CFG_LOG_CLEANUP_POLICY + value: "delete" +serviceAccount: + create: true +rbac: + create: true externalAccess: enabled: True + autoDiscovery: + enabled: True service: - type: NodePort - nodePorts: [30001, 30002, 30003] + type: LoadBalancer + useDNS: True + loadBalancerDomain: prod.deflect.network + loadBalancerBrokerName: kafka + ports: + external: 9094 + +#externalAccess: +# enabled: True +# service: +# type: NodePort +# nodePorts: [30001, 30002, 30003] persistence: enabled: False - -zookeeper: - auth: - enabled: False - persistence: - enabled: False + size: 700Gi + storageClass: csi-cinder-high-speed auth: - tls: - endpointIdentificationAlgorithm: "" clientProtocol: mtls interBrokerProtocol: plaintext - jksSecret: kafka-jks - jksPassword: "B1^ZRUUVoIuKND7t2HiJ8fwRg0kdMo4zdh8m8eRzgXw!" \ No newline at end of file + tls: + type: jks + endpointIdentificationAlgorithm: "" + existingSecrets: ["kafka-jks-0","kafka-jks-1","kafka-jks-2"] + password: "B1^ZRUUVoIuKND7t2HiJ8fwRg0kdMo4zdh8m8eRzgXw!" 
+ zookeeper: + tls: + enabled: False + verifyHostname : False + +zookeeper: + persistence: + enabled: True + size: 20Gi \ No newline at end of file diff --git a/deployment/kafka/values-kafkab.yaml b/deployment/kafka/values-kafkab.yaml new file mode 100644 index 00000000..ed150b91 --- /dev/null +++ b/deployment/kafka/values-kafkab.yaml @@ -0,0 +1,101 @@ +fullnameOverride: kafkab + +kraft: + enabled: true + +global: + security: + allowInsecureImages: true + +image: + registry: docker.io + repository: bitnamilegacy/kafka + tag: 4.0.0-debian-12-r10 + pullPolicy: IfNotPresent + # Turn on Bitnami's verbose startup in BOTH containers (main + init) + debug: true + +controller: + replicaCount: 3 + persistence: + enabled: true + storageClass: csi-cinder-high-speed + size: 20Gi + # (optional but helpful) more time before probes during first boot/format + readinessProbe: + initialDelaySeconds: 60 + livenessProbe: + initialDelaySeconds: 120 + +broker: + replicaCount: 3 + persistence: + enabled: true + storageClass: csi-cinder-high-speed + size: 800Gi + extraEnvVars: + - name: KAFKA_CFG_MAX_POLL_RECORDS + value: "100" + - name: KAFKA_CFG_MAX_REQUEST_SIZE + value: "10048576" + - name: KAFKA_CFG_LOG_CLEANUP_POLICY + value: "delete" + +listeners: + client: + protocol: SSL + containerPort: 9093 + controller: + protocol: PLAINTEXT + containerPort: 9091 + interbroker: + protocol: PLAINTEXT + containerPort: 9092 + external: + protocol: SSL + containerPort: 9094 + sslClientAuth: required + # IMPORTANT: INTERNAL (not INTERBROKER) + advertisedListeners: >- + CLIENT://advertised-address-placeholder:9093, + INTERNAL://advertised-address-placeholder:9092 + +tls: + type: PEM + existingSecret: kafka-pem2 + pemKeyPassword: "" + keystorePassword: "B1^ZRUUVoIuKND7t2HiJ8fwRg0kdMo4zdh8m8eRzgXw!" + truststorePassword: "B1^ZRUUVoIuKND7t2HiJ8fwRg0kdMo4zdh8m8eRzgXw!" + endpointIdentificationAlgorithm: "" + +service: + type: ClusterIP + ports: + client: 9093 + controller: 9091 + interbroker: 9092 + external: 9094 + +externalAccess: + enabled: true + autoDiscovery: + enabled: false + + controller: + service: + type: ClusterIP + domain: default.svc.cluster.local + + broker: + service: + type: LoadBalancer + externalTrafficPolicy: Local + ports: { external: 9094 } + loadBalancerIPs: + - 57.128.91.45 + - 57.128.91.229 + - 57.128.91.4 + loadBalancerNames: + - kafkab0.prod.deflect.network + - kafkab1.prod.deflect.network + - kafkab2.prod.deflect.network diff --git a/deployment/kafka/values_kafkab_new.yaml b/deployment/kafka/values_kafkab_new.yaml new file mode 100644 index 00000000..62c2d67a --- /dev/null +++ b/deployment/kafka/values_kafkab_new.yaml @@ -0,0 +1,60 @@ +replicaCount: 3 + +persistence: + enabled: true + storageClass: csi-cinder-high-speed + size: 800Gi + +image: + repository: docker.io/bitnamilegacy/kafka + pullPolicy: IfNotPresent + +externalAccess: + enabled: true + autoDiscovery: + enabled: false + service: + type: ClusterIP + ports: + external: 9094 + external: + broker: + - name: kafkab0.prod.deflect.network + - name: kafkab1.prod.deflect.network + - name: kafkab2.prod.deflect.network + +listeners: + client: + protocol: SSL # Required for mTLS external clients + external: + protocol: SSL + port: 9094 + +auth: + clientProtocol: mtls # external clients must present a client cert + interBrokerProtocol: plaintext + tls: + type: jks + existingSecrets: + - kafka-jks-0 + - kafka-jks-1 + - kafka-jks-2 + password: "B1^ZRUUVoIuKND7t2HiJ8fwRg0kdMo4zdh8m8eRzgXw!" 
+ endpointIdentificationAlgorithm: "" + + zookeeper: + tls: + enabled: false + verifyHostname: false + +# Custom Kafka config via env variables +extraEnvVars: + - name: KAFKA_CFG_MAX_POLL_RECORDS + value: "100" + - name: KAFKA_CFG_MAX_REQUEST_SIZE + value: "10048576" + - name: KAFKA_CFG_LOG_CLEANUP_POLICY + value: "delete" + +verticalPodAutoscaler: + enabled: false diff --git a/deployment/kafka9/README.md b/deployment/kafka9/README.md new file mode 100644 index 00000000..e25dc10f --- /dev/null +++ b/deployment/kafka9/README.md @@ -0,0 +1,35 @@ +``` + +NS=default + +# 0a) KRaft cluster id (pick one, keep forever for this cluster) +CLUSTER_ID=$(kubectl -n $NS run gen-kid --rm -i --restart=Never \ + --image=docker.io/bitnamilegacy/kafka:4.0.0-debian-12-r10 --quiet -- \ + /opt/bitnami/kafka/bin/kafka-storage.sh random-uuid) +kubectl -n $NS create secret generic kafka9-kraft \ + --from-literal=cluster-id="$CLUSTER_ID" \ + --dry-run=client -o yaml | kubectl apply -f - + + +# 0b) Your PEMs +kubectl -n $NS create secret generic kafka-pem2 \ + --from-file=tls.crt=certificate.pem \ + --from-file=tls.key=key.pem \ + --from-file=ca.crt=caroot.pem + +PW='B1^ZRUUVoIuKND7t2HiJ8fwRg0kdMo4zdh8m8eRzgXw!' +kubectl -n $NS create secret generic kafka9-tls-passwords \ + --from-literal=keystore-password="$PW" \ + --from-literal=truststore-password="$PW" \ + --dry-run=client -o yaml | kubectl apply -f - + + +``` + + +```commandline +kubectl -n $NS apply -f kafka9.yaml +``` + + + diff --git a/deployment/kafka9/kafka9-lb.yaml b/deployment/kafka9/kafka9-lb.yaml new file mode 100644 index 00000000..e4edf053 --- /dev/null +++ b/deployment/kafka9/kafka9-lb.yaml @@ -0,0 +1,38 @@ +apiVersion: v1 +kind: Service +metadata: + name: kafka9-lb-0 +spec: + type: LoadBalancer + ports: + - name: external + port: 9094 + targetPort: external + selector: + statefulset.kubernetes.io/pod-name: kafka9-0 +--- +apiVersion: v1 +kind: Service +metadata: + name: kafka9-lb-1 +spec: + type: LoadBalancer + ports: + - name: external + port: 9094 + targetPort: external + selector: + statefulset.kubernetes.io/pod-name: kafka9-1 +--- +apiVersion: v1 +kind: Service +metadata: + name: kafka9-lb-2 +spec: + type: LoadBalancer + ports: + - name: external + port: 9094 + targetPort: external + selector: + statefulset.kubernetes.io/pod-name: kafka9-2 diff --git a/deployment/kafka9/kafka9.yaml b/deployment/kafka9/kafka9.yaml new file mode 100644 index 00000000..425fad37 --- /dev/null +++ b/deployment/kafka9/kafka9.yaml @@ -0,0 +1,259 @@ +apiVersion: v1 +kind: Service +metadata: + name: kafka9-headless + namespace: default + labels: { app: kafka9 } +spec: + clusterIP: None + publishNotReadyAddresses: true + selector: { app: kafka9 } + ports: + - { name: controller, port: 9091, targetPort: 9091 } + - { name: internal, port: 9092, targetPort: 9092 } + - { name: client, port: 9093, targetPort: 9093 } + - { name: external, port: 9094, targetPort: 9094 } +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: kafka9 + namespace: default + labels: { app: kafka9 } +spec: + serviceName: kafka9-headless + replicas: 3 + podManagementPolicy: Parallel + selector: + matchLabels: { app: kafka9 } + template: + metadata: + labels: { app: kafka9 } + spec: + terminationGracePeriodSeconds: 30 + securityContext: + runAsUser: 1001 + runAsGroup: 1001 + fsGroup: 1001 + fsGroupChangePolicy: "OnRootMismatch" + initContainers: + - name: fix-perms + image: busybox:1.36 + securityContext: + runAsUser: 0 + command: ["sh","-c"] + args: + - | + mkdir -p /bitnami/kafka/data + 
chown -R 1001:1001 /bitnami/kafka + chmod -R g+rwX /bitnami/kafka + volumeMounts: + - { name: data, mountPath: /bitnami/kafka } + # 1) Convert your PEMs -> JKS into /certs + - name: pem-to-jks + image: docker.io/bitnamilegacy/kafka:4.0.0-debian-12-r10 + command: ["/bin/bash","-ec"] + args: + - | + set -euo pipefail + test -f /mounted-certs/tls.crt + test -f /mounted-certs/tls.key + test -f /mounted-certs/ca.crt + openssl pkcs12 -export \ + -in /mounted-certs/tls.crt \ + -inkey /mounted-certs/tls.key \ + -name kafka \ + -out /certs/kafka.p12 \ + -passout pass:${KEYPASS} + keytool -importkeystore -noprompt \ + -srckeystore /certs/kafka.p12 -srcstoretype PKCS12 -srcstorepass "${KEYPASS}" \ + -destkeystore /certs/kafka.keystore.jks -deststorepass "${KEYPASS}" + keytool -import -noprompt -alias CARoot \ + -file /mounted-certs/ca.crt \ + -keystore /certs/kafka.truststore.jks -storepass "${KEYPASS}" + rm -f /certs/kafka.p12 + env: + - name: KEYPASS + valueFrom: { secretKeyRef: { name: kafka9-tls-passwords, key: keystore-password } } + volumeMounts: + - { name: pem, mountPath: /mounted-certs, readOnly: true } + - { name: certs, mountPath: /certs } + # 2) Write a complete server.properties the broker will use + - name: render-config + image: docker.io/bitnamilegacy/kafka:4.0.0-debian-12-r10 + command: ["/bin/bash","-ec"] + args: + - | + set -euo pipefail + + # derive ordinal and per-pod hostnames + ORD="${MY_POD_NAME##*-}" + FQDN_BASE="kafka9-${ORD}.kafka9-headless.default.svc.cluster.local" + EXTERNAL_HOSTS="kafkadev0.prod.deflect.network kafkadev1.prod.deflect.network kafkadev2.prod.deflect.network" + set -- $EXTERNAL_HOSTS + case "$ORD" in + 0) EXT="$1" ;; + 1) EXT="$2" ;; + 2) EXT="$3" ;; + esac + + # --- MINIMAL file for kafka-storage.sh (controller-only) --- + cat > /config/storage.properties < /config/server.properties </tmp/format.properties < + + 4.0.0 + + baskerville-kafka-streams + baskerville-kafka-streams + 1.0 + + + 2.8.0 + 6.1.0 + 1.8.2 + UTF-8 + + + + + confluent + http://packages.confluent.io/maven/ + + + + + + org.apache.kafka + kafka-clients + ${kafka.version} + + + org.apache.kafka + kafka-streams + ${kafka.version} + + + com.google.code.gson + gson + 2.2.2 + + + + org.slf4j + slf4j-log4j12 + 1.7.21 + + + org.apache.logging.log4j + log4j-api + 2.17.1 + + + org.apache.logging.log4j + log4j-core + 2.17.1 + + + + + + org.apache.maven.plugins + maven-compiler-plugin + 3.5.1 + + 1.8 + 1.8 + + + + org.apache.maven.plugins + maven-shade-plugin + 3.2.4 + + + package + + shade + + + + + uber-${artifactId}-${version} + + + + com.google.cloud.tools + jib-maven-plugin + 0.9.10 + + + equalitie/baskerville_streams:latest + + + ie.equalit.baskerville.streams.stats.StatsFormatter + + + + + + diff --git a/deployment/kafka_stream/src/main/java/ie/equalit/baskerville/streams/stats/Constants.java b/deployment/kafka_stream/src/main/java/ie/equalit/baskerville/streams/stats/Constants.java new file mode 100644 index 00000000..91fd51f9 --- /dev/null +++ b/deployment/kafka_stream/src/main/java/ie/equalit/baskerville/streams/stats/Constants.java @@ -0,0 +1,7 @@ +package ie.equalit.baskerville.streams.stats; + +public class Constants { + public static final String WEBLOG_TOPIC = "STATS_LOGSTASH_WEBLOGS_DICTIONARY_5M"; + public static final String BANJAXLOG_TOPIC = "STATS_LOGSTASH_BANJAX_DICTIONARY_5M"; + public static final String BROKER = 
"kafkab-0.kafkab-headless.default.svc.cluster.local:9093,kafkab-1.kafkab-headless.default.svc.cluster.local:9093,kafkab-2.kafkab-headless.default.svc.cluster.local:9093"; +} diff --git a/deployment/kafka_stream/src/main/java/ie/equalit/baskerville/streams/stats/LoadConfigs.java b/deployment/kafka_stream/src/main/java/ie/equalit/baskerville/streams/stats/LoadConfigs.java new file mode 100644 index 00000000..89f88f4b --- /dev/null +++ b/deployment/kafka_stream/src/main/java/ie/equalit/baskerville/streams/stats/LoadConfigs.java @@ -0,0 +1,38 @@ +package ie.equalit.baskerville.streams.stats; + +import java.io.File; +import java.io.FileInputStream; +import java.io.IOException; +import java.io.InputStream; +import java.nio.file.Files; +import java.nio.file.Paths; +import java.util.Properties; + +/** + * Load configuration from a file. This is mainly intended for connection info, so I can switch between clusters without recompile + * But you can put other client configation here, but we may override it... + */ +public class LoadConfigs { + + // default to cloud, duh + private static final String DEFAULT_CONFIG_File = + System.getProperty("user.home") + File.separator + ".ccloud" + File.separator + "config"; + + static Properties loadConfig() throws IOException { + return loadConfig(DEFAULT_CONFIG_File); + } + + static Properties loadConfig(String configFile) throws IOException { + if (!Files.exists(Paths.get(configFile))) { + throw new RuntimeException(configFile + " does not exist. You need a file with client configuration, " + + "either create one or run `ccloud init` if you are a Confluent Cloud user"); + } + System.out.println("Loading configs from:" + configFile); + final Properties cfg = new Properties(); + try (InputStream inputStream = new FileInputStream(configFile)) { + cfg.load(inputStream); + } + + return cfg; + } +} diff --git a/deployment/kafka_stream/src/main/java/ie/equalit/baskerville/streams/stats/StatsFormatter.java b/deployment/kafka_stream/src/main/java/ie/equalit/baskerville/streams/stats/StatsFormatter.java new file mode 100644 index 00000000..67970755 --- /dev/null +++ b/deployment/kafka_stream/src/main/java/ie/equalit/baskerville/streams/stats/StatsFormatter.java @@ -0,0 +1,121 @@ +package ie.equalit.baskerville.streams.stats; + +import ie.equalit.baskerville.streams.stats.serde.JsonDeserializer; +import ie.equalit.baskerville.streams.stats.serde.JsonSerializer; +import ie.equalit.baskerville.streams.stats.serde.WrapperSerde; +import ie.equalit.baskerville.streams.stats.model.Weblog; +import ie.equalit.baskerville.streams.stats.model.WeblogCorrected; +import ie.equalit.baskerville.streams.stats.model.Banjaxlog; +import ie.equalit.baskerville.streams.stats.model.BanjaxlogCorrected; +import org.apache.kafka.clients.admin.AdminClient; +import org.apache.kafka.clients.admin.DescribeClusterResult; +import org.apache.kafka.clients.consumer.ConsumerConfig; +import org.apache.kafka.common.serialization.Serdes; +import org.apache.kafka.common.utils.Bytes; +import org.apache.kafka.streams.KafkaStreams; +import org.apache.kafka.streams.StreamsConfig; +import org.apache.kafka.clients.producer.ProducerConfig; +import org.apache.kafka.streams.Topology; +import org.apache.kafka.streams.kstream.KStream; +import org.apache.kafka.streams.StreamsBuilder; +import org.apache.kafka.streams.kstream.Produced; +import org.apache.kafka.streams.kstream.Consumed; + +import java.time.Duration; +import java.util.Properties; +import ie.equalit.baskerville.streams.stats.Constants; + +/** + * Input is a 
stream of trades + * Output is two streams: One with minimum and avg "ASK" price for every 10 seconds window + * Another with the top-3 stocks with lowest minimum ask every minute + */ +public class StatsFormatter{ + + public static void main(String[] args) throws Exception { + +// Properties props; +// if (args.length==1) +// props = LoadConfigs.loadConfig(args[0]); +// else +// props = LoadConfigs.loadConfig(); + Properties props = new Properties(); + + props.put(StreamsConfig.APPLICATION_ID_CONFIG, "baskerville-logstash-stats"); +// props.put(StreamsConfig.DEFAULT_KEY_SERDE_CLASS_CONFIG, Serdes.String().getClass().getName()); +// props.put(StreamsConfig.DEFAULT_VALUE_SERDE_CLASS_CONFIG, WeblogSerde.class.getName()); + + props.put(StreamsConfig.BOOTSTRAP_SERVERS_CONFIG, Constants.BROKER); + props.put(StreamsConfig.PRODUCER_PREFIX + ProducerConfig.MAX_REQUEST_SIZE_CONFIG, "10000000"); + + + // setting offset reset to earliest so that we can re-run the demo code with the same pre-loaded data + // Note: To re-run the demo, you need to use the offset reset tool: + // https://cwiki.apache.org/confluence/display/KAFKA/Kafka+Streams+Application+Reset+Tool +// props.put(ConsumerConfig.AUTO_OFFSET_RESET_CONFIG, "earliest"); + + // creating an AdminClient and checking the number of brokers in the cluster, so I'll know how many replicas we want... + + AdminClient ac = AdminClient.create(props); + DescribeClusterResult dcr = ac.describeCluster(); + int clusterSize = dcr.nodes().get().size(); + + if (clusterSize<3) + props.put("replication.factor",clusterSize); + else + props.put("replication.factor",3); + + StreamsBuilder builder = new StreamsBuilder(); + + KStream sourceWeblog = builder.stream(Constants.WEBLOG_TOPIC, + Consumed.with(Serdes.String(), new WeblogSerde())); + KStream statsWeblog = sourceWeblog + .mapValues((weblog) -> new WeblogCorrected(weblog)); + statsWeblog.to("STATS_WEBLOGS_5M", Produced.with(Serdes.String(), new WeblogCorrectedSerde())); + + + KStream sourceBanjaxlog = builder.stream(Constants.BANJAXLOG_TOPIC, + Consumed.with(Serdes.String(), new BanjaxlogSerde())); + KStream statsBanjax = sourceBanjaxlog + .mapValues((banjaxlog) -> new BanjaxlogCorrected(banjaxlog)); + statsBanjax.to("STATS_BANJAX_5M", Produced.with(Serdes.String(), new BanjaxlogCorrectedSerde())); + + Topology topology = builder.build(); + + KafkaStreams streams = new KafkaStreams(topology, props); + + System.out.println(topology.describe()); + + streams.cleanUp(); + + streams.start(); + + // Add shutdown hook to respond to SIGTERM and gracefully close Kafka Streams + Runtime.getRuntime().addShutdownHook(new Thread(streams::close)); + + } + + static public final class WeblogSerde extends WrapperSerde { + public WeblogSerde() { + super(new JsonSerializer(), new JsonDeserializer(Weblog.class)); + } + } + + static public final class WeblogCorrectedSerde extends WrapperSerde { + public WeblogCorrectedSerde() { + super(new JsonSerializer(), new JsonDeserializer(WeblogCorrected.class)); + } + } + + static public final class BanjaxlogSerde extends WrapperSerde { + public BanjaxlogSerde() { + super(new JsonSerializer(), new JsonDeserializer(Banjaxlog.class)); + } + } + + static public final class BanjaxlogCorrectedSerde extends WrapperSerde { + public BanjaxlogCorrectedSerde() { + super(new JsonSerializer(), new JsonDeserializer(BanjaxlogCorrected.class)); + } + } + } diff --git a/deployment/kafka_stream/src/main/java/ie/equalit/baskerville/streams/stats/model/Banjaxlog.java 
b/deployment/kafka_stream/src/main/java/ie/equalit/baskerville/streams/stats/model/Banjaxlog.java new file mode 100644 index 00000000..9bc1764c --- /dev/null +++ b/deployment/kafka_stream/src/main/java/ie/equalit/baskerville/streams/stats/model/Banjaxlog.java @@ -0,0 +1,42 @@ +package ie.equalit.baskerville.streams.stats.model; + +import java.util.ArrayList; +import java.util.HashMap; + + +public class Banjaxlog { + + String host; + ArrayList client_ip; + HashMap country_codes; + HashMap target_url; + Long uniquebots; + String window_end; + + public Banjaxlog( + String host, + ArrayList client_ip, + HashMap country_codes, + HashMap target_url, + Long uniquebots, + String window_end + ) { + this.host = host; + this.client_ip = client_ip; + this.uniquebots = uniquebots; + this.country_codes = country_codes; + this.target_url = target_url; + this.window_end = window_end; + } + + @Override + public String toString() { + return "BanjaxlogStat {" + + "HOST='" + this.host + + ", CLIENT_IP='" + this.client_ip + + ", UNIQUEBOTS='" + this.uniquebots + + ", country_codes='" + this.country_codes + + ", TARGET_URL='" + this.target_url + + ", WINDOW_END='" + this.window_end; + } +} \ No newline at end of file diff --git a/deployment/kafka_stream/src/main/java/ie/equalit/baskerville/streams/stats/model/BanjaxlogCorrected.java b/deployment/kafka_stream/src/main/java/ie/equalit/baskerville/streams/stats/model/BanjaxlogCorrected.java new file mode 100644 index 00000000..130cce65 --- /dev/null +++ b/deployment/kafka_stream/src/main/java/ie/equalit/baskerville/streams/stats/model/BanjaxlogCorrected.java @@ -0,0 +1,39 @@ +package ie.equalit.baskerville.streams.stats.model; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Map; +import ie.equalit.baskerville.streams.stats.model.Banjaxlog; + +public class BanjaxlogCorrected { + String hostname; + ArrayList client_ip; + Long uniquebots; + String window_end; + + ArrayList target_url; + ArrayList country_codes; + + private ArrayList correct_map(HashMap original){ + ArrayList result = new ArrayList(); + for (Map.Entry entry : original.entrySet()) { + HashMap item = new HashMap(); + item.put("key", entry.getKey()); + item.put("doc_count", Integer.parseInt(entry.getValue())); + result.add(item); + } + return result; + } + + + public BanjaxlogCorrected(Banjaxlog original) { + this.hostname = original.host; + this.client_ip = original.client_ip; + this.uniquebots = original.uniquebots; + this.window_end = original.window_end; + + this.target_url = this.correct_map(original.target_url); + this.country_codes = this.correct_map(original.country_codes); + } + +} \ No newline at end of file diff --git a/deployment/kafka_stream/src/main/java/ie/equalit/baskerville/streams/stats/model/Weblog.java b/deployment/kafka_stream/src/main/java/ie/equalit/baskerville/streams/stats/model/Weblog.java new file mode 100644 index 00000000..9ad5f632 --- /dev/null +++ b/deployment/kafka_stream/src/main/java/ie/equalit/baskerville/streams/stats/model/Weblog.java @@ -0,0 +1,79 @@ +package ie.equalit.baskerville.streams.stats.model; + +import java.util.ArrayList; +import java.util.HashMap; + + +public class Weblog { + + String host; + Long allbytes; + Long cachedbytes; + Long allhits; + Long cachedhits; + ArrayList client_ip; + HashMap country_codes; + HashMap client_url; + HashMap viewed_pages; + Long viewed_page_count; + HashMap ua; + HashMap http_code; + String window_end; + HashMap content_type; + HashMap utm_source; + HashMap utm_medium; + HashMap utm_campaign; + + 
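    // These fields match, column for column, the per-host 5-minute aggregates defined in deployment/ksql
    // (the {}WEBLOGS_DICTIONARY_5M table): COLLECT_SET(client_ip) arrives as a list, each HISTOGRAM(...)
    // arrives as a map of value -> count, and window_end comes from TIMESTAMPTOSTRING(WINDOWEND, ...).
    // The *Corrected models reshape every histogram map into a list of {key, doc_count} entries, parsing
    // the counts as integers; e.g. an illustrative {"CA": "7", "US": "3"} would become
    // [{"key": "CA", "doc_count": 7}, {"key": "US", "doc_count": 3}] (WeblogCorrected also drops empty keys).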
public Weblog( + String host, + Long allbytes, + Long cachedbytes, + Long allhits, + Long cachedhits, + ArrayList client_ip, + HashMap country_codes, + HashMap client_url, + HashMap viewed_pages, + Long viewed_page_count, + HashMap ua, + HashMap http_code, + HashMap content_type, + HashMap utm_source, + HashMap utm_medium, + HashMap utm_campaign, + String window_end + ) { + this.host = host; + this.allbytes = allbytes; + this.cachedbytes = cachedbytes; + this.allhits = allhits; + this.cachedhits= cachedhits; + this.client_ip = client_ip; + this.country_codes = country_codes; + this.client_url = client_url; + this.viewed_pages = viewed_pages; + this.viewed_page_count = viewed_page_count; + this.ua = ua; + this.http_code = http_code; + this.window_end = window_end; + this.content_type = content_type; + this.utm_source = utm_source; + this.utm_medium = utm_medium; + this.utm_campaign = utm_campaign; + } + + @Override + public String toString() { + return "WeblogStat {" + + "host='" + this.host + + ", allbytes='" + this.allbytes + + ", cachedbytes='" + this.cachedbytes + + ", client_ip='" + this.client_ip + + ", country_codes='" + this.country_codes + + ", client_url='" + this.client_url + + ", viewed_pages='" + this.viewed_pages + + ", viewed_page_count='" + this.viewed_page_count + + ", ua='" + this.ua + + ", http_code='" + this.http_code; + } +} \ No newline at end of file diff --git a/deployment/kafka_stream/src/main/java/ie/equalit/baskerville/streams/stats/model/WeblogCorrected.java b/deployment/kafka_stream/src/main/java/ie/equalit/baskerville/streams/stats/model/WeblogCorrected.java new file mode 100644 index 00000000..1f773bec --- /dev/null +++ b/deployment/kafka_stream/src/main/java/ie/equalit/baskerville/streams/stats/model/WeblogCorrected.java @@ -0,0 +1,64 @@ +package ie.equalit.baskerville.streams.stats.model; + +import java.util.ArrayList; +import java.util.HashMap; +import java.util.Map; +import ie.equalit.baskerville.streams.stats.model.Weblog; + +public class WeblogCorrected { + String hostname; + Long allbytes; + Long cachedbytes; + Long allhits; + Long cachedhits; + ArrayList client_ip; + Long viewed_page_count; + String window_end; + + ArrayList ua; + ArrayList country_codes; + ArrayList client_url; + ArrayList viewed_pages; + ArrayList http_code; + ArrayList content_type; + ArrayList utm_source; + ArrayList utm_medium; + ArrayList utm_campaign; + + private ArrayList correct_map(HashMap original){ + ArrayList result = new ArrayList(); + for (Map.Entry entry : original.entrySet()) { + if(entry.getKey().length() == 0) + continue; + HashMap item = new HashMap(); + item.put("key", entry.getKey()); + item.put("doc_count", Integer.parseInt(entry.getValue())); + result.add(item); + } + return result; + } + + + public WeblogCorrected(Weblog original) { + this.hostname = original.host; + this.allbytes = original.allbytes; + this.cachedbytes = original.cachedbytes; + this.allhits = original.allhits; + this.cachedhits= original.cachedhits; + this.client_ip = original.client_ip; + this.viewed_page_count = original.viewed_page_count; + this.window_end = original.window_end; + + this.ua = this.correct_map(original.ua); + this.country_codes = this.correct_map(original.country_codes); + this.client_url = this.correct_map(original.client_url); + this.viewed_pages = this.correct_map(original.viewed_pages); + this.http_code = this.correct_map(original.http_code); + + this.content_type = this.correct_map(original.content_type); + this.utm_source = this.correct_map(original.utm_source); + 
this.utm_medium = this.correct_map(original.utm_medium); + this.utm_campaign = this.correct_map(original.utm_campaign); + } + +} \ No newline at end of file diff --git a/deployment/kafka_stream/src/main/java/ie/equalit/baskerville/streams/stats/serde/JsonDeserializer.java b/deployment/kafka_stream/src/main/java/ie/equalit/baskerville/streams/stats/serde/JsonDeserializer.java new file mode 100644 index 00000000..6b100df9 --- /dev/null +++ b/deployment/kafka_stream/src/main/java/ie/equalit/baskerville/streams/stats/serde/JsonDeserializer.java @@ -0,0 +1,43 @@ +package ie.equalit.baskerville.streams.stats.serde; + + +import com.google.gson.Gson; +import org.apache.kafka.common.serialization.Deserializer; + +import java.util.Map; + +public class JsonDeserializer implements Deserializer { + + private Gson gson = new Gson(); + private Class deserializedClass; + + public JsonDeserializer(Class deserializedClass) { + this.deserializedClass = deserializedClass; + } + + public JsonDeserializer() { + } + + @Override + @SuppressWarnings("unchecked") + public void configure(Map map, boolean b) { + if(deserializedClass == null) { + deserializedClass = (Class) map.get("serializedClass"); + } + } + + @Override + public T deserialize(String s, byte[] bytes) { + if(bytes == null){ + return null; + } + + return gson.fromJson(new String(bytes),deserializedClass); + + } + + @Override + public void close() { + + } +} \ No newline at end of file diff --git a/deployment/kafka_stream/src/main/java/ie/equalit/baskerville/streams/stats/serde/JsonSerializer.java b/deployment/kafka_stream/src/main/java/ie/equalit/baskerville/streams/stats/serde/JsonSerializer.java new file mode 100644 index 00000000..52786780 --- /dev/null +++ b/deployment/kafka_stream/src/main/java/ie/equalit/baskerville/streams/stats/serde/JsonSerializer.java @@ -0,0 +1,27 @@ +package ie.equalit.baskerville.streams.stats.serde; + +import com.google.gson.Gson; +import org.apache.kafka.common.serialization.Serializer; + +import java.nio.charset.Charset; +import java.util.Map; + +public class JsonSerializer implements Serializer { + + private Gson gson = new Gson(); + + @Override + public void configure(Map map, boolean b) { + + } + + @Override + public byte[] serialize(String topic, T t) { + return gson.toJson(t).getBytes(Charset.forName("UTF-8")); + } + + @Override + public void close() { + + } +} diff --git a/deployment/kafka_stream/src/main/java/ie/equalit/baskerville/streams/stats/serde/WrapperSerde.java b/deployment/kafka_stream/src/main/java/ie/equalit/baskerville/streams/stats/serde/WrapperSerde.java new file mode 100644 index 00000000..7b7eeb27 --- /dev/null +++ b/deployment/kafka_stream/src/main/java/ie/equalit/baskerville/streams/stats/serde/WrapperSerde.java @@ -0,0 +1,41 @@ +package ie.equalit.baskerville.streams.stats.serde; + +import org.apache.kafka.common.serialization.Deserializer; +import org.apache.kafka.common.serialization.Serde; +import org.apache.kafka.common.serialization.Serializer; + +import java.util.Map; + + +public class WrapperSerde implements Serde { + + final private Serializer serializer; + final private Deserializer deserializer; + + public WrapperSerde(Serializer serializer, Deserializer deserializer) { + this.serializer = serializer; + this.deserializer = deserializer; + } + + @Override + public void configure(Map configs, boolean isKey) { + serializer.configure(configs, isKey); + deserializer.configure(configs, isKey); + } + + @Override + public void close() { + serializer.close(); + deserializer.close(); + } + + 
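    // WrapperSerde pairs the Gson-based JsonSerializer/JsonDeserializer into a single Kafka Serde.
    // The concrete WeblogSerde/BanjaxlogSerde classes in StatsFormatter pass the target class to the
    // deserializer directly; configure() can also pick it up from a "serializedClass" entry in the
    // streams configuration if it was not set.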
@Override + public Serializer serializer() { + return serializer; + } + + @Override + public Deserializer deserializer() { + return deserializer; + } +} diff --git a/deployment/kafkab/README.md b/deployment/kafkab/README.md new file mode 100644 index 00000000..7bc201eb --- /dev/null +++ b/deployment/kafkab/README.md @@ -0,0 +1,35 @@ +``` + +NS=default + +# KRaft cluster id (pick one, keep forever for this cluster) +CLUSTER_ID=$(kubectl -n $NS run gen-kid --rm -i --restart=Never \ + --image=docker.io/bitnamilegacy/kafka:4.0.0-debian-12-r10 --quiet -- \ + /opt/bitnami/kafka/bin/kafka-storage.sh random-uuid) +kubectl -n $NS create secret generic kafkab-kraft \ + --from-literal=cluster-id="$CLUSTER_ID" \ + --dry-run=client -o yaml | kubectl apply -f - + + +# Your PEMs +kubectl -n $NS create secret generic kafka-pem \ + --from-file=tls.crt=certificate.pem \ + --from-file=tls.key=key.pem \ + --from-file=ca.crt=caroot.pem + +PW='xxx' +kubectl -n $NS create secret generic kafkab-tls-passwords \ + --from-literal=keystore-password="$PW" \ + --from-literal=truststore-password="$PW" \ + --dry-run=client -o yaml | kubectl apply -f - + + +``` + + +```commandline +kubectl -n $NS apply -f kafkab.yaml +``` + + + diff --git a/deployment/kafkab/kafkab.yaml b/deployment/kafkab/kafkab.yaml new file mode 100644 index 00000000..bda09f7d --- /dev/null +++ b/deployment/kafkab/kafkab.yaml @@ -0,0 +1,259 @@ +apiVersion: v1 +kind: Service +metadata: + name: kafkab-headless + namespace: default + labels: { app: kafkab } +spec: + clusterIP: None + publishNotReadyAddresses: true + selector: { app: kafkab } + ports: + - { name: controller, port: 9091, targetPort: 9091 } + - { name: internal, port: 9092, targetPort: 9092 } + - { name: client, port: 9093, targetPort: 9093 } + - { name: external, port: 9094, targetPort: 9094 } +--- +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: kafkab + namespace: default + labels: { app: kafkab } +spec: + serviceName: kafkab-headless + replicas: 3 + podManagementPolicy: Parallel + selector: + matchLabels: { app: kafkab } + template: + metadata: + labels: { app: kafkab } + spec: + terminationGracePeriodSeconds: 30 + securityContext: + runAsUser: 1001 + runAsGroup: 1001 + fsGroup: 1001 + fsGroupChangePolicy: "OnRootMismatch" + initContainers: + - name: fix-perms + image: busybox:1.36 + securityContext: + runAsUser: 0 + command: ["sh","-c"] + args: + - | + mkdir -p /bitnami/kafka/data + chown -R 1001:1001 /bitnami/kafka + chmod -R g+rwX /bitnami/kafka + volumeMounts: + - { name: data, mountPath: /bitnami/kafka } + # 1) Convert your PEMs -> JKS into /certs + - name: pem-to-jks + image: docker.io/bitnamilegacy/kafka:4.0.0-debian-12-r10 + command: ["/bin/bash","-ec"] + args: + - | + set -euo pipefail + test -f /mounted-certs/tls.crt + test -f /mounted-certs/tls.key + test -f /mounted-certs/ca.crt + openssl pkcs12 -export \ + -in /mounted-certs/tls.crt \ + -inkey /mounted-certs/tls.key \ + -name kafka \ + -out /certs/kafka.p12 \ + -passout pass:${KEYPASS} + keytool -importkeystore -noprompt \ + -srckeystore /certs/kafka.p12 -srcstoretype PKCS12 -srcstorepass "${KEYPASS}" \ + -destkeystore /certs/kafka.keystore.jks -deststorepass "${KEYPASS}" + keytool -import -noprompt -alias CARoot \ + -file /mounted-certs/ca.crt \ + -keystore /certs/kafka.truststore.jks -storepass "${KEYPASS}" + rm -f /certs/kafka.p12 + env: + - name: KEYPASS + valueFrom: { secretKeyRef: { name: kafkab-tls-passwords, key: keystore-password } } + volumeMounts: + - { name: pem, mountPath: /mounted-certs, readOnly: 
true } + - { name: certs, mountPath: /certs } + # 2) Write a complete server.properties the broker will use + - name: render-config + image: docker.io/bitnamilegacy/kafka:4.0.0-debian-12-r10 + command: ["/bin/bash","-ec"] + args: + - | + set -euo pipefail + + # derive ordinal and per-pod hostnames + ORD="${MY_POD_NAME##*-}" + FQDN_BASE="kafkab-${ORD}.kafkab-headless.default.svc.cluster.local" + EXTERNAL_HOSTS="kafkab0.prod.deflect.network kafkab1.prod.deflect.network kafkab2.prod.deflect.network" + set -- $EXTERNAL_HOSTS + case "$ORD" in + 0) EXT="$1" ;; + 1) EXT="$2" ;; + 2) EXT="$3" ;; + esac + + # --- MINIMAL file for kafka-storage.sh (controller-only) --- + cat > /config/storage.properties < /config/server.properties </tmp/format.properties <, + http_response_code VARCHAR, + datestamp VARCHAR, + reply_length_bytes BIGINT, + geoip STRUCT, + cache_result VARCHAR, + content_type VARCHAR, + disable_logging BIGINT +) WITH ( + kafka_topic = 'logstash_deflect.log', + partitions = 3, + value_format = 'json', + timestamp = 'datestamp', + timestamp_format = 'yyyy-MM-dd''T''HH:mm:ss.SSS''Z''' +); + """, + """ +CREATE STREAM {}BANJAX_SCHEMA ( + client_request_host VARCHAR, + client_ip VARCHAR, + action VARCHAR, + client_url VARCHAR, + user_agent STRUCT, + geoip STRUCT, + datestamp VARCHAR, + disable_logging BIGINT +) WITH ( + kafka_topic = 'logstash_banjax', + partitions = 3, + value_format = 'json', + timestamp = 'datestamp', + timestamp_format = 'yyyy-MM-dd''T''HH:mm:ss.SSS''Z''' +); + """ +] + +streams = [ + """ +CREATE STREAM {}WEBLOGS_WWW AS + SELECT + REPLACE(client_request_host, 'www.', '') as host_no_www, + client_url, + REGEXP_REPLACE(content_type, ';.*', '') as content_type, + CASE WHEN (querystring like '%utm_source=%') then + REGEXP_REPLACE(REGEXP_REPLACE(querystring, '.*(?=utm_source=)utm_source=', ''), '&.*', '') + ELSE + '' + END AS utm_source, + CASE WHEN (querystring like '%utm_campaign=%') then + REGEXP_REPLACE(REGEXP_REPLACE(querystring, '.*(?=utm_campaign=)utm_campaign=', ''), '&.*', '') + ELSE + '' + END AS utm_campaign, + CASE WHEN (querystring like '%utm_medium=%') then + REGEXP_REPLACE(REGEXP_REPLACE(querystring, '.*(?=utm_medium=)utm_medium=', ''), '&.*', '') + ELSE + '' + END AS utm_medium, + CASE + WHEN (http_response_code = '200' or http_response_code = '304') + and ( + content_type = 'text/html' or + content_type = 'text/plain' or + content_type = 'application/pdf' or + content_type = 'application/msword' or + content_type = 'text/html; charset=utf-8' or + content_type = 'text/plain; charset=utf-8' or + content_type = 'application/pdf; charset=utf-8' or + content_type = 'application/msword; charset=utf-8' or + content_type = '-' or + content_type = 'text/html; charset=UTF-8' or + content_type = 'text/plain; charset=UTF-8' or + content_type = 'application/pdf; charset=UTF-8' or + content_type = 'application/msword; charset=UTF-8') + THEN +REGEXP_REPLACE(client_url,'/(robots.txt|xmlrpc.php|10k|.*(jpeg|js|jpg|ico|css|json|png|gif|class|bmp|rss|xml|swf))', '') + ELSE + '' + END as client_url_filtered, + + datestamp, + reply_length_bytes, + + CASE WHEN (geoip is not NULL) then + geoip->country_code2 + ELSE + '' + END AS country_code, + + client_ip, + + CASE WHEN (user_agent is not NULL) then + user_agent->name + ELSE + '' + END AS client_ua, + + http_response_code, + CASE + WHEN + cache_result = 'HIT' or + cache_result = 'STALE' or + cache_result = 'UPDATING' or + cache_result = 'REVALIDATED' + THEN 1 + ELSE 0 + END AS cached + FROM {}WEBLOGS_SCHEMA + WHERE disable_logging 
<> 1; + """, + """ +CREATE STREAM {}WEBLOGS + WITH (PARTITIONS=3) AS + SELECT * + FROM {}WEBLOGS_WWW + PARTITION BY host_no_www; + """, + """ +CREATE STREAM {}BANJAX_WWW AS + SELECT + REPLACE(client_request_host, 'www.', '') as host_no_www, + client_ip, + client_url, + user_agent->name as ua_name, + geoip->country_code2 as country_code + FROM {}BANJAX_SCHEMA + WHERE (action = 'NginxBlock' or action = 'IptablesBlock') + and (disable_logging is NULL or disable_logging <> 1); + """, + """ +CREATE STREAM {}BANJAX_PARTITIONED + WITH (PARTITIONS=3) AS + SELECT * + FROM {}BANJAX_WWW + PARTITION BY host_no_www; + """, + # @@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@ UNIQUE @@@@@@@@@@@@@@@@@@ + """ +CREATE TABLE {}BANJAX_UNIQUE_TABLE AS + SELECT + host_no_www, + country_code, + client_ip, + client_url, + EARLIEST_BY_OFFSET(host_no_www) AS host2, + EARLIEST_BY_OFFSET(client_ip) as client_ip2, + EARLIEST_BY_OFFSET(country_code) as country_code2, + EARLIEST_BY_OFFSET(client_url) as client_url2, + COUNT(client_ip) as ip_count, + TIMESTAMPTOSTRING(WINDOWEND, 'yyy-MM-dd HH:mm:ss', 'UTC') as window_end + FROM {}BANJAX_PARTITIONED + WINDOW TUMBLING (SIZE 5 MINUTES) + GROUP BY host_no_www, country_code, client_ip, client_url; + """, + """ + CREATE STREAM {}BANJAX_UNIQUE_SCHEMA + ( + host2 VARCHAR, + client_ip2 VARCHAR, + country_code2 VARCHAR, + client_url2 VARCHAR, + ip_count INTEGER +) WITH ( + kafka_topic = '{}BANJAX_UNIQUE_TABLE', + partitions = 3, + value_format = 'json' +); + """, + """ + CREATE STREAM {}BANJAX_UNIQUE AS + SELECT + host2, + client_ip2, + country_code2, + client_url2 + FROM {}BANJAX_UNIQUE_SCHEMA + WHERE IP_COUNT = 1 + PARTITION BY host2; + """ +] + +minimum_queries = [ + """ + CREATE TABLE {}WEBLOGS_DICTIONARY_5M AS + SELECT host_no_www, EARLIEST_BY_OFFSET(host_no_www) as `host`, + sum (reply_length_bytes) as `allbytes`, + sum (cached*reply_length_bytes) as `cachedbytes`, + count (*) as `allhits`, + sum(cached) as `cachedhits`, + COLLECT_SET (client_ip) as `client_ip`, + HISTOGRAM (country_code) as `country_codes`, + HISTOGRAM (client_url) as `client_url`, + HISTOGRAM (client_url_filtered) as `viewed_pages`, + COUNT(client_url_filtered) as `viewed_page_count`, + HISTOGRAM (client_ua) as `ua`, + HISTOGRAM (http_response_code) as `http_code`, + HISTOGRAM (content_type) as `content_type`, + HISTOGRAM (utm_source) as `utm_source`, + HISTOGRAM (utm_campaign) as `utm_campaign`, + HISTOGRAM (utm_medium) as `utm_medium`, + TIMESTAMPTOSTRING(WINDOWEND, 'yyy-MM-dd HH:mm:ss', 'UTC') as `window_end` + FROM {}WEBLOGS + WINDOW TUMBLING (SIZE 5 MINUTES) + GROUP BY host_no_www; + """, + """ + CREATE TABLE {}BANJAX_DICTIONARY_5M AS + SELECT host2, EARLIEST_BY_OFFSET(host2) as `host`, + COLLECT_SET (client_ip2) as `client_ip`, + HISTOGRAM (country_code2) as `country_codes`, + HISTOGRAM (client_url2) as `target_url`, + COUNT_DISTINCT (client_ip2) as `uniquebots`, + TIMESTAMPTOSTRING(WINDOWEND, 'yyy-MM-dd HH:mm:ss', 'UTC') as `window_end` + FROM {}BANJAX_UNIQUE + WINDOW TUMBLING (SIZE 5 MINUTES) + GROUP BY host2; + """ +] + + +for q in schemas: + print(q.format(prefix)) + +for q in streams: + print(q.format(prefix, prefix)) + +for q in minimum_queries: + print(q.format(prefix, prefix)) diff --git a/deployment/ksql/queries.sql b/deployment/ksql/queries.sql new file mode 100644 index 00000000..28977f02 --- /dev/null +++ b/deployment/ksql/queries.sql @@ -0,0 +1,173 @@ +CREATE STREAM STATS_WEBLOGS_SCHEMA ( + client_request_host VARCHAR, + client_ip VARCHAR, + client_url VARCHAR, + client_ua VARCHAR, + 
http_response_code VARCHAR, + datestamp VARCHAR, + reply_length_bytes BIGINT, + geoip STRUCT, + cache_result VARCHAR, + content_type VARCHAR +) WITH ( + kafka_topic = 'deflect.log', + partitions = 3, + value_format = 'json', + timestamp = 'datestamp', + timestamp_format = 'dd/LLL/yyyy:HH:mm:ss ZZZ' +); + + +CREATE STREAM STATS_BANJAX_SCHEMA ( + http_host VARCHAR, + client_ip VARCHAR, + action VARCHAR, + uripath VARCHAR, + user_agent STRUCT, + geoip STRUCT, + datestamp VARCHAR +) WITH ( + kafka_topic = 'banjax', + partitions = 3, + value_format = 'json', + timestamp = 'datestamp', + timestamp_format = '''[''yyyy-MM-dd''T''HH:mm:ss'']''' +); + + +CREATE STREAM STATS_WEBLOGS_WWW AS + SELECT + REPLACE(client_request_host, 'www.', '') as host_no_www, + client_url, + CASE + WHEN (http_response_code = '200' or http_response_code = '304') + and ( + content_type = 'text/html; charset=utf-8' or + content_type = 'text/plain; charset=utf-8' or + content_type = 'application/pdf; charset=utf-8' or + content_type = 'application/msword; charset=utf-8' or + content_type = '-' or + content_type = 'text/html; charset=UTF-8' or + content_type = 'text/plain; charset=UTF-8' or + content_type = 'application/pdf; charset=UTF-8' or + content_type = 'application/msword; charset=UTF-8') + THEN +REGEXP_REPLACE(client_url,'/(robots.txt|xmlrpc.php|10k|.*(jpeg|js|jpg|ico|css|json|png|gif|class|bmp|rss|xml|swf))', '') + ELSE + '' + END as client_url_filtered, + + datestamp, + reply_length_bytes, + geoip->country_code2 as country_code, + client_ip, + client_ua, + http_response_code, + CASE + WHEN + cache_result = 'HIT' or + cache_result = 'STALE' or + cache_result = 'UPDATING' or + cache_result = 'REVALIDATED' + THEN 1 + ELSE 0 + END AS cached + FROM STATS_WEBLOGS_SCHEMA; + + +CREATE STREAM STATS_WEBLOGS + WITH (PARTITIONS=3) AS + SELECT * + FROM STATS_WEBLOGS_WWW + PARTITION BY host_no_www; + + +CREATE STREAM STATS_BANJAX_WWW AS + SELECT + REPLACE(http_host, 'www.', '') as host_no_www, + client_ip, + CASE WHEN uripath IS null THEN ' ' ELSE uripath END as uripath, + user_agent->name as ua_name, + geoip->country_code2 as country_code + FROM STATS_BANJAX_SCHEMA + WHERE action = 'banned'; + + +CREATE STREAM STATS_BANJAX_PARTITIONED + WITH (PARTITIONS=3) AS + SELECT * + FROM STATS_BANJAX_WWW + PARTITION BY host_no_www; + + +CREATE TABLE STATS_BANJAX_UNIQUE_TABLE AS + SELECT + host_no_www, + country_code, + client_ip, + uripath, + EARLIEST_BY_OFFSET(host_no_www) AS host2, + EARLIEST_BY_OFFSET(client_ip) as client_ip2, + EARLIEST_BY_OFFSET(country_code) as country_code2, + EARLIEST_BY_OFFSET(uripath) as uripath2, + COUNT(client_ip) as ip_count, + TIMESTAMPTOSTRING(WINDOWEND, 'yyy-MM-dd HH:mm:ss', 'UTC') as window_end + FROM STATS_BANJAX_PARTITIONED + WINDOW TUMBLING (SIZE 5 MINUTES) + GROUP BY host_no_www, country_code, client_ip, uripath; + + + CREATE STREAM STATS_BANJAX_UNIQUE_SCHEMA + ( + host2 VARCHAR, + client_ip2 VARCHAR, + country_code2 VARCHAR, + uripath2 VARCHAR, + ip_count INTEGER +) WITH ( + kafka_topic = 'STATS_BANJAX_UNIQUE_TABLE', + partitions = 3, + value_format = 'json' +); + + + CREATE STREAM STATS_BANJAX_UNIQUE AS + SELECT + host2, + client_ip2, + country_code2, + uripath2 + FROM STATS_BANJAX_UNIQUE_SCHEMA + WHERE IP_COUNT = 1 + PARTITION BY host2; + + + CREATE TABLE STATS_WEBLOGS_5M AS + SELECT host_no_www, EARLIEST_BY_OFFSET(host_no_www) as host, + sum (reply_length_bytes) as allbytes, + sum (cached*reply_length_bytes) as cachedbytes, + count (*) as allhits, + sum(cached) as cachedhits, + COLLECT_SET 
(client_ip) as client_ip, + HISTOGRAM (country_code) as country_codes, + HISTOGRAM (client_url) as client_url, + HISTOGRAM (client_url_filtered) as viewed_pages, + COUNT(client_url_filtered) as viewed_page_count, + HISTOGRAM (client_ua) as ua, + HISTOGRAM (http_response_code) as http_code, + TIMESTAMPTOSTRING(WINDOWEND, 'yyy-MM-dd HH:mm:ss', 'UTC') as window_end + FROM STATS_WEBLOGS + WINDOW TUMBLING (SIZE 5 MINUTES) + GROUP BY host_no_www; + + + CREATE TABLE STATS_BANJAX_5M AS + SELECT host2, EARLIEST_BY_OFFSET(host2) as host, + COLLECT_SET (client_ip2) as client_ip, + HISTOGRAM (country_code2) as country_codes, + HISTOGRAM (uripath2) as target_url, + COUNT_DISTINCT (client_ip2) as uniquebots, + TIMESTAMPTOSTRING(WINDOWEND, 'yyy-MM-dd HH:mm:ss', 'UTC') as window_end + FROM STATS_BANJAX_UNIQUE + WINDOW TUMBLING (SIZE 5 MINUTES) + GROUP BY host2; \ No newline at end of file diff --git a/deployment/ksql/values-ksql-registry.yaml b/deployment/ksql/values-ksql-registry.yaml new file mode 100644 index 00000000..f55ef721 --- /dev/null +++ b/deployment/ksql/values-ksql-registry.yaml @@ -0,0 +1,3 @@ + +kafka: + bootstrapServers: "kafkab-0.kafkab-headless.default.svc.cluster.local:9093,kafkab-1.kafkab-headless.default.svc.cluster.local:9093,kafkab-2.kafkab-headless.default.svc.cluster.local:9093" \ No newline at end of file diff --git a/deployment/ksql/values-ksql.yaml b/deployment/ksql/values-ksql.yaml new file mode 100644 index 00000000..f04a11bb --- /dev/null +++ b/deployment/ksql/values-ksql.yaml @@ -0,0 +1,18 @@ +replicaCount: 3 + +kafka: + bootstrapServers: "kafkab-0.kafkab-headless.default.svc.cluster.local:9093,kafkab-1.kafkab-headless.default.svc.cluster.local:9093,kafkab-2.kafkab-headless.default.svc.cluster.local:9093" + +cp-schema-registry: + url: "http://ksql-schema-registry-cp-schema-registry:8081" + +ksql: + headless: false + +resources: + requests: + cpu: 3000m + memory: 1G + +configurationOverrides: {} +# "ksql.streams.producer.retries": "2147483647" \ No newline at end of file diff --git a/deployment/ksql/values-ksql9.yaml b/deployment/ksql/values-ksql9.yaml new file mode 100644 index 00000000..bfb092f3 --- /dev/null +++ b/deployment/ksql/values-ksql9.yaml @@ -0,0 +1,18 @@ +replicaCount: 3 + +kafka: + bootstrapServers: "kafka9-0.kafka9-headless.default.svc.cluster.local:9093,kafka9-1.kafka9-headless.default.svc.cluster.local:9093,kafka9-2.kafka9-headless.default.svc.cluster.local:9093" + +cp-schema-registry: + url: "http://ksql-schema-registry-cp-schema-registry:8081" + +ksql: + headless: false + +resources: + requests: + cpu: 3000m + memory: 1G + +configurationOverrides: {} +# "ksql.streams.producer.retries": "2147483647" \ No newline at end of file diff --git a/deployment/logstash/Dockerfile b/deployment/logstash/Dockerfile new file mode 100644 index 00000000..aed0651f --- /dev/null +++ b/deployment/logstash/Dockerfile @@ -0,0 +1,2 @@ +FROM busybox +COPY GeoLite2-ASN.mmdb /data/GeoLite2-ASN.mmdb diff --git a/deployment/logstash/Dockerfile_logstash b/deployment/logstash/Dockerfile_logstash new file mode 100644 index 00000000..53dff54d --- /dev/null +++ b/deployment/logstash/Dockerfile_logstash @@ -0,0 +1,4 @@ +FROM docker.elastic.co/logstash/logstash-oss:7.16.3 + +# Optional: copy GeoIP DB if needed +# COPY GeoLite2-ASN.mmdb /usr/share/logstash/geoip/GeoLite2-ASN.mmdb diff --git a/deployment/logstash/asn-loader.yaml b/deployment/logstash/asn-loader.yaml new file mode 100644 index 00000000..ebcc420e --- /dev/null +++ b/deployment/logstash/asn-loader.yaml @@ -0,0 +1,17 @@ 
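# Utility pod for loading the GeoLite2 ASN database into the geoip-asn PVC, e.g. with
# `kubectl cp GeoLite2-ASN.mmdb default/geoip-asn-loader:/mnt/geoip/` (illustrative command);
# the container only sleeps so the volume stays mounted while files are copied in.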
+apiVersion: v1 +kind: Pod +metadata: + name: geoip-asn-loader +spec: + restartPolicy: Never + containers: + - name: loader + image: busybox + command: ["/bin/sh", "-c", "sleep 3600"] + volumeMounts: + - name: geoip-asn-volume + mountPath: /mnt/geoip + volumes: + - name: geoip-asn-volume + persistentVolumeClaim: + claimName: geoip-asn-pvc diff --git a/deployment/logstash/logstash-lb.yaml b/deployment/logstash/logstash-lb.yaml new file mode 100644 index 00000000..352842e1 --- /dev/null +++ b/deployment/logstash/logstash-lb.yaml @@ -0,0 +1,43 @@ +apiVersion: v1 +kind: Service +metadata: + name: logstash-lb + namespace: default +spec: + selector: + app.kubernetes.io/name: logstash + type: LoadBalancer + loadBalancerSourceRanges: + - 172.104.232.45/32 + - 51.77.117.40/32 + - 142.132.196.168/32 + - 135.181.131.152/32 + - 65.108.205.203/32 + - 185.196.61.178/32 + - 51.38.40.191/32 + - 51.89.41.11/32 + - 65.109.52.30/32 + - 65.109.52.29/32 + - 51.83.220.51/32 + - 51.195.190.221/32 + - 51.15.20.236/32 + - 51.15.27.33/32 + - 51.15.27.51/32 + - 51.15.27.55/32 + - 65.109.52.31/32 + - 51.79.160.201/32 + - 15.235.186.129/32 + - 172.105.127.128/32 + - 185.196.61.159/32 + - 103.75.118.13/32 + - 103.75.119.105/32 + - 23.95.75.145/32 + - 109.104.154.84/32 + - 185.246.131.174/32 + - 206.189.91.21/32 + - 88.99.154.148/32 + - 144.126.228.132/32 + ports: + - name: beats + port: 5044 + targetPort: 5044 diff --git a/deployment/logstash/logstash_pvc.yaml b/deployment/logstash/logstash_pvc.yaml new file mode 100644 index 00000000..caed12d8 --- /dev/null +++ b/deployment/logstash/logstash_pvc.yaml @@ -0,0 +1,11 @@ +# geoip-asn-pvc.yaml +apiVersion: v1 +kind: PersistentVolumeClaim +metadata: + name: geoip-asn-pvc +spec: + accessModes: + - ReadOnlyMany + resources: + requests: + storage: 100Mi diff --git a/deployment/logstash/values-logstash.yaml b/deployment/logstash/values-logstash.yaml new file mode 100644 index 00000000..6b978d66 --- /dev/null +++ b/deployment/logstash/values-logstash.yaml @@ -0,0 +1,233 @@ +replicaCount: 3 + +image: + registry: docker.io + repository: bitnami/logstash +# tag: 8.3.3-debian-11-r5 + +nodeAffinityPreset: + type: hard + key: nodepool + values: + - postgres + +extraVolumes: + - name: geoip-dir + emptyDir: {} + - name: logstash-certs + secret: + secretName: logstash-tls-secret + +extraVolumeMounts: + - name: geoip-dir + mountPath: /mnt/geoip + - name: logstash-certs + mountPath: /usr/share/logstash/config/certs + readOnly: true + +initContainers: + - name: copy-geoip + image: equalitie/baskerville_geoip:latest + command: ["sh", "-c", "cp /data/GeoLite2-ASN.mmdb /mnt/geoip/"] + volumeMounts: + - name: geoip-dir + mountPath: /mnt/geoip + +input: |- + beats { + port => 5044 + ssl => true + ssl_certificate => "/usr/share/logstash/config/certs/certificate.pem" + ssl_key => "/usr/share/logstash/config/certs/key.pem" + ssl_verify_mode => "none" + ssl_certificate_authorities => ["/usr/share/logstash/config/certs/caroot.pem"] + } + +output: |- + if "farmal.in" in [message] { + stdout { + codec => line { + format => "@@@ RAW match: %{message}" + } + } + } + if ![client_request_host] { + stdout { + codec => line { + format => "@@@ Missing client_request_host! Event: %{message}" + } + } + } + if ![kafka_topic] { + stdout { + codec => line { + format => "@@@ Missing [kafka_topic]! 
Event: %{message}" + } + } + } + kafka { + bootstrap_servers => "kafkab-0.kafkab-headless.default.svc.cluster.local:9093,kafkab-1.kafkab-headless.default.svc.cluster.local:9093,kafkab-2.kafkab-headless.default.svc.cluster.local:9093" + topic_id => "%{[kafka_topic]}" + acks => "1" + linger_ms => 10 + codec => "json" + compression_type => "gzip" + partitioner => "default" + security_protocol => "PLAINTEXT" + message_key => "%{client_request_host}" + } + kafka { + bootstrap_servers => "kafka9-0.kafka9-headless.default.svc.cluster.local:9093,kafka9-1.kafka9-headless.default.svc.cluster.local:9093,kafka9-2.kafka9-headless.default.svc.cluster.local:9093" + topic_id => "%{[kafka_topic]}" + acks => "1" + codec => "json" + compression_type => "gzip" + partitioner => "default" + security_protocol => "PLAINTEXT" + message_key => "%{client_request_host}" + } + + + + +filter: |- + mutate { + gsub => [ "message", '"http_response_code":\s*000', '"http_response_code":"000"' ] + } + if [http_response_code] == "000" { + mutate { + replace => { "http_response_code" => "200" } + } + mutate { + convert => { "http_response_code" => "integer" } + } + } + + if [@metadata][kafka][topic] == "filebeat_deflect_access" or + [@metadata][kafka][topic] == "filebeat_deflect_access_temp" or + [fields][log_topic] == "filebeat_deflect_access" or + [fields][log_topic] == "filebeat_deflect_access_temp" { + mutate{ + add_field => { "kafka_topic" => "logstash_deflect.log" } + } + date { + match => [ "time_local", "dd/MMM/yyyy:HH:mm:ss Z" ] + target => "datestamp" + remove_field => [ "time_local" ] + } + } + + mutate{ + copy => { "[fields][dnet]" => "dnet" } + } + + if [@metadata][kafka][topic] == "filebeat_banjax" or + [@metadata][kafka][topic] == "filebeat_banjax_access_temp" or + [fields][log_topic] == "filebeat_banjax" or + [fields][log_topic] == "filebeat_banjax_access_temp" { + mutate{ + add_field => { "kafka_topic" => "logstash_banjax" } + } + date { + match => [ "timestring", "yyyy-MM-dd'T'HH:mm:ss" ] + target => "datestamp" + remove_field => [ "timestring" ] + } + if [path] { + grok { + match => { "path" => "%{URIPATH:client_url}(%{URIPARAM:querystring})?" 
} + } + } + } + + if [client_request_host] { + mutate { + add_field => { "message_key_safe" => "%{client_request_host}" } + } + } else { + mutate { + add_field => { "message_key_safe" => "unknown" } + } + } + + mutate { + copy => {"client_request_host" => "client_request_host_original"} + gsub => [ "client_request_host", "^www\.", "" ] + } + + geoip { + source => "client_ip" + target => "geoip0" + default_database_type => "City" + fields => [ "city_name", "continent_code", "country_code2", "country_name", "location", "timezone" ] + } + + geoip { + source => "client_ip" + target => "geoip_asn" + database => "/mnt/geoip/GeoLite2-ASN.mmdb" + fields => ["AUTONOMOUS_SYSTEM_NUMBER", "AUTONOMOUS_SYSTEM_ORGANIZATION"] + } + + mutate { + copy => { "[geoip0][geo][country_iso_code]" => "[geoip][country_code2]" } + copy => { "[geoip0][geo][country_name]" => "[geoip][country_name]" } + copy => { "[geoip0][geo][timezone]" => "[geoip][timezone]" } + copy => { "[agent][name]" => "[edge]" } + } + + useragent { + source => "client_ua" + target => "user_agent" + } + mutate { + remove_field => [ + "@timestamp", + "@version", + "geoip0" + ] + } + + if ";" in [content_type] { + mutate { + split => { "content_type" => ";" } + add_field => { "new_content_type" => "%{[content_type][0]}" } + } + mutate { + rename => { "new_content_type" => "content_type" } + } + } + + + mutate { + remove_field => [ + "event", + "@timestamp", + "@version", + "type", + "upstream_addr", + "upstream_status", + "upstream_response_time", + "upstream_header_time", + "upstream_connect_time", + "upstream_bytes_sent", + "upstream_bytes_received", + "client_user", + "referer", + "refererdomain", + "loc_out", + "proxy_host", + "proxy_port", + "forwardedfor", + "request_time", + "filter_start", + "rule_type", + "banjax_error", + "request_id", + "log", + "fields", + "input", + "host" + ] + } + diff --git a/deployment/logstash1/logstash-deployment.yaml b/deployment/logstash1/logstash-deployment.yaml new file mode 100644 index 00000000..b368bd1d --- /dev/null +++ b/deployment/logstash1/logstash-deployment.yaml @@ -0,0 +1,40 @@ +apiVersion: apps/v1 +kind: Deployment +metadata: + name: logstash + labels: + app: logstash +spec: + replicas: 1 + selector: + matchLabels: + app: logstash + template: + metadata: + labels: + app: logstash + spec: + containers: + - name: logstash + image: equalitie/baskerville_logstash_oss:7.16.3 + imagePullPolicy: IfNotPresent + ports: + - containerPort: 5044 # if you want beats input later + volumeMounts: + - name: config-volume + mountPath: /usr/share/logstash/pipeline + - name: geoip-db + mountPath: /mnt/geoip + volumes: + - name: config-volume + configMap: + name: logstash-pipeline + - name: geoip-db + emptyDir: {} + initContainers: + - name: copy-geoip + image: equalitie/baskerville_geoip:latest + command: ["sh", "-c", "cp /data/GeoLite2-ASN.mmdb /mnt/geoip/"] + volumeMounts: + - name: geoip-db + mountPath: /mnt/geoip diff --git a/deployment/logstash1/logstash-pipeline.yaml b/deployment/logstash1/logstash-pipeline.yaml new file mode 100644 index 00000000..0230cd8f --- /dev/null +++ b/deployment/logstash1/logstash-pipeline.yaml @@ -0,0 +1,86 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: logstash-pipeline +data: + pipeline.conf: | + input { + kafka { + bootstrap_servers => "kafkab-0.kafkab-headless.default.svc.cluster.local:9093,kafkab-1.kafkab-headless.default.svc.cluster.local:9093,kafkab-2.kafkab-headless.default.svc.cluster.local:9093" + topics => 
["filebeat_deflect_access","filebeat_deflect_access_temp","filebeat_banjax","filebeat_banjax_access_temp"] + group_id => "logstash" + codec => "json" + decorate_events => "basic" + } + } + + filter { + if [@metadata][kafka][topic] == "filebeat_deflect_access" or [@metadata][kafka][topic] == "filebeat_deflect_access_temp" { + mutate{ + replace => ["[@metadata][kafka][topic]", "logstash_deflect.log"] + } + date { + match => [ "time_local", "dd/MMM/yyyy:HH:mm:ss Z" ] + target => "datestamp" + remove_field => [ "time_local" ] + } + } + if [@metadata][kafka][topic] == "filebeat_banjax" or [@metadata][kafka][topic] == "filebeat_banjax_access_temp" { + mutate{ + replace => ["[@metadata][kafka][topic]", "logstash_banjax"] + } + date { + match => [ "timestring", "yyyy-MM-dd'T'HH:mm:ss" ] + target => "datestamp" + remove_field => [ "timestring" ] + } + if [path] { + grok { + match => { "path" => "%{URIPATH:client_url}(%{URIPARAM:querystring})?" } + } + } + } + + mutate { + copy => {"client_request_host" => "client_request_host_original"} + gsub => [ "client_request_host", "^www\.", "" ] + copy => {"client_ip" => "client.ip"} + } + + geoip { + source => "client.ip" + target => "geoip0" + default_database_type => "City" + fields => [ "city_name", "continent_code", "country_code2", "country_name", "location" ] + } + + geoip { + source => "client.ip" + target => "geoip_asn" + database => "/mnt/geoip/GeoLite2-ASN.mmdb" + fields => ["AUTONOMOUS_SYSTEM_NUMBER", "AUTONOMOUS_SYSTEM_ORGANIZATION"] + } + + useragent { + source => "client_ua" + target => "user_agent" + } + + mutate { + remove_field => [ + "@timestamp", "@version", "geoip0", "client.ip", + "event", "type", "log_topic", "log", "fields", "host" + ] + } + } + + output { + kafka { + bootstrap_servers => "kafka9-0.kafka9-headless.default.svc.cluster.local:9093,kafka9-1.kafka9-headless.default.svc.cluster.local:9093,kafka9-2.kafka9-headless.default.svc.cluster.local:9093" + topic_id => "%{[@metadata][kafka][topic]}" + acks => "0" + codec => "json" + partitioner => "default" + message_key => "%{client_request_host}" + } + } diff --git a/deployment/logstash_ch/values_ch.yaml b/deployment/logstash_ch/values_ch.yaml new file mode 100644 index 00000000..f6c9e85d --- /dev/null +++ b/deployment/logstash_ch/values_ch.yaml @@ -0,0 +1,52 @@ +replicas: 1 +image: "docker.elastic.co/logstash/logstash" +imageTag: "8.5.2" + +livenessProbe: null + +logstashConfig: + logstash.yml: | + xpack.monitoring.enabled: false + +logstashPipeline: + logstash.conf: | + input { + kafka { + bootstrap_servers => "kafka9-0.kafka9-headless.default.svc.cluster.local:9093,kafka9-1.kafka9-headless.default.svc.cluster.local:9093,kafka9-2.kafka9-headless.default.svc.cluster.local:9093" + topics => ["clearinghouse_logs", "clearinghouse_commands"] + group_id => "logstash-ch" + codec => "json" + decorate_events => "basic" + consumer_threads => 3 + } + } + filter { + if [@metadata][kafka][topic] == "clearinghouse_logs" { + mutate{ + add_field => { "[@metadata][topic]" => "logstash_deflect.log" } + add_field => { "[@metadata][key]" => "%{[client_request_host]}" } + } + } + if [@metadata][kafka][topic] == "clearinghouse_commands" { + mutate{ + add_field => { "[@metadata][topic]" => "banjax_command_topic" } + add_field => { "[@metadata][key]" => "%{[host]}" } + } + } + mutate { + remove_field => [ + "event", + "@timestamp", + "@version" + ] + } + } + output { + kafka { + bootstrap_servers => 
"kafkab-0.kafkab-headless.default.svc.cluster.local:9093,kafkab-1.kafkab-headless.default.svc.cluster.local:9093,kafkab-2.kafkab-headless.default.svc.cluster.local:9093" + topic_id => "%{[@metadata][topic]}" + acks => "0" + codec => "json" + message_key => "%{[@metadata][key]}" + } + } diff --git a/deployment/logstash_dev/values_dev_commands.yaml b/deployment/logstash_dev/values_dev_commands.yaml new file mode 100644 index 00000000..1f0ebf30 --- /dev/null +++ b/deployment/logstash_dev/values_dev_commands.yaml @@ -0,0 +1,41 @@ +replicas: 1 +image: "docker.elastic.co/logstash/logstash" +imageTag: "8.5.2" + +livenessProbe: null + +logstashConfig: + logstash.yml: | + xpack.monitoring.enabled: false + +logstashPipeline: + logstash.conf: | + input { + kafka { + bootstrap_servers => "kafka9-0.kafka9-headless.default.svc.cluster.local:9093,kafka9-1.kafka9-headless.default.svc.cluster.local:9093,kafka9-2.kafka9-headless.default.svc.cluster.local:9093" + topics => ["banjax_command_topic"] + group_id => "logstash-dev" + codec => "json" + decorate_events => "basic" + consumer_threads => 3 + } + } + filter { + mutate { + remove_field => [ + "event", + "@timestamp", + "@version", + "session" + ] + } + } + output { + kafka { + bootstrap_servers => "kafkab-0.kafkab-headless.default.svc.cluster.local:9093,kafkab-1.kafkab-headless.default.svc.cluster.local:9093,kafkab-2.kafkab-headless.default.svc.cluster.local:9093" + topic_id => "banjax_command_topic" + acks => "0" + codec => "json" + message_key => "%{[host]}" + } + } diff --git a/deployment/logstash_dev/values_dev_reports.yaml b/deployment/logstash_dev/values_dev_reports.yaml new file mode 100644 index 00000000..ec86dc55 --- /dev/null +++ b/deployment/logstash_dev/values_dev_reports.yaml @@ -0,0 +1,33 @@ +image: "docker.elastic.co/logstash/logstash" +replicas: 1 +imageTag: "8.5.2" + +logstashPipeline: + logstash.conf: | + input { + kafka { + bootstrap_servers => "kafkab-0.kafkab-headless.default.svc.cluster.local:9093,kafkab-1.kafkab-headless.default.svc.cluster.local:9093,kafkab-2.kafkab-headless.default.svc.cluster.local:9093" + topics => ["banjax_report_topic"] + group_id => "logstash-solved-challenge" + codec => "json" + decorate_events => "basic" + consumer_threads => 3 + } + } + filter { + mutate { + remove_field => [ + "event", + "@timestamp", + "@version" + ] + } + } + output { + kafka { + bootstrap_servers => "kafka9-0.kafka9-headless.default.svc.cluster.local:9093,kafka9-1.kafka9-headless.default.svc.cluster.local:9093,kafka9-2.kafka9-headless.default.svc.cluster.local:9093" + topic_id => "banjax_report_topic" + acks => "0" + codec => "json" + } + } diff --git a/deployment/logstash_es/values.yaml b/deployment/logstash_es/values.yaml new file mode 100644 index 00000000..2380c0da --- /dev/null +++ b/deployment/logstash_es/values.yaml @@ -0,0 +1,98 @@ +replicas: 3 +image: "docker.elastic.co/logstash/logstash" +imageTag: "8.5.2" + +extraEnvs: + - name: ELASTIC_USERNAME + valueFrom: + secretKeyRef: + name: elastic-secret + key: username + - name: ELASTIC_PASSWORD + valueFrom: + secretKeyRef: + name: elastic-secret + key: password + - name: ELASTIC_CERTIFICATE_PASSWORD + valueFrom: + secretKeyRef: + name: elastic-certificates-password + key: password + +secretMounts: + - name: elastic-certificates + secretName: elastic-certificates + path: /usr/share/logstash/config/certs + +logstashConfig: + logstash.yml: | + xpack.monitoring.enabled: false + +livenessProbe: null + +logstashPipeline: + logstash.conf: | + input { + kafka { + bootstrap_servers => 
"kafkab-0.kafkab-headless.default.svc.cluster.local:9093,kafkab-1.kafkab-headless.default.svc.cluster.local:9093,kafkab-2.kafkab-headless.default.svc.cluster.local:9093" + topics => ["logstash_deflect.log", "banjax_command_topic", "banjax_report_topic"] + group_id => "logstash-es" + codec => "json" + decorate_events => "basic" + consumer_threads => 1 + } + } + filter { + if [@metadata][kafka][topic] == "logstash_deflect.log" { + mutate{ + add_field => { "[@metadata][es_index]" => "weblogs" } + } + date { + match => [ "datestamp", "yyyy-MM-dd'T'HH:mm:ssZ" ] + } + mutate { + rename => {"client_request_method" => "method" } + rename => {"client_request_host" => "host" } + rename => {"client_url" => "url"} + rename => {"cache_result" => "cache"} + rename => {"client_ip" => "ip"} + } + mutate { + remove_field => [ + "[@metadata][kafka][topic]", + "http_request_version", + "version", + "datestamp", + "http_request_scheme", + "client_ua", + "querystring", + "reply_length_bytes", + "disable_logging", + "ecs", + "content_type" + ] + } + } else { + date { + match => [ "[@metadata][timestamp]", "UNIX" ] + } + if [@metadata][kafka][topic] == "banjax_command_topic" { + mutate{ + add_field => { "[@metadata][es_index]" => "commands" } + } + } else if [@metadata][kafka][topic] == "banjax_report_topic" { + mutate{ + add_field => { "[@metadata][es_index]" => "reports" } + } + } + } + } + output { + elasticsearch { + hosts => "http://elasticsearch-master:9200" + ssl => false + user => "${ELASTIC_USERNAME}" + password => "${ELASTIC_PASSWORD}" + index => "b.%{[@metadata][es_index]}-%{+YYYY.ww}" + } + } diff --git a/deployment/notebook/spark_secrets.yaml b/deployment/notebook/spark_secrets.yaml index b5d77fd5..3bfe6fc5 100644 --- a/deployment/notebook/spark_secrets.yaml +++ b/deployment/notebook/spark_secrets.yaml @@ -5,11 +5,12 @@ metadata: namespace: spark type: Opaque stringData: - s3_access: "XHI6ZO04E9U6NP3M1KXC" - s3_secret: "aDpobhqDIP8aeiCcRw2SISQCsYGeAUXGxz8vYbBn" + s3_access: "" + s3_secret: "" s3_endpoint: "s3.gra.cloud.ovh.net" - redis_password: "zUXtnClLrDHgSQ4KK6iiT3" + s3_region: "GRA" + redis_password: "" postgres_host: "postgres-db-lb" postgres_user: "postgres" kafka_host: "kafka-0.kafka-headless.default.svc.cluster.local:9092,kafka-1.kafka-headless.default.svc.cluster.local:9092,kafka-2.kafka-headless.default.svc.cluster.local:9092" - postgres_password: "zUXtnClLrDHgSQ4KK6iiT3" \ No newline at end of file + postgres_password: "" \ No newline at end of file diff --git a/deployment/postgres/postgres.yaml b/deployment/postgres/postgres.yaml index 30da0a97..e15b3cb3 100644 --- a/deployment/postgres/postgres.yaml +++ b/deployment/postgres/postgres.yaml @@ -18,7 +18,7 @@ spec: nodepool: postgres containers: - name: postgresql-db - image: timescale/pg_prometheus:latest-pg10 + image: timescale/pg_prometheus:latest-pg11 volumeMounts: - name: postgresql-db-disk mountPath: /data @@ -38,4 +38,4 @@ spec: accessModes: ["ReadWriteOnce"] resources: requests: - storage: 1500Gi \ No newline at end of file + storage: 2000Gi \ No newline at end of file diff --git a/deployment/postgres/postgres_baskervillehall.yaml b/deployment/postgres/postgres_baskervillehall.yaml new file mode 100644 index 00000000..60b29ea9 --- /dev/null +++ b/deployment/postgres/postgres_baskervillehall.yaml @@ -0,0 +1,42 @@ +# PostgreSQL StatefulSet +apiVersion: apps/v1 +kind: StatefulSet +metadata: + name: db-baskervillehall +spec: + serviceName: db-baskervillehall-service + selector: + matchLabels: + app: db-baskervillehall + replicas: 1 + 
template: + metadata: + labels: + app: db-baskervillehall + spec: + nodeSelector: + nodepool: postgres + containers: + - name: db-baskervillehall + image: 'postgres:14' + imagePullPolicy: IfNotPresent + volumeMounts: + - name: db-baskervillehall-disk + mountPath: /data + env: + - name: POSTGRES_PASSWORD + valueFrom: + secretKeyRef: + name: baskerville-secrets + key: postgres_password + - name: PGDATA + value: /data/pgdata + # Volume Claim + volumeClaimTemplates: + - metadata: + name: db-baskervillehall-disk + spec: + accessModes: ["ReadWriteOnce"] + resources: + requests: + storage: 1000Gi \ No newline at end of file diff --git a/deployment/postgres/postgres_lb.yaml b/deployment/postgres/postgres_lb.yaml index 17d946e1..7d8ee42a 100644 --- a/deployment/postgres/postgres_lb.yaml +++ b/deployment/postgres/postgres_lb.yaml @@ -6,7 +6,7 @@ metadata: spec: selector: app: postgresql-db - type: LoadBalancer + type: ClusterIP ports: - port: 5432 targetPort: 5432 \ No newline at end of file diff --git a/deployment/timescaledb/values.yaml b/deployment/timescaledb/values.yaml new file mode 100644 index 00000000..98943411 --- /dev/null +++ b/deployment/timescaledb/values.yaml @@ -0,0 +1,32 @@ +dataNodes: 3 + +postgresql: + databases: + - postgres + - baskerville + parameters: + max_connections: 100 + max_prepared_transactions: 150 + shared_buffers: 2816MB + work_mem: 7208kB + min_wal_size: 4GB + max_wal_size: 16GB + +image: + repository: timescale/timescaledb-ha + tag: pg12.5-ts2.0.0-p0 +credentials: + accessNode: + superuser: zJJrIQyxW18hCT5uPj6rkrd6Vw/MylrklaKOQsafvpw= + admin: zJJrIQyxW18hCT5uPj6rkrd6Vw/MylrklaKOQsafvpw= + postgres: zJJrIQyxW18hCT5uPj6rkrd6Vw/MylrklaKOQsafvpw= + dataNode: + superuser: zJJrIQyxW18hCT5uPj6rkrd6Vw/MylrklaKOQsafvpw= + admin: zJJrIQyxW18hCT5uPj6rkrd6Vw/MylrklaKOQsafvpw= + postgres: zJJrIQyxW18hCT5uPj6rkrd6Vw/MylrklaKOQsafvpw= + +persistentVolume: + enabled: true + size: 500G + + diff --git a/linting.sh b/linting.sh index e42250a5..688dfa93 100755 --- a/linting.sh +++ b/linting.sh @@ -1 +1 @@ -flake8 . --count --ignore=C901,W503,W504,E226 --max-line-length=127 --statistics \ No newline at end of file +flake8 ./src --count --ignore=C901,W503,W504,E226,E402,W291,W605 --max-line-length=127 --statistics \ No newline at end of file diff --git a/requirements.txt b/requirements.txt index a1dd92c3..e19dea95 100644 --- a/requirements.txt +++ b/requirements.txt @@ -16,6 +16,7 @@ tzwhere==3.0.3 pytz==2014.10 sqlalchemy < 1.4.0 sqlalchemy_utils==0.33.3 +pypandoc pyspark==2.4.6 # psutil==5.4.6 psycopg2==2.7.5 @@ -34,3 +35,7 @@ cachetools==4.1.1 spark-testing-base==0.10.0 passlib==1.7.4 elasticsearch==7.13.4 +pyyaml +ua-parser +user-agents +geoip2 diff --git a/src/baskerville/db/__init__.py b/src/baskerville/db/__init__.py index 4a78c459..aaa2ed80 100644 --- a/src/baskerville/db/__init__.py +++ b/src/baskerville/db/__init__.py @@ -3,15 +3,12 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
-import uuid from baskerville.db.data_partitioning import get_temporal_partitions from sqlalchemy import create_engine, text -from sqlalchemy.exc import ProgrammingError from sqlalchemy.ext.declarative import declarative_base from sqlalchemy.orm import sessionmaker, scoped_session -from sqlalchemy_utils import database_exists, create_database from baskerville.util.enums import UserCategoryEnum from passlib.apps import custom_app_context as pwd_context @@ -20,6 +17,7 @@ from baskerville.db.dashboard_models import Organization, User, UserCategory + defaults = { 'mysql': 'master', 'postgres': 'postgres', @@ -153,19 +151,21 @@ def set_up_db(conf, create=True, partition=True): if create: # try: # with contextlib.suppress(ProgrammingError) as e: - with create_engine( - get_db_connection_str(conf, default_db=True), - isolation_level='AUTOCOMMIT', - **conf.get('db_conn_args', {}) - ).connect() as connection: - from sqlalchemy_utils import database_exists, create_database - if not database_exists(get_db_connection_str(conf, default_db=False)): - create_database(get_db_connection_str(conf, default_db=False)) - #connection.execute(f'CREATE DATABASE {conf.get("name")} if not exists') - # connection.execute( - # 'CREATE CAST (VARCHAR AS JSON) ' - # 'WITHOUT FUNCTION AS IMPLICIT' - # ) + + from sqlalchemy_utils import database_exists, create_database + if not database_exists(get_db_connection_str(conf, default_db=False)): + create_database(get_db_connection_str(conf, default_db=False)) + + # with create_engine( + # get_db_connection_str(conf, default_db=True), + # isolation_level='AUTOCOMMIT', + # **conf.get('db_conn_args', {}) + # ).connect() as connection: + # sconnection.execute(f'CREATE DATABASE {conf.get("name")} if not exists') + # connection.execute( + # 'CREATE CAST (VARCHAR AS JSON) ' + # 'WITHOUT FUNCTION AS IMPLICIT' + # ) # except ProgrammingError: # pass @@ -191,7 +191,7 @@ def set_up_db(conf, create=True, partition=True): Base.metadata.create_all(bind=engine) # session = Session() - if Session.query(Organization).count() == 0 and \ + if conf.get('create_organization') and Session.query(Organization).count() == 0 and \ Session.query(User).count() == 0 and \ Session.query(UserCategory).count() == 0: try: @@ -214,8 +214,6 @@ def set_up_db(conf, create=True, partition=True): Session.rollback() raise err - - # create data partition maintenance_conf = conf.get('maintenance') if conf.get('type') == 'postgres' \ diff --git a/src/baskerville/db/base.py b/src/baskerville/db/base.py index 8c832b02..2c9910cb 100644 --- a/src/baskerville/db/base.py +++ b/src/baskerville/db/base.py @@ -74,7 +74,7 @@ def get_temporal_check(partition_field, start, end, new=False, if new: new_prefix = 'NEW.' 
- start = start.strftime("%Y-%m-%d %H:%M:%S") + start = start.strftime("%Y-%m-%d %H:%M:%S") end = end.strftime("%Y-%m-%d %H:%M:%S.%f") if start > end: diff --git a/src/baskerville/db/dashboard_models.py b/src/baskerville/db/dashboard_models.py index e01eee2e..00e6c2a2 100644 --- a/src/baskerville/db/dashboard_models.py +++ b/src/baskerville/db/dashboard_models.py @@ -7,7 +7,7 @@ from baskerville.db import Base from baskerville.db.models import utcnow, SerializableMixin from sqlalchemy import Column, Integer, ForeignKey, DateTime, Enum, String, \ - Boolean, BigInteger, Float, JSON, Text, TEXT + Boolean, BigInteger, Float, JSON, TEXT from sqlalchemy.orm import relationship from passlib.apps import custom_app_context as pwd_context diff --git a/src/baskerville/db/models.py b/src/baskerville/db/models.py index af268f85..a9dc433c 100644 --- a/src/baskerville/db/models.py +++ b/src/baskerville/db/models.py @@ -67,7 +67,8 @@ class Runtime(Base, SerializableMixin): # runtimes * - 1 users try: from baskerville.db.dashboard_models import User - except: + except Exception as exp: + print(exp) pass user = relationship( 'User', @@ -98,6 +99,9 @@ class RequestSet(Base, SerializableMixin): id_banjax = Column(Integer, ForeignKey('banjax_bans.id'), nullable=True) process_flag = Column(Boolean, default=True) prediction = Column(Integer) + prediction_anomaly = Column(Integer) + prediction_classifier = Column(Integer) + prediction_behave = Column(Integer) attack_prediction = Column(Integer) challenged = Column(Integer) challenge_failed = Column(Integer) @@ -105,6 +109,7 @@ class RequestSet(Base, SerializableMixin): banned = Column(Integer) low_rate_attack = Column(Integer) score = Column(Float) + classifier_score = Column(Float) features = Column(JSON) created_at = Column(DateTime(timezone=True), server_default=utcnow()) updated_at = Column( @@ -145,10 +150,13 @@ class RequestSet(Base, SerializableMixin): 'start', 'stop', 'prediction', + 'prediction_anomaly', + 'prediction_classifier', 'attack_prediction', 'low_rate_attack', 'challenged', 'score', + 'classifier_score', 'label', 'id_attribute', 'features', diff --git a/src/baskerville/db/temporal_partition.py b/src/baskerville/db/temporal_partition.py index 3f40f638..0fac3f55 100644 --- a/src/baskerville/db/temporal_partition.py +++ b/src/baskerville/db/temporal_partition.py @@ -12,7 +12,60 @@ from baskerville.util.enums import PartitionByEnum from baskerville.util.helpers import get_days_in_year, get_days_in_month from baskerville.db.base import PartitionedTable, Partition, TableTools, Index -from es_retriever.helpers.time_period import TimePeriod as TP + +ENDCOLOR = '\033[0m' + + +class TP(object): + """ + Represents a time period, with helper functions + """ + + def __init__(self, start, end): + self.start = start + self.end = end + + def __str__(self): + return 'TimePeriod from {} to {}'.format(self.start, self.end) + + def __repr__(self): + return '<{}>'.format(self) + + def __eq__(self, other): + return self.start == other.start and self.end == other.end + + def __ne__(self, other): + return self.start != other.start or self.end != other.end + + def split_per_day(self, full_day=False): + """ + Splits the time period in days + :rtype: list[TimePeriod] + :return: a list that contains time periods that when combined together + they amount to the initial / current period of time + """ + days = [] + + start = self.start + end = (start + timedelta(days=1)).replace( + hour=00, minute=00, second=00, microsecond=00 + ) + if full_day: + end = end - 
timedelta(seconds=1) + + while True: + if end >= self.end or full_day and (end.date() == self.end.date()): + days.append(TimePeriod(start, self.end)) + return days + days.append(TimePeriod(start, end)) + start = end + if full_day: + start = start + timedelta(seconds=1) + end = (start + timedelta(days=1)).replace( + hour=00, minute=00, second=00, microsecond=00 + ) + if full_day: + end = end - timedelta(seconds=1) class TimePeriod(TP): @@ -293,7 +346,7 @@ def self_check(self): @property def field_value(self) -> str: return f'cast(extract({self.partitioned_by} ' \ - f'from NEW.{self.partition_field}) AS TEXT)' + f'from NEW.{self.partition_field}) AS TEXT)' def get_partition_prefix(self) -> str: """ diff --git a/src/baskerville/features/__init__.py b/src/baskerville/features/__init__.py index 17ee235b..1a7adab9 100644 --- a/src/baskerville/features/__init__.py +++ b/src/baskerville/features/__init__.py @@ -28,6 +28,4 @@ ] )) ) -FEATURE_NAME_TO_CLASS = dict( - (f.feature_name_from_class(), f) for f in FEATURES - ) \ No newline at end of file +FEATURE_NAME_TO_CLASS = dict((f.feature_name_from_class(), f) for f in FEATURES) diff --git a/src/baskerville/features/feature_css_to_html_ratio.py b/src/baskerville/features/feature_css_to_html_ratio.py index d895da79..f8a5ef3c 100644 --- a/src/baskerville/features/feature_css_to_html_ratio.py +++ b/src/baskerville/features/feature_css_to_html_ratio.py @@ -32,8 +32,12 @@ def __init__(self): F.col('is_css'))) } self.pre_group_by_calcs = { - 'is_html': F.col('content_type') == 'text/html', - 'is_css': F.col('content_type') == 'text/css', + 'is_html': (F.col('content_type') == 'text/html') | + (F.col('content_type') == 'text/html; charset=utf-8') | + (F.col('content_type') == 'text/html; charset=UTF-8'), + 'is_css': (F.col('content_type') == 'text/css') | + (F.col('content_type') == 'text/css; charset=utf-8') | + (F.col('content_type') == 'text/css; charset=UTF-8') } def compute(self, df): diff --git a/src/baskerville/features/feature_image_to_html_ratio.py b/src/baskerville/features/feature_image_to_html_ratio.py index b76e3b35..fe24ffe6 100644 --- a/src/baskerville/features/feature_image_to_html_ratio.py +++ b/src/baskerville/features/feature_image_to_html_ratio.py @@ -33,7 +33,8 @@ def __init__(self): ) } self.pre_group_by_calcs = { - 'is_html': F.col('content_type') == 'text/html', + 'is_html': (F.col('content_type') == 'text/html') | (F.col('content_type') == 'text/html; charset=UTF-8') | + (F.col('content_type') == 'text/html; charset=utf-8'), 'is_image': F.array_contains( F.split(F.col('content_type'), '/'), 'image') diff --git a/src/baskerville/main.py b/src/baskerville/main.py index ae2bc61b..6cc2e4dd 100644 --- a/src/baskerville/main.py +++ b/src/baskerville/main.py @@ -9,32 +9,24 @@ import argparse import atexit -import json import requests import os import time -from datetime import timedelta, datetime +from datetime import timedelta -from dateutil.tz import tzutc from prometheus_client import start_http_server from baskerville import src_dir -from baskerville.db import set_up_db -from baskerville.db.models import Model -from baskerville.models.config import DatabaseConfig from baskerville.models.engine import BaskervilleAnalyticsEngine from baskerville.simulation.real_timeish_simulation import simulation from baskerville.util.git_helpers import git_clone -from baskerville.util.helpers import get_logger, parse_config, \ - get_default_data_path - +from baskerville.util.helpers import get_logger, parse_config PROCESS_LIST = [] baskerville_engine = 
None logger = None - os.environ['TZ'] = 'UTC' try: @@ -137,8 +129,6 @@ def main(): output_file=baskerville_engine.config.engine.logpath ) - logger.info(f'Postgres password={baskerville_engine.config.database.password}') - # start simulation if specified if args.simulate: spark = None diff --git a/src/baskerville/models/anomaly_model.py b/src/baskerville/models/anomaly_model.py index 69ea7a74..b5d0cb6e 100644 --- a/src/baskerville/models/anomaly_model.py +++ b/src/baskerville/models/anomaly_model.py @@ -94,7 +94,7 @@ def categorical_string_features(self): categorical_features.append(feature) return categorical_features - def _create_feature_columns(self, df): + def _create_categorical_feature_columns(self, df): for feature in self.categorical_features(): df = df.withColumn(f'{self.prefix_feature}{feature}', F.col(f'{self.feature_map_column}.{feature}')) return df @@ -148,8 +148,8 @@ def train(self, df): df = df.persist(StorageLevelFactory.get_storage_level(self.storage_level)) - self.logger.info('Creating feature columns...') - df = self._create_feature_columns(df) + self.logger.info('Creating categorical feature columns...') + df = self._create_categorical_feature_columns(df) self.logger.info('Fitting string indexes...') self._create_indexes(df) @@ -157,6 +157,9 @@ def train(self, df): df = self._add_categorical_features(df, self.features_vector_scaled) df = self._drop_feature_columns(df) + if self.prediction_column in df.columns: + df = df.drop(self.prediction_column) + self.logger.info('Fitting Isolation Forest model...') iforest = IForest( featuresCol=self.features_vector_scaled, @@ -190,7 +193,7 @@ def prepare_df(self, df): df = df.withColumnRenamed(self.features_vector, self.features_vector_scaled) self.logger.info('Adding categorical features...') - df = self._create_feature_columns(df).persist() + df = self._create_categorical_feature_columns(df).persist() df = self._add_categorical_features(df, self.features_vector_scaled) df = self._drop_feature_columns(df) self.is_prepared = True diff --git a/src/baskerville/models/banjax_report_consumer.py b/src/baskerville/models/banjax_report_consumer.py deleted file mode 100644 index f7793ee5..00000000 --- a/src/baskerville/models/banjax_report_consumer.py +++ /dev/null @@ -1,240 +0,0 @@ -import datetime -import threading -import json -from kafka import KafkaConsumer, KafkaProducer -import time -import logging -import sys -import types - -from baskerville.db import set_up_db -from baskerville.models.config import KafkaConfig -from baskerville.models.ip_cache import IPCache -from baskerville.util.elastic_writer import ElasticWriter -from baskerville.util.helpers import parse_config -import argparse -import os -from baskerville import src_dir - - -class BanjaxReportConsumer(object): - status_message_fields = [ - "timestamp", - "restart_time", - "reload_time", - "num_of_challenges", - "num_of_host_challenges", - "num_of_ip_challenges", - "swabber_ip_db_size", - "regex_manager_ip_db_size", - "challenger_ip_db_size", - "proxy.process.traffic_server.memory.rss", - "proxy.node.cache.contents.num_docs", - "proxy.process.cache.bytes_total", - "proxy.process.cache.percent_full", - "proxy.process.cache.ram_cache.bytes_used", - "proxy.process.cache.ram_cache.total_bytes", - "proxy.process.net.connections_currently_open", - "proxy.process.current_server_connections", - "proxy.process.http.current_active_client_connections", - "proxy.process.eventloop.time.max" - ] - - def __init__(self, config, logger): - self.config = config - self.kafka_config = 
config.kafka - self.logger = logger - self.ip_cache = IPCache(config, self.logger) - self.session, self.engine = set_up_db(config.database.__dict__) - - if config.elastic: - self.elastic_writer = ElasticWriter(host=config.elastic.host, - port=config.elastic.port, - user=config.elastic.user, - password=config.elastic.password) - else: - self.elastic_writer = None - - # XXX i think the metrics registry swizzling code is passing - # an extra argument here mistakenly?.?. - def _tmp_fun(_, _2, message): - return message - - for field_name in self.__class__.status_message_fields: - setattr(self, f"consume_{field_name}", types.MethodType(_tmp_fun, self)) - - def run(self): - consumer = KafkaConsumer( - self.kafka_config.banjax_report_topic, - group_id=None, - **self.config.kafka.connection - ) - - for message in consumer: - self.consume_message(message) - - consumer.close() - - def consume_message(self, message): - if len(message.value) > 0: - try: - s = message.value.decode("utf-8") - except UnicodeDecodeError: - self.logger.info("got bad utf-8 over the kafka channel") - - try: - d = json.loads(s) - except json.JSONDecodeError: - self.logger.info(f"got bad json over the kafka channel: {s}") - - # 'status'-type messages contain several metrics and are reported per $interval - if d.get("name") == "status": - edge_id = d.get("id") - for k, _ in d.items(): - if k == 'name' or k == 'id': - continue - try: - f = getattr(self, f"consume_{k}") - f(self, d) - except AttributeError: - self.logger.info(f"did not process banjax status {k} from edge {edge_id}") - - # 'ip_failed_challenge'-type messages are reported when a challenge is failed - elif d.get("name") == "ip_failed_challenge": - self.consume_ip_failed_challenge_message(d) - elif d.get("name") == "ip_passed_challenge" or d.get("name") == "ip_passed_challenge2": - self.consume_ip_passed_challenge_message(d) - elif d.get("name") == "ip_banned": - self.consume_ip_banned_message(d) - - def get_time_filter(self): - return (datetime.datetime.utcnow() - datetime.timedelta( - minutes=self.config.engine.banjax_sql_update_filter_minutes)).strftime("%Y-%m-%d %H:%M:%S.000Z") - - def consume_ip_failed_challenge_message(self, message): - ip = message['value_ip'] - num_fails = self.ip_cache.ip_failed_challenge(ip) - if num_fails == 0: - return message - - try: - if num_fails >= self.config.engine.banjax_num_fails_to_ban: - self.ip_cache.ip_banned(ip) - sql = f'update request_sets set banned = 1 where ' \ - f'stop > \'{self.get_time_filter()}\' and challenged = 1 and ip = \'{ip}\'' - else: - sql = f'update request_sets set challenge_failed = {num_fails} where ' \ - f'stop > \'{self.get_time_filter()}\' and challenged = 1 and ip = \'{ip}\'' - - self.session.execute(sql) - self.session.commit() - - except Exception: - self.session.rollback() - self.logger.error(Exception) - raise - - return message - - def consume_ip_passed_challenge_message(self, message): - ip = message['value_ip'] - host = message['value_site'] - processed = self.ip_cache.ip_passed_challenge(ip) - if not processed: - return message - - try: - if self.elastic_writer: - with self.elastic_writer as writer: - writer.write_challenge_passed(ip, host) - - sql = f'update request_sets set challenge_passed = 1 where ' \ - f'stop > \'{self.get_time_filter()}\' ' \ - f'and challenged = 1 and ip = \'{ip}\'' - self.session.execute(sql) - self.session.commit() - - except Exception: - self.session.rollback() - self.logger.error(Exception) - raise - - return message - - def consume_ip_banned_message(self, 
message): - ip = message['value_ip'] - self.logger.info(f'Banjax ip_banned {ip} ...') - try: - sql = f'update request_sets set banned = 1 where ' \ - f'stop > \'{self.get_time_filter()}\' and challenged = 1 and ip = \'{ip}\'' - self.session.execute(sql) - self.session.commit() - - except Exception: - self.session.rollback() - self.logger.error(Exception) - raise - - return message - - -class ChallengeProducer(object): - def __init__(self, config, logger): - self.config = config - self.logger = logger - - def run(self): - producer = KafkaProducer(**self.config.kafka.connection) - - number = 0 - while True: - for _ in range(0, 10): - domain = f"example-{number}.com:8080" - command = {'name': 'challenge_host', 'value': domain} - producer.send(self.config.banjax_command_topic, json.dumps(command).encode('utf-8')) - self.logger.info("sent a command") - number = number + 1 - time.sleep(1) - - -if __name__ == '__main__': - parser = argparse.ArgumentParser() - - parser.add_argument( - "--conf", action="store", dest="conf_file", - default=os.path.join(src_dir, '..', 'conf', 'baskerville.yaml'), - help="Path to config file" - ) - - parser.add_argument( - "-c", "--consumer", dest="start_consumer", action="store_true", - help="start consumer", - ) - - parser.add_argument( - "-p", "--producer", dest="start_producer", action="store_true", - help="start consumer", - ) - - args = parser.parse_args() - - logging.basicConfig(stream=sys.stdout, level=logging.INFO) - logger = logging.getLogger() - - config_dict = KafkaConfig(parse_config(path=args.conf_file)['kafka']).validate() - - if args.start_consumer: - status_consumer = BanjaxReportConsumer(config_dict, logger) - consumer_thread = threading.Thread(target=status_consumer.run) - consumer_thread.start() - - if args.start_producer: - challenge_producer = ChallengeProducer(config_dict, logger) - producer_thread = threading.Thread(target=challenge_producer.run) - producer_thread.start() - - if args.start_consumer: - consumer_thread.join() - - if args.start_producer: - producer_thread.join() diff --git a/src/baskerville/models/classifier_model.py b/src/baskerville/models/classifier_model.py new file mode 100644 index 00000000..cd704f84 --- /dev/null +++ b/src/baskerville/models/classifier_model.py @@ -0,0 +1,100 @@ +# Copyright (c) 2020, eQualit.ie inc. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+from pyspark.ml.classification import GBTClassifier +from pyspark.ml.pipeline import PipelineModel, Pipeline +from baskerville.models.model_interface import ModelInterface +from baskerville.spark.helpers import map_to_array +import os + +from baskerville.util.file_manager import FileManager +from pyspark.sql import functions as F +from pyspark.ml.linalg import SparseVector, VectorUDT +from pyspark.sql.functions import udf +from pyspark.sql.types import FloatType +import numpy as np + + +def to_sparse(c): + def to_sparse_(v): + if isinstance(v, SparseVector): + return v + vs = v + nonzero = np.nonzero(vs)[0] + return SparseVector(len(v), nonzero, [d for d in vs if d != 0]) + + return F.udf(to_sparse_, VectorUDT())(c) + + +class ClassifierModel(ModelInterface): + + def __init__(self, features=[]): + super().__init__() + self.model = None + self.features = features + + def evaluate(self, df): + P = df.where(F.col('label') == 1).count() + N = df.where(F.col('label') == 0).count() + FP = df.where((F.col('label') == 0) & (F.col('prediction') == 1)).count() + TP = df.where((F.col('label') == 1) & (F.col('prediction') == 1)).count() + + self.logger.info(f'Positives = {P}') + self.logger.info(f'Negatives = {N}') + self.logger.info(f'Recall={TP / P if P > 0 else -1}') + self.logger.info(f'FPR={FP / N if N > 0 else -1}') + + def train(self, df): + df = self.prepare_df(df) + + (trainingData, testData) = df.randomSplit([0.8, 0.2]) + + classifier = GBTClassifier( + labelCol="label", + featuresCol="features_sparse", + maxIter=100) + pipeline = Pipeline(stages=[classifier]) + self.model = pipeline.fit(trainingData) + predictions_test = self.model.transform(testData) + predictions_train = self.model.transform(trainingData) + self.logger.info("Training performance:") + self.evaluate(predictions_train) + self.logger.info("Test performance:") + self.evaluate(predictions_test) + + def prepare_df(self, df): + df = map_to_array(df, map_col='features', array_col='features_vector', + map_keys=self.features, + ) + df = df.withColumn('features_sparse', to_sparse('features_vector')) + return df + + def predict(self, df): + df = self.prepare_df(df) + self.logger.info('GBT transform ...') + df = self.model.transform(df) + second_element = udf(lambda v: float(v[1]), FloatType()) + df = df.withColumn('classifier_score', second_element('probability')) + df = df.drop('features_sparse', 'rawPrediction', 'probability', 'prediction') + return df + + def _get_params_path(self, path): + return os.path.join(path, 'params.json') + + def _get_model_path(self, path): + return os.path.join(path, 'model/') + + def save(self, path, spark_session=None, training_config=None): + file_manager = FileManager(path, spark_session) + file_manager.save_to_file(self.get_params(), self._get_params_path(path), format='json') + self.model.save(self._get_model_path(path)) + + def load(self, path, spark_session=None): + file_manager = FileManager(path, spark_session) + params = file_manager.load_from_file(self._get_params_path(path), format='json') + self.set_params(**params) + + self.model = PipelineModel.load(self._get_model_path(path)) + return self diff --git a/src/baskerville/models/config.py b/src/baskerville/models/config.py index b37138c3..cb3c6050 100644 --- a/src/baskerville/models/config.py +++ b/src/baskerville/models/config.py @@ -4,7 +4,6 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
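A rough usage sketch of the new ClassifierModel defined above, assuming a Spark DataFrame whose 'features' column is a map keyed by feature name and, for training, a 0/1 'label' column; labeled_df, request_df, spark, the feature names and the save path are placeholders:

from baskerville.models.classifier_model import ClassifierModel

feature_names = ['request_rate', 'unique_path_rate']  # hypothetical feature subset
clf = ClassifierModel(features=feature_names)

clf.train(labeled_df)                 # fits the GBT pipeline on an 80/20 split and logs recall/FPR
scored_df = clf.predict(request_df)   # adds a float 'classifier_score' column
clf.save('s3a://some-bucket/models/classifier', spark_session=spark)  # hypothetical path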
- import json import os import warnings @@ -255,10 +254,12 @@ class EngineConfig(Config): cache_path = None save_cache_to_storage = False storage_path = None + incidents_path = None cache_expire_time = None cache_load_past = False cross_reference = False model_path = None + classifier_model_path = None model_id = None extra_features = None verbose = False @@ -272,14 +273,16 @@ class EngineConfig(Config): trigger_challenge = True anomaly_threshold = 0.45 anomaly_threshold_during_incident = 0.35 + classifier_threshold = 0.95 + classifier_threshold_during_incident = 0.8 challenge = 'ip' # supported values : 'ip', 'host' training = None ttl = 500 low_rate_attack_enabled = True low_rate_attack_period = [600, 3600] low_rate_attack_total_request = [400, 2000] - ip_cache_passed_challenge_ttl = 60 * 60 * 24 # 24h - ip_cache_passed_challenge_size = 100000 + ip_cache_passed_challenge_ttl = 60 * 60 * 24 * 3 + ip_cache_passed_challenge_size = 1000000 ip_cache_pending_ttl = 60 * 60 * 1 # 1h ip_cache_pending_size = 100000 save_to_storage = True @@ -303,6 +306,8 @@ class EngineConfig(Config): kafka_topic_sensitive = 'sensitive' client_mode = False + input_is_weblogs = False + input_timestamp_column = 'datestamp' def __init__(self, config, parent=None): super(EngineConfig, self).__init__(config, parent) @@ -502,6 +507,9 @@ class TrainingConfig(Config): - """ model_parameters = dict + classifier_model = 'baskerville.models.classifier_model.ClassifierModel' + incidents_folder = 'attacks' + load_from_storage = True def __init__(self, config, parent=None): super(TrainingConfig, self).__init__(config, parent) @@ -698,6 +706,7 @@ class DatabaseConfig(Config): port = None type = 'postgres' maintenance = None + create_organization = True def __init__(self, config, parent=None): super(DatabaseConfig, self).__init__(config, parent) @@ -742,7 +751,7 @@ class KafkaConfig(Config): Configuration for access to a Kafka instance for the kafka pipeline. 
""" bootstrap_servers = '0.0.0.0:9092' - data_topic = 'deflect.logs' + data_topic = 'deflect.log' features_topic = 'features' feedback_topic = 'feedback' feedback_response_topic = '' @@ -1038,20 +1047,19 @@ def validate(self): logger.debug('Validating UserDetailsConfig...') if not self.username: self.add_error(ConfigError( - f'Please, provide a username', + 'Please, provide a username', ['username'], exception_type=ValueError )) if not self.password: self.add_error(ConfigError( - f'Please, provide a password', + 'Please, provide a password', ['password'], exception_type=ValueError )) if not self.organization_uuid: self.add_error(ConfigError( - f'Please, provide an organization_uuid', + 'Please, provide an organization_uuid', ['organization_uuid'], exception_type=ValueError )) - diff --git a/src/baskerville/models/incident_detector.py b/src/baskerville/models/incident_detector.py index 4faa3ff2..11e068c8 100644 --- a/src/baskerville/models/incident_detector.py +++ b/src/baskerville/models/incident_detector.py @@ -14,6 +14,7 @@ from baskerville.util.db_reader import DBReader import datetime import pandas as pd +import numpy as np class IncidentDetector: @@ -21,18 +22,19 @@ class IncidentDetector: def __init__(self, db_config, time_bucket_in_seconds=120, - time_horizon_in_seconds=600, + time_horizon_in_seconds=400, check_interval_in_seconds=120, - stat_refresh_period_in_minutes=30, + stat_refresh_period_in_minutes=60, stat_window_in_hours=1, min_traffic=3, min_traffic_incident=50, - min_challenged_portion_incident=0.5, + min_anomaly_portion_incident=0.5, sigma_score=2.5, sigma_traffic=2.5, dashboard_url_prefix=None, dashboard_minutes_before=60, dashboard_minutes_after=120, + stop_delay_in_seconds=300, logger=None, mail_sender=None, emails=None): @@ -45,7 +47,7 @@ def __init__(self, self.sigma_traffic = sigma_traffic self.min_traffic = min_traffic self.min_traffic_incident = min_traffic_incident - self.min_challenged_portion_incident = min_challenged_portion_incident + self.min_anomaly_portion_incident = min_anomaly_portion_incident self.db_config = db_config if logger: @@ -67,6 +69,7 @@ def __init__(self, self.dashboard_url_prefix = dashboard_url_prefix self.dashboard_minutes_before = dashboard_minutes_before self.dashboard_minutes_after = dashboard_minutes_after + self.stop_delay_in_seconds = stop_delay_in_seconds self.lock = threading.Lock() def _run(self): @@ -75,6 +78,7 @@ def _run(self): self._detect() is_killed = self.kill.wait(self.check_interval_in_seconds) if is_killed: + self.logger.info('is_killed is True. 
Incident detector stopped.') break def start(self): @@ -107,7 +111,7 @@ def _read_sample(self): seconds=self.time_horizon_in_seconds)).strftime("%Y-%m-%d %H:%M:%S %z") query = f'SELECT floor(extract(epoch from stop)/{self.time_bucket_in_seconds})*' \ f'{self.time_bucket_in_seconds} AS "time", target, ' \ - f'count(distinct ip) as traffic, (sum(prediction*1.0) / count(ip)) as challenged_portion ' \ + f'count(ip) as traffic, (sum(prediction_anomaly*1.0) / count(ip)) as anomaly_portion ' \ f'FROM request_sets WHERE stop > \'{stop}\' ' \ f'and floor(extract(epoch from stop)/{self.time_bucket_in_seconds})*' \ f'{self.time_bucket_in_seconds} in ' \ @@ -162,6 +166,7 @@ def _start_incidents(self, anomalies): new_incidents['id'] = 0 new_incidents['start'] = pd.to_datetime(new_incidents['time'], unit='s', utc=True) new_incidents = new_incidents.drop('time', 1) + new_incidents['first_stop'] = None session, engine = set_up_db(self.db_config.__dict__) @@ -172,7 +177,7 @@ def _start_incidents(self, anomalies): attack.start = start attack.target = row['target'] attack.detected_traffic = row['traffic'] - attack.anomaly_traffic_portion = row['challenged_portion'] + attack.anomaly_traffic_portion = row['anomaly_portion'] dashboard_url = self._get_dashboard_url(row['start'], row['target']) attack.dashboard_url = dashboard_url session.add(attack) @@ -182,8 +187,8 @@ def _start_incidents(self, anomalies): target = row['target'] self.logger.info(f'New incident, target={target}, id={attack.id}, ' f'traffic={row["traffic"]:.0f} ({row["avg_traffic"]:.0f}) ' - f'anomaly_portion={row["challenged_portion"]:.2f} ' - f'({row["avg_challenged_portion"]:.2f}) ' + f'anomaly_portion={row["anomaly_portion"]:.2f} ' + f'({row["avg_anomaly_portion"]:.2f}) ' f'url="{dashboard_url}" ' ) if self.mail_sender and self.emails: @@ -215,7 +220,19 @@ def _stop_incidents(self, regulars): if self.incidents is None or self.incidents.empty: return - stopped_incidents = pd.merge(self.incidents, regulars[['target', 'time']], how='inner', on='target') + # update first_stop column + self.incidents = pd.merge(self.incidents, regulars[['target', 'time']], how='left', on='target') + self.incidents['first_stop'] = np.where( + self.incidents['first_stop'].isnull() & ~(self.incidents['time'].isnull()), + self.incidents['time'], self.incidents['first_stop']) + stopped_incidents = self.incidents[~self.incidents['time'].isnull() + & ~self.incidents['first_stop'].isnull()].copy() + self.incidents = self.incidents.drop('time', 1) + + # filter only the incidents which stopped more then self.stop_delay_in_seconds ago + stopped_incidents['diff'] = stopped_incidents['time'] - stopped_incidents['first_stop'] + stopped_incidents = stopped_incidents[stopped_incidents['diff'] > self.stop_delay_in_seconds] + if len(stopped_incidents) == 0: return @@ -255,11 +272,11 @@ def _detect(self): self.stats_reader.set_query( f'select target, avg(traffic) as avg_traffic, stddev(traffic) as stddev_traffic, ' - f'avg(challenged_portion) as avg_challenged_portion, ' - f'stddev(challenged_portion) as stddev_challenged_portion from' + f'avg(anomaly_portion) as avg_anomaly_portion, ' + f'stddev(anomaly_portion) as stddev_anomaly_portion from' f'(' f'SELECT floor(extract(epoch from stop)/120)*120 AS "time", target, count(ip) as traffic, ' - f'(sum(prediction*1.0) / count(ip)) as challenged_portion ' + f'(sum(prediction_anomaly*1.0) / count(ip)) as anomaly_portion ' f'FROM request_sets WHERE stop > \'{stop}\' ' f'group by 1, 2' f') a ' @@ -268,28 +285,29 @@ def _detect(self): stats 
= self.stats_reader.get() if stats is None: + self.logger.info('No stats') return stats = stats[(~stats['avg_traffic'].isnull()) & (~stats['stddev_traffic'].isnull()) - & (~stats['avg_challenged_portion'].isnull()) & (~stats['stddev_challenged_portion'].isnull()) + & (~stats['avg_anomaly_portion'].isnull()) & (~stats['stddev_anomaly_portion'].isnull()) & (stats['avg_traffic'] > self.min_traffic) - & (stats['avg_challenged_portion'] > 0) - & (stats['avg_challenged_portion'] < 0.6)] + & (stats['avg_anomaly_portion'] > 0) + & (stats['avg_anomaly_portion'] < 0.6)] sample = self._read_sample() if sample is None: + self.logger.info('No sample') return batch = pd.merge(sample, stats, how='left', on='target') - condition = (batch['challenged_portion'] > (batch['avg_challenged_portion'] + - self.sigma_score * batch['stddev_challenged_portion'])) & \ + condition = (batch['anomaly_portion'] > (batch['avg_anomaly_portion'] + + self.sigma_score * batch['stddev_anomaly_portion'])) & \ (batch['traffic'] > (batch['avg_traffic'] + self.sigma_traffic * batch['stddev_traffic'])) & \ - (batch['challenged_portion'] > self.min_challenged_portion_incident) + (batch['anomaly_portion'] > self.min_anomaly_portion_incident) anomalies = batch[condition] regulars = batch[~condition] - with self.lock: self._stop_incidents(regulars) self._start_incidents(anomalies) diff --git a/src/baskerville/models/ip_cache.py b/src/baskerville/models/ip_cache.py index cf59e474..352b1d20 100644 --- a/src/baskerville/models/ip_cache.py +++ b/src/baskerville/models/ip_cache.py @@ -4,50 +4,26 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. -import os -import _pickle as pickle import threading from cachetools import TTLCache - -from baskerville.util.helpers import get_default_ip_cache_path from baskerville.util.singleton_thread_safe import SingletonThreadSafe class IPCache(metaclass=SingletonThreadSafe): - def init_cache(self, path, name, size, ttl): - if os.path.exists(path): - with open(path, 'rb') as f: - result = pickle.load(f) - self.logger.info(f'Loaded {name} IP cache from file {path}...') - else: - result = TTLCache(maxsize=size, ttl=ttl) - self.logger.info(f'A new instance of {name} IP cache has been created') - return result - def __init__(self, config, logger): super().__init__() self.logger = logger self.lock = threading.Lock() - folder_path = get_default_ip_cache_path() - if not os.path.exists(folder_path): - os.mkdir(folder_path) - - self.full_path_passed_challenge = os.path.join(folder_path, 'ip_cache_passed_challenge.bin') - self.cache_passed = self.init_cache( - self.full_path_passed_challenge, - 'passed challenge', + self.cache_passed = TTLCache( config.engine.ip_cache_passed_challenge_size, config.engine.ip_cache_passed_challenge_ttl ) - self.full_path_pending_challenge = os.path.join(folder_path, 'ip_cache_pending.bin') - self.cache_pending = self.init_cache( - self.full_path_pending_challenge, - 'pending challenge', + self.cache_pending = TTLCache( config.engine.ip_cache_pending_size, config.engine.ip_cache_pending_ttl ) @@ -76,16 +52,12 @@ def update(self, ips): 'fails': 0 } - with open(self.full_path_pending_challenge, 'wb') as f: - pickle.dump(self.cache_pending, f) - self.logger.info(f'IP cache pending: {len(self.cache_pending)}, {len(result)} added') - return result def ip_failed_challenge(self, ip): with self.lock: if ip not in self.cache_pending.keys(): - return 0 + return try: value = self.cache_pending[ip] @@ -101,21 +73,8 @@ 
def ip_failed_challenge(self, ip): def ip_passed_challenge(self, ip): with self.lock: if ip in self.cache_passed.keys(): - return False - if ip not in self.cache_pending.keys(): - return False - self.cache_passed[ip] = self.cache_pending[ip] - del self.cache_pending[ip] - - with open(self.full_path_passed_challenge, 'wb') as f: - pickle.dump(self.cache_passed, f) - return True + return + self.cache_passed[ip] = 1 def ip_banned(self, ip): - with self.lock: - try: - del self.cache_pending[ip] - - except KeyError as er: - self.logger.info(f'IP cache key error {er}') - pass + pass diff --git a/src/baskerville/models/labeler.py b/src/baskerville/models/labeler.py index 8b31d060..e925452d 100644 --- a/src/baskerville/models/labeler.py +++ b/src/baskerville/models/labeler.py @@ -4,15 +4,15 @@ import os import threading -from sklearn.ensemble import GradientBoostingClassifier from sqlalchemy.sql.functions import sysdate from baskerville.db import set_up_db, get_jdbc_url from baskerville.db.models import Attack, Attribute import pandas as pd -import numpy as np from pyspark.sql import functions as F -from pyspark.sql.types import StringType, StructField, StructType, DoubleType +from pyspark.sql.types import StringType, StructField, StructType + +from baskerville.models.storage_io import StorageIO class Labeler(object): @@ -20,14 +20,19 @@ class Labeler(object): def __init__(self, db_config, spark, - s3_path, check_interval_in_seconds=120, regular_traffic_before_attack_in_minutes=60, - logger=None): + load_from_storage=True, + logger=None, + storage_path=None, + folder_stream='stream', + folder_attacks='attacks' + ): super().__init__() self.db_config = db_config self.spark = spark - self.s3_path = s3_path + self.folder_attacks = folder_attacks + self.storage_path = storage_path self.check_interval_in_seconds = check_interval_in_seconds self.regular_traffic_before_attack_in_minutes = regular_traffic_before_attack_in_minutes if logger: @@ -37,6 +42,13 @@ def __init__(self, self.logger.addHandler(logging.StreamHandler(sysdate.stdout)) self.thread = None self.kill = threading.Event() + self.load_from_storage = load_from_storage + self.storage_io = StorageIO( + storage_path=storage_path, + spark=self.spark, + logger=self.logger, + subfolder=folder_stream + ) if self.load_from_storage else None def start(self): if self.thread is not None: @@ -61,16 +73,6 @@ def _load_dataset(self, query, engine): self.logger.info(f'{len(df)} records retrieved.') num_ips = len(df['ip'].unique()) self.logger.info(f'{num_ips} unique IPs') - self.logger.info(f'Unwrapping features...') - if len(df) > 0: - ff = df['features'].apply(pd.Series).columns.to_list() - df[ff] = df['features'].apply(pd.Series) - df.drop('features', axis=1, inplace=True) - df.drop('host', axis=1, inplace=True) - # df.drop('country', axis=1, inplace=True) - df.drop('request_total', axis=1, inplace=True) - df.drop('minutes_total', axis=1, inplace=True) - self.logger.info(f'Unwrapping features complete.') df = df.fillna(0) return df @@ -78,76 +80,20 @@ def _label_attack(self, attack, engine): self.logger.info(f'Labeling incident id = {attack.id}, target={attack.target} ...') regular_start = attack.start - datetime.timedelta(minutes=self.regular_traffic_before_attack_in_minutes) - df_regular = self._load_dataset(f'select * from request_sets where target=\'{attack.target}\' and ' + df_regular = self._load_dataset(f'select distinct ip from request_sets where target=\'{attack.target}\' and ' f'stop >= \'{regular_start.strftime("%Y-%m-%d %H:%M:%S")}Z\' and ' 
f'stop < \'{attack.start.strftime("%Y-%m-%d %H:%M:%S")}Z\'', engine) + self.logger.info(f'unique regular = {len(df_regular)}') - df_attack = self._load_dataset(f'select * from request_sets where target=\'{attack.target}\' and ' + df_attack = self._load_dataset(f'select distinct ip from request_sets where target=\'{attack.target}\' and ' f'stop >= \'{attack.start.strftime("%Y-%m-%d %H:%M:%S")}Z\' and ' f'stop < \'{attack.stop.strftime("%Y-%m-%d %H:%M:%S")}Z\'', engine) - regular_ips = df_regular[['ip']].drop_duplicates() - df_attack = df_attack.merge(regular_ips[['ip']], on=['ip'], how='outer', indicator=True) - df_attack = df_attack[df_attack['_merge'] == 'left_only'] + self.logger.info(f'unique attackers = {len(df_attack)}') - features = [ - 'request_rate', - 'css_to_html_ratio', - 'image_to_html_ratio', - 'js_to_html_ratio', - 'path_depth_average', - 'path_depth_variance', - 'payload_size_average', - 'payload_size_log_average', - 'request_interval_average', - 'request_interval_variance', - 'response4xx_to_request_ratio', - 'top_page_to_request_ratio', - 'unique_path_rate', - 'unique_path_to_request_ratio', - 'unique_query_rate', - 'unique_query_to_unique_path_ratio', - 'unique_ua_rate' - ] - - labels = np.ones(len(df_regular), dtype=int) - labels = np.append(labels, np.zeros(len(df_attack), dtype=int)) - dataset = pd.concat([df_regular[features], df_attack[features]]) - - # scaler = StandardScaler() - # dataset = scaler.fit_transform(dataset[features].values) - - model = GradientBoostingClassifier( - n_estimators=500, random_state=777, - max_depth=12, - max_features='auto', - learning_rate=0.05) - model.fit(dataset, labels) - - predictions = model.predict(df_attack[features]) - - incident = df_attack[['ip']].copy() - incident['predictions'] = predictions - attackers_predicted = incident[incident['predictions'] == 0][['ip']].drop_duplicates() - - incident_ips = incident[['ip']].drop_duplicates() - regular_ips = df_regular[['ip']].drop_duplicates() - - attackers_vs_regular_traffic = pd.merge(regular_ips, attackers_predicted, how='inner', on=['ip']) - regulars_vs_incident = pd.merge(regular_ips, incident_ips, how='inner', on=['ip']) - - self.logger.info(f'Number of unique IPs in the incident = {len(incident_ips)}') - self.logger.info(f'Number of predicted attackers = {len(attackers_predicted)}') - self.logger.info(f'Number of predicted regulars = {len(incident_ips) - len(attackers_predicted)}') - self.logger.info( - f'Intersection predicted attackers in regular traffic = {len(attackers_vs_regular_traffic)}, {len(attackers_vs_regular_traffic) / len(regular_ips) * 100:2.1f}%') - self.logger.info(f'Intersection regular traffic in incident = {len(regulars_vs_incident)}') - - # filter out the IPs from regular traffic - attackers = pd.merge(attackers_predicted, regulars_vs_incident, how='outer', on=['ip'], indicator=True) - attackers = attackers[attackers['_merge'] == 'left_only'][['ip']] - - self.logger.info(f'Final number of attackers IP: {len(attackers)}') + df_attack = df_attack.merge(df_regular, on=['ip'], how='outer', indicator=True) + df_attack = df_attack[df_attack['_merge'] == 'left_only'] + self.logger.info(f'df_attack after merge = {len(df_attack)}') # update ips in the database session, engine = set_up_db(self.db_config.__dict__) @@ -158,7 +104,7 @@ def _label_attack(self, attack, engine): return attributes = [] - for ip in attackers['ip']: + for ip in df_attack['ip']: a = Attribute() a.value = ip a.attacks.append(existing_attack) @@ -167,7 +113,7 @@ def _label_attack(self, attack, 
engine): session.add_all(attributes) existing_attack.labeled = True session.commit() - self.logger.info(f'Incident labeled, target={attack.target}, id = {attack.id}, num ips = {len(attackers)}') + self.logger.info(f'Incident labeled, target={attack.target}, id = {attack.id}, num ips = {len(df_attack)}') except Exception as e: self.logger.error(str(e)) @@ -200,26 +146,51 @@ def _load_request_sets(self, query): def _save_df_to_s3(self, df, attack_id): self.logger.info('writing to parquet...') - df.repartition(10).write.parquet(os.path.join(self.s3_path, f'{attack_id}')) + df.repartition(10).write.parquet(os.path.join(self.storage_path, self.folder_attacks, f'{attack_id}')) def _save_attack(self, attack): self.logger.info(f'Saving attack {attack.id} to s3...') attack_ips = [a.value for a in attack.attributes] - attack_ips = self.spark.createDataFrame(data=[[a] for a in attack_ips], + attack_ips = self.spark.createDataFrame(data=[[a] for a in set(attack_ips)], schema=StructType([StructField("ip_attacker", StringType())])) - query = f'(select * from request_sets where ' \ - f'target = \'{attack.target}\' and ' \ - f'stop > \'{attack.start.strftime("%Y-%m-%d %H:%M:%S")}Z\'::timestamp and stop < \'{attack.stop.strftime("%Y-%m-%d %H:%M:%S")}Z\'::timestamp) as attack1 ' - df = self._load_request_sets(query) - if len(df.head(1)) == 0: + regular_start = attack.start - datetime.timedelta(minutes=self.regular_traffic_before_attack_in_minutes) + + if self.load_from_storage: + self.logger.info(f'Loading from storate target={attack.target}, from {regular_start} to {attack.stop}') + df = self.storage_io.load(host=attack.target, start=regular_start, stop=attack.stop) + else: + query = f'(select * from request_sets where ' \ + f'target = \'{attack.target}\' and ' \ + f'stop >= \'{regular_start.strftime("%Y-%m-%d %H:%M:%S")}Z\'::timestamp ' \ + f'and stop < \'{attack.stop.strftime("%Y-%m-%d %H:%M:%S")}Z\'::timestamp) as attack1 ' + self.logger.info(query) + df = self._load_request_sets(query) + + if not df or len(df.head(1)) == 0: print(f'Skipping attack {attack.id}, no records found.') return + self.logger.info(f'Total records = {df.count()}') + self.logger.info(f'Num attacker ips={attack_ips.count()}') + unique_attackers = attack_ips.groupBy('ip_attacker').count().count() + self.logger.info(f'Unique attackers={unique_attackers}') + self.logger.info('Joining the labels...') - df = df.join(attack_ips, df.ip == attack_ips.ip_attacker, how='left') + df = df.join(attack_ips, F.col('ip') == F.col('ip_attacker'), how='left') + self.logger.info(f'After join = {df.count()}') df = df.withColumn('label', F.when(F.col('ip_attacker').isNull(), 0.0).otherwise(1.0)) + num_positives = df.where(F.col('label') == 1).count() + num_negatives = df.where(F.col('label') == 0).count() + self.logger.info(f'Positives= {num_positives}') + self.logger.info(f'Negatives= {num_negatives}') + + num_positives_unique = df.where(F.col('label') == 1).groupBy('ip').count().count() + num_negatives_unique = df.where(F.col('label') == 0).groupBy('ip').count().count() + self.logger.info(f'Positive uniques = {num_positives_unique}') + self.logger.info(f'Negatives unique = {num_negatives_unique}') + self._save_df_to_s3(df, attack.id) self.logger.info('Updating saved_to_cloud column...') @@ -231,7 +202,7 @@ def _save_attack(self, attack): return existing_attack.saved_in_cloud = True session.commit() - self.logger.info(f'Incident saved in s3, target={attack.target}, id = {attack.id}, path={self.s3_path}') + self.logger.info(f'Incident saved in s3, 
target={attack.target}, id = {attack.id}') except Exception as e: self.logger.error(str(e)) diff --git a/src/baskerville/models/pipeline_factory.py b/src/baskerville/models/pipeline_factory.py index 3d598bf3..54453f1b 100644 --- a/src/baskerville/models/pipeline_factory.py +++ b/src/baskerville/models/pipeline_factory.py @@ -6,21 +6,13 @@ from baskerville.models.pipeline_training import TrainingPipeline -from baskerville.models.pipelines import RawLogPipeline, \ - ElasticsearchPipeline, KafkaPipeline +from baskerville.models.pipelines import RawLogPipeline, KafkaPipeline from baskerville.util.enums import RunType class PipelineFactory(object): def get_pipeline(self, run_type, config): - if run_type == RunType.es: - return ElasticsearchPipeline( - config.database, - config.elastic, - config.engine, - config.spark - ) - elif run_type == RunType.rawlog: + if run_type == RunType.rawlog: return RawLogPipeline( config.database, config.engine, @@ -63,6 +55,10 @@ def get_pipeline(self, run_type, config): from baskerville.models.pipeline_tasks.training_pipeline \ import set_up_training_pipeline return set_up_training_pipeline(config) + elif run_type == RunType.training_classifier: + from baskerville.models.pipeline_tasks.training_pipeline \ + import set_up_classifier_training_pipeline + return set_up_classifier_training_pipeline(config) # elif run_type == RunType.dashboard_preprocessing: # from baskerville.models.pipeline_tasks.dashboard_pipeline import \ # set_up_dashboard_preprocessing_pipeline diff --git a/src/baskerville/models/pipeline_helpers.py b/src/baskerville/models/pipeline_helpers.py index 596f4213..cf08887a 100644 --- a/src/baskerville/models/pipeline_helpers.py +++ b/src/baskerville/models/pipeline_helpers.py @@ -5,18 +5,15 @@ # LICENSE file in the root directory of this source tree. 
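The labeler above now derives attacker IPs purely by set difference: any IP seen during the attack window that did not also appear in the regular-traffic window before it. A small pandas sketch of that merge, with made-up IPs:

import pandas as pd

df_regular = pd.DataFrame({'ip': ['1.1.1.1', '2.2.2.2']})
df_attack = pd.DataFrame({'ip': ['2.2.2.2', '3.3.3.3', '4.4.4.4']})

# outer merge with an indicator column; 'left_only' rows exist only in the attack window
merged = df_attack.merge(df_regular, on=['ip'], how='outer', indicator=True)
attackers = merged[merged['_merge'] == 'left_only'][['ip']]
# attackers -> 3.3.3.3 and 4.4.4.4; 2.2.2.2 is excluded because it was also seen
# in the regular-traffic window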
-from baskerville.models.pipelines import ElasticsearchPipeline, \ - RawLogPipeline, KafkaPipeline +from baskerville.models.pipelines import RawLogPipeline, KafkaPipeline from baskerville.util.enums import RunType PIPELINE_TO_RUN_TYPE = { - ElasticsearchPipeline.__name__: RunType.es, RawLogPipeline.__name__: RunType.rawlog, KafkaPipeline.__name__: RunType.kafka, } PIPELINE_TO_CONFIG = { - ElasticsearchPipeline.__name__: ['database', 'elastic', 'engine', 'spark'], RawLogPipeline.__name__: ['database', 'engine', 'spark'], KafkaPipeline.__name__: ['database', 'kafka', 'engine', 'spark'], } diff --git a/src/baskerville/models/pipeline_tasks/client_pipeline.py b/src/baskerville/models/pipeline_tasks/client_pipeline.py index b5ef171b..3ed07bba 100644 --- a/src/baskerville/models/pipeline_tasks/client_pipeline.py +++ b/src/baskerville/models/pipeline_tasks/client_pipeline.py @@ -10,42 +10,42 @@ GenerateFeatures, \ Save, CacheSensitiveData, SendToKafka, \ GetPredictions, MergeWithSensitiveData, RefreshCache, AttackDetection, \ - GetDataLog, Predict, Challenge + GetDataLog, Predict, Challenge, SaveToStorage def set_up_preprocessing_pipeline(config: BaskervilleConfig): if config.engine.use_kafka_for_sensitive_data: steps = [ - GenerateFeatures(config), - SendToKafka( - config=config, - columns=('id_client', 'uuid_request_set', 'features'), - topic=config.kafka.features_topic, - ), - SendToKafka( - config=config, - columns=( - 'id_client', 'id_request_sets', - 'features', - 'target', 'ip', 'num_requests', 'target_original', 'first_ever_request', - 'id_runtime', 'time_bucket', 'start', 'stop', 'subset_count', 'dt', 'features'), - topic=config.engine.kafka_topic_sensitive, - send_to_clearing_house=config.engine.client_mode - ), - RefreshCache(config) - ] + GenerateFeatures(config), + SendToKafka( + config=config, + columns=('id_client', 'uuid_request_set', 'features'), + topic=config.kafka.features_topic, + ), + SendToKafka( + config=config, + columns=( + 'id_client', 'id_request_sets', + 'features', + 'target', 'ip', 'num_requests', 'target_original', 'first_ever_request', + 'id_runtime', 'time_bucket', 'start', 'stop', 'subset_count', 'dt', 'features'), + topic=config.engine.kafka_topic_sensitive, + send_to_clearing_house=config.engine.client_mode + ), + RefreshCache(config) + ] else: steps = [ - GenerateFeatures(config), - CacheSensitiveData(config), - SendToKafka( - config=config, - columns=('id_client', 'uuid_request_set', 'features'), - topic=config.kafka.features_topic, - send_to_clearing_house=config.engine.client_mode - ), - RefreshCache(config) - ] + GenerateFeatures(config), + CacheSensitiveData(config), + SendToKafka( + config=config, + columns=('id_client', 'uuid_request_set', 'features'), + topic=config.kafka.features_topic, + send_to_clearing_house=config.engine.client_mode + ), + RefreshCache(config) + ] task = [ GetDataKafka( config, @@ -95,6 +95,7 @@ def set_up_postprocessing_pipeline(config: BaskervilleConfig): AttackDetection(config), Challenge(config), Save(config), + SaveToStorage(config) ]), ] diff --git a/src/baskerville/models/pipeline_tasks/incident_loader.py b/src/baskerville/models/pipeline_tasks/incident_loader.py new file mode 100644 index 00000000..ad10285f --- /dev/null +++ b/src/baskerville/models/pipeline_tasks/incident_loader.py @@ -0,0 +1,45 @@ +# Copyright (c) 2020, eQualit.ie inc. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
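For orientation, the client pipelines above are assembled as plain lists of tasks, each task optionally wrapping nested steps. A condensed sketch of the non-Kafka-sensitive-data preprocessing branch, assuming the task classes keep the constructor signatures shown in the diff:

from baskerville.models.config import BaskervilleConfig
from baskerville.models.pipeline_tasks.tasks import (
    GetDataKafka, GenerateFeatures, CacheSensitiveData, SendToKafka, RefreshCache
)

def set_up_minimal_preprocessing_pipeline(config: BaskervilleConfig):
    # GetDataKafka consumes one time_bucket batch, then runs the nested steps on it
    return [
        GetDataKafka(
            config,
            steps=[
                GenerateFeatures(config),
                CacheSensitiveData(config),
                SendToKafka(
                    config=config,
                    columns=('id_client', 'uuid_request_set', 'features'),
                    topic=config.kafka.features_topic,
                ),
                RefreshCache(config),
            ],
        ),
    ]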
+from baskerville.models.config import BaskervilleConfig +from baskerville.models.pipeline_tasks.tasks_base import Task +import os + + +class IncidentLoader(Task): + """ + Reads data from s3 cloud storage. + """ + + def __init__( + self, + config: BaskervilleConfig, + steps: list = (), + incident_ids=[] + ): + super().__init__(config, steps) + self.incident_ids = incident_ids + + def load(self): + """ + Loads all the data from s3 + :return: + :rtype: pyspark.sql.Dataframe + """ + dfs = [] + for attack_id in self.incident_ids: + df = self.spark.read.parquet(os.path.join(self.config.engine.incidents_path, f'{attack_id}')) + df = df.select(['ip', 'target', 'stop', 'features', 'label']) + dfs.append(df) + df = dfs[0] + for i in range(1, len(dfs)): + df = df.union(dfs[i]) + + return df + + def run(self): + self.df = self.load() + self.df = super().run() + return self.df diff --git a/src/baskerville/models/pipeline_tasks/prediction_pipeline.py b/src/baskerville/models/pipeline_tasks/prediction_pipeline.py index 56ed2e51..9ddbd7fd 100644 --- a/src/baskerville/models/pipeline_tasks/prediction_pipeline.py +++ b/src/baskerville/models/pipeline_tasks/prediction_pipeline.py @@ -19,7 +19,7 @@ def set_up_prediction_pipeline(config: BaskervilleConfig): Predict(config), SendToKafka( config=config, - columns=('id_client', 'uuid_request_set', 'prediction', 'score'), + columns=('id_client', 'uuid_request_set', 'prediction', 'score', 'classifier_score'), cc_to_client=True, topic=config.kafka.predictions_topic, client_topic=config.kafka.predictions_topic_client, diff --git a/src/baskerville/models/pipeline_tasks/rawlog_pipeline.py b/src/baskerville/models/pipeline_tasks/rawlog_pipeline.py index 7e688ff9..f80e927f 100644 --- a/src/baskerville/models/pipeline_tasks/rawlog_pipeline.py +++ b/src/baskerville/models/pipeline_tasks/rawlog_pipeline.py @@ -8,8 +8,7 @@ from baskerville.models.pipeline_tasks.tasks_base import Task from baskerville.models.config import BaskervilleConfig from baskerville.models.pipeline_tasks.tasks import GenerateFeatures, \ - Save, \ - Predict, GetDataLog, AttackDetection, RefreshCache, Challenge + Save, Predict, GetDataLog, AttackDetection, Challenge def set_up_isac_rawlog_pipeline(config: BaskervilleConfig): diff --git a/src/baskerville/models/pipeline_tasks/service_provider.py b/src/baskerville/models/pipeline_tasks/service_provider.py index 54435988..4efc3cd2 100644 --- a/src/baskerville/models/pipeline_tasks/service_provider.py +++ b/src/baskerville/models/pipeline_tasks/service_provider.py @@ -79,13 +79,12 @@ def refresh_model(self): if self.config.engine.model_id and self._model_ts: seconds_since_last_update = ( - datetime.datetime.utcnow() - self._model_ts + datetime.datetime.utcnow() - self._model_ts ).total_seconds() self.logger.debug( f'Seconds since last update: {seconds_since_last_update}' ) - if seconds_since_last_update > \ - self.config.engine.new_model_check_in_seconds: + if seconds_since_last_update > self.config.engine.new_model_check_in_seconds: self.load_model_from_db() def create_runtime(self): @@ -94,14 +93,14 @@ def create_runtime(self): uuid=self.config.user_details.organization_uuid ).first() if not org: - raise ValueError(f'No such organization.') + raise ValueError('No such organization.') user = self.tools.session.query(User).filter_by( username=self.config.user_details.username).filter_by( id_organization=org.id ).first() if not user: - raise ValueError(f'No such user.') + raise ValueError('No user.') self.runtime = self.tools.create_runtime( 
start=self.start_time, conf=self.config.engine, @@ -137,8 +136,7 @@ def initialize_request_set_cache_service(self): if self.config.engine.cache_load_past: self.request_set_cache = self.request_set_cache.load( update_date=(self.start_time - datetime.timedelta( - seconds=self.config.engine.cache_expire_time) - ).replace(tzinfo=tzutc()), + seconds=self.config.engine.cache_expire_time)).replace(tzinfo=tzutc()), extra_filters=( F.col('time_bucket') == self.time_bucket.sec ) # todo: & (F.col("id_runtime") == self.runtime.id)? diff --git a/src/baskerville/models/pipeline_tasks/setup_pipeline.py b/src/baskerville/models/pipeline_tasks/setup_pipeline.py index 27424f09..065bb916 100644 --- a/src/baskerville/models/pipeline_tasks/setup_pipeline.py +++ b/src/baskerville/models/pipeline_tasks/setup_pipeline.py @@ -19,7 +19,7 @@ def set_up_registration_pipeline(config: BaskervilleConfig): GetDataKafka( config, steps=[ - Register(config), + # Register(config), CacheSensitiveData(config), SendToKafka( config=config, @@ -40,7 +40,7 @@ def set_up_user_creation_pipeline(config: BaskervilleConfig): GetDataKafka( config, steps=[ - Register(config), + # Register(config), CacheSensitiveData(config), SendToKafka( config=config, diff --git a/src/baskerville/models/pipeline_tasks/tasks.py b/src/baskerville/models/pipeline_tasks/tasks.py index 58fa25ef..7893e939 100644 --- a/src/baskerville/models/pipeline_tasks/tasks.py +++ b/src/baskerville/models/pipeline_tasks/tasks.py @@ -4,9 +4,7 @@ # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. - import datetime -from collections import defaultdict import itertools import json @@ -16,46 +14,46 @@ import pyspark from kafka.errors import TopicAlreadyExistsError +from pyspark.files import SparkFiles from baskerville.db.dashboard_models import FeedbackContext from pyspark.sql import functions as F, types as T -from pyspark.sql.types import StringType, StructField, StructType, DoubleType +from pyspark.sql.types import StringType, StructField, StructType, BooleanType from pyspark.streaming import StreamingContext -from functools import reduce -from pyspark.sql import DataFrame from sqlalchemy.exc import SQLAlchemyError +from user_agents import parse from baskerville.db import get_jdbc_url -from baskerville.db.models import RequestSet, Model, Attack -from baskerville.models.banjax_report_consumer import BanjaxReportConsumer +from baskerville.db.models import RequestSet, Model +from baskerville.models.classifier_model import ClassifierModel +from baskerville.models.storage_io import StorageIO from baskerville.models.incident_detector import IncidentDetector from baskerville.models.ip_cache import IPCache -from baskerville.models.metrics.registry import metrics_registry from baskerville.models.pipeline_tasks.tasks_base import Task, MLTask, \ CacheTask from baskerville.models.config import BaskervilleConfig, TrainingConfig from baskerville.spark.helpers import map_to_array, load_test, \ - save_df_to_table, columns_to_dict, get_window, set_unknown_prediction, \ - df_has_rows, get_dtype_for_col, \ + columns_to_dict, get_window, set_unknown_prediction, \ + get_dtype_for_col, \ handle_missing_col from baskerville.spark.schemas import features_schema, \ - prediction_schema, get_message_schema, get_data_schema, \ - get_feedback_context_schema, get_features_schema + prediction_schema, get_message_schema, get_data_schema, get_features_schema from kafka import KafkaProducer from dateutil.tz import tzutc # 
broadcasts +from baskerville.util.banjax_report_consumer import BanjaxReportConsumer from baskerville.util.elastic_writer import ElasticWriter -from baskerville.util.enums import LabelEnum -from baskerville.util.helpers import instantiate_from_str, get_model_path, \ - parse_config +from baskerville.util.helpers import parse_config, get_default_data_path from baskerville.util.helpers import instantiate_from_str, get_model_path from baskerville.util.kafka_helpers import send_to_kafka, read_from_kafka_from_the_beginning from baskerville.util.mail_sender import MailSender from baskerville.util.whitelist_ips import WhitelistIPs -from baskerville.util.whitelist_hosts import WhitelistHosts from baskerville.util.whitelist_urls import WhitelistURLs -from pyspark.sql.functions import broadcast +from pyspark.sql.functions import broadcast, udf + +from geoip2 import database +from geoip2.errors import AddressNotFoundError TOPIC_BC = None KAFKA_URL_BC = None @@ -64,6 +62,53 @@ IP_ACC = None +def parse_ua(ua_string): + # parse library cannot parse None + if ua_string is None: + ua_string = "" + + parsed_string = parse(ua_string) + + output = [ + parsed_string.device.brand, + parsed_string.device.family, + parsed_string.device.model, + + parsed_string.os.family, + parsed_string.os.version_string, + + parsed_string.browser.family, + parsed_string.browser.version_string, + + (parsed_string.is_mobile or parsed_string.is_tablet), + parsed_string.is_bot + ] + # If any of the column have None value it doesn't comply with schema + # and thus throw Null Pointer Exception + for i in range(len(output)): + if output[i] is None: + output[i] = 'Unknown' + return output + + +geoip_schema = StructType([ + StructField('country_name', StringType(), True), +]) + + +@udf(returnType=geoip_schema) +def geoip(ip): + geo = database.Reader(SparkFiles.get('GeoLite2-Country.mmdb')) + + try: + result = geo.country(ip) + pass + except AddressNotFoundError: + return {'country': None} + + return {'country': result.names['en']} + + class GetDataKafka(Task): """ Retrieves data from Kafka in batches of time_bucket seconds. @@ -115,10 +160,61 @@ def initialize(self): kafkaParams=self.kafka_params, ) + if self.config.engine.input_is_weblogs: + self.spark.sparkContext.addFile(os.path.join(get_default_data_path(), 'geoip2', 'GeoLite2-Country.mmdb')) + def get_data(self): - self.df = self.df.map(lambda l: json.loads(l[1])).toDF( - self.data_parser.schema - ).persist(self.spark_conf.storage_level) + if self.config.engine.input_is_weblogs: + self.df = self.df.map(lambda l: json.loads(l[1])) + + schema = T.StructType([ + T.StructField("message", T.StringType(), True) + ]) + self.df = self.df.map(lambda x: [x['message']]).toDF(schema=schema) + + regex = '([(\d\.)]+) - \[(.*?)\] "(.*?)" (.*) (.*) (\d+) (\d+) "(.*?)" (.*?) (.*?) (.*?) (.*?) 
"(.*?)" "(.*?)"' + + self.df = self.df.withColumn('client_ip', F.regexp_extract(F.col('message'), regex, 1)) + self.df = self.df.withColumn('@timestamp', F.regexp_extract(F.col('message'), regex, 2)) + self.df = self.df.withColumn('@timestamp', F.to_timestamp(F.col('@timestamp'), 'dd/MMM/yyyy:HH:mm:ss Z')) + + self.df = self.df.withColumn('request', F.regexp_extract(F.col('message'), regex, 3)) + self.df = self.df.withColumn('client_url', F.regexp_extract(F.col('request'), '(.*) (.*) (.*)', 2)) + self.df = self.df.withColumn('client_request_method', + F.regexp_extract(F.col('request'), '(.*) (.*) (.*)', 1)) + self.df = self.df.drop('request') + + self.df = self.df.withColumn('client_request_host', F.regexp_extract(F.col('message'), regex, 5)) + self.df = self.df.withColumn('http_response_code', F.regexp_extract(F.col('message'), regex, 6)) + self.df = self.df.withColumn('reply_length_bytes', F.regexp_extract(F.col('message'), regex, 7)) + + self.df = self.df.withColumn('client_ua', F.regexp_extract(F.col('message'), regex, 8)) + ua_parser_udf = F.udf(lambda z: parse_ua(z), StructType([ + StructField("device_brand", StringType(), False), + StructField("device_family", StringType(), False), + StructField("device_model", StringType(), False), + + StructField("os_family", StringType(), False), + StructField("os_version", StringType(), False), + + StructField("browser_family", StringType(), False), + StructField("browser_version", StringType(), False), + + StructField("is_mobile", BooleanType(), False), + StructField("is_bot", BooleanType(), False), + ])) + self.df = self.df.withColumn('ua', ua_parser_udf('client_ua')) + + self.df = self.df.withColumn('content_type', F.regexp_extract(F.col('message'), regex, 10)) + self.df = self.df.withColumn('querystring', F.regexp_extract(F.col('message'), regex, 13)) + + self.df = self.df.withColumn('geoip', geoip('client_ip')) + + self.df = self.df.drop('message') + else: + self.df = self.df.map(lambda l: json.loads(l[1])).toDF( + self.data_parser.schema + ).persist(self.spark_conf.storage_level) self.df = load_test( self.df, @@ -130,6 +226,7 @@ def run(self): self.create_runtime() def process_subsets(time, rdd): + self.df_time = time self.logger.info(f'Data until {time} from kafka topic \'{self.consume_topic}\'') if rdd and not rdd.isEmpty(): try: @@ -404,7 +501,7 @@ def process_data(self): self.logger.info('No data in to process.') else: for window_df in get_window( - df_original, self.time_bucket, self.config.spark.storage_level, self.logger + self.df, self.time_bucket, self.config.spark.storage_level, self.logger ): self.df = window_df.repartition( *self.group_by_cols @@ -426,6 +523,35 @@ def run(self): self.batch_i += 1 +class GetDataFromStorage(Task): + def __init__( + self, + config: BaskervilleConfig, + from_date=None, + to_date=None, + load_one_random_batch_from_every_hour=True, + steps: list = () + ): + super().__init__(config, steps) + self.storage_io = None + self.from_date = from_date + self.to_date = to_date + self.load_one_random_batch_from_every_hour = load_one_random_batch_from_every_hour + + def initialize(self): + self.storage_io = StorageIO( + storage_path=self.config.engine.storage_path, + spark=self.spark, + logger=self.logger + ) + + def run(self): + self.df = self.storage_io.load(self.from_date, self.to_date, + load_one_random_batch_from_every_hour=self.load_one_random_batch_from_every_hour) + self.df = super().run() + return self.df + + class GetDataPostgres(Task): """ Reads data from RequestSet's table in Postgres - used for 
training @@ -672,6 +798,8 @@ def white_list_urls(self): prefixes = [] matches = [] stars = [] + double_stars = [] + for url in urls: if url.find('/') < 0: domains.append(url) @@ -683,11 +811,16 @@ def white_list_urls(self): if star_pos == len(url) - 1: prefixes.append(url[:-1]) else: - stars.append((url[:star_pos], url[star_pos + 1:])) + star_pos2 = url.rfind('*') + if star_pos == star_pos2: + stars.append((url[:star_pos], url[star_pos + 1:])) + else: + double_stars.append((url[:star_pos], url[star_pos + 1:-1])) # filter out only the exact domain match if len(domains) > 0: - self.df = self.df.filter(~F.col('target_original').isin(domains)) + for domain in domains: + self.df = self.df.filter(~F.col('target_original').contains(domain)) # concatenate the full path URL self.df = self.df.withColumn('url', F.concat(F.col('target_original'), F.col('client_url'))) @@ -712,6 +845,9 @@ def filter_stars(url): for star in stars: if url and url.startswith(star[0]) and url.endswith(star[1]): return False + for star in double_stars: + if url and url.startswith(star[0]) and star[1] in url[len(star[0]):]: + return False return True self.df = self.df.filter(filter_stars('url')) @@ -1025,8 +1161,13 @@ def add_ids(self): # the current batch, this will cause conflicts with caching - use # e.g. the timestamp too to avoid this + def rename_timestamp_column(self): + if self.config.engine.input_timestamp_column != '@timestamp': + self.df = self.df.withColumn('@timestamp', F.col(self.config.engine.input_timestamp_column)) + def run(self): self.handle_missing_columns() + self.rename_timestamp_column() self.normalize_host_names() # self.df = self.df.repartition('target_original') self.white_list_ips() @@ -1051,6 +1192,14 @@ class Predict(MLTask): def __init__(self, config: BaskervilleConfig, steps=()): super().__init__(config, steps) self._is_initialized = False + self.classifier_model = None + + def initialize(self): + super().initialize() + + if self.config.engine.classifier_model_path: + self.classifier_model = ClassifierModel() + self.classifier_model.load(self.config.engine.classifier_model_path, spark_session=self.spark) def handle_missing_features(self): """ @@ -1070,6 +1219,12 @@ def handle_missing_features(self): self.df = self.df.fillna(default_value, subset=[feat_dict_col]) def predict(self): + if self.classifier_model: + self.logger.info('Classifier predicting...') + self.df = self.classifier_model.predict(self.df) + else: + self.logger.info('Classifier model is not used') + if self.model: self.df = self.model.predict(self.df) else: @@ -1116,17 +1271,16 @@ def __init__( def run(self): self.config.database.conn_str = self.db_url - - if df_has_rows(self.df): - save_df_to_table( - self.df, - self.table_model.__tablename__, - self.config.database.__dict__, - json_cols=self.json_cols, - storage_level=self.config.spark.storage_level, - mode=self.mode, - db_driver=self.config.spark.db_driver - ) + self.df.na.drop() + # save_df_to_table( + # self.df, + # self.table_model.__tablename__, + # self.config.database.__dict__, + # json_cols=self.json_cols, + # storage_level=self.config.spark.storage_level, + # mode=self.mode, + # db_driver=self.config.spark.db_driver + # ) self.df = super().run() return self.df @@ -1142,11 +1296,13 @@ def __init__(self, config, json_cols=('features',), mode='append', not_common=( - 'prediction', - 'model_version', - 'label', - 'id_attribute', - 'updated_at') + 'prediction', + 'prediction_anomaly', + 'prediction_classifier', + 'model_version', + 'label', + 'id_attribute', + 'updated_at') 
): self.not_common = set(not_common) super().__init__(config, steps, table_model, json_cols, mode) @@ -1180,6 +1336,26 @@ def run(self): return self.df +class SaveToStorage(Task): + def __init__( + self, + config, + steps=() + ): + super().__init__(config, steps) + self.storage_io = None + + def initialize(self): + self.storage_io = StorageIO(self.config.engine.storage_path, spark=self.spark, logger=self.logger) + + def run(self): + if not self.config.engine.save_to_storage: + return self.df + + self.storage_io.save(self.df, self.df_time) + return self.df + + class SaveFeedback(SaveDfInPostgres): def __init__(self, config, steps=(), @@ -1187,11 +1363,11 @@ def __init__(self, config, json_cols=('features',), mode='append', not_common=( - 'prediction', - 'model_version', - 'label', - 'id_attribute', - 'updated_at') + 'prediction', + 'model_version', + 'label', + 'id_attribute', + 'updated_at') ): self.not_common = set(not_common) super().__init__(config, steps, table_model, json_cols, mode) @@ -1270,7 +1446,8 @@ def prepare_to_save(self): self.df = SaveDfInPostgres.run(self) self.df = self.df.groupBy('uuid_organization', 'id_context').count().toDF() self.df = self.df.withColumn('success', F.lit(True)) - except: + except Exception as exp: + self.logger.error(exp) self.df = self.df.withColumn('success', F.lit(False)) def run(self): @@ -1403,10 +1580,10 @@ def run(self): self.df_sensitive, on=['id_client', 'uuid_request_set'], how='inner' ).drop('df.id_client', 'df.uuid_request_set') - if self.df and self.df.head(1): + if self.df: merge_count = self.df.count() - if count != merge_count: + if (merge_count > 0) and (count != merge_count): self.logger.warning('@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@') self.logger.warning('@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@') self.logger.warning('@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@@') @@ -1462,7 +1639,8 @@ def run(self): cc_to_client=self.cc_to_client, client_topic=self.client_topic, client_connections=self.client_connections, - use_partitions=self.config.engine.kafka_send_by_partition + use_partitions=self.config.engine.kafka_send_by_partition, + logger=self.logger ) return self.df @@ -1474,11 +1652,13 @@ def __init__( self, config: BaskervilleConfig, steps: list = (), + convert_features_from_json=False ): super().__init__(config, steps) self.model = None self.training_conf = self.config.engine.training self.engine_conf = self.config.engine + self.convert_features_from_json = convert_features_from_json def initialize(self): super().initialize() @@ -1498,7 +1678,7 @@ def load_dataset(self, df, features): fractions[key] = 1.0 dataset = dataset.sampleBy('target', fractions, 777) - self.logger.debug(f'Unwrapping features from json...') + self.logger.debug('Unwrapping features from json...') schema = StructType([]) for feature in features: schema.add(StructField( @@ -1557,7 +1737,7 @@ def run(self): self.model.set_params(**params) self.model.set_logger(self.logger) - dataset = self.load_dataset(self.df, self.model.features) + dataset = self.load_dataset(self.df, self.model.features) if self.convert_features_from_json else self.df self.model.train(dataset) dataset.unpersist() @@ -1673,6 +1853,35 @@ def run(self): return self.df +def dynamic_threshold(score, attack_prediction, threshold, threshold_attack): + if not score: + return 0 + + if attack_prediction == 1: + if score > threshold_attack: + return 1 + else: + return 0 + else: + if score > threshold: + return 1 + else: + return 0 + + +def prediction(prediction_anomaly, prediction_classifier, attack_prediction): + if 
attack_prediction == 1: + if (prediction_anomaly == 1) or (prediction_classifier == 1): + return 1 + else: + return 0 + else: + if (prediction_anomaly == 1) and (prediction_classifier == 1): + return 1 + else: + return 0 + + class AttackDetection(Task): """ Calculates prediction per IP, attack_score per Target, regular vs anomaly counts, attack_prediction @@ -1680,9 +1889,7 @@ class AttackDetection(Task): def __init__(self, config, steps=()): super().__init__(config, steps) - self.report_consumer = None - self.banjax_thread = None - self.register_metrics = config.engine.register_banjax_metrics + self.report_consumer = BanjaxReportConsumer(config, self.logger) self.low_rate_attack_schema = None self.time_filter = None self.lra_condition = None @@ -1705,93 +1912,51 @@ def initialize(self): name='request_total', dataType=StringType(), nullable=True )]) self.time_filter = ( - F.abs(F.unix_timestamp(F.col('stop'))) - - F.abs(F.unix_timestamp(F.col('start'))) + F.abs(F.unix_timestamp(F.col('stop'))) - F.abs(F.unix_timestamp(F.col('start'))) ) self.lra_condition = ( - ((F.col('features.request_total') > lr_attack_period[0]) & - (self.time_filter > lra_total_req[0])) | - ((F.col('features.request_total') > lr_attack_period[1]) & - (self.time_filter > lra_total_req[1])) + ((F.col('features.request_total') > lr_attack_period[0]) & + (self.time_filter > lra_total_req[0])) | + ((F.col('features.request_total') > lr_attack_period[1]) & + (self.time_filter > lra_total_req[1])) ) - self.report_consumer = BanjaxReportConsumer(self.config, self.logger) - if self.register_metrics: - self.register_banjax_metrics() - self.banjax_thread = threading.Thread(target=self.report_consumer.run) - self.banjax_thread.start() if self.incident_detector is not None: self.incident_detector.start() - def finish_up(self): - if self.banjax_thread: - self.banjax_thread.join() - - super().finish_up() - - def register_banjax_metrics(self): - from baskerville.util.enums import MetricClassEnum - - def incr_counter_for_ip_failed_challenge(metric, self, return_value): - metric.labels(return_value.get('value_ip'), return_value.get('value_site')).inc() - return return_value - - consume_ip_failed_challenge_message = metrics_registry.register_action_hook( - self.report_consumer.consume_ip_failed_challenge_message, - incr_counter_for_ip_failed_challenge, - metric_name='ip_failed_challenge_on_website', - metric_cls=MetricClassEnum.counter, - labelnames=['ip', 'website'] - ) - - setattr(self.report_consumer, 'consume_ip_failed_challenge_message', consume_ip_failed_challenge_message) - - for field_name in self.report_consumer.status_message_fields: - target_method = getattr(self.report_consumer, f"consume_{field_name}") - - def setter_for_field(field_name_inner): - def label_with_id_and_set(metric, self, return_value): - metric.labels(return_value.get('id')).set(return_value.get(field_name_inner)) - return return_value - - return label_with_id_and_set - - patched_method = metrics_registry.register_action_hook( - target_method, - setter_for_field(field_name), - metric_name=field_name.replace('.', '_'), - metric_cls=MetricClassEnum.gauge, - labelnames=['banjax_id'] - ) - - setattr(self.report_consumer, f"consume_{field_name}", patched_method) - self.logger.info(f"Registered metric for {field_name}") + consumer_thread = threading.Thread(target=self.report_consumer.run) + consumer_thread.start() def classify_anomalies(self): self.logger.info('Anomaly thresholding...') if self.incident_detector: self.logger.info('Getting hosts with 
incidents...') hosts = self.incident_detector.get_hosts_with_incidents() + self.logger.info(f'Number of hosts under attack {len(hosts)}.') + + self.df = self.df.withColumn('attack_prediction', + F.when(F.col('target').isin(hosts), + F.lit(1)).otherwise(F.lit(0))) else: hosts = [] + self.df = self.df.withColumn('attack_prediction', F.lit(1)) - self.logger.info(f'Number of hosts under attack {len(hosts)}.') - - self.df = self.df.withColumn('attack_prediction', - F.when(F.col('target').isin(hosts), - F.lit(1)).otherwise(F.lit(0))) + self.logger.info('Dynamic thresholds calculation...') + self.df = self.df.withColumn('prediction_anomaly', F.udf(dynamic_threshold, T.IntegerType())( + 'score', 'attack_prediction', + F.lit(self.config.engine.anomaly_threshold), + F.lit(self.config.engine.anomaly_threshold_during_incident) + )) - self.logger.info(f'Dynamic thresholds calculation...') - self.df = self.df.withColumn('threshold', - F.when(F.col('target').isin(hosts), - F.lit(self.config.engine.anomaly_threshold_during_incident)).otherwise( - F.lit(self.config.engine.anomaly_threshold))) - self.logger.info(f'Dynamic thresholding...') - self.df = self.df.withColumn( - 'prediction', - F.when(F.col('score') > F.col('threshold'), F.lit(1)).otherwise(F.lit(0))) + self.df = self.df.withColumn('prediction_classifier', F.udf(dynamic_threshold, T.IntegerType())( + 'classifier_score', 'attack_prediction', + F.lit(self.config.engine.classifier_threshold), + F.lit(self.config.engine.classifier_threshold_during_incident) + )) - self.df = self.df.drop('threshold') + self.df = self.df.withColumn('prediction', F.udf(prediction, T.IntegerType())( + 'prediction_anomaly', 'prediction_classifier', 'attack_prediction' + )) def detect_low_rate_attack(self): if not self.config.engine.low_rate_attack_enabled: @@ -1845,11 +2010,6 @@ def __init__( self.attack_filter = None self.producer = None self.ip_cache = IPCache(config, self.logger) - self.whitelist_hosts = WhitelistHosts( - url=config.engine.url_whitelist_hosts, - logger=self.logger, - refresh_period_in_seconds=config.engine.dashboard_config_refresh_period_in_seconds - ) if config.elastic: self.elastic_writer = ElasticWriter(host=config.elastic.host, port=config.elastic.port, @@ -1910,18 +2070,13 @@ def get_attack_filter(self): def send_challenge(self): df_ips = self.get_attack_df() if self.config.engine.challenge == 'ip': - if not df_has_rows(df_ips): - self.logger.debug('No attacks to be challenged...') - return + self.df = self.df.withColumn('challenged', F.lit(0)) # host white listing hosts = [] if self.config.engine.white_list_hosts: hosts = self.config.engine.white_list_hosts - if self.whitelist_hosts.get(): - hosts += self.whitelist_hosts.get() - if len(hosts): df_white_list_hosts = self.spark.createDataFrame( [[host] for host in set(hosts)], ['target']).withColumn('white_list_host', F.lit(1)) @@ -1930,46 +2085,51 @@ def send_challenge(self): ).persist() df_ips = df_ips.where(F.col('white_list_host').isNull()) - if df_has_rows(df_ips): - ips = [(r['ip'], r['target'], r['low_rate_attack']) for r in df_ips.collect()] - ips = self.ip_cache.update(ips) - num_records = len(ips) - if num_records > 0: - # challenged_ips = self.spark.createDataFrame( - # [[ip, 1] for ip in ips], ['ip', 'challenged'] - # ) - self.df = self.df.withColumn( - 'challenged', - F.when(F.col('ip').isin([f'{ip}' for ip, _, _ in ips]), 1).otherwise(0) - ) - # self.df = self.df.join(challenged_ips, on='ip', how='left') - # self.df = self.df.fillna({'challenged': 0}) - - self.logger.info( - f'Sending 
{num_records} IP challenge commands to ' - f'kafka topic \'{self.config.kafka.banjax_command_topic}\'...') - null_ips = False - for ip, _, _ in ips: - if ip: - message = json.dumps( - {'name': 'challenge_ip', 'value': ip} - ).encode('utf-8') - self.producer.send(self.config.kafka.banjax_command_topic, message) - else: - null_ips = True + ips = [(r['ip'], r['target'], r['low_rate_attack']) for r in df_ips.collect()] + ips = self.ip_cache.update(ips) + num_records = len(ips) + if num_records > 0: + # challenged_ips = self.spark.createDataFrame( + # [[ip, 1] for ip in ips], ['ip', 'challenged'] + # ) + self.df = self.df.withColumn( + 'challenged', + F.when(F.col('ip').isin([f'{ip}' for ip, _, _ in ips]), 1).otherwise(0) + ) + # self.df = self.df.join(challenged_ips, on='ip', how='left') + # self.df = self.df.fillna({'challenged': 0}) + + self.logger.info( + f'Sending {num_records} IP challenge commands to ' + f'kafka topic \'{self.config.kafka.banjax_command_topic}\'...') + null_ips = False + for ip, target, _ in ips: + if ip: + message = json.dumps( + { + 'Name': 'challenge_ip', + 'Value': ip, + 'host': target, + 'source': 'bask' + } + ).encode('utf-8') + self.producer.send(self.config.kafka.banjax_command_topic, message, + key=bytearray(target, encoding='utf8')) + else: + null_ips = True - if self.elastic_writer: - with self.elastic_writer as elastic_writer: - for ip, target, low_rate_attack in ips: - if ip: - elastic_writer.write_challenge(ip, host=target, - reason='low_rate' if low_rate_attack else 'anomaly') + if self.elastic_writer: + with self.elastic_writer as elastic_writer: + for ip, target, low_rate_attack in ips: + if ip: + elastic_writer.write_challenge(ip, host=target, + reason='low_rate' if low_rate_attack else 'anomaly') - if null_ips: - self.logger.info('Null ips') - self.logger.info(f'{ips}') + if null_ips: + self.logger.info('Null ips') + self.logger.info(f'{ips}') - self.producer.flush() + self.producer.flush() # # return @@ -1994,7 +2154,8 @@ def send_challenge(self): # # elif self.config.engine.challenge == 'ip': # col_of_interest = 'ip' - # df_to_challenge = self.df.select('ip', 'target').where( # this does not look right. Why (F.col('attack_prediction') == 1) & (F.col('prediction') == 1)? + # df_to_challenge = self.df.select('ip', 'target'). + # where( # this does not look right. Why (F.col('attack_prediction') == 1) & (F.col('prediction') == 1)? 
# (F.col('attack_prediction') == 1) & # (F.col('prediction') == 1) | # (F.col('low_rate_attack') == 1) @@ -2050,7 +2211,10 @@ def send_challenge(self): # self.logger.debug('No challenge flag is set, moving on...') def get_attack_df(self): - return self.df.select('ip', 'target', 'low_rate_attack').where(self.attack_filter).cache() + return self.df.select('ip', 'target', 'low_rate_attack').where( + # self.attack_filter + (F.col('prediction') == 1) | (F.col('low_rate_attack') == 1) + ) def filter_out_load_test(self): if self.config.engine.load_test: @@ -2067,11 +2231,11 @@ def filter_out_load_test(self): ).show() def run(self): - if df_has_rows(self.df): - self.df = self.df.withColumn('challenged', F.lit(0)) - self.filter_out_load_test() - self.send_challenge() - else: - self.logger.info('Nothing to be challenged...') + # if df_has_rows(self.df): + self.df = self.df.withColumn('challenged', F.lit(0)) + self.filter_out_load_test() + self.send_challenge() + # else: + # self.logger.info('Nothing to be challenged...') self.df = super().run() return self.df diff --git a/src/baskerville/models/pipeline_tasks/tasks_base.py b/src/baskerville/models/pipeline_tasks/tasks_base.py index f7f32b6e..2dbe3d07 100644 --- a/src/baskerville/models/pipeline_tasks/tasks_base.py +++ b/src/baskerville/models/pipeline_tasks/tasks_base.py @@ -20,11 +20,13 @@ class Task(object, metaclass=abc.ABCMeta): name: str df: pyspark.sql.DataFrame + df_time: datetime steps: List['Task'] config: BaskervilleConfig def __init__(self, config: BaskervilleConfig, steps: list = ()): self.df = None + self.df_time = None self.config = config self.steps = steps self.step_to_action = OrderedDict({ @@ -83,6 +85,10 @@ def set_df(self, df): self.df = df return self + def set_df_time(self, time): + self.df_time = time + return self + def initialize(self): self.service_provider.initialize_db_tools_service() self.service_provider.initialize_spark_service() @@ -99,7 +105,9 @@ def run(self): self.remaining_steps = list(self.step_to_action.keys()) for descr, task in self.step_to_action.items(): self.logger.info('Starting step {}'.format(descr)) - self.df = task.set_df(self.df).run() + task.set_df(self.df) + task.set_df_time(self.df_time) + self.df = task.run() self.logger.info('Completed step {}'.format(descr)) self.remaining_steps.remove(descr) return self.df diff --git a/src/baskerville/models/pipeline_tasks/train_classifier.py b/src/baskerville/models/pipeline_tasks/train_classifier.py new file mode 100644 index 00000000..b02712a2 --- /dev/null +++ b/src/baskerville/models/pipeline_tasks/train_classifier.py @@ -0,0 +1,53 @@ +# Copyright (c) 2020, eQualit.ie inc. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
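# --- Illustrative sketch only (not part of the patch above) ----------------
# Why: the tasks_base.py hunk threads a batch timestamp (df_time) through the
# step chain next to the DataFrame, so downstream tasks such as SaveToStorage
# can bucket output by the Kafka window time.
# What it shows: a stripped-down stand-in with the same set_df / set_df_time /
# run contract. SimpleTask, PrintTimeStep and the toy payload are hypothetical
# names used only for this sketch, not Baskerville's real Task class.
from datetime import datetime


class SimpleTask:
    def __init__(self, steps=()):
        self.df = None         # batch payload (a Spark DataFrame in Baskerville)
        self.df_time = None    # timestamp of the batch the payload belongs to
        self.steps = list(steps)

    def set_df(self, df):
        self.df = df
        return self

    def set_df_time(self, time):
        self.df_time = time
        return self

    def run(self):
        # Each child step receives both the data and its batch time before
        # running, mirroring the run() loop in tasks_base.py.
        for step in self.steps:
            step.set_df(self.df)
            step.set_df_time(self.df_time)
            self.df = step.run()
        return self.df


class PrintTimeStep(SimpleTask):
    def run(self):
        print(f'processing batch from {self.df_time}: {self.df}')
        return super().run()


if __name__ == '__main__':
    pipeline = SimpleTask(steps=[PrintTimeStep()])
    pipeline.set_df(['row1', 'row2']).set_df_time(datetime(2021, 1, 1, 10, 20))
    pipeline.run()
# ----------------------------------------------------------------------------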
+ +from pyspark.ml.linalg import SparseVector, VectorUDT + +from baskerville.models.config import BaskervilleConfig +from baskerville.models.pipeline_tasks.tasks_base import Task +from baskerville.util.helpers import instantiate_from_str, get_classifier_model_path +import numpy as np +import pyspark.sql.functions as F + + +def to_sparse(c): + def to_sparse_(v): + if isinstance(v, SparseVector): + return v + vs = v + nonzero = np.nonzero(vs)[0] + return SparseVector(len(v), nonzero, [d for d in vs if d != 0]) + + return F.udf(to_sparse_, VectorUDT())(c) + + +class TrainClassifier(Task): + + def __init__( + self, + config: BaskervilleConfig, + steps: list = (), + ): + super().__init__(config, steps) + self.model = None + self.training_conf = self.config.engine.training + self.engine_conf = self.config.engine + + def save(self): + model_path = get_classifier_model_path(self.engine_conf.storage_path, self.model.__class__.__name__) + self.model.save(path=model_path, spark_session=self.spark) + self.logger.debug(f'The new classifier model has been saved to: {model_path}') + + def run(self): + self.model = instantiate_from_str(self.training_conf.classifier_model) + + params = self.training_conf.classifier_parameters + self.model.set_params(**params) + self.model.set_logger(self.logger) + + self.model.train(self.df) + + self.save() diff --git a/src/baskerville/models/pipeline_tasks/training_pipeline.py b/src/baskerville/models/pipeline_tasks/training_pipeline.py index b945c9ed..5e9c3524 100644 --- a/src/baskerville/models/pipeline_tasks/training_pipeline.py +++ b/src/baskerville/models/pipeline_tasks/training_pipeline.py @@ -3,25 +3,57 @@ # # This source code is licensed under the BSD-style license found in the # LICENSE file in the root directory of this source tree. 
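# --- Illustrative sketch only (not part of the patch above) ----------------
# Why: TrainClassifier consumes feature vectors, and the to_sparse helper added
# in train_classifier.py converts dense vector columns to SparseVector so the
# classifier sees one consistent vector type.
# What it shows: a minimal usage of to_sparse on a tiny DataFrame. The local
# SparkSession and sample rows are assumptions made only for this sketch, and
# it presumes the patched baskerville package is importable.
from pyspark.ml.linalg import Vectors
from pyspark.sql import SparkSession

from baskerville.models.pipeline_tasks.train_classifier import to_sparse

if __name__ == '__main__':
    spark = SparkSession.builder.master('local[1]').appName('to_sparse demo').getOrCreate()

    df = spark.createDataFrame(
        [(Vectors.dense([0.0, 3.5, 0.0, 1.0]),),
         (Vectors.sparse(4, [0], [2.0]),)],   # already-sparse rows pass through unchanged
        ['features']
    )

    # The dense row becomes SparseVector(4, {1: 3.5, 3: 1.0}); sparse rows are kept as-is.
    df.withColumn('features', to_sparse('features')).show(truncate=False)

    spark.stop()
# ----------------------------------------------------------------------------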
+from datetime import datetime - +from baskerville.models.pipeline_tasks.incident_loader import IncidentLoader from baskerville.models.pipeline_tasks.tasks_base import Task from baskerville.models.config import BaskervilleConfig -from baskerville.models.pipeline_tasks.tasks import GetDataPostgres, Train +from baskerville.models.pipeline_tasks.tasks import GetDataPostgres, Train, GetDataFromStorage +from baskerville.models.pipeline_tasks.train_classifier import TrainClassifier def set_up_training_pipeline(config: BaskervilleConfig): data_params = config.engine.training.data_parameters + if config.engine.training.load_from_storage: + training_tasks = [ + GetDataFromStorage( # or any other source + config, + from_date=datetime.strptime(data_params.get('from_date'), + config.engine.datetime_format), + to_date=datetime.strptime(data_params.get('to_date'), + config.engine.datetime_format), + steps=[ + Train(config) + ]), + + ] + else: + training_tasks = [ + GetDataPostgres( # or any other source + config, + from_date=data_params.get('from_date'), + to_date=data_params.get('to_date'), + training_days=data_params.get('training_days'), + sampling_percentage=data_params.get('sampling_percentage', 10.0), + steps=[ + Train(config) + ]), + + ] + + training_pipeline = Task(config, training_tasks) + training_pipeline.name = 'Training Pipeline' + return training_pipeline + + +def set_up_classifier_training_pipeline(config: BaskervilleConfig): training_tasks = [ - GetDataPostgres( # or any other source + IncidentLoader( config, - from_date=data_params.get('from_date'), - to_date=data_params.get('to_date'), - training_days=data_params.get('training_days'), - sampling_percentage=data_params.get('sampling_percentage', 10.0), + incident_ids=config.engine.training.classifier_incidents, steps=[ - Train(config), + TrainClassifier(config) ]), ] diff --git a/src/baskerville/models/pipelines.py b/src/baskerville/models/pipelines.py index 1b31cc50..45b4db5a 100644 --- a/src/baskerville/models/pipelines.py +++ b/src/baskerville/models/pipelines.py @@ -7,8 +7,7 @@ import json import traceback -from datetime import timedelta, datetime -import math +from datetime import datetime from baskerville.models.base_spark import SparkPipelineBase from pyspark.streaming import StreamingContext @@ -18,167 +17,6 @@ print('Cannot import KafkaUtils - check pyspark version') -class ElasticsearchPipeline(SparkPipelineBase): - """ - A pipeline for processing data directly from an ElasticSearch instance. - """ - - def __init__( - self, db_conf, els_conf, engine_conf, spark_conf, clean_up=True - ): - super(ElasticsearchPipeline, self).__init__( - db_conf, engine_conf, spark_conf, clean_up - ) - - self.els_conf = els_conf - self.manual_conf = engine_conf.es_log - self.start = self.manual_conf.start - self.stop = self.manual_conf.stop - self.batch_length = self.manual_conf.batch_length - self.batch_timedelta = timedelta(minutes=int(self.batch_length)) - self.hosts = None - if self.manual_conf.hosts: - self.hosts = ', '.join(self.manual_conf.hosts) - self.save_logs_dir = self.manual_conf.save_logs_dir - self.batch_start = self.start - self.batch_stop = self.batch_start + self.batch_timedelta - self.batch_i = 1 - self.batch_n = math.ceil( - float((self.stop - self.start).total_seconds()) / - (self.batch_length * 60.) - ) - - def initialize(self): - """ - Start sessions, initialize cache/features/model/dfs. 
- :return: - """ - super().initialize() - self.set_up_es() - - def run(self): - - while self.batch_start < self.stop: - self.batch_stop = self.batch_start + self.batch_timedelta - - self.create_runtime() - self.get_data() - self.process_data() - self.reset() - self.batch_start = self.batch_stop - self.batch_i += 1 - - def create_runtime(self): - self.runtime = self.tools.create_runtime( - start=self.batch_start, - stop=self.batch_stop, - target_site=self.hosts, - conf=self.engine_conf, - comment=f'batch runtime {self.batch_i} of {self.batch_n}' - ) - - def get_data(self): - from pyspark.sql import functions as F - - filter_condition = (F.col('@timestamp') >= self.runtime.start) & \ - (F.col('@timestamp') < self.runtime.stop) - - if self.hosts is not None: - host_filter = (F.col('client_request_host') - == self.manual_conf.hosts[0]) - if len(self.manual_conf.hosts) > 1: - for h in self.manual_conf.hosts[1:]: - host_filter = host_filter | ( - F.col('client_request_host') == h - ) - filter_condition = filter_condition & host_filter - - self.logs_df = self.es_storage.get( - self.runtime.start, - self.runtime.stop, - filter_condition=filter_condition, - extra_config={ - 'es.mapping.include': ','.join( - self.group_by_cols + self.feature_manager.active_columns - ) - }, - columns_to_keep=list( - self.group_by_cols + self.feature_manager.active_columns - ) - ).select( - *self.group_by_cols, *self.feature_manager.active_columns - ).persist(self.spark_conf.storage_level) - - self.logger.info('Will be retrieving {} rows'.format( - self.logs_df.count() - ) - ) - - if self.save_logs_dir: - log_name = f'/{self.runtime.start.strftime("%Y-%m-%d-%H%M%S")}' \ - f'_' + \ - f'{self.runtime.stop.strftime("%Y-%m-%d-%H%M%S")}' - if self.runtime.target: - log_name += f'_{"_".join(self.runtime.target)}' \ - if isinstance(self.runtime.target, list) \ - else f'_{self.runtime.target}' - self.save_logs(self.logs_df, self.save_logs_dir + log_name) - - def set_up_es(self): - - from es_retriever.es.storage import EsStorage - from es_retriever.config import Config - - self.es_config = Config( - es_url=self.els_conf.host, - es_user=self.els_conf.user, - es_pass=self.els_conf.password, - es_base_index=self.els_conf.base_index, - es_index_type=self.els_conf.index_type, - ) - # todo: fix this in es-retriever: fix setup to include jars - conf = self.es_config.spark_conf.copy() - conf['spark.jars'] = self.spark_conf.jars - self.es_config.spark_conf = conf - self.es_storage = EsStorage(self.es_config, init_session=False) - self.es_storage.spark_conf = conf - self.es_storage.session_getter = self.es_session_getter - self.es_storage.session_getter() - - def es_session_getter(self): - from pyspark.sql import SparkSession - from pyspark import SparkConf - - conf = SparkConf() - conf.set('spark.logConf', 'true') - conf.set('spark.jars', self.spark_conf.jars) - conf.set('spark.driver.memory', '6G') - conf.set( - 'spark.sql.session.timeZone', self.spark_conf.session_timezone - ) - conf.set('spark.sql.shuffle.partitions', - self.spark_conf.shuffle_partitions) - - spark = SparkSession.builder \ - .config(conf=conf) \ - .appName('Baskerville Spark') \ - .getOrCreate() - - if self.spark_conf.log_level: - spark.sparkContext.setLogLevel(self.spark_conf.log_level) - - spark.conf.set('spark.jars', self.spark_conf.jars) - - for k, v in self.es_config.es_read_conf.items(): - spark.conf.set(k, v) - spark.conf.set("es.port", "9200") - return spark - - def save_logs(self, spark_df, save_logs_path): - 
spark_df.coalesce(1).write.mode('overwrite').format('json').save( - save_logs_path) - - class RawLogPipeline(SparkPipelineBase): """ A pipeline that processes a list of raw files. diff --git a/src/baskerville/models/request_set_cache.py b/src/baskerville/models/request_set_cache.py index 25fbfb1d..67be96f2 100644 --- a/src/baskerville/models/request_set_cache.py +++ b/src/baskerville/models/request_set_cache.py @@ -171,7 +171,7 @@ def load(self, update_date=None, hosts=None, extra_filters=None): update_date=update_date, hosts=hosts, extra_filters=extra_filters - )#.persist(self.storage_level) + ) self.write() @@ -225,7 +225,7 @@ def update_df( self.cache.select(*select_cols).alias('cache'), list(join_cols), how='left_outer' - )#.persist(self.storage_level) + ) # update nulls and filter drop duplicate columns for c in select_cols: @@ -270,7 +270,7 @@ def filter_by(self, df, columns=None): how='inner' ).drop( 'a.ip' - ) #.persist(self.storage_level) + ) else: if self.__cache: self.__cache = self.__cache.join( @@ -279,7 +279,7 @@ def filter_by(self, df, columns=None): how='inner' ).drop( 'a.ip' - )# .persist(self.storage_level) + ) else: self.load_empty(self.schema) else: @@ -290,7 +290,7 @@ def filter_by(self, df, columns=None): how='inner' ).drop( 'a.ip' - ) #.persist(self.storage_level) + ) else: if self.__cache: self.__cache = self.__cache.join( @@ -299,11 +299,10 @@ def filter_by(self, df, columns=None): how='inner' ).drop( 'a.ip' - )# .persist(self.storage_level) + ) else: self.load_empty(self.schema) - # if self.__persistent_cache: # self.__cache = self.__persistent_cache.join( # df, @@ -342,7 +341,7 @@ def update_self( 'dt', 'id_client' ] now = datetime.datetime.utcnow() - #source_df = source_df.persist(self.storage_level) + # source_df = source_df.persist(self.storage_level) source_df = source_df.alias('sd') columns = source_df.columns @@ -359,7 +358,7 @@ def update_self( self.format_ ).load( self.persistent_cache_file - )# .persist(self.storage_level) + ) # .persist(self.storage_level) else: if self.storage_df: if self.__persistent_cache: @@ -371,7 +370,7 @@ def update_self( self.__persistent_cache.select(*select_cols).alias('pc'), list(join_cols), how='full_outer' - )#.persist(self.storage_level) + ) # .persist(self.storage_level) # mark rows to update self.__persistent_cache = self.__persistent_cache.withColumn( @@ -507,7 +506,7 @@ def empty_all(self): self.session_getter().sparkContext._jvm.System.gc() def persist(self): - self.__cache = self.__cache #.persist(self.storage_level) + self.__cache = self.__cache # .persist(self.storage_level) # self.__cache.createOrReplaceTempView(self.__class__.__name__) # spark = self.session_getter() diff --git a/src/baskerville/models/storage_io.py b/src/baskerville/models/storage_io.py new file mode 100644 index 00000000..9936ecab --- /dev/null +++ b/src/baskerville/models/storage_io.py @@ -0,0 +1,138 @@ +# Copyright (c) 2020, eQualit.ie inc. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
+import random +from datetime import timedelta + +import os +from pyspark.sql import functions as F + +from baskerville.util.file_manager import FileManager + + +class StorageIO(object): + + def __init__(self, storage_path, spark, partitions=10, batch_in_minutes=20, logger=None, subfolder='stream'): + self.spark = spark + self.storage_path = storage_path + self.batch_in_minutes = batch_in_minutes + self.partitions = partitions + self.batch = None + self.prev_minutes = None + self.prev_time = None + self.logger = logger + self.subfolder = subfolder + + def save(self, df, timestamp): + # saving in batches to s3, i.e 20 minutes, 40 minutes, 60 minutes, ... + current_minutes = timestamp.minute // self.batch_in_minutes * self.batch_in_minutes + + if self.prev_minutes is None: + self.prev_minutes = current_minutes + self.prev_time = timestamp + + if current_minutes == self.prev_minutes: + self.logger.info(f'appending chunk {timestamp} to {self.prev_minutes}') + # self.logger.info(f'chunk count = {df.count()}') + + if not self.batch: + self.logger.info('initial batch...') + self.batch = df + else: + self.logger.info(f'before union = {self.batch.count()}') + self.batch = self.batch.union(df) + self.logger.info(f'after union = {self.batch.count()}') + + return + + path = os.path.join( + self.storage_path, self.subfolder, + f'{self.prev_time.year}', + f'{self.prev_time.month:02d}', + f'{self.prev_time.day:02d}', + f'{self.prev_time.hour:02d}', + f'{self.prev_minutes:02d}') + + file_manager = FileManager(os.path.join(self.storage_path, self.subfolder), self.spark) + file_manager.delete_path(path) + + self.logger.info(f'writing to parquet {path}...') + self.batch.repartition(self.partitions).write.parquet(path) + + self.batch = df + self.prev_minutes = current_minutes + self.prev_time = timestamp + + def load(self, start, stop, host=None, load_one_random_batch_from_every_hour=False): + minutes = [] + for i in range(60 // self.batch_in_minutes): + minutes.append(i * self.batch_in_minutes) + + file_manager = FileManager(os.path.join(self.storage_path, self.subfolder), self.spark) + + hour_chunks = [] + current_time = start - timedelta(hours=1) + while current_time < stop: + hour_chunks.append(current_time) + current_time += timedelta(hours=1) + + dataset = None + for i in range(len(hour_chunks)): + chunk = hour_chunks[i] + path = os.path.join( + self.storage_path, self.subfolder, + f'{chunk.year}', + f'{chunk.month:02d}', + f'{chunk.day:02d}', + f'{chunk.hour:02d}') + if not file_manager.path_exists(path): + continue + + if load_one_random_batch_from_every_hour: + minutes_to_load = [minutes[random.randrange(0, len(minutes))]] + else: + minutes_to_load = minutes + + for m in minutes_to_load: + path_minutes = os.path.join(path, f'{m:02d}') + if not file_manager.path_exists(path_minutes): + continue + self.logger.info(f'Reading from {path_minutes}') + df = self.spark.read.parquet(path_minutes) + + self.logger.info('1st read') + self.logger.info(df.count()) + df = df.filter(f'stop >= \'{start.strftime("%Y-%m-%d %H:%M:%S")}\' ' + f'and stop < \'{stop.strftime("%Y-%m-%d %H:%M:%S")}\'') + self.logger.info('after filter start/stop') + self.logger.info(df[['stop']].show()) + self.logger.info(df.count()) + + if host: + df = df.filter(F.col('target') == host) + self.logger.info('after filter host') + self.logger.info(df.count()) + + if dataset is None: + dataset = df + else: + # make sure we have exactly the same feature names before calling unionByName + origin_features = 
set(dataset.schema['features'].dataType.names) + chunk_features = set(df.schema['features'].dataType.names) + for f in origin_features - chunk_features: + df = df.withColumn('features', F.struct(F.col('features.*'), F.lit('0').alias(f))) + for f in chunk_features - origin_features: + dataset = dataset.withColumn('features', F.struct(F.col('features.*'), F.lit('0').alias(f))) + + # reorder the features for union(). Note: unionByName() did not work as expected for nested struct + df = df.withColumn('features', F.struct( + [F.col(f'features.{f}') for f in dataset.schema['features'].dataType.names])) + + dataset = dataset.union(df) + + self.logger.info('after union') + self.logger.info(dataset.count()) + + return dataset diff --git a/src/baskerville/spark/__init__.py b/src/baskerville/spark/__init__.py index 425050f6..e5f93545 100644 --- a/src/baskerville/spark/__init__.py +++ b/src/baskerville/spark/__init__.py @@ -188,8 +188,10 @@ def get_or_create_spark_session(spark_conf): if spark_conf.spark_kubernetes_executor_memoryOverhead: conf.set('spark.kubernetes.executor.memoryOverhead', spark_conf.spark_kubernetes_executor_memoryOverhead) - conf.set('spark.kubernetes.driver.pod.name', os.environ['MY_POD_NAME']) - conf.set('spark.driver.host', os.environ['MY_POD_IP']) + if 'MY_POD_NAME' in os.environ: + conf.set('spark.kubernetes.driver.pod.name', os.environ['MY_POD_NAME']) + if 'MY_POD_IP' in os.environ: + conf.set('spark.driver.host', os.environ['MY_POD_IP']) conf.set('spark.driver.port', 20020) else: conf.set('spark.sql.codegen.wholeStage', 'false') diff --git a/src/baskerville/spark/helpers.py b/src/baskerville/spark/helpers.py index 5db18da9..e7f86695 100644 --- a/src/baskerville/spark/helpers.py +++ b/src/baskerville/spark/helpers.py @@ -11,12 +11,11 @@ from baskerville.spark import get_spark_session from baskerville.util.enums import LabelEnum -from baskerville.util.helpers import TimeBucket, get_logger +from baskerville.util.helpers import TimeBucket from pyspark import AccumulatorParam from pyspark import StorageLevel from pyspark.sql import functions as F - # OFF-HEAP by default StorageLevel.CUSTOM = StorageLevel(True, True, True, False, 1) @@ -107,18 +106,21 @@ def save_df_to_table( """ if not isinstance(storage_level, StorageLevel): storage_level = StorageLevelFactory.get_storage_level(storage_level) - #df = df.persist(storage_level) - for c in json_cols: - df = col_to_json(df, c) - df.write.format('jdbc').options( + # df = df.persist(storage_level) + + df_postgres = df.withColumn('features', F.lit('{}')) + # for c in json_cols: + # df = col_to_json(df, c) + + df_postgres.write.format('jdbc').options( url=db_config['conn_str'], driver=db_driver, dbtable=table_name, user=db_config['user'], password=db_config['password'], stringtype='unspecified', - batchsize=100000, - max_connections=1250, + batchsize=500, + max_connections=10, rewriteBatchedStatements=True, reWriteBatchedInserts=True, useServerPrepStmts=False, @@ -280,7 +282,7 @@ def get_window(df, time_bucket: TimeBucket, storage_level: str, logger): (F.col('timestamp') >= current_window_start) & (F.col('timestamp') < current_end) ) - window_df = df.where(filter_) #.persist(storage_level) + window_df = df.where(filter_) # .persist(storage_level) if not window_df.rdd.isEmpty(): logger.info(f'# Request sets = {window_df.count()}') yield window_df diff --git a/src/baskerville/spark/schemas.py b/src/baskerville/spark/schemas.py index aa5d922f..b0533763 100644 --- a/src/baskerville/spark/schemas.py +++ b/src/baskerville/spark/schemas.py @@ 
-22,7 +22,8 @@ T.StructField("id_client", T.StringType(), False), T.StructField("uuid_request_set", T.StringType(), False), T.StructField("prediction", T.FloatType(), False), - T.StructField("score", T.FloatType(), False) + T.StructField("score", T.FloatType(), False), + T.StructField("classifier_score", T.FloatType(), False) ]) feature_vectors_schema = T.StructField( @@ -88,24 +89,22 @@ def get_feedback_context_schema() -> T.StructType: def get_submitted_feedback_schema() -> T.ArrayType: - return T.ArrayType( - T.StructType([ - T.StructField('id', T.IntegerType()), - T.StructField('id_context', T.IntegerType()), - T.StructField('uuid_organization', T.StringType()), - T.StructField('uuid_request_set', T.StringType()), - T.StructField('prediction', T.IntegerType()), - T.StructField('score', T.FloatType()), - T.StructField('attack_prediction', T.FloatType()), - T.StructField('low_rate', T.BooleanType()), - feature_vectors_schema, - T.StructField('feedback', T.StringType()), - T.StructField('start', T.StringType()), - T.StructField('submitted_at', T.StringType()), - T.StructField('created_at', T.StringType()), - T.StructField('updated_at', T.StringType()) - ]) - ) + return T.ArrayType(T.StructType([ + T.StructField('id', T.IntegerType()), + T.StructField('id_context', T.IntegerType()), + T.StructField('uuid_organization', T.StringType()), + T.StructField('uuid_request_set', T.StringType()), + T.StructField('prediction', T.IntegerType()), + T.StructField('score', T.FloatType()), + T.StructField('attack_prediction', T.FloatType()), + T.StructField('low_rate', T.BooleanType()), + feature_vectors_schema, + T.StructField('feedback', T.StringType()), + T.StructField('start', T.StringType()), + T.StructField('submitted_at', T.StringType()), + T.StructField('created_at', T.StringType()), + T.StructField('updated_at', T.StringType()) + ])) NAME_TO_SCHEMA = { diff --git a/src/baskerville/spark/udfs.py b/src/baskerville/spark/udfs.py index 8bca7cbe..74a3558e 100644 --- a/src/baskerville/spark/udfs.py +++ b/src/baskerville/spark/udfs.py @@ -7,7 +7,6 @@ import pytz from baskerville.features.helpers import update_features -from baskerville.features.helpers import extract_features_in_order from baskerville.spark.schemas import cross_reference_schema from baskerville.util.enums import LabelEnum from dateutil.tz import tzutc diff --git a/src/baskerville/util/banjax_report_consumer.py b/src/baskerville/util/banjax_report_consumer.py new file mode 100644 index 00000000..450b6761 --- /dev/null +++ b/src/baskerville/util/banjax_report_consumer.py @@ -0,0 +1,59 @@ +# Copyright (c) 2020, eQualit.ie inc. +# All rights reserved. +# +# This source code is licensed under the BSD-style license found in the +# LICENSE file in the root directory of this source tree. 
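# --- Illustrative sketch only (not part of the patch above) ----------------
# Why: the schemas.py hunk adds a classifier_score field next to the anomaly
# score, so prediction messages now carry both model outputs.
# What it shows: decoding a JSON prediction payload with an equivalent schema
# via from_json. The sample payload, local session and schema variable name
# are assumptions made only for this sketch.
from pyspark.sql import SparkSession
from pyspark.sql import functions as F, types as T

prediction_schema = T.StructType([
    T.StructField('id_client', T.StringType(), False),
    T.StructField('uuid_request_set', T.StringType(), False),
    T.StructField('prediction', T.FloatType(), False),
    T.StructField('score', T.FloatType(), False),
    T.StructField('classifier_score', T.FloatType(), False),
])

if __name__ == '__main__':
    spark = SparkSession.builder.master('local[1]').appName('prediction schema demo').getOrCreate()

    raw = spark.createDataFrame(
        [('{"id_client": "c1", "uuid_request_set": "abc", '
          '"prediction": 1.0, "score": 0.93, "classifier_score": 0.71}',)],
        ['value'])

    # Parse the JSON string and flatten the struct into columns.
    raw.select(F.from_json('value', prediction_schema).alias('msg')).select('msg.*').show()

    spark.stop()
# ----------------------------------------------------------------------------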
+ +import json +from kafka import KafkaConsumer +from baskerville.models.ip_cache import IPCache + + +class BanjaxReportConsumer(object): + + def __init__(self, config, logger): + self.config = config + self.kafka_config = config.kafka + self.logger = logger + self.ip_cache = IPCache(config, self.logger) + + def run(self): + consumer = KafkaConsumer( + self.kafka_config.banjax_report_topic, + group_id="baskerville_postprocessing", + auto_offset_reset='earliest', + **self.config.kafka.connection + ) + + for message in consumer: + self.consume_message(message) + + consumer.close() + + def consume_message(self, message): + if len(message.value) > 0: + try: + s = message.value.decode("utf-8") + except UnicodeDecodeError: + self.logger.info("got bad utf-8 over the kafka channel") + + try: + d = json.loads(s) + except json.JSONDecodeError: + self.logger.info(f"got bad json over the kafka channel: {s}") + + if d.get("name") == "ip_failed_challenge": + self.consume_ip_failed_challenge_message(d) + elif d.get("name") == "ip_passed_challenge": + self.consume_ip_passed_challenge_message(d) + elif d.get("name") == "ip_banned": + self.consume_ip_banned_message(d) + + def consume_ip_failed_challenge_message(self, message): + self.ip_cache.ip_failed_challenge(message['value_ip']) + + def consume_ip_passed_challenge_message(self, message): + self.ip_cache.ip_passed_challenge(message['value_ip']) + + def consume_ip_banned_message(self, message): + pass diff --git a/src/baskerville/util/db_reader.py b/src/baskerville/util/db_reader.py index 12979878..289bd0f5 100644 --- a/src/baskerville/util/db_reader.py +++ b/src/baskerville/util/db_reader.py @@ -21,19 +21,18 @@ def __init__(self, db_config, query=None, logger=None, refresh_period_in_minutes self.fresh_data = None self.lock = threading.Lock() self.thread = None - self.refresh() def set_query(self, query): self.query = query - def read_from_database(self): + def _read_from_database(self): try: session, engine = set_up_db(self.db_config.__dict__) except Exception as e: if self.logger: self.logger.error(str(e)) return None - data = None + try: data = pd.read_sql(self.query, engine) with self.lock: @@ -47,28 +46,28 @@ def read_from_database(self): session.close() engine.dispose() - return data + def _run(self): + while True: + if not self.last_timestamp or int(time.time() - self.last_timestamp) > self.refresh_period_in_minutes * 60: + self._read_from_database() + self.last_timestamp = time.time() - def refresh(self): + def _start(self): if not self.query: return - if not self.last_timestamp or int(time.time() - self.last_timestamp) > self.refresh_period_in_minutes*60: - self.last_timestamp = time.time() - if self.query: - if self.thread and self.thread.isAlive(): - return + if self.thread: + return - self.thread = threading.Thread(target=self.read_from_database) - self.thread.start() + self.thread = threading.Thread(target=self._run, daemon=True) + self.thread.start() def get(self): if not self.query: return None - self.refresh() + self._start() with self.lock: if self.fresh_data is not None: self.data = self.fresh_data.copy() self.fresh_data = None return self.data - diff --git a/src/baskerville/util/elastic_writer.py b/src/baskerville/util/elastic_writer.py index a46183bb..06e60052 100644 --- a/src/baskerville/util/elastic_writer.py +++ b/src/baskerville/util/elastic_writer.py @@ -11,7 +11,6 @@ def __init__(self, host, port, user, password): self.es = None def __enter__(self): - assert(self.es is None) self.es = Elasticsearch([self.connection_string]) 
return self diff --git a/src/baskerville/util/enums.py b/src/baskerville/util/enums.py index f55230e6..1f4187bd 100644 --- a/src/baskerville/util/enums.py +++ b/src/baskerville/util/enums.py @@ -40,13 +40,13 @@ class LogType(Enum): class RunType(BaseStrEnum): - es = 'es' rawlog = 'rawlog' kafka = 'kafka' training_old = 'training_old' irawlog = 'irawlog' ikafka = 'ikafka' training = 'training' + training_classifier = 'training_classifier' preprocessing = 'preprocessing' postprocessing = 'postprocessing' predicting = 'predicting' @@ -170,4 +170,4 @@ class FeedbackContextTypeEnum(BaseStrEnum): FeedbackContextTypeEnum.true_positive: 'We did well (marked the bots correctly) and you want to tell us!', FeedbackContextTypeEnum.true_negative: 'We did well (marked the normal traffic correctly) and you want to tell us!', FeedbackContextTypeEnum.other: 'Anything else :)' -} \ No newline at end of file +} diff --git a/src/baskerville/util/helpers.py b/src/baskerville/util/helpers.py index ea40d93d..9225de23 100644 --- a/src/baskerville/util/helpers.py +++ b/src/baskerville/util/helpers.py @@ -17,6 +17,7 @@ from baskerville.util.enums import ModelEnum, BaseStrEnum FOLDER_MODELS = 'models' +FOLDER_CLASSIFIER_MODELS = 'classifier_models' FOLDER_CACHE = 'cache' @@ -389,6 +390,18 @@ def get_model_path(storage_path, model_name='model'): f'{model_name}__{get_timestamp()}') +def get_classifier_model_path(storage_path, model_name='model'): + """ + Crete the model full path for the given storage and the name of the model\ + :param storage_path: the path to the storage root folder + :param model_name: + :return: storage_path/models/mdel_name__2020_01_01__14:30 + """ + return os.path.join(storage_path, + FOLDER_CLASSIFIER_MODELS, + f'{model_name}__{get_timestamp()}') + + def load_model_from_path(model_path, spark=None): """ Instantiate the proper model and load from the path. 
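# --- Illustrative sketch only (not part of the patch above) ----------------
# Why: helpers.py gains get_classifier_model_path so classifier models are
# versioned under a separate classifier_models/ folder, alongside the existing
# get_model_path layout used for anomaly models.
# What it shows: the path shape the helper produces; the storage root and model
# name below are made-up values used only for illustration.
from baskerville.util.helpers import get_classifier_model_path

path = get_classifier_model_path('/data/baskerville', model_name='RandomForestClassifierModel')
# e.g. /data/baskerville/classifier_models/RandomForestClassifierModel__<timestamp>
print(path)
# ----------------------------------------------------------------------------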
diff --git a/src/baskerville/util/json_url_reader.py b/src/baskerville/util/json_url_reader.py index 561702b3..84b0215c 100644 --- a/src/baskerville/util/json_url_reader.py +++ b/src/baskerville/util/json_url_reader.py @@ -56,5 +56,3 @@ def get(self): self.refresh() return self.data - - diff --git a/src/baskerville/util/kafka_helpers.py b/src/baskerville/util/kafka_helpers.py index 7ea636c2..8be47275 100644 --- a/src/baskerville/util/kafka_helpers.py +++ b/src/baskerville/util/kafka_helpers.py @@ -56,7 +56,8 @@ def send_to_kafka(spark, cc_to_client=False, client_topic=None, client_connections=None, - use_partitions=True): + use_partitions=True, + logger=None): if use_partitions: broadcast_connection = spark.sparkContext.broadcast(connection) broadcast_client_connections = spark.sparkContext.broadcast(client_connections) diff --git a/src/baskerville/util/ksql_example.py b/src/baskerville/util/ksql_example.py index 1c86cdc7..bdeda3f3 100644 --- a/src/baskerville/util/ksql_example.py +++ b/src/baskerville/util/ksql_example.py @@ -1,18 +1,18 @@ -import logging -from ksql import KSQLAPI -logging.basicConfig(level=logging.DEBUG) -client = KSQLAPI('http://0.0.0.0:8088') - -df = None -table_name = 'sensitive_data' -topic = 'predictions' -column_type = [ - 'uuid_request_set bigint','ip varchar','target varchar', 'stop varchar' -] -print(client.ksql('show tables')) -client.create_stream(table_name, column_type, topic) -print(client.query(f'select * from {table_name}', use_http2=True)) -print(client.ksql('show tables')) +# import logging +# from ksql import KSQLAPI +# logging.basicConfig(level=logging.DEBUG) +# client = KSQLAPI('http://0.0.0.0:8088') +# +# df = None +# table_name = 'sensitive_data' +# topic = 'predictions' +# column_type = [ +# 'uuid_request_set bigint','ip varchar','target varchar', 'stop varchar' +# ] +# print(client.ksql('show tables')) +# client.create_stream(table_name, column_type, topic) +# print(client.query(f'select * from {table_name}', use_http2=True)) +# print(client.ksql('show tables')) # client.create_stream_as(table_name='sensitive_data', # select_columns=df.columns, diff --git a/src/baskerville/util/mail_sender.py b/src/baskerville/util/mail_sender.py index de727648..42bf3d7d 100644 --- a/src/baskerville/util/mail_sender.py +++ b/src/baskerville/util/mail_sender.py @@ -5,6 +5,7 @@ # LICENSE file in the root directory of this source tree. 
import smtplib + class MailSender(object): def __init__(self, from_email, server, port, user, password): diff --git a/src/baskerville/util/model_evaluation/evaluation.py b/src/baskerville/util/model_evaluation/evaluation.py index 44e5d39c..69d8c13c 100644 --- a/src/baskerville/util/model_evaluation/evaluation.py +++ b/src/baskerville/util/model_evaluation/evaluation.py @@ -144,8 +144,7 @@ def _fit(self, dataset): for i in range(nFolds): validateLB = i * h validateUB = (i + 1) * h - condition = (df[randCol] >= validateLB) & ( - df[randCol] < validateUB) + condition = (df[randCol] >= validateLB) & (df[randCol] < validateUB) validation = self.get_validation(df, condition) train = self.get_train(df, condition) @@ -190,8 +189,7 @@ def get_random_attacks_by_percentage( h = 1.0 / (percent_anomalies / 10) validateLB = h validateUB = 6 * h - condition = (attack_df[random_col] >= validateLB) & ( - attack_df[random_col] < validateUB) + condition = (attack_df[random_col] >= validateLB) & (attack_df[random_col] < validateUB) return attack_df.select( "*", F.rand(43).alias(random_col) ).select(condition).cache() @@ -288,5 +286,3 @@ def cross_validate(start, stop, config: BaskervilleConfig, num_folds=3): config = parse_config(config_path) bask_config = BaskervilleConfig(config).validate() cross_validate(start, stop, bask_config, num_folds=num_folds) - - diff --git a/src/baskerville/util/model_evaluation/evaluation_from_notebook.py b/src/baskerville/util/model_evaluation/evaluation_from_notebook.py index 7b303f94..e680c074 100644 --- a/src/baskerville/util/model_evaluation/evaluation_from_notebook.py +++ b/src/baskerville/util/model_evaluation/evaluation_from_notebook.py @@ -5,7 +5,7 @@ from pyspark import SparkConf from pyspark.sql import functions as F from pyspark.sql import SparkSession -from pyspark.sql.types import * +from pyspark.sql.types import (StructType, StructField, StringType) from pyspark.mllib.evaluation import BinaryClassificationMetrics from baskerville.db import get_jdbc_url @@ -66,8 +66,7 @@ def load_dataset(query, spark, db_config): df = df.withColumn('features', F.create_map( *list(itertools.chain( *[(F.lit(f), F.col('features').getItem(f)) for f in - json_schema.__dict__['names']]) - ))) + json_schema.__dict__['names']])))) return df @@ -91,11 +90,10 @@ def evaluate_model(models, spark, db_config): "ip_attacker", StringType())])) print('Querying database...') - query = f'(select ip, target, created_at, features, stop ' \ - f'from request_sets where ' - f'stop > \'{attack.start.strftime("%Y-%m-%d %H:%M:%S")}Z\' ' \ - f'and stop < \'{attack.stop.strftime("%Y-%m-%d %H:%M:%S")}Z\') ' \ - f'as attack1 ' + query = '(select ip, target, created_at, features, stop ' \ + 'from request_sets where ' \ + f'stop > \'{attack.start.strftime("%Y-%m-%d %H:%M:%S")}Z\' ' \ + f'and stop < \'{attack.stop.strftime("%Y-%m-%d %H:%M:%S")}Z\') as attack1 ' rs = load_dataset(query, spark, db_config) num_records = rs.count() diff --git a/src/baskerville/util/model_interpretation/__init__.py b/src/baskerville/util/model_interpretation/__init__.py index 3d6032c8..4a3e0b02 100644 --- a/src/baskerville/util/model_interpretation/__init__.py +++ b/src/baskerville/util/model_interpretation/__init__.py @@ -46,4 +46,4 @@ # SHAP tree explainer # https://towardsdatascience.com/explain-your-model-with-the-shap-values-bc36aac4de3d # https://github.com/dataman-git/codes_for_articles/blob/master/Explain%20your%20model%20with%20the%20SHAP%20values%20for%20article.ipynb -# 
https://www.timlrx.com/2018/06/19/feature-selection-using-feature-importance-score-creating-a-pyspark-estimator/ \ No newline at end of file +# https://www.timlrx.com/2018/06/19/feature-selection-using-feature-importance-score-creating-a-pyspark-estimator/ diff --git a/src/baskerville/util/model_interpretation/helpers.py b/src/baskerville/util/model_interpretation/helpers.py index d678de20..6c6c541d 100644 --- a/src/baskerville/util/model_interpretation/helpers.py +++ b/src/baskerville/util/model_interpretation/helpers.py @@ -346,12 +346,7 @@ def calculate_x(feature_j, z_features, curr_feature_perm, predict_df = predict_df.withColumn( 'marginal_contribution', - ( - F.col(column_to_examine) - F.lag( - F.col(column_to_examine), 1).over( - Window.partitionBy("id").orderBy("id") - ) - ) + (F.col(column_to_examine) - F.lag(F.col(column_to_examine), 1).over(Window.partitionBy("id").orderBy("id"))) ) predict_df = predict_df.filter( predict_df.marginal_contribution.isNotNull() diff --git a/src/baskerville/util/model_interpretation/shapley_with_spark_and_anomaly_model.py b/src/baskerville/util/model_interpretation/shapley_with_spark_and_anomaly_model.py index 21f5c5a0..02578097 100644 --- a/src/baskerville/util/model_interpretation/shapley_with_spark_and_anomaly_model.py +++ b/src/baskerville/util/model_interpretation/shapley_with_spark_and_anomaly_model.py @@ -53,7 +53,7 @@ def shapley_values_for_anomaly_model( # select the row to be examined df = select_row(df, row_id) row = df.select('id', 'features').where( - F.col('is_selected') == True + F.col('is_selected') is True ).first() print('Row: ', row) diff --git a/src/baskerville/util/model_interpretation/spark_tree_plotting.py b/src/baskerville/util/model_interpretation/spark_tree_plotting.py index ca83c86f..2fcb3e40 100644 --- a/src/baskerville/util/model_interpretation/spark_tree_plotting.py +++ b/src/baskerville/util/model_interpretation/spark_tree_plotting.py @@ -124,29 +124,29 @@ def node_to_str(node, featureNames, categoryNames, classNames, numClasses, # For continuous split: if node["splitType"] == "continuous": label = """ label="Node ID %s\\n%s <= %.4f\\nImpurity = %.4f\\nGain = %.4f\\nPrediction = %s" """ % ( - node["id"], - feature_name_str, - node["threshold"], - node["impurity"], - node["gain"], - class_name_str + node["id"], + feature_name_str, + node["threshold"], + node["impurity"], + node["gain"], + class_name_str ) # For categorical split: else: label = """ label="Node ID %s\\n%s in %s\\nImpurity = %.4f\\nGain = %.4f\\nPrediction = %s" """ % ( - node["id"], - feature_name_str, - categories, - node["impurity"], - node["gain"], - class_name_str + node["id"], + feature_name_str, + categories, + node["impurity"], + node["gain"], + class_name_str ) # Leaf node: else: label = """ label="Node ID %s\\nImpurity = %.4f\\nPrediction = %s" """ % ( - node["id"], - node["impurity"], - class_name_str + node["id"], + node["impurity"], + class_name_str ) if round_leaves is True: attributes.append("shape=ellipse") @@ -403,13 +403,17 @@ def export_graphviz(DecisionTreeClassificationModel, featureNames=None, %s %s} }""" % ( - ",".join(filled_and_rounded), "".join(graph), node_properties) + ",".join(filled_and_rounded), "".join(graph), node_properties) return dot_string -def plot_tree(DecisionTreeClassificationModel, featureNames=None, - categoryNames=None, classNames=None, - filled=True, roundedCorners=True, roundLeaves=True): +def plot_tree(DecisionTreeClassificationModel, + featureNames=None, + categoryNames=None, + classNames=None, + filled=True, 
+ roundedCorners=True, + roundLeaves=True): """ Draws a Spark's fitted DecisionTreeClassificationModel in png format. If you are using Jupyter, this function can be easily used alongside Ipython.display in order @@ -472,7 +476,7 @@ def plot_tree(DecisionTreeClassificationModel, featureNames=None, roundedCorners=roundedCorners, roundLeaves=roundLeaves ) - ) + ) if type(graph) is list: plot = graph[0].create_png() else: diff --git a/src/baskerville/util/whitelist_hosts.py b/src/baskerville/util/whitelist_hosts.py deleted file mode 100644 index b0ffa26c..00000000 --- a/src/baskerville/util/whitelist_hosts.py +++ /dev/null @@ -1,22 +0,0 @@ -# Copyright (c) 2020, eQualit.ie inc. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - -from baskerville.util.json_url_reader import JsonUrlReader - - -class WhitelistHosts(object): - - def __init__(self, url, logger, refresh_period_in_seconds=300): - self.reader = JsonUrlReader(url=url, logger=logger, refresh_period_in_seconds=refresh_period_in_seconds) - self.logger = logger - - def get(self): - hosts = [] - data = self.reader.get() - if data: - hosts = list(set(data['white_list_hosts'])) - - return hosts diff --git a/src/baskerville/util/whitelist_ips.py b/src/baskerville/util/whitelist_ips.py index fbf50a5c..6ca60699 100644 --- a/src/baskerville/util/whitelist_ips.py +++ b/src/baskerville/util/whitelist_ips.py @@ -47,4 +47,3 @@ def get_host_ips(self): else: host_ips[k] = ips return host_ips - diff --git a/tests/unit/baskerville_tests/db_tests/__init__.py b/tests/unit/baskerville_tests/db_tests/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/tests/unit/baskerville_tests/db_tests/test_base.py b/tests/unit/baskerville_tests/db_tests/test_base.py deleted file mode 100644 index ab3a4a0f..00000000 --- a/tests/unit/baskerville_tests/db_tests/test_base.py +++ /dev/null @@ -1,171 +0,0 @@ -# Copyright (c) 2020, eQualit.ie inc. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
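# --- Illustrative sketch only (not part of the patch above) ----------------
# Why: whitelist_hosts.py is deleted in this patch, so SendChallenge (earlier
# in the diff) now builds its host whitelist only from
# config.engine.white_list_hosts and drops whitelisted targets with a left join.
# What it shows: the same join-and-filter pattern on toy data; the hosts, rows
# and local session are assumptions made only for this sketch.
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

if __name__ == '__main__':
    spark = SparkSession.builder.master('local[1]').appName('whitelist demo').getOrCreate()

    df_ips = spark.createDataFrame(
        [('1.2.3.4', 'allowed.example.com'), ('5.6.7.8', 'other.example.com')],
        ['ip', 'target'])

    hosts = ['allowed.example.com']  # stand-in for config.engine.white_list_hosts
    df_white_list_hosts = spark.createDataFrame(
        [[host] for host in set(hosts)], ['target']).withColumn('white_list_host', F.lit(1))

    # Whitelisted targets get white_list_host == 1 after the join and are filtered out.
    df_ips = df_ips.join(df_white_list_hosts, on='target', how='left')
    df_ips.where(F.col('white_list_host').isNull()).show()

    spark.stop()
# ----------------------------------------------------------------------------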
- - -import unittest -from datetime import datetime, timedelta - -from baskerville.db.base import ( - GeneratorBase, - Index, - TableTools, - PartitionedTable, - Partition, -) - -from tests.unit.baskerville_tests.helpers.utils import get_default_data_path - - -class GeneratorSub(GeneratorBase): - def __init__(self, template_path, template_name): - super().__init__(template_path, template_name) - - def to_dict(self): - return self.__dict__ - - -class TestGeneratorBase(unittest.TestCase): - def setUp(self) -> None: - self.template_path = get_default_data_path() - self.template_name = 'sample_template.jinja2' - self.instance = GeneratorSub(self.template_path, self.template_name) - - def test_instances(self): - self.assertTrue(hasattr(self.instance, 'template_path')) - self.assertTrue(hasattr(self.instance, 'template_name')) - self.assertTrue(hasattr(self.instance, 'j2_env')) - self.assertTrue(hasattr(self.instance, 'template')) - - def test_rendering(self): - result = str(self.instance) - now = datetime.utcnow().strftime("%Y-%m-%d") - self.assertTrue(now in result) - - -class TestIndex(unittest.TestCase): - def setUp(self) -> None: - self.idx_name = 'text_idx' - self.idx_table = 'test_table' - self.idx_fields = ['field1', 'field2', 'field3'] - self.instance = Index(self.idx_name, self.idx_table, self.idx_fields) - - def test_instances(self): - self.assertTrue(hasattr(self.instance, 'name')) - self.assertTrue(hasattr(self.instance, 'table_name')) - self.assertTrue(hasattr(self.instance, 'fields')) - - self.assertTrue(isinstance(self.instance.name, str)) - self.assertTrue(isinstance(self.instance.table_name, str)) - self.assertTrue(isinstance(self.instance.fields, list)) - - def test_str(self): - expected = f'{self.idx_name} ON {self.idx_table} ' \ - f'({", ".join(self.idx_fields)})' - - self.assertEqual(str(self.instance), expected) - - def test_create(self): - expected = f'CREATE INDEX IF NOT EXISTS {self.idx_name} ' \ - f'ON {self.idx_table} ({", ".join(self.idx_fields)}) ' \ - f'TABLESPACE pg_default; ' \ - f'ALTER TABLE {self.idx_table} CLUSTER ON {self.idx_name};' - - self.assertEqual(self.instance.create(), expected) - - def test_drop(self): - expected = f'DROP INDEX IF EXISTS {self.idx_name};' - - self.assertEqual(self.instance.drop(), expected) - - -class TestTableTools(unittest.TestCase): - def test_get_temporal_check_not_new_and(self): - start = datetime.utcnow() - end = start + timedelta(seconds=1) - partition_field = 'some_partition' - result = TableTools.get_temporal_check( - partition_field, start, end, new=False, condition='AND' - ) - expected = f'some_partition >= \'{start.strftime("%Y-%m-%d %H:%M:%S")}\'' \ - f' AND some_partition <= \'{end.strftime("%Y-%m-%d %H:%M:%S.%f")}\' ' - - self.assertEqual(result, expected) - - def test_get_temporal_check_not_new_or(self): - start = datetime.utcnow() - end = start + timedelta(seconds=1) - partition_field = 'some_partition' - result = TableTools.get_temporal_check( - partition_field, start, end, new=False, condition='OR' - ) - expected = f'some_partition >= \'{start.strftime("%Y-%m-%d %H:%M:%S")}\'' \ - f' OR some_partition <= \'{end.strftime("%Y-%m-%d %H:%M:%S.%f")}\' ' - - self.assertEqual(result, expected) - - def test_get_temporal_check_new_and(self): - start = datetime.utcnow() - end = start + timedelta(seconds=1) - partition_field = 'some_partition' - result = TableTools.get_temporal_check( - partition_field, start, end, new=True, condition='AND' - ) - expected = f'NEW.some_partition >= \'{start.strftime("%Y-%m-%d %H:%M:%S")}\'' 
\ - f' AND NEW.some_partition <= \'{end.strftime("%Y-%m-%d %H:%M:%S.%f")}\' ' - - self.assertEqual(result, expected) - - -class TestPartitionedTable(unittest.TestCase): - def setUp(self) -> None: - self.partition_name = 'partition_name' - self.partition_field = 'partition_field' - self.partitioned_by = 'partitioned_by' - self.index_by = ['index', 'by'] - self.create_catch_all = True - - self.instance = PartitionedTable( - self.partition_name, - self.partition_field, - self.partitioned_by, - self.index_by, - create_catch_all=self.create_catch_all - ) - - def test_instances(self): - self.assertTrue(hasattr(self.instance, 'name')) - self.assertTrue(hasattr(self.instance, 'partition_field')) - self.assertTrue(hasattr(self.instance, 'partitioned_by')) - self.assertTrue(hasattr(self.instance, 'index_by')) - self.assertTrue(hasattr(self.instance, 'create_catch_all')) - self.assertTrue(hasattr(self.instance, 'partitions')) - - self.assertTrue(isinstance(self.instance.index_by, list)) - - -class TestPartition(unittest.TestCase): - def setUp(self) -> None: - self.partition_name = 'partition_name' - self.partition_field = 'partition_field' - self.index_by = list('abcd') - self.is_catch_all = True - self.instance = Partition( - self.partition_name, - self.partition_field, - self.index_by, - is_catch_all=self.is_catch_all - ) - - def test_instances(self): - self.assertTrue(hasattr(self.instance, 'name')) - self.assertTrue(hasattr(self.instance, 'partition_field')) - self.assertTrue(hasattr(self.instance, 'index_by')) - self.assertTrue(hasattr(self.instance, 'is_catch_all')) - - self.assertEqual(self.instance.name, self.partition_name) - self.assertEqual(self.instance.partition_field, self.partition_field) - self.assertEqual(self.instance.index_by, self.index_by) - self.assertEqual(self.instance.is_catch_all, self.is_catch_all) diff --git a/tests/unit/baskerville_tests/db_tests/test_data_archive.py b/tests/unit/baskerville_tests/db_tests/test_data_archive.py deleted file mode 100644 index 68b8819b..00000000 --- a/tests/unit/baskerville_tests/db_tests/test_data_archive.py +++ /dev/null @@ -1,5 +0,0 @@ -# Copyright (c) 2020, eQualit.ie inc. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. diff --git a/tests/unit/baskerville_tests/db_tests/test_data_partition.py b/tests/unit/baskerville_tests/db_tests/test_data_partition.py deleted file mode 100644 index c512c95b..00000000 --- a/tests/unit/baskerville_tests/db_tests/test_data_partition.py +++ /dev/null @@ -1,229 +0,0 @@ -# Copyright (c) 2020, eQualit.ie inc. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. 
- - -import unittest -from datetime import datetime, timedelta - -import isoweek -from baskerville.db.data_partitioning import ( - DataPartitioner, TemporalDataPartitioner -) -from baskerville.db.temporal_partition import TimePeriod, \ - TemporalPartitionedTable -from baskerville.util.enums import PartitionByEnum - -from tests.unit.baskerville_tests.helpers.utils import get_default_data_path - - -class TestDataPartitioner(unittest.TestCase): - def setUp(self) -> None: - self.parent_table = 'parent_table' - self.partition_field = 'partition_field' - self.index_by = 'index_by' - self.template_path = get_default_data_path() - self.template_name = 'sample_template.jinja2' - self.instance = DataPartitioner( - self.parent_table, - self.partition_field, - index_by=self.index_by, - template_path=self.template_path, - template_name=self.template_name - ) - - def test_instance(self): - self.assertTrue(hasattr(self.instance, 'parent_table')) - self.assertTrue(hasattr(self.instance, 'partition_field')) - self.assertTrue(hasattr(self.instance, 'index_by')) - self.assertTrue(hasattr(self.instance, 'template_path')) - self.assertTrue(hasattr(self.instance, 'template_name')) - - self.assertEqual(self.instance.parent_table, self.parent_table) - self.assertEqual(self.instance.partition_field, self.partition_field) - self.assertEqual(self.instance.index_by, self.index_by) - - -class TestTemporalDataPartitioner(unittest.TestCase): - def setUp(self) -> None: - self.parent_table = 'parent_table' - self.partition_field = 'partition_field' - self.partition_by = PartitionByEnum.w - self.index_by = list('abc') - self.template_path = get_default_data_path() - self.template_name = 'sample_template.jinja2' - self.start = datetime(2019, 12, 1).replace(hour=0, minute=0, second=0) - self.end = (self.start + timedelta(days=10)) - self.time_window = TimePeriod(self.start, self.end) - - self.instance = TemporalDataPartitioner( - self.parent_table, - self.partition_field, - self.time_window, - partition_by=self.partition_by, - index_by=self.index_by, - template_path=self.template_path, - template_name=self.template_name - ) - - def test_instance(self): - self.assertTrue(hasattr(self.instance, 'parent_table')) - self.assertTrue(hasattr(self.instance, 'partition_field')) - self.assertTrue(hasattr(self.instance, 'index_by')) - self.assertTrue(hasattr(self.instance, 'template_path')) - self.assertTrue(hasattr(self.instance, 'template_name')) - - self.assertEqual(self.instance.parent_table, self.parent_table) - self.assertEqual(self.instance.partition_field, self.partition_field) - self.assertEqual(self.instance.index_by, self.index_by) - self.assertTrue(hasattr(self.instance, 'partitioned_table')) - self.assertTrue(isinstance( - self.instance.partitioned_table, TemporalPartitionedTable) - ) - - def test_to_dict_partition_by_month_not_strict(self): - self.instance = TemporalDataPartitioner( - self.parent_table, - self.partition_field, - self.time_window, - partition_by=PartitionByEnum.m, - index_by=self.index_by, - template_path=self.template_path, - template_name=self.template_name - ) - results = self.instance.to_dict() - print(results) - f_start = self.start.strftime("%Y-%m-%d %H:%M:%S") - f_end = self.end.replace(microsecond=999999).strftime( - "%Y-%m-%d %H:%M:%S.%f") - expected_results = { - 'name': 'parent_table', - 'partition_prefix': 'parent_table_y2019_m', - 'partitions': [], - 'catch_all_partition_name': 'parent_table_catch_all', - 'partitioned_by': PartitionByEnum.m, - 'partition_field': 'partition_field', - 
'field_value': f'cast(extract(month from NEW.{self.partition_field}) AS TEXT)', - 'self_check': f"NEW.{self.partition_field} >= '{f_start}' " - f"AND NEW.{self.partition_field} <= '{f_end}' " - } - - self.assertTrue(len(results.keys()) == len(expected_results.keys())) - for k, v in results.items(): - if k != 'partitions': - print(results[k]) - print(expected_results[k]) - self.assertTrue(results[k] == expected_results[k]) - - def test_to_dict_partition_by_month_strict(self): - self.start = datetime(2019, 12, 1) - self.end = ( - self.start + timedelta(days=10) - ).replace(microsecond=999999) - self.time_window = TimePeriod(self.start, self.end) - - self.instance = TemporalDataPartitioner( - self.parent_table, - self.partition_field, - self.time_window, - partition_by=PartitionByEnum.m, - index_by=self.index_by, - template_path=self.template_path, - template_name=self.template_name, - strict=True - ) - results = self.instance.to_dict() - print(results) - f_start = self.start.strftime("%Y-%m-%d %H:%M:%S") - f_end = self.end.strftime("%Y-%m-%d %H:%M:%S.%f") - expected_results = { - 'name': 'parent_table', - 'partition_prefix': 'parent_table_y2019_m', - 'partitions': [], - 'catch_all_partition_name': 'parent_table_catch_all', - 'partitioned_by': PartitionByEnum.m, - 'partition_field': 'partition_field', - 'field_value': f'cast(extract(month from NEW.{self.partition_field}) AS TEXT)', - 'self_check': f"NEW.{self.partition_field} >= '{f_start}' " - f"AND NEW.{self.partition_field} <= '{f_end}' " - } - - self.assertTrue(len(results.keys()) == len(expected_results.keys())) - for k, v in results.items(): - if k != 'partitions': - print(results[k]) - print(expected_results[k]) - self.assertTrue(results[k] == expected_results[k]) - - def test_to_dict_partition_by_week_not_strict(self): - results = self.instance.to_dict() - start_w = isoweek.Week(self.start.year, self.start.isocalendar()[1]) - end_w = isoweek.Week(self.end.year, self.end.isocalendar()[1]) - f_start = datetime.combine( - start_w.monday(), datetime.min.time() - ).strftime("%Y-%m-%d %H:%M:%S") - f_end = datetime.combine( - end_w.sunday(), datetime.min.time() - ).replace( - hour=23, minute=59, second=59, microsecond=999999 - ).strftime("%Y-%m-%d %H:%M:%S.%f") - - print(f_start, f_end) - expected_results = { - 'name': 'parent_table', - 'partition_prefix': 'parent_table_y2019_w', - 'partitions': [], - 'catch_all_partition_name': 'parent_table_catch_all', - 'partitioned_by': 'week', 'partition_field': 'partition_field', - 'field_value': f'cast(extract(week from NEW.{self.partition_field}) AS TEXT)', - 'self_check': f"NEW.{self.partition_field} >= '{f_start}' " - f"AND NEW.{self.partition_field} <= '{f_end}' " - } - - self.assertTrue(len(results.keys()) == len(expected_results.keys())) - for k, v in results.items(): - if k != 'partitions': - print(results[k]) - print(expected_results[k]) - self.assertTrue(results[k] == expected_results[k]) - - def test_to_dict_partition_by_week_strict(self): - self.start = datetime(2019, 12, 1) - self.end = (self.start + timedelta(days=10)).replace( - microsecond=999999 - ) - self.time_window = TimePeriod(self.start, self.end) - - self.instance = TemporalDataPartitioner( - self.parent_table, - self.partition_field, - self.time_window, - partition_by=PartitionByEnum.w, - index_by=self.index_by, - template_path=self.template_path, - template_name=self.template_name, - strict=True - ) - results = self.instance.to_dict() - f_start = self.start.strftime("%Y-%m-%d %H:%M:%S") - f_end = self.end.strftime("%Y-%m-%d 
%H:%M:%S.%f") - - expected_results = { - 'name': 'parent_table', - 'partition_prefix': 'parent_table_y2019_w', - 'partitions': [], - 'catch_all_partition_name': 'parent_table_catch_all', - 'partitioned_by': 'week', 'partition_field': 'partition_field', - 'field_value': f'cast(extract(week from NEW.{self.partition_field}) AS TEXT)', - 'self_check': f"NEW.{self.partition_field} >= '{f_start}' " - f"AND NEW.{self.partition_field} <= '{f_end}' " - } - - self.assertTrue(len(results.keys()) == len(expected_results.keys())) - for k, v in results.items(): - if k != 'partitions': - print(results[k]) - print(expected_results[k]) - self.assertTrue(results[k] == expected_results[k]) diff --git a/tests/unit/baskerville_tests/db_tests/test_temporal_partition.py b/tests/unit/baskerville_tests/db_tests/test_temporal_partition.py deleted file mode 100644 index d753049d..00000000 --- a/tests/unit/baskerville_tests/db_tests/test_temporal_partition.py +++ /dev/null @@ -1,85 +0,0 @@ -# Copyright (c) 2020, eQualit.ie inc. -# All rights reserved. -# -# This source code is licensed under the BSD-style license found in the -# LICENSE file in the root directory of this source tree. - - -# import unittest -# from baskerville.db.temporal_partition import ( -# TimePeriod, -# TemporalPartitionedTable, -# TemporalPartition, -# ) -# -# -# class TestTimePeriod(unittest.TestCase): -# def test_instance(self): -# raise NotImplementedError() -# -# def test_gt(self): -# raise NotImplementedError() -# -# def test_ge(self): -# raise NotImplementedError() -# -# def test_lt(self): -# raise NotImplementedError() -# -# def test_le(self): -# raise NotImplementedError() -# -# def test_split_by_year(self): -# raise NotImplementedError() -# -# def test_split_by_year_and_month(self): -# raise NotImplementedError() -# -# def test_split_by_year_and_week(self): -# raise NotImplementedError() -# -# -# class TestTemporalPartitionedTable(unittest.TestCase): -# def test_instance(self): -# raise NotImplementedError() -# -# def test_get_partition_prefix(self): -# raise NotImplementedError() -# -# def test_get_partition_name(self): -# raise NotImplementedError() -# -# def test_get_partition_range(self): -# raise NotImplementedError() -# -# def test_partition(self): -# raise NotImplementedError() -# -# def test_get_partition_for(self): -# raise NotImplementedError() -# -# def test_get_catch_all_partition(self): -# raise NotImplementedError() -# -# def test_to_dict(self): -# raise NotImplementedError() -# -# -# class TestTemporalPartition(unittest.TestCase): -# def test_instance(self): -# raise NotImplementedError() -# -# def test_gt(self): -# raise NotImplementedError() -# -# def test_ge(self): -# raise NotImplementedError() -# -# def test_lt(self): -# raise NotImplementedError() -# -# def test_le(self): -# raise NotImplementedError() -# -# def test_to_dict(self): -# raise NotImplementedError() diff --git a/tests/unit/baskerville_tests/models_tests/test_base_spark.py b/tests/unit/baskerville_tests/models_tests/test_base_spark.py index 81cd5cc7..d5f1e1af 100644 --- a/tests/unit/baskerville_tests/models_tests/test_base_spark.py +++ b/tests/unit/baskerville_tests/models_tests/test_base_spark.py @@ -14,7 +14,6 @@ from unittest import mock from baskerville.db.models import RequestSet -from baskerville.spark.helpers import StorageLevelFactory from baskerville.util.helpers import get_default_data_path from tests.unit.baskerville_tests.helpers.spark_testing_base import \ @@ -57,7 +56,7 @@ def setUp(self): f'/logs/test_base.log' self.engine_conf.metrics = 
False self.engine_conf.cache_expire_time = 10 - self.spark_conf = SparkConfig({'db_driver': 'test'}) + self.spark_conf = SparkConfig({'db_driver': 'test', 'jars': ''}) self.spark_conf.validate() from baskerville.models.base_spark import SparkPipelineBase @@ -179,59 +178,6 @@ def test_initialize(self, mock_bytes, mock_instantiate_from_str): mock_bytes.decode.assert_called_once() mock_instantiate_from_str.assert_called_once() - # def test_initialize_model_path(self): - # - # # to call get_ml_model_from_file - # self.engine_conf.model_id = None - # self.engine_conf.model_path = 'some test path' - # self.spark_pipeline.model_manager.set_anomaly_detector_broadcast = mock.MagicMock() - # self.spark_pipeline.initialize() - # self.assertEqual( - # self.spark_pipeline.time_bucket.sec, - # self.engine_conf.time_bucket - # ) - # self.assertEqual( - # self.spark_pipeline.time_bucket.td, - # timedelta(seconds=self.engine_conf.time_bucket) - # ) - # - # db_tools = self.spark_pipeline.tools - # db_tools.connect_to_db.assert_called_once() - # - # self.spark_pipeline.instantiate_spark_session.assert_called_once() - # self.spark_pipeline.set_up_request_set_cache.assert_called_once() - # - # self.assertEqual(len(self.spark_pipeline.group_by_aggs), 3) - # self.assertTrue('first_request' in self.spark_pipeline.group_by_aggs) - # self.assertTrue('last_request' in self.spark_pipeline.group_by_aggs) - # self.assertTrue('num_requests' in self.spark_pipeline.group_by_aggs) - # self.assertEqual( - # str(self.spark_pipeline.group_by_aggs['first_request']._jc), - # 'min(@timestamp) AS `first_request`' - # ) - # self.assertEqual( - # str(self.spark_pipeline.group_by_aggs['last_request']._jc), - # 'max(@timestamp) AS `last_request`' - # ) - # self.assertEqual( - # str(self.spark_pipeline.group_by_aggs['num_requests']._jc), - # 'count(@timestamp) AS `num_requests`' - # ) - # - # self.assertEqual(len(self.spark_pipeline.feature_manager.column_renamings), 0) - # self.assertEqual(len(self.spark_pipeline.feature_manager.active_features), 0) - # self.assertEqual(len(self.spark_pipeline.feature_manager.active_feature_names), 0) - # self.assertEqual(len(self.spark_pipeline.feature_manager.active_columns), 0) - # self.assertEqual(len(self.spark_pipeline.columns_to_filter_by), 3) - # self.assertSetEqual( - # self.spark_pipeline.columns_to_filter_by, - # {'client_request_host', 'client_ip', '@timestamp'} - # ) - # db_tools = self.spark_pipeline.tools - # db_tools.get_ml_model_from_file.assert_called_once_with( - # self.engine_conf.model_path - # ) - @mock.patch('baskerville.models.base_spark.instantiate_from_str') def test_initialize_no_model_register_metrics(self, mock_instantiate_from_str): @@ -726,6 +672,7 @@ def test_save(self, mock_bytes, mock_instantiate_from_str): 'r': 0., 'time_bucket': 10, 'model_version': 'test', + 'classifier_score': 0.0 }, { 'id': 1, @@ -737,6 +684,8 @@ def test_save(self, mock_bytes, mock_instantiate_from_str): 'id_runtime': -1, 'request_set_prediction': -1, 'prediction': -1, + 'prediction_anomaly': 0, + 'prediction_classifier': 0, 'attack_prediction': 0, 'low_rate_attack': 0, 'challenged': 0, @@ -755,6 +704,7 @@ def test_save(self, mock_bytes, mock_instantiate_from_str): 'updated_at': now, 'time_bucket': 10, 'model_version': 'test', + 'classifier_score': 0.0 } ] self.spark_pipeline.set_broadcasts = mock.MagicMock() @@ -859,131 +809,6 @@ def test_feature_extraction(self): mock_feature3.compute.return_value ) - @mock.patch('baskerville.spark.helpers.col_to_json') - def 
test_save_df_to_table_diff_params(self, col_to_json): - self.spark_conf.storage_level = 'OFF_HEAP' - test_table = 'test_table' - json_cols = ('a', 'b') - mode_param = 'test_mode' - df = mock.MagicMock() - - col_to_json.return_value = df - persist = df.persist - format = df.write.format - options = format.return_value.options - mode = options.return_value.mode - save = mode.return_value.save - persist.return_value = df - self.spark_pipeline.save_df_to_table( - df, test_table, json_cols=json_cols, mode=mode_param - ) - - persist.assert_called_once_with( - StorageLevelFactory.get_storage_level(self.spark_conf.storage_level)) - format.assert_called_once_with('jdbc') - options.assert_called_once_with( - url=self.spark_pipeline.db_url, - driver=self.spark_pipeline.spark_conf.db_driver, - dbtable=test_table, - user=self.spark_pipeline.db_conf.user, - password=self.spark_pipeline.db_conf.password, - stringtype='unspecified', - batchsize=100000, - max_connections=1250, - rewriteBatchedStatements=True, - reWriteBatchedInserts=True, - useServerPrepStmts=False - ) - mode.assert_called_once_with(mode_param) - - called_args = [] - for args in col_to_json.call_args_list: - self.assertEqual(args[0][0], df) - self.assertTrue(args[0][1] in json_cols) - called_args.append(args[0][1]) - - self.assertSetEqual(set(called_args), set(json_cols)) - - save.assert_called_once() - - def test_save_df_to_table(self): - test_table_name = 'test' - df = mock.MagicMock() - persist = df.persist - after_col_to_json = persist.return_value - format = after_col_to_json.write.format - options = format.return_value.options - mode = options.return_value.mode - save = mode.return_value.save - - self.spark_pipeline.save_df_to_table( - df, - test_table_name, - json_cols=() - ) - - format.assert_called_once_with('jdbc') - options.assert_called_once_with( - url=self.spark_pipeline.db_url, - driver=self.spark_pipeline.spark_conf.db_driver, - dbtable=test_table_name, - user=self.spark_pipeline.db_conf.user, - password=self.spark_pipeline.db_conf.password, - stringtype='unspecified', - batchsize=100000, - max_connections=1250, - rewriteBatchedStatements=True, - reWriteBatchedInserts=True, - useServerPrepStmts=False - ) - mode.assert_called_once_with('append') - save.assert_called_once() - - @mock.patch('baskerville.spark.helpers.col_to_json') - def test_save_df_to_table_json_cols(self, col_to_json): - test_table_name = 'test' - test_json_cols = ('a', 'b') - df = mock.MagicMock() - - col_to_json.return_value = df - persist = df.persist - format = df.write.format - options = format.return_value.options - mode = options.return_value.mode - save = mode.return_value.save - persist.return_value = df - - self.spark_pipeline.save_df_to_table( - df, - test_table_name, - json_cols=test_json_cols - ) - - format.assert_called_once_with('jdbc') - options.assert_called_once_with( - url=self.spark_pipeline.db_url, - driver=self.spark_pipeline.spark_conf.db_driver, - dbtable=test_table_name, - user=self.spark_pipeline.db_conf.user, - password=self.spark_pipeline.db_conf.password, - stringtype='unspecified', - batchsize=100000, - max_connections=1250, - rewriteBatchedStatements=True, - reWriteBatchedInserts=True, - useServerPrepStmts=False - ) - mode.assert_called_once_with('append') - save.assert_called_once() - self.assertEqual(col_to_json.call_count, 2) - actual_json_col = [] - for call in col_to_json.call_args_list: - self.assertTrue(call[0][0] == df) - self.assertTrue(call[0][1] in test_json_cols) - actual_json_col.append(call[0][1]) - - 
self.assertTupleEqual(tuple(actual_json_col), test_json_cols) - @mock.patch('baskerville.models.base_spark.instantiate_from_str') @mock.patch('baskerville.models.base_spark.bytes') def test_filter_columns(self, mock_bytes, mock_instantiate_from_str): diff --git a/tests/unit/baskerville_tests/models_tests/test_pipelines.py b/tests/unit/baskerville_tests/models_tests/test_pipelines.py index 4fbf237a..a2d4b93c 100644 --- a/tests/unit/baskerville_tests/models_tests/test_pipelines.py +++ b/tests/unit/baskerville_tests/models_tests/test_pipelines.py @@ -13,26 +13,6 @@ # from baskerville import src_dir -class TestElasticsearchPipelinee(unittest.TestCase): - def setUp(self): - pass - - # def test_instance(self): - # raise NotImplementedError() - # - # def test_create_runtime(self): - # raise NotImplementedError() - # - # def test_get_data(self): - # raise NotImplementedError() - # - # def test_save_logs(self): - # raise NotImplementedError() - # - # def test_run(self): - # raise NotImplementedError() - - class TestManualRawLogPipeline(unittest.TestCase): def setUp(self): pass diff --git a/tests/unit/baskerville_tests/models_tests/test_request_set_cache.py b/tests/unit/baskerville_tests/models_tests/test_request_set_cache.py index b41457f2..e0b14199 100644 --- a/tests/unit/baskerville_tests/models_tests/test_request_set_cache.py +++ b/tests/unit/baskerville_tests/models_tests/test_request_set_cache.py @@ -114,7 +114,8 @@ def test__load(self, mock_broadcast): self.test_cache_config, self.test_table_name, self.test_columns_to_keep, - group_by_fields=self.test_groupby_fields + group_by_fields=self.test_groupby_fields, + use_storage=True ) rsc.session_getter = mock.MagicMock()
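
Note on the shapley_with_spark_and_anomaly_model.py hunk above: replacing `F.col('is_selected') == True` with `F.col('is_selected') is True` silences flake8's E712 warning, but `is` performs a Python identity check on the Column object rather than building a Spark expression, so the selected row would no longer be found. A minimal sketch of a lint-clean alternative, assuming `is_selected` is a boolean flag column as the original `== True` comparison implies (the SparkSession and toy DataFrame below are illustrative only, not part of the patch):

```python
from pyspark.sql import SparkSession, functions as F

# Illustrative stand-in for the DataFrame returned by select_row(); only the
# boolean `is_selected` flag matters for this example.
spark = SparkSession.builder.master('local[1]').getOrCreate()
df = spark.createDataFrame([(1, True), (2, False)], ['id', 'is_selected'])

# A boolean Column is already a valid filter expression, so neither `== True`
# (flagged by E712) nor `is True` (an identity check on the Column object,
# which never yields a usable filter) is needed.
row = df.select('id', 'is_selected').where(F.col('is_selected')).first()
print(row)  # Row(id=1, is_selected=True)
```

Keeping the original comparison with a `# noqa: E712` marker would be an equally valid way to satisfy the linter without changing behaviour.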