CogStack
diff --git a/‎examples/example10/README‎
Lines changed: 46 additions & 0 deletions b/‎examples/example10/README‎
Lines changed: 46 additions & 0 deletions
diff --git a/‎examples/example10/cogstack-pipeline/annotation_ingester/config/config.yml‎
Lines changed: 44 additions & 0 deletions b/‎examples/example10/cogstack-pipeline/annotation_ingester/config/config.yml‎
Lines changed: 44 additions & 0 deletions
diff --git a/‎examples/example10/cogstack-pipeline/cogstack/conf/step-1/es_ingestion.properties‎
Lines changed: 79 additions & 0 deletions b/‎examples/example10/cogstack-pipeline/cogstack/conf/step-1/es_ingestion.properties‎
Lines changed: 79 additions & 0 deletions
diff --git a/‎examples/example10/cogstack-pipeline/cogstack/run_pipeline.sh‎
Lines changed: 35 additions & 0 deletions b/‎examples/example10/cogstack-pipeline/cogstack/run_pipeline.sh‎
Lines changed: 35 additions & 0 deletions
diff --git a/‎examples/example10/cogstack-pipeline/docker-compose.yml‎
Lines changed: 191 additions & 0 deletions b/‎examples/example10/cogstack-pipeline/docker-compose.yml‎
Lines changed: 191 additions & 0 deletions
@@ -0,0 +1,46 @@
+EXAMPLE 10 :
+
+COGSTACK-PIPELINE -> ELASTIC SEARCH DOCUMENT INGESTION from DB -> MEDCAT SERVICE -> REINGESTION OF THE ANNOTATIONS from MEDCAT into ES 
+
+Description: We use the pipeline to quickly ingest documents from the database into an Elasticsearch index "sample_observations_view". In parallel, a separate pipeline service gets those documents from ES, sends them to the MedCAT service, retrieves the annotated text and re-ingests it into a separate ES index called "sample_observations_view_annotations".
+
+
+To get this working please do the following steps:
+
+1. Go to ./data/models/ and run the download_medmen.sh script from inside that folder (it must be executed from within ./data/models/), using the command: bash download_medmen.sh
+2. Go to cogstack-pipeline/ and start the Docker containers using the following command: docker-compose up -d
+
+Please be patient, this example will take time until it finishes ingesting all the documents.
+
+
+3. If all goes well, navigate to  http://localhost:5601/  and login using the following credentials :  
+    -username: elastic
+    -password: admin
+
+    
+4. Navigate to http://localhost:5601/app/management/kibana/indexPatterns/create
+
+
+Click on create index pattern:
+	- paste   sample_observations_view   into the input field.
+Click on next step
+	- select  observation_timestamp  from the dropdown.
+Click Next
+	- Done, you should have the sample text index ready to view !
+
+
+5. Repeat the instructions from step 4, this time for the annotations index:
+Click on create index pattern:
+	- paste   sample_observations_view_annotations into the input field.
+Click on next step
+	- select  meta.observation_timestamp  from the dropdown.
+Click Next
+	- Done, you should have the annotations index ready to view!
+
+6. All Done ! 
+   You can view how many records you have and how much space your index takes + other stats at http://localhost:5601/app/management/data/index_management/indices
+   
+   To view the records inserted, go to : http://localhost:5601/app/discover  , change the query timeframe to last 15 years and perform a search!
+
+
+The data used in this example is taken from example2...
@@ -0,0 +1,44 @@
+source:
+  # Elasticsearch index the documents are read from; the index name is the one
+  # the CogStack pipeline writes to (elasticsearch.index.name in es_ingestion.properties).
+  es:
+    # The URL carries the basic-auth credentials (user "elastic", password "admin").
+    hosts: ["http://elastic:admin@elasticsearch-1:9200"]
+    index-name: 'sample_observations_view'
+    # Certificate settings, left commented out; the hosts URL above uses plain http.
+    #security:
+    #  ca-certs-path: "/app/config/root-ca.pem"
+    #  client-cert-path: "/app/config/client.pem"
+    #  client-key-path: "/app/config/client.key"
+
+sink:
+  # Elasticsearch index the annotations are written to; it lives on the same
+  # elasticsearch-1 host as the source index but under a separate index name.
+  es:
+    # The URL carries the basic-auth credentials (user "elastic", password "admin").
+    hosts: ["http://elastic:admin@elasticsearch-1:9200"]
+    index-name: 'sample_observations_view_annotations'
+    # Certificate settings, left commented out; the hosts URL above uses plain http.
+    #security:
+    #  ca-certs-path: "/app/config/root-ca.pem"
+    #  client-cert-path: "/app/config/client.pem"
+    #  client-key-path: "/app/config/client.key"
+
+nlp-service:
+  # URL of the NLP service the documents are sent to; host and port correspond
+  # to the medcat-service container (port 5000) declared in docker-compose.yml.
+  endpoint-url: 'http://medcat-service:5000/api/process'
+
+mapping:
+  # Field-level mapping between the source index, the NLP service and the sink index.
+  source:
+    # NOTE(review): presumably the source field holding the free text sent for annotation - confirm against the ingester docs.
+    text-field: 'encounter_document'
+    # NOTE(review): presumably the source field used as the document identifier - confirm.
+    docid-field: 'encounter_id'
+    # Source fields listed for persisting; the commented-out entries are currently disabled.
+    # NOTE(review): these presumably end up under "meta." in the annotations index
+    # (the README selects meta.observation_timestamp there) - confirm.
+    persist-fields:
+      - 'encounter_id'
+    #  - 'patient_id'
+      - 'encounter_start'
+      - 'observation_timestamp'
+    #  - 'encounter_end'
+    # NOTE(review): batch settings appear to window the source documents on 'encounter_start'
+    # between date-start and date-end; the unit of 'interval' is not visible here - confirm.
+    batch:
+      date-field: 'encounter_start'
+      # The same date layout expressed twice: once as 'yyyy-MM-dd' and once in Python strftime syntax.
+      date-format: 'yyyy-MM-dd'
+      python-date-format: '%Y-%m-%d' 
+      interval: 30 
+      date-start: '1922-01-01'
+      date-end: '2099-06-01'
+      threads: 8
+  sink:
+    # Empty string: no split field is configured.
+    split-index-by-field: ''
+  nlp:
+    # Note: the value is the quoted string 'true', not a YAML boolean.
+    skip-processed-doc-check: 'true'
+    annotation-id-field: 'id'
@@ -0,0 +1,79 @@
+## ACTIVE SPRING PROFILES
+##
+# Comma-separated list of the Spring profiles enabled for this job.
+# NOTE(review): going by their names these select a JDBC source, the Elasticsearch
+# REST sink and local partitioning - confirm against the CogStack-Pipeline docs.
+spring.profiles.active = jdbc_in,elasticsearchRest,localPartitioning
+                                
+
+##### JOB AND REPO DB CONFIGURATIONS
+#
+# JDBC connection to the PostgreSQL job repository. Host, database name and
+# credentials correspond to the cogstack-job-repo service in docker-compose.yml
+# (POSTGRES_USER / POSTGRES_PASSWORD / POSTGRES_COGSTACK_DB).
+#jobRepository.jobName = db_to_es_ingestion
+jobRepository.JdbcPath = jdbc:postgresql://cogstack-job-repo:5432/cogstack_job_repo
+jobRepository.Driver = org.postgresql.Driver
+jobRepository.username = cogstack
+jobRepository.password = cogstack
+
+#### SOURCE: DB CONFIGURATIONS
+##
+# JDBC connection to the source database. Host, database name and credentials
+# correspond to the databank-db service in docker-compose.yml; 5432 is the
+# container-internal port (the host-side mapping is 5433).
+source.JdbcPath = jdbc:postgresql://databank-db:5432/project_data
+source.Driver = org.postgresql.Driver
+source.username = test
+source.password = test
+
+# The principal SQL block that specifies data to process. Composed of three parts.
+source.selectClause = SELECT *
+source.fromClause = FROM observations_view
+source.sortKey = observation_id
+
+# The principal DB column label mapping for Document data model
+source.primaryKeyFieldValue = observation_id
+source.timeStamp = observation_timestamp
+
+# Type of the timestamp field: 'TIMESTAMP', 'DATETIME', 'DATE' or 'TIME'
+source.dbmsToJavaSqlTimestampType = TIMESTAMP
+
+##### SINK: ELASTICSEARCH CONFIGURATION
+##
+# Host, port and credentials correspond to the elasticsearch-1 service in
+# docker-compose.yml (ELASTIC_USER / ELASTIC_PASSWORD).
+elasticsearch.xpack.security.transport.ssl.enabled = false
+elasticsearch.cluster.host = elasticsearch-1
+elasticsearch.cluster.port = 9200
+elasticsearch.xpack.enabled = true
+elasticsearch.xpack.user = elastic
+elasticsearch.xpack.password = admin
+
+# optional: ES indexing options
+elasticsearch.index.name = sample_observations_view
+# – the name of the index in ElasticSearch under which documents are or will be stored,
+elasticsearch.excludeFromIndexing = observation_id
+
+# elasticsearch.type
+# (default: doc) – the type of the documents (deprecated in ElasticSearch 6.0+),
+
+# Keep this value unquoted and without trailing spaces: .properties values are
+# taken literally, so quotes would become part of the cluster name and it would
+# no longer equal cluster.name=cogstack-elastic-cluster set in docker-compose.yml.
+elasticsearch.cluster.name = cogstack-elastic-cluster
+# (default: elasticsearch)
+# – the name of the ElasticSearch cluster,
+# elasticsearch.cluster.slaveNodes
+# – the list of comma-separated <host:port> values for multi-node deployments,
+# elasticsearch.connect.timeout
+# (default: 5000)  – max. time value (in ms) for connection timeout,
+# elasticsearch.retry.timeout
+# (default: 60000) – max. time value (in ms) before performing retry,
+
+#Available properties when using the ElasticSearch X-Pack security module:
+#ES basic auth from X-pack security plugin (commercial)
+
+# (default: false) – specifies whether to use SSL encryption for communication between ElasticSearch nodes,
+# elasticsearch.xpack.ssl.keystore.path – the path to the Java Keystore file that contains a private key and certificate,
+# elasticsearch.xpack.ssl.keystore.password – the password to the keystore,
+# elasticsearch.xpack.ssl.truststore.path – the path to the Java Keystore file that contains the certificates to trust,
+# elasticsearch.xpack.ssl.truststore.password – the password to the truststore.
+
+
+##### PARTITIONER CONFIGURATION
+##
+# The table, primary-key column and timestamp column named here are the same
+# ones used by the source.* settings (observations_view / observation_id /
+# observation_timestamp).
+partitioner.partitionType = PKTimeStamp
+partitioner.timeStampColumnName = observation_timestamp
+partitioner.pkColumnName = observation_id
+partitioner.tableToPartition = observations_view
+
+## SCHEDULER CONFIGURATION
+##
+# optional (default: false): if true, run a new job after the last one has finished - new jobs will continue to be created indefinitely
+#scheduler.useScheduling = false
@@ -0,0 +1,35 @@
+#!/bin/bash
+
+# wait for the services to get ready
+#
+echo "*** Awaiting services to start CogStack Pipeline ***"
+
+while IFS=',' read -ra ADDR; do
+  for i in "${ADDR[@]}"; do
+      ./wait_for_service.sh "$i" "--timeout=0" "--stdout"
+  done
+done <<< $SERVICES_USED
+
+
+# start cogstack pipeline
+#
+echo "*** Starting CogStack Pipeline ***"
+
+cog_start=`date +%s`
+
+#COG_PATH=/cogstack-pipeline
+#COG_CONFIG_PATH=$COG_PATH/C
+
+time sh -c "java -DLOG_FILE_NAME=$LOG_FILE_NAME -DLOG_LEVEL=$LOG_LEVEL -DFILE_LOG_LEVEL=$FILE_LOG_LEVEL -jar ${1} $COG_CONFIG_PATH/step-1/"
+# time sh -c "java -DLOG_FILE_NAME=$LOG_FILE_NAME -DLOG_LEVEL=$LOG_LEVEL -DFILE_LOG_LEVEL=$FILE_LOG_LEVEL -jar ${1} $COG_CONFIG_PATH/step-2/"
+
+cog_end=`date +%s`
+
+echo "*** Finishing CogStack Pipeline ***"
+
+runtime_s=$((cog_end-cog_start))
+runtime_m=$((runtime_s/60))
+runtime_h=$((runtime_m/60))
+
+echo "Duration: $runtime_h h $((runtime_m-(runtime_h*60))) m $((runtime_s-(runtime_m*60))) s"
+echo "Total: $runtime_s sec"
@@ -0,0 +1,191 @@
+version: "3.5"
+services:
+#---------------------------------------------------------------------------#
+# Postgres container with sample data                                       #
+#---------------------------------------------------------------------------#
+  databank-db:
+    # Source database for the pipeline: source.JdbcPath in es_ingestion.properties
+    # points at this service, and the credentials below match source.username/password.
+    image: postgres:13-alpine
+    shm_size: 2048mb
+    restart: always
+    environment:
+      - POSTGRES_USER=test
+      - POSTGRES_PASSWORD=test
+      # NOTE(review): not one of the stock postgres image variables; presumably
+      # read by the init script mounted below - confirm.
+      - POSTGRES_DATABANK_DB=project_data
+    volumes:
+      - ../data/:/data  # mount the folder containing the DB csv files
+      - ./scripts/create_source_and_sink_db.sh:/docker-entrypoint-initdb.d/create_source_and_sink_db.sh:ro
+      - databank-db-vol:/var/lib/postgresql/data
+    ports:
+      # Published on host port 5433 because cogstack-job-repo already publishes 5432.
+      # Quoted like the mappings of the other services so YAML always reads it as a string.
+      - "5433:5432"
+    networks:
+      - cognet
+
+#---------------------------------------------------------------------------#
+# CogStack-Pipeline related containers                                      #
+#---------------------------------------------------------------------------#
+  cogstack-pipeline:
+    image: cogstacksystems/cogstack-pipeline:dev-latest
+    #image: cogstacksystems/cogstack-pipeline-gate:latest
+    shm_size: 128mb
+    restart: always
+    environment:
+      # host:port entries that run_pipeline.sh waits for before starting the job
+      - SERVICES_USED=cogstack-job-repo:5432,databank-db:5432,elasticsearch-1:9200
+      # forwarded to the JVM as -D properties by run_pipeline.sh
+      - LOG_LEVEL=info
+      - LOG_FILE_NAME=cogstack_job_log
+      - FILE_LOG_LEVEL=off
+      - COG_PATH=/cogstack  # used in run_pipeline.sh to point at the location of cogstack
+      # run_pipeline.sh launches the job from the step-1/ folder below this path
+      - COG_CONFIG_PATH=/cogstack/cogstack_conf
+    volumes:
+      - ./cogstack/conf:/cogstack/cogstack_conf:ro  # <-- cogstack properties folder
+      - ./cogstack/run_pipeline.sh:/cogstack/run_pipeline.sh:ro
+    depends_on:
+      - cogstack-job-repo
+      - databank-db
+    # java [parameters] -jar cogstack-*.jar <directory>
+    # run_pipeline.sh receives the jar path and the job conf directory as its two arguments
+    command: /cogstack/run_pipeline.sh /cogstack/cogstack-*.jar /cogstack/cogstack_conf
+    networks:
+      - cognet
+ 
+  cogstack-job-repo:
+    # Job repository database: jobRepository.JdbcPath in es_ingestion.properties
+    # points at this service, and the credentials below match jobRepository.username/password.
+    image: postgres:13-alpine
+    shm_size: 128mb
+    restart: always
+    environment:
+      - POSTGRES_USER=cogstack
+      - POSTGRES_PASSWORD=cogstack
+      # NOTE(review): not one of the stock postgres image variables; presumably
+      # read by the init script mounted below - confirm.
+      - POSTGRES_COGSTACK_DB=cogstack_job_repo
+    volumes:
+      - ./scripts/create_pgsql_job_repo.sh:/docker-entrypoint-initdb.d/create_pgsql_job_repo.sh:ro
+      - cogstack-job-vol:/var/lib/postgresql/data
+    depends_on:
+      - databank-db
+    ports:
+      # Quoted like the mappings of the other services so YAML always reads it as a string.
+      - "5432:5432"
+    networks:
+      - cognet
+
+#---------------------------------------------------------------------------#
+# Elasticsearch cluster                                                     #
+#---------------------------------------------------------------------------#
+  elasticsearch-1:
+    image: docker.elastic.co/elasticsearch/elasticsearch:7.10.1
+    #image: amazon/opendistro-for-elasticsearch:1.12.0
+    shm_size: 2048mb
+    restart: always
+    # Work-around for vm.max_map_count errors: append the setting to /etc/sysctl.conf,
+    # then hand over to the image's own /usr/local/bin/docker-entrypoint.sh
+    entrypoint: "/bin/sh -c 'echo vm.max_map_count=262144 >> /etc/sysctl.conf   && /usr/local/bin/docker-entrypoint.sh' "
+    environment:
+      - cluster.name=cogstack-elastic-cluster
+      # credentials used by the pipeline, the annotation ingester and the README login step
+      - ELASTIC_USER=elastic
+      - ELASTIC_PASSWORD=admin
+      # memory locking goes together with the unlimited memlock ulimits below
+      - bootstrap.memory_lock=true
+      - "ES_JAVA_OPTS=-Xms2048m -Xmx2048m"
+    volumes:
+      - ./elasticsearch/config/elasticsearch.yml:/usr/share/elasticsearch/config/elasticsearch.yml:ro
+      - elasticsearch-vol-1:/usr/share/elasticsearch/data
+      # mapping security config
+      # INFO: uncomment below to add SSL keys
+      #- ../security/root-ca.pem:/usr/share/elasticsearch/config/root-ca.pem:ro
+      #- ../security/es-node1.pem:/usr/share/elasticsearch/config/esnode.pem:ro
+      #- ../security/es-node1.key:/usr/share/elasticsearch/config/esnode.key:ro
+      #- ../security/internal_users.yml:/usr/share/elasticsearch/plugins/opendistro_security/securityconfig/internal_users.yml:ro
+    depends_on:
+      - cogstack-job-repo
+    ulimits:
+      memlock:
+        soft: -1
+        hard: -1
+    ports:
+      - "9200:9200"
+    networks:
+      - cognet
+#---------------------------------------------------------------------------#
+# Medcat Service                                                            #
+#---------------------------------------------------------------------------#
+  medcat-service:
+    # NLP service called by the annotation ingester (its endpoint-url targets medcat-service:5000).
+    container_name: medcat-service
+    # No tag is given, so Docker resolves this to the image's default tag.
+    image: cogstacksystems/medcat-service
+    restart: always
+    env_file:
+      - ./medcat_service/envs/env_app
+      - ./medcat_service/envs/env_medcat
+    volumes:
+      # Model files mounted read-only from ../data/models/medmen.
+      # NOTE(review): presumably produced by the download_medmen.sh step described in the README - confirm.
+      - ../data/models/medmen/vocab.dat:/cat/models/vocab.dat:ro
+      - ../data/models/medmen/cdb.dat:/cat/models/cdb.dat:ro
+      - ../data/models/medmen/mc_status:/cat/models/mc_status:ro
+    depends_on:
+      - elasticsearch-1
+    ports:
+      - "5000:5000"
+    networks:
+      - cognet
+#---------------------------------------------------------------------------#
+# Annotation Service                                                            #
+#---------------------------------------------------------------------------#
+  annotation-ingester:
+    image: cogstacksystems/annotations-ingester:latest
+    shm_size: 128mb
+    restart: always
+    volumes:
+      # source / sink / NLP settings of the ingester, mounted read-only
+      - ./annotation_ingester/config/config.yml:/app/config/config.yml:ro
+    # started after the services it reads from and writes to
+    depends_on:
+      - elasticsearch-1
+      - medcat-service
+      - kibana
+      - cogstack-pipeline
+    command: "/app/run.sh"
+    networks:
+      - cognet
+
+#---------------------------------------------------------------------------#
+# Kibana webapp                                                             #
+#---------------------------------------------------------------------------#
+  kibana:
+    image: docker.elastic.co/kibana/kibana:7.10.1
+    #image: amazon/opendistro-for-elasticsearch-kibana:1.12.0
+    shm_size: 128mb
+    restart: always
+    environment:
+      SERVER_NAME: kibana.server
+      # edit the url to match your server
+      ELASTICSEARCH_HOSTS: "http://elasticsearch-1:9200"
+      # INFO: SSL is switched off; the key/certificate entries below stay commented out
+      SERVER_SSL_ENABLED: "false"
+      #SERVER_SSL_KEY: /usr/share/kibana/config/kibana.key
+      #SERVER_SSL_CERTIFICATE: /usr/share/kibana/config/kibana.pem
+    volumes:
+      - ./kibana/config/kibana.yml:/usr/share/kibana/config/kibana.yml:ro
+      # NOTE(review): this mounts the Elasticsearch data volume into the Kibana container - confirm it is needed
+      - elasticsearch-vol-1:/usr/share/elasticsearch/data
+      # INFO: uncomment below to add SSL keys
+      #- ../security/root-ca.pem:/usr/share/kibana/config/root-ca.pem:ro
+      #- ../security/kibana.pem:/usr/share/kibana/config/kibana.pem:ro
+      #- ../security/kibana.key:/usr/share/kibana/config/kibana.key:ro
+    depends_on:
+      - elasticsearch-1
+      - medcat-service
+      - cogstack-pipeline
+    ports:
+      # the README steps open Kibana on http://localhost:5601
+      - "5601:5601"
+    networks:
+      - cognet
+
+#---------------------------------------------------------------------------#
+# Docker networks.                                                          #
+#---------------------------------------------------------------------------#
+networks:
+  # single bridge network that every service above joins
+  cognet:
+    name: cognet
+    driver: bridge
+
+#---------------------------------------------------------------------------#
+# Docker named volumes                                                      #
+#---------------------------------------------------------------------------#
+volumes:
+  # PostgreSQL data directory of the databank-db service
+  databank-db-vol:
+    driver: local
+  # PostgreSQL data directory of the cogstack-job-repo service
+  cogstack-job-vol:
+    driver: local
+  # Elasticsearch data directory of elasticsearch-1 (also mounted by kibana)
+  elasticsearch-vol-1:
+    driver: local