Skip to content
This repository was archived by the owner on Jan 13, 2023. It is now read-only.

Commit e8189f0

Browse files
committed
Added example 10.
1 parent 6235ce1 commit e8189f0

30 files changed

Lines changed: 4560 additions & 3199 deletions

examples/example10/README

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
EXAMPLE 10 :
2+
3+
COGSTACK-PIPELINE -> ELASTIC SEARCH DOCUMENT INGESTION from DB -> MEDCAT SERVICE -> REINGESTION OF THE ANNOTATIONS from MEDCAT into ES
4+
5+
Description: We use the pipeline to quickly ingest documents from the database into an Elasticsearch index named "sample_observations_view". In parallel, a separate ingestion service reads the documents from Elasticsearch, sends them to the MedCAT service, retrieves the annotated text and ingests the annotations into a separate Elasticsearch index called "sample_observations_view_annotations".
6+
7+
8+
To get this working please do the following steps:
9+
10+
1. Go to ./data/models/ and execute the download_medmen.sh (you have to be inside the folder and execute the script !!!) using the command: bash download_medmen.sh
11+
2. Go to cogstack-pipeline/ and start the docker container using the following command : docker-compose up -d
12+
13+
Please be patient, this example will take time until it finishes ingesting all the documents.
14+
15+
16+
3. If all goes well, navigate to http://localhost:5601/ and login using the following credentials :
17+
-username: elastic
18+
-password: admin
19+
20+
21+
4. Navigate to http://localhost:5601/app/management/kibana/indexPatterns/create
22+
23+
24+
Click on create index pattern:
25+
- paste sample_observations_view into the input field.
26+
Click on next step
27+
- select observation_timestamp from the dropdown.
28+
Click Next
29+
- Done, you should have the sample text index ready to view !
30+
31+
32+
5. Repeat the instructions from step 4, this time for the annotations index:
33+
Click on create index pattern:
34+
- paste sample_observations_view_annotations into the input field.
35+
Click on next step
36+
- select meta.observation_timestamp from the dropdown.
37+
Click Next
38+
- Done, you should have the annotations index ready to view !
39+
40+
6. All Done !
41+
You can view how many records you have and how much space your index takes + other stats at http://localhost:5601/app/management/data/index_management/indices
42+
43+
To view the records inserted, go to : http://localhost:5601/app/discover , change the query timeframe to last 15 years and perform a search!
44+
45+
46+
The data used in this example is taken from example2...
Lines changed: 44 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,44 @@
1+
# Annotation ingester configuration.
# Documents are read from the `source` Elasticsearch index, sent to the NLP
# service, and the resulting annotations are written to the `sink` index.
#
# NOTE(review): the nesting below was reconstructed from a flattened copy of
# this file - confirm that `batch` belongs under `mapping.source`.

# Elasticsearch index the documents are read from.
source:
  es:
    hosts:
      - "http://elastic:admin@elasticsearch-1:9200"
    index-name: "sample_observations_view"
    # security:
    #   ca-certs-path: "/app/config/root-ca.pem"
    #   client-cert-path: "/app/config/client.pem"
    #   client-key-path: "/app/config/client.key"

# Elasticsearch index the annotations are written to.
sink:
  es:
    hosts:
      - "http://elastic:admin@elasticsearch-1:9200"
    index-name: "sample_observations_view_annotations"
    # security:
    #   ca-certs-path: "/app/config/root-ca.pem"
    #   client-cert-path: "/app/config/client.pem"
    #   client-key-path: "/app/config/client.key"

# NLP (MedCAT) endpoint that annotates the document text.
nlp-service:
  endpoint-url: "http://medcat-service:5000/api/process"

mapping:
  source:
    # Field holding the free text and field identifying the document.
    text-field: "encounter_document"
    docid-field: "encounter_id"
    # Source fields listed for persistence alongside each annotation.
    persist-fields:
      - "encounter_id"
      # - "patient_id"
      - "encounter_start"
      - "observation_timestamp"
      # - "encounter_end"
    # Date-based batching settings for reading the source index.
    batch:
      date-field: "encounter_start"
      date-format: "yyyy-MM-dd"
      python-date-format: "%Y-%m-%d"
      interval: 30
      date-start: "1922-01-01"
      date-end: "2099-06-01"
      threads: 8
  sink:
    # Empty string: no field is given to split the sink index by.
    split-index-by-field: ""
  nlp:
    # Kept as a quoted string, exactly as in the original configuration.
    skip-processed-doc-check: "true"
    annotation-id-field: "id"
Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
1+
## CogStack Pipeline job configuration.
## Reads rows from the `observations_view` table of the source PostgreSQL
## database and indexes them into the Elasticsearch index
## `sample_observations_view`.

## ACTIVE SPRING PROFILES
##
spring.profiles.active = jdbc_in,elasticsearchRest,localPartitioning


##### JOB AND REPO DB CONFIGURATIONS
#
#jobRepository.jobName = db_to_es_ingestion
jobRepository.JdbcPath = jdbc:postgresql://cogstack-job-repo:5432/cogstack_job_repo
jobRepository.Driver = org.postgresql.Driver
jobRepository.username = cogstack
jobRepository.password = cogstack


#### SOURCE: DB CONFIGURATIONS
##
source.JdbcPath = jdbc:postgresql://databank-db:5432/project_data
source.Driver = org.postgresql.Driver
source.username = test
source.password = test

# The principal SQL block that specifies the data to process. Composed of three parts.
source.selectClause = SELECT *
source.fromClause = FROM observations_view
source.sortKey = observation_id

# The principal DB column label mapping for the Document data model
source.primaryKeyFieldValue = observation_id
source.timeStamp = observation_timestamp

# Type of the timestamp field: 'TIMESTAMP', 'DATETIME', 'DATE' or 'TIME'
source.dbmsToJavaSqlTimestampType = TIMESTAMP


##### SINK: ELASTICSEARCH CONFIGURATION
##
elasticsearch.xpack.security.transport.ssl.enabled = false
elasticsearch.cluster.host = elasticsearch-1
elasticsearch.cluster.port = 9200
elasticsearch.xpack.enabled = true
elasticsearch.xpack.user = elastic
elasticsearch.xpack.password = admin

# optional: ES indexing options
# elasticsearch.index.name
#   - the name of the index in ElasticSearch under which documents are or will be stored,
elasticsearch.index.name = sample_observations_view
elasticsearch.excludeFromIndexing = observation_id

# elasticsearch.type
#   (default: doc) - the type of the documents (deprecated in ElasticSearch 6.0+),
# elasticsearch.cluster.name
#   (default: elasticsearch) - the name of the ElasticSearch cluster.
#   Written without quotes: values in a .properties file are taken literally,
#   so surrounding quotes would become part of the cluster name and no longer
#   match the cluster.name given to the Elasticsearch container.
elasticsearch.cluster.name = cogstack-elastic-cluster
# elasticsearch.cluster.slaveNodes
#   - the list of comma-separated <host:port> values for multi-node deployments,
# elasticsearch.connect.timeout
#   (default: 5000) - max. time value (in ms) for connection timeout,
# elasticsearch.retry.timeout
#   (default: 60000) - max. time value (in ms) before performing retry,

# Available properties when using the ElasticSearch X-Pack security module:
# ES basic auth from X-pack security plugin (commercial)
#
# elasticsearch.xpack.security.transport.ssl.enabled
#   (default: false) - specifies whether to use SSL encryption for communication between ElasticSearch nodes,
# elasticsearch.xpack.ssl.keystore.path - the path to the Java Keystore file that contains a private key and certificate,
# elasticsearch.xpack.ssl.keystore.password - the password to the keystore,
# elasticsearch.xpack.ssl.truststore.path - the path to the Java Keystore file that contains the certificates to trust,
# elasticsearch.xpack.ssl.truststore.password - the password to the truststore.


##### PARTITIONER CONFIGURATION
##
partitioner.partitionType = PKTimeStamp
partitioner.timeStampColumnName = observation_timestamp
partitioner.pkColumnName = observation_id
partitioner.tableToPartition = observations_view

## SCHEDULER CONFIGURATION
##
# optional (default: false): if true, run a new job after the last one has finished - new jobs will continue to be created indefinitely
#scheduler.useScheduling = false
Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
#!/bin/bash
#
# Launcher for the CogStack Pipeline container.
#
# 1. Calls ./wait_for_service.sh once for every entry of $SERVICES_USED
#    (a comma-separated list, e.g. "host-a:5432,host-b:9200").
# 2. Runs the CogStack Pipeline jar against $COG_CONFIG_PATH/step-1/.
# 3. Prints the wall-clock duration of the run.
#
# Arguments:
#   $1  path (or glob pattern) of the CogStack Pipeline jar. Further
#       positional arguments are not read by this script.
# Environment read:
#   SERVICES_USED, LOG_FILE_NAME, LOG_LEVEL, FILE_LOG_LEVEL, COG_CONFIG_PATH

# Fail early with a usage message instead of handing java an empty -jar value.
jar_path="${1:?usage: $0 <cogstack-pipeline jar>}"

# wait for the services to get ready
#
echo "*** Awaiting services to start CogStack Pipeline ***"

# The here-string is quoted so the list reaches `read` verbatim (no word
# splitting or globbing on older bash versions); IFS=',' splits it into ADDR.
while IFS=',' read -ra ADDR; do
  for service in "${ADDR[@]}"; do
    ./wait_for_service.sh "$service" "--timeout=0" "--stdout"
  done
done <<< "$SERVICES_USED"


# start cogstack pipeline
#
echo "*** Starting CogStack Pipeline ***"

cog_start=$(date +%s)

#COG_PATH=/cogstack-pipeline
#COG_CONFIG_PATH=$COG_PATH/C

# The jar path is deliberately left unquoted inside the sh -c string: when the
# caller passes a glob pattern, the inner shell is what expands it.
time sh -c "java -DLOG_FILE_NAME=$LOG_FILE_NAME -DLOG_LEVEL=$LOG_LEVEL -DFILE_LOG_LEVEL=$FILE_LOG_LEVEL -jar ${jar_path} $COG_CONFIG_PATH/step-1/"
# time sh -c "java -DLOG_FILE_NAME=$LOG_FILE_NAME -DLOG_LEVEL=$LOG_LEVEL -DFILE_LOG_LEVEL=$FILE_LOG_LEVEL -jar ${jar_path} $COG_CONFIG_PATH/step-2/"

cog_end=$(date +%s)

echo "*** Finishing CogStack Pipeline ***"

# Break the elapsed seconds down into hours / minutes / seconds.
runtime_s=$((cog_end - cog_start))
runtime_m=$((runtime_s / 60))
runtime_h=$((runtime_m / 60))

echo "Duration: $runtime_h h $((runtime_m-(runtime_h*60))) m $((runtime_s-(runtime_m*60))) s"
echo "Total: $runtime_s sec"
Lines changed: 191 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,191 @@
1+
# Docker Compose stack for example 10: a Postgres databank, the CogStack
# Pipeline with its job repository, a single Elasticsearch node, the MedCAT
# service, the annotation ingester and Kibana - all on the `cognet` network.
version: "3.5"

services:
  #---------------------------------------------------------------------------#
  # Postgres container with sample data                                       #
  #---------------------------------------------------------------------------#
  databank-db:
    image: postgres:13-alpine
    shm_size: 2048mb
    restart: always
    environment:
      - POSTGRES_USER=test
      - POSTGRES_PASSWORD=test
      - POSTGRES_DATABANK_DB=project_data
    volumes:
      - ../data/:/data  # mount the folder containing the DB csv files
      - ./scripts/create_source_and_sink_db.sh:/docker-entrypoint-initdb.d/create_source_and_sink_db.sh:ro
      - databank-db-vol:/var/lib/postgresql/data
    ports:
      # host port 5433, because 5432 is taken by cogstack-job-repo
      - "5433:5432"
    networks:
      - cognet

  #---------------------------------------------------------------------------#
  # CogStack-Pipeline related containers                                      #
  #---------------------------------------------------------------------------#
  cogstack-pipeline:
    image: cogstacksystems/cogstack-pipeline:dev-latest
    # image: cogstacksystems/cogstack-pipeline-gate:latest
    shm_size: 128mb
    restart: always
    environment:
      - SERVICES_USED=cogstack-job-repo:5432,databank-db:5432,elasticsearch-1:9200
      - LOG_LEVEL=info
      - LOG_FILE_NAME=cogstack_job_log
      - FILE_LOG_LEVEL=off
      - COG_PATH=/cogstack  # used in run_pipeline.sh to point at the location of cogstack
      - COG_CONFIG_PATH=/cogstack/cogstack_conf
    volumes:
      - ./cogstack/conf:/cogstack/cogstack_conf:ro  # <-- cogstack properties folder
      - ./cogstack/run_pipeline.sh:/cogstack/run_pipeline.sh:ro
    depends_on:
      # every service named in SERVICES_USED is listed here, so starting this
      # service on its own also starts everything it waits for
      - cogstack-job-repo
      - databank-db
      - elasticsearch-1
    # java [parameters] -jar cogstack-*.jar <directory>
    # run_pipeline.sh is given two arguments: the .jar file and the job conf directory
    command: /cogstack/run_pipeline.sh /cogstack/cogstack-*.jar /cogstack/cogstack_conf
    networks:
      - cognet

  cogstack-job-repo:
    image: postgres:13-alpine
    shm_size: 128mb
    restart: always
    environment:
      - POSTGRES_USER=cogstack
      - POSTGRES_PASSWORD=cogstack
      - POSTGRES_COGSTACK_DB=cogstack_job_repo
    volumes:
      - ./scripts/create_pgsql_job_repo.sh:/docker-entrypoint-initdb.d/create_pgsql_job_repo.sh:ro
      - cogstack-job-vol:/var/lib/postgresql/data
    depends_on:
      - databank-db
    ports:
      - "5432:5432"
    networks:
      - cognet

  #---------------------------------------------------------------------------#
  # Elasticsearch cluster                                                     #
  #---------------------------------------------------------------------------#
  elasticsearch-1:
    image: docker.elastic.co/elasticsearch/elasticsearch:7.10.1
    # image: amazon/opendistro-for-elasticsearch:1.12.0
    shm_size: 2048mb
    restart: always
    # use this in case of vm.max_map_count errors; /usr/local/bin/docker-entrypoint.sh
    # is taken from the DockerHub image of elasticsearch
    # NOTE(review): appending to /etc/sysctl.conf may not apply the setting by
    # itself - verify, or set vm.max_map_count on the Docker host instead.
    entrypoint: "/bin/sh -c 'echo vm.max_map_count=262144 >> /etc/sysctl.conf && /usr/local/bin/docker-entrypoint.sh' "
    environment:
      - cluster.name=cogstack-elastic-cluster
      - ELASTIC_USER=elastic
      - ELASTIC_PASSWORD=admin
      - bootstrap.memory_lock=true
      - "ES_JAVA_OPTS=-Xms2048m -Xmx2048m"
    volumes:
      - ./elasticsearch/config/elasticsearch.yml:/usr/share/elasticsearch/config/elasticsearch.yml:ro
      - elasticsearch-vol-1:/usr/share/elasticsearch/data
      # mapping security config
      # INFO: uncomment below to add SSL keys
      # - ../security/root-ca.pem:/usr/share/elasticsearch/config/root-ca.pem:ro
      # - ../security/es-node1.pem:/usr/share/elasticsearch/config/esnode.pem:ro
      # - ../security/es-node1.key:/usr/share/elasticsearch/config/esnode.key:ro
      # - ../security/internal_users.yml:/usr/share/elasticsearch/plugins/opendistro_security/securityconfig/internal_users.yml:ro
    depends_on:
      - cogstack-job-repo
    ulimits:
      # unlimited locked memory, paired with bootstrap.memory_lock=true above
      memlock:
        soft: -1
        hard: -1
    ports:
      - "9200:9200"
    networks:
      - cognet

  #---------------------------------------------------------------------------#
  # Medcat Service                                                            #
  #---------------------------------------------------------------------------#
  medcat-service:
    container_name: medcat-service
    image: cogstacksystems/medcat-service
    restart: always
    env_file:
      - ./medcat_service/envs/env_app
      - ./medcat_service/envs/env_medcat
    volumes:
      # model files fetched by ../data/models/download_medmen.sh
      - ../data/models/medmen/vocab.dat:/cat/models/vocab.dat:ro
      - ../data/models/medmen/cdb.dat:/cat/models/cdb.dat:ro
      - ../data/models/medmen/mc_status:/cat/models/mc_status:ro
    depends_on:
      - elasticsearch-1
    ports:
      - "5000:5000"
    networks:
      - cognet

  #---------------------------------------------------------------------------#
  # Annotation Service                                                        #
  #---------------------------------------------------------------------------#
  annotation-ingester:
    image: cogstacksystems/annotations-ingester:latest
    shm_size: 128mb
    restart: always
    volumes:
      - ./annotation_ingester/config/config.yml:/app/config/config.yml:ro
    depends_on:
      - elasticsearch-1
      - medcat-service
      - kibana
      - cogstack-pipeline
    command: "/app/run.sh"
    networks:
      - cognet

  #---------------------------------------------------------------------------#
  # Kibana webapp                                                             #
  #---------------------------------------------------------------------------#
  kibana:
    image: docker.elastic.co/kibana/kibana:7.10.1
    # image: amazon/opendistro-for-elasticsearch-kibana:1.12.0
    shm_size: 128mb
    restart: always
    environment:
      SERVER_NAME: kibana.server
      # edit the url to match your server
      ELASTICSEARCH_HOSTS: "http://elasticsearch-1:9200"
      # INFO: uncomment below to enable SSL keys
      SERVER_SSL_ENABLED: "false"
      # SERVER_SSL_KEY: /usr/share/kibana/config/kibana.key
      # SERVER_SSL_CERTIFICATE: /usr/share/kibana/config/kibana.pem
    volumes:
      - ./kibana/config/kibana.yml:/usr/share/kibana/config/kibana.yml:ro
      # NOTE(review): this gives Kibana read-write access to the Elasticsearch
      # data volume - confirm the mount is actually needed.
      - elasticsearch-vol-1:/usr/share/elasticsearch/data
      # INFO: uncomment below to add SSL keys
      # - ../security/root-ca.pem:/usr/share/kibana/config/root-ca.pem:ro
      # - ../security/kibana.pem:/usr/share/kibana/config/kibana.pem:ro
      # - ../security/kibana.key:/usr/share/kibana/config/kibana.key:ro
    depends_on:
      - elasticsearch-1
      - medcat-service
      - cogstack-pipeline
    ports:
      - "5601:5601"
    networks:
      - cognet

#---------------------------------------------------------------------------#
# Docker networks.                                                          #
#---------------------------------------------------------------------------#
networks:
  cognet:
    driver: bridge
    name: cognet

#---------------------------------------------------------------------------#
# Docker named volumes                                                      #
#---------------------------------------------------------------------------#
volumes:
  databank-db-vol:
    driver: local
  cogstack-job-vol:
    driver: local
  elasticsearch-vol-1:
    driver: local

0 commit comments

Comments
 (0)