
Commit daa4147

Add docker compose for storage
1 parent dbaa18d commit daa4147

File tree

2 files changed: +289 −0 lines changed

kafka-docker-compose.yaml
storage-docker-compose.yaml

kafka-docker-compose.yaml

Lines changed: 206 additions & 0 deletions
version: "3.8"
services:

  zookeeper:
    image: confluentinc/cp-zookeeper:7.5.0
    container_name: zookeeper-streaming
    ports:
      - "2181:2181"
    healthcheck:
      test: echo srvr | nc zookeeper 2181 || exit 1
      interval: 10s
      retries: 20
      start_period: 10s
    environment:
      ZOOKEEPER_CLIENT_PORT: 2181
      ZOOKEEPER_TICK_TIME: 2000

  broker:
    image: confluentinc/cp-server:7.5.0
    container_name: broker-streaming
    depends_on:
      zookeeper:
        condition: service_healthy
    ports:
      - "9092:9092"
      - "9101:9101"
    healthcheck:
      test: nc -z localhost 9092 || exit 1
      start_period: 15s
      interval: 5s
      timeout: 10s
      retries: 10
    environment:
      KAFKA_BROKER_ID: 1
      KAFKA_ZOOKEEPER_CONNECT: "zookeeper:2181"
      KAFKA_LISTENER_SECURITY_PROTOCOL_MAP: PLAINTEXT:PLAINTEXT,PLAINTEXT_HOST:PLAINTEXT
      # How clients connect to the broker: other containers use broker:29092,
      # processes on the host use localhost:9092
      KAFKA_ADVERTISED_LISTENERS: PLAINTEXT://broker:29092,PLAINTEXT_HOST://localhost:9092
      KAFKA_OFFSETS_TOPIC_REPLICATION_FACTOR: 1
      # Schema Registry URL for storing and managing Avro schemas
      KAFKA_CONFLUENT_SCHEMA_REGISTRY_URL: http://schema-registry:8081
      # Confluent Metrics Reporter for Control Center cluster monitoring
      KAFKA_METRIC_REPORTERS: io.confluent.metrics.reporter.ConfluentMetricsReporter
      CONFLUENT_METRICS_REPORTER_BOOTSTRAP_SERVERS: "broker:29092"
      CONFLUENT_METRICS_REPORTER_TOPIC_REPLICAS: 1
      CONFLUENT_METRICS_ENABLE: "true"
      # Single-node cluster: lower the replication factors, whose default
      # of 3 would exceed the number of brokers
      KAFKA_CONFLUENT_BALANCER_TOPIC_REPLICATION_FACTOR: 1
      KAFKA_TRANSACTION_STATE_LOG_MIN_ISR: 1
      KAFKA_TRANSACTION_STATE_LOG_REPLICATION_FACTOR: 1

  # For managing Avro schemas
  schema-registry:
    image: confluentinc/cp-schema-registry:7.5.0
    container_name: streaming-schema-registry
    depends_on:
      - broker
    ports:
      - "8081:8081"
    healthcheck:
      start_period: 10s
      interval: 10s
      retries: 20
      test: curl --user superUser:superUser --fail --silent --insecure http://localhost:8081/subjects --output /dev/null || exit 1
    environment:
      SCHEMA_REGISTRY_HOST_NAME: schema-registry
      SCHEMA_REGISTRY_KAFKASTORE_BOOTSTRAP_SERVERS: "broker:29092"
      SCHEMA_REGISTRY_LISTENERS: http://0.0.0.0:8081

  # For connecting to the offline store (connecting Kafka to a database).
  # Disabled for now: Debezium below ships its own Connect worker on the
  # same port 8083, so running both would conflict.
  # connect:
  #   image: confluentinc/cp-kafka-connect:7.5.0
  #   container_name: streaming-connect
  #   depends_on:
  #     broker:
  #       condition: service_healthy
  #     schema-registry:
  #       condition: service_healthy
  #     zookeeper:
  #       condition: service_healthy
  #   ports:
  #     - "8083:8083"
  #   environment:
  #     CONNECT_BOOTSTRAP_SERVERS: "broker:29092"
  #     CONNECT_REST_ADVERTISED_HOST_NAME: connect
  #     CONNECT_REST_PORT: 8083
  #     CONNECT_GROUP_ID: compose-connect-group
  #     CONNECT_CONFIG_STORAGE_TOPIC: docker-connect-configs
  #     CONNECT_CONFIG_STORAGE_REPLICATION_FACTOR: 1
  #     CONNECT_OFFSET_FLUSH_INTERVAL_MS: 10000
  #     CONNECT_OFFSET_STORAGE_TOPIC: docker-connect-offsets
  #     CONNECT_OFFSET_STORAGE_REPLICATION_FACTOR: 1
  #     CONNECT_STATUS_STORAGE_TOPIC: docker-connect-status
  #     CONNECT_STATUS_STORAGE_REPLICATION_FACTOR: 1
  #     CONNECT_KEY_CONVERTER: org.apache.kafka.connect.storage.StringConverter
  #     CONNECT_KEY_CONVERTER_SCHEMAS_ENABLE: "false"
  #     CONNECT_VALUE_CONVERTER: org.apache.kafka.connect.json.JsonConverter
  #     CONNECT_VALUE_CONVERTER_SCHEMAS_ENABLE: "true"
  #     # CONNECT_KEY_CONVERTER: io.confluent.connect.avro.AvroConverter
  #     # CONNECT_VALUE_CONVERTER: io.confluent.connect.avro.AvroConverter
  #     # CONNECT_KEY_CONVERTER_SCHEMA_REGISTRY_URL: http://schema-registry:8081
  #     # CONNECT_VALUE_CONVERTER_SCHEMA_REGISTRY_URL: http://schema-registry:8081
  #     CONNECT_PLUGIN_PATH: "/usr/share/java,/etc/kafka-connect/jars"
  #   volumes:
  #     - $PWD/data_ingestion/kafka_connect/jars/:/etc/kafka-connect/jars

  # Confluent Control Center to manage Kafka
  control-center:
    image: confluentinc/cp-enterprise-control-center:7.5.0
    container_name: streaming-control-center
    depends_on:
      - broker
      - schema-registry
      - debezium
    ports:
      - "9021:9021"
    healthcheck:
      test: ["CMD", "curl", "-f", "http://localhost:9021/healthcheck"] # Adjust the URL and options as needed
      interval: 30s
      timeout: 10s
      retries: 3
    environment:
      CONTROL_CENTER_BOOTSTRAP_SERVERS: "broker:29092"
      # CONTROL_CENTER_CONNECT_CONNECT-DEFAULT_CLUSTER: "connect:8083"
      CONTROL_CENTER_SCHEMA_REGISTRY_URL: "http://schema-registry:8081"
      CONTROL_CENTER_REPLICATION_FACTOR: 1
      CONTROL_CENTER_INTERNAL_TOPICS_PARTITIONS: 1
      CONTROL_CENTER_CONNECT_HEALTHCHECK_ENDPOINT: "/connectors"
      CONFLUENT_METRICS_TOPIC_REPLICATION: 1

  # CDC platform to capture changes in the DB and stream them to Kafka
  debezium:
    image: debezium/connect:1.9
    container_name: streaming-debezium
    depends_on:
      broker:
        condition: service_healthy
      schema-registry:
        condition: service_healthy
      zookeeper:
        condition: service_healthy
    healthcheck:
      test:
        [
          "CMD",
          "curl",
          "--silent",
          "--fail",
          "-X",
          "GET",
          "http://localhost:8083/connectors",
        ]
      start_period: 10s
      interval: 10s
      timeout: 5s
      retries: 5
    ports:
      - "8083:8083"
    environment:
      BOOTSTRAP_SERVERS: broker:29092
      GROUP_ID: 1
      CONFIG_STORAGE_TOPIC: connect_configs
      OFFSET_STORAGE_TOPIC: connect_offsets
      # Switch to Avro for higher performance
      # KEY_CONVERTER: io.confluent.connect.avro.AvroConverter
      # VALUE_CONVERTER: io.confluent.connect.avro.AvroConverter
      KEY_CONVERTER: org.apache.kafka.connect.json.JsonConverter
      VALUE_CONVERTER: org.apache.kafka.connect.json.JsonConverter
      CONNECT_KEY_CONVERTER_SCHEMA_REGISTRY_URL: http://schema-registry:8081
      CONNECT_VALUE_CONVERTER_SCHEMA_REGISTRY_URL: http://schema-registry:8081
      CONNECT_TOPIC_CREATION_ENABLE: "true"

  # Debezium UI
  debezium-ui:
    image: debezium/debezium-ui:latest
    container_name: debezium-ui
    hostname: debezium-ui
    depends_on:
      debezium:
        condition: service_healthy
    ports:
      - "8085:8080"
    environment:
      KAFKA_CONNECT_URIS: http://debezium:8083

  postgresql:
    # Raise wal_level from replica (the default) to logical
    # so that Debezium can capture changes
    image: postgres:latest
    command: ["postgres", "-c", "wal_level=logical"]
    container_name: streaming-postgresql
    healthcheck:
      test: ["CMD", "psql", "-U", "v9", "-c", "SELECT 1"]
      interval: 10s
      timeout: 5s
      retries: 5
    ports:
      - "5432:5432"
    environment:
      - POSTGRES_DB=v9
      - POSTGRES_USER=v9
      - POSTGRES_PASSWORD=v9
    volumes:
      - cdc_postgres_data:/var/lib/postgresql/data

volumes:
  cdc_postgres_data:
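
The compose file brings Debezium up, but change capture only starts once a connector is registered with the Connect worker's REST API on port 8083. Below is a minimal sketch in Python, assuming `requests` is installed on the host; the connector name and `public.example_table` are hypothetical placeholders, while the credentials mirror the `postgresql` service above.

import json

import requests

# Connector definition for the Debezium Connect worker (debezium/connect:1.9).
# Debezium 1.x uses database.server.name as the topic prefix.
connector = {
    "name": "cdc-postgresql",  # hypothetical connector name
    "config": {
        "connector.class": "io.debezium.connector.postgresql.PostgresConnector",
        "database.hostname": "postgresql",  # compose service name, resolvable in-network
        "database.port": "5432",
        "database.user": "v9",
        "database.password": "v9",
        "database.dbname": "v9",
        "database.server.name": "streaming",
        "table.include.list": "public.example_table",  # hypothetical table
        "plugin.name": "pgoutput",  # built into Postgres, needs wal_level=logical
    },
}

resp = requests.post(
    "http://localhost:8083/connectors",
    headers={"Content-Type": "application/json"},
    data=json.dumps(connector),
)
resp.raise_for_status()

# Confirm the connector exists and is RUNNING
print(requests.get("http://localhost:8083/connectors").json())
print(requests.get("http://localhost:8083/connectors/cdc-postgresql/status").json())

Once the status reports RUNNING, inserts and updates on the captured table appear on the streaming.public.example_table topic, which you can also watch from Control Center on port 9021.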

storage-docker-compose.yaml

Lines changed: 83 additions & 0 deletions
version: "3.7"
services:
  # We use PostgreSQL to store the Hive metastore's metadata about
  # how the data files are mapped to schemas and tables
  metastore_db:
    container_name: metastoredb-datalake
    image: postgres:11
    hostname: metastore_db
    ports:
      - "5433:5432" # 5433 on the host to avoid clashing with the streaming Postgres
    environment:
      POSTGRES_USER: hive
      POSTGRES_PASSWORD: hive
      POSTGRES_DB: metastore

  # Exposes the metadata service: a repository of metadata about the tables,
  # such as database names, table names, schemas, and the data location of each table
  hive-metastore:
    container_name: datalake-hive-metastore
    image: "starburstdata/hive:3.1.2-e.18"
    hostname: hive-metastore
    ports:
      - "9083:9083" # Accessed via the Thrift protocol
    environment:
      HIVE_METASTORE_DRIVER: org.postgresql.Driver
      HIVE_METASTORE_JDBC_URL: jdbc:postgresql://metastore_db:5432/metastore
      HIVE_METASTORE_USER: hive
      HIVE_METASTORE_PASSWORD: hive
      HIVE_METASTORE_WAREHOUSE_DIR: s3://datalake/ # default warehouse location, S3 instead of HDFS
      HIVE_METASTORE_USERS_IN_ADMIN_ROLE: "admin" # not needed for this setup
      S3_ENDPOINT: http://minio:9000
      S3_ACCESS_KEY: minio_access_key
      S3_SECRET_KEY: minio_secret_key
      S3_PATH_STYLE_ACCESS: "true"
      # We don't use the cloud backends below, but the image
      # requires these variables to be set
      REGION: ""
      GOOGLE_CLOUD_KEY_FILE_PATH: ""
      AZURE_ADL_CLIENT_ID: ""
      AZURE_ADL_CREDENTIAL: ""
      AZURE_ADL_REFRESH_URL: ""
      AZURE_ABFS_STORAGE_ACCOUNT: ""
      AZURE_ABFS_ACCESS_KEY: ""
      AZURE_WASB_STORAGE_ACCOUNT: ""
      AZURE_ABFS_OAUTH: ""
      AZURE_ABFS_OAUTH_TOKEN_PROVIDER: ""
      AZURE_ABFS_OAUTH_CLIENT_ID: ""
      AZURE_ABFS_OAUTH_SECRET: ""
      AZURE_ABFS_OAUTH_ENDPOINT: ""
      AZURE_WASB_ACCESS_KEY: ""
    depends_on:
      - metastore_db

  trino:
    ports:
      - "8084:8080"
    container_name: trinodb-datalake
    image: "trinodb/trino:410"
    hostname: trino
    volumes:
      - ./trino/etc:/usr/lib/trino/etc:ro
      - ./trino/catalog:/etc/trino/catalog
    depends_on:
      - hive-metastore

  minio:
    image: minio/minio
    container_name: minio-datalake
    hostname: minio
    ports:
      - "9000:9000"
      - "9001:9001"
    volumes:
      - minio_storage:/data
    environment:
      MINIO_ACCESS_KEY: minio_access_key
      MINIO_SECRET_KEY: minio_secret_key
    command: server --console-address ":9001" /data

volumes:
  minio_storage:
  data:
    driver: local
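
Nothing in the compose file creates the datalake bucket that HIVE_METASTORE_WAREHOUSE_DIR points at, so a one-time bootstrap is needed. Below is a minimal sketch in Python, assuming the `minio` and `trino` client packages are installed on the host and that a Hive catalog named `hive` is defined under ./trino/catalog; the schema name and user are illustrative.

import trino
from minio import Minio

# Credentials and ports mirror the compose environment above
client = Minio(
    "localhost:9000",
    access_key="minio_access_key",
    secret_key="minio_secret_key",
    secure=False,
)
if not client.bucket_exists("datalake"):
    client.make_bucket("datalake")

# Trino's HTTP port 8080 is published on host port 8084
conn = trino.dbapi.connect(host="localhost", port=8084, user="admin", catalog="hive")
cur = conn.cursor()
cur.execute(
    "CREATE SCHEMA IF NOT EXISTS hive.datalake WITH (location = 's3a://datalake/')"
)
cur.fetchall()  # rows are fetched lazily; fetching forces the DDL to finish
cur.execute("SHOW SCHEMAS FROM hive")
print(cur.fetchall())

From there, CREATE TABLE statements against the hive catalog store their data files in MinIO and their table metadata in the Postgres-backed metastore.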
