@@ -71,38 +71,24 @@ Every node has two thread pools that must be properly configured:
7171Ingest nodes handle high-volume data writes and require significant IO thread allocation
7272for line protocol parsing.
7373
74- ### High-throughput ingester (96 cores)
74+ ### Example medium ingester (32 cores)
7575
7676``` bash
77- influxdb3 serve \
78- --mode=ingest \
79- --node-id=ingester-01 \
80- --cluster-id=prod-cluster \
81- --num-cores=96 \
82- --num-io-threads=20 \
83- --num-datafusion-threads=76 \
84- --exec-mem-pool-bytes=70% \
85- --force-snapshot-mem-threshold=85%
86- ```
87-
88- ** Configuration rationale:**
89- - ** 20 IO threads** : Handle multiple concurrent writers (Telegraf agents, applications)
90- - ** 76 DataFusion threads** : Required for data snapshot operations that convert buffered writes to Parquet files
91- - ** 70% memory pool** : Balance between write buffers and data snapshot operations
92- - ** 85% snapshot threshold** : Trigger data snapshots to Parquet files before memory pressure
93-
94- ### Medium ingester (32 cores)
95-
96- ``` bash
97- influxdb3 serve \
98- --mode=ingest \
99- --node-id=ingester-02 \
77+ influxdb3 \
10078 --num-cores=32 \
10179 --num-io-threads=12 \
10280 --num-datafusion-threads=20 \
103- --exec-mem-pool-bytes=60%
81+ --exec-mem-pool-bytes=60% \
82+ serve \
83+ --mode=ingest \
84+ --node-id=ingester-01
10485```
10586
87+ **Configuration rationale:**
88+ - **12 IO threads**: Handle multiple concurrent writers (Telegraf agents, applications)
89+ - **20 DataFusion threads**: Required for data snapshot operations that convert buffered writes to Parquet files
90+ - **60% memory pool**: Balance between write buffers and data snapshot operations
91+
10692### Monitor ingest performance
10793
10894Key metrics for ingest nodes:
@@ -134,17 +120,23 @@ Query nodes execute complex analytical queries and need maximum DataFusion threa
134120
135121### Analytical query node (64 cores)
136122
123+ <!-- DEV-ONLY FLAGS: DO NOT DOCUMENT --datafusion-runtime-type IN PRODUCTION DOCS
124+ This flag will be removed in future versions.
125+ Only multi-thread mode should be used (which is the default).
126+ The current-thread option is deprecated and will be removed.
127+ Future editors: Keep this commented out or remove the flag entirely. -->
128+
137129``` bash
138- influxdb3 serve \
139- --mode=query \
140- --node-id=query-01 \
141- --cluster-id=prod-cluster \
130+ influxdb3 \
142131 --num-cores=64 \
143132 --num-io-threads=4 \
144133 --num-datafusion-threads=60 \
145134 --exec-mem-pool-bytes=90% \
146135 --parquet-mem-cache-size=8GB \
147- --datafusion-runtime-type=multi-thread
136+ serve \
137+ --mode=query \
138+ --node-id=query-01 \
139+ --cluster-id=prod-cluster
148140```
149141
150142** Configuration rationale:**
@@ -156,25 +148,26 @@ influxdb3 serve \
156148### Real-time query node (32 cores)
157149
158150``` bash
159- influxdb3 serve \
160- --mode=query \
161- --node-id=query-02 \
151+ influxdb3 \
162152 --num-cores=32 \
163153 --num-io-threads=6 \
164154 --num-datafusion-threads=26 \
165155 --exec-mem-pool-bytes=80% \
166- --parquet-mem-cache-size=4GB
156+ --parquet-mem-cache-size=4GB \
157+ serve \
158+ --mode=query \
159+ --node-id=query-02
167160```
168161
169162### Optimize query settings
170163
171- Additional DataFusion tuning for query nodes:
164+ You can configure `datafusion` properties for additional tuning of query nodes:
172165
173166``` bash
174- influxdb3 serve \
175- --mode=query \
167+ influxdb3 \
176168 --datafusion-config "datafusion.execution.batch_size:16384,datafusion.execution.target_partitions:60" \
177- --datafusion-runtime-max-blocking-threads=1024
169+ serve \
170+ --mode=query
178171```
179172
180173## Configure compactor nodes
@@ -184,16 +177,17 @@ Compactor nodes optimize stored data through background compaction processes.
184177### Dedicated compactor (32 cores)
185178
186179``` bash
187- influxdb3 serve \
188- --mode=compact \
189- --node-id=compactor-01 \
190- --cluster-id=prod-cluster \
180+ influxdb3 \
191181 --num-cores=32 \
192182 --num-io-threads=2 \
193183 --num-datafusion-threads=30 \
194184 --compaction-row-limit=2000000 \
195185 --compaction-gen2-duration=24h \
196- --compaction-check-interval=5m
186+ --compaction-check-interval=5m \
187+ serve \
188+ --mode=compact \
189+ --node-id=compactor-01 \
190+ --cluster-id=prod-cluster
197191```
198192
199193** Configuration rationale:**
@@ -204,6 +198,8 @@ influxdb3 serve \
204198
205199### Tune compaction parameters
206200
201+ You can adjust compaction strategies to balance performance and resource usage:
202+
207203``` bash
208204# Configure compaction strategy
209205--compaction-multipliers=4,8,16 \
@@ -218,14 +214,15 @@ Process nodes handle data transformations and processing plugins.
218214### Processing node (16 cores)
219215
220216``` bash
221- influxdb3 serve \
222- --mode=process \
223- --node-id=processor-01 \
224- --cluster-id=prod-cluster \
217+ influxdb3 \
225218 --num-cores=16 \
226219 --num-io-threads=4 \
227220 --num-datafusion-threads=12 \
228- --plugin-dir=/path/to/plugins
221+ --plugin-dir=/path/to/plugins \
222+ serve \
223+ --mode=process \
224+ --node-id=processor-01 \
225+ --cluster-id=prod-cluster
229226```
230227
231228## Multi-mode configurations
@@ -235,24 +232,26 @@ Some deployments benefit from nodes handling multiple responsibilities.
235232### Ingest + Query node (48 cores)
236233
237234``` bash
238- influxdb3 serve \
239- --mode=ingest,query \
240- --node-id=hybrid-01 \
235+ influxdb3 \
241236 --num-cores=48 \
242237 --num-io-threads=12 \
243238 --num-datafusion-threads=36 \
244- --exec-mem-pool-bytes=75%
239+ --exec-mem-pool-bytes=75% \
240+ serve \
241+ --mode=ingest,query \
242+ --node-id=hybrid-01
245243```
246244
247245### Query + Compact node (32 cores)
248246
249247``` bash
250- influxdb3 serve \
251- --mode=query,compact \
252- --node-id=qc-01 \
248+ influxdb3 \
253249 --num-cores=32 \
254250 --num-io-threads=4 \
255- --num-datafusion-threads=28
251+ --num-datafusion-threads=28 \
252+ serve \
253+ --mode=query,compact \
254+ --node-id=qc-01
256255```
257256
258257## Cluster architecture examples
@@ -340,28 +339,20 @@ datafusion_threads: 26
340339- **Deploy multiple ingest nodes**: Run several ingest nodes behind a load balancer to distribute write load
341340- **Optimize batch sizes**: Configure clients to send larger batches to reduce per-request overhead
342341
343- ` ` ` bash
344- # Maximum vertical scale ingester (128 cores)
345- influxdb3 serve \
346- --mode=ingest \
347- --num-cores=128 \
348- --num-io-threads=32 \
349- --num-datafusion-threads=96
350- ```
351-
352342### Scale queries horizontally
353343
354344Query nodes can scale horizontally since they all access the same object store:
355345
356346```bash
357347# Add query nodes as needed
358348for i in {1..10}; do
359- influxdb3 serve \
360- --mode=query \
361- --node-id=query-$i \
349+ influxdb3 \
362350 --num-cores=32 \
363351 --num-io-threads=4 \
364- --num-datafusion-threads=28 &
352+ --num-datafusion-threads=28 \
353+ serve \
354+ --mode=query \
355+ --node-id=query-$i &
365356done
366357```
367358
@@ -414,17 +405,26 @@ ORDER BY event_count DESC;
414405### Monitor cluster-wide metrics
415406
416407``` bash
417- # Check node status
418- influxdb3 cluster status
419-
420- # Monitor thread utilization across nodes
421- for node in ingester-01 query-01 compactor-01; do
408+ # Check node health via HTTP endpoints
409+ for node in ingester-01:8181 query-01:8181 compactor-01:8181; do
422410 echo "Node: $node"
423- ssh $node " top -bn1 -H -p \$ (pgrep influxdb3) | head -20"
411+ curl -s "http://$node/health"
412+ done
413+
414+ # Monitor metrics from each node
415+ for node in ingester-01:8181 query-01:8181 compactor-01:8181; do
416+ echo "=== Metrics from $node ==="
417+ curl -s "http://$node/metrics" | grep -E "(cpu_usage|memory_usage|http_requests_total)"
424418done
425419
426- # Aggregate metrics
427- curl -s {{< influxdb/host > }}/metrics | grep -E " (http_requests_total|influxdb_iox_query_log|object_store_op)"
420+ # Query system tables for cluster-wide monitoring
421+ curl -X POST "http://query-01:8181/api/v3/query_sql" \
422+ -H "Content-Type: application/json" \
423+ -H "Authorization: Bearer YOUR_TOKEN" \
424+ -d '{
425+ "q": "SELECT * FROM system.queries WHERE issue_time > now() - INTERVAL '\''5 minutes'\'' ORDER BY issue_time DESC LIMIT 10",
426+ "db": "sensors"
427+ }'
428428```
429429
430430> [!Tip]
@@ -450,8 +450,9 @@ Use the [monitoring queries](#monitor-cluster-wide-metrics) to identify the foll
450450``` sql
451451-- Check for high failed query rate indicating parsing issues
452452SELECT
453- count (* ) as failed_queries,
454- count (* ) filter (WHERE success = true) as successful_queries
453+ count (* ) as total_queries,
454+ sum (CASE WHEN success = true THEN 1 ELSE 0 END) as successful_queries,
455+ sum (CASE WHEN success = false THEN 1 ELSE 0 END) as failed_queries
455456FROM system .queries
456457WHERE issue_time > now() - INTERVAL ' 5 minutes' ;
457458```
@@ -471,7 +472,7 @@ WHERE issue_time > now() - INTERVAL '5 minutes';
471472SELECT
472473 avg (max_memory) as avg_memory_bytes,
473474 max (max_memory) as peak_memory_bytes,
474- count ( * ) filter ( WHERE success = false) as failed_queries
475+ sum (CASE WHEN success = false THEN 1 ELSE 0 END ) as failed_queries
475476FROM system .queries
476477WHERE issue_time > now() - INTERVAL ' 5 minutes'
477478 AND query_type = ' sql' ;
@@ -492,7 +493,7 @@ WHERE issue_time > now() - INTERVAL '5 minutes'
492493SELECT
493494 event_type,
494495 count (* ) as event_count,
495- count ( * ) filter ( WHERE event_status = ' success' ) as successful_events
496+ sum (CASE WHEN event_status = ' success' THEN 1 ELSE 0 END ) as successful_events
496497FROM system .compaction_events
497498WHERE event_time > now() - INTERVAL ' 1 hour'
498499GROUP BY event_type;
@@ -638,7 +639,7 @@ This example demonstrates a complete workflow for diagnosing and resolving inges
638639-- Check current query performance
639640SELECT
640641 count (* ) as total_queries,
641- count ( * ) filter ( WHERE success = false) as failed_queries,
642+ sum (CASE WHEN success = false THEN 1 ELSE 0 END ) as failed_queries,
642643 avg (execute_duration) as avg_duration
643644FROM system .queries
644645WHERE issue_time > now() - INTERVAL ' 10 minutes' ;
@@ -662,14 +663,15 @@ influxdb3 serve --help-all | grep -E "num-io-threads|num-datafusion-threads"
662663
663664``` bash
664665# Restart node with increased IO threads
665- influxdb3 serve \
666- --mode=ingest \
667- --node-id=ingester-01 \
668- --cluster-id=prod \
666+ influxdb3 \
669667 --num-cores=32 \
670668 --num-io-threads=12 \
671669 --num-datafusion-threads=20 \
672- --exec-mem-pool-bytes=70%
670+ --exec-mem-pool-bytes=70% \
671+ serve \
672+ --mode=ingest \
673+ --node-id=ingester-01 \
674+ --cluster-id=prod
673675```
674676
675677### Step 4: Validate improvements
@@ -678,7 +680,7 @@ influxdb3 serve \
678680-- Re-run monitoring query after 10 minutes
679681SELECT
680682 count (* ) as total_queries,
681- count ( * ) filter ( WHERE success = false) as failed_queries,
683+ sum (CASE WHEN success = false THEN 1 ELSE 0 END ) as failed_queries,
682684 avg (execute_duration) as avg_duration
683685FROM system .queries
684686WHERE issue_time > now() - INTERVAL ' 10 minutes' ;
0 commit comments