Skip to content

Commit a692ea5

Browse files
authored
Abstract over kinds of ClickHouse deployments in tests (#6593)
- Add the `ClickHouseDeployment` enum, which manages an entire ClickHouse deployment in test code, either a single-node for most tests, or a cluster where relevant. For the cluster variant, this adds a way to wait for the first child, or for all children, to be shut down. This fixes a bug in the logic for managing child processes, where failures of one of the processes could make zombies out of all the others. This also collects the nodes into arrays, so we can resize the cluster easily if we want, which fixes #4460.
- Use the new enum in the `ControlPlaneTestContext` for all Nexus integration tests.
- Rework the `ch-dev` binary to use the new enum, and also print much more verbose information about what it's doing when starting ClickHouse. This fixes #3011.
1 parent 1b43a0a commit a692ea5

File tree

10 files changed

+794
-941
lines changed

10 files changed

+794
-941
lines changed

Cargo.lock

Lines changed: 1 addition & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

dev-tools/ch-dev/src/main.rs

Lines changed: 89 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -76,13 +76,21 @@ async fn start_single_node(
7676
let mut signal_stream = signals.fuse();
7777

7878
// Start the database server process, possibly on a specific port
79-
let mut db_instance =
80-
dev::clickhouse::ClickHouseInstance::new_single_node(logctx, port)
79+
let mut deployment =
80+
dev::clickhouse::ClickHouseDeployment::new_single_node(logctx, port)
8181
.await?;
82+
let db_instance = deployment
83+
.instances()
84+
.next()
85+
.expect("Should have launched a ClickHouse instance");
8286
println!(
8387
"ch-dev: running ClickHouse with full command:\n\"clickhouse {}\"",
8488
db_instance.cmdline().join(" ")
8589
);
90+
println!("ch-dev: ClickHouse environment:");
91+
for (k, v) in db_instance.environment() {
92+
println!("\t{k}={v}");
93+
}
8694
println!(
8795
"ch-dev: ClickHouse is running with PID {}",
8896
db_instance
@@ -94,14 +102,14 @@ async fn start_single_node(
94102
db_instance.port()
95103
);
96104
println!(
97-
"ch-dev: using {} for ClickHouse data storage",
105+
"ch-dev: ClickHouse data stored in: [{}]",
98106
db_instance.data_path()
99107
);
100108

101109
// Wait for the DB to exit itself (an error), or for SIGINT
102110
tokio::select! {
103-
_ = db_instance.wait_for_shutdown() => {
104-
db_instance.cleanup().await.context("clean up after shutdown")?;
111+
_ = deployment.wait_for_shutdown() => {
112+
deployment.cleanup().await.context("clean up after shutdown")?;
105113
bail!("ch-dev: ClickHouse shutdown unexpectedly");
106114
}
107115
caught_signal = signal_stream.next() => {
@@ -115,7 +123,7 @@ async fn start_single_node(
115123
);
116124

117125
// Remove the data directory.
118-
db_instance
126+
deployment
119127
.wait_for_shutdown()
120128
.await
121129
.context("clean up after SIGINT shutdown")?;
@@ -135,12 +143,16 @@ async fn start_replicated_cluster(
135143
let manifest_dir = PathBuf::from(env!("CARGO_MANIFEST_DIR"));
136144
let replica_config = manifest_dir
137145
.as_path()
138-
.join("../../oximeter/db/src/configs/replica_config.xml");
146+
.join("../../oximeter/db/src/configs/replica_config.xml")
147+
.canonicalize()
148+
.context("Failed to canonicalize replica config path")?;
139149
let keeper_config = manifest_dir
140150
.as_path()
141-
.join("../../oximeter/db/src/configs/keeper_config.xml");
151+
.join("../../oximeter/db/src/configs/keeper_config.xml")
152+
.canonicalize()
153+
.context("Failed to canonicalize keeper config path")?;
142154

143-
let mut cluster = dev::clickhouse::ClickHouseCluster::new(
155+
let mut cluster = dev::clickhouse::ClickHouseDeployment::new_cluster(
144156
logctx,
145157
replica_config,
146158
keeper_config,
@@ -149,83 +161,86 @@ async fn start_replicated_cluster(
149161
println!(
150162
"ch-dev: running ClickHouse cluster with configuration files:\n \
151163
replicas: {}\n keepers: {}",
152-
cluster.replica_config_path().display(),
153-
cluster.keeper_config_path().display()
154-
);
155-
let pid_error_msg = "Failed to get process PID, it may not have started";
156-
println!(
157-
"ch-dev: ClickHouse cluster is running with: server PIDs = [{}, {}] \
158-
and keeper PIDs = [{}, {}, {}]",
159-
cluster.replica_1.pid().expect(pid_error_msg),
160-
cluster.replica_2.pid().expect(pid_error_msg),
161-
cluster.keeper_1.pid().expect(pid_error_msg),
162-
cluster.keeper_2.pid().expect(pid_error_msg),
163-
cluster.keeper_3.pid().expect(pid_error_msg),
164-
);
165-
println!(
166-
"ch-dev: ClickHouse HTTP servers listening on ports: {}, {}",
167-
cluster.replica_1.port(),
168-
cluster.replica_2.port()
169-
);
170-
println!(
171-
"ch-dev: using {} and {} for ClickHouse data storage",
172-
cluster.replica_1.data_path(),
173-
cluster.replica_2.data_path()
164+
cluster.replica_config_path().unwrap().display(),
165+
cluster.keeper_config_path().unwrap().display()
174166
);
167+
for instance in cluster.instances() {
168+
println!(
169+
"ch-dev: running ClickHouse replica with full command:\
170+
\n\"clickhouse {}\"",
171+
instance.cmdline().join(" ")
172+
);
173+
println!("ch-dev: ClickHouse replica environment:");
174+
for (k, v) in instance.environment() {
175+
println!("\t{k}={v}");
176+
}
177+
println!(
178+
"ch-dev: ClickHouse replica PID is {}",
179+
instance.pid().context("Failed to get instance PID")?
180+
);
181+
println!(
182+
"ch-dev: ClickHouse replica data path is {}",
183+
instance.data_path(),
184+
);
185+
println!(
186+
"ch-dev: ClickHouse replica HTTP server is listening on port {}",
187+
instance.address.port(),
188+
);
189+
}
190+
for keeper in cluster.keepers() {
191+
println!(
192+
"ch-dev: running ClickHouse Keeper with full command:\
193+
\n\"clickhouse {}\"",
194+
keeper.cmdline().join(" ")
195+
);
196+
println!("ch-dev: ClickHouse Keeper environment:");
197+
for (k, v) in keeper.environment() {
198+
println!("\t{k}={v}");
199+
}
200+
println!(
201+
"ch-dev: ClickHouse Keeper PID is {}",
202+
keeper.pid().context("Failed to get Keeper PID")?
203+
);
204+
println!(
205+
"ch-dev: ClickHouse Keeper data path is {}",
206+
keeper.data_path(),
207+
);
208+
println!(
209+
"ch-dev: ClickHouse Keeper HTTP server is listening on port {}",
210+
keeper.address.port(),
211+
);
212+
}
175213

176214
// Wait for the replicas and keepers to exit themselves (an error), or for SIGINT
177215
tokio::select! {
178-
_ = cluster.replica_1.wait_for_shutdown() => {
179-
cluster.replica_1.cleanup().await.context(
180-
format!("clean up {} after shutdown", cluster.replica_1.data_path())
181-
)?;
182-
bail!("ch-dev: ClickHouse replica 1 shutdown unexpectedly");
183-
}
184-
_ = cluster.replica_2.wait_for_shutdown() => {
185-
cluster.replica_2.cleanup().await.context(
186-
format!("clean up {} after shutdown", cluster.replica_2.data_path())
187-
)?;
188-
bail!("ch-dev: ClickHouse replica 2 shutdown unexpectedly");
189-
}
190-
_ = cluster.keeper_1.wait_for_shutdown() => {
191-
cluster.keeper_1.cleanup().await.context(
192-
format!("clean up {} after shutdown", cluster.keeper_1.data_path())
193-
)?;
194-
bail!("ch-dev: ClickHouse keeper 1 shutdown unexpectedly");
195-
}
196-
_ = cluster.keeper_2.wait_for_shutdown() => {
197-
cluster.keeper_2.cleanup().await.context(
198-
format!("clean up {} after shutdown", cluster.keeper_2.data_path())
199-
)?;
200-
bail!("ch-dev: ClickHouse keeper 2 shutdown unexpectedly");
201-
}
202-
_ = cluster.keeper_3.wait_for_shutdown() => {
203-
cluster.keeper_3.cleanup().await.context(
204-
format!("clean up {} after shutdown", cluster.keeper_3.data_path())
205-
)?;
206-
bail!("ch-dev: ClickHouse keeper 3 shutdown unexpectedly");
216+
res = cluster.wait_for_shutdown() => {
217+
cluster.cleanup().await.context("cleaning up after shutdown")?;
218+
match res {
219+
Ok(node) => {
220+
bail!(
221+
"ch-dev: ClickHouse cluster {:?} node {} shutdown unexpectedly",
222+
node.kind,
223+
node.index,
224+
);
225+
}
226+
Err(e) => {
227+
bail!(
228+
"ch-dev: Failed to wait for cluster node: {}",
229+
e,
230+
);
231+
}
232+
}
207233
}
208234
caught_signal = signal_stream.next() => {
209235
assert_eq!(caught_signal.unwrap(), SIGINT);
210236
eprintln!(
211237
"ch-dev: caught signal, shutting down and removing \
212238
temporary directories"
213239
);
214-
215-
// Remove the data directories.
216-
let mut instances = vec![
217-
cluster.replica_1,
218-
cluster.replica_2,
219-
cluster.keeper_1,
220-
cluster.keeper_2,
221-
cluster.keeper_3,
222-
];
223-
for instance in instances.iter_mut() {
224-
instance
225-
.wait_for_shutdown()
240+
cluster
241+
.cleanup()
226242
.await
227-
.context(format!("clean up {} after SIGINT shutdown", instance.data_path()))?;
228-
};
243+
.context("clean up after SIGINT shutdown")?;
229244
}
230245
}
231246
Ok(())

dev-tools/omdb/tests/test_all_output.rs

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -130,7 +130,7 @@ async fn test_omdb_success_cases(cptestctx: &ControlPlaneTestContext) {
130130
let mgs_url = format!("http://{}/", gwtestctx.client.bind_address);
131131
let ox_url = format!("http://{}/", cptestctx.oximeter.server_address());
132132
let ox_test_producer = cptestctx.producer.address().ip();
133-
let ch_url = format!("http://{}/", cptestctx.clickhouse.address);
133+
let ch_url = format!("http://{}/", cptestctx.clickhouse.http_address());
134134

135135
let tmpdir = camino_tempfile::tempdir()
136136
.expect("failed to create temporary directory");
@@ -308,7 +308,7 @@ async fn test_omdb_env_settings(cptestctx: &ControlPlaneTestContext) {
308308
format!("http://{}", cptestctx.internal_client.bind_address);
309309
let ox_url = format!("http://{}/", cptestctx.oximeter.server_address());
310310
let ox_test_producer = cptestctx.producer.address().ip();
311-
let ch_url = format!("http://{}/", cptestctx.clickhouse.address);
311+
let ch_url = format!("http://{}/", cptestctx.clickhouse.http_address());
312312
let dns_sockaddr = cptestctx.internal_dns.dns_server.local_address();
313313
let mut output = String::new();
314314

nexus/benches/setup_benchmark.rs

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,7 +31,7 @@ async fn do_clickhouse_setup() {
3131
let cfg = nexus_test_utils::load_test_config();
3232
let logctx = LogContext::new("clickhouse_setup", &cfg.pkg.log);
3333
let mut clickhouse =
34-
dev::clickhouse::ClickHouseInstance::new_single_node(&logctx, 0)
34+
dev::clickhouse::ClickHouseDeployment::new_single_node(&logctx, 0)
3535
.await
3636
.unwrap();
3737
clickhouse.cleanup().await.unwrap();

nexus/test-utils/src/lib.rs

Lines changed: 11 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -112,7 +112,7 @@ pub struct ControlPlaneTestContext<N> {
112112
pub internal_client: ClientTestContext,
113113
pub server: N,
114114
pub database: dev::db::CockroachInstance,
115-
pub clickhouse: dev::clickhouse::ClickHouseInstance,
115+
pub clickhouse: dev::clickhouse::ClickHouseDeployment,
116116
pub logctx: LogContext,
117117
pub sled_agent_storage: camino_tempfile::Utf8TempDir,
118118
pub sled_agent: sim::Server,
@@ -275,7 +275,7 @@ pub struct ControlPlaneTestContextBuilder<'a, N: NexusServer> {
275275

276276
pub server: Option<N>,
277277
pub database: Option<dev::db::CockroachInstance>,
278-
pub clickhouse: Option<dev::clickhouse::ClickHouseInstance>,
278+
pub clickhouse: Option<dev::clickhouse::ClickHouseDeployment>,
279279
pub sled_agent_storage: Option<camino_tempfile::Utf8TempDir>,
280280
pub sled_agent: Option<sim::Server>,
281281
pub sled_agent2_storage: Option<camino_tempfile::Utf8TempDir>,
@@ -447,13 +447,14 @@ impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> {
447447
pub async fn start_clickhouse(&mut self) {
448448
let log = &self.logctx.log;
449449
debug!(log, "Starting Clickhouse");
450-
let clickhouse = dev::clickhouse::ClickHouseInstance::new_single_node(
451-
&self.logctx,
452-
0,
453-
)
454-
.await
455-
.unwrap();
456-
let port = clickhouse.port();
450+
let clickhouse =
451+
dev::clickhouse::ClickHouseDeployment::new_single_node(
452+
&self.logctx,
453+
0,
454+
)
455+
.await
456+
.unwrap();
457+
let port = clickhouse.http_address().port();
457458

458459
let zpool_id = ZpoolUuid::new_v4();
459460
let dataset_id = Uuid::new_v4();
@@ -594,7 +595,7 @@ impl<'a, N: NexusServer> ControlPlaneTestContextBuilder<'a, N> {
594595
let oximeter = start_oximeter(
595596
log.new(o!("component" => "oximeter")),
596597
nexus_internal_addr,
597-
clickhouse.port(),
598+
clickhouse.http_address().port(),
598599
collector_id,
599600
)
600601
.await

nexus/tests/integration_tests/oximeter.rs

Lines changed: 3 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@ use nexus_test_interface::NexusServer;
99
use nexus_test_utils_macros::nexus_test;
1010
use omicron_test_utils::dev::poll::{wait_for_condition, CondCheckError};
1111
use oximeter_db::DbWrite;
12-
use std::net;
1312
use std::time::Duration;
1413
use uuid::Uuid;
1514

@@ -118,14 +117,8 @@ async fn test_oximeter_reregistration() {
118117
row.get::<&str, chrono::DateTime<chrono::Utc>>("time_modified");
119118

120119
// ClickHouse client for verifying collection.
121-
let ch_address = net::SocketAddrV6::new(
122-
"::1".parse().unwrap(),
123-
context.clickhouse.port(),
124-
0,
125-
0,
126-
);
127-
let client =
128-
oximeter_db::Client::new(ch_address.into(), &context.logctx.log);
120+
let ch_address = context.clickhouse.http_address().into();
121+
let client = oximeter_db::Client::new(ch_address, &context.logctx.log);
129122
client
130123
.init_single_node_db()
131124
.await
@@ -308,7 +301,7 @@ async fn test_oximeter_reregistration() {
308301
context.oximeter = nexus_test_utils::start_oximeter(
309302
context.logctx.log.new(o!("component" => "oximeter")),
310303
context.server.get_http_server_internal_address().await,
311-
context.clickhouse.port(),
304+
context.clickhouse.http_address().port(),
312305
oximeter_id,
313306
)
314307
.await

0 commit comments

Comments
 (0)