diff --git a/components/metrics/src/bin/mock_worker.rs b/components/metrics/src/bin/mock_worker.rs index 6278de73ce..73f358dbec 100644 --- a/components/metrics/src/bin/mock_worker.rs +++ b/components/metrics/src/bin/mock_worker.rs @@ -17,12 +17,12 @@ use dynamo_llm::kv_router::{ protocols::ForwardPassMetrics, scheduler::KVHitRateEvent, KV_HIT_RATE_SUBJECT, }; use dynamo_runtime::{ - component::{service::EndpointStats, Namespace}, logging, pipeline::{ async_trait, network::Ingress, AsyncEngine, AsyncEngineContextProvider, Error, ManyOut, ResponseStream, SingleIn, }, + entity::{EntityChain, service::EndpointStats, Namespace}, protocols::annotated::Annotated, stream, traits::events::EventPublisher, @@ -136,8 +136,8 @@ async fn backend(runtime: DistributedRuntime) -> Result<()> { .service_builder() .create() .await?; - let endpoint = component.endpoint("my_endpoint"); - tracing::info!("Starting Mock Worker on Endpoint: {}", endpoint.path()); + let endpoint = component.endpoint("my_endpoint")?; + tracing::info!("Starting Mock Worker on Endpoint: {}", endpoint); // Spawn background task for publishing KV hit rate events let namespace_clone = namespace.clone(); diff --git a/components/metrics/src/lib.rs b/components/metrics/src/lib.rs index b928938490..e59849d476 100644 --- a/components/metrics/src/lib.rs +++ b/components/metrics/src/lib.rs @@ -88,7 +88,7 @@ use dynamo_llm::kv_router::scheduler::Endpoint; use dynamo_llm::kv_router::scoring::ProcessedEndpoints; use dynamo_runtime::{ - distributed::Component, error, service::EndpointInfo, utils::Duration, Result, + error, service::EndpointInfo, utils::Duration, Result, entity::Component }; /// Configuration for metrics collection mode diff --git a/components/metrics/src/main.rs b/components/metrics/src/main.rs index fa8186d07a..fbfe07771f 100644 --- a/components/metrics/src/main.rs +++ b/components/metrics/src/main.rs @@ -30,6 +30,7 @@ use clap::Parser; use dynamo_llm::kv_router::scheduler::KVHitRateEvent; use 
dynamo_llm::kv_router::KV_HIT_RATE_SUBJECT; use dynamo_runtime::{ + entity::EntityChain, error, logging, traits::events::{EventPublisher, EventSubscriber}, utils::{Duration, Instant}, @@ -119,11 +120,11 @@ async fn app(runtime: Runtime) -> Result<()> { let drt = DistributedRuntime::from_settings(runtime.clone()).await?; - let namespace = drt.namespace(args.namespace)?; + let namespace = drt.namespace(&args.namespace)?; let component = namespace.component("count")?; // Create unique instance of Count - let key = format!("{}/instance", component.etcd_root()); + let key = format!("{}/instance", component); tracing::debug!("Creating unique instance of Count at {key}"); drt.etcd_client() .expect("Unreachable because of DistributedRuntime::from_settings above") @@ -132,11 +133,11 @@ async fn app(runtime: Runtime) -> Result<()> { .context("Unable to create unique instance of Count; possibly one already exists")?; let target_component = namespace.component(&config.component_name)?; - let target_endpoint = target_component.endpoint(&config.endpoint_name); + let target_endpoint = target_component.endpoint(&config.endpoint_name)?; - let service_path = target_endpoint.path(); - let service_subject = target_endpoint.subject(); - tracing::info!("Scraping endpoint {service_path} for stats"); + // let service_path = target_endpoint.path(); + let service_subject = target_endpoint.to_descriptor().identifier().slug().to_string(); + tracing::info!("Scraping endpoint {service_subject} for stats"); // Safety: DistributedRuntime::from_settings ensures this is Some let token = drt.primary_lease().unwrap().child_token(); @@ -224,14 +225,14 @@ async fn app(runtime: Runtime) -> Result<()> { let endpoints = collect_endpoints(&target_component, &service_subject, scrape_timeout).await?; if endpoints.is_empty() { - tracing::warn!("No endpoints found matching {service_path}"); + tracing::warn!("No endpoints found matching {service_subject}"); continue; } let metrics = extract_metrics(&endpoints); 
let processed = postprocess_metrics(&metrics, &endpoints); if processed.endpoints.is_empty() { - tracing::warn!("No metrics found matching {service_path}"); + tracing::warn!("No metrics found matching {service_subject}"); } else { tracing::info!("Aggregated metrics: {processed:?}"); } diff --git a/components/router/src/main.rs b/components/router/src/main.rs index 3546a9bb30..6d5dbbfee0 100644 --- a/components/router/src/main.rs +++ b/components/router/src/main.rs @@ -32,6 +32,7 @@ use dynamo_llm::kv_router::{ }; use dynamo_runtime::{ logging, pipeline::network::Ingress, DistributedRuntime, Result, Runtime, Worker, + entity::EntityChain }; #[derive(Parser)] @@ -73,7 +74,7 @@ async fn app(runtime: Runtime) -> Result<()> { .service_builder() .create() .await? - .endpoint("generate") + .endpoint("generate")? .endpoint_builder() .handler(router) .start() diff --git a/launch/dynamo-run/src/input/endpoint.rs b/launch/dynamo-run/src/input/endpoint.rs index 582f313ae5..df17ffc367 100644 --- a/launch/dynamo-run/src/input/endpoint.rs +++ b/launch/dynamo-run/src/input/endpoint.rs @@ -31,7 +31,7 @@ use dynamo_runtime::engine::AsyncEngineStream; use dynamo_runtime::pipeline::{ network::Ingress, Context, ManyOut, Operator, SegmentSource, ServiceBackend, SingleIn, Source, }; -use dynamo_runtime::{protocols::Endpoint as EndpointId, DistributedRuntime}; +use dynamo_runtime::{protocols::Endpoint as EndpointId, DistributedRuntime, entity::EntityChain}; use crate::EngineConfig; @@ -50,7 +50,7 @@ pub async fn run( .service_builder() .create() .await? 
- .endpoint(&endpoint_id.name); + .endpoint(&endpoint_id.name)?; let (rt_fut, card): (Pin + Send + 'static>>, _) = match engine_config { diff --git a/launch/llmctl/src/main.rs b/launch/llmctl/src/main.rs index f004c001ca..fbde693793 100644 --- a/launch/llmctl/src/main.rs +++ b/launch/llmctl/src/main.rs @@ -6,9 +6,9 @@ use std::sync::Arc; use clap::{Parser, Subcommand}; use dynamo_llm::discovery::{ModelManager, ModelWatcher}; -use dynamo_llm::local_model::{LocalModel, ModelNetworkName}; +use dynamo_llm::local_model::LocalModel; use dynamo_llm::model_type::ModelType; -use dynamo_runtime::component::Endpoint; +use dynamo_runtime::entity::{EntityChain, Endpoint}; use dynamo_runtime::pipeline::RouterMode; use dynamo_runtime::{ distributed::DistributedConfig, logging, DistributedRuntime, Result, Runtime, Worker, @@ -278,9 +278,9 @@ async fn list_models( models.push(ModelRow { model_type: entry.model_type.as_str().to_string(), name: entry.name, - namespace: entry.endpoint.namespace, - component: entry.endpoint.component, - endpoint: entry.endpoint.name, + namespace: entry.instance.identifier().namespace_name().to_string(), + component: entry.instance.identifier().component_name().unwrap().to_string(), // safe because instance has component_name + endpoint: entry.instance.identifier().endpoint_name().unwrap().to_string(), // safe because instance has endpoint_name }); } @@ -324,10 +324,10 @@ async fn remove_model( .into_iter() .filter(|entry| entry.model_type == model_type) { - let network_name = ModelNetworkName::from_entry(&entry, 0); - tracing::debug!("deleting key: {network_name}"); + let instance_name = entry.instance.to_string(); + tracing::debug!("deleting key: {instance_name}"); etcd_client - .kv_delete(network_name.to_string(), None) + .kv_delete(instance_name, None) .await?; } @@ -353,7 +353,7 @@ fn endpoint_from_name( let component = distributed .namespace(namespace)? 
- .component(component_name)?; + .component(&component_name)?; - Ok(component.endpoint(endpoint_name)) + Ok(component.endpoint(&endpoint_name)?) } diff --git a/lib/bindings/c/src/lib.rs b/lib/bindings/c/src/lib.rs index 1c50f4aa8e..d112ae51d6 100644 --- a/lib/bindings/c/src/lib.rs +++ b/lib/bindings/c/src/lib.rs @@ -22,6 +22,7 @@ use std::sync::atomic::{AtomicU32, Ordering}; use dynamo_llm::kv_router::{ indexer::compute_block_hash_for_seq, protocols::*, publisher::KvEventPublisher, }; +use dynamo_runtime::entity::EntityChain; use dynamo_runtime::{DistributedRuntime, Worker}; static WK: OnceCell = OnceCell::new(); static DRT: AsyncOnceCell = AsyncOnceCell::new(); @@ -147,7 +148,7 @@ fn dynamo_create_kv_publisher( .ok_or(anyhow::Error::msg("Could not get Distributed Runtime")) { Ok(drt) => { - let backend = drt.namespace(namespace)?.component(component)?; + let backend = drt.namespace(&namespace)?.component(&component)?; KvEventPublisher::new(backend, worker_id, kv_block_size, None) } Err(e) => Err(e), diff --git a/lib/bindings/python/rust/lib.rs b/lib/bindings/python/rust/lib.rs index 39cc1ea46e..e1c81bfe37 100644 --- a/lib/bindings/python/rust/lib.rs +++ b/lib/bindings/python/rust/lib.rs @@ -17,6 +17,7 @@ use dynamo_runtime::{ pipeline::{EngineStream, ManyOut, SingleIn}, protocols::annotated::Annotated as RsAnnotated, traits::DistributedRuntimeProvider, + entity::EntityChain }; use dynamo_llm::{self as llm_rs}; @@ -164,21 +165,21 @@ struct CancellationToken { #[pyclass] #[derive(Clone)] struct Namespace { - inner: rs::component::Namespace, + inner: rs::entity::Namespace, event_loop: PyObject, } #[pyclass] #[derive(Clone)] struct Component { - inner: rs::component::Component, + inner: rs::entity::Component, event_loop: PyObject, } #[pyclass] #[derive(Clone)] struct Endpoint { - inner: rs::component::Endpoint, + inner: rs::entity::Endpoint, event_loop: PyObject, } @@ -230,7 +231,7 @@ impl DistributedRuntime { fn namespace(&self, name: String) -> PyResult { 
Ok(Namespace { - inner: self.inner.namespace(name).map_err(to_pyerr)?, + inner: self.inner.namespace(&name).map_err(to_pyerr)?, event_loop: self.event_loop.clone(), }) } @@ -416,7 +417,7 @@ impl CancellationToken { #[pymethods] impl Component { fn endpoint(&self, name: String) -> PyResult { - let inner = self.inner.endpoint(name); + let inner = self.inner.endpoint(&name).map_err(to_pyerr)?; Ok(Endpoint { inner, event_loop: self.event_loop.clone(), @@ -481,7 +482,7 @@ impl Endpoint { #[pymethods] impl Namespace { fn component(&self, name: String) -> PyResult { - let inner = self.inner.component(name).map_err(to_pyerr)?; + let inner = self.inner.component(&name).map_err(to_pyerr)?; Ok(Component { inner, event_loop: self.event_loop.clone(), @@ -598,11 +599,13 @@ impl Client { /// Replaces wait_for_endpoints. fn wait_for_instances<'p>(&self, py: Python<'p>) -> PyResult> { let inner = self.router.client.clone(); + + // Safety: We don't expose static, so instance_id will exist pyo3_async_runtimes::tokio::future_into_py(py, async move { inner .wait_for_instances() .await - .map(|v| v.into_iter().map(|cei| cei.id()).collect::>()) + .map(|v| v.into_iter().map(|cei| cei.instance_id().unwrap()).collect::>()) .map_err(to_pyerr) }) } diff --git a/lib/llm/src/discovery/model_entry.rs b/lib/llm/src/discovery/model_entry.rs index 8801598026..678680a8ab 100644 --- a/lib/llm/src/discovery/model_entry.rs +++ b/lib/llm/src/discovery/model_entry.rs @@ -5,9 +5,9 @@ use std::sync::Arc; use dynamo_runtime::transports::etcd; use dynamo_runtime::{ - protocols, slug::Slug, storage::key_value_store::{EtcdStorage, KeyValueStore, KeyValueStoreManager}, + descriptor::Instance }; use serde::{Deserialize, Serialize}; @@ -24,7 +24,7 @@ pub struct ModelEntry { pub name: String, /// How to address this on the network - pub endpoint: protocols::Endpoint, + pub instance: Instance, /// Specifies whether the model is a chat, completions, etc model. 
pub model_type: ModelType, diff --git a/lib/llm/src/discovery/model_manager.rs b/lib/llm/src/discovery/model_manager.rs index 0341026d63..a49c7d08f6 100644 --- a/lib/llm/src/discovery/model_manager.rs +++ b/lib/llm/src/discovery/model_manager.rs @@ -1,7 +1,7 @@ // SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. // SPDX-License-Identifier: Apache-2.0 -use dynamo_runtime::component::Component; +use dynamo_runtime::entity::Component; use crate::discovery::ModelEntry; diff --git a/lib/llm/src/discovery/watcher.rs b/lib/llm/src/discovery/watcher.rs index fa2063aa40..840d86822f 100644 --- a/lib/llm/src/discovery/watcher.rs +++ b/lib/llm/src/discovery/watcher.rs @@ -15,6 +15,7 @@ use dynamo_runtime::{ transports::etcd::{KeyValue, WatchEvent}, DistributedRuntime, }; +use dynamo_runtime::entity::{EntityChain, ToEntity}; use crate::{ backend::Backend, @@ -160,12 +161,15 @@ impl ModelWatcher { // Handles a PUT event from etcd, this usually means adding a new model to the list of served // models. async fn handle_put(&self, model_entry: &ModelEntry) -> anyhow::Result<()> { - let endpoint_id = model_entry.endpoint.clone(); - let component = self - .drt - .namespace(&endpoint_id.namespace)? - .component(&endpoint_id.component)?; - let client = component.endpoint(&endpoint_id.name).client().await?; + let instance = model_entry.instance.clone(); + let endpoint = instance.to_entity(self.drt.clone())?; + let component = endpoint.component(); + let client = endpoint.client().await?; + // let component = self + // .drt + // .namespace(&endpoint_id.namespace)? 
+ // .component(&endpoint_id.component)?; + // let client = component.endpoint(&endpoint_id.name).client().await?; let Some(etcd_client) = self.drt.etcd_client() else { // Should be impossible because we only get here on an etcd event diff --git a/lib/llm/src/http/service/health.rs b/lib/llm/src/http/service/health.rs index 84ed629056..649506a488 100644 --- a/lib/llm/src/http/service/health.rs +++ b/lib/llm/src/http/service/health.rs @@ -64,7 +64,7 @@ async fn health_handler( } else { let endpoints: Vec = model_entries .iter() - .map(|entry| entry.endpoint.as_url()) + .map(|entry| entry.instance.to_string()) .collect(); ( StatusCode::OK, diff --git a/lib/llm/src/kv_router.rs b/lib/llm/src/kv_router.rs index 535a428984..bd03c7f9f4 100644 --- a/lib/llm/src/kv_router.rs +++ b/lib/llm/src/kv_router.rs @@ -5,7 +5,7 @@ use std::sync::Arc; use anyhow::Result; use dynamo_runtime::{ - component::{Component, InstanceSource}, + entity::{Component, InstanceSource}, pipeline::{ async_trait, AsyncEngine, AsyncEngineContextProvider, Error, ManyOut, PushRouter, ResponseStream, SingleIn, diff --git a/lib/llm/src/kv_router/metrics_aggregator.rs b/lib/llm/src/kv_router/metrics_aggregator.rs index 156d1dfb02..fd4b313522 100644 --- a/lib/llm/src/kv_router/metrics_aggregator.rs +++ b/lib/llm/src/kv_router/metrics_aggregator.rs @@ -20,7 +20,7 @@ use crate::kv_router::KV_METRICS_ENDPOINT; use crate::kv_router::scheduler::Endpoint; use crate::kv_router::ProcessedEndpoints; -use dynamo_runtime::component::Component; +use dynamo_runtime::entity::Component; use dynamo_runtime::{service::EndpointInfo, utils::Duration, Result}; use tokio::sync::watch; use tokio_util::sync::CancellationToken; @@ -44,7 +44,7 @@ impl KvMetricsAggregator { )); Self { - service_name: component.service_name(), + service_name: component.to_descriptor().slug().to_string(), endpoints_rx: watch_rx, } } @@ -96,8 +96,8 @@ pub async fn collect_endpoints_task( ) { let backoff_delay = Duration::from_millis(100); let 
scrape_timeout = Duration::from_millis(300); - let endpoint = component.endpoint(KV_METRICS_ENDPOINT); - let service_subject = endpoint.subject(); + let endpoint = component.endpoint(KV_METRICS_ENDPOINT).unwrap(); + let service_subject = endpoint.to_descriptor().identifier().slug().to_string(); loop { tokio::select! { diff --git a/lib/llm/src/kv_router/publisher.rs b/lib/llm/src/kv_router/publisher.rs index d4bf56e0d8..120f5453d0 100644 --- a/lib/llm/src/kv_router/publisher.rs +++ b/lib/llm/src/kv_router/publisher.rs @@ -21,7 +21,7 @@ use crate::kv_router::{ use async_trait::async_trait; use dynamo_runtime::traits::{events::EventPublisher, DistributedRuntimeProvider}; use dynamo_runtime::{ - component::Component, + entity::Component, pipeline::{ network::Ingress, AsyncEngine, AsyncEngineContextProvider, ManyOut, ResponseStream, SingleIn, @@ -485,7 +485,7 @@ impl WorkerMetricsPublisher { let handler = Ingress::for_engine(handler)?; component - .endpoint(KV_METRICS_ENDPOINT) + .endpoint(KV_METRICS_ENDPOINT)? .endpoint_builder() .stats_handler(move |_| { let metrics = metrics_rx.borrow_and_update().clone(); diff --git a/lib/llm/src/kv_router/scheduler.rs b/lib/llm/src/kv_router/scheduler.rs index edf85d3198..70871c2e49 100644 --- a/lib/llm/src/kv_router/scheduler.rs +++ b/lib/llm/src/kv_router/scheduler.rs @@ -13,7 +13,7 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-use dynamo_runtime::component::Namespace; +use dynamo_runtime::entity::Namespace; use dynamo_runtime::traits::events::EventPublisher; use rand::Rng; use serde::{Deserialize, Serialize}; diff --git a/lib/llm/src/local_model.rs b/lib/llm/src/local_model.rs index 31e3d6ca71..02da0e7c60 100644 --- a/lib/llm/src/local_model.rs +++ b/lib/llm/src/local_model.rs @@ -7,16 +7,18 @@ use std::sync::Arc; use dynamo_runtime::traits::DistributedRuntimeProvider; use dynamo_runtime::{ - component::{Component, Endpoint}, + entity::{Component, Endpoint, ToEntity}, storage::key_value_store::{EtcdStorage, KeyValueStore, KeyValueStoreManager}, }; +use dynamo_runtime::discovery::DiscoveryClient; +use anyhow::Context; use crate::discovery::ModelEntry; use crate::model_card::{self, ModelDeploymentCard}; use crate::model_type::ModelType; mod network_name; -pub use network_name::ModelNetworkName; +use dynamo_runtime::descriptor::Instance; /// Prefix for Hugging Face model repository const HF_SCHEME: &str = "hf://"; @@ -143,10 +145,13 @@ impl LocalModel { model_type: ModelType, ) -> anyhow::Result<()> { // A static component doesn't have an etcd_client because it doesn't need to register - let Some(etcd_client) = endpoint.drt().etcd_client() else { + if endpoint.to_descriptor().is_static() { anyhow::bail!("Cannot attach to static endpoint"); - }; - self.ensure_unique(endpoint.component(), self.display_name()) + } + + let storage = endpoint.storage()?; + + self.ensure_unique(&endpoint.component(), self.display_name()) .await?; // Store model config files in NATS object store @@ -154,7 +159,9 @@ impl LocalModel { self.card.move_to_nats(nats_client.clone()).await?; // Publish the Model Deployment Card to etcd - let kvstore: Box = Box::new(EtcdStorage::new(etcd_client.clone())); + // TODO Figure out how to work this into the new ETCD storage method + // Using deprecated etcd_client() method now + let kvstore: Box = Box::new(EtcdStorage::new(endpoint.drt().etcd_client().unwrap())); let card_store 
= Arc::new(KeyValueStoreManager::new(kvstore)); let key = self.card.slug().to_string(); card_store @@ -163,16 +170,14 @@ impl LocalModel { // Publish our ModelEntry to etcd. This allows ingress to find the model card. // (Why don't we put the model card directly under this key?) - let network_name = ModelNetworkName::from_local(endpoint, etcd_client.lease_id()); - tracing::debug!("Registering with etcd as {network_name}"); + tracing::debug!("Registering with etcd as {endpoint}"); let model_registration = ModelEntry { name: self.display_name().to_string(), - endpoint: endpoint.id(), + instance: endpoint.to_descriptor(), model_type, }; - etcd_client - .kv_create( - network_name.to_string(), + storage + .create( serde_json::to_vec_pretty(&model_registration)?, None, // use primary lease ) @@ -186,19 +191,47 @@ impl LocalModel { /// /// Returns an error if there is already a component by this name serving a different model. async fn ensure_unique(&self, component: &Component, model_name: &str) -> anyhow::Result<()> { - let Some(etcd_client) = component.drt().etcd_client() else { + let Ok(storage) = component.storage() else { // A static component is necessarily unique, it cannot register return Ok(()); }; - for endpoint_info in component.list_instances().await? { - let network_name: ModelNetworkName = (&endpoint_info).into(); - if let Ok(entry) = network_name.load_entry(&etcd_client).await { - if entry.name != model_name { - anyhow::bail!("Duplicate component. Attempt to register model {model_name} at {component}, which is already used by {network_name} running model {}.", entry.name); - } + for subpath in storage.get_prefix().await? { + let Some(instance) = std::str::from_utf8(subpath.key()) + .ok() + .and_then(|s| Instance::parse(s).ok()) else { + continue; + }; + + // ModelEntry written under Endpoint. 
Parse Instance then turn into Endpoint + let mut model_entries = instance.clone().to_entity(component.drt().clone())?.storage()?.get().await?; + if model_entries.is_empty() { + anyhow::bail!("No ModelEntry in etcd for key {instance}"); + } + let model_entry = model_entries.remove(0); + let entry : ModelEntry = serde_json::from_slice(model_entry.value()).with_context(|| { + format!( + "Error deserializing JSON. Key={instance}. JSON={}", + model_entry.value_str().unwrap_or("INVALID UTF-8") + ) + })?; + + if entry.name != model_name { + anyhow::bail!("Duplicate component. Attempt to register model {model_name} at {component}, which is already used by {instance} running model {}.", entry.name); } } + Ok(()) + + // for endpoint_info in component.list_instances().await? { + // let network_name: ModelNetworkName = (&endpoint_info).into(); + + // if let Ok(entry) = network_name.load_entry(&etcd_client).await { + // if entry.name != model_name { + // anyhow::bail!("Duplicate component. Attempt to register model {model_name} at {component}, which is already used by {network_name} running model {}.", entry.name); + // } + // } + // } + // Ok(()) } } diff --git a/lib/llm/src/local_model/network_name.rs b/lib/llm/src/local_model/network_name.rs index 9c35337c46..72d8539596 100644 --- a/lib/llm/src/local_model/network_name.rs +++ b/lib/llm/src/local_model/network_name.rs @@ -1,75 +1,75 @@ -// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// SPDX-License-Identifier: Apache-2.0 +// // SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+// // SPDX-License-Identifier: Apache-2.0 -use anyhow::Context as _; +// use anyhow::Context as _; -use crate::discovery::{ModelEntry, MODEL_ROOT_PATH}; -use dynamo_runtime::component::{self, Instance}; -use dynamo_runtime::slug::Slug; -use dynamo_runtime::transports::etcd; +// use crate::discovery::{ModelEntry, MODEL_ROOT_PATH}; +// use dynamo_runtime::entity; +// use dynamo_runtime::slug::Slug; +// use dynamo_runtime::transports::etcd; -#[derive(Debug, Clone)] -pub struct ModelNetworkName(String); +// #[derive(Debug, Clone)] +// pub struct ModelNetworkName(String); -impl ModelNetworkName { - /// Key to store this model entry in networked key-value store (etcd). - /// - /// It looks like this: - /// ns.cp.ep-694d967ca5efd804 - fn from_parts(namespace: &str, component: &str, endpoint: &str, lease_id: i64) -> Self { - let model_root = MODEL_ROOT_PATH; - let slug = Slug::slugify(&format!("{namespace}.{component}.{endpoint}-{lease_id:x}")); - ModelNetworkName(format!("{model_root}/{slug}")) - } +// impl ModelNetworkName { +// /// Key to store this model entry in networked key-value store (etcd). 
+// /// +// /// It looks like this: +// /// ns.cp.ep-694d967ca5efd804 +// fn from_parts(namespace: &str, component: &str, endpoint: &str, lease_id: i64) -> Self { +// let model_root = MODEL_ROOT_PATH; +// let slug = Slug::slugify(&format!("{namespace}.{component}.{endpoint}-{lease_id:x}")); +// ModelNetworkName(format!("{model_root}/{slug}")) +// } - // We can't do From<&component::Endpoint> here because we also need the lease_id - pub fn from_local(endpoint: &component::Endpoint, lease_id: i64) -> Self { - Self::from_parts( - &endpoint.component().namespace().to_string(), - &endpoint.component().name(), - endpoint.name(), - lease_id, - ) - } +// // We can't do From<&component::Endpoint> here because we also need the lease_id +// pub fn from_local(endpoint: &entity::Endpoint, lease_id: i64) -> Self { +// Self::from_parts( +// &endpoint.component().namespace().to_string(), +// &endpoint.component().name(), +// endpoint.name(), +// lease_id, +// ) +// } - pub fn from_entry(entry: &ModelEntry, lease_id: i64) -> Self { - Self::from_parts( - &entry.endpoint.namespace, - &entry.endpoint.component, - &entry.endpoint.name, - lease_id, - ) - } +// pub fn from_entry(entry: &ModelEntry, lease_id: i64) -> Self { +// Self::from_parts( +// &entry.endpoint.namespace, +// &entry.endpoint.component, +// &entry.endpoint.name, +// lease_id, +// ) +// } - /// Fetch the ModelEntry from etcd. - pub async fn load_entry(&self, etcd_client: &etcd::Client) -> anyhow::Result { - let mut model_entries = etcd_client.kv_get(self.to_string(), None).await?; - if model_entries.is_empty() { - anyhow::bail!("No ModelEntry in etcd for key {self}"); - } - let model_entry = model_entries.remove(0); - serde_json::from_slice(model_entry.value()).with_context(|| { - format!( - "Error deserializing JSON. Key={self}. JSON={}", - model_entry.value_str().unwrap_or("INVALID UTF-8") - ) - }) - } -} +// /// Fetch the ModelEntry from etcd. 
+// pub async fn load_entry(&self, etcd_client: &etcd::Client) -> anyhow::Result { +// let mut model_entries = etcd_client.kv_get(self.to_string(), None).await?; +// if model_entries.is_empty() { +// anyhow::bail!("No ModelEntry in etcd for key {self}"); +// } +// let model_entry = model_entries.remove(0); +// serde_json::from_slice(model_entry.value()).with_context(|| { +// format!( +// "Error deserializing JSON. Key={self}. JSON={}", +// model_entry.value_str().unwrap_or("INVALID UTF-8") +// ) +// }) +// } +// } -impl From<&Instance> for ModelNetworkName { - fn from(cei: &Instance) -> Self { - Self::from_parts( - &cei.namespace, - &cei.component, - &cei.endpoint, - cei.instance_id, - ) - } -} +// impl From<&Instance> for ModelNetworkName { +// fn from(cei: &Instance) -> Self { +// Self::from_parts( +// &cei.namespace, +// &cei.component, +// &cei.endpoint, +// cei.instance_id, +// ) +// } +// } -impl std::fmt::Display for ModelNetworkName { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{}", self.0) - } -} +// impl std::fmt::Display for ModelNetworkName { +// fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { +// write!(f, "{}", self.0) +// } +// } diff --git a/lib/runtime/src/component.rs b/lib/runtime/src/component.rs index f716611b60..eaf6a1805d 100644 --- a/lib/runtime/src/component.rs +++ b/lib/runtime/src/component.rs @@ -1,602 +1,602 @@ -// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// SPDX-License-Identifier: Apache-2.0 - -//! The [Component] module defines the top-level API for building distributed applications. -//! -//! A distributed application consists of a set of [Component] that can host one -//! or more [Endpoint]. Each [Endpoint] is a network-accessible service -//! that can be accessed by other [Component] in the distributed application. -//! -//! 
A [Component] is made discoverable by registering it with the distributed runtime under -//! a [`Namespace`]. -//! -//! A [`Namespace`] is a logical grouping of [Component] that are grouped together. -//! -//! We might extend namespace to include grouping behavior, which would define groups of -//! components that are tightly coupled. -//! -//! A [Component] is the core building block of a distributed application. It is a logical -//! unit of work such as a `Preprocessor` or `SmartRouter` that has a well-defined role in the -//! distributed application. -//! -//! A [Component] can present to the distributed application one or more configuration files -//! which define how that component was constructed/configured and what capabilities it can -//! provide. -//! -//! Other [Component] can write to watching locations within a [Component] etcd -//! path. This allows the [Component] to take dynamic actions depending on the watch -//! triggers. -//! -//! TODO: Top-level Overview of Endpoints/Functions - -use crate::{discovery::Lease, service::ServiceSet, transports::etcd::EtcdPath}; - -use super::{ - error, - traits::*, - transports::etcd::{COMPONENT_KEYWORD, ENDPOINT_KEYWORD}, - transports::nats::Slug, - utils::Duration, - DistributedRuntime, Result, Runtime, -}; - -use crate::pipeline::network::{ingress::push_endpoint::PushEndpoint, PushWorkHandler}; -use crate::protocols::Endpoint as EndpointId; -use async_nats::{ - rustls::quic, - service::{Service, ServiceExt}, -}; -use derive_builder::Builder; -use derive_getters::Getters; -use educe::Educe; -use serde::{Deserialize, Serialize}; -use service::EndpointStatsHandler; -use std::{collections::HashMap, hash::Hash, sync::Arc}; -use validator::{Validate, ValidationError}; - -mod client; -#[allow(clippy::module_inception)] -mod component; -mod endpoint; -mod namespace; -mod registry; -pub mod service; - -pub use client::{Client, InstanceSource}; - -/// The root etcd path where each instance registers itself in etcd. 
-/// An instance is namespace+component+endpoint+lease_id and must be unique. -pub const INSTANCE_ROOT_PATH: &str = "instances"; - -/// The root etcd path where each namespace is registered in etcd. -pub const ETCD_ROOT_PATH: &str = "dynamo://"; - -#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)] -#[serde(rename_all = "snake_case")] -pub enum TransportType { - NatsTcp(String), -} - -#[derive(Default)] -pub struct RegistryInner { - services: HashMap, - stats_handlers: HashMap>>>, -} - -#[derive(Clone)] -pub struct Registry { - inner: Arc>, -} - -#[derive(Debug, Clone, Serialize, Deserialize)] -pub struct Instance { - pub component: String, - pub endpoint: String, - pub namespace: String, - pub instance_id: i64, - pub transport: TransportType, -} - -impl Instance { - pub fn id(&self) -> i64 { - self.instance_id - } -} - -/// A [Component] a discoverable entity in the distributed runtime. -/// You can host [Endpoint] on a [Component] by first creating -/// a [Service] then adding one or more [Endpoint] to the [Service]. -/// -/// You can also issue a request to a [Component]'s [Endpoint] by creating a [Client]. -#[derive(Educe, Builder, Clone, Validate)] -#[educe(Debug)] -#[builder(pattern = "owned")] -pub struct Component { - #[builder(private)] - #[educe(Debug(ignore))] - drt: Arc, - - // todo - restrict the namespace to a-z0-9-_A-Z - /// Name of the component - #[builder(setter(into))] - #[validate(custom(function = "validate_allowed_chars"))] - name: String, - - // todo - restrict the namespace to a-z0-9-_A-Z - /// Namespace - #[builder(setter(into))] - namespace: Namespace, - - // A static component's endpoints cannot be discovered via etcd, they are - // fixed at startup time. 
- is_static: bool, -} - -impl Hash for Component { - fn hash(&self, state: &mut H) { - self.namespace.name().hash(state); - self.name.hash(state); - self.is_static.hash(state); - } -} - -impl PartialEq for Component { - fn eq(&self, other: &Self) -> bool { - self.namespace.name() == other.namespace.name() - && self.name == other.name - && self.is_static == other.is_static - } -} - -impl Eq for Component {} - -impl std::fmt::Display for Component { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{}.{}", self.namespace.name(), self.name) - } -} - -impl DistributedRuntimeProvider for Component { - fn drt(&self) -> &DistributedRuntime { - &self.drt - } -} - -impl RuntimeProvider for Component { - fn rt(&self) -> &Runtime { - self.drt.rt() - } -} - -impl Component { - /// The component part of an instance path in etcd. - pub fn etcd_root(&self) -> String { - let ns = self.namespace.name(); - let cp = &self.name; - format!("{INSTANCE_ROOT_PATH}/{ns}/{cp}") - } - - pub fn service_name(&self) -> String { - let service_name = format!("{}_{}", self.namespace.name(), self.name); - Slug::slugify(&service_name).to_string() - } - - pub fn path(&self) -> String { - format!("{}/{}", self.namespace.name(), self.name) - } - - pub fn etcd_path(&self) -> EtcdPath { - EtcdPath::new_component(&self.namespace.name(), &self.name) - .expect("Component name and namespace should be valid") - } - - pub fn namespace(&self) -> &Namespace { - &self.namespace - } - - pub fn name(&self) -> String { - self.name.clone() - } - - pub fn endpoint(&self, endpoint: impl Into) -> Endpoint { - Endpoint { - component: self.clone(), - name: endpoint.into(), - is_static: self.is_static, - } - } - - pub async fn list_instances(&self) -> anyhow::Result> { - let Some(etcd_client) = self.drt.etcd_client() else { - return Ok(vec![]); - }; - let mut out = vec![]; - // The extra slash is important to only list exact component matches, not substrings. 
- for kv in etcd_client - .kv_get_prefix(format!("{}/", self.etcd_root())) - .await? - { - let val = match serde_json::from_slice::(kv.value()) { - Ok(val) => val, - Err(err) => { - anyhow::bail!( - "Error converting etcd response to Instance: {err}. {}", - kv.value_str()? - ); - } - }; - out.push(val); - } - Ok(out) - } - - pub async fn scrape_stats(&self, timeout: Duration) -> Result { - let service_name = self.service_name(); - let service_client = self.drt().service_client(); - service_client - .collect_services(&service_name, timeout) - .await - } - - /// TODO - /// - /// This method will scrape the stats for all available services - /// Returns a stream of `ServiceInfo` objects. - /// This should be consumed by a `[tokio::time::timeout_at`] because each services - /// will only respond once, but there is no way to know when all services have responded. - pub async fn stats_stream(&self) -> Result<()> { - unimplemented!("collect_stats") - } - - pub fn service_builder(&self) -> service::ServiceConfigBuilder { - service::ServiceConfigBuilder::from_component(self.clone()) - } -} - -impl ComponentBuilder { - pub fn from_runtime(drt: Arc) -> Self { - Self::default().drt(drt) - } -} - -#[derive(Debug, Clone)] -pub struct Endpoint { - component: Component, - - // todo - restrict alphabet - /// Endpoint name - name: String, - - is_static: bool, -} - -impl Hash for Endpoint { - fn hash(&self, state: &mut H) { - self.component.hash(state); - self.name.hash(state); - self.is_static.hash(state); - } -} - -impl PartialEq for Endpoint { - fn eq(&self, other: &Self) -> bool { - self.component == other.component - && self.name == other.name - && self.is_static == other.is_static - } -} - -impl Eq for Endpoint {} - -impl DistributedRuntimeProvider for Endpoint { - fn drt(&self) -> &DistributedRuntime { - self.component.drt() - } -} - -impl RuntimeProvider for Endpoint { - fn rt(&self) -> &Runtime { - self.component.rt() - } -} - -impl Endpoint { - pub fn id(&self) -> 
EndpointId { - EndpointId { - namespace: self.component.namespace().name().to_string(), - component: self.component.name().to_string(), - name: self.name().to_string(), - } - } - - pub fn name(&self) -> &str { - &self.name - } - - pub fn component(&self) -> &Component { - &self.component - } - - // todo(ryan): deprecate this as we move to Discovery traits and Component Identifiers - pub fn path(&self) -> String { - format!( - "{}/{}/{}", - self.component.path(), - ENDPOINT_KEYWORD, - self.name - ) - } - - /// The endpoint part of an instance path in etcd - pub fn etcd_root(&self) -> String { - let component_path = self.component.etcd_root(); - let endpoint_name = &self.name; - format!("{component_path}/{endpoint_name}") - } - - /// The endpoint as an EtcdPath object - pub fn etcd_path(&self) -> EtcdPath { - EtcdPath::new_endpoint( - &self.component.namespace().name(), - &self.component.name(), - &self.name, - ) - .expect("Endpoint name and component name should be valid") - } - - /// The fully path of an instance in etcd - pub fn etcd_path_with_lease_id(&self, lease_id: i64) -> String { - let endpoint_root = self.etcd_root(); - if self.is_static { - endpoint_root - } else { - format!("{endpoint_root}:{lease_id:x}") - } - } - - /// The endpoint as an EtcdPath object with lease ID - pub fn etcd_path_object_with_lease_id(&self, lease_id: i64) -> EtcdPath { - if self.is_static { - self.etcd_path() - } else { - EtcdPath::new_endpoint_with_lease( - &self.component.namespace().name(), - &self.component.name(), - &self.name, - lease_id, - ) - .expect("Endpoint name and component name should be valid") - } - } - - pub fn name_with_id(&self, lease_id: i64) -> String { - if self.is_static { - self.name.clone() - } else { - format!("{}-{:x}", self.name, lease_id) - } - } - - pub fn subject(&self) -> String { - format!("{}.{}", self.component.service_name(), self.name) - } - - /// Subject to an instance of the [Endpoint] with a specific lease id - pub fn subject_to(&self, 
lease_id: i64) -> String { - format!( - "{}.{}", - self.component.service_name(), - self.name_with_id(lease_id) - ) - } - - pub async fn client(&self) -> Result { - if self.is_static { - client::Client::new_static(self.clone()).await - } else { - client::Client::new_dynamic(self.clone()).await - } - } - - pub fn endpoint_builder(&self) -> endpoint::EndpointConfigBuilder { - endpoint::EndpointConfigBuilder::from_endpoint(self.clone()) - } -} - -#[derive(Builder, Clone, Validate)] -#[builder(pattern = "owned")] -pub struct Namespace { - #[builder(private)] - runtime: Arc, - - #[validate(custom(function = "validate_allowed_chars"))] - name: String, - - is_static: bool, - - #[builder(default = "None")] - parent: Option>, -} - -impl DistributedRuntimeProvider for Namespace { - fn drt(&self) -> &DistributedRuntime { - &self.runtime - } -} - -impl std::fmt::Debug for Namespace { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!( - f, - "Namespace {{ name: {}; is_static: {}; parent: {:?} }}", - self.name, self.is_static, self.parent - ) - } -} - -impl RuntimeProvider for Namespace { - fn rt(&self) -> &Runtime { - self.runtime.rt() - } -} - -impl std::fmt::Display for Namespace { - fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { - write!(f, "{}", self.name) - } -} - -impl Namespace { - pub(crate) fn new(runtime: DistributedRuntime, name: String, is_static: bool) -> Result { - Ok(NamespaceBuilder::default() - .runtime(Arc::new(runtime)) - .name(name) - .is_static(is_static) - .build()?) - } - - /// Create a [`Component`] in the namespace who's endpoints can be discovered with etcd - pub fn component(&self, name: impl Into) -> Result { - Ok(ComponentBuilder::from_runtime(self.runtime.clone()) - .name(name) - .namespace(self.clone()) - .is_static(self.is_static) - .build()?) 
- } - - /// Create a [`Namespace`] in the parent namespace - pub fn namespace(&self, name: impl Into) -> Result { - Ok(NamespaceBuilder::default() - .runtime(self.runtime.clone()) - .name(name.into()) - .is_static(self.is_static) - .parent(Some(Arc::new(self.clone()))) - .build()?) - } - - pub fn etcd_path(&self) -> String { - format!("{}{}", ETCD_ROOT_PATH, self.name()) - } - - pub fn name(&self) -> String { - match &self.parent { - Some(parent) => format!("{}.{}", parent.name(), self.name), - None => self.name.clone(), - } - } -} - -// Custom validator function -fn validate_allowed_chars(input: &str) -> Result<(), ValidationError> { - // Define the allowed character set using a regex - let regex = regex::Regex::new(r"^[a-z0-9-_]+$").unwrap(); - - if regex.is_match(input) { - Ok(()) - } else { - Err(ValidationError::new("invalid_characters")) - } -} - -// TODO - enable restrictions to the character sets allowed for namespaces, -// components, and endpoints. -// -// Put Validate traits on the struct and use the `validate_allowed_chars` method -// to validate the fields. - -// #[cfg(test)] -// mod tests { -// use super::*; -// use validator::Validate; - -// #[test] -// fn test_valid_names() { -// // Valid strings -// let valid_inputs = vec![ -// "abc", // Lowercase letters -// "abc123", // Letters and numbers -// "a-b-c", // Letters with hyphens -// "a_b_c", // Letters with underscores -// "a-b_c-123", // Mixed valid characters -// "a", // Single character -// "a_b", // Short valid pattern -// "123456", // Only numbers -// "a---b_c123", // Repeated hyphens/underscores -// ]; - -// for input in valid_inputs { -// let result = validate_allowed_chars(input); -// assert!(result.is_ok(), "Expected '{}' to be valid", input); +// // SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// // SPDX-License-Identifier: Apache-2.0 + +// //! The [Component] module defines the top-level API for building distributed applications. 
+// //! +// //! A distributed application consists of a set of [Component] that can host one +// //! or more [Endpoint]. Each [Endpoint] is a network-accessible service +// //! that can be accessed by other [Component] in the distributed application. +// //! +// //! A [Component] is made discoverable by registering it with the distributed runtime under +// //! a [`Namespace`]. +// //! +// //! A [`Namespace`] is a logical grouping of [Component] that are grouped together. +// //! +// //! We might extend namespace to include grouping behavior, which would define groups of +// //! components that are tightly coupled. +// //! +// //! A [Component] is the core building block of a distributed application. It is a logical +// //! unit of work such as a `Preprocessor` or `SmartRouter` that has a well-defined role in the +// //! distributed application. +// //! +// //! A [Component] can present to the distributed application one or more configuration files +// //! which define how that component was constructed/configured and what capabilities it can +// //! provide. +// //! +// //! Other [Component] can write to watching locations within a [Component] etcd +// //! path. This allows the [Component] to take dynamic actions depending on the watch +// //! triggers. +// //! +// //! 
TODO: Top-level Overview of Endpoints/Functions + +// use crate::{discovery::Lease, service::ServiceSet, transports::etcd::EtcdPath}; + +// use super::{ +// error, +// traits::*, +// transports::etcd::{COMPONENT_KEYWORD, ENDPOINT_KEYWORD}, +// transports::nats::Slug, +// utils::Duration, +// DistributedRuntime, Result, Runtime, +// }; + +// use crate::pipeline::network::{ingress::push_endpoint::PushEndpoint, PushWorkHandler}; +// use crate::protocols::Endpoint as EndpointId; +// use async_nats::{ +// rustls::quic, +// service::{Service, ServiceExt}, +// }; +// use derive_builder::Builder; +// use derive_getters::Getters; +// use educe::Educe; +// use serde::{Deserialize, Serialize}; +// use service::EndpointStatsHandler; +// use std::{collections::HashMap, hash::Hash, sync::Arc}; +// use validator::{Validate, ValidationError}; + +// mod client; +// #[allow(clippy::module_inception)] +// mod component; +// mod endpoint; +// mod namespace; +// mod registry; +// pub mod service; + +// pub use client::{Client, InstanceSource}; + +// /// The root etcd path where each instance registers itself in etcd. +// /// An instance is namespace+component+endpoint+lease_id and must be unique. +// pub const INSTANCE_ROOT_PATH: &str = "instances"; + +// /// The root etcd path where each namespace is registered in etcd. 
+// pub const ETCD_ROOT_PATH: &str = "dynamo://"; + +// #[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)] +// #[serde(rename_all = "snake_case")] +// pub enum TransportType { +// NatsTcp(String), +// } + +// #[derive(Default)] +// pub struct RegistryInner { +// services: HashMap, +// stats_handlers: HashMap>>>, +// } + +// #[derive(Clone)] +// pub struct Registry { +// inner: Arc>, +// } + +// #[derive(Debug, Clone, Serialize, Deserialize)] +// pub struct Instance { +// pub component: String, +// pub endpoint: String, +// pub namespace: String, +// pub instance_id: i64, +// pub transport: TransportType, +// } + +// impl Instance { +// pub fn id(&self) -> i64 { +// self.instance_id +// } +// } + +// /// A [Component] a discoverable entity in the distributed runtime. +// /// You can host [Endpoint] on a [Component] by first creating +// /// a [Service] then adding one or more [Endpoint] to the [Service]. +// /// +// /// You can also issue a request to a [Component]'s [Endpoint] by creating a [Client]. +// #[derive(Educe, Builder, Clone, Validate)] +// #[educe(Debug)] +// #[builder(pattern = "owned")] +// pub struct Component { +// #[builder(private)] +// #[educe(Debug(ignore))] +// drt: Arc, + +// // todo - restrict the namespace to a-z0-9-_A-Z +// /// Name of the component +// #[builder(setter(into))] +// #[validate(custom(function = "validate_allowed_chars"))] +// name: String, + +// // todo - restrict the namespace to a-z0-9-_A-Z +// /// Namespace +// #[builder(setter(into))] +// namespace: Namespace, + +// // A static component's endpoints cannot be discovered via etcd, they are +// // fixed at startup time. 
+// is_static: bool, +// } + +// impl Hash for Component { +// fn hash(&self, state: &mut H) { +// self.namespace.name().hash(state); +// self.name.hash(state); +// self.is_static.hash(state); +// } +// } + +// impl PartialEq for Component { +// fn eq(&self, other: &Self) -> bool { +// self.namespace.name() == other.namespace.name() +// && self.name == other.name +// && self.is_static == other.is_static +// } +// } + +// impl Eq for Component {} + +// impl std::fmt::Display for Component { +// fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { +// write!(f, "{}.{}", self.namespace.name(), self.name) +// } +// } + +// impl DistributedRuntimeProvider for Component { +// fn drt(&self) -> &DistributedRuntime { +// &self.drt +// } +// } + +// impl RuntimeProvider for Component { +// fn rt(&self) -> &subject_to { +// self.drt.rt() +// } +// } + +// impl Component { +// /// The component part of an instance path in etcd. +// pub fn etcd_root(&self) -> String { +// let ns = self.namespace.name(); +// let cp = &self.name; +// format!("{INSTANCE_ROOT_PATH}/{ns}/{cp}") +// } + +// pub fn service_name(&self) -> String { +// let service_name = format!("{}_{}", self.namespace.name(), self.name); +// Slug::slugify(&service_name).to_string() +// } + +// pub fn path(&self) -> String { +// format!("{}/{}", self.namespace.name(), self.name) +// } + +// pub fn etcd_path(&self) -> EtcdPath { +// EtcdPath::new_component(&self.namespace.name(), &self.name) +// .expect("Component name and namespace should be valid") +// } + +// pub fn namespace(&self) -> &Namespace { +// &self.namespace +// } + +// pub fn name(&self) -> String { +// self.name.clone() +// } + +// pub fn endpoint(&self, endpoint: impl Into) -> Endpoint { +// Endpoint { +// component: self.clone(), +// name: endpoint.into(), +// is_static: self.is_static, +// } +// } + +// pub async fn list_instances(&self) -> anyhow::Result> { +// let Some(etcd_client) = self.drt.etcd_client() else { +// return Ok(vec![]); 
+// }; +// let mut out = vec![]; +// // The extra slash is important to only list exact component matches, not substrings. +// for kv in etcd_client +// .kv_get_prefix(format!("{}/", self.etcd_root())) +// .await? +// { +// let val = match serde_json::from_slice::(kv.value()) { +// Ok(val) => val, +// Err(err) => { +// anyhow::bail!( +// "Error converting etcd response to Instance: {err}. {}", +// kv.value_str()? +// ); +// } +// }; +// out.push(val); +// } +// Ok(out) +// } + +// pub async fn scrape_stats(&self, timeout: Duration) -> Result { +// let service_name = self.service_name(); +// let service_client = self.drt().service_client(); +// service_client +// .collect_services(&service_name, timeout) +// .await +// } + +// /// TODO +// /// +// /// This method will scrape the stats for all available services +// /// Returns a stream of `ServiceInfo` objects. +// /// This should be consumed by a `[tokio::time::timeout_at`] because each services +// /// will only respond once, but there is no way to know when all services have responded. 
+// pub async fn stats_stream(&self) -> Result<()> { +// unimplemented!("collect_stats") +// } + +// pub fn service_builder(&self) -> service::ServiceConfigBuilder { +// service::ServiceConfigBuilder::from_component(self.clone()) +// } +// } + +// impl ComponentBuilder { +// pub fn from_runtime(drt: Arc) -> Self { +// Self::default().drt(drt) +// } +// } + +// #[derive(Debug, Clone)] +// pub struct Endpoint { +// component: Component, + +// // todo - restrict alphabet +// /// Endpoint name +// name: String, + +// is_static: bool, +// } + +// impl Hash for Endpoint { +// fn hash(&self, state: &mut H) { +// self.component.hash(state); +// self.name.hash(state); +// self.is_static.hash(state); +// } +// } + +// impl PartialEq for Endpoint { +// fn eq(&self, other: &Self) -> bool { +// self.component == other.component +// && self.name == other.name +// && self.is_static == other.is_static +// } +// } + +// impl Eq for Endpoint {} + +// impl DistributedRuntimeProvider for Endpoint { +// fn drt(&self) -> &DistributedRuntime { +// self.component.drt() +// } +// } + +// impl RuntimeProvider for Endpoint { +// fn rt(&self) -> &Runtime { +// self.component.rt() +// } +// } + +// impl Endpoint { +// pub fn id(&self) -> EndpointId { +// EndpointId { +// namespace: self.component.namespace().name().to_string(), +// component: self.component.name().to_string(), +// name: self.name().to_string(), // } // } -// #[test] -// fn test_invalid_names() { -// // Invalid strings -// let invalid_inputs = vec![ -// "abc!", // Invalid character `!` -// "abc@", // Invalid character `@` -// "123$", // Invalid character `$` -// "foo.bar", // Invalid character `.` -// "foo/bar", // Invalid character `/` -// "foo\\bar", // Invalid character `\` -// "abc#", // Invalid character `#` -// "abc def", // Spaces are not allowed -// "foo,", // Invalid character `,` -// "", // Empty string -// ]; - -// for input in invalid_inputs { -// let result = validate_allowed_chars(input); -// 
assert!(result.is_err(), "Expected '{}' to be invalid", input); +// pub fn name(&self) -> &str { +// &self.name +// } + +// pub fn component(&self) -> &Component { +// &self.component +// } + +// // todo(ryan): deprecate this as we move to Discovery traits and Component Identifiers +// pub fn path(&self) -> String { +// format!( +// "{}/{}/{}", +// self.component.path(), +// ENDPOINT_KEYWORD, +// self.name +// ) +// } + +// /// The endpoint part of an instance path in etcd +// pub fn etcd_root(&self) -> String { +// let component_path = self.component.etcd_root(); +// let endpoint_name = &self.name; +// format!("{component_path}/{endpoint_name}") +// } + +// /// The endpoint as an EtcdPath object +// pub fn etcd_path(&self) -> EtcdPath { +// EtcdPath::new_endpoint( +// &self.component.namespace().name(), +// &self.component.name(), +// &self.name, +// ) +// .expect("Endpoint name and component name should be valid") +// } + +// /// The fully path of an instance in etcd +// pub fn etcd_path_with_lease_id(&self, lease_id: i64) -> String { +// let endpoint_root = self.etcd_root(); +// if self.is_static { +// endpoint_root +// } else { +// format!("{endpoint_root}:{lease_id:x}") +// } +// } + +// /// The endpoint as an EtcdPath object with lease ID +// pub fn etcd_path_object_with_lease_id(&self, lease_id: i64) -> EtcdPath { +// if self.is_static { +// self.etcd_path() +// } else { +// EtcdPath::new_endpoint_with_lease( +// &self.component.namespace().name(), +// &self.component.name(), +// &self.name, +// lease_id, +// ) +// .expect("Endpoint name and component name should be valid") +// } +// } + +// pub fn name_with_id(&self, lease_id: i64) -> String { +// if self.is_static { +// self.name.clone() +// } else { +// format!("{}-{:x}", self.name, lease_id) // } // } -// // #[test] -// // fn test_struct_validation_valid() { -// // // Struct with valid data -// // let valid_data = InputData { -// // name: "valid-name_123".to_string(), -// // }; -// // 
assert!(valid_data.validate().is_ok()); -// // } - -// // #[test] -// // fn test_struct_validation_invalid() { -// // // Struct with invalid data -// // let invalid_data = InputData { -// // name: "invalid!name".to_string(), -// // }; -// // let result = invalid_data.validate(); -// // assert!(result.is_err()); - -// // if let Err(errors) = result { -// // let error_map = errors.field_errors(); -// // assert!(error_map.contains_key("name")); -// // let name_errors = &error_map["name"]; -// // assert_eq!(name_errors[0].code, "invalid_characters"); -// // } -// // } - -// #[test] -// fn test_edge_cases() { -// // Edge cases -// let edge_inputs = vec![ -// ("-", true), // Single hyphen -// ("_", true), // Single underscore -// ("a-", true), // Letter with hyphen -// ("-", false), // Repeated hyphens -// ("-a", false), // Hyphen at the beginning -// ("a-", false), // Hyphen at the end -// ]; - -// for (input, expected_validity) in edge_inputs { -// let result = validate_allowed_chars(input); -// if expected_validity { -// assert!(result.is_ok(), "Expected '{}' to be valid", input); -// } else { -// assert!(result.is_err(), "Expected '{}' to be invalid", input); -// } +// pub fn subject(&self) -> String { +// format!("{}.{}", self.component.service_name(), self.name) +// } + +// /// Subject to an instance of the [Endpoint] with a specific lease id +// pub fn subject_to(&self, lease_id: i64) -> String { +// format!( +// "{}.{}", +// self.component.service_name(), +// self.name_with_id(lease_id) +// ) +// } + +// pub async fn client(&self) -> Result { +// if self.is_static { +// client::Client::new_static(self.clone()).await +// } else { +// client::Client::new_dynamic(self.clone()).await // } // } + +// pub fn endpoint_builder(&self) -> endpoint::EndpointConfigBuilder { +// endpoint::EndpointConfigBuilder::from_endpoint(self.clone()) +// } // } + +// #[derive(Builder, Clone, Validate)] +// #[builder(pattern = "owned")] +// pub struct Namespace { +// #[builder(private)] 
+// runtime: Arc, + +// #[validate(custom(function = "validate_allowed_chars"))] +// name: String, + +// is_static: bool, + +// #[builder(default = "None")] +// parent: Option>, +// } + +// impl DistributedRuntimeProvider for Namespace { +// fn drt(&self) -> &DistributedRuntime { +// &self.runtime +// } +// } + +// impl std::fmt::Debug for Namespace { +// fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { +// write!( +// f, +// "Namespace {{ name: {}; is_static: {}; parent: {:?} }}", +// self.name, self.is_static, self.parent +// ) +// } +// } + +// impl RuntimeProvider for Namespace { +// fn rt(&self) -> &Runtime { +// self.runtime.rt() +// } +// } + +// impl std::fmt::Display for Namespace { +// fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { +// write!(f, "{}", self.name) +// } +// } + +// impl Namespace { +// pub(crate) fn new(runtime: DistributedRuntime, name: String, is_static: bool) -> Result { +// Ok(NamespaceBuilder::default() +// .runtime(Arc::new(runtime)) +// .name(name) +// .is_static(is_static) +// .build()?) +// } + +// /// Create a [`Component`] in the namespace who's endpoints can be discovered with etcd +// pub fn component(&self, name: impl Into) -> Result { +// Ok(ComponentBuilder::from_runtime(self.runtime.clone()) +// .name(name) +// .namespace(self.clone()) +// .is_static(self.is_static) +// .build()?) +// } + +// /// Create a [`Namespace`] in the parent namespace +// pub fn namespace(&self, name: impl Into) -> Result { +// Ok(NamespaceBuilder::default() +// .runtime(self.runtime.clone()) +// .name(name.into()) +// .is_static(self.is_static) +// .parent(Some(Arc::new(self.clone()))) +// .build()?) 
+// } + +// pub fn etcd_path(&self) -> String { +// format!("{}{}", ETCD_ROOT_PATH, self.name()) +// } + +// pub fn name(&self) -> String { +// match &self.parent { +// Some(parent) => format!("{}.{}", parent.name(), self.name), +// None => self.name.clone(), +// } +// } +// } + +// // Custom validator function +// fn validate_allowed_chars(input: &str) -> Result<(), ValidationError> { +// // Define the allowed character set using a regex +// let regex = regex::Regex::new(r"^[a-z0-9-_]+$").unwrap(); + +// if regex.is_match(input) { +// Ok(()) +// } else { +// Err(ValidationError::new("invalid_characters")) +// } +// } + +// // TODO - enable restrictions to the character sets allowed for namespaces, +// // components, and endpoints. +// // +// // Put Validate traits on the struct and use the `validate_allowed_chars` method +// // to validate the fields. + +// // #[cfg(test)] +// // mod tests { +// // use super::*; +// // use validator::Validate; + +// // #[test] +// // fn test_valid_names() { +// // // Valid strings +// // let valid_inputs = vec![ +// // "abc", // Lowercase letters +// // "abc123", // Letters and numbers +// // "a-b-c", // Letters with hyphens +// // "a_b_c", // Letters with underscores +// // "a-b_c-123", // Mixed valid characters +// // "a", // Single character +// // "a_b", // Short valid pattern +// // "123456", // Only numbers +// // "a---b_c123", // Repeated hyphens/underscores +// // ]; + +// // for input in valid_inputs { +// // let result = validate_allowed_chars(input); +// // assert!(result.is_ok(), "Expected '{}' to be valid", input); +// // } +// // } + +// // #[test] +// // fn test_invalid_names() { +// // // Invalid strings +// // let invalid_inputs = vec![ +// // "abc!", // Invalid character `!` +// // "abc@", // Invalid character `@` +// // "123$", // Invalid character `$` +// // "foo.bar", // Invalid character `.` +// // "foo/bar", // Invalid character `/` +// // "foo\\bar", // Invalid character `\` +// // "abc#", // Invalid 
character `#` +// // "abc def", // Spaces are not allowed +// // "foo,", // Invalid character `,` +// // "", // Empty string +// // ]; + +// // for input in invalid_inputs { +// // let result = validate_allowed_chars(input); +// // assert!(result.is_err(), "Expected '{}' to be invalid", input); +// // } +// // } + +// // // #[test] +// // // fn test_struct_validation_valid() { +// // // // Struct with valid data +// // // let valid_data = InputData { +// // // name: "valid-name_123".to_string(), +// // // }; +// // // assert!(valid_data.validate().is_ok()); +// // // } + +// // // #[test] +// // // fn test_struct_validation_invalid() { +// // // // Struct with invalid data +// // // let invalid_data = InputData { +// // // name: "invalid!name".to_string(), +// // // }; +// // // let result = invalid_data.validate(); +// // // assert!(result.is_err()); + +// // // if let Err(errors) = result { +// // // let error_map = errors.field_errors(); +// // // assert!(error_map.contains_key("name")); +// // // let name_errors = &error_map["name"]; +// // // assert_eq!(name_errors[0].code, "invalid_characters"); +// // // } +// // // } + +// // #[test] +// // fn test_edge_cases() { +// // // Edge cases +// // let edge_inputs = vec![ +// // ("-", true), // Single hyphen +// // ("_", true), // Single underscore +// // ("a-", true), // Letter with hyphen +// // ("-", false), // Repeated hyphens +// // ("-a", false), // Hyphen at the beginning +// // ("a-", false), // Hyphen at the end +// // ]; + +// // for (input, expected_validity) in edge_inputs { +// // let result = validate_allowed_chars(input); +// // if expected_validity { +// // assert!(result.is_ok(), "Expected '{}' to be valid", input); +// // } else { +// // assert!(result.is_err(), "Expected '{}' to be invalid", input); +// // } +// // } +// // } +// // } diff --git a/lib/runtime/src/component/client.rs b/lib/runtime/src/component/client.rs deleted file mode 100644 index ad5a7a083c..0000000000 --- 
a/lib/runtime/src/component/client.rs +++ /dev/null @@ -1,224 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// SPDX-License-Identifier: Apache-2.0 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use crate::pipeline::{ - AddressedPushRouter, AddressedRequest, AsyncEngine, Data, ManyOut, PushRouter, RouterMode, - SingleIn, -}; -use rand::Rng; -use std::collections::HashMap; -use std::sync::{ - atomic::{AtomicU64, Ordering}, - Arc, -}; -use tokio::{net::unix::pipe::Receiver, sync::Mutex}; - -use crate::{ - pipeline::async_trait, - transports::etcd::{Client as EtcdClient, WatchEvent}, -}; - -use super::*; - -/// Each state will be have a nonce associated with it -/// The state will be emitted in a watch channel, so we can observe the -/// critical state transitions. 
-enum MapState { - /// The map is empty; value = nonce - Empty(u64), - - /// The map is not-empty; values are (nonce, count) - NonEmpty(u64, u64), - - /// The watcher has finished, no more events will be emitted - Finished, -} - -enum EndpointEvent { - Put(String, i64), - Delete(String), -} - -#[derive(Clone, Debug)] -pub struct Client { - // This is me - pub endpoint: Endpoint, - // These are the remotes I know about - pub instance_source: Arc, -} - -#[derive(Clone, Debug)] -pub enum InstanceSource { - Static, - Dynamic(tokio::sync::watch::Receiver>), -} - -impl Client { - // Client will only talk to a single static endpoint - pub(crate) async fn new_static(endpoint: Endpoint) -> Result { - Ok(Client { - endpoint, - instance_source: Arc::new(InstanceSource::Static), - }) - } - - // Client with auto-discover instances using etcd - pub(crate) async fn new_dynamic(endpoint: Endpoint) -> Result { - // create live endpoint watcher - let Some(etcd_client) = &endpoint.component.drt.etcd_client else { - anyhow::bail!("Attempt to create a dynamic client on a static endpoint"); - }; - - let instance_source = - Self::get_or_create_dynamic_instance_source(etcd_client, &endpoint).await?; - - Ok(Client { - endpoint, - instance_source, - }) - } - - pub fn path(&self) -> String { - self.endpoint.path() - } - - /// The root etcd path we watch in etcd to discover new instances to route to. 
- pub fn etcd_root(&self) -> String { - self.endpoint.etcd_root() - } - - pub fn instances(&self) -> Vec { - match self.instance_source.as_ref() { - InstanceSource::Static => vec![], - InstanceSource::Dynamic(watch_rx) => watch_rx.borrow().clone(), - } - } - - pub fn instance_ids(&self) -> Vec { - self.instances().into_iter().map(|ep| ep.id()).collect() - } - - /// Wait for at least one Instance to be available for this Endpoint - pub async fn wait_for_instances(&self) -> Result> { - let mut instances: Vec = vec![]; - if let InstanceSource::Dynamic(mut rx) = self.instance_source.as_ref().clone() { - // wait for there to be 1 or more endpoints - loop { - instances = rx.borrow_and_update().to_vec(); - if instances.is_empty() { - rx.changed().await?; - } else { - break; - } - } - } - Ok(instances) - } - - /// Is this component know at startup and not discovered via etcd? - pub fn is_static(&self) -> bool { - matches!(self.instance_source.as_ref(), InstanceSource::Static) - } - - async fn get_or_create_dynamic_instance_source( - etcd_client: &EtcdClient, - endpoint: &Endpoint, - ) -> Result> { - let drt = endpoint.drt(); - let instance_sources = drt.instance_sources(); - let mut instance_sources = instance_sources.lock().await; - - if let Some(instance_source) = instance_sources.get(endpoint) { - if let Some(instance_source) = instance_source.upgrade() { - return Ok(instance_source); - } else { - instance_sources.remove(endpoint); - } - } - - let prefix_watcher = etcd_client - .kv_get_and_watch_prefix(endpoint.etcd_root()) - .await?; - - let (prefix, _watcher, mut kv_event_rx) = prefix_watcher.dissolve(); - - let (watch_tx, watch_rx) = tokio::sync::watch::channel(vec![]); - - let secondary = endpoint.component.drt.runtime.secondary().clone(); - - // this task should be included in the registry - // currently this is created once per client, but this object/task should only be instantiated - // once per worker/instance - secondary.spawn(async move { - 
tracing::debug!("Starting endpoint watcher for prefix: {}", prefix); - let mut map = HashMap::new(); - - loop { - let kv_event = tokio::select! { - _ = watch_tx.closed() => { - tracing::debug!("all watchers have closed; shutting down endpoint watcher for prefix: {prefix}"); - break; - } - kv_event = kv_event_rx.recv() => { - match kv_event { - Some(kv_event) => kv_event, - None => { - tracing::debug!("watch stream has closed; shutting down endpoint watcher for prefix: {prefix}"); - break; - } - } - } - }; - - match kv_event { - WatchEvent::Put(kv) => { - let key = String::from_utf8(kv.key().to_vec()); - let val = serde_json::from_slice::(kv.value()); - if let (Ok(key), Ok(val)) = (key, val) { - map.insert(key.clone(), val); - } else { - tracing::error!("Unable to parse put endpoint event; shutting down endpoint watcher for prefix: {prefix}"); - break; - } - } - WatchEvent::Delete(kv) => { - match String::from_utf8(kv.key().to_vec()) { - Ok(key) => { map.remove(&key); } - Err(_) => { - tracing::error!("Unable to parse delete endpoint event; shutting down endpoint watcher for prefix: {}", prefix); - break; - } - } - } - } - - let instances: Vec = map.values().cloned().collect(); - - if watch_tx.send(instances).is_err() { - tracing::debug!("Unable to send watch updates; shutting down endpoint watcher for prefix: {}", prefix); - break; - } - - } - - tracing::debug!("Completed endpoint watcher for prefix: {prefix}"); - let _ = watch_tx.send(vec![]); - }); - - let instance_source = Arc::new(InstanceSource::Dynamic(watch_rx)); - instance_sources.insert(endpoint.clone(), Arc::downgrade(&instance_source)); - Ok(instance_source) - } -} diff --git a/lib/runtime/src/component/endpoint.rs b/lib/runtime/src/component/endpoint.rs deleted file mode 100644 index 6b3e7d8ec1..0000000000 --- a/lib/runtime/src/component/endpoint.rs +++ /dev/null @@ -1,146 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
-// SPDX-License-Identifier: Apache-2.0 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use derive_getters::Dissolve; - -use super::*; - -pub use async_nats::service::endpoint::Stats as EndpointStats; - -#[derive(Educe, Builder, Dissolve)] -#[educe(Debug)] -#[builder(pattern = "owned", build_fn(private, name = "build_internal"))] -pub struct EndpointConfig { - #[builder(private)] - endpoint: Endpoint, - - // todo: move lease to component/service - /// Lease - #[educe(Debug(ignore))] - #[builder(default)] - lease: Option, - - /// Endpoint handler - #[educe(Debug(ignore))] - handler: Arc, - - /// Stats handler - #[educe(Debug(ignore))] - #[builder(default, private)] - _stats_handler: Option, -} - -impl EndpointConfigBuilder { - pub(crate) fn from_endpoint(endpoint: Endpoint) -> Self { - Self::default().endpoint(endpoint) - } - - pub fn stats_handler(self, handler: F) -> Self - where - F: FnMut(EndpointStats) -> serde_json::Value + Send + Sync + 'static, - { - self._stats_handler(Some(Box::new(handler))) - } - - pub async fn start(self) -> Result<()> { - let (endpoint, lease, handler, stats_handler) = self.build_internal()?.dissolve(); - let lease = lease.or(endpoint.drt().primary_lease()); - let lease_id = lease.as_ref().map(|l| l.id()).unwrap_or(0); - - tracing::debug!( - "Starting endpoint: {}", - endpoint.etcd_path_with_lease_id(lease_id) - ); - - let service_name = endpoint.component.service_name(); - - // acquire the registry lock - let registry = 
endpoint.drt().component_registry.inner.lock().await; - - // get the group - let group = registry - .services - .get(&service_name) - .map(|service| service.group(endpoint.component.service_name())) - .ok_or(error!("Service not found"))?; - - // get the stats handler map - let handler_map = registry - .stats_handlers - .get(&service_name) - .cloned() - .expect("no stats handler registry; this is unexpected"); - - drop(registry); - - // insert the stats handler - if let Some(stats_handler) = stats_handler { - handler_map - .lock() - .unwrap() - .insert(endpoint.subject_to(lease_id), stats_handler); - } - - // creates an endpoint for the service - let service_endpoint = group - .endpoint(&endpoint.name_with_id(lease_id)) - .await - .map_err(|e| anyhow::anyhow!("Failed to start endpoint: {e}"))?; - - let cancel_token = lease - .map(|l| l.child_token()) - .unwrap_or_else(|| endpoint.drt().child_token()); - - let push_endpoint = PushEndpoint::builder() - .service_handler(handler) - .cancellation_token(cancel_token.clone()) - .build() - .map_err(|e| anyhow::anyhow!("Failed to build push endpoint: {e}"))?; - - // launch in primary runtime - let task = tokio::spawn(push_endpoint.start(service_endpoint)); - - // make the components service endpoint discovery in etcd - - // client.register_service() - let info = Instance { - component: endpoint.component.name.clone(), - endpoint: endpoint.name.clone(), - namespace: endpoint.component.namespace.name.clone(), - instance_id: lease_id, - transport: TransportType::NatsTcp(endpoint.subject_to(lease_id)), - }; - - let info = serde_json::to_vec_pretty(&info)?; - - if let Some(etcd_client) = &endpoint.component.drt.etcd_client { - if let Err(e) = etcd_client - .kv_create( - endpoint.etcd_path_with_lease_id(lease_id), - info, - Some(lease_id), - ) - .await - { - tracing::error!("Failed to register discoverable service: {:?}", e); - cancel_token.cancel(); - return Err(error!("Failed to register discoverable service")); - } - } - 
task.await??; - - Ok(()) - } -} diff --git a/lib/runtime/src/component/service.rs b/lib/runtime/src/component/service.rs deleted file mode 100644 index 1a74b27f04..0000000000 --- a/lib/runtime/src/component/service.rs +++ /dev/null @@ -1,110 +0,0 @@ -// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. -// SPDX-License-Identifier: Apache-2.0 -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -use derive_getters::Dissolve; -use std::collections::HashMap; -use std::sync::Mutex; - -use super::*; - -pub use super::endpoint::EndpointStats; -pub type StatsHandler = - Box serde_json::Value + Send + Sync + 'static>; -pub type EndpointStatsHandler = - Box serde_json::Value + Send + Sync + 'static>; - -pub const PROJECT_NAME: &str = "Dynamo"; - -#[derive(Educe, Builder, Dissolve)] -#[educe(Debug)] -#[builder(pattern = "owned", build_fn(private, name = "build_internal"))] -pub struct ServiceConfig { - #[builder(private)] - component: Component, - - /// Description - #[builder(default)] - description: Option, -} - -impl ServiceConfigBuilder { - /// Create the [`Component`]'s service and store it in the registry. 
- pub async fn create(self) -> Result { - let (component, description) = self.build_internal()?.dissolve(); - - let version = "0.0.1".to_string(); - - let service_name = component.service_name(); - log::debug!("component: {component}; creating, service_name: {service_name}"); - - let description = description.unwrap_or(format!( - "{PROJECT_NAME} component {} in namespace {}", - component.name, component.namespace - )); - - let stats_handler_registry: Arc>> = - Arc::new(Mutex::new(HashMap::new())); - - let stats_handler_registry_clone = stats_handler_registry.clone(); - - let mut guard = component.drt.component_registry.inner.lock().await; - - if guard.services.contains_key(&service_name) { - return Err(anyhow::anyhow!("Service already exists")); - } - - // create service on the secondary runtime - let builder = component.drt.nats_client.client().service_builder(); - - tracing::debug!("Starting service: {}", service_name); - let service_builder = builder - .description(description) - .stats_handler(move |name, stats| { - log::trace!("stats_handler: {name}, {stats:?}"); - let mut guard = stats_handler_registry.lock().unwrap(); - match guard.get_mut(&name) { - Some(handler) => handler(stats), - None => serde_json::Value::Null, - } - }); - tracing::debug!("Got builder"); - let service = service_builder - .start(service_name.clone(), version) - .await - .map_err(|e| anyhow::anyhow!("Failed to start service: {e}"))?; - - // new copy of service_name as the previous one is moved into the task above - let service_name = component.service_name(); - - // insert the service into the registry - guard.services.insert(service_name.clone(), service); - - // insert the stats handler into the registry - guard - .stats_handlers - .insert(service_name, stats_handler_registry_clone); - - // drop the guard to unlock the mutex - drop(guard); - - Ok(component) - } -} - -impl ServiceConfigBuilder { - pub(crate) fn from_component(component: Component) -> Self { - 
Self::default().component(component) - } -} diff --git a/lib/runtime/src/descriptor.rs b/lib/runtime/src/descriptor.rs new file mode 100644 index 0000000000..f65a946955 --- /dev/null +++ b/lib/runtime/src/descriptor.rs @@ -0,0 +1,1130 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Pure data descriptors for component identification without runtime dependencies. +//! +//! This module implements the descriptor layer of Dynamo's two-tier architecture for +//! distributed component management. Descriptors are immutable data structures that +//! represent component paths and identities in the canonical `dynamo://` format. +//! +//! # Architecture Overview +//! +//! The descriptor system provides three core types that build on each other: +//! +//! - [`Identifier`]: Basic identification for namespaces, components, and endpoints +//! - [`Instance`]: An endpoint identifier extended with an instance ID (lease ID) +//! - [`Keys`]: Extended paths with additional segments for arbitrary data storage +//! +//! All descriptors: +//! - Own the canonical `dynamo://` path format +//! - Provide validation and parsing +//! - Are immutable once created +//! - Have no runtime dependencies +//! +//! # Path Format +//! +//! Dynamo paths follow a hierarchical structure with reserved keywords: +//! +//! ```text +//! dynamo://namespace[/_component_/name][/_endpoint_/name[:instance_id]][/_path_/segments...] +//! ``` +//! +//! Reserved keywords: +//! - `_component_`: Marks the component section +//! - `_endpoint_`: Marks the endpoint section +//! - `_path_`: Marks extended path segments +//! - `_static_`: Marks static endpoints (single instance) +//! +//! # Usage Examples +//! +//! ```ignore +//! use dynamo::runtime::descriptor::{Identifier, Instance, Keys}; +//! +//! // Create basic identifiers +//! let ns = Identifier::new_namespace("production.api")?; +//! 
let comp = Identifier::new_component("production", "gateway")?; +//! let ep = Identifier::new_endpoint("production", "gateway", "http")?; +//! +//! // Create instance with ID +//! let instance = Instance::new(ep.clone(), 0x1234)?; +//! assert_eq!(instance.to_string(), "dynamo://production/_component_/gateway/_endpoint_/http:1234"); +//! +//! // Create extended paths with Keys +//! let keys = Keys::from_identifier(comp, vec!["v1".to_string(), "config".to_string()])?; +//! assert_eq!(keys.to_string(), "dynamo://production/_component_/gateway/_path_/v1/config"); +//! ``` +//! +//! # Lenient Parsing +//! +//! Descriptors support lenient parsing when converting from strings: +//! +//! ```ignore +//! let path = "dynamo://ns/_component_/comp/_endpoint_/ep:1234/_path_/extra/data"; +//! +//! // Parse as simpler type drops extra information +//! let id: Identifier = path.try_into()?; // Drops :1234 and /extra/data +//! let inst: Instance = path.try_into()?; // Drops /extra/data +//! let keys: Keys = path.try_into()?; // Preserves everything +//! ``` +//! +//! # Validation Rules +//! +//! - Names can only contain lowercase letters, numbers, hyphens, and underscores +//! - User-provided names cannot start with underscore (reserved for keywords) +//! - Namespaces support dot notation for hierarchical organization +//! 
- Instance IDs are represented as lowercase hexadecimal + +use once_cell::sync::Lazy; +use std::str::FromStr; +use validator::ValidationError; +use serde::{Deserialize, Serialize}; + +use crate::slug::Slug; + +// TODO Make DYN_ROOT_PATH part of Identifier tied to distributedRuntime +pub const DYN_ROOT_PATH: &str = "dyn"; +pub const COMPONENT_KEYWORD: &str = "_component_"; +pub const ENDPOINT_KEYWORD: &str = "_endpoint_"; +pub const PATH_KEYWORD: &str = "_path_"; +pub const BARRIER_KEYWORD: &str = "_barrier_"; +pub const STATIC_KEYWORD: &str = "_static_"; + +/// Errors that can occur during descriptor operations +#[derive(Debug, thiserror::Error)] +pub enum DescriptorError { + #[error("Path must start with '{}'", DYN_ROOT_PATH)] + InvalidPrefix, + #[error("Invalid namespace: {0}")] + InvalidNamespace(String), + #[error("Invalid component name: {0}")] + InvalidComponent(String), + #[error("Invalid endpoint name: {0}")] + InvalidEndpoint(String), + #[error("Invalid path segment: {0}")] + InvalidPathSegment(String), + #[error("Endpoint requires component to be present")] + EndpointWithoutComponent, + #[error("Reserved keyword '{0}' cannot be used in path segments")] + ReservedKeyword(String), + #[error("Empty namespace not allowed")] + EmptyNamespace, + #[error("Empty component name not allowed")] + EmptyComponent, + #[error("Empty endpoint name not allowed")] + EmptyEndpoint, + #[error("Missing instance ID in path")] + MissingInstanceId, + #[error("Invalid instance ID format: {0}")] + InvalidInstanceId(String), +} + +/// Intermediate representation of a parsed dynamo path +#[derive(Debug)] +struct DynamoPath { + namespace: String, + component: Option, + endpoint: Option, + instance_id: Option, + is_static: bool, + extra_segments: Vec, +} + +impl DynamoPath { + /// Parse any dynamo:// path into intermediate representation + fn parse(input: &str) -> Result { + // Check for required prefix + if !input.starts_with(DYN_ROOT_PATH) { + return 
Err(DescriptorError::InvalidPrefix); + } + + // Remove prefix and split into segments + let path_without_prefix = &input[DYN_ROOT_PATH.len()..]; + let segments: Vec<&str> = path_without_prefix.split('/').collect(); + + if segments.is_empty() || segments[0].is_empty() { + return Err(DescriptorError::EmptyNamespace); + } + + let namespace = segments[0].to_string(); + validate_namespace(&namespace)?; + + let mut component: Option = None; + let mut endpoint: Option = None; + let mut instance_id: Option = None; + let mut is_static = false; + let mut extra_segments = Vec::new(); + + let mut i = 1; + while i < segments.len() { + match segments[i] { + COMPONENT_KEYWORD => { + // Check if component was already set + if component.is_some() { + return Err(DescriptorError::InvalidPathSegment( + "Duplicate _component_ keyword in path".to_string() + )); + } + if i + 1 >= segments.len() { + return Err(DescriptorError::EmptyComponent); + } + let component_name = segments[i + 1]; + validate_component(component_name)?; + component = Some(component_name.to_string()); + i += 2; + } + ENDPOINT_KEYWORD => { + if component.is_none() { + return Err(DescriptorError::EndpointWithoutComponent); + } + // Check if endpoint was already set + if endpoint.is_some() { + return Err(DescriptorError::InvalidPathSegment( + "Duplicate _endpoint_ keyword in path".to_string() + )); + } + if i + 1 >= segments.len() { + return Err(DescriptorError::EmptyEndpoint); + } + + let endpoint_segment = segments[i + 1]; + + // Check for instance ID suffix (:hex_id) + if let Some(colon_pos) = endpoint_segment.find(':') { + let endpoint_name = &endpoint_segment[..colon_pos]; + let id_str = &endpoint_segment[colon_pos + 1..]; + + // Parse instance ID as hexadecimal + instance_id = Some(i64::from_str_radix(id_str, 16).map_err(|_| { + DescriptorError::InvalidInstanceId(id_str.to_string()) + })?); + + validate_endpoint(endpoint_name)?; + endpoint = Some(endpoint_name.to_string()); + i += 2; + } else { + 
validate_endpoint(endpoint_segment)?; + endpoint = Some(endpoint_segment.to_string()); + // Check for /_static_ after endpoint + if i + 2 < segments.len() && segments[i + 2] == STATIC_KEYWORD { + is_static = true; + i += 3; + } else { + i += 2; + } + } + } + PATH_KEYWORD => { + // Valid _path_ extension at any level + i += 1; + while i < segments.len() { + validate_extra_path_segment(segments[i])?; + extra_segments.push(segments[i].to_string()); + i += 1; + } + } + _ => { + // Any other segment is invalid - must use _path_ for extensions + return Err(DescriptorError::InvalidPathSegment(format!( + "Invalid path format: unexpected segment '{}' - use '{}' keyword for path extensions", + segments[i], PATH_KEYWORD + ))); + } + } + } + + Ok(DynamoPath { + namespace, + component, + endpoint, + instance_id, + is_static, + extra_segments, + }) + } + + /// Convert to Identifier (drops instance_id and extra segments if present) + fn try_into_identifier(self) -> Result { + // Note: We allow parsing paths with instance_id or extra segments, + // we just drop them. This enables more flexible parsing. 
+ + Ok(Identifier { + namespace: self.namespace, + component: self.component, + endpoint: self.endpoint, + }) + } + + /// Convert to Instance (drops extra segments if present) + fn try_into_instance(self) -> Result { + let component = self.component.ok_or(DescriptorError::EmptyComponent)?; + let endpoint = self.endpoint.ok_or(DescriptorError::EmptyEndpoint)?; + let identifier = Identifier { + namespace: self.namespace, + component: Some(component), + endpoint: Some(endpoint), + }; + if self.is_static { + Ok(Instance { + identifier, + instance_id: None, + is_static: true, + }) + } else if let Some(instance_id) = self.instance_id { + Ok(Instance { + identifier, + instance_id: Some(instance_id), + is_static: false, + }) + } else { + Err(DescriptorError::MissingInstanceId) + } + } + + /// Convert to Keys (with validation) + fn try_into_keys(self) -> Result { + let base = if self.is_static || self.instance_id.is_some() { + let component = self.component.ok_or(DescriptorError::EmptyComponent)?; + let endpoint = self.endpoint.ok_or(DescriptorError::EmptyEndpoint)?; + let identifier = Identifier { + namespace: self.namespace, + component: Some(component), + endpoint: Some(endpoint), + }; + KeysBase::Instance(Instance { + identifier, + instance_id: self.instance_id, + is_static: self.is_static, + }) + } else { + let identifier = Identifier { + namespace: self.namespace, + component: self.component, + endpoint: self.endpoint, + }; + KeysBase::Identifier(identifier) + }; + Ok(Keys { + base, + keys: self.extra_segments, + }) + } +} + +/// Pure data descriptor for component identification +/// Owns the canonical path format and validation logic +#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize, Default)] +pub struct Identifier { + namespace: String, + component: Option, + endpoint: Option, +} + +impl Identifier { + /// Create namespace-only identifier + pub fn new_namespace(namespace: &str) -> Result { + validate_namespace(namespace)?; + Ok(Self { + namespace: 
namespace.to_string(), + component: None, + endpoint: None, + }) + } + + /// Create component identifier + pub fn new_component(namespace: &str, component: &str) -> Result { + validate_namespace(namespace)?; + validate_component(component)?; + Ok(Self { + namespace: namespace.to_string(), + component: Some(component.to_string()), + endpoint: None, + }) + } + + /// Create endpoint identifier + pub fn new_endpoint(namespace: &str, component: &str, endpoint: &str) -> Result { + validate_namespace(namespace)?; + validate_component(component)?; + validate_endpoint(endpoint)?; + Ok(Self { + namespace: namespace.to_string(), + component: Some(component.to_string()), + endpoint: Some(endpoint.to_string()), + }) + } + + /// Parse from canonical string representation + pub fn parse(input: &str) -> Result { + input.try_into() + } + + /// Get the namespace + pub fn namespace_name(&self) -> &str { + &self.namespace + } + + /// Get the component if present + pub fn component_name(&self) -> Option<&str> { + self.component.as_deref() + } + + /// Get the endpoint if present + pub fn endpoint_name(&self) -> Option<&str> { + self.endpoint.as_deref() + } + + /// Validate the identifier + pub fn validate(&self) -> Result<(), DescriptorError> { + validate_namespace(&self.namespace)?; + + if let Some(ref component) = self.component { + validate_component(component)?; + } + + if let Some(ref endpoint) = self.endpoint { + validate_endpoint(endpoint)?; + } + + Ok(()) + } + + /// Generate a slugified subject string for event publishing + pub fn slug(&self) -> Slug { + Slug::slugify(&self.to_string()) + } + + /// Create a namespace-only identifier from this identifier + pub fn to_namespace(&self) -> Identifier { + Identifier { + namespace: self.namespace.clone(), + component: None, + endpoint: None, + } + } + + /// Create a component identifier from this identifier (requires component to be present) + pub fn to_component(&self) -> Option { + self.component.as_ref().map(|comp| Identifier { + 
namespace: self.namespace.clone(), + component: Some(comp.clone()), + endpoint: None, + }) + } +} + +impl std::fmt::Display for Identifier { + /// Builds the canonical string representation in the format: + /// dynamo://namespace[/_component_/component][/_endpoint_/endpoint] + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}/{}", DYN_ROOT_PATH, self.namespace)?; + + if let Some(ref component) = self.component { + write!(f, "/{}/{}", COMPONENT_KEYWORD, component)?; + + if let Some(ref endpoint) = self.endpoint { + write!(f, "/{}/{}", ENDPOINT_KEYWORD, endpoint)?; + } + } + + Ok(()) + } +} + +impl std::str::FromStr for Identifier { + type Err = DescriptorError; + + fn from_str(s: &str) -> Result { + Self::parse(s) + } +} + +impl TryFrom<&str> for Identifier { + type Error = DescriptorError; + + fn try_from(input: &str) -> Result { + DynamoPath::parse(input)?.try_into_identifier() + } +} + +/// Identifier extended with instance_id (lease_id) +/// Immutable - identifier cannot be changed after construction +#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)] +pub struct Instance { + identifier: Identifier, // Private to enforce immutability + instance_id: Option, + is_static: bool // We can have static instances which don't have an instance_id. 
In this case its guaranteed there will only be one instance +} + +impl Instance { + /// Create from identifier, takes ownership to enforce immutability + pub fn new(identifier: Identifier, instance_id: i64) -> Result { + identifier.validate()?; + if identifier.endpoint_name().is_none() { + return Err(DescriptorError::InvalidEndpoint( + "Instance ID can only be attached to endpoints".to_string() + )); + } + Ok(Self { + identifier, + instance_id: Some(instance_id), + is_static: false + }) + } + + pub fn new_static(identifier: Identifier) -> Result { + identifier.validate()?; + if identifier.endpoint_name().is_none() { + return Err(DescriptorError::InvalidEndpoint( + "Instance ID can only be attached to endpoints".to_string() + )); + } + Ok(Self { + identifier, + instance_id: None, + is_static: true + }) + } + + /// Create instance from individual path components + /// This is a convenience constructor that builds the full identifier from parts + pub fn from_parts( + namespace: &str, + component: &str, + endpoint: &str, + instance_id: i64, + ) -> Result { + let identifier = Identifier::new_endpoint(namespace, component, endpoint)?; + Self::new(identifier, instance_id) + } + + pub fn parse(input: &str) -> Result { + input.try_into() + } + + pub fn identifier(&self) -> Identifier { + self.identifier.clone() + } + + pub fn instance_id(&self) -> Option { + self.instance_id + } + + pub fn is_static(&self) -> bool { + self.is_static + } + + /// Generate a slugified subject string for event publishing + pub fn slug(&self) -> Slug { + Slug::slugify(&self.to_string()) + } +} + +impl std::fmt::Display for Instance { + /// Builds the canonical string representation in the format: + /// dynamo://namespace/_component_/component/_endpoint_/endpoint:hex_id + /// The instance_id is formatted as lowercase hexadecimal after the endpoint + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + write!(f, "{}/{}", DYN_ROOT_PATH, self.identifier.namespace)?; + + if let 
Some(ref component) = self.identifier.component { + write!(f, "/{}/{}", COMPONENT_KEYWORD, component)?; + + if let Some(ref endpoint) = self.identifier.endpoint { + write!(f, "/{}/{}", ENDPOINT_KEYWORD, endpoint)?; + } + if let Some(instance_id) = self.instance_id{ + write!(f, ":{:x}", instance_id)?; + } else { + write!(f, "/{}", STATIC_KEYWORD)?; + } + } + + Ok(()) + } +} + +impl std::str::FromStr for Instance { + type Err = DescriptorError; + + fn from_str(s: &str) -> Result { + Self::parse(s) + } +} + +impl TryFrom<&str> for Instance { + type Error = DescriptorError; + + fn try_from(input: &str) -> Result { + DynamoPath::parse(input)?.try_into_instance() + } +} + +/// Descriptor with additional path segments for extended paths +/// Always inserts _path_ before the segments +#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)] +pub struct Keys { + base: KeysBase, // Either Identifier or Instance + keys: Vec, +} + +/// Base can be either Identifier or Instance +#[derive(Debug, Clone, PartialEq, Eq, Hash, Serialize, Deserialize)] +pub enum KeysBase { + Identifier(Identifier), + Instance(Instance), +} + +impl Keys { + /// Create from identifier with path segments + /// All segments will be placed under _path_ + pub fn from_identifier(identifier: Identifier, keys: Vec) -> Result { + identifier.validate()?; + for key in &keys { + validate_path_segment(key)?; + } + Ok(Self { + base: KeysBase::Identifier(identifier), + keys, + }) + } + + /// Create from instance with path segments + /// All segments will be placed under _path_ + pub fn from_instance(instance: Instance, keys: Vec) -> Result { + for key in &keys { + validate_path_segment(key)?; + } + Ok(Self { + base: KeysBase::Instance(instance), + keys, + }) + } + + /// Parse from canonical string representation + pub fn parse(input: &str) -> Result { + input.try_into() + } + + /// Get the base descriptor + pub fn base(&self) -> &KeysBase { + &self.base + } + + /// Get the additional keys + pub fn 
keys(&self) -> &[String] { + &self.keys + } + + /// Generate a slugified subject string for event publishing + pub fn slug(&self) -> Slug { + Slug::slugify_unique(&self.to_string()) + } + +} + +impl std::fmt::Display for Keys { + /// Builds the canonical string representation by combining the base path + /// (either Identifier or Instance) with _path_ and additional segments. + /// Format: base_path/_path_/segment1/segment2... + fn fmt(&self, f: &mut std::fmt::Formatter<'_>) -> std::fmt::Result { + match &self.base { + KeysBase::Identifier(id) => write!(f, "{}", id)?, + KeysBase::Instance(inst) => write!(f, "{}", inst)?, + } + + // Always insert _path_ + write!(f, "/{}", PATH_KEYWORD)?; + + // Add the segments + for key in &self.keys { + write!(f, "/{}", key)?; + } + + Ok(()) + } +} + +impl std::str::FromStr for Keys { + type Err = DescriptorError; + + fn from_str(s: &str) -> Result { + Self::parse(s) + } +} + +impl TryFrom<&str> for Keys { + type Error = DescriptorError; + + fn try_from(input: &str) -> Result { + DynamoPath::parse(input)?.try_into_keys() + } +} + +// Validation helpers + +static ALLOWED_CHARS_REGEX: Lazy = + Lazy::new(|| regex::Regex::new(r"^[a-z0-9-_]+$").unwrap()); + +/// Validation for namespace segments +fn validate_namespace(namespace: &str) -> Result<(), DescriptorError> { + if namespace.is_empty() { + return Err(DescriptorError::EmptyNamespace); + } + + // Split by dots and validate each part + for part in namespace.split('.') { + if part.is_empty() { + return Err(DescriptorError::InvalidNamespace(format!( + "Empty namespace segment in '{}'", + namespace + ))); + } + // Namespace segments cannot start with underscore (reserved for internal use) + if part.starts_with('_') { + return Err(DescriptorError::InvalidNamespace( + format!("Namespace segment '{}' cannot start with underscore (reserved for internal use)", part) + )); + } + validate_allowed_chars(part).map_err(|_| { + DescriptorError::InvalidNamespace(format!("Invalid characters in 
'{}'", part)) + })?; + } + Ok(()) +} + +/// Validation for component names +fn validate_component(component: &str) -> Result<(), DescriptorError> { + if component.is_empty() { + return Err(DescriptorError::EmptyComponent); + } + // Component names cannot start with underscore (reserved for internal use) + if component.starts_with('_') { + return Err(DescriptorError::InvalidComponent( + format!("Component name '{}' cannot start with underscore (reserved for internal use)", component) + )); + } + validate_allowed_chars(component) + .map_err(|_| DescriptorError::InvalidComponent(component.to_string())) +} + +/// Validation for endpoint names +fn validate_endpoint(endpoint: &str) -> Result<(), DescriptorError> { + if endpoint.is_empty() { + return Err(DescriptorError::EmptyEndpoint); + } + // Endpoint names cannot start with underscore (reserved for internal use) + if endpoint.starts_with('_') { + return Err(DescriptorError::InvalidEndpoint( + format!("Endpoint name '{}' cannot start with underscore (reserved for internal use)", endpoint) + )); + } + validate_allowed_chars(endpoint) + .map_err(|_| DescriptorError::InvalidEndpoint(endpoint.to_string())) +} + +/// Validation for path segments (no segments starting with underscore) +fn validate_path_segment(segment: &str) -> Result<(), DescriptorError> { + if segment.is_empty() { + return Err(DescriptorError::InvalidPathSegment( + "Empty path segment".to_string(), + )); + } + + // No segments starting with underscore (reserved for internal use) + if segment.starts_with('_') { + return Err(DescriptorError::InvalidPathSegment( + format!("Path segment '{}' cannot start with underscore (reserved for internal use)", segment) + )); + } + + validate_allowed_chars(segment) + .map_err(|_| DescriptorError::InvalidPathSegment(segment.to_string())) +} + +/// Validate extra path segments (used by DynamoPath parsing) +fn validate_extra_path_segment(segment: &str) -> Result<(), DescriptorError> { + if segment.is_empty() { + return 
Err(DescriptorError::InvalidPathSegment( + "Empty path segment".to_string(), + )); + } + + // No segments starting with underscore (reserved for internal use) + if segment.starts_with('_') { + return Err(DescriptorError::ReservedKeyword(segment.to_string())); + } + + validate_allowed_chars(segment) + .map_err(|_| DescriptorError::InvalidPathSegment(segment.to_string())) +} + +/// Core validation for allowed characters +fn validate_allowed_chars(input: &str) -> Result<(), ValidationError> { + if ALLOWED_CHARS_REGEX.is_match(input) { + Ok(()) + } else { + Err(ValidationError::new("invalid_characters")) + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_identifier_namespace_only() { + let id = Identifier::new_namespace("production.api.v1").unwrap(); + assert_eq!(id.namespace_name(), "production.api.v1"); + assert_eq!(id.component_name(), None); + assert_eq!(id.endpoint_name(), None); + assert_eq!(id.to_string(), "dynamo://production.api.v1"); + } + + #[test] + fn test_identifier_with_component() { + let id = Identifier::new_component("production.api.v1", "gateway").unwrap(); + assert_eq!(id.namespace_name(), "production.api.v1"); + assert_eq!(id.component_name(), Some("gateway")); + assert_eq!(id.endpoint_name(), None); + assert_eq!(id.to_string(), "dynamo://production.api.v1/_component_/gateway"); + } + + #[test] + fn test_identifier_with_endpoint() { + let id = Identifier::new_endpoint("production.api.v1", "gateway", "http").unwrap(); + assert_eq!(id.namespace_name(), "production.api.v1"); + assert_eq!(id.component_name(), Some("gateway")); + assert_eq!(id.endpoint_name(), Some("http")); + assert_eq!( + id.to_string(), + "dynamo://production.api.v1/_component_/gateway/_endpoint_/http" + ); + } + + #[test] + fn test_identifier_parse() { + let id: Identifier = "dynamo://production.api.v1".parse().unwrap(); + assert_eq!(id.namespace_name(), "production.api.v1"); + + let id: Identifier = 
"dynamo://production.api.v1/_component_/gateway".parse().unwrap(); + assert_eq!(id.component_name(), Some("gateway")); + + let id: Identifier = "dynamo://production.api.v1/_component_/gateway/_endpoint_/http" + .parse() + .unwrap(); + assert_eq!(id.endpoint_name(), Some("http")); + } + + #[test] + fn test_identifier_conversions() { + // Create a full endpoint identifier + let full_id = Identifier::new_endpoint("production.api", "gateway", "http").unwrap(); + assert_eq!(full_id.to_string(), "dynamo://production.api/_component_/gateway/_endpoint_/http"); + + // Convert to namespace-only + let ns_id = full_id.to_namespace(); + assert_eq!(ns_id.to_string(), "dynamo://production.api"); + assert_eq!(ns_id.namespace_name(), "production.api"); + assert_eq!(ns_id.component_name(), None); + assert_eq!(ns_id.endpoint_name(), None); + + // Convert to component-only + let comp_id = full_id.to_component().unwrap(); + assert_eq!(comp_id.to_string(), "dynamo://production.api/_component_/gateway"); + assert_eq!(comp_id.namespace_name(), "production.api"); + assert_eq!(comp_id.component_name(), Some("gateway")); + assert_eq!(comp_id.endpoint_name(), None); + + // Test with component-only identifier + let comp_only = Identifier::new_component("ns", "comp").unwrap(); + + let ns_from_comp = comp_only.to_namespace(); + assert_eq!(ns_from_comp.to_string(), "dynamo://ns"); + + let comp_from_comp = comp_only.to_component().unwrap(); + assert_eq!(comp_from_comp.to_string(), "dynamo://ns/_component_/comp"); + assert_eq!(comp_from_comp, comp_only); + + // Test with namespace-only identifier + let ns_only = Identifier::new_namespace("ns").unwrap(); + + let ns_from_ns = ns_only.to_namespace(); + assert_eq!(ns_from_ns.to_string(), "dynamo://ns"); + assert_eq!(ns_from_ns, ns_only); + + // Should return None when trying to get component from namespace-only + assert!(ns_only.to_component().is_none()); + } + + #[test] + fn test_instance_dynamic_and_static_creation_and_parsing() { + // Dynamic 
instance creation + let identifier = Identifier::new_endpoint("ns1", "comp1", "ep1").unwrap(); + let instance = Instance::new(identifier.clone(), 0x1234).unwrap(); + assert_eq!(instance.instance_id(), Some(0x1234)); + assert!(!instance.is_static); + assert_eq!(instance.to_string(), "dynamo://ns1/_component_/comp1/_endpoint_/ep1:1234"); + + // Static instance creation + let static_instance = Instance::new_static(identifier.clone()).unwrap(); + assert_eq!(static_instance.instance_id(), None); + assert!(static_instance.is_static); + assert_eq!(static_instance.to_string(), "dynamo://ns1/_component_/comp1/_endpoint_/ep1/_static_"); + + // Parsing dynamic instance from string + let parsed: Instance = "dynamo://ns1/_component_/comp1/_endpoint_/ep1:1234".parse().unwrap(); + assert_eq!(parsed.instance_id(), Some(0x1234)); + assert!(!parsed.is_static); + assert_eq!(parsed.identifier().namespace_name(), "ns1"); + assert_eq!(parsed.identifier().component_name(), Some("comp1")); + assert_eq!(parsed.identifier().endpoint_name(), Some("ep1")); + + // Parsing static instance from string + let parsed_static: Instance = "dynamo://ns1/_component_/comp1/_endpoint_/ep1/_static_".parse().unwrap(); + assert_eq!(parsed_static.instance_id(), None); + assert!(parsed_static.is_static); + assert_eq!(parsed_static.identifier().namespace_name(), "ns1"); + assert_eq!(parsed_static.identifier().component_name(), Some("comp1")); + assert_eq!(parsed_static.identifier().endpoint_name(), Some("ep1")); + } + + #[test] + fn test_keys_with_path() { + let id = Identifier::new_component("ns1", "comp1").unwrap(); + // _path_ is always auto-inserted + let keys = Keys::from_identifier(id.clone(), vec!["config".to_string()]).unwrap(); + assert_eq!(keys.to_string(), "dynamo://ns1/_component_/comp1/_path_/config"); + + // Multiple segments + let keys2 = Keys::from_identifier(id, vec!["config".to_string(), "v1".to_string()]).unwrap(); + assert_eq!(keys2.to_string(), 
"dynamo://ns1/_component_/comp1/_path_/config/v1"); + } + + #[test] + fn test_lenient_parsing() { + // Valid path with _path_ keyword + let valid_path = "dynamo://ns/_component_/comp/_endpoint_/ep:1234/_path_/extra/data"; + + // Can parse as Identifier (drops instance_id and extra segments) + let as_identifier: Identifier = valid_path.try_into().unwrap(); + assert_eq!(as_identifier.to_string(), "dynamo://ns/_component_/comp/_endpoint_/ep"); + + // Can parse as Instance (drops extra segments) + let as_instance: Instance = valid_path.try_into().unwrap(); + assert_eq!(as_instance.to_string(), "dynamo://ns/_component_/comp/_endpoint_/ep:1234"); + + // Can parse as Keys (preserves everything) + let as_keys: Keys = valid_path.try_into().unwrap(); + assert_eq!(as_keys.to_string(), "dynamo://ns/_component_/comp/_endpoint_/ep:1234/_path_/extra/data"); + + // Invalid paths should fail to parse entirely with InvalidPathSegment + let invalid_paths = vec![ + "dynamo://ns/_component_/comp/_endpoint_/ep:1234/extra/data", // Missing _path_ + "dynamo://ns/_component_/comp/config/v1", // Missing _path_ + "dynamo://ns/some/random/path", // No structure + ]; + + for invalid_path in invalid_paths { + // Should fail with InvalidPathSegment for all types + assert!(matches!( + TryInto::::try_into(invalid_path), + Err(DescriptorError::InvalidPathSegment(_)) + ), "Expected InvalidPathSegment for Identifier parse of '{}'", invalid_path); + + assert!(matches!( + TryInto::::try_into(invalid_path), + Err(DescriptorError::InvalidPathSegment(_)) + ), "Expected InvalidPathSegment for Instance parse of '{}'", invalid_path); + + assert!(matches!( + TryInto::::try_into(invalid_path), + Err(DescriptorError::InvalidPathSegment(_)) + ), "Expected InvalidPathSegment for Keys parse of '{}'", invalid_path); + } + } + + #[test] + fn test_keys_underscore_validation() { + let id = Identifier::new_component("ns", "comp").unwrap(); + + // Test that segments starting with underscore are rejected + 
assert!(Keys::from_identifier(id.clone(), vec!["_invalid".to_string()]).is_err()); + assert!(Keys::from_identifier(id.clone(), vec!["_path_".to_string()]).is_err()); + assert!(Keys::from_identifier(id.clone(), vec!["_barrier_".to_string()]).is_err()); + + // Test valid segments + let keys = Keys::from_identifier(id.clone(), vec!["config".to_string(), "v1".to_string()]).unwrap(); + assert_eq!(keys.to_string(), "dynamo://ns/_component_/comp/_path_/config/v1"); + + // Test parsing paths with _path_ already present + let parsed: Keys = "dynamo://ns/_component_/comp/_path_/config/v1".parse().unwrap(); + assert_eq!(parsed.keys(), &["config", "v1"]); + assert_eq!(parsed.to_string(), "dynamo://ns/_component_/comp/_path_/config/v1"); + + let parsed = Keys::parse("dynamo://ns/_path_/config").unwrap(); + assert_eq!(parsed.keys(), &["config"]); + assert_eq!(parsed.to_string(), "dynamo://ns/_path_/config"); + + + // Test parsing paths without _path_ - should fail + let result: Result = "dynamo://ns/_component_/comp/config/v1".parse(); + assert!(result.is_err()); + assert!(matches!(result, Err(DescriptorError::InvalidPathSegment(_)))); + + // Underscore in middle - allowed + assert!(Keys::from_identifier(id.clone(), vec!["config_v2".to_string()]).is_ok()); + assert!(Keys::from_identifier(id.clone(), vec!["some_file_name".to_string()]).is_ok()); + } + + #[test] + fn test_validation_errors() { + // Invalid prefix + assert!(matches!( + Identifier::parse("invalid://ns1"), + Err(DescriptorError::InvalidPrefix) + )); + + // Empty namespace + assert!(matches!( + Identifier::new_namespace(""), + Err(DescriptorError::EmptyNamespace) + )); + + // Invalid characters in namespace + assert!(matches!( + Identifier::new_namespace("ns!@#"), + Err(DescriptorError::InvalidNamespace(_)) + )); + + // Invalid characters in component + assert!(matches!( + Identifier::new_component("ns1", "comp!@#"), + Err(DescriptorError::InvalidComponent(_)) + )); + + // Invalid characters in endpoint + 
assert!(matches!( + Identifier::new_endpoint("ns1", "comp1", "ep!@#"), + Err(DescriptorError::InvalidEndpoint(_)) + )); + + // Instance without endpoint + let id = Identifier::new_component("ns1", "comp1").unwrap(); + assert!(matches!( + Instance::new(id, 1234), + Err(DescriptorError::InvalidEndpoint(_)) + )); + + // Keys with segments starting with underscore + let id = Identifier::new_component("ns1", "comp1").unwrap(); + assert!(matches!( + Keys::from_identifier(id.clone(), vec!["_invalid".to_string()]), + Err(DescriptorError::InvalidPathSegment(_)) + )); + assert!(matches!( + Keys::from_identifier(id, vec!["valid".to_string(), "_path_".to_string()]), + Err(DescriptorError::InvalidPathSegment(_)) + )); + + // Test colons in various positions (not allowed except for instance IDs) + assert!(matches!( + Identifier::parse("dynamo://ns:invalid"), + Err(DescriptorError::InvalidNamespace(_)) + )); + assert!(matches!( + Identifier::parse("dynamo://ns/_component_/comp:invalid"), + Err(DescriptorError::InvalidComponent(_)) + )); + assert!(matches!( + Identifier::parse("dynamo://ns/_component_/_invali:d"), + Err(DescriptorError::InvalidComponent(_)) + )); + + } + + #[test] + fn test_invalid_path_formats() { + // These paths are invalid and should be rejected with InvalidPathSegment errors + let invalid_paths = vec![ + "dynamo://ns/_component_/comp/config/v1", // Missing _path_ keyword + "dynamo://ns/_component_/comp/_endpoint_/ep/extra", // Extra segment after endpoint + "dynamo://ns/random/path/segments", // Random segments without structure + ]; + + for path in invalid_paths { + assert!(matches!( + Identifier::parse(path), + Err(DescriptorError::InvalidPathSegment(_)) + ), "Expected InvalidPathSegment for Identifier::parse({})", path); + + assert!(matches!( + Instance::parse(path), + Err(DescriptorError::InvalidPathSegment(_)) + ), "Expected InvalidPathSegment for Instance::parse({})", path); + + assert!(matches!( + Keys::parse(path), + 
Err(DescriptorError::InvalidPathSegment(_)) + ), "Expected InvalidPathSegment for Keys::parse({})", path); + } + + // Valid formats should parse correctly + let valid1 = "dynamo://ns/_component_/comp/_path_/config/v1"; + assert!(Keys::parse(valid1).is_ok()); + + let valid2 = "dynamo://ns/_component_/comp/_endpoint_/ep/_path_/extra"; + assert!(Keys::parse(valid2).is_ok()); + } + + #[test] + fn test_out_of_order_reserved_keywords() { + assert!(matches!( + Identifier::parse("dynamo://ns/_endpoint_/ep/_component_/comp"), + Err(DescriptorError::EndpointWithoutComponent) + )); + + assert!(matches!( + Identifier::parse("dynamo://ns/_path_/config/_component_/comp"), + Err(DescriptorError::ReservedKeyword(_)) + )); + + assert!(matches!( + Identifier::parse("dynamo://ns/_component_/comp1/_component_/comp2"), + Err(DescriptorError::InvalidPathSegment(_)) + )); + + assert!(matches!( + Identifier::parse("dynamo://ns/_component_/comp/_endpoint_/ep1/_endpoint_/ep2"), + Err(DescriptorError::InvalidPathSegment(_)) + )); + + assert!(matches!( + Identifier::parse("dynamo://ns/_component_/comp/_path_/p1/_path_/p2"), + Err(DescriptorError::ReservedKeyword(_)) + )); + + // Endpoint without component - specific error + assert!(matches!( + Identifier::parse("dynamo://ns/_endpoint_/ep"), + Err(DescriptorError::EndpointWithoutComponent) + )); + + // Component name starting with underscore - validation error + assert!(matches!( + Identifier::parse("dynamo://ns/_component_/_invalid"), + Err(DescriptorError::InvalidComponent(_)) + )); + + // Missing component name + assert!(matches!( + Identifier::parse("dynamo://ns/_component_"), + Err(DescriptorError::EmptyComponent) + )); + + // Missing endpoint name + assert!(matches!( + Identifier::parse("dynamo://ns/_component_/comp/_endpoint_"), + Err(DescriptorError::EmptyEndpoint) + )); + } +} diff --git a/lib/runtime/src/discovery.rs b/lib/runtime/src/discovery.rs index a68c046e91..39b6585277 100644 --- a/lib/runtime/src/discovery.rs +++ 
b/lib/runtime/src/discovery.rs @@ -13,74 +13,490 @@ // See the License for the specific language governing permissions and // limitations under the License. -use crate::{transports::etcd, Result}; +//! Service discovery and coordination primitives for distributed component management. +//! +//! This module provides the bridge between Dynamo's entity layer and the underlying etcd +//! transport, enabling entities to perform distributed coordination operations through +//! a standardized interface. +//! +//! # Architecture Overview +//! +//! The discovery system consists of two primary components: +//! +//! - [`Storage`]: A scoped handle for etcd operations on a specific key +//! - [`DiscoveryClient`]: A trait that entities implement to access their storage +//! +//! Together, these components enable entities to: +//! - Register themselves in the distributed system +//! - Discover other components and services +//! - Coordinate through atomic operations and leases +//! - Watch for changes in the system state +//! +//! # Key Design Principles +//! +//! 1. **Scoped Operations**: Each `Storage` instance is scoped to a specific etcd key +//! 2. **Lifetime Management**: Storage borrows the etcd client, ensuring safe access +//! 3. **Consistent Interface**: All entities use the same discovery patterns +//! 4. **Atomic Primitives**: Support for atomic creates, compare-and-swap operations +//! +//! # Storage Operations +//! +//! The `Storage` type provides comprehensive etcd operations: +//! +//! ```ignore +//! // Get storage for an entity +//! let storage = entity.storage()?; +//! +//! // Atomic operations +//! storage.create(data, lease_id).await?; // Fails if exists +//! storage.create_or_validate(data, lease_id).await?; // Validates if exists +//! +//! // Standard CRUD +//! storage.put(data, lease_id).await?; // Create or update +//! let values = storage.get().await?; // Retrieve by key +//! let all = storage.get_prefix().await?; // Get all with prefix +//! 
storage.delete(None).await?; // Delete +//! +//! // Lease management +//! let lease = storage.create_lease(ttl).await?; // Create time-bound lease +//! storage.revoke_lease(lease.id()).await?; // Revoke early +//! +//! // Watch for changes +//! let watcher = storage.watch_prefix().await?; // Watch prefix for updates +//! ``` +//! +//! # Integration with Entities +//! +//! All entities (Namespace, Component, Endpoint, Path) implement `DiscoveryClient`: +//! +//! ```ignore +//! use dynamo::runtime::discovery::DiscoveryClient; +//! +//! // Any entity can use discovery operations +//! let component = drt.component("prod", "api")?; +//! let storage = component.storage()?; +//! +//! // Register component with lease +//! let lease = storage.create_lease(30).await?; +//! storage.create(b"component_data".to_vec(), Some(lease.id())).await?; +//! +//! // Discover other components +//! let namespace = drt.namespace("prod")?; +//! let ns_storage = namespace.storage()?; +//! let components = ns_storage.get_prefix().await?; +//! ``` +//! +//! # Leases and Ephemeral Data +//! +//! Leases enable automatic cleanup of data when components disconnect: +//! +//! ```ignore +//! // Create ephemeral endpoint registration +//! let endpoint = drt.endpoint("prod", "api", "grpc")?; +//! let storage = endpoint.storage()?; +//! +//! // Data will be automatically removed if lease expires +//! let lease = storage.create_lease(60).await?; // 60 second TTL +//! storage.put(endpoint_metadata, Some(lease.id())).await?; +//! +//! // Lease is automatically kept alive with heartbeats until dropped +//! ``` +//! +//! # Error Handling +//! +//! The `storage()` method returns `Result` to handle cases where: +//! - etcd client is not available (e.g., running without discovery) +//! - Network connectivity issues +//! - Configuration problems +//! +//! ```ignore +//! match entity.storage() { +//! Ok(storage) => { +//! // Perform discovery operations +//! storage.put(data, None).await?; +//! } +//! 
Err(_) => { +//! // Handle offline mode or fallback behavior +//! eprintln!("Discovery unavailable, running in standalone mode"); +//! } +//! } +//! ``` -pub use etcd::Lease; +use crate::{transports::etcd, Result, DistributedRuntime}; -pub struct DiscoveryClient { - namespace: String, - etcd_client: etcd::Client, +pub use etcd::{Lease, PrefixWatcher}; +pub use etcd_client::{GetResponse, GetOptions, PutOptions, DeleteOptions, KeyValue}; + +/// Storage handle that provides etcd operations scoped to a specific key +/// We are borrowing client from distributed runtime, hence the lifetime parameter +pub struct Storage<'a> { + client: &'a etcd::Client, + key: String, } -impl DiscoveryClient { - /// Create a new [`DiscoveryClient`] - /// - /// This will establish a connection to the etcd server, create a primary lease, - /// and spawn a task to keep the lease alive and tie the lifetime of the [`Runtime`] - /// to the lease. - /// - /// If the lease expires, the [`Runtime`] will be shutdown. - /// If the [`Runtime`] is shutdown, the lease will be revoked. 
- pub(crate) fn new(namespace: String, etcd_client: etcd::Client) -> Self { - DiscoveryClient { - namespace, - etcd_client, +impl<'a> Storage<'a> { + + pub fn primary_lease(&self) -> Lease { + self.client.primary_lease() + } + /// Create a new lease with specified TTL + pub async fn create_lease(&self, ttl: i64) -> Result { + self.client.create_lease(ttl).await + } + + /// Revoke a lease + pub async fn revoke_lease(&self, lease_id: i64) -> Result<()> { + self.client.revoke_lease(lease_id).await + } + + /// Atomically create only if key doesn't exist + pub async fn create(&self, value: Vec, lease_id: Option) -> Result<()> { + self.client.kv_create(self.key.clone(), value, lease_id).await + } + + /// Create or validate existing value matches + pub async fn create_or_validate(&self, value: Vec, lease_id: Option) -> Result<()> { + self.client.kv_create_or_validate(self.key.clone(), value, lease_id).await + } + + /// Put a value (create or overwrite) + pub async fn put(&self, value: Vec, lease_id: Option) -> Result<()> { + self.client.kv_put(&self.key, value, lease_id).await + } + + /// Put with custom options + pub async fn put_with_options(&self, value: Vec, options: Option) -> Result { + self.client.kv_put_with_options(&self.key, value, options).await + } + + /// Get by exact key + pub async fn get(&self) -> Result> { + self.client.kv_get(self.key.as_bytes(), None).await + } + + /// Get with options + pub async fn get_with_options(&self, options: Option) -> Result> { + self.client.kv_get(self.key.as_bytes(), options).await + } + + /// Get all with prefix + pub async fn get_prefix(&self) -> Result> { + self.client.kv_get_prefix(&self.key).await + } + + /// Delete and return count + pub async fn delete(&self, options: Option) -> Result { + self.client.kv_delete(self.key.as_bytes(), options).await + } + + /// Get and watch prefix + pub async fn watch_prefix(&self) -> Result { + self.client.kv_get_and_watch_prefix(&self.key).await + } + + /// Get the key this storage is 
scoped to + pub fn key(&self) -> &str { + &self.key + } + + /// Public constructor so other modules can create a `Storage` scoped to a custom key. + pub fn new(client: &'a etcd::Client, key: String) -> Self { + Self { client, key } + } +} + +/// Minimal trait for entities that have etcd storage +pub trait DiscoveryClient: crate::traits::DistributedRuntimeProvider { + /// Get the etcd key for this entity + fn etcd_key(&self) -> String; + + /// Get a storage handle for this entity's etcd operations + fn storage(&self) -> Result { + let client = self.drt() + .etcd_client_internal() + .ok_or_else(|| anyhow::anyhow!("etcd client not available"))?; + + Ok(Storage { + client, + key: self.etcd_key(), + }) + } +} + +// the following two commented out codes are not implemented, but are placeholders for proposed ectd usage patterns + +// /// Create an ephemeral key/value pair tied to a lease_id. +// /// This is an atomic create. If the key already exists, this will fail. +// /// The [`etcd_client::KeyValue`] will be removed when the lease expires or is revoked. +// pub async fn create_ephemerial_key(&self, key: &str, value: &str, lease_id: i64) -> Result<()> { +// // self.etcd_client.create_ephemeral_key(key, value, lease_id).await +// unimplemented!() +// } + +// /// Create a shared [`etcd_client::KeyValue`] which behaves similar to a C++ `std::shared_ptr` or a +// /// Rust [std::sync::Arc]. Instead of having one owner of the lease, multiple owners participate in +// /// maintaining the lease. In this manner, when the last member of the group sharing the lease is gone, +// /// the lease will be expired. +// /// +// /// Implementation notes: At the time of writing, it is unclear if we have atomics that control leases, +// /// so in our initial implementation, the last member of the group will not revoke the lease, so the object +// /// will live for upto the TTL after the last member is gone. 
+// /// +// /// Notes +// /// ----- +// /// +// /// - Multiple members sharing the lease and contributing to the heartbeat might cause some overheads. +// /// The implementation will try to randomize the heartbeat intervals to avoid thundering herd problem, +// /// and with any luck, the heartbeat watchers will be able to detect when if a external member triggered +// /// the heartbeat checking this interval and skip unnecessary heartbeat messages. +// /// +// /// A new lease will be created for this object. If you wish to add an object to a shared group s +// /// +// /// The [`etcd_client::KeyValue`] will be removed when the lease expires or is revoked. +// pub async fn create_shared_key(&self, key: &str, value: &str, lease_id: i64) -> Result<()> { +// // self.etcd_client.create_ephemeral_key(key, value, lease_id).await +// unimplemented!() +// } + +#[cfg(test)] +mod tests { + use super::*; + use crate::{Runtime, DistributedRuntime, distributed::DistributedConfig}; + use tokio; + + // Test entity that implements DiscoveryClient + struct TestEntity { + key: String, + drt: DistributedRuntime, + } + + impl crate::traits::DistributedRuntimeProvider for TestEntity { + fn drt(&self) -> &DistributedRuntime { + &self.drt } } - /// Get the primary lease ID - pub fn primary_lease_id(&self) -> i64 { - self.etcd_client.lease_id() + impl DiscoveryClient for TestEntity { + fn etcd_key(&self) -> String { + self.key.clone() + } } - /// Create a [`Lease`] with a given time-to-live (TTL). - /// This [`Lease`] will be tied to the [`crate::Runtime`], but has its own independent [`crate::CancellationToken`]. - pub async fn create_lease(&self, ttl: i64) -> Result { - self.etcd_client.create_lease(ttl).await - } - - // the following two commented out codes are not implemented, but are placeholders for proposed ectd usage patterns - - // /// Create an ephemeral key/value pair tied to a lease_id. - // /// This is an atomic create. If the key already exists, this will fail. 
- // /// The [`etcd_client::KeyValue`] will be removed when the lease expires or is revoked. - // pub async fn create_ephemerial_key(&self, key: &str, value: &str, lease_id: i64) -> Result<()> { - // // self.etcd_client.create_ephemeral_key(key, value, lease_id).await - // unimplemented!() - // } - - // /// Create a shared [`etcd_client::KeyValue`] which behaves similar to a C++ `std::shared_ptr` or a - // /// Rust [std::sync::Arc]. Instead of having one owner of the lease, multiple owners participate in - // /// maintaining the lease. In this manner, when the last member of the group sharing the lease is gone, - // /// the lease will be expired. - // /// - // /// Implementation notes: At the time of writing, it is unclear if we have atomics that control leases, - // /// so in our initial implementation, the last member of the group will not revoke the lease, so the object - // /// will live for upto the TTL after the last member is gone. - // /// - // /// Notes - // /// ----- - // /// - // /// - Multiple members sharing the lease and contributing to the heartbeat might cause some overheads. - // /// The implementation will try to randomize the heartbeat intervals to avoid thundering herd problem, - // /// and with any luck, the heartbeat watchers will be able to detect when if a external member triggered - // /// the heartbeat checking this interval and skip unnecessary heartbeat messages. - // /// - // /// A new lease will be created for this object. If you wish to add an object to a shared group s - // /// - // /// The [`etcd_client::KeyValue`] will be removed when the lease expires or is revoked. 
- // pub async fn create_shared_key(&self, key: &str, value: &str, lease_id: i64) -> Result<()> { - // // self.etcd_client.create_ephemeral_key(key, value, lease_id).await - // unimplemented!() - // } + // Helper to check if ETCD is available + async fn is_etcd_available() -> bool { + // Try to connect to default ETCD endpoint + etcd_client::Client::connect(["localhost:2379"], None).await.is_ok() + } + + // Helper to create test runtime with ETCD + async fn create_test_runtime() -> Result { + let runtime = Runtime::from_current()?; + let mut config = DistributedConfig::from_settings(false); + // Ensure we're using localhost:2379 for tests + config.etcd_config.etcd_url = vec!["http://localhost:2379".to_string()]; + DistributedRuntime::new(runtime, config).await + } + + #[tokio::test] + async fn test_storage_put_get() -> Result<()> { + if !is_etcd_available().await { + eprintln!("Skipping test: ETCD not available"); + return Ok(()); + } + + let drt = create_test_runtime().await?; + let entity = TestEntity { + key: format!("test/storage/{}", uuid::Uuid::new_v4()), + drt, + }; + + let storage = entity.storage()?; + + // Test put and get + let test_data = b"hello world".to_vec(); + storage.put(test_data.clone(), None).await?; + + let values = storage.get().await?; + assert_eq!(values.len(), 1); + assert_eq!(values[0].value(), &test_data); + + // Cleanup + storage.delete(None).await?; + + Ok(()) + } + + #[tokio::test] + async fn test_storage_create_atomic() -> Result<()> { + if !is_etcd_available().await { + eprintln!("Skipping test: ETCD not available"); + return Ok(()); + } + + let drt = create_test_runtime().await?; + let entity = TestEntity { + key: format!("test/storage/{}", uuid::Uuid::new_v4()), + drt, + }; + + let storage = entity.storage()?; + + // First create should succeed + let test_data = b"first".to_vec(); + storage.create(test_data.clone(), None).await?; + + // Second create should fail (key exists) + let result = storage.create(b"second".to_vec(), 
None).await; + assert!(result.is_err()); + + // Verify original value unchanged + let values = storage.get().await?; + assert_eq!(values[0].value(), &test_data); + + // Cleanup + storage.delete(None).await?; + + Ok(()) + } + + #[tokio::test] + async fn test_storage_create_or_validate() -> Result<()> { + if !is_etcd_available().await { + eprintln!("Skipping test: ETCD not available"); + return Ok(()); + } + + let drt = create_test_runtime().await?; + let entity = TestEntity { + key: format!("test/storage/{}", uuid::Uuid::new_v4()), + drt, + }; + + let storage = entity.storage()?; + let test_data = b"consistent".to_vec(); + + // First call creates + storage.create_or_validate(test_data.clone(), None).await?; + + // Second call with same data succeeds + storage.create_or_validate(test_data.clone(), None).await?; + + // Call with different data fails + let result = storage.create_or_validate(b"different".to_vec(), None).await; + assert!(result.is_err()); + + // Cleanup + storage.delete(None).await?; + + Ok(()) + } + + #[tokio::test] + async fn test_storage_lease_operations() -> Result<()> { + if !is_etcd_available().await { + eprintln!("Skipping test: ETCD not available"); + return Ok(()); + } + + let drt = create_test_runtime().await?; + let entity = TestEntity { + key: format!("test/storage/{}", uuid::Uuid::new_v4()), + drt, + }; + + let storage = entity.storage()?; + + // Create a lease + let lease = storage.create_lease(5).await?; // 5 second TTL + + // Put with lease + storage.put(b"leased_data".to_vec(), Some(lease.id())).await?; + + // Verify data exists + let values = storage.get().await?; + assert_eq!(values.len(), 1); + + // Revoke lease (should delete the key) + storage.revoke_lease(lease.id()).await?; + + // Give etcd a moment to process + tokio::time::sleep(tokio::time::Duration::from_millis(100)).await; + + // Verify data is gone + let values = storage.get().await?; + assert_eq!(values.len(), 0); + + Ok(()) + } + + #[tokio::test] + async fn 
test_storage_prefix_operations() -> Result<()> { + if !is_etcd_available().await { + eprintln!("Skipping test: ETCD not available"); + return Ok(()); + } + + let drt = create_test_runtime().await?; + let prefix = format!("test/prefix/{}", uuid::Uuid::new_v4()); + + // Create multiple entities with same prefix + let entities: Vec = (0..3) + .map(|i| TestEntity { + key: format!("{}/item_{}", prefix, i), + drt: drt.clone(), + }) + .collect(); + + // Put data for each entity + for (i, entity) in entities.iter().enumerate() { + let storage = entity.storage()?; + storage.put(format!("data_{}", i).into_bytes(), None).await?; + } + + // Use first entity to get all with prefix + let base_entity = TestEntity { + key: prefix.clone(), + drt: drt.clone(), + }; + let storage = base_entity.storage()?; + let values = storage.get_prefix().await?; + + assert_eq!(values.len(), 3); + + // Cleanup + for entity in &entities { + entity.storage()?.delete(None).await?; + } + + Ok(()) + } + + #[tokio::test] + async fn test_storage_delete_operations() -> Result<()> { + if !is_etcd_available().await { + eprintln!("Skipping test: ETCD not available"); + return Ok(()); + } + + let drt = create_test_runtime().await?; + let entity = TestEntity { + key: format!("test/storage/{}", uuid::Uuid::new_v4()), + drt, + }; + + let storage = entity.storage()?; + + // Put some data + storage.put(b"to_delete".to_vec(), None).await?; + + // Delete and check count + let count = storage.delete(None).await?; + assert_eq!(count, 1); + + // Delete again should return 0 + let count = storage.delete(None).await?; + assert_eq!(count, 0); + + Ok(()) + } } diff --git a/lib/runtime/src/distributed.rs b/lib/runtime/src/distributed.rs index 96a42bf566..0e738b514f 100644 --- a/lib/runtime/src/distributed.rs +++ b/lib/runtime/src/distributed.rs @@ -13,9 +13,9 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-pub use crate::component::Component; use crate::{ - component::{self, ComponentBuilder, Endpoint, InstanceSource, Namespace}, + descriptor::{Instance, Identifier}, + entity::{Component, Endpoint, Namespace, InstanceSource, Registry}, discovery::DiscoveryClient, service::ServiceClient, transports::{etcd, nats, tcp}, @@ -30,6 +30,7 @@ use std::collections::HashMap; use tokio::sync::Mutex; use tokio_util::sync::CancellationToken; + impl DistributedRuntime { pub async fn new(runtime: Runtime, config: DistributedConfig) -> Result { let secondary = runtime.secondary(); @@ -70,7 +71,7 @@ impl DistributedRuntime { etcd_client, nats_client, tcp_server: Arc::new(OnceCell::new()), - component_registry: component::Registry::new(), + component_registry: Registry::new(), is_static, instance_sources: Arc::new(Mutex::new(HashMap::new())), }) @@ -106,9 +107,9 @@ impl DistributedRuntime { } /// Create a [`Namespace`] - pub fn namespace(&self, name: impl Into) -> Result { - Namespace::new(self.clone(), name.into(), self.is_static) - } + // pub fn namespace(&self, name: impl Into) -> Result { + // Namespace::new(self.clone(), name.into(), self.is_static) + // } // /// Create a [`Component`] // pub fn component( @@ -122,14 +123,14 @@ impl DistributedRuntime { // .build()?) 
// } - pub(crate) fn discovery_client(&self, namespace: impl Into) -> DiscoveryClient { - DiscoveryClient::new( - namespace.into(), - self.etcd_client - .clone() - .expect("Attempt to get discovery_client on static DistributedRuntime"), - ) - } + // pub(crate) fn discovery_client(&self, namespace: impl Into) -> DiscoveryClient { + // DiscoveryClient::new( + // namespace.into(), + // self.etcd_client + // .clone() + // .expect("Attempt to get discovery_client on static DistributedRuntime"), + // ) + // } pub(crate) fn service_client(&self) -> ServiceClient { ServiceClient::new(self.nats_client.clone()) @@ -151,7 +152,16 @@ impl DistributedRuntime { self.nats_client.clone() } - // todo(ryan): deprecate this as we move to Discovery traits and Component Identifiers + /// Internal method for accessing etcd client. Only available within this crate. + /// This is used by the Discovery trait implementations. + pub(crate) fn etcd_client_internal(&self) -> Option<&etcd::Client> { + self.etcd_client.as_ref() + } + + #[deprecated( + since = "0.3.0", + note = "Use discovery traits on entities to read and write from etcd." + )] pub fn etcd_client(&self) -> Option { self.etcd_client.clone() } @@ -160,7 +170,7 @@ impl DistributedRuntime { self.runtime.child_token() } - pub fn instance_sources(&self) -> Arc>>> { + pub fn instance_sources(&self) -> Arc>>> { self.instance_sources.clone() } } diff --git a/lib/runtime/src/entity.rs b/lib/runtime/src/entity.rs new file mode 100644 index 0000000000..5c2e67015d --- /dev/null +++ b/lib/runtime/src/entity.rs @@ -0,0 +1,935 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 + +//! Operational entities that provide distributed runtime capabilities for descriptors. +//! +//! This module implements the entity layer of Dynamo's two-tier architecture for +//! distributed component management. Entities wrap descriptors with a `DistributedRuntime` +//! 
handle to enable actual etcd operations and distributed coordination. +//! +//! # Architecture Overview +//! +//! The entity system mirrors the descriptor hierarchy with operational types: +//! +//! - [`Namespace`]: Operational namespace with discovery and hierarchical navigation +//! - [`Component`]: Operational component that can host endpoints and store data +//! - [`Endpoint`]: Operational endpoint with automatic instance ID management +//! - [`Path`]: Operational extended path for arbitrary data storage +//! +//! All entities: +//! - Embed a `DistributedRuntime` for etcd operations +//! - Implement `DiscoveryClient` for standardized storage access +//! - Provide navigation methods for traversing the hierarchy +//! - Can be created from descriptors via the `ToEntity` trait +//! +//! # Key Design Principles +//! +//! 1. **Separation of Concerns**: Descriptors handle data representation, entities handle operations +//! 2. **Immutable Descriptors**: Entities wrap immutable descriptors, never modify them +//! 3. **Factory Pattern**: Descriptors convert to entities via `ToEntity::to_entity()` +//! 4. **Navigation**: Entities provide methods to traverse the component hierarchy +//! +//! # Usage Examples +//! +//! ```ignore +//! use dynamo_runtime::{entity::{EntityChain, ToEntity}, DistributedRuntime}; +//! use dynamo_runtime::descriptor::Identifier; +//! use dynamo_runtime::discovery::DiscoveryClient; +//! +//! // Create entities using the fluent API +//! let ns = drt.namespace("production")?; +//! let comp = ns.component("gateway")?; +//! let ep = comp.endpoint("http")?; +//! let path = ep.path(&["v1", "config"])?; +//! +//! // Convert descriptors to entities +//! let id = Identifier::new_component("prod", "api")?; +//! let entity = id.to_entity(drt.clone())?; +//! +//! // Use DiscoveryClient for etcd operations +//! let storage = comp.storage()?; +//! storage.put(b"config_data".to_vec(), None).await?; +//! let values = storage.get().await?; +//! ``` +//! +//! 
# Navigation and Chaining +//! +//! Entities support fluent navigation through the component hierarchy: +//! +//! ```ignore +//! // Start from runtime and chain down +//! let endpoint = drt.namespace("prod")? +//! .component("api")? +//! .endpoint("grpc")?; +//! +//! // Navigate from endpoint to extended paths +//! let metrics = endpoint.path(&["metrics", "cpu"])?; +//! +//! // Navigate up with parent() methods (both return an `Option`) +//! let parent_path = metrics.parent()?; // Some(Path for "metrics"); None once no segments remain +//! let parent_ns = ns.parent()?; // Some(parent namespace); None for a single-segment namespace +//! ``` +//! +//! # Integration with DiscoveryClient +//! +//! All entities implement `DiscoveryClient`, providing standardized etcd operations: +//! +//! ```ignore +//! // Every entity can access its storage +//! let storage = entity.storage()?; +//! +//! // Perform etcd operations scoped to the entity's path +//! storage.create(data, lease_id).await?; // Atomic create +//! storage.put(data, lease_id).await?; // Create or update +//! storage.get().await?; // Retrieve +//! storage.delete(None).await?; // Delete +//! storage.watch_prefix().await?; // Watch for changes +//! 
``` + +use crate::{ + descriptor::{ + Identifier, Instance, Keys, KeysBase, DescriptorError, + BARRIER_KEYWORD, COMPONENT_KEYWORD, ENDPOINT_KEYWORD, PATH_KEYWORD, + }, + DistributedRuntime, +}; +use crate::traits::{DistributedRuntimeProvider, RuntimeProvider}; +use std::fmt; +use crate::discovery::{DiscoveryClient, Storage}; + +use crate::{discovery::Lease, service::ServiceSet, transports::etcd::EtcdPath}; +use super::{ + error, + traits::*, + transports::nats::Slug, + utils::Duration, + Result, Runtime, +}; +pub use client::{Client, InstanceSource}; + +use crate::pipeline::network::{ingress::push_endpoint::PushEndpoint, PushWorkHandler}; +use async_nats::{ + rustls::quic, + service::{Service, ServiceExt}, +}; +use derive_builder::Builder; +use derive_getters::Getters; +use educe::Educe; +use serde::{Deserialize, Serialize}; +use service::EndpointStatsHandler; +use std::{collections::HashMap, hash::Hash, sync::Arc}; +use validator::{Validate, ValidationError}; + +mod client; +#[allow(clippy::module_inception)] +mod component; +mod endpoint; +mod namespace; +mod registry; +pub mod service; + +#[derive(Debug, thiserror::Error)] +pub enum EntityError { + #[error("Invalid descriptor: {0}")] + InvalidDescriptor(&'static str), + #[error("Descriptor error: {0}")] + DescriptorError(#[from] DescriptorError), +} + +/// Factory trait for creating entities from descriptors +pub trait ToEntity { + type Entity; + + fn to_entity(self, runtime: DistributedRuntime) -> Result; +} + +/// Operational namespace with distributed runtime +#[derive(Clone, Educe)] +#[educe(Debug)] +pub struct Namespace { + descriptor: Identifier, // Always namespace-only + + #[educe(Debug(ignore))] + runtime: DistributedRuntime, +} + +impl Namespace { + pub fn from_descriptor(descriptor: Identifier, runtime: DistributedRuntime) -> Result { + Ok(Self { descriptor: descriptor.to_namespace(), runtime }) + } + + pub fn new(namespace: &str, runtime: DistributedRuntime) -> Result { + let descriptor = 
Identifier::new_namespace(namespace)?; + Self::from_descriptor(descriptor, runtime) + } + + pub fn to_descriptor(&self) -> Identifier { + self.descriptor.clone() + } + + pub fn segments(&self) -> Vec<&str> { + self.descriptor.namespace_name().split('.').collect() + } + + /// Get parent namespace if not root + pub fn parent(&self) -> Option { + let segments = self.segments(); + if segments.len() <= 1 { + return None; + } + + let parent_path = segments[..segments.len()-1].join("."); + Namespace::new(&parent_path, self.runtime.clone()).ok() + } + + /// Create child namespace + pub fn child(&self, name: &str) -> Result { + let child_path = format!("{}.{}", self.descriptor.namespace_name(), name); + Namespace::new(&child_path, self.runtime.clone()) + } + + /// Chain to create a component + pub fn component(&self, name: &str) -> Result { + Component::new(self.descriptor.namespace_name(), name, self.runtime.clone()) + } + + /// Chain to create a path + pub fn path(&self, segments: &[&str]) -> Result { + let keys = Keys::from_identifier( + self.descriptor.clone(), + segments.iter().map(|s| s.to_string()).collect() + )?; + Path::from_descriptor(keys, self.runtime.clone()) + } +} + +impl DistributedRuntimeProvider for Namespace { + fn drt(&self) -> &DistributedRuntime { + &self.runtime + } +} + +impl RuntimeProvider for Namespace { + fn rt(&self) -> &crate::Runtime { + self.runtime.rt() + } +} + +impl fmt::Display for Namespace { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}", self.descriptor) + } +} + +impl DiscoveryClient for Namespace { + fn etcd_key(&self) -> String { + self.to_string() + } +} + +/// Operational component with distributed runtime +#[derive(Clone, Educe)] +#[educe(Debug)] +pub struct Component { + descriptor: Identifier, // Must have component + + #[educe(Debug(ignore))] + runtime: DistributedRuntime, +} + +impl Component { + pub fn from_descriptor(descriptor: Identifier, runtime: DistributedRuntime) -> Result { + if 
descriptor.component_name().is_none() { + return Err(EntityError::InvalidDescriptor("Descriptor must have component")); + } + Ok(Self { descriptor: descriptor.to_component().unwrap(), runtime }) + } + + pub fn new(namespace: &str, component: &str, runtime: DistributedRuntime) -> Result { + let descriptor = Identifier::new_component(namespace, component)?; + Self::from_descriptor(descriptor, runtime) + } + + pub fn to_descriptor(&self) -> Identifier { + self.descriptor.clone() + } + + /// Chain to create an endpoint + pub fn endpoint(&self, name: &str) -> Result { + Endpoint::new( + self.descriptor.namespace_name(), + self.descriptor.component_name().unwrap(), + name, + self.runtime.clone() + ) + } + + /// Get the parent namespace of this component + /// This is guaranteed to succeed since a component must have a namespace + pub fn namespace(&self) -> Namespace { + let namespace_id = self.descriptor.to_namespace(); + + Namespace::from_descriptor(namespace_id, self.runtime.clone()) + .expect("Valid namespace identifier") // Safe since to_namespace() always returns valid + } + + /// Chain to create a path + pub fn path(&self, segments: &[&str]) -> Result { + let keys = Keys::from_identifier( + self.descriptor.clone(), + segments.iter().map(|s| s.to_string()).collect() + )?; + Path::from_descriptor(keys, self.runtime.clone()) + } + + pub async fn scrape_stats(&self, timeout: Duration) -> Result { + let service_name = self.to_descriptor().slug().to_string(); + let service_client = self.drt().service_client(); + service_client + .collect_services(&service_name, timeout) + .await + } + + pub fn service_builder(&self) -> service::ServiceConfigBuilder { + service::ServiceConfigBuilder::from_component(self.clone()) + } +} + +impl DistributedRuntimeProvider for Component { + fn drt(&self) -> &DistributedRuntime { + &self.runtime + } +} + +impl RuntimeProvider for Component { + fn rt(&self) -> &crate::Runtime { + self.runtime.rt() + } +} + +impl fmt::Display for Component { + 
fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}", self.descriptor) + } +} + +impl DiscoveryClient for Component { + fn etcd_key(&self) -> String { + self.to_string() + } +} + +/// Operational endpoint with distributed runtime +#[derive(Clone, Educe)] +#[educe(Debug)] +pub struct Endpoint { + descriptor: Instance, + + #[educe(Debug(ignore))] + runtime: DistributedRuntime, +} + +impl Endpoint { + pub fn from_identifier(descriptor: Identifier, runtime: DistributedRuntime) -> Result { + if descriptor.endpoint_name().is_none() { + return Err(EntityError::InvalidDescriptor("Descriptor must have endpoint")); + } + let instance = if let Some(lease) = runtime.primary_lease() { + Instance::new(descriptor, lease.id())? + } else { + Instance::new_static(descriptor)? + }; + Ok(Self { + descriptor: instance, + runtime + }) + } + + pub fn from_instance(instance: Instance, runtime: DistributedRuntime) -> Result { + Ok(Self { + descriptor: instance, + runtime, + }) + } + + pub fn new(namespace: &str, component: &str, endpoint: &str, runtime: DistributedRuntime) -> Result { + let descriptor = Identifier::new_endpoint(namespace, component, endpoint)?; + Self::from_identifier(descriptor, runtime) + } + + pub fn to_descriptor(&self) -> Instance { + self.descriptor.clone() + } + + pub fn instance_id(&self) -> Option { + self.descriptor.instance_id() + } + + /// Chain to create a path + pub fn path(&self, segments: &[&str]) -> Result { + let keys = Keys::from_instance( + self.descriptor.clone(), + segments.iter().map(|s| s.to_string()).collect() + )?; + Path::from_descriptor(keys, self.runtime.clone()) + } + + /// Get the parent component of this endpoint + /// This is guaranteed to succeed since an endpoint must have a component + pub fn component(&self) -> Component { + let component_id = self.descriptor.identifier().to_component() + .expect("Endpoint must have a component"); // Safe since endpoint requires component + + 
Component::from_descriptor(component_id, self.runtime.clone()) + .expect("Valid component identifier") // Safe since we got it from to_component() + } + + /// Get the namespace of this endpoint + /// This is guaranteed to succeed since an endpoint must have a namespace + pub fn namespace(&self) -> Namespace { + let namespace_id = self.descriptor.identifier().to_namespace(); + + Namespace::from_descriptor(namespace_id, self.runtime.clone()) + .expect("Valid namespace identifier") // Safe since to_namespace() always returns valid + } + + pub async fn client(&self) -> Result { + client::Client::new(self.clone()).await + } + + pub fn discovery_storage(&self) -> Result { + let client = self.drt() + .etcd_client_internal() + .ok_or_else(|| anyhow::anyhow!("etcd client not available"))?; + + Ok(Storage::new( + client, + self.to_descriptor().identifier().to_string(), + )) + } + + pub fn endpoint_builder(&self) -> endpoint::EndpointConfigBuilder { + endpoint::EndpointConfigBuilder::from_endpoint(self.clone()) + } +} + +impl DistributedRuntimeProvider for Endpoint { + fn drt(&self) -> &DistributedRuntime { + &self.runtime + } +} + +impl RuntimeProvider for Endpoint { + fn rt(&self) -> &crate::Runtime { + self.runtime.rt() + } +} + +impl fmt::Display for Endpoint { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}", self.descriptor) + } +} + +impl DiscoveryClient for Endpoint { + fn etcd_key(&self) -> String { + self.to_string() + } +} + +/// Operational path with extended segments and distributed runtime +#[derive(Clone, Educe)] +#[educe(Debug)] +pub struct Path { + descriptor: Keys, + + #[educe(Debug(ignore))] + runtime: DistributedRuntime, +} + +impl Path { + /// Create from Keys descriptor + pub fn from_descriptor(keys: Keys, runtime: DistributedRuntime) -> Result { + Ok(Self { descriptor: keys, runtime }) + } + + /// Convert back to Keys descriptor + pub fn to_descriptor(&self) -> Keys { + self.descriptor.clone() + } + + /// Get base entity 
(Namespace, Component, or Endpoint) + pub fn base_entity(&self) -> BaseEntity { + match self.descriptor.base() { + KeysBase::Identifier(id) => { + if id.endpoint_name().is_some() { + BaseEntity::Endpoint(Endpoint::from_identifier(id.clone(), self.runtime.clone()).unwrap()) + } else if id.component_name().is_some() { + BaseEntity::Component(Component::from_descriptor(id.clone(), self.runtime.clone()).unwrap()) + } else { + BaseEntity::Namespace(Namespace::from_descriptor(id.clone(), self.runtime.clone()).unwrap()) + } + } + KeysBase::Instance(inst) => { + BaseEntity::Endpoint(Endpoint::from_instance(inst.clone(), self.runtime.clone()).unwrap()) + } + } + } + + /// Get parent path by removing the last segment + /// Returns None if at the root (no segments) + pub fn parent(&self) -> Option { + let segments = self.descriptor.keys(); + if segments.is_empty() { + return None; + } + + let parent_segments = segments[..segments.len() - 1].to_vec(); + let keys = match self.descriptor.base() { + KeysBase::Identifier(id) => Keys::from_identifier(id.clone(), parent_segments).ok()?, + KeysBase::Instance(inst) => Keys::from_instance(inst.clone(), parent_segments).ok()?, + }; + + Some(Path { + descriptor: keys, + runtime: self.runtime.clone(), + }) + } + + /// Create a child path by adding a segment + pub fn child(&self, segment: &str) -> Result { + let mut segments = self.descriptor.keys().to_vec(); + segments.push(segment.to_string()); + + let keys = match self.descriptor.base() { + KeysBase::Identifier(id) => Keys::from_identifier(id.clone(), segments)?, + KeysBase::Instance(inst) => Keys::from_instance(inst.clone(), segments)?, + }; + + Ok(Path { + descriptor: keys, + runtime: self.runtime.clone(), + }) + } + + /// Get the path segments, for testing + fn segments(&self) -> &[String] { + self.descriptor.keys() + } +} + +impl DistributedRuntimeProvider for Path { + fn drt(&self) -> &DistributedRuntime { + &self.runtime + } +} + +impl RuntimeProvider for Path { + fn rt(&self) -> 
&crate::Runtime { + self.runtime.rt() + } +} + +impl fmt::Display for Path { + fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result { + write!(f, "{}", self.descriptor) + } +} + +impl DiscoveryClient for Path { + fn etcd_key(&self) -> String { + self.to_string() + } +} + +/// Base entity types that can be the foundation of a Path +pub enum BaseEntity { + Namespace(Namespace), + Component(Component), + Endpoint(Endpoint), +} + +/// Entity result from identifier conversion +pub enum IdentifierEntity { + Namespace(Namespace), + Component(Component), + Endpoint(Endpoint), +} + +// ToEntity implementations + +impl ToEntity for Identifier { + type Entity = IdentifierEntity; + + fn to_entity(self, runtime: DistributedRuntime) -> Result { + if self.endpoint_name().is_some() { + Ok(IdentifierEntity::Endpoint(Endpoint::from_identifier(self, runtime)?)) + } else if self.component_name().is_some() { + Ok(IdentifierEntity::Component(Component::from_descriptor(self, runtime)?)) + } else { + Ok(IdentifierEntity::Namespace(Namespace::from_descriptor(self, runtime)?)) + } + } +} + +impl ToEntity for Instance { + type Entity = Endpoint; + + fn to_entity(self, runtime: DistributedRuntime) -> Result { + Endpoint::from_instance(self, runtime) + } +} + +impl ToEntity for Keys { + type Entity = Path; + + fn to_entity(self, runtime: DistributedRuntime) -> Result { + Path::from_descriptor(self, runtime) + } +} + +// Add extension trait for DistributedRuntime to start the chain +pub trait EntityChain { + fn namespace(&self, name: &str) -> Result; + fn component(&self, namespace: &str, component: &str) -> Result; + fn endpoint(&self, namespace: &str, component: &str, endpoint: &str) -> Result; +} + +impl EntityChain for DistributedRuntime { + fn namespace(&self, name: &str) -> Result { + Namespace::new(name, self.clone()) + } + + fn component(&self, namespace: &str, component: &str) -> Result { + Component::new(namespace, component, self.clone()) + } + + fn endpoint(&self, namespace: 
&str, component: &str, endpoint: &str) -> Result { + Endpoint::new(namespace, component, endpoint, self.clone()) + } +} + +#[derive(Default)] +pub struct RegistryInner { + services: HashMap, + stats_handlers: HashMap>>>, +} + +#[derive(Clone)] +pub struct Registry { + inner: Arc>, +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::Runtime; + + async fn create_test_runtime() -> DistributedRuntime { + let runtime = Runtime::from_current().unwrap(); + DistributedRuntime::from_settings_without_discovery(runtime).await.unwrap() + } + + #[tokio::test] + async fn test_namespace_entity() { + let drt = create_test_runtime().await; + + let ns = Namespace::new("production.api.v1", drt.clone()).unwrap(); + assert_eq!(ns.to_string(), "dynamo://production.api.v1"); + assert_eq!(ns.segments(), vec!["production", "api", "v1"]); + + // Test parent + let parent = ns.parent().unwrap(); + assert_eq!(parent.to_string(), "dynamo://production.api"); + + // Test child + let child = ns.child("v2").unwrap(); + assert_eq!(child.to_string(), "dynamo://production.api.v1.v2"); + + // Test conversion back to descriptor + let desc = ns.to_descriptor(); + assert_eq!(desc.to_string(), "dynamo://production.api.v1"); + } + + #[tokio::test] + async fn test_component_entity() { + let drt = create_test_runtime().await; + + let comp = Component::new("production", "gateway", drt.clone()).unwrap(); + assert_eq!(comp.to_string(), "dynamo://production/_component_/gateway"); + + // Test conversion back to descriptor + let desc = comp.to_descriptor(); + assert_eq!(desc.to_string(), "dynamo://production/_component_/gateway"); + } + + #[tokio::test] + async fn test_endpoint_entity() { + let drt = create_test_runtime().await; + + let ep = Endpoint::new("production", "gateway", "http", drt.clone()).unwrap(); + assert_eq!(ep.to_string(), "dynamo://production/_component_/gateway/_endpoint_/http/_static_"); + assert_eq!(ep.instance_id(), None); + + // Test with instance + let id = 
Identifier::new_endpoint("production", "gateway", "http").unwrap(); + let inst = Instance::new(id, 0x1234).unwrap(); + let ep_with_inst = Endpoint::from_instance(inst, drt).unwrap(); + assert_eq!(ep_with_inst.to_string(), "dynamo://production/_component_/gateway/_endpoint_/http:1234"); + assert_eq!(ep_with_inst.instance_id(), Some(0x1234)); + + // Test conversion to instance + let inst_opt = ep_with_inst.to_descriptor(); + assert_eq!(inst_opt.instance_id().unwrap(), 0x1234); + } + + #[tokio::test] + async fn test_path_entity() { + let drt = create_test_runtime().await; + + let id = Identifier::new_component("production", "gateway").unwrap(); + let keys = Keys::from_identifier(id, vec!["v1".to_string(), "leader".to_string()]).unwrap(); + let path = Path::from_descriptor(keys, drt.clone()).unwrap(); + + assert_eq!(path.to_string(), "dynamo://production/_component_/gateway/_path_/v1/leader"); + assert_eq!(path.segments(), &["v1", "leader"]); + + // Test base entity extraction + match path.base_entity() { + BaseEntity::Component(comp) => { + assert_eq!(comp.to_string(), "dynamo://production/_component_/gateway"); + } + _ => panic!("Expected component base entity"), + } + } + + #[tokio::test] + async fn test_to_entity_trait() { + let drt = create_test_runtime().await; + + // Test identifier to entity + let id = Identifier::new_endpoint("ns1", "comp1", "ep1").unwrap(); + let entity = id.to_entity(drt.clone()).unwrap(); + match entity { + IdentifierEntity::Endpoint(ep) => { + assert_eq!(ep.to_string(), "dynamo://ns1/_component_/comp1/_endpoint_/ep1/_static_"); + } + _ => panic!("Expected endpoint entity"), + } + + // Test instance to entity + let id = Identifier::new_endpoint("ns1", "comp1", "ep1").unwrap(); + let inst = Instance::new(id, 0x5678).unwrap(); + let ep = inst.to_entity(drt.clone()).unwrap(); + assert_eq!(ep.to_string(), "dynamo://ns1/_component_/comp1/_endpoint_/ep1:5678"); + assert_eq!(ep.instance_id(), Some(0x5678)); + + // Test keys to entity + let id = 
Identifier::new_namespace("ns1").unwrap(); + let keys = Keys::from_identifier(id, vec!["v1".to_string(), "config".to_string()]).unwrap(); + let path = keys.to_entity(drt).unwrap(); + assert_eq!(path.to_string(), "dynamo://ns1/_path_/v1/config"); + assert_eq!(path.segments(), &["v1", "config"]); + } + + #[tokio::test] + async fn test_chaining() { + let drt = create_test_runtime().await; + + // Chain from namespace to component + let comp = drt.namespace("production").unwrap() + .component("gateway").unwrap(); + assert_eq!(comp.to_string(), "dynamo://production/_component_/gateway"); + + // Chain from namespace to component to endpoint + let ep = drt.namespace("production").unwrap() + .component("gateway").unwrap() + .endpoint("http").unwrap(); + assert_eq!(ep.to_string(), "dynamo://production/_component_/gateway/_endpoint_/http/_static_"); + + // Chain from namespace to path + let path = drt.namespace("production").unwrap() + .path(&["v1", "config"]).unwrap(); + assert_eq!(path.to_string(), "dynamo://production/_path_/v1/config"); + + // Chain from component to path + let path = drt.namespace("production").unwrap() + .component("gateway").unwrap() + .path(&["v1", "config"]).unwrap(); + assert_eq!(path.to_string(), "dynamo://production/_component_/gateway/_path_/v1/config"); + + // Chain from endpoint to path + let path = drt.namespace("production").unwrap() + .component("gateway").unwrap() + .endpoint("http").unwrap() + .path(&["v1", "config"]).unwrap(); + assert_eq!(path.to_string(), "dynamo://production/_component_/gateway/_endpoint_/http/_static_/_path_/v1/config"); + + // Test backward navigation: endpoint to component + let ep = drt.endpoint("production", "gateway", "http").unwrap(); + let comp_from_ep = ep.component(); + assert_eq!(comp_from_ep.to_string(), "dynamo://production/_component_/gateway"); + + // Test backward navigation: endpoint to namespace + let ns_from_ep = ep.namespace(); + assert_eq!(ns_from_ep.to_string(), "dynamo://production"); + + // Test 
backward navigation: component to namespace + let comp = drt.component("production", "gateway").unwrap(); + let ns_from_comp = comp.namespace(); + assert_eq!(ns_from_comp.to_string(), "dynamo://production"); + } + + #[tokio::test] + async fn test_direct_entity_creation() { + let drt = create_test_runtime().await; + + let comp = drt.component("production", "gateway").unwrap(); + assert_eq!(comp.to_string(), "dynamo://production/_component_/gateway"); + + let ep = drt.endpoint("production", "gateway", "http").unwrap(); + assert_eq!(ep.to_string(), "dynamo://production/_component_/gateway/_endpoint_/http/_static_"); + + let path = drt.component("production", "gateway").unwrap() + .path(&["v1", "config"]).unwrap(); + assert_eq!(path.to_string(), "dynamo://production/_component_/gateway/_path_/v1/config"); + } + + #[tokio::test] + async fn test_path_navigation() { + let drt = create_test_runtime().await; + + // Create a path with multiple segments + let path = drt.namespace("production").unwrap() + .component("gateway").unwrap() + .path(&["v1", "config", "settings"]).unwrap(); + + assert_eq!(path.to_string(), "dynamo://production/_component_/gateway/_path_/v1/config/settings"); + assert_eq!(path.segments(), &["v1", "config", "settings"]); + + // Navigate up with parent() + let parent = path.parent().unwrap(); + assert_eq!(parent.to_string(), "dynamo://production/_component_/gateway/_path_/v1/config"); + assert_eq!(parent.segments(), &["v1", "config"]); + + // Navigate up again + let grandparent = parent.parent().unwrap(); + assert_eq!(grandparent.to_string(), "dynamo://production/_component_/gateway/_path_/v1"); + assert_eq!(grandparent.segments(), &["v1"]); + + // Navigate up to root (no segments) + let root = grandparent.parent().unwrap(); + assert_eq!(root.to_string(), "dynamo://production/_component_/gateway/_path_"); + assert_eq!(root.segments(), &[] as &[String]); + + // Parent of root is None + assert!(root.parent().is_none()); + + // Navigate down with child() + 
let child = root.child("api").unwrap(); + assert_eq!(child.to_string(), "dynamo://production/_component_/gateway/_path_/api"); + + let grandchild = child.child("v2").unwrap(); + assert_eq!(grandchild.to_string(), "dynamo://production/_component_/gateway/_path_/api/v2"); + + // Test with endpoint-based path + let ep_path = drt.endpoint("prod", "svc", "http").unwrap() + .path(&["metrics"]).unwrap(); + assert_eq!(ep_path.to_string(), "dynamo://prod/_component_/svc/_endpoint_/http/_static_/_path_/metrics"); + + let ep_child = ep_path.child("cpu").unwrap(); + assert_eq!(ep_child.to_string(), "dynamo://prod/_component_/svc/_endpoint_/http/_static_/_path_/metrics/cpu"); + } + + #[tokio::test] + async fn test_discovery_client_trait() { + use crate::discovery::DiscoveryClient; + use crate::distributed::DistributedConfig; + + // Helper to create test runtime with etcd + async fn create_test_runtime_with_etcd() -> Result { + let runtime = Runtime::from_current()?; + let mut config = DistributedConfig::from_settings(false); + config.etcd_config.etcd_url = vec!["http://localhost:2379".to_string()]; + DistributedRuntime::new(runtime, config).await + } + + // Check if etcd is available + if etcd_client::Client::connect(["localhost:2379"], None).await.is_err() { + eprintln!("Skipping test: ETCD not available"); + return; + } + + let drt = match create_test_runtime_with_etcd().await { + Ok(drt) => drt, + Err(_) => { + eprintln!("Skipping test: Could not create runtime with etcd"); + return; + } + }; + + // Test namespace can use DiscoveryClient + let ns = drt.namespace("test.discovery").unwrap(); + assert_eq!(ns.etcd_key(), "dynamo://test.discovery"); + + // Test namespace storage operations + if let Ok(storage) = ns.storage() { + let test_data = b"namespace data".to_vec(); + + storage.put(test_data.clone(), None).await.unwrap(); + let values = storage.get().await.unwrap(); + assert_eq!(values[0].value(), &test_data); + + // Cleanup + storage.delete(None).await.unwrap(); + } + + // 
Test component can use DiscoveryClient + let comp = drt.component("test", "discovery").unwrap(); + assert_eq!(comp.etcd_key(), "dynamo://test/_component_/discovery"); + + // Test component storage operations + if let Ok(storage) = comp.storage() { + let test_data = b"component data".to_vec(); + + storage.put(test_data.clone(), None).await.unwrap(); + let values = storage.get().await.unwrap(); + assert_eq!(values[0].value(), &test_data); + + // Cleanup + storage.delete(None).await.unwrap(); + } + + // Test endpoint can use DiscoveryClient + let ep = drt.endpoint("test", "discovery", "http").unwrap(); + assert!(ep.etcd_key().starts_with("dynamo://test/_component_/discovery/_endpoint_/http")); + + // Test endpoint storage operations + if let Ok(storage) = ep.storage() { + let test_data = b"endpoint data".to_vec(); + + storage.put(test_data.clone(), None).await.unwrap(); + let values = storage.get().await.unwrap(); + assert_eq!(values[0].value(), &test_data); + + // Cleanup + storage.delete(None).await.unwrap(); + } + + // Test path can use DiscoveryClient + let path = comp.path(&["v1", "config"]).unwrap(); + assert_eq!(path.etcd_key(), "dynamo://test/_component_/discovery/_path_/v1/config"); + + // Test path storage operations + if let Ok(storage) = path.storage() { + let test_data = b"path data".to_vec(); + + storage.put(test_data.clone(), None).await.unwrap(); + let values = storage.get().await.unwrap(); + assert_eq!(values[0].value(), &test_data); + + // Cleanup + storage.delete(None).await.unwrap(); + } + } +} diff --git a/lib/runtime/src/entity/client.rs b/lib/runtime/src/entity/client.rs new file mode 100644 index 0000000000..0ddabcd826 --- /dev/null +++ b/lib/runtime/src/entity/client.rs @@ -0,0 +1,521 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. 
+// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +use crate::pipeline::{ + AddressedPushRouter, AddressedRequest, AsyncEngine, Data, ManyOut, PushRouter, RouterMode, + SingleIn, +}; +use rand::Rng; +use std::collections::HashMap; +use std::sync::{ + atomic::{AtomicU64, Ordering}, + Arc, +}; +use tokio::{net::unix::pipe::Receiver, sync::Mutex}; + +use crate::{ + pipeline::async_trait, + transports::etcd::{Client as EtcdClient, WatchEvent}, +}; + +use super::*; + +/// Each state has a nonce associated with it. +/// The state will be emitted in a watch channel, so we can observe the +/// critical state transitions. 
+enum MapState { + /// The map is empty; value = nonce + Empty(u64), + + /// The map is not-empty; values are (nonce, count) + NonEmpty(u64, u64), + + /// The watcher has finished, no more events will be emitted + Finished, +} + +enum EndpointEvent { + Put(String, i64), + Delete(String), +} + +#[derive(Clone, Debug)] +pub struct Client { + // This is me + pub endpoint: Endpoint, + // These are the remotes I know about + pub instance_source: Arc, +} + +#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)] +#[serde(rename_all = "snake_case")] +pub enum TransportType { + NatsTcp(String), +} + +#[derive(Debug, Clone, Serialize, Deserialize, Eq, PartialEq)] +pub struct StoredValue { + pub key: Instance, + pub value: T, +} + +pub type StoredTransport = StoredValue; + +#[derive(Clone, Debug)] +pub enum InstanceSource { + Static, + Dynamic(tokio::sync::watch::Receiver>), +} + +impl Client { + + pub(crate) async fn new(endpoint: Endpoint) -> Result { + if endpoint.to_descriptor().is_static() { + Ok(Client { + endpoint, + instance_source: Arc::new(InstanceSource::Static), + }) + } else { + let instance_source = + Self::get_or_create_dynamic_instance_source(&endpoint).await?; + Ok(Client { + endpoint, + instance_source, + }) + } + + } + + pub fn instances(&self) -> Vec { + match self.instance_source.as_ref() { + InstanceSource::Static => vec![], + InstanceSource::Dynamic(watch_rx) => watch_rx.borrow().clone(), + } + } + + pub fn instance_ids(&self) -> Vec { + self.instances().into_iter().filter_map(|ep| ep.key.instance_id()).collect() + } + + /// Wait for at least one Instance to be available for this Endpoint + pub async fn wait_for_instances(&self) -> Result> { + let mut instances: Vec = vec![]; + if let InstanceSource::Dynamic(mut rx) = self.instance_source.as_ref().clone() { + // wait for there to be 1 or more endpoints + loop { + let stored_transports = rx.borrow_and_update().to_vec(); + instances = stored_transports.into_iter().map(|st| st.key).collect(); + if 
instances.is_empty() { + rx.changed().await?; + } else { + break; + } + } + } + Ok(instances) + } + + /// Is this endpoint known at startup and not discovered via etcd? + pub fn is_static(&self) -> bool { + self.endpoint.to_descriptor().is_static() + } + + async fn get_or_create_dynamic_instance_source( + endpoint: &Endpoint, + ) -> Result> { + // Try to get from cache first + if let Some(cached) = Self::try_get_cached_instance_source(endpoint).await? { + return Ok(cached); + } + + // Set up new watcher + let prefix_watcher = endpoint.discovery_storage()?.watch_prefix().await?; + let (prefix, _watcher, kv_event_rx) = prefix_watcher.dissolve(); + + let (watch_tx, watch_rx) = tokio::sync::watch::channel(Vec::::new()); + + // Spawn background watcher task + let drt = endpoint.drt(); + Self::spawn_instance_watcher(drt, prefix, watch_tx, kv_event_rx); + + // Create and cache the new instance source + let instance_source = Arc::new(InstanceSource::Dynamic(watch_rx)); + Self::cache_instance_source(endpoint, &instance_source).await?; + + Ok(instance_source) + } + + async fn try_get_cached_instance_source(endpoint: &Endpoint) -> Result>> { + let drt = endpoint.drt(); + let instance_sources_guard = drt.instance_sources(); + let mut instance_sources = instance_sources_guard.lock().await; + + if let Some(instance_source) = instance_sources.get(&endpoint.to_descriptor().identifier()) + .and_then(|weak| weak.upgrade()) + { + return Ok(Some(instance_source)); + } + + // Clean up stale entry if it exists but couldn't upgrade + instance_sources.remove(&endpoint.to_descriptor().identifier()); + Ok(None) + } + + async fn cache_instance_source( + endpoint: &Endpoint, + instance_source: &Arc + ) -> Result<()> { + let drt = endpoint.drt(); + let instance_sources_guard = drt.instance_sources(); + let mut instance_sources = instance_sources_guard.lock().await; + instance_sources.insert(endpoint.to_descriptor().identifier(), Arc::downgrade(instance_source)); + Ok(()) + } + + fn 
spawn_instance_watcher( + drt: &DistributedRuntime, + prefix: String, + watch_tx: tokio::sync::watch::Sender>, + mut kv_event_rx: tokio::sync::mpsc::Receiver, + ) { + let secondary = drt.runtime.secondary().clone(); + + secondary.spawn(async move { + tracing::debug!(prefix = %prefix, "Starting endpoint watcher"); + let mut instance_map: HashMap = HashMap::new(); + + loop { + let kv_event = tokio::select! { + _ = watch_tx.closed() => { + tracing::debug!(prefix = %prefix, "All watchers closed; shutting down endpoint watcher"); + break; + } + kv_event = kv_event_rx.recv() => { + match kv_event { + Some(event) => event, + None => { + tracing::debug!(prefix = %prefix, "Watch stream closed; shutting down endpoint watcher"); + break; + } + } + } + }; + + let should_continue = match kv_event { + WatchEvent::Put(kv) => Self::handle_put_event(kv, &mut instance_map), + WatchEvent::Delete(kv) => Self::handle_delete_event(kv, &mut instance_map, &prefix), + }; + + if !should_continue { + break; + } + + let instances: Vec = instance_map.values().cloned().collect(); + if watch_tx.send(instances).is_err() { + tracing::debug!(prefix = %prefix, "Unable to send watch updates; shutting down endpoint watcher"); + break; + } + } + + tracing::debug!(prefix = %prefix, "Completed endpoint watcher"); + let _ = watch_tx.send(vec![]); + }); + } + + fn handle_put_event( + kv: etcd_client::KeyValue, + instance_map: &mut HashMap + ) -> bool { + let Ok(key) = kv.key_str() else { + tracing::error!("Unable to parse PUT event key as UTF-8"); + return false; + }; + + let Ok(instance) = Instance::parse(key) else { + tracing::error!(key = key, "Failed to parse instance from key"); + return true; // Continue processing other events + }; + + let Ok(transport) = serde_json::from_slice::(kv.value()) else { + tracing::error!(key = key, "Failed to deserialize transport type"); + return true; // Continue processing other events + }; + + instance_map.insert( + instance.clone(), + StoredValue { key: instance, 
value: transport } + ); + true + } + + fn handle_delete_event( + kv: etcd_client::KeyValue, + instance_map: &mut HashMap, + prefix: &str + ) -> bool { + let Ok(key) = kv.key_str() else { + tracing::error!( + prefix = %prefix, + "Unable to parse DELETE event key as UTF-8; shutting down endpoint watcher" + ); + return false; + }; + + let Ok(instance) = Instance::parse(key) else { + tracing::error!(key = key, "Failed to parse instance from delete key"); + return true; // Continue processing other events + }; + + instance_map.remove(&instance); + true + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::{Runtime, distributed::DistributedConfig}; + use crate::descriptor::{Identifier, Instance}; + use crate::entity::Endpoint; + + async fn create_test_runtime() -> DistributedRuntime { + let runtime = Runtime::from_current().unwrap(); + DistributedRuntime::from_settings_without_discovery(runtime).await.unwrap() + } + + async fn create_test_runtime_with_etcd() -> Result { + let runtime = Runtime::from_current()?; + let mut config = DistributedConfig::from_settings(false); + config.etcd_config.etcd_url = vec!["http://localhost:2379".to_string()]; + DistributedRuntime::new(runtime, config).await + } + + #[tokio::test] + async fn test_static_client_creation() { + let drt = create_test_runtime().await; + + // Create a static endpoint + let id = Identifier::new_endpoint("test", "service", "api").unwrap(); + let static_instance = Instance::new_static(id).unwrap(); + let endpoint = Endpoint::from_instance(static_instance, drt).unwrap(); + + // Create client + let client = Client::new(endpoint.clone()).await.unwrap(); + + // Verify it's static + assert!(client.is_static()); + assert!(matches!(client.instance_source.as_ref(), InstanceSource::Static)); + + // Static clients should return empty instances + assert_eq!(client.instances().len(), 0); + assert_eq!(client.instance_ids().len(), 0); + } + + #[tokio::test] + async fn test_instance_filtering() { + let drt = 
create_test_runtime().await; + + // Create test data + let instances = vec![ + StoredTransport { + key: Instance::new( + Identifier::new_endpoint("test", "svc", "api").unwrap(), + 123 + ).unwrap(), + value: TransportType::NatsTcp("nats://localhost:4222".to_string()), + }, + StoredTransport { + key: Instance::new_static( + Identifier::new_endpoint("test", "svc", "api").unwrap() + ).unwrap(), + value: TransportType::NatsTcp("nats://localhost:4223".to_string()), + }, + ]; + + // Create a dynamic client with test data + let (_tx, rx) = tokio::sync::watch::channel(instances.clone()); + let instance_source = Arc::new(InstanceSource::Dynamic(rx)); + + let endpoint = Endpoint::new("test", "svc", "api", drt).unwrap(); + let client = Client { + endpoint, + instance_source, + }; + + // Test instances() method + let retrieved_instances = client.instances(); + assert_eq!(retrieved_instances.len(), 2); + + // Test instance_ids() - should filter out static instances + let ids = client.instance_ids(); + assert_eq!(ids.len(), 1); + assert_eq!(ids[0], 123); + } + + #[tokio::test] + async fn test_wait_for_instances() { + let drt = create_test_runtime().await; + + // Create a dynamic endpoint + let (tx, rx) = tokio::sync::watch::channel(vec![]); + let instance_source = Arc::new(InstanceSource::Dynamic(rx)); + + let endpoint = Endpoint::new("test", "svc", "api", drt).unwrap(); + let client = Client { + endpoint, + instance_source, + }; + + // Spawn a task to send instances after a delay + let tx_clone = tx.clone(); + tokio::spawn(async move { + tokio::time::sleep(tokio::time::Duration::from_millis(100)).await; + let instances = vec![ + StoredTransport { + key: Instance::new( + Identifier::new_endpoint("test", "svc", "api").unwrap(), + 456 + ).unwrap(), + value: TransportType::NatsTcp("nats://localhost:4224".to_string()), + }, + ]; + let _ = tx_clone.send(instances); + }); + + // Wait for instances + let instances = client.wait_for_instances().await.unwrap(); + 
assert_eq!(instances.len(), 1); + assert_eq!(instances[0].instance_id(), Some(456)); + } + + #[tokio::test] + async fn test_dynamic_client_with_etcd() { + // Check if etcd is available + if etcd_client::Client::connect(["localhost:2379"], None).await.is_err() { + eprintln!("Skipping test: ETCD not available"); + return; + } + + let drt = match create_test_runtime_with_etcd().await { + Ok(drt) => drt, + Err(_) => { + eprintln!("Skipping test: Could not create runtime with etcd"); + return; + } + }; + + // Create a dynamic endpoint + let endpoint = Endpoint::new("test", "client", "dynamic", drt.clone()).unwrap(); + + // Create client + let client = Client::new(endpoint.clone()).await.unwrap(); + + // Verify it's dynamic + assert!(!client.is_static()); + assert!(matches!(client.instance_source.as_ref(), InstanceSource::Dynamic(_))); + + // Initially should have no instances + assert_eq!(client.instances().len(), 0); + + // Register an instance using another client + let test_instance = Instance::new( + Identifier::new_endpoint("test", "client", "dynamic").unwrap(), + 789 + ).unwrap(); + let test_endpoint = Endpoint::from_instance(test_instance.clone(), drt.clone()).unwrap(); + + // Store transport info + let transport = TransportType::NatsTcp("nats://localhost:4225".to_string()); + let storage = test_endpoint.storage().unwrap(); + storage.put(serde_json::to_vec(&transport).unwrap(), None).await.unwrap(); + + // Wait a bit for the watcher to pick it up + tokio::time::sleep(tokio::time::Duration::from_millis(200)).await; + + // Check that the client sees the instance + let instances = client.instances(); + assert_eq!(instances.len(), 1); + assert_eq!(instances[0].key.instance_id(), Some(789)); + + // Add a second instance + let test_instance2 = Instance::new( + Identifier::new_endpoint("test", "client", "dynamic").unwrap(), + 790 + ).unwrap(); + let test_endpoint2 = Endpoint::from_instance(test_instance2.clone(), drt.clone()).unwrap(); + + // Store transport info for 
second instance + let transport2 = TransportType::NatsTcp("nats://localhost:4226".to_string()); + let storage2 = test_endpoint2.storage().unwrap(); + storage2.put(serde_json::to_vec(&transport2).unwrap(), None).await.unwrap(); + + // Wait for the watcher to pick up the second instance + tokio::time::sleep(tokio::time::Duration::from_millis(200)).await; + + // Check that the client now sees both instances + let instances = client.instances(); + assert_eq!(instances.len(), 2); + + // Verify both instance IDs are present + let instance_ids: Vec = instances.iter() + .filter_map(|st| st.key.instance_id()) + .collect(); + assert!(instance_ids.contains(&789)); + assert!(instance_ids.contains(&790)); + + // Check instance IDs method + let ids = client.instance_ids(); + assert_eq!(ids.len(), 2); + assert!(ids.contains(&789)); + assert!(ids.contains(&790)); + + // Clean up both instances + storage.delete(None).await.unwrap(); + storage2.delete(None).await.unwrap(); + + // Wait for deletions to propagate + tokio::time::sleep(tokio::time::Duration::from_millis(200)).await; + + // Verify both instances were removed + assert_eq!(client.instances().len(), 0); + } + + #[tokio::test] + async fn test_instance_source_caching() { + // Check if etcd is available + if etcd_client::Client::connect(["localhost:2379"], None).await.is_err() { + eprintln!("Skipping test: ETCD not available"); + return; + } + + let drt = match create_test_runtime_with_etcd().await { + Ok(drt) => drt, + Err(_) => { + eprintln!("Skipping test: Could not create runtime with etcd"); + return; + } + }; + + let endpoint1 = Endpoint::new("test", "cache", "endpoint1", drt.clone()).unwrap(); + let endpoint2 = Endpoint::new("test", "cache", "endpoint1", drt.clone()).unwrap(); // Same endpoint + + // Create two clients for the same endpoint + let client1 = Client::new(endpoint1).await.unwrap(); + let client2 = Client::new(endpoint2).await.unwrap(); + + // They should share the same instance source (via Arc) + 
assert!(Arc::ptr_eq(&client1.instance_source, &client2.instance_source)); + } +} diff --git a/lib/runtime/src/component/component.rs b/lib/runtime/src/entity/component.rs similarity index 98% rename from lib/runtime/src/component/component.rs rename to lib/runtime/src/entity/component.rs index 8bbca61fd0..9554358475 100644 --- a/lib/runtime/src/component/component.rs +++ b/lib/runtime/src/entity/component.rs @@ -21,11 +21,12 @@ use futures::{Stream, TryStreamExt}; use super::*; use crate::traits::events::{EventPublisher, EventSubscriber}; +use crate::slug::Slug; #[async_trait] impl EventPublisher for Component { fn subject(&self) -> String { - format!("namespace.{}.component.{}", self.namespace.name, self.name) + self.to_descriptor().slug().to_string() } async fn publish( diff --git a/lib/runtime/src/entity/endpoint.rs b/lib/runtime/src/entity/endpoint.rs new file mode 100644 index 0000000000..aa46135d47 --- /dev/null +++ b/lib/runtime/src/entity/endpoint.rs @@ -0,0 +1,279 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use derive_getters::Dissolve; + +use crate::entity::client::TransportType; + +use super::*; + +pub use async_nats::service::endpoint::Stats as EndpointStats; +use crate::entity::client::StoredTransport; + +#[derive(Educe, Builder, Dissolve)] +#[educe(Debug)] +#[builder(pattern = "owned", build_fn(private, name = "build_internal"))] +pub struct EndpointConfig { + #[builder(private)] + endpoint: Endpoint, + + /// Endpoint handler + #[educe(Debug(ignore))] + handler: Arc, + + /// Stats handler + #[educe(Debug(ignore))] + #[builder(default, private)] + _stats_handler: Option, +} + +impl EndpointConfigBuilder { + pub(crate) fn from_endpoint(endpoint: Endpoint) -> Self { + Self::default().endpoint(endpoint) + } + + pub fn stats_handler(self, handler: F) -> Self + where + F: FnMut(EndpointStats) -> serde_json::Value + Send + Sync + 'static, + { + self._stats_handler(Some(Box::new(handler))) + } + + pub async fn start(self) -> Result<()> { + let (endpoint, handler, stats_handler) = self.build_internal()?.dissolve(); + + // acquire the registry lock + let registry = endpoint.drt().component_registry.inner.lock().await; + + let identifier = &endpoint.to_descriptor().identifier().to_component().unwrap(); + + // get the group + let group = registry + .services + .get(identifier) + .map(|service| service.group(endpoint.to_descriptor().slug())) + .ok_or(error!("Service not found"))?; + + // get the stats handler map + let handler_map = registry + .stats_handlers + .get(identifier) + .cloned() + .expect("no stats handler registry; this is unexpected"); + + drop(registry); + + // insert the stats handler + if let Some(stats_handler) = stats_handler { + handler_map + .lock() + .unwrap() + .insert(endpoint.to_descriptor().slug(), stats_handler); + } + + // creates an endpoint for the service + let service_endpoint = group + .endpoint(&endpoint.to_descriptor().slug()) + .await + .map_err(|e| anyhow::anyhow!("Failed to start endpoint: {e}"))?; + + let storage = endpoint.storage()?; 
+ let cancel_token = storage.primary_lease().child_token(); + + let push_endpoint = PushEndpoint::builder() + .service_handler(handler) + .cancellation_token(cancel_token.clone()) + .build() + .map_err(|e| anyhow::anyhow!("Failed to build push endpoint: {e}"))?; + + let info = StoredTransport{ + key: endpoint.to_descriptor(), + value: TransportType::NatsTcp(endpoint.to_descriptor().slug().to_string()) + }; + + // Start the service in background + let task = tokio::spawn(push_endpoint.start(service_endpoint)); + + // Register in storage after service is starting + let info = serde_json::to_vec_pretty(&info)?; + if let Err(e) = storage.create(info.clone(), None).await { + tracing::error!("Failed to register discoverable service: {:?}", e); + cancel_token.cancel(); + return Err(error!("Failed to register discoverable service")); + } + + // Monitor the task and handle result + let task_result = match task.await { + Ok(Ok(())) => { + tracing::debug!("Endpoint service completed successfully"); + Ok(()) + } + Ok(Err(service_error)) => { + tracing::error!( + error = %service_error, + endpoint = %endpoint, + "Service failed" + ); + cancel_token.cancel(); + Err(service_error) + } + Err(join_error) => { + tracing::error!( + error = %join_error, + endpoint = %endpoint, + "Task join failed" + ); + cancel_token.cancel(); + Err(error!("Task failed to complete")) + } + }; + + // Always cleanup: remove from storage regardless of success/failure + if let Err(cleanup_error) = storage.delete(None).await { + tracing::warn!( + error = %cleanup_error, + endpoint = %endpoint, + action = "cleanup_service_registration", + "Failed to cleanup service registration" + ); + } + + task_result + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::entity::{Component, Endpoint}; + use crate::{Runtime, DistributedRuntime}; + use crate::pipeline::network::PushWorkHandler; + use crate::pipeline::error::PipelineError; + use std::sync::Arc; + use async_trait::async_trait; + use bytes::Bytes; + 
+ // Mock handler for testing + #[derive(Clone)] + struct MockHandler; + + #[async_trait] + impl PushWorkHandler for MockHandler { + async fn handle_payload(&self, _payload: Bytes) -> Result<(), PipelineError> { + Ok(()) + } + } + + async fn create_test_runtime() -> DistributedRuntime { + let runtime = Runtime::from_current().unwrap(); + DistributedRuntime::from_settings_without_discovery(runtime).await.unwrap() + } + + async fn create_test_runtime_with_etcd() -> Result { + let runtime = Runtime::from_current()?; + let mut config = crate::distributed::DistributedConfig::from_settings(false); + config.etcd_config.etcd_url = vec!["http://localhost:2379".to_string()]; + DistributedRuntime::new(runtime, config).await + } + + async fn check_nats_available() -> bool { + // Try to connect to NATS to see if it's available + match async_nats::connect("nats://localhost:4222").await { + Ok(_) => true, + Err(_) => false, + } + } + + async fn check_etcd_available() -> bool { + // Try to connect to etcd to see if it's available + etcd_client::Client::connect(["localhost:2379"], None).await.is_ok() + } + + #[tokio::test] + async fn test_service_and_endpoint_integration() { + // Check if NATS is available + if !check_nats_available().await { + eprintln!("Skipping test: NATS not available"); + return; + } + + // Check if etcd is available + if !check_etcd_available().await { + eprintln!("Skipping test: ETCD not available"); + return; + } + + let drt = match create_test_runtime_with_etcd().await { + Ok(drt) => drt, + Err(_) => { + eprintln!("Skipping test: Could not create runtime with etcd"); + return; + } + }; + + // Step 1: Create a service + let component = Component::new("test", "integration", drt.clone()).unwrap(); + let service_builder = crate::entity::service::ServiceConfigBuilder::from_component(component.clone()); + let _created_component = service_builder.create().await.expect("Service creation should succeed"); + + // Step 2: Create an endpoint for the service + let 
endpoint = Endpoint::new("test", "integration", "testapi", drt.clone()).unwrap(); + let handler: Arc = Arc::new(MockHandler); + + // Get the storage and its cancellation token before moving endpoint + let storage = endpoint.storage().unwrap(); + let cancel_token = storage.primary_lease().primary_token(); + + let endpoint_builder = EndpointConfigBuilder::from_endpoint(endpoint.clone()) + .handler(handler); + + // Start the endpoint in a background task + let endpoint_task = tokio::spawn(async move { + endpoint_builder.start().await + }); + + // Give it a moment to register + tokio::time::sleep(tokio::time::Duration::from_millis(500)).await; + + // Step 3: Verify the endpoint is registered in storage + let kvs = storage.get().await.expect("Should get storage data"); + assert!(!kvs.is_empty(), "Endpoint should be registered in storage"); + + // Parse and verify the stored transport + if let Some(kv) = kvs.first() { + let stored: StoredTransport = serde_json::from_slice(kv.value()).expect("Should deserialize stored transport"); + assert_eq!(stored.key, endpoint.to_descriptor()); + assert!(matches!(stored.value, TransportType::NatsTcp(_))); + } + + // Trigger graceful shutdown via cancellation token + cancel_token.cancel(); + + // Wait for the endpoint task to complete + match tokio::time::timeout(tokio::time::Duration::from_secs(5), endpoint_task).await { + Ok(Ok(Ok(()))) => println!("Endpoint shut down successfully"), + Ok(Ok(Err(e))) => println!("Endpoint shut down with error: {:?}", e), + Ok(Err(e)) => println!("Endpoint task panicked: {:?}", e), + Err(_) => println!("Endpoint shutdown timed out"), + } + + // Give cleanup a moment to complete + tokio::time::sleep(tokio::time::Duration::from_millis(500)).await; + + // Verify cleanup happened + let kvs_after = storage.get().await.expect("Should get storage data"); + assert!(kvs_after.is_empty(), "Endpoint should be removed from storage after shutdown"); + } +} diff --git a/lib/runtime/src/component/namespace.rs 
b/lib/runtime/src/entity/namespace.rs similarity index 98% rename from lib/runtime/src/component/namespace.rs rename to lib/runtime/src/entity/namespace.rs index 66b9744cc4..9a9ec53994 100644 --- a/lib/runtime/src/component/namespace.rs +++ b/lib/runtime/src/entity/namespace.rs @@ -25,7 +25,7 @@ use crate::traits::events::{EventPublisher, EventSubscriber}; #[async_trait] impl EventPublisher for Namespace { fn subject(&self) -> String { - format!("namespace.{}", self.name) + self.to_descriptor().slug().to_string() } async fn publish( diff --git a/lib/runtime/src/component/registry.rs b/lib/runtime/src/entity/registry.rs similarity index 100% rename from lib/runtime/src/component/registry.rs rename to lib/runtime/src/entity/registry.rs diff --git a/lib/runtime/src/entity/service.rs b/lib/runtime/src/entity/service.rs new file mode 100644 index 0000000000..94d41cddb3 --- /dev/null +++ b/lib/runtime/src/entity/service.rs @@ -0,0 +1,171 @@ +// SPDX-FileCopyrightText: Copyright (c) 2024-2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +// SPDX-License-Identifier: Apache-2.0 +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +use derive_getters::Dissolve; +use std::collections::HashMap; +use std::sync::Mutex; + +use super::*; +use crate::transports::nats::Slug; + +pub use super::endpoint::EndpointStats; +pub type StatsHandler = + Box serde_json::Value + Send + Sync + 'static>; +pub type EndpointStatsHandler = + Box serde_json::Value + Send + Sync + 'static>; + +pub const PROJECT_NAME: &str = "Dynamo"; + +#[derive(Educe, Builder, Dissolve)] +#[educe(Debug)] +#[builder(pattern = "owned", build_fn(private, name = "build_internal"))] +pub struct ServiceConfig { + #[builder(private)] + component: Component, + + /// Description + #[builder(default)] + description: Option, +} + +impl ServiceConfigBuilder { + /// Create the [`Component`]'s service and store it in the registry. + pub async fn create(self) -> Result { + let (component, description) = self.build_internal()?.dissolve(); + + let version = "0.0.1".to_string(); + + // let service_name = component.service_name(); + // log::debug!("component: {component}; creating, service_name: {service_name}"); + + let description = description.unwrap_or(format!( + "{PROJECT_NAME} {component}")); + + let stats_handler_registry: Arc>> = + Arc::new(Mutex::new(HashMap::new())); + + let stats_handler_registry_clone = stats_handler_registry.clone(); + + let mut guard = component.drt().component_registry.inner.lock().await; + + if guard.services.contains_key(&component.to_descriptor()) { + return Err(anyhow::anyhow!("Service already exists")); + } + + // create service on the secondary runtime + let builder = component.drt().nats_client.client().service_builder(); + + tracing::debug!("Starting service: {}", component.to_descriptor().slug()); + let service_builder = builder + .description(description) + .stats_handler(move |name: String, stats| { + log::trace!("stats_handler: {name}, {stats:?}"); + let mut guard = stats_handler_registry.lock().unwrap(); + match guard.get_mut(&Slug::slugify(&name)) { + Some(handler) => handler(stats), + None => 
serde_json::Value::Null, + } + }); + tracing::debug!("Got builder"); + let service = service_builder + .start(component.to_descriptor().slug().to_string(), version) + .await + .map_err(|e| anyhow::anyhow!("Failed to start service: {e}"))?; + + // insert the service into the registry + guard.services.insert(component.to_descriptor(), service); + + // insert the stats handler into the registry + guard + .stats_handlers + .insert(component.to_descriptor(), stats_handler_registry_clone); + + // drop the guard to unlock the mutex + drop(guard); + + Ok(component) + } +} + +impl ServiceConfigBuilder { + pub(crate) fn from_component(component: Component) -> Self { + Self::default().component(component) + } +} + +#[cfg(test)] +mod tests { + use super::*; + use crate::entity::Component; + use crate::Runtime; + + async fn create_test_runtime() -> DistributedRuntime { + let runtime = Runtime::from_current().unwrap(); + DistributedRuntime::from_settings_without_discovery(runtime).await.unwrap() + } + + async fn check_nats_available() -> bool { + // Try to connect to NATS to see if it's available + match async_nats::connect("nats://localhost:4222").await { + Ok(_) => true, + Err(_) => false, + } + } + + #[tokio::test] + async fn test_service_creation_registers_service() { + // Check if NATS is available + if !check_nats_available().await { + eprintln!("Skipping test: NATS not available"); + return; + } + + let drt = create_test_runtime().await; + let component = Component::new("test", "svc", drt).unwrap(); + let builder = ServiceConfigBuilder::from_component(component.clone()); + let created_component = builder.create().await.expect("Service should be created"); + + // Check that the service is in the registry + let registry = created_component.drt().component_registry.inner.lock().await; + assert!(registry.services.contains_key(&created_component.to_descriptor())); + + // Also check that stats handler registry was created + 
assert!(registry.stats_handlers.contains_key(&created_component.to_descriptor())); + } + + #[tokio::test] + async fn test_duplicate_service_creation_fails() { + // Check if NATS is available + if !check_nats_available().await { + eprintln!("Skipping test: NATS not available"); + return; + } + + let drt = create_test_runtime().await; + let component = Component::new("test", "svc", drt).unwrap(); + let builder = ServiceConfigBuilder::from_component(component.clone()); + builder.create().await.expect("First creation should succeed"); + + // Second creation should fail + let builder2 = ServiceConfigBuilder::from_component(component.clone()); + let result = builder2.create().await; + assert!(result.is_err(), "Duplicate service creation should fail"); + + // Verify the error message contains expected text + if let Err(e) = result { + assert!(e.to_string().contains("Service already exists")); + } + } +} diff --git a/lib/runtime/src/lib.rs b/lib/runtime/src/lib.rs index d292bf242d..4dfa57bc49 100644 --- a/lib/runtime/src/lib.rs +++ b/lib/runtime/src/lib.rs @@ -34,8 +34,10 @@ mod config; pub use config::RuntimeConfig; pub mod component; +pub mod descriptor; pub mod discovery; pub mod engine; +pub mod entity; pub mod logging; pub mod pipeline; pub mod prelude; @@ -55,7 +57,8 @@ pub use futures::stream; pub use tokio_util::sync::CancellationToken; pub use worker::Worker; -use component::{Endpoint, InstanceSource}; +use entity::{Endpoint, InstanceSource, Registry}; +use descriptor::{Instance, Identifier}; /// Types of Tokio runtimes that can be used to construct a Dynamo [Runtime]. #[derive(Clone)] @@ -90,11 +93,11 @@ pub struct DistributedRuntime { // take for example two instances of a client to the same remote component. The registry allows us to use // a single endpoint watcher for both clients, this keeps the number background tasking watching specific // paths in etcd to a minimum. 
- component_registry: component::Registry, + component_registry: Registry, // Will only have static components that are not discoverable via etcd, they must be know at // startup. Will not start etcd. is_static: bool, - instance_sources: Arc>>>, + instance_sources: Arc>>>, } diff --git a/lib/runtime/src/pipeline/network/egress/push_router.rs b/lib/runtime/src/pipeline/network/egress/push_router.rs index 228a7d6ef6..e0ab2a880b 100644 --- a/lib/runtime/src/pipeline/network/egress/push_router.rs +++ b/lib/runtime/src/pipeline/network/egress/push_router.rs @@ -25,7 +25,7 @@ use std::{ }; use crate::{ - component::{Client, Endpoint, InstanceSource}, + entity::{Client, Endpoint, InstanceSource}, engine::{AsyncEngine, Data}, pipeline::{AddressedPushRouter, AddressedRequest, Error, ManyOut, SingleIn}, traits::DistributedRuntimeProvider, @@ -111,15 +111,16 @@ where if count == 0 { return Err(anyhow::anyhow!( "no instances found for endpoint {:?}", - self.client.endpoint.etcd_root() + self.client.endpoint )); } let offset = counter % count as u64; - instances[offset as usize].id() + instances[offset as usize].key.instance_id().unwrap() + }; tracing::trace!("round robin router selected {instance_id}"); - let subject = self.client.endpoint.subject_to(instance_id); + let subject = self.client.endpoint.to_string(); let request = request.map(|req| AddressedRequest::new(req, subject)); self.addressed.generate(request).await @@ -133,16 +134,16 @@ where if count == 0 { return Err(anyhow::anyhow!( "no instances found for endpoint {:?}", - self.client.endpoint.etcd_root() + self.client.endpoint )); } let counter = rand::rng().random::(); let offset = counter % count as u64; - instances[offset as usize].id() + instances[offset as usize].key.instance_id().unwrap() }; tracing::trace!("random router selected {instance_id}"); - let subject = self.client.endpoint.subject_to(instance_id); + let subject = self.client.endpoint.to_string(); let request = request.map(|req| 
AddressedRequest::new(req, subject)); self.addressed.generate(request).await @@ -156,24 +157,24 @@ where ) -> anyhow::Result> { let found = { let instances = self.client.instances(); - instances.iter().any(|ep| ep.id() == instance_id) + instances.iter().any(|ep| ep.key.instance_id() == Some(instance_id)) }; if !found { return Err(anyhow::anyhow!( "instance_id={instance_id} not found for endpoint {:?}", - self.client.endpoint.etcd_root() + self.client.endpoint )); } - let subject = self.client.endpoint.subject_to(instance_id); + let subject = self.client.endpoint.to_string(); let request = request.map(|req| AddressedRequest::new(req, subject)); self.addressed.generate(request).await } pub async fn r#static(&self, request: SingleIn) -> anyhow::Result> { - let subject = self.client.endpoint.subject(); + let subject = self.client.endpoint.to_string(); tracing::debug!("static got subject: {subject}"); let request = request.map(|req| AddressedRequest::new(req, subject)); tracing::debug!("router generate"); diff --git a/lib/runtime/src/slug.rs b/lib/runtime/src/slug.rs index 571ac5eeb1..ad706252ac 100644 --- a/lib/runtime/src/slug.rs +++ b/lib/runtime/src/slug.rs @@ -21,7 +21,7 @@ const REPLACEMENT_CHAR: char = '_'; /// URL and NATS friendly string. /// Only a-z, 0-9, - and _. 
-#[derive(Serialize, Clone, Debug, Eq, PartialEq)] +#[derive(Serialize, Clone, Debug, Eq, PartialEq, Hash)] pub struct Slug(String); impl Slug { diff --git a/lib/runtime/src/transports/etcd.rs b/lib/runtime/src/transports/etcd.rs index 21cf1fbe86..5e20689f8c 100644 --- a/lib/runtime/src/transports/etcd.rs +++ b/lib/runtime/src/transports/etcd.rs @@ -179,6 +179,8 @@ impl Client { let id = lease_id.unwrap_or(self.lease_id()); let put_options = PutOptions::new().with_lease(id); + tracing::debug!("kv_create attempting to create key: {}, lease_id: {}", key, id); + // Build the transaction let txn = Txn::new() .when(vec![Compare::version(key.as_str(), CompareOp::Equal, 0)]) // Ensure the lock does not exist @@ -190,12 +192,14 @@ impl Client { let result = self.client.kv_client().txn(txn).await?; if result.succeeded() { + tracing::debug!("kv_create succeeded for key: {}", key); Ok(()) } else { + tracing::warn!("kv_create failed for key: {}", key); for resp in result.op_responses() { tracing::warn!("kv_create etcd op response: {resp:?}"); } - Err(error!("failed to create key")) + Err(error!("failed to create key: {}", key)) } } @@ -300,10 +304,12 @@ impl Client { } pub async fn kv_get_prefix(&self, prefix: impl AsRef) -> Result> { + // add trailing slash to only match string, not substring + // foo/bar/ -> matches foo/bar/baz but not foo/barbaz let mut get_response = self .client .kv_client() - .get(prefix.as_ref(), Some(GetOptions::new().with_prefix())) + .get(format!("{}/", prefix.as_ref()), Some(GetOptions::new().with_prefix())) .await?; Ok(get_response.take_kvs())