From 30107322c97e842cb1a51b05e9162f9ef8118f96 Mon Sep 17 00:00:00 2001 From: Nikolay Komarevskiy <90605504+nikolay-komarevskiy@users.noreply.github.com> Date: Wed, 31 Jul 2024 19:02:29 +0200 Subject: [PATCH 1/8] feat(BOUN-1168): add dynamic route provider (Discovery Library) (#568) --- CHANGELOG.md | 2 + Cargo.lock | 12 +- Cargo.toml | 8 +- ic-agent/Cargo.toml | 8 +- ic-agent/src/agent/builder.rs | 32 + .../dynamic_routing/dynamic_route_provider.rs | 704 ++++++++++++++++++ .../dynamic_routing/health_check.rs | 302 ++++++++ .../dynamic_routing/messages.rs | 16 + .../http_transport/dynamic_routing/mod.rs | 16 + .../http_transport/dynamic_routing/node.rs | 60 ++ .../dynamic_routing/nodes_fetch.rs | 178 +++++ .../snapshot/latency_based_routing.rs | 362 +++++++++ .../dynamic_routing/snapshot/mod.rs | 6 + .../snapshot/round_robin_routing.rs | 220 ++++++ .../snapshot/routing_snapshot.rs | 15 + .../dynamic_routing/test_utils.rs | 125 ++++ .../dynamic_routing/type_aliases.rs | 18 + ic-agent/src/agent/http_transport/mod.rs | 2 + 18 files changed, 2075 insertions(+), 11 deletions(-) create mode 100644 ic-agent/src/agent/http_transport/dynamic_routing/dynamic_route_provider.rs create mode 100644 ic-agent/src/agent/http_transport/dynamic_routing/health_check.rs create mode 100644 ic-agent/src/agent/http_transport/dynamic_routing/messages.rs create mode 100644 ic-agent/src/agent/http_transport/dynamic_routing/mod.rs create mode 100644 ic-agent/src/agent/http_transport/dynamic_routing/node.rs create mode 100644 ic-agent/src/agent/http_transport/dynamic_routing/nodes_fetch.rs create mode 100644 ic-agent/src/agent/http_transport/dynamic_routing/snapshot/latency_based_routing.rs create mode 100644 ic-agent/src/agent/http_transport/dynamic_routing/snapshot/mod.rs create mode 100644 ic-agent/src/agent/http_transport/dynamic_routing/snapshot/round_robin_routing.rs create mode 100644 ic-agent/src/agent/http_transport/dynamic_routing/snapshot/routing_snapshot.rs create mode 100644 
ic-agent/src/agent/http_transport/dynamic_routing/test_utils.rs create mode 100644 ic-agent/src/agent/http_transport/dynamic_routing/type_aliases.rs diff --git a/CHANGELOG.md b/CHANGELOG.md index f95901ec..09e8a970 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## Unreleased +## [0.37.1] - 2024-07-25 + * Bug fix: Add `api/v2` prefix to read_state requests for hyper transport ## [0.37.0] - 2024-07-23 diff --git a/Cargo.lock b/Cargo.lock index 7a31d329..c2e4fb46 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -1083,7 +1083,7 @@ dependencies = [ [[package]] name = "ic-agent" -version = "0.37.0" +version = "0.37.1" dependencies = [ "async-lock", "backoff", @@ -1148,7 +1148,7 @@ dependencies = [ [[package]] name = "ic-identity-hsm" -version = "0.37.0" +version = "0.37.1" dependencies = [ "hex", "ic-agent", @@ -1160,7 +1160,7 @@ dependencies = [ [[package]] name = "ic-transport-types" -version = "0.37.0" +version = "0.37.1" dependencies = [ "candid", "hex", @@ -1176,7 +1176,7 @@ dependencies = [ [[package]] name = "ic-utils" -version = "0.37.0" +version = "0.37.1" dependencies = [ "async-trait", "candid", @@ -1239,7 +1239,7 @@ dependencies = [ [[package]] name = "icx" -version = "0.37.0" +version = "0.37.1" dependencies = [ "anyhow", "candid", @@ -1257,7 +1257,7 @@ dependencies = [ [[package]] name = "icx-cert" -version = "0.37.0" +version = "0.37.1" dependencies = [ "anyhow", "base64", diff --git a/Cargo.toml b/Cargo.toml index a435ed5e..535c20fa 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -11,7 +11,7 @@ members = [ ] [workspace.package] -version = "0.37.0" +version = "0.37.1" authors = ["DFINITY Stiftung "] edition = "2021" repository = "https://github.com/dfinity/agent-rs" @@ -22,9 +22,9 @@ rust-version = "1.75.0" license = "Apache-2.0" [workspace.dependencies] -ic-agent = { path = "ic-agent", version = "0.37.0", default-features = false } -ic-utils = { path = "ic-utils", 
version = "0.37.0" } -ic-transport-types = { path = "ic-transport-types", version = "0.37.0" } +ic-agent = { path = "ic-agent", version = "0.37.1", default-features = false } +ic-utils = { path = "ic-utils", version = "0.37.1" } +ic-transport-types = { path = "ic-transport-types", version = "0.37.1" } ic-certification = "2.2" candid = "0.10.1" diff --git a/ic-agent/Cargo.toml b/ic-agent/Cargo.toml index 18923d8e..6be9470f 100644 --- a/ic-agent/Cargo.toml +++ b/ic-agent/Cargo.toml @@ -74,8 +74,14 @@ hyper-rustls = { version = "0.27", default-features = false, features = [ "http1", "http2", ], optional = true } -tokio = { version = "1.24.2", features = ["time"] } +tokio = { version = "1.24.2", features = ["macros", "time"] } tower = { version = "0.4.13", optional = true } +async-trait = "^0.1.0" +tracing = "^0.1.0" +arc-swap = "^1.0.0" +simple_moving_average = "^1.0.0" +tracing-subscriber = "^0.2.0" +tokio-util = { version = "^0.7.0", features = ["rt"] } rustls-webpki = "0.102" [target.'cfg(target_family = "wasm")'.dependencies] diff --git a/ic-agent/src/agent/builder.rs b/ic-agent/src/agent/builder.rs index 77c998d4..02a5d037 100644 --- a/ic-agent/src/agent/builder.rs +++ b/ic-agent/src/agent/builder.rs @@ -16,6 +16,38 @@ impl AgentBuilder { Agent::new(self.config) } + #[cfg(all(feature = "reqwest", not(target_family = "wasm")))] + /// Set the dynamic transport layer for the [`Agent`], performing continuous discovery of the API boundary nodes and routing traffic via them based on the latencies. + pub async fn with_discovery_transport(self, client: reqwest::Client) -> Self { + use crate::agent::http_transport::{ + dynamic_routing::{ + dynamic_route_provider::{DynamicRouteProviderBuilder, IC0_SEED_DOMAIN}, + node::Node, + snapshot::latency_based_routing::LatencyRoutingSnapshot, + }, + route_provider::RouteProvider, + ReqwestTransport, + }; + + // TODO: This is a temporary solution to get the seed node. 
+ let seed = Node::new(IC0_SEED_DOMAIN).unwrap(); + + let route_provider = DynamicRouteProviderBuilder::new( + LatencyRoutingSnapshot::new(), + vec![seed], + client.clone(), + ) + .build() + .await; + + let route_provider = Arc::new(route_provider) as Arc; + + let transport = ReqwestTransport::create_with_client_route(route_provider, client) + .expect("failed to create transport"); + + self.with_transport(transport) + } + /// Set the URL of the [Agent]. #[cfg(feature = "reqwest")] pub fn with_url>(self, url: S) -> Self { diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/dynamic_route_provider.rs b/ic-agent/src/agent/http_transport/dynamic_routing/dynamic_route_provider.rs new file mode 100644 index 00000000..cb657ae2 --- /dev/null +++ b/ic-agent/src/agent/http_transport/dynamic_routing/dynamic_route_provider.rs @@ -0,0 +1,704 @@ +//! An implementation of the [`RouteProvider`](crate::agent::http_transport::route_provider::RouteProvider) for dynamic generation of routing urls. 
+ +use std::{ + sync::Arc, + time::{Duration, Instant}, +}; + +use arc_swap::ArcSwap; +use candid::Principal; +use reqwest::Client; +use thiserror::Error; +use tokio::{ + runtime::Handle, + sync::{mpsc, watch}, + time::timeout, +}; +use tokio_util::{sync::CancellationToken, task::TaskTracker}; +use tracing::{error, info, warn}; +use url::Url; + +use crate::{ + agent::http_transport::{ + dynamic_routing::{ + health_check::{HealthCheck, HealthChecker, HealthManagerActor}, + messages::FetchedNodes, + node::Node, + nodes_fetch::{Fetch, NodesFetchActor, NodesFetcher}, + snapshot::routing_snapshot::RoutingSnapshot, + type_aliases::AtomicSwap, + }, + route_provider::RouteProvider, + }, + AgentError, +}; + +/// Domain of the default seed node used to bootstrap the discovery of API boundary nodes. +pub const IC0_SEED_DOMAIN: &str = "ic0.app"; + +const MAINNET_ROOT_SUBNET_ID: &str = + "tdb26-jop6k-aogll-7ltgs-eruif-6kk7m-qpktf-gdiqx-mxtrf-vb5e6-eqe"; + +const FETCH_PERIOD: Duration = Duration::from_secs(5); +const FETCH_RETRY_INTERVAL: Duration = Duration::from_millis(250); +const TIMEOUT_AWAIT_HEALTHY_SEED: Duration = Duration::from_millis(1000); +const HEALTH_CHECK_TIMEOUT: Duration = Duration::from_secs(2); +const HEALTH_CHECK_PERIOD: Duration = Duration::from_secs(1); + +const DYNAMIC_ROUTE_PROVIDER: &str = "DynamicRouteProvider"; + +/// A dynamic route provider. +/// It spawns the discovery service (`NodesFetchActor`) for fetching the latest nodes topology. +/// It also spawns the `HealthManagerActor`, which orchestrates the health check tasks for each node and updates routing snapshot. +#[derive(Debug)] +pub struct DynamicRouteProvider { + /// Fetcher for fetching the latest nodes topology. + fetcher: Arc, + /// Periodicity of fetching the latest nodes topology. + fetch_period: Duration, + /// Interval for retrying fetching the nodes in case of error. 
+ fetch_retry_interval: Duration, + /// Health checker for checking the health of the nodes. + checker: Arc, + /// Periodicity of checking the health of the nodes. 
+ check_period: Duration, + /// Snapshot of the routing nodes. + routing_snapshot: AtomicSwap, + /// Task tracker for managing the spawned tasks. + tracker: TaskTracker, + /// Initial seed nodes, which are used for the initial fetching of the nodes. + seeds: Vec, + /// Cancellation token for stopping the spawned tasks. + token: CancellationToken, +} + +/// An error that occurred when the DynamicRouteProvider service was running. +#[derive(Error, Debug)] +pub enum DynamicRouteProviderError { + /// An error when fetching topology of the API nodes. + #[error("An error when fetching API nodes: {0}")] + NodesFetchError(String), + /// An error when checking API node's health. + #[error("An error when checking API node's health: {0}")] + HealthCheckError(String), + /// An invalid domain name provided. + #[error("Provided domain name is invalid: {0}")] + InvalidDomainName(String), +} + +/// A builder for the `DynamicRouteProvider`. +pub struct DynamicRouteProviderBuilder { + fetcher: Arc, + fetch_period: Duration, + fetch_retry_interval: Duration, + checker: Arc, + check_period: Duration, + routing_snapshot: AtomicSwap, + seeds: Vec, +} + +impl DynamicRouteProviderBuilder { + /// Creates a new instance of the builder. + pub fn new(snapshot: S, seeds: Vec, http_client: Client) -> Self { + let fetcher = Arc::new(NodesFetcher::new( + http_client.clone(), + Principal::from_text(MAINNET_ROOT_SUBNET_ID).unwrap(), + None, + )); + let checker = Arc::new(HealthChecker::new(http_client, HEALTH_CHECK_TIMEOUT)); + Self { + fetcher, + fetch_period: FETCH_PERIOD, + fetch_retry_interval: FETCH_RETRY_INTERVAL, + checker, + check_period: HEALTH_CHECK_PERIOD, + seeds, + routing_snapshot: Arc::new(ArcSwap::from_pointee(snapshot)), + } + } + + /// Sets the fetcher of the nodes in the topology. + pub fn with_fetcher(mut self, fetcher: Arc) -> Self { + self.fetcher = fetcher; + self + } + + /// Sets the fetching periodicity. 
+ pub fn with_fetch_period(mut self, period: Duration) -> Self { + self.fetch_period = period; + self + } + + /// Sets the node health checker. + pub fn with_checker(mut self, checker: Arc) -> Self { + self.checker = checker; + self + } + + /// Sets the periodicity of node health checking. + pub fn with_check_period(mut self, period: Duration) -> Self { + self.check_period = period; + self + } + + /// Builds an instance of the `DynamicRouteProvider`. + pub async fn build(self) -> DynamicRouteProvider + where + S: RoutingSnapshot + 'static, + { + let route_provider = DynamicRouteProvider { + fetcher: self.fetcher, + fetch_period: self.fetch_period, + fetch_retry_interval: self.fetch_retry_interval, + checker: self.checker, + check_period: self.check_period, + routing_snapshot: self.routing_snapshot, + tracker: TaskTracker::new(), + seeds: self.seeds, + token: CancellationToken::new(), + }; + + route_provider.run().await; + + route_provider + } +} + +impl RouteProvider for DynamicRouteProvider +where + S: RoutingSnapshot + 'static, +{ + fn route(&self) -> Result { + let snapshot = self.routing_snapshot.load(); + let node = snapshot.next().ok_or_else(|| { + AgentError::RouteProviderError("No healthy API nodes found.".to_string()) + })?; + Ok(node.to_routing_url()) + } +} + +impl DynamicRouteProvider +where + S: RoutingSnapshot + 'static, +{ + /// Starts two background tasks: + /// - Task1: NodesFetchActor + /// - Periodically fetches existing API nodes (gets latest nodes topology) and sends discovered nodes to HealthManagerActor. + /// - Task2: HealthManagerActor: + /// - Listens to the fetched nodes messages from the NodesFetchActor. + /// - Starts/stops health check tasks (HealthCheckActors) based on the newly added/removed nodes. + /// - These spawned health check tasks periodically update the snapshot with the latest node health info. 
+ pub async fn run(&self) { + info!("{DYNAMIC_ROUTE_PROVIDER}: started ..."); + // Communication channel between NodesFetchActor and HealthManagerActor. + let (fetch_sender, fetch_receiver) = watch::channel(None); + + // Communication channel with HealthManagerActor to receive info about healthy seed nodes (used only once). + let (init_sender, mut init_receiver) = mpsc::channel(1); + + // Start the receiving part first. + let health_manager_actor = HealthManagerActor::new( + Arc::clone(&self.checker), + self.check_period, + Arc::clone(&self.routing_snapshot), + fetch_receiver, + init_sender, + self.token.clone(), + ); + self.tracker + .spawn(async move { health_manager_actor.run().await }); + + // Dispatch all seed nodes for initial health checks + if let Err(err) = fetch_sender.send(Some(FetchedNodes { + nodes: self.seeds.clone(), + })) { + error!("{DYNAMIC_ROUTE_PROVIDER}: failed to send results to HealthManager: {err:?}"); + } + + // Wait for healthy seeds, but no longer than TIMEOUT_AWAIT_HEALTHY_SEED. + let start = Instant::now(); + match timeout(TIMEOUT_AWAIT_HEALTHY_SEED, init_receiver.recv()).await { + Ok(_) => info!( + "{DYNAMIC_ROUTE_PROVIDER}: found healthy seeds within {:?}", + start.elapsed() + ), + Err(_) => warn!( + "{DYNAMIC_ROUTE_PROVIDER}: no healthy seeds found within {:?}", + start.elapsed() + ), + }; + // The init channel is used only once, so it can be closed now. + init_receiver.close(); + + let fetch_actor = NodesFetchActor::new( + Arc::clone(&self.fetcher), + self.fetch_period, + self.fetch_retry_interval, + fetch_sender, + Arc::clone(&self.routing_snapshot), + self.token.clone(), + ); + self.tracker.spawn(async move { fetch_actor.run().await }); + info!( + "{DYNAMIC_ROUTE_PROVIDER}: NodesFetchActor and HealthManagerActor started successfully" + ); + } +} + +// Gracefully stop the inner spawned tasks running in the background. 
+impl Drop for DynamicRouteProvider { + fn drop(&mut self) { + self.token.cancel(); + self.tracker.close(); + let tracker = self.tracker.clone(); + // If no runtime is available do nothing. + if let Ok(handle) = Handle::try_current() { + handle.spawn(async move { + tracker.wait().await; + warn!("{DYNAMIC_ROUTE_PROVIDER}: stopped gracefully"); + }); + } else { + error!("{DYNAMIC_ROUTE_PROVIDER}: no runtime available, cannot stop the spawned tasks"); + } + } +} + +#[cfg(test)] +mod tests { + use candid::Principal; + use reqwest::Client; + use std::{ + sync::{Arc, Once}, + time::{Duration, Instant}, + }; + use tracing::Level; + use tracing_subscriber::FmtSubscriber; + + use crate::{ + agent::http_transport::{ + dynamic_routing::{ + dynamic_route_provider::{ + DynamicRouteProviderBuilder, IC0_SEED_DOMAIN, MAINNET_ROOT_SUBNET_ID, + }, + node::Node, + snapshot::{ + latency_based_routing::LatencyRoutingSnapshot, + round_robin_routing::RoundRobinRoutingSnapshot, + }, + test_utils::{ + assert_routed_domains, route_n_times, NodeHealthCheckerMock, NodesFetcherMock, + }, + }, + route_provider::RouteProvider, + ReqwestTransport, + }, + Agent, AgentError, + }; + + static TRACING_INIT: Once = Once::new(); + + pub fn setup_tracing() { + TRACING_INIT.call_once(|| { + FmtSubscriber::builder().with_max_level(Level::TRACE).init(); + }); + } + + async fn assert_no_routing_via_domains( + route_provider: Arc, + excluded_domains: Vec<&str>, + timeout: Duration, + route_call_interval: Duration, + ) { + if excluded_domains.is_empty() { + panic!("List of excluded domains can't be empty"); + } + + let route_calls = 30; + let start = Instant::now(); + + while start.elapsed() < timeout { + let routed_domains = (0..route_calls) + .map(|_| { + route_provider.route().map(|url| { + let domain = url.domain().expect("no domain name in url"); + domain.to_string() + }) + }) + .collect::, _>>() + .unwrap_or_default(); + + // Exit when excluded domains are not used for routing any more. 
+ if !routed_domains.is_empty() + && !routed_domains + .iter() + .any(|d| excluded_domains.contains(&d.as_str())) + { + return; + } + + tokio::time::sleep(route_call_interval).await; + } + panic!("Expected excluded domains {excluded_domains:?} are still observed in routing over the last {route_calls} calls"); + } + + #[tokio::test] + async fn test_mainnet() { + // Setup. + setup_tracing(); + let seed = Node::new(IC0_SEED_DOMAIN).unwrap(); + let client = Client::builder().build().unwrap(); + let route_provider = DynamicRouteProviderBuilder::new( + LatencyRoutingSnapshot::new(), + vec![seed], + client.clone(), + ) + .build() + .await; + let route_provider = Arc::new(route_provider) as Arc; + let transport = + ReqwestTransport::create_with_client_route(Arc::clone(&route_provider), client) + .expect("failed to create transport"); + let agent = Agent::builder() + .with_transport(transport) + .build() + .expect("failed to create an agent"); + let subnet_id = Principal::from_text(MAINNET_ROOT_SUBNET_ID).unwrap(); + // Assert that seed (ic0.app) is not used for routing. Henceforth, only discovered API nodes are used. + assert_no_routing_via_domains( + Arc::clone(&route_provider), + vec![IC0_SEED_DOMAIN], + Duration::from_secs(40), + Duration::from_secs(2), + ) + .await; + // Act: perform /read_state call via dynamically discovered API BNs. + let api_bns = agent + .fetch_api_boundary_nodes_by_subnet_id(subnet_id) + .await + .expect("failed to fetch api boundary nodes"); + assert!(!api_bns.is_empty()); + } + + #[tokio::test] + async fn test_routing_with_topology_and_node_health_updates() { + // Setup. + setup_tracing(); + let node_1 = Node::new(IC0_SEED_DOMAIN).unwrap(); + // Set nodes fetching params: topology, fetching periodicity. + let fetcher = Arc::new(NodesFetcherMock::new()); + fetcher.overwrite_nodes(vec![node_1.clone()]); + let fetch_interval = Duration::from_secs(2); + // Set health checking params: healthy nodes, checking periodicity. 
+ let checker = Arc::new(NodeHealthCheckerMock::new()); + let check_interval = Duration::from_secs(1); + // A single healthy node exists in the topology. This node happens to be the seed node. + fetcher.overwrite_nodes(vec![node_1.clone()]); + checker.overwrite_healthy_nodes(vec![node_1.clone()]); + // Configure RouteProvider + let snapshot = RoundRobinRoutingSnapshot::new(); + let client = Client::builder().build().unwrap(); + let route_provider = + DynamicRouteProviderBuilder::new(snapshot, vec![node_1.clone()], client) + .with_fetcher(fetcher.clone()) + .with_checker(checker.clone()) + .with_fetch_period(fetch_interval) + .with_check_period(check_interval) + .build() + .await; + let route_provider = Arc::new(route_provider); + + // This time span is required for the snapshot to be fully updated with the new nodes and their health info. + let snapshot_update_duration = fetch_interval + 2 * check_interval; + + // Test 1: multiple route() calls return a single domain=ic0.app. + // Only a single node exists, which is initially healthy. + tokio::time::sleep(snapshot_update_duration).await; + let routed_domains = route_n_times(6, Arc::clone(&route_provider)); + assert_routed_domains(routed_domains, vec![node_1.domain()], 6); + + // Test 2: multiple route() calls return 3 different domains with equal fairness (repetition). + // Two healthy nodes are added to the topology. + let node_2 = Node::new("api1.com").unwrap(); + let node_3 = Node::new("api2.com").unwrap(); + checker.overwrite_healthy_nodes(vec![node_1.clone(), node_2.clone(), node_3.clone()]); + fetcher.overwrite_nodes(vec![node_1.clone(), node_2.clone(), node_3.clone()]); + tokio::time::sleep(snapshot_update_duration).await; + let routed_domains = route_n_times(6, Arc::clone(&route_provider)); + assert_routed_domains( + routed_domains, + vec![node_1.domain(), node_2.domain(), node_3.domain()], + 2, + ); + + // Test 3: multiple route() calls return 2 different domains with equal fairness (repetition). 
+ // One node is set to unhealthy. + checker.overwrite_healthy_nodes(vec![node_1.clone(), node_3.clone()]); + tokio::time::sleep(snapshot_update_duration).await; + let routed_domains = route_n_times(6, Arc::clone(&route_provider)); + assert_routed_domains(routed_domains, vec![node_1.domain(), node_3.domain()], 3); + + // Test 4: multiple route() calls return 3 different domains with equal fairness (repetition). + // Unhealthy node is set back to healthy. + checker.overwrite_healthy_nodes(vec![node_1.clone(), node_2.clone(), node_3.clone()]); + tokio::time::sleep(snapshot_update_duration).await; + let routed_domains = route_n_times(6, Arc::clone(&route_provider)); + assert_routed_domains( + routed_domains, + vec![node_1.domain(), node_2.domain(), node_3.domain()], + 2, + ); + + // Test 5: multiple route() calls return 3 different domains with equal fairness (repetition). + // One healthy node is added, but another one goes unhealthy. + let node_4 = Node::new("api3.com").unwrap(); + checker.overwrite_healthy_nodes(vec![node_2.clone(), node_3.clone(), node_4.clone()]); + fetcher.overwrite_nodes(vec![ + node_1.clone(), + node_2.clone(), + node_3.clone(), + node_4.clone(), + ]); + tokio::time::sleep(snapshot_update_duration).await; + let routed_domains = route_n_times(6, Arc::clone(&route_provider)); + assert_routed_domains( + routed_domains, + vec![node_2.domain(), node_3.domain(), node_4.domain()], + 2, + ); + + // Test 6: multiple route() calls return a single domain=api1.com. + // One node is set to unhealthy and one is removed from the topology. 
+ checker.overwrite_healthy_nodes(vec![node_2.clone(), node_3.clone()]); + fetcher.overwrite_nodes(vec![node_1.clone(), node_2.clone(), node_4.clone()]); + tokio::time::sleep(snapshot_update_duration).await; + let routed_domains = route_n_times(3, Arc::clone(&route_provider)); + assert_routed_domains(routed_domains, vec![node_2.domain()], 3); + } + + #[tokio::test] + async fn test_route_with_initially_unhealthy_seeds_becoming_healthy() { + // Setup. + setup_tracing(); + let node_1 = Node::new(IC0_SEED_DOMAIN).unwrap(); + let node_2 = Node::new("api1.com").unwrap(); + // Set nodes fetching params: topology, fetching periodicity. + let fetcher = Arc::new(NodesFetcherMock::new()); + let fetch_interval = Duration::from_secs(2); + // Set health checking params: healthy nodes, checking periodicity. + let checker = Arc::new(NodeHealthCheckerMock::new()); + let check_interval = Duration::from_secs(1); + // Two nodes exist, which are initially unhealthy. + fetcher.overwrite_nodes(vec![node_1.clone(), node_2.clone()]); + checker.overwrite_healthy_nodes(vec![]); + // Configure RouteProvider + let snapshot = RoundRobinRoutingSnapshot::new(); + let client = Client::builder().build().unwrap(); + let route_provider = DynamicRouteProviderBuilder::new( + snapshot, + vec![node_1.clone(), node_2.clone()], + client, + ) + .with_fetcher(fetcher) + .with_checker(checker.clone()) + .with_fetch_period(fetch_interval) + .with_check_period(check_interval) + .build() + .await; + let route_provider = Arc::new(route_provider); + + // Test 1: calls to route() return an error, as no healthy seeds exist. + for _ in 0..4 { + tokio::time::sleep(check_interval).await; + let result = route_provider.route(); + assert_eq!( + result.unwrap_err(), + AgentError::RouteProviderError("No healthy API nodes found.".to_string()) + ); + } + + // Test 2: calls to route() return both seeds, as they become healthy. 
+ checker.overwrite_healthy_nodes(vec![node_1.clone(), node_2.clone()]); + tokio::time::sleep(3 * check_interval).await; + let routed_domains = route_n_times(6, Arc::clone(&route_provider)); + assert_routed_domains(routed_domains, vec![node_1.domain(), node_2.domain()], 3); + } + + #[tokio::test] + async fn test_routing_with_no_healthy_nodes_returns_an_error() { + // Setup. + setup_tracing(); + let node_1 = Node::new(IC0_SEED_DOMAIN).unwrap(); + // Set nodes fetching params: topology, fetching periodicity. + let fetcher = Arc::new(NodesFetcherMock::new()); + let fetch_interval = Duration::from_secs(2); + // Set health checking params: healthy nodes, checking periodicity. + let checker = Arc::new(NodeHealthCheckerMock::new()); + let check_interval = Duration::from_secs(1); + // A single seed node which is initially healthy. + fetcher.overwrite_nodes(vec![node_1.clone()]); + checker.overwrite_healthy_nodes(vec![node_1.clone()]); + // Configure RouteProvider + let snapshot = RoundRobinRoutingSnapshot::new(); + let client = Client::builder().build().unwrap(); + let route_provider = + DynamicRouteProviderBuilder::new(snapshot, vec![node_1.clone()], client) + .with_fetcher(fetcher) + .with_checker(checker.clone()) + .with_fetch_period(fetch_interval) + .with_check_period(check_interval) + .build() + .await; + let route_provider = Arc::new(route_provider); + + // Test 1: multiple route() calls return a single domain=ic0.app, as the seed is healthy. + tokio::time::sleep(2 * check_interval).await; + let routed_domains = route_n_times(3, Arc::clone(&route_provider)); + assert_routed_domains(routed_domains, vec![node_1.domain()], 3); + + // Test 2: calls to route() return an error, as no healthy nodes exist. 
+ checker.overwrite_healthy_nodes(vec![]); + tokio::time::sleep(2 * check_interval).await; + for _ in 0..4 { + let result = route_provider.route(); + assert_eq!( + result.unwrap_err(), + AgentError::RouteProviderError("No healthy API nodes found.".to_string()) + ); + } + } + + #[tokio::test] + async fn test_route_with_no_healthy_seeds_errors() { + // Setup. + setup_tracing(); + let node_1 = Node::new(IC0_SEED_DOMAIN).unwrap(); + // Set nodes fetching params: topology, fetching periodicity. + let fetcher = Arc::new(NodesFetcherMock::new()); + let fetch_interval = Duration::from_secs(2); + // Set health checking params: healthy nodes, checking periodicity. + let checker = Arc::new(NodeHealthCheckerMock::new()); + let check_interval = Duration::from_secs(1); + // No healthy seed nodes present, this should lead to errors. + fetcher.overwrite_nodes(vec![]); + checker.overwrite_healthy_nodes(vec![]); + // Configure RouteProvider + let snapshot = RoundRobinRoutingSnapshot::new(); + let client = Client::builder().build().unwrap(); + let route_provider = + DynamicRouteProviderBuilder::new(snapshot, vec![node_1.clone()], client) + .with_fetcher(fetcher) + .with_checker(checker) + .with_fetch_period(fetch_interval) + .with_check_period(check_interval) + .build() + .await; + + // Test: calls to route() return an error, as no healthy seeds exist. + for _ in 0..4 { + tokio::time::sleep(check_interval).await; + let result = route_provider.route(); + assert_eq!( + result.unwrap_err(), + AgentError::RouteProviderError("No healthy API nodes found.".to_string()) + ); + } + } + + #[tokio::test] + async fn test_route_with_one_healthy_and_one_unhealthy_seed() { + // Setup. + setup_tracing(); + let node_1 = Node::new(IC0_SEED_DOMAIN).unwrap(); + let node_2 = Node::new("api1.com").unwrap(); + // Set nodes fetching params: topology, fetching periodicity. 
+ let fetcher = Arc::new(NodesFetcherMock::new()); + let fetch_interval = Duration::from_secs(2); + // Set health checking params: healthy nodes, checking periodicity. + let checker = Arc::new(NodeHealthCheckerMock::new()); + let check_interval = Duration::from_secs(1); + // One healthy seed is present, it should be discovered during the initialization time. + fetcher.overwrite_nodes(vec![node_1.clone(), node_2.clone()]); + checker.overwrite_healthy_nodes(vec![node_1.clone()]); + // Configure RouteProvider + let snapshot = RoundRobinRoutingSnapshot::new(); + let client = Client::builder().build().unwrap(); + let route_provider = DynamicRouteProviderBuilder::new( + snapshot, + vec![node_1.clone(), node_2.clone()], + client, + ) + .with_fetcher(fetcher) + .with_checker(checker.clone()) + .with_fetch_period(fetch_interval) + .with_check_period(check_interval) + .build() + .await; + let route_provider = Arc::new(route_provider); + + // Test 1: calls to route() return only a healthy seed ic0.app. + let routed_domains = route_n_times(3, Arc::clone(&route_provider)); + assert_routed_domains(routed_domains, vec![node_1.domain()], 3); + + // Test 2: calls to route() return two healthy seeds, as the unhealthy seed becomes healthy. + checker.overwrite_healthy_nodes(vec![node_1.clone(), node_2.clone()]); + tokio::time::sleep(2 * check_interval).await; + let routed_domains = route_n_times(6, Arc::clone(&route_provider)); + assert_routed_domains(routed_domains, vec![node_1.domain(), node_2.domain()], 3); + } + + #[tokio::test] + async fn test_routing_with_an_empty_fetched_list_of_api_nodes() { + // Check resiliency to an empty list of fetched API nodes (this should never happen in normal IC operation). + // Setup. + setup_tracing(); + let node_1 = Node::new(IC0_SEED_DOMAIN).unwrap(); + // Set nodes fetching params: topology, fetching periodicity. 
+ let fetcher = Arc::new(NodesFetcherMock::new()); + let fetch_interval = Duration::from_secs(2); + // Set health checking params: healthy nodes, checking periodicity. + let checker = Arc::new(NodeHealthCheckerMock::new()); + let check_interval = Duration::from_secs(1); + // One healthy seed is initially present, but the topology has no node. + fetcher.overwrite_nodes(vec![]); + checker.overwrite_healthy_nodes(vec![node_1.clone()]); + // Configure RouteProvider + let snapshot = RoundRobinRoutingSnapshot::new(); + let client = Client::builder().build().unwrap(); + let route_provider = + DynamicRouteProviderBuilder::new(snapshot, vec![node_1.clone()], client) + .with_fetcher(fetcher.clone()) + .with_checker(checker.clone()) + .with_fetch_period(fetch_interval) + .with_check_period(check_interval) + .build() + .await; + let route_provider = Arc::new(route_provider); + + // This time span is required for the snapshot to be fully updated with the new nodes topology and health info. + let snapshot_update_duration = fetch_interval + 2 * check_interval; + + // Test 1: multiple route() calls return a single domain=ic0.app. + // HealthManagerActor shouldn't update the snapshot, if the list of fetched nodes is empty, thus we observe the healthy seed. + tokio::time::sleep(snapshot_update_duration).await; + let routed_domains = route_n_times(3, Arc::clone(&route_provider)); + assert_routed_domains(routed_domains, vec![node_1.domain()], 3); + + // Test 2: multiple route() calls should now return 3 different domains with equal fairness (repetition). + // Three nodes are added to the topology, i.e. now the fetched nodes list is non-empty. 
+ let node_2 = Node::new("api1.com").unwrap(); + let node_3 = Node::new("api2.com").unwrap(); + fetcher.overwrite_nodes(vec![node_1.clone(), node_2.clone(), node_3.clone()]); + checker.overwrite_healthy_nodes(vec![node_1.clone(), node_2.clone(), node_3.clone()]); + tokio::time::sleep(snapshot_update_duration).await; + let routed_domains = route_n_times(6, Arc::clone(&route_provider)); + assert_routed_domains( + routed_domains, + vec![node_1.domain(), node_2.domain(), node_3.domain()], + 2, + ); + } +} + +// - none of the seeds [] are healthy +// - none of the API node [] is healthy +// - return a vector of errors: HealthCheckErrors, FetchErrors, etc. diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/health_check.rs b/ic-agent/src/agent/http_transport/dynamic_routing/health_check.rs new file mode 100644 index 00000000..491f010b --- /dev/null +++ b/ic-agent/src/agent/http_transport/dynamic_routing/health_check.rs @@ -0,0 +1,302 @@ +use async_trait::async_trait; +use http::{Method, StatusCode}; +use reqwest::{Client, Request}; +use std::{ + fmt::Debug, + sync::Arc, + time::{Duration, Instant}, +}; +use tokio::{sync::mpsc, time}; +use tokio_util::{sync::CancellationToken, task::TaskTracker}; +use tracing::{debug, error, info, warn}; +use url::Url; + +use crate::agent::http_transport::dynamic_routing::{ + dynamic_route_provider::DynamicRouteProviderError, + messages::{FetchedNodes, NodeHealthState}, + node::Node, + snapshot::routing_snapshot::RoutingSnapshot, + type_aliases::{AtomicSwap, ReceiverMpsc, ReceiverWatch, SenderMpsc}, +}; + +const CHANNEL_BUFFER: usize = 128; + +/// A trait representing a health check of the node. +#[async_trait] +pub trait HealthCheck: Send + Sync + Debug { + /// Checks the health of the node. + async fn check(&self, node: &Node) -> Result; +} + +/// A struct representing the health check status of the node. 
+#[derive(Clone, PartialEq, Debug, Default)]
+pub struct HealthCheckStatus {
+    /// Latency of the last successful health check; `None` means the node is not healthy.
+    latency: Option<Duration>,
+}
+
+impl HealthCheckStatus {
+    /// Creates a new `HealthCheckStatus` instance.
+    pub fn new(latency: Option<Duration>) -> Self {
+        Self { latency }
+    }
+
+    /// Checks if the node is healthy, i.e. a latency measurement is present.
+    pub fn is_healthy(&self) -> bool {
+        self.latency.is_some()
+    }
+
+    /// Get the latency of the health check.
+    pub fn latency(&self) -> Option<Duration> {
+        self.latency
+    }
+}
+
+/// A struct implementing the `HealthCheck` for the nodes.
+#[derive(Debug)]
+pub struct HealthChecker {
+    /// HTTP client used to execute the health check requests.
+    http_client: Client,
+    /// Timeout applied to every single health check request.
+    timeout: Duration,
+}
+
+impl HealthChecker {
+    /// Creates a new `HealthChecker` instance.
+    pub fn new(http_client: Client, timeout: Duration) -> Self {
+        Self {
+            http_client,
+            timeout,
+        }
+    }
+}
+
+const HEALTH_CHECKER: &str = "HealthChecker";
+
+#[async_trait]
+impl HealthCheck for HealthChecker {
+    /// Sends `GET https://<domain>/health` and measures the time until the response arrives.
+    ///
+    /// # Errors
+    /// Returns `DynamicRouteProviderError::HealthCheckError` if the URL can't be built, the request
+    /// fails (including a timeout), or the status code is not 204.
+    async fn check(&self, node: &Node) -> Result<HealthCheckStatus, DynamicRouteProviderError> {
+        // API boundary node exposes /health endpoint and should respond with 204 (No Content) if it's healthy.
+        // A malformed URL is reported as a failed check rather than panicking inside the actor task.
+        let url = Url::parse(&format!("https://{}/health", node.domain())).map_err(|err| {
+            DynamicRouteProviderError::HealthCheckError(format!(
+                "{HEALTH_CHECKER}: Failed to build the health check url for {node:?}: {err}"
+            ))
+        })?;
+
+        let mut request = Request::new(Method::GET, url.clone());
+        *request.timeout_mut() = Some(self.timeout);
+
+        let start = Instant::now();
+        let response = self.http_client.execute(request).await.map_err(|err| {
+            DynamicRouteProviderError::HealthCheckError(format!(
+                "Failed to execute GET request to {url}: {err}"
+            ))
+        })?;
+        let latency = start.elapsed();
+
+        if response.status() != StatusCode::NO_CONTENT {
+            let err_msg = format!(
+                "{HEALTH_CHECKER}: Unexpected http status code {} for url={url} received",
+                response.status()
+            );
+            // Log the text as the event message, not as an anonymous field.
+            error!("{err_msg}");
+            return Err(DynamicRouteProviderError::HealthCheckError(err_msg));
+        }
+
+        Ok(HealthCheckStatus::new(Some(latency)))
+    }
+}
+
+const HEALTH_CHECK_ACTOR: &str = "HealthCheckActor";
+
+/// A struct performing the health check of the node and sending the health status to the listener.
+struct HealthCheckActor {
+    /// The health checker.
+    checker: Arc<dyn HealthCheck>,
+    /// The period of the health check.
+    period: Duration,
+    /// The node to check.
+    node: Node,
+    /// The sender channel (listener) to send the health status.
+    sender_channel: SenderMpsc<NodeHealthState>,
+    /// The cancellation token of the actor.
+    token: CancellationToken,
+}
+
+impl HealthCheckActor {
+    fn new(
+        checker: Arc<dyn HealthCheck>,
+        period: Duration,
+        node: Node,
+        sender_channel: SenderMpsc<NodeHealthState>,
+        token: CancellationToken,
+    ) -> Self {
+        Self {
+            checker,
+            period,
+            node,
+            sender_channel,
+            token,
+        }
+    }
+
+    /// Runs the actor: checks the node once per period and reports the result to the listener,
+    /// until the token is cancelled or the listener goes away.
+    async fn run(self) {
+        let mut interval = time::interval(self.period);
+        loop {
+            tokio::select! {
+                _ = interval.tick() => {
+                    // A failed check is reported as the default (unhealthy) status.
+                    let health = self.checker.check(&self.node).await.unwrap_or_default();
+                    let message = NodeHealthState {
+                        node: self.node.clone(),
+                        health,
+                    };
+                    // Inform the listener about node's health. It can only fail if the listener was closed/dropped;
+                    // then nobody is left to report to, so the actor stops instead of panicking.
+                    // The send is raced against cancellation, so a full channel can't stall the shutdown.
+                    tokio::select! {
+                        result = self.sender_channel.send(message) => {
+                            if result.is_err() {
+                                warn!("{HEALTH_CHECK_ACTOR}: listener is closed, stopping health check for node {:?}", self.node);
+                                break;
+                            }
+                        }
+                        _ = self.token.cancelled() => {
+                            info!("{HEALTH_CHECK_ACTOR}: was gracefully cancelled for node {:?}", self.node);
+                            break;
+                        }
+                    }
+                }
+                _ = self.token.cancelled() => {
+                    info!("{HEALTH_CHECK_ACTOR}: was gracefully cancelled for node {:?}", self.node);
+                    break;
+                }
+            }
+        }
+    }
+}
+
+/// The name of the health manager actor.
+pub(super) const HEALTH_MANAGER_ACTOR: &str = "HealthManagerActor";
+
+/// A struct managing the health checks of the nodes.
+/// It receives the fetched nodes from the `NodesFetchActor` and starts the health checks for them.
+/// It also receives the health status of the nodes from the `HealthCheckActor/s` and updates the routing snapshot.
+pub(super) struct HealthManagerActor<S> {
+    /// The health checker.
+    checker: Arc<dyn HealthCheck>,
+    /// The period of the health check.
+    period: Duration,
+    /// The routing snapshot, storing the nodes.
+    routing_snapshot: AtomicSwap<S>,
+    /// The receiver channel to listen to the fetched nodes messages.
+    fetch_receiver: ReceiverWatch<FetchedNodes>,
+    /// The sender channel to send the health status of the nodes back to HealthManagerActor.
+    check_sender: SenderMpsc<NodeHealthState>,
+    /// The receiver channel to receive the health status of the nodes from the `HealthCheckActor/s`.
+    check_receiver: ReceiverMpsc<NodeHealthState>,
+    /// The sender channel to send the initialization status to DynamicRouteProvider (used only once in the init phase).
+    init_sender: SenderMpsc<bool>,
+    /// The cancellation token of the actor.
+    token: CancellationToken,
+    /// The cancellation token for all the health checks.
+    nodes_token: CancellationToken,
+    /// The task tracker of the health checks, waiting for the tasks to exit (graceful termination).
+    nodes_tracker: TaskTracker,
+    /// The flag indicating if this actor is initialized with healthy nodes.
+    is_initialized: bool,
+}
+
+impl<S> HealthManagerActor<S>
+where
+    S: RoutingSnapshot,
+{
+    /// Creates a new `HealthManagerActor` instance.
+    pub fn new(
+        checker: Arc<dyn HealthCheck>,
+        period: Duration,
+        routing_snapshot: AtomicSwap<S>,
+        fetch_receiver: ReceiverWatch<FetchedNodes>,
+        init_sender: SenderMpsc<bool>,
+        token: CancellationToken,
+    ) -> Self {
+        let (check_sender, check_receiver) = mpsc::channel(CHANNEL_BUFFER);
+
+        Self {
+            checker,
+            period,
+            routing_snapshot,
+            fetch_receiver,
+            check_sender,
+            check_receiver,
+            init_sender,
+            token,
+            nodes_token: CancellationToken::new(),
+            nodes_tracker: TaskTracker::new(),
+            is_initialized: false,
+        }
+    }
+
+    /// Runs the actor.
+    pub async fn run(mut self) {
+        loop {
+            tokio::select! {
+                // Process a new array of fetched nodes from NodesFetchActor, if it appeared in the channel.
+                result = self.fetch_receiver.changed() => {
+                    if let Err(err) = result {
+                        error!("{HEALTH_MANAGER_ACTOR}: nodes fetch sender has been dropped: {err:?}");
+                        // Cancelling the own token makes the cancellation branch below run on the next iteration.
+                        self.token.cancel();
+                        continue;
+                    }
+                    // Get the latest value from the channel and mark it as seen.
+                    let Some(FetchedNodes { nodes }) = self.fetch_receiver.borrow_and_update().clone() else { continue };
+                    self.handle_fetch_update(nodes).await;
+                }
+                // Receive health check messages from all running HealthCheckActor/s.
+                Some(msg) = self.check_receiver.recv() => {
+                    self.handle_health_update(msg).await;
+                }
+                _ = self.token.cancelled() => {
+                    self.stop_all_checks().await;
+                    self.check_receiver.close();
+                    warn!("{HEALTH_MANAGER_ACTOR}: was gracefully cancelled, all nodes health checks stopped");
+                    break;
+                }
+            }
+        }
+    }
+
+    /// Applies one node's health status to a copy of the snapshot and publishes the copy.
+    async fn handle_health_update(&mut self, msg: NodeHealthState) {
+        let current_snapshot = self.routing_snapshot.load_full();
+        let mut new_snapshot = (*current_snapshot).clone();
+        new_snapshot.update_node(&msg.node, msg.health.clone());
+        self.routing_snapshot.store(Arc::new(new_snapshot));
+        if !self.is_initialized && msg.health.is_healthy() {
+            self.is_initialized = true;
+            // If TIMEOUT_AWAIT_HEALTHY_SEED has been exceeded, the receiver was dropped and send would thus fail. We ignore the failure.
+            let _ = self.init_sender.send(true).await;
+        }
+    }
+
+    /// Syncs the snapshot with the fetched nodes and restarts the health checks if the set of nodes changed.
+    async fn handle_fetch_update(&mut self, nodes: Vec<Node>) {
+        if nodes.is_empty() {
+            // This is a bug in the IC registry. There should be at least one API Boundary Node in the registry.
+            // Updating nodes snapshot with an empty array, would lead to an irrecoverable error, as new nodes couldn't be fetched.
+            // We avoid such updates and just wait for a non-empty list.
+            error!("{HEALTH_MANAGER_ACTOR}: list of fetched nodes is empty");
+            return;
+        }
+        debug!("{HEALTH_MANAGER_ACTOR}: fetched nodes received {:?}", nodes);
+        let current_snapshot = self.routing_snapshot.load_full();
+        let mut new_snapshot = (*current_snapshot).clone();
+        // If the snapshot has changed, store it and restart all node's health checks.
+        if new_snapshot.sync_nodes(&nodes) {
+            self.routing_snapshot.store(Arc::new(new_snapshot));
+            self.stop_all_checks().await;
+            // The vector is owned here, so it is moved instead of being cloned once more.
+            self.start_checks(nodes);
+        }
+    }
+
+    /// Spawns one `HealthCheckActor` per node on the task tracker.
+    fn start_checks(&mut self, nodes: Vec<Node>) {
+        // Create a single cancellation token for all started health checks.
+        self.nodes_token = CancellationToken::new();
+        for node in nodes {
+            debug!("{HEALTH_MANAGER_ACTOR}: starting health check for node {node:?}");
+            let actor = HealthCheckActor::new(
+                Arc::clone(&self.checker),
+                self.period,
+                node,
+                self.check_sender.clone(),
+                self.nodes_token.clone(),
+            );
+            self.nodes_tracker.spawn(async move { actor.run().await });
+        }
+    }
+
+    /// Cancels all running health checks and waits until their tasks have exited.
+    async fn stop_all_checks(&self) {
+        warn!("{HEALTH_MANAGER_ACTOR}: stopping all running health checks");
+        self.nodes_token.cancel();
+        self.nodes_tracker.close();
+        self.nodes_tracker.wait().await;
+    }
+}
diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/messages.rs b/ic-agent/src/agent/http_transport/dynamic_routing/messages.rs
new file mode 100644
index 00000000..5feeae25
--- /dev/null
+++ b/ic-agent/src/agent/http_transport/dynamic_routing/messages.rs
@@ -0,0 +1,16 @@
+use crate::agent::http_transport::dynamic_routing::{health_check::HealthCheckStatus, node::Node};
+
+/// Represents a message with fetched nodes.
+#[derive(Debug, Clone)]
+pub struct FetchedNodes {
+    /// The fetched nodes.
+    pub nodes: Vec<Node>,
+}
+
+/// Represents a message with the health state of a node.
+pub struct NodeHealthState {
+    /// The node.
+    pub node: Node,
+    /// The health state of the node.
+    pub health: HealthCheckStatus,
+}
diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/mod.rs b/ic-agent/src/agent/http_transport/dynamic_routing/mod.rs
new file mode 100644
index 00000000..07570f0f
--- /dev/null
+++ b/ic-agent/src/agent/http_transport/dynamic_routing/mod.rs
@@ -0,0 +1,16 @@
+//! Dynamic routing implementation.
+pub mod dynamic_route_provider;
+/// Health check implementation.
+pub mod health_check;
+/// Messages used in dynamic routing.
+pub(super) mod messages;
+/// Node implementation.
+pub mod node;
+/// Nodes fetch implementation.
+pub mod nodes_fetch;
+/// Routing snapshot implementation.
+pub mod snapshot;
+#[cfg(test)]
+pub(super) mod test_utils;
+/// Type aliases used in dynamic routing.
+pub(super) mod type_aliases;
diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/node.rs b/ic-agent/src/agent/http_transport/dynamic_routing/node.rs
new file mode 100644
index 00000000..37716da3
--- /dev/null
+++ b/ic-agent/src/agent/http_transport/dynamic_routing/node.rs
@@ -0,0 +1,60 @@
+use url::Url;
+
+use crate::agent::{
+    http_transport::dynamic_routing::dynamic_route_provider::DynamicRouteProviderError,
+    ApiBoundaryNode,
+};
+
+/// Represents a node in the dynamic routing.
+#[derive(Debug, Clone, PartialEq, Eq, Hash)]
+pub struct Node {
+    domain: String,
+}
+
+impl Node {
+    /// Creates a new `Node` instance from the domain name.
+    pub fn new(domain: &str) -> Result<Self, DynamicRouteProviderError> {
+        if !is_valid_domain(domain) {
+            return Err(DynamicRouteProviderError::InvalidDomainName(
+                domain.to_string(),
+            ));
+        }
+        Ok(Self {
+            domain: domain.to_string(),
+        })
+    }
+
+    /// Returns the domain name of the node.
+    pub fn domain(&self) -> String {
+        self.domain.clone()
+    }
+}
+
+impl Node {
+    /// Converts the node to a routing URL (`https://<domain>`), same as `Url::from(&node)`.
+    pub fn to_routing_url(&self) -> Url {
+        Url::from(self)
+    }
+}
+
+impl From<&Node> for Url {
+    fn from(node: &Node) -> Self {
+        // Parsing can't fail, as the domain was checked at node instantiation.
+        Url::parse(&format!("https://{}", node.domain)).expect("failed to parse URL")
+    }
+}
+
+impl TryFrom<&ApiBoundaryNode> for Node {
+    type Error = DynamicRouteProviderError;
+
+    fn try_from(value: &ApiBoundaryNode) -> Result<Self, Self::Error> {
+        Node::new(&value.domain)
+    }
+}
+
+/// Checks if the given domain is a valid URL.
+fn is_valid_domain<S: AsRef<str>>(domain: S) -> bool {
+    // Prepend a scheme, so that the domain can be parsed as a URL.
+    // NOTE(review): this only proves that `http://<domain>` parses; it does not reject inputs carrying
+    // a port, path or query - confirm whether callers rely on that before tightening the check.
+    let url_string = format!("http://{}", domain.as_ref());
+    Url::parse(&url_string).is_ok()
+}
diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/nodes_fetch.rs b/ic-agent/src/agent/http_transport/dynamic_routing/nodes_fetch.rs
new file mode 100644
index 00000000..7e01d145
--- /dev/null
+++ b/ic-agent/src/agent/http_transport/dynamic_routing/nodes_fetch.rs
@@ -0,0 +1,178 @@
+use async_trait::async_trait;
+use candid::Principal;
+use reqwest::Client;
+use std::{fmt::Debug, sync::Arc, time::Duration};
+use tokio::time::{self, sleep};
+use tokio_util::sync::CancellationToken;
+use tracing::{error, warn};
+use url::Url;
+
+use crate::agent::{
+    http_transport::{
+        dynamic_routing::{
+            dynamic_route_provider::DynamicRouteProviderError,
+            health_check::HEALTH_MANAGER_ACTOR,
+            messages::FetchedNodes,
+            node::Node,
+            snapshot::routing_snapshot::RoutingSnapshot,
+            type_aliases::{AtomicSwap, SenderWatch},
+        },
+        reqwest_transport::ReqwestTransport,
+    },
+    Agent,
+};
+
+const NODES_FETCH_ACTOR: &str = "NodesFetchActor";
+
+/// Fetcher of nodes in the topology.
+#[async_trait]
+pub trait Fetch: Sync + Send + Debug {
+    /// Fetches the nodes from the topology, using `url` as the endpoint to query.
+    async fn fetch(&self, url: Url) -> Result<Vec<Node>, DynamicRouteProviderError>;
+}
+
+/// A struct representing the fetcher of the nodes from the topology.
+#[derive(Debug)]
+pub struct NodesFetcher {
+    http_client: Client,
+    subnet_id: Principal,
+    // By default, the nodes fetcher is configured to talk to the mainnet of Internet Computer, and verifies responses using a hard-coded public key.
+    // However, for testnets one can set up a custom public key.
+    root_key: Option<Vec<u8>>,
+}
+
+impl NodesFetcher {
+    /// Creates a new `NodesFetcher` instance.
+    pub fn new(http_client: Client, subnet_id: Principal, root_key: Option<Vec<u8>>) -> Self {
+        Self {
+            http_client,
+            subnet_id,
+            root_key,
+        }
+    }
+}
+
+#[async_trait]
+impl Fetch for NodesFetcher {
+    /// Builds a one-off agent talking to `url` and fetches the API boundary nodes of the configured subnet.
+    ///
+    /// # Errors
+    /// Returns `DynamicRouteProviderError::NodesFetchError` if the transport or the agent can't be built,
+    /// or if the fetch itself fails.
+    async fn fetch(&self, url: Url) -> Result<Vec<Node>, DynamicRouteProviderError> {
+        let transport = ReqwestTransport::create_with_client(url, self.http_client.clone())
+            .map_err(|err| {
+                DynamicRouteProviderError::NodesFetchError(format!(
+                    "Failed to build transport: {err}"
+                ))
+            })?;
+        let agent = Agent::builder()
+            .with_transport(transport)
+            .build()
+            .map_err(|err| {
+                DynamicRouteProviderError::NodesFetchError(format!(
+                    "Failed to build the agent: {err}"
+                ))
+            })?;
+        if let Some(key) = self.root_key.clone() {
+            agent.set_root_key(key);
+        }
+        let api_bns = agent
+            .fetch_api_boundary_nodes_by_subnet_id(self.subnet_id)
+            .await
+            .map_err(|err| {
+                DynamicRouteProviderError::NodesFetchError(format!(
+                    "Failed to fetch API nodes: {err}"
+                ))
+            })?;
+        // If some API BNs have invalid domain names, they are discarded (and reported in the log).
+        let nodes = api_bns
+            .iter()
+            .filter_map(|api_node| match Node::try_from(api_node) {
+                Ok(node) => Some(node),
+                Err(err) => {
+                    warn!("{NODES_FETCH_ACTOR}: discarding API node with invalid domain: {err:?}");
+                    None
+                }
+            })
+            .collect();
+        Ok(nodes)
+    }
+}
+
+/// A struct representing the actor responsible for fetching existing nodes and communicating it with the listener.
+pub(super) struct NodesFetchActor<S> {
+    /// The fetcher object responsible for fetching the nodes.
+    fetcher: Arc<dyn Fetch>,
+    /// Time period between fetches.
+    period: Duration,
+    /// The interval to wait before retrying to fetch the nodes in case of failures.
+    fetch_retry_interval: Duration,
+    /// Communication channel with the listener.
+    fetch_sender: SenderWatch<FetchedNodes>,
+    /// The snapshot of the routing table.
+    routing_snapshot: AtomicSwap<S>,
+    /// The token to cancel/stop the actor.
+    token: CancellationToken,
+}
+
+impl<S> NodesFetchActor<S>
+where
+    S: RoutingSnapshot,
+{
+    /// Creates a new `NodesFetchActor` instance.
+    pub fn new(
+        fetcher: Arc<dyn Fetch>,
+        period: Duration,
+        retry_interval: Duration,
+        fetch_sender: SenderWatch<FetchedNodes>,
+        snapshot: AtomicSwap<S>,
+        token: CancellationToken,
+    ) -> Self {
+        Self {
+            fetcher,
+            period,
+            fetch_retry_interval: retry_interval,
+            fetch_sender,
+            routing_snapshot: snapshot,
+            token,
+        }
+    }
+
+    /// Runs the actor: fetches the nodes once per period and publishes them to the listener,
+    /// until the token is cancelled or the listener goes away.
+    pub async fn run(self) {
+        let mut interval = time::interval(self.period);
+        loop {
+            tokio::select! {
+                _ = interval.tick() => {
+                    // Retry until success:
+                    // - try to get a healthy node from the routing snapshot
+                    //   - if snapshot is empty, break the cycle and wait for the next fetch cycle
+                    // - using the healthy node, try to fetch nodes from topology
+                    //   - if failure, sleep (unless cancelled meanwhile) and retry
+                    // - try send fetched nodes to the listener
+                    //   - failure means the listener is gone for good, so the actor stops
+                    loop {
+                        // The snapshot guard is a temporary of this statement, so it is released before any await.
+                        let Some(node) = self.routing_snapshot.load().next() else {
+                            // No healthy nodes in the snapshot, break the cycle and wait for the next fetch cycle
+                            error!("{NODES_FETCH_ACTOR}: no nodes in the snapshot");
+                            break;
+                        };
+                        match self.fetcher.fetch((&node).into()).await {
+                            Ok(nodes) => {
+                                let msg = Some(FetchedNodes { nodes });
+                                if let Err(err) = self.fetch_sender.send(msg) {
+                                    // Sending fails only when the receiver was dropped; retrying can never succeed.
+                                    error!("{NODES_FETCH_ACTOR}: failed to send results to {HEALTH_MANAGER_ACTOR}: {err:?}");
+                                    return;
+                                }
+                                break; // message sent successfully, exit the loop
+                            }
+                            Err(err) => {
+                                error!("{NODES_FETCH_ACTOR}: failed to fetch nodes: {err:?}");
+                            }
+                        }
+                        warn!("Retrying to fetch the nodes in {:?}", self.fetch_retry_interval);
+                        // Keep the retry loop cancellable, otherwise a permanently failing fetch
+                        // would prevent the actor from ever observing the cancellation.
+                        tokio::select! {
+                            _ = sleep(self.fetch_retry_interval) => {}
+                            _ = self.token.cancelled() => {
+                                warn!("{NODES_FETCH_ACTOR}: was gracefully cancelled");
+                                return;
+                            }
+                        }
+                    }
+                }
+                _ = self.token.cancelled() => {
+                    warn!("{NODES_FETCH_ACTOR}: was gracefully cancelled");
+                    break;
+                }
+            }
+        }
+    }
+}
diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/latency_based_routing.rs b/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/latency_based_routing.rs
new file mode 100644
index
00000000..1ae10136
--- /dev/null
+++ b/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/latency_based_routing.rs
@@ -0,0 +1,362 @@
+use std::{collections::HashSet, time::Duration};
+
+use rand::Rng;
+use simple_moving_average::{SumTreeSMA, SMA};
+
+use crate::agent::http_transport::dynamic_routing::{
+    health_check::HealthCheckStatus, node::Node, snapshot::routing_snapshot::RoutingSnapshot,
+};
+
+// Some big value implying that node is unhealthy, should be much bigger than node's latency.
+const MAX_LATENCY: Duration = Duration::from_secs(500);
+
+// Number of the most recent latency samples taken into account by the moving average.
+const WINDOW_SIZE: usize = 15;
+
+// Algorithmic complexity: add sample - O(log(N)), get average - O(1).
+// Space complexity: O(N)
+type LatencyMovAvg = SumTreeSMA<Duration, u32, WINDOW_SIZE>;
+
+/// A node, which stores health check latencies in the form of moving average.
+#[derive(Clone, Debug)]
+struct WeightedNode {
+    node: Node,
+    /// Moving mean of latencies measurements.
+    latency_mov_avg: LatencyMovAvg,
+    /// Weight of the node (inverse of the average latency), used for stochastic weighted random sampling.
+    weight: f64,
+}
+
+/// Routing snapshot for latency-based routing.
+/// In this routing strategy, nodes are randomly selected based on their averaged latency of the last WINDOW_SIZE health checks.
+/// Nodes with smaller average latencies are preferred for routing.
+#[derive(Default, Debug, Clone)]
+pub struct LatencyRoutingSnapshot {
+    weighted_nodes: Vec<WeightedNode>,
+    existing_nodes: HashSet<Node>,
+}
+
+/// Implementation of the LatencyRoutingSnapshot.
+impl LatencyRoutingSnapshot {
+    /// Creates a new LatencyRoutingSnapshot.
+    pub fn new() -> Self {
+        Self {
+            weighted_nodes: vec![],
+            existing_nodes: HashSet::new(),
+        }
+    }
+}
+
+/// Helper function to sample nodes based on their weights.
+/// Here weight index is selected based on the input number in range [0, 1]
+///
+/// Returns `None` if `weights` is empty or `number` is outside of [0, 1] (NaN included).
+#[inline]
+fn weighted_sample(weights: &[f64], number: f64) -> Option<usize> {
+    if !(0.0..=1.0).contains(&number) {
+        return None;
+    }
+    let sum: f64 = weights.iter().sum();
+    let mut weighted_number = number * sum;
+    for (idx, weight) in weights.iter().enumerate() {
+        weighted_number -= weight;
+        if weighted_number <= 0.0 {
+            return Some(idx);
+        }
+    }
+    // Subtracting the weights one by one doesn't round the same way as summing them up, so for inputs
+    // close to 1.0 a tiny positive remainder can be left over. The last index is the right answer then.
+    weights.len().checked_sub(1)
+}
+
+impl RoutingSnapshot for LatencyRoutingSnapshot {
+    fn has_nodes(&self) -> bool {
+        !self.weighted_nodes.is_empty()
+    }
+
+    fn next(&self) -> Option<Node> {
+        // We select a node based on its weight, using a stochastic weighted random sampling approach.
+        let weights = self
+            .weighted_nodes
+            .iter()
+            .map(|n| n.weight)
+            .collect::<Vec<_>>();
+        // Generate a random float in the range [0, 1)
+        let mut rng = rand::thread_rng();
+        let rand_num = rng.gen::<f64>();
+        // Using this random float and an array of weights we get an index of the node.
+        let idx = weighted_sample(weights.as_slice(), rand_num);
+        idx.map(|idx| self.weighted_nodes[idx].node.clone())
+    }
+
+    fn sync_nodes(&mut self, nodes: &[Node]) -> bool {
+        let new_nodes = HashSet::from_iter(nodes.iter().cloned());
+        // Find nodes removed from topology.
+        let nodes_removed: Vec<_> = self
+            .existing_nodes
+            .difference(&new_nodes)
+            .cloned()
+            .collect();
+        let has_removed_nodes = !nodes_removed.is_empty();
+        // Find nodes added to topology.
+        let nodes_added: Vec<_> = new_nodes
+            .difference(&self.existing_nodes)
+            .cloned()
+            .collect();
+        let has_added_nodes = !nodes_added.is_empty();
+        self.existing_nodes.extend(nodes_added);
+        // NOTE: newly added nodes will appear in the weighted_nodes later.
+        // This happens after the first node health check round and a consequent update_node() invocation.
+        for node in nodes_removed.into_iter() {
+            self.existing_nodes.remove(&node);
+            // A removed node may not have been health-checked yet, so it can be absent from weighted_nodes.
+            if let Some(idx) = self.weighted_nodes.iter().position(|x| x.node == node) {
+                self.weighted_nodes.swap_remove(idx);
+            }
+        }
+
+        has_added_nodes || has_removed_nodes
+    }
+
+    fn update_node(&mut self, node: &Node, health: HealthCheckStatus) -> bool {
+        if !self.existing_nodes.contains(node) {
+            return false;
+        }
+
+        // If latency is None (meaning Node is unhealthy), we assign some big value
+        let latency = health.latency().unwrap_or(MAX_LATENCY);
+
+        if let Some(idx) = self.weighted_nodes.iter().position(|x| &x.node == node) {
+            // Node is already in the array (it is not the first update_node() call).
+            self.weighted_nodes[idx].latency_mov_avg.add_sample(latency);
+            let latency_avg = self.weighted_nodes[idx].latency_mov_avg.get_average();
+            // As nodes with smaller average latencies are preferred for routing, we use inverted values for weights.
+            self.weighted_nodes[idx].weight = 1.0 / latency_avg.as_secs_f64();
+        } else {
+            // Node is not yet in array (first update_node() call).
+            let mut latency_mov_avg = LatencyMovAvg::from_zero(Duration::ZERO);
+            latency_mov_avg.add_sample(latency);
+            let weight = 1.0 / latency_mov_avg.get_average().as_secs_f64();
+            self.weighted_nodes.push(WeightedNode {
+                latency_mov_avg,
+                node: node.clone(),
+                weight,
+            })
+        }
+
+        true
+    }
+}
+
+#[cfg(test)]
+mod tests {
+    use std::{collections::HashSet, time::Duration};
+
+    use simple_moving_average::SMA;
+
+    use crate::agent::http_transport::dynamic_routing::{
+        health_check::HealthCheckStatus,
+        node::Node,
+        snapshot::{
+            latency_based_routing::{
+                weighted_sample, LatencyMovAvg, LatencyRoutingSnapshot, WeightedNode, MAX_LATENCY,
+            },
+            routing_snapshot::RoutingSnapshot,
+        },
+    };
+
+    #[test]
+    fn test_snapshot_init() {
+        // Arrange
+        let snapshot = LatencyRoutingSnapshot::new();
+        // Assert
+        assert!(snapshot.weighted_nodes.is_empty());
+        assert!(snapshot.existing_nodes.is_empty());
+        assert!(!snapshot.has_nodes());
+        assert!(snapshot.next().is_none());
+    }
+
+    #[test]
+    fn test_update_for_non_existing_node_fails() {
+        // Arrange
+        let mut snapshot = LatencyRoutingSnapshot::new();
+        let node = Node::new("api1.com").unwrap();
+        let health = HealthCheckStatus::new(Some(Duration::from_secs(1)));
+        // Act
+        let is_updated = snapshot.update_node(&node, health);
+        // Assert
+        assert!(!is_updated);
+        assert!(snapshot.weighted_nodes.is_empty());
+        assert!(!snapshot.has_nodes());
+        assert!(snapshot.next().is_none());
+    }
+
+    #[test]
+    fn test_update_for_existing_node_succeeds() {
+        // Arrange
+        let mut snapshot = LatencyRoutingSnapshot::new();
+        let node = Node::new("api1.com").unwrap();
+        let health = HealthCheckStatus::new(Some(Duration::from_secs(1)));
+        snapshot.existing_nodes.insert(node.clone());
+        // Check first update
+        let is_updated = snapshot.update_node(&node, health);
+        assert!(is_updated);
+        assert!(snapshot.has_nodes());
+        let weighted_node = snapshot.weighted_nodes.first().unwrap();
+        assert_eq!(
+            weighted_node.latency_mov_avg.get_average(),
+            Duration::from_secs(1)
+        );
+        assert_eq!(weighted_node.weight, 1.0);
+        assert_eq!(snapshot.next().unwrap(), node);
+        // Check second update
+        let health = HealthCheckStatus::new(Some(Duration::from_secs(2)));
+        let is_updated = snapshot.update_node(&node, health);
+        assert!(is_updated);
+        let weighted_node = snapshot.weighted_nodes.first().unwrap();
+        assert_eq!(
+            weighted_node.latency_mov_avg.get_average(),
+            Duration::from_millis(1500)
+        );
+        assert_eq!(weighted_node.weight, 1.0 / 1.5);
+        // Check third update
+        let health = HealthCheckStatus::new(Some(Duration::from_secs(3)));
+        let is_updated = snapshot.update_node(&node, health);
+        assert!(is_updated);
+        let weighted_node = snapshot.weighted_nodes.first().unwrap();
+        assert_eq!(
+            weighted_node.latency_mov_avg.get_average(),
+            Duration::from_millis(2000)
+        );
+        assert_eq!(weighted_node.weight, 0.5);
+        // Check fourth update with none
+        let health = HealthCheckStatus::new(None);
+        let is_updated = snapshot.update_node(&node, health);
+        assert!(is_updated);
+        let weighted_node = snapshot.weighted_nodes.first().unwrap();
+        let avg_latency = Duration::from_secs_f64((MAX_LATENCY.as_secs() as f64 + 6.0) / 4.0);
+        assert_eq!(weighted_node.latency_mov_avg.get_average(), avg_latency);
+        assert_eq!(weighted_node.weight, 1.0 / avg_latency.as_secs_f64());
+        assert_eq!(snapshot.weighted_nodes.len(), 1);
+        assert_eq!(snapshot.existing_nodes.len(), 1);
+        assert_eq!(snapshot.next().unwrap(), node);
+    }
+
+    #[test]
+    fn test_sync_node_scenarios() {
+        // Arrange
+        let mut snapshot = LatencyRoutingSnapshot::new();
+        let node_1 = Node::new("api1.com").unwrap();
+        // Sync with node_1
+        let nodes_changed = snapshot.sync_nodes(&[node_1.clone()]);
+        assert!(nodes_changed);
+        assert!(snapshot.weighted_nodes.is_empty());
+        assert_eq!(
+            snapshot.existing_nodes,
+            HashSet::from_iter(vec![node_1.clone()])
+        );
+        // Add node_1 to weighted_nodes manually
+        snapshot.weighted_nodes.push(WeightedNode {
+            node: node_1.clone(),
+            latency_mov_avg: LatencyMovAvg::from_zero(Duration::ZERO),
+            weight: 0.0,
+        });
+        // Sync with node_1 again
+        let nodes_changed = snapshot.sync_nodes(&[node_1.clone()]);
+        assert!(!nodes_changed);
+        assert_eq!(
+            snapshot.existing_nodes,
+            HashSet::from_iter(vec![node_1.clone()])
+        );
+        assert_eq!(snapshot.weighted_nodes[0].node, node_1);
+        // Sync with node_2
+        let node_2 = Node::new("api2.com").unwrap();
+        let nodes_changed = snapshot.sync_nodes(&[node_2.clone()]);
+        assert!(nodes_changed);
+        assert_eq!(
+            snapshot.existing_nodes,
+            HashSet::from_iter(vec![node_2.clone()])
+        );
+        // Make sure node_1 was removed from weighted_nodes too
+        assert!(snapshot.weighted_nodes.is_empty());
+        // Add node_2 to weighted_nodes manually
+        snapshot.weighted_nodes.push(WeightedNode {
+            node: node_2.clone(),
+            latency_mov_avg: LatencyMovAvg::from_zero(Duration::ZERO),
+            weight: 0.0,
+        });
+        // Sync with [node_2, node_3]
+        let node_3 = Node::new("api3.com").unwrap();
+        let nodes_changed = snapshot.sync_nodes(&[node_3.clone(), node_2.clone()]);
+        assert!(nodes_changed);
+        assert_eq!(
+            snapshot.existing_nodes,
+            HashSet::from_iter(vec![node_3.clone(), node_2.clone()])
+        );
+        assert_eq!(snapshot.weighted_nodes[0].node, node_2);
+        // Add node_3 to weighted_nodes manually
+        snapshot.weighted_nodes.push(WeightedNode {
+            node: node_3,
+            latency_mov_avg: LatencyMovAvg::from_zero(Duration::ZERO),
+            weight: 0.0,
+        });
+        // Sync with []
+        let nodes_changed = snapshot.sync_nodes(&[]);
+        assert!(nodes_changed);
+        assert!(snapshot.existing_nodes.is_empty());
+        // Make sure all nodes were removed from the weighted_nodes
+        assert!(snapshot.weighted_nodes.is_empty());
+        // Sync with [] again
+        let nodes_changed = snapshot.sync_nodes(&[]);
+        assert!(!nodes_changed);
+        assert!(snapshot.existing_nodes.is_empty());
+    }
+
+    #[test]
+    fn test_weighted_sample() {
+        // Case 1: empty array
+        let arr: &[f64] = &[];
+        let idx = weighted_sample(arr, 0.5);
+        assert_eq!(idx, None);
+        // Case 2: single element in array
+        let arr: &[f64] = &[1.0];
+        let idx = weighted_sample(arr, 0.0);
+        assert_eq!(idx, Some(0));
+        let idx = weighted_sample(arr, 1.0);
+        assert_eq!(idx, Some(0));
+        // check bounds
+        let idx = weighted_sample(arr, -1.0);
+        assert_eq!(idx, None);
+        let idx = weighted_sample(arr, 1.1);
+        assert_eq!(idx, None);
+        // Case 3: two elements in array (second element has twice the weight of the first)
+        let arr: &[f64] = &[1.0, 2.0]; // prefixed_sum = [1.0, 3.0]
+        let idx = weighted_sample(arr, 0.0); // 0.0 * 3.0 < 1.0
+        assert_eq!(idx, Some(0));
+        let idx = weighted_sample(arr, 0.33); // 0.33 * 3.0 < 1.0
+        assert_eq!(idx, Some(0)); // selection probability ~0.33
+        let idx = weighted_sample(arr, 0.35); // 0.35 * 3.0 > 1.0
+        assert_eq!(idx, Some(1)); // selection probability ~0.66
+        let idx = weighted_sample(arr, 1.0); // 1.0 * 3.0 > 1.0
+        assert_eq!(idx, Some(1));
+        // check bounds
+        let idx = weighted_sample(arr, -1.0);
+        assert_eq!(idx, None);
+        let idx = weighted_sample(arr, 1.1);
+        assert_eq!(idx, None);
+        // Case 4: four elements in array
+        let arr: &[f64] = &[1.0, 2.0, 1.5, 2.5]; // prefixed_sum = [1.0, 3.0, 4.5, 7.0]
+        let idx = weighted_sample(arr, 0.14); // 0.14 * 7 < 1.0
+        assert_eq!(idx, Some(0)); // probability ~0.14
+        let idx = weighted_sample(arr, 0.15); // 0.15 * 7 > 1.0
+        assert_eq!(idx, Some(1));
+        let idx = weighted_sample(arr, 0.42); // 0.42 * 7 < 3.0
+        assert_eq!(idx, Some(1)); // probability ~0.28
+        let idx = weighted_sample(arr, 0.43); // 0.43 * 7 > 3.0
+        assert_eq!(idx, Some(2));
+        let idx = weighted_sample(arr, 0.64); // 0.64 * 7 < 4.5
+        assert_eq!(idx, Some(2)); // probability ~0.22
+        let idx = weighted_sample(arr, 0.65); // 0.65 * 7 > 4.5
+        assert_eq!(idx, Some(3));
+        let idx = weighted_sample(arr, 0.99);
+        assert_eq!(idx, Some(3)); // probability ~0.35
+        // check bounds
+        let idx = weighted_sample(arr, -1.0);
+        assert_eq!(idx, None);
+        let idx = weighted_sample(arr, 1.1);
+        assert_eq!(idx, None);
+        // Case 5: weights, whose sum is not exactly representable, must still yield an index
+        let arr: &[f64] = &[0.1, 0.2, 0.3];
+        let idx = weighted_sample(arr, 1.0);
+        assert_eq!(idx, Some(2));
+    }
+}
diff --git
a/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/mod.rs b/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/mod.rs new file mode 100644 index 00000000..73df1537 --- /dev/null +++ b/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/mod.rs @@ -0,0 +1,6 @@ +/// Snapshot of the routing table. +pub mod latency_based_routing; +/// Node implementation. +pub mod round_robin_routing; +/// Routing snapshot implementation. +pub mod routing_snapshot; diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/round_robin_routing.rs b/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/round_robin_routing.rs new file mode 100644 index 00000000..149e49d2 --- /dev/null +++ b/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/round_robin_routing.rs @@ -0,0 +1,220 @@ +use std::{ + collections::HashSet, + sync::{ + atomic::{AtomicUsize, Ordering}, + Arc, + }, +}; + +use crate::agent::http_transport::dynamic_routing::{ + health_check::HealthCheckStatus, node::Node, snapshot::routing_snapshot::RoutingSnapshot, +}; + +/// Routing snapshot, which samples nodes in a round-robin fashion. +#[derive(Default, Debug, Clone)] +pub struct RoundRobinRoutingSnapshot { + current_idx: Arc, + healthy_nodes: HashSet, + existing_nodes: HashSet, +} + +impl RoundRobinRoutingSnapshot { + /// Creates a new instance of `RoundRobinRoutingSnapshot`. 
+ pub fn new() -> Self { + Self { + current_idx: Arc::new(AtomicUsize::new(0)), + healthy_nodes: HashSet::new(), + existing_nodes: HashSet::new(), + } + } +} + +impl RoutingSnapshot for RoundRobinRoutingSnapshot { + fn has_nodes(&self) -> bool { + !self.healthy_nodes.is_empty() + } + + fn next(&self) -> Option { + if self.healthy_nodes.is_empty() { + return None; + } + let prev_idx = self.current_idx.fetch_add(1, Ordering::Relaxed); + self.healthy_nodes + .iter() + .nth(prev_idx % self.healthy_nodes.len()) + .cloned() + } + + fn sync_nodes(&mut self, nodes: &[Node]) -> bool { + let new_nodes = HashSet::from_iter(nodes.iter().cloned()); + // Find nodes removed from topology. + let nodes_removed: Vec<_> = self + .existing_nodes + .difference(&new_nodes) + .cloned() + .collect(); + let has_removed_nodes = !nodes_removed.is_empty(); + // Find nodes added to topology. + let nodes_added: Vec<_> = new_nodes + .difference(&self.existing_nodes) + .cloned() + .collect(); + let has_added_nodes = !nodes_added.is_empty(); + // NOTE: newly added nodes will appear in the healthy_nodes later. + // This happens after the first node health check round and a consequent update_node() invocation. 
+ self.existing_nodes.extend(nodes_added); + nodes_removed.iter().for_each(|node| { + self.existing_nodes.remove(node); + self.healthy_nodes.remove(node); + }); + + has_added_nodes || has_removed_nodes + } + + fn update_node(&mut self, node: &Node, health: HealthCheckStatus) -> bool { + if !self.existing_nodes.contains(node) { + return false; + } + if health.is_healthy() { + self.healthy_nodes.insert(node.clone()) + } else { + self.healthy_nodes.remove(node) + } + } +} + +#[cfg(test)] +mod tests { + use std::time::Duration; + use std::{collections::HashSet, sync::atomic::Ordering}; + + use crate::agent::http_transport::dynamic_routing::{ + health_check::HealthCheckStatus, + node::Node, + snapshot::{ + round_robin_routing::RoundRobinRoutingSnapshot, routing_snapshot::RoutingSnapshot, + }, + }; + + #[test] + fn test_snapshot_init() { + // Arrange + let snapshot = RoundRobinRoutingSnapshot::new(); + // Assert + assert!(snapshot.healthy_nodes.is_empty()); + assert!(snapshot.existing_nodes.is_empty()); + assert!(!snapshot.has_nodes()); + assert_eq!(snapshot.current_idx.load(Ordering::SeqCst), 0); + assert!(snapshot.next().is_none()); + } + + #[test] + fn test_update_of_non_existing_node_always_returns_false() { + // Arrange + let mut snapshot = RoundRobinRoutingSnapshot::new(); + // This node is not present in existing_nodes + let node = Node::new("api1.com").unwrap(); + let healthy = HealthCheckStatus::new(Some(Duration::from_secs(1))); + let unhealthy = HealthCheckStatus::new(None); + // Act 1 + let is_updated = snapshot.update_node(&node, healthy); + // Assert + assert!(!is_updated); + assert!(snapshot.existing_nodes.is_empty()); + assert!(snapshot.next().is_none()); + // Act 2 + let is_updated = snapshot.update_node(&node, unhealthy); + // Assert + assert!(!is_updated); + assert!(snapshot.existing_nodes.is_empty()); + assert!(snapshot.next().is_none()); + } + + #[test] + fn test_update_of_existing_unhealthy_node_with_healthy_node_returns_true() { + // Arrange + let 
mut snapshot = RoundRobinRoutingSnapshot::new(); + let node = Node::new("api1.com").unwrap(); + // node is present in existing_nodes, but not in healthy_nodes + snapshot.existing_nodes.insert(node.clone()); + let health = HealthCheckStatus::new(Some(Duration::from_secs(1))); + // Act + let is_updated = snapshot.update_node(&node, health); + assert!(is_updated); + assert!(snapshot.has_nodes()); + assert_eq!(snapshot.next().unwrap(), node); + assert_eq!(snapshot.current_idx.load(Ordering::SeqCst), 1); + } + + #[test] + fn test_update_of_existing_healthy_node_with_unhealthy_node_returns_true() { + // Arrange + let mut snapshot = RoundRobinRoutingSnapshot::new(); + let node = Node::new("api1.com").unwrap(); + snapshot.existing_nodes.insert(node.clone()); + snapshot.healthy_nodes.insert(node.clone()); + let unhealthy = HealthCheckStatus::new(None); + // Act + let is_updated = snapshot.update_node(&node, unhealthy); + assert!(is_updated); + assert!(!snapshot.has_nodes()); + assert!(snapshot.next().is_none()); + } + + #[test] + fn test_sync_node_scenarios() { + // Arrange + let mut snapshot = RoundRobinRoutingSnapshot::new(); + let node_1 = Node::new("api1.com").unwrap(); + // Sync with node_1 + let nodes_changed = snapshot.sync_nodes(&[node_1.clone()]); + assert!(nodes_changed); + assert!(snapshot.healthy_nodes.is_empty()); + assert_eq!( + snapshot.existing_nodes, + HashSet::from_iter(vec![node_1.clone()]) + ); + // Add node_1 to healthy_nodes manually + snapshot.healthy_nodes.insert(node_1.clone()); + // Sync with node_1 again + let nodes_changed = snapshot.sync_nodes(&[node_1.clone()]); + assert!(!nodes_changed); + assert_eq!( + snapshot.existing_nodes, + HashSet::from_iter(vec![node_1.clone()]) + ); + assert_eq!(snapshot.healthy_nodes, HashSet::from_iter(vec![node_1])); + // Sync with node_2 + let node_2 = Node::new("api2.com").unwrap(); + let nodes_changed = snapshot.sync_nodes(&[node_2.clone()]); + assert!(nodes_changed); + assert_eq!( + snapshot.existing_nodes, + 
HashSet::from_iter(vec![node_2.clone()]) + ); + // Make sure node_1 was removed from healthy nodes + assert!(snapshot.healthy_nodes.is_empty()); + // Add node_2 to healthy_nodes manually + snapshot.healthy_nodes.insert(node_2.clone()); + // Sync with [node_2, node_3] + let node_3 = Node::new("api3.com").unwrap(); + let nodes_changed = snapshot.sync_nodes(&[node_3.clone(), node_2.clone()]); + assert!(nodes_changed); + assert_eq!( + snapshot.existing_nodes, + HashSet::from_iter(vec![node_3.clone(), node_2.clone()]) + ); + assert_eq!(snapshot.healthy_nodes, HashSet::from_iter(vec![node_2])); + snapshot.healthy_nodes.insert(node_3); + // Sync with [] + let nodes_changed = snapshot.sync_nodes(&[]); + assert!(nodes_changed); + assert!(snapshot.existing_nodes.is_empty()); + // Make sure all nodes were removed from the healthy_nodes + assert!(snapshot.healthy_nodes.is_empty()); + // Sync with [] again + let nodes_changed = snapshot.sync_nodes(&[]); + assert!(!nodes_changed); + assert!(snapshot.existing_nodes.is_empty()); + } +} diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/routing_snapshot.rs b/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/routing_snapshot.rs new file mode 100644 index 00000000..155b8eac --- /dev/null +++ b/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/routing_snapshot.rs @@ -0,0 +1,15 @@ +use std::fmt::Debug; + +use crate::agent::http_transport::dynamic_routing::{health_check::HealthCheckStatus, node::Node}; + +/// A trait for interacting with the snapshot of nodes (routing table). +pub trait RoutingSnapshot: Send + Sync + Clone + Debug { + /// Returns `true` if the snapshot has nodes. + fn has_nodes(&self) -> bool; + /// Get the next node in the snapshot. + fn next(&self) -> Option; + /// Syncs the nodes in the snapshot with the provided list of nodes, returning `true` if the snapshot was updated. 
+ fn sync_nodes(&mut self, nodes: &[Node]) -> bool; + /// Updates the health status of a specific node, returning `true` if the node was found and updated. + fn update_node(&mut self, node: &Node, health: HealthCheckStatus) -> bool; +} diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/test_utils.rs b/ic-agent/src/agent/http_transport/dynamic_routing/test_utils.rs new file mode 100644 index 00000000..60004d75 --- /dev/null +++ b/ic-agent/src/agent/http_transport/dynamic_routing/test_utils.rs @@ -0,0 +1,125 @@ +use std::collections::{HashMap, HashSet}; +use std::time::Duration; +use std::{fmt::Debug, hash::Hash, sync::Arc}; + +use arc_swap::ArcSwap; +use async_trait::async_trait; +use url::Url; + +use crate::agent::http_transport::{ + dynamic_routing::{ + dynamic_route_provider::DynamicRouteProviderError, + health_check::{HealthCheck, HealthCheckStatus}, + node::Node, + nodes_fetch::Fetch, + type_aliases::AtomicSwap, + }, + route_provider::RouteProvider, +}; + +pub(super) fn route_n_times(n: usize, f: Arc) -> Vec { + (0..n) + .map(|_| f.route().unwrap().domain().unwrap().to_string()) + .collect() +} + +pub(super) fn assert_routed_domains( + actual: Vec, + expected: Vec, + expected_repetitions: usize, +) where + T: AsRef + Eq + Hash + Debug + Ord, +{ + fn build_count_map(items: &[T]) -> HashMap<&T, usize> + where + T: Eq + Hash, + { + items.iter().fold(HashMap::new(), |mut map, item| { + *map.entry(item).or_insert(0) += 1; + map + }) + } + let count_actual = build_count_map(&actual); + let count_expected = build_count_map(&expected); + + let mut keys_actual = count_actual.keys().collect::>(); + keys_actual.sort(); + let mut keys_expected = count_expected.keys().collect::>(); + keys_expected.sort(); + // Assert all routed domains are present. + assert_eq!(keys_actual, keys_expected); + + // Assert the expected repetition count of each routed domain. 
+ let actual_repetitions = count_actual.values().collect::>(); + assert!(actual_repetitions + .iter() + .all(|&x| x == &expected_repetitions)); +} + +#[derive(Debug)] +pub(super) struct NodesFetcherMock { + // A set of nodes, existing in the topology. + pub nodes: AtomicSwap>, +} + +#[async_trait] +impl Fetch for NodesFetcherMock { + async fn fetch(&self, _url: Url) -> Result, DynamicRouteProviderError> { + let nodes = (*self.nodes.load_full()).clone(); + Ok(nodes) + } +} + +impl Default for NodesFetcherMock { + fn default() -> Self { + Self::new() + } +} + +impl NodesFetcherMock { + pub fn new() -> Self { + Self { + nodes: Arc::new(ArcSwap::from_pointee(vec![])), + } + } + + pub fn overwrite_nodes(&self, nodes: Vec) { + self.nodes.store(Arc::new(nodes)); + } +} + +#[derive(Debug)] +pub(super) struct NodeHealthCheckerMock { + healthy_nodes: Arc>>, +} + +impl Default for NodeHealthCheckerMock { + fn default() -> Self { + Self::new() + } +} + +#[async_trait] +impl HealthCheck for NodeHealthCheckerMock { + async fn check(&self, node: &Node) -> Result { + let nodes = self.healthy_nodes.load_full(); + let latency = match nodes.contains(node) { + true => Some(Duration::from_secs(1)), + false => None, + }; + Ok(HealthCheckStatus::new(latency)) + } +} + +impl NodeHealthCheckerMock { + pub fn new() -> Self { + Self { + healthy_nodes: Arc::new(ArcSwap::from_pointee(HashSet::new())), + } + } + + pub fn overwrite_healthy_nodes(&self, healthy_nodes: Vec) { + self.healthy_nodes + .store(Arc::new(HashSet::from_iter(healthy_nodes))); + } +} diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/type_aliases.rs b/ic-agent/src/agent/http_transport/dynamic_routing/type_aliases.rs new file mode 100644 index 00000000..6be931fb --- /dev/null +++ b/ic-agent/src/agent/http_transport/dynamic_routing/type_aliases.rs @@ -0,0 +1,18 @@ +use arc_swap::ArcSwap; +use std::sync::Arc; +use tokio::sync::{mpsc, watch}; + +/// A type alias for the sender end of a watch channel. 
+pub(super) type SenderWatch = watch::Sender>; + +/// A type alias for the receiver end of a watch channel. +pub(super) type ReceiverWatch = watch::Receiver>; + +/// A type alias for the sender end of a multi-producer, single-consumer channel. +pub(super) type SenderMpsc = mpsc::Sender; + +/// A type alias for the receiver end of a multi-producer, single-consumer channel. +pub(super) type ReceiverMpsc = mpsc::Receiver; + +/// A type alias for an atomic swap operation on a shared value. +pub(super) type AtomicSwap = Arc>; diff --git a/ic-agent/src/agent/http_transport/mod.rs b/ic-agent/src/agent/http_transport/mod.rs index 8f2220d6..7ffdd622 100644 --- a/ic-agent/src/agent/http_transport/mod.rs +++ b/ic-agent/src/agent/http_transport/mod.rs @@ -30,4 +30,6 @@ const ICP0_SUB_DOMAIN: &str = ".icp0.io"; const ICP_API_SUB_DOMAIN: &str = ".icp-api.io"; #[allow(dead_code)] const LOCALHOST_SUB_DOMAIN: &str = ".localhost"; +#[cfg(all(feature = "reqwest", not(target_family = "wasm")))] +pub mod dynamic_routing; pub mod route_provider; From 609934841f1a57d328222d829ccdc2699074492f Mon Sep 17 00:00:00 2001 From: Nikolay Komarevskiy <90605504+nikolay-komarevskiy@users.noreply.github.com> Date: Mon, 19 Aug 2024 19:08:02 +0300 Subject: [PATCH 2/8] feat: add `fn n_routes()` to `RouteProvider` trait (#584) --- .../dynamic_routing/dynamic_route_provider.rs | 11 +- .../dynamic_routing/nodes_fetch.rs | 2 +- .../snapshot/latency_based_routing.rs | 134 +++++++++++++++--- .../snapshot/round_robin_routing.rs | 118 ++++++++++++++- .../snapshot/routing_snapshot.rs | 6 +- .../agent/http_transport/route_provider.rs | 87 +++++++++++- 6 files changed, 325 insertions(+), 33 deletions(-) diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/dynamic_route_provider.rs b/ic-agent/src/agent/http_transport/dynamic_routing/dynamic_route_provider.rs index cb657ae2..855fe566 100644 --- a/ic-agent/src/agent/http_transport/dynamic_routing/dynamic_route_provider.rs +++ 
b/ic-agent/src/agent/http_transport/dynamic_routing/dynamic_route_provider.rs @@ -170,11 +170,20 @@ where { fn route(&self) -> Result { let snapshot = self.routing_snapshot.load(); - let node = snapshot.next().ok_or_else(|| { + let node = snapshot.next_node().ok_or_else(|| { AgentError::RouteProviderError("No healthy API nodes found.".to_string()) })?; Ok(node.to_routing_url()) } + + fn n_ordered_routes(&self, n: usize) -> Result, AgentError> { + let snapshot = self.routing_snapshot.load(); + let nodes = snapshot.next_n_nodes(n).ok_or_else(|| { + AgentError::RouteProviderError("No healthy API nodes found.".to_string()) + })?; + let urls = nodes.iter().map(|n| n.to_routing_url()).collect(); + Ok(urls) + } } impl DynamicRouteProvider diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/nodes_fetch.rs b/ic-agent/src/agent/http_transport/dynamic_routing/nodes_fetch.rs index 7e01d145..e887e668 100644 --- a/ic-agent/src/agent/http_transport/dynamic_routing/nodes_fetch.rs +++ b/ic-agent/src/agent/http_transport/dynamic_routing/nodes_fetch.rs @@ -143,7 +143,7 @@ where // - failure should never happen, but we trace it if it does loop { let snapshot = self.routing_snapshot.load(); - if let Some(node) = snapshot.next() { + if let Some(node) = snapshot.next_node() { match self.fetcher.fetch((&node).into()).await { Ok(nodes) => { let msg = Some( diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/latency_based_routing.rs b/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/latency_based_routing.rs index 1ae10136..7de1bbc6 100644 --- a/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/latency_based_routing.rs +++ b/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/latency_based_routing.rs @@ -49,13 +49,13 @@ impl LatencyRoutingSnapshot { /// Helper function to sample nodes based on their weights. 
/// Here weight index is selected based on the input number in range [0, 1] #[inline(always)] -fn weighted_sample(weights: &[f64], number: f64) -> Option { +fn weighted_sample(weighted_nodes: &[(f64, &Node)], number: f64) -> Option { if !(0.0..=1.0).contains(&number) { return None; } - let sum: f64 = weights.iter().sum(); + let sum: f64 = weighted_nodes.iter().map(|n| n.0).sum(); let mut weighted_number = number * sum; - for (idx, weight) in weights.iter().enumerate() { + for (idx, &(weight, _)) in weighted_nodes.iter().enumerate() { weighted_number -= weight; if weighted_number <= 0.0 { return Some(idx); @@ -69,19 +69,40 @@ impl RoutingSnapshot for LatencyRoutingSnapshot { !self.weighted_nodes.is_empty() } - fn next(&self) -> Option { - // We select a node based on it's weight, using a stochastic weighted random sampling approach. - let weights = self + fn next_node(&self) -> Option { + self.next_n_nodes(1).unwrap_or_default().into_iter().next() + } + + // Uses weighted random sampling algorithm without item replacement n times. + fn next_n_nodes(&self, n: usize) -> Option> { + if n == 0 { + return Some(Vec::new()); + } + + let n = std::cmp::min(n, self.weighted_nodes.len()); + + let mut nodes = Vec::with_capacity(n); + + let mut weighted_nodes: Vec<_> = self .weighted_nodes .iter() - .map(|n| n.weight) - .collect::>(); - // Generate a random float in the range [0, 1) + .map(|n| (n.weight, &n.node)) + .collect(); + let mut rng = rand::thread_rng(); - let rand_num = rng.gen::(); - // Using this random float and an array of weights we get an index of the node. - let idx = weighted_sample(weights.as_slice(), rand_num); - idx.map(|idx| self.weighted_nodes[idx].node.clone()) + + for _ in 0..n { + // Generate a random float in the range [0, 1) + let rand_num = rng.gen::(); + if let Some(idx) = weighted_sample(weighted_nodes.as_slice(), rand_num) { + let node = weighted_nodes[idx].1; + nodes.push(node.clone()); + // Remove the item, so that it can't be selected anymore. 
+ weighted_nodes.swap_remove(idx); + } + } + + Some(nodes) } fn sync_nodes(&mut self, nodes: &[Node]) -> bool { @@ -143,7 +164,10 @@ impl RoutingSnapshot for LatencyRoutingSnapshot { #[cfg(test)] mod tests { - use std::{collections::HashSet, time::Duration}; + use std::{ + collections::{HashMap, HashSet}, + time::Duration, + }; use simple_moving_average::SMA; @@ -166,7 +190,7 @@ mod tests { assert!(snapshot.weighted_nodes.is_empty()); assert!(snapshot.existing_nodes.is_empty()); assert!(!snapshot.has_nodes()); - assert!(snapshot.next().is_none()); + assert!(snapshot.next_node().is_none()); } #[test] @@ -181,7 +205,7 @@ mod tests { assert!(!is_updated); assert!(snapshot.weighted_nodes.is_empty()); assert!(!snapshot.has_nodes()); - assert!(snapshot.next().is_none()); + assert!(snapshot.next_node().is_none()); } #[test] @@ -201,7 +225,7 @@ mod tests { Duration::from_secs(1) ); assert_eq!(weighted_node.weight, 1.0); - assert_eq!(snapshot.next().unwrap(), node); + assert_eq!(snapshot.next_node().unwrap(), node); // Check second update let health = HealthCheckStatus::new(Some(Duration::from_secs(2))); let is_updated = snapshot.update_node(&node, health); @@ -232,7 +256,7 @@ mod tests { assert_eq!(weighted_node.weight, 1.0 / avg_latency.as_secs_f64()); assert_eq!(snapshot.weighted_nodes.len(), 1); assert_eq!(snapshot.existing_nodes.len(), 1); - assert_eq!(snapshot.next().unwrap(), node); + assert_eq!(snapshot.next_node().unwrap(), node); } #[test] @@ -307,12 +331,13 @@ mod tests { #[test] fn test_weighted_sample() { + let node = &Node::new("api1.com").unwrap(); // Case 1: empty array - let arr: &[f64] = &[]; + let arr = &[]; let idx = weighted_sample(arr, 0.5); assert_eq!(idx, None); // Case 2: single element in array - let arr: &[f64] = &[1.0]; + let arr = &[(1.0, node)]; let idx = weighted_sample(arr, 0.0); assert_eq!(idx, Some(0)); let idx = weighted_sample(arr, 1.0); @@ -323,7 +348,7 @@ mod tests { let idx = weighted_sample(arr, 1.1); assert_eq!(idx, None); // Case 3: 
two elements in array (second element has twice the weight of the first) - let arr: &[f64] = &[1.0, 2.0]; // prefixed_sum = [1.0, 3.0] + let arr = &[(1.0, node), (2.0, node)]; // // prefixed_sum = [1.0, 3.0] let idx = weighted_sample(arr, 0.0); // 0.0 * 3.0 < 1.0 assert_eq!(idx, Some(0)); let idx = weighted_sample(arr, 0.33); // 0.33 * 3.0 < 1.0 @@ -338,7 +363,7 @@ mod tests { let idx = weighted_sample(arr, 1.1); assert_eq!(idx, None); // Case 4: four elements in array - let arr: &[f64] = &[1.0, 2.0, 1.5, 2.5]; // prefixed_sum = [1.0, 3.0, 4.5, 7.0] + let arr = &[(1.0, node), (2.0, node), (1.5, node), (2.5, node)]; // prefixed_sum = [1.0, 3.0, 4.5, 7.0] let idx = weighted_sample(arr, 0.14); // 0.14 * 7 < 1.0 assert_eq!(idx, Some(0)); // probability ~0.14 let idx = weighted_sample(arr, 0.15); // 0.15 * 7 > 1.0 @@ -359,4 +384,69 @@ mod tests { let idx = weighted_sample(arr, 1.1); assert_eq!(idx, None); } + + #[test] + // #[ignore] + // This test is for manual runs to see the statistics for nodes selection probability. 
+ fn test_stats_for_next_n_nodes() { + // Arrange + let mut snapshot = LatencyRoutingSnapshot::new(); + let node_1 = Node::new("api1.com").unwrap(); + let node_2 = Node::new("api2.com").unwrap(); + let node_3 = Node::new("api3.com").unwrap(); + let node_4 = Node::new("api4.com").unwrap(); + let node_5 = Node::new("api5.com").unwrap(); + let node_6 = Node::new("api6.com").unwrap(); + let latency_mov_avg = LatencyMovAvg::from_zero(Duration::ZERO); + snapshot.weighted_nodes = vec![ + WeightedNode { + node: node_2.clone(), + latency_mov_avg: latency_mov_avg.clone(), + weight: 8.0, + }, + WeightedNode { + node: node_3.clone(), + latency_mov_avg: latency_mov_avg.clone(), + weight: 4.0, + }, + WeightedNode { + node: node_1.clone(), + latency_mov_avg: latency_mov_avg.clone(), + weight: 16.0, + }, + WeightedNode { + node: node_6.clone(), + latency_mov_avg: latency_mov_avg.clone(), + weight: 2.0, + }, + WeightedNode { + node: node_5.clone(), + latency_mov_avg: latency_mov_avg.clone(), + weight: 1.0, + }, + WeightedNode { + node: node_4.clone(), + latency_mov_avg: latency_mov_avg.clone(), + weight: 4.1, + }, + ]; + + let mut stats = HashMap::new(); + let experiments = 30; + let select_nodes_count = 10; + for i in 0..experiments { + let nodes = snapshot.next_n_nodes(select_nodes_count).unwrap(); + println!("Experiment {i}: selected nodes {nodes:?}"); + for item in nodes.into_iter() { + *stats.entry(item).or_insert(1) += 1; + } + } + for (node, count) in stats { + println!( + "Node {:?} is selected with probability {}", + node.domain(), + count as f64 / experiments as f64 + ); + } + } } diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/round_robin_routing.rs b/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/round_robin_routing.rs index 149e49d2..318eb379 100644 --- a/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/round_robin_routing.rs +++ b/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/round_robin_routing.rs @@ -34,7 
+34,7 @@ impl RoutingSnapshot for RoundRobinRoutingSnapshot { !self.healthy_nodes.is_empty() } - fn next(&self) -> Option { + fn next_node(&self) -> Option { if self.healthy_nodes.is_empty() { return None; } @@ -45,6 +45,31 @@ impl RoutingSnapshot for RoundRobinRoutingSnapshot { .cloned() } + fn next_n_nodes(&self, n: usize) -> Option> { + if n == 0 { + return Some(Vec::new()); + } + + let healthy_nodes = Vec::from_iter(self.healthy_nodes.clone()); + let healthy_count = healthy_nodes.len(); + + if n >= healthy_count { + return Some(healthy_nodes.clone()); + } + + let idx = self.current_idx.fetch_add(n, Ordering::Relaxed) % healthy_count; + let mut nodes = Vec::with_capacity(n); + + if healthy_count - idx >= n { + nodes.extend_from_slice(&healthy_nodes[idx..idx + n]); + } else { + nodes.extend_from_slice(&healthy_nodes[idx..]); + nodes.extend_from_slice(&healthy_nodes[..n - nodes.len()]); + } + + Some(nodes) + } + fn sync_nodes(&mut self, nodes: &[Node]) -> bool { let new_nodes = HashSet::from_iter(nodes.iter().cloned()); // Find nodes removed from topology. 
@@ -85,6 +110,7 @@ impl RoutingSnapshot for RoundRobinRoutingSnapshot { #[cfg(test)] mod tests { + use std::collections::HashMap; use std::time::Duration; use std::{collections::HashSet, sync::atomic::Ordering}; @@ -105,7 +131,7 @@ mod tests { assert!(snapshot.existing_nodes.is_empty()); assert!(!snapshot.has_nodes()); assert_eq!(snapshot.current_idx.load(Ordering::SeqCst), 0); - assert!(snapshot.next().is_none()); + assert!(snapshot.next_node().is_none()); } #[test] @@ -121,13 +147,13 @@ mod tests { // Assert assert!(!is_updated); assert!(snapshot.existing_nodes.is_empty()); - assert!(snapshot.next().is_none()); + assert!(snapshot.next_node().is_none()); // Act 2 let is_updated = snapshot.update_node(&node, unhealthy); // Assert assert!(!is_updated); assert!(snapshot.existing_nodes.is_empty()); - assert!(snapshot.next().is_none()); + assert!(snapshot.next_node().is_none()); } #[test] @@ -142,7 +168,7 @@ mod tests { let is_updated = snapshot.update_node(&node, health); assert!(is_updated); assert!(snapshot.has_nodes()); - assert_eq!(snapshot.next().unwrap(), node); + assert_eq!(snapshot.next_node().unwrap(), node); assert_eq!(snapshot.current_idx.load(Ordering::SeqCst), 1); } @@ -158,7 +184,7 @@ mod tests { let is_updated = snapshot.update_node(&node, unhealthy); assert!(is_updated); assert!(!snapshot.has_nodes()); - assert!(snapshot.next().is_none()); + assert!(snapshot.next_node().is_none()); } #[test] @@ -217,4 +243,84 @@ mod tests { assert!(!nodes_changed); assert!(snapshot.existing_nodes.is_empty()); } + + #[test] + fn test_next_node() { + // Arrange + let mut snapshot = RoundRobinRoutingSnapshot::new(); + let node_1 = Node::new("api1.com").unwrap(); + let node_2 = Node::new("api2.com").unwrap(); + let node_3 = Node::new("api3.com").unwrap(); + let nodes = vec![node_1, node_2, node_3]; + snapshot.existing_nodes.extend(nodes.clone()); + snapshot.healthy_nodes.extend(nodes.clone()); + // Act + let n = 6; + let mut count_map = HashMap::new(); + for _ in 0..n { + 
let node = snapshot.next_node().unwrap(); + count_map.entry(node).and_modify(|v| *v += 1).or_insert(1); + } + // Assert each node was returned 2 times + let k = 2; + assert_eq!( + count_map.len(), + nodes.len(), + "The number of unique elements is not {}", + nodes.len() + ); + for (item, &count) in &count_map { + assert_eq!( + count, k, + "Element {:?} does not appear exactly {} times", + item, k + ); + } + } + + #[test] + fn test_n_nodes() { + // Arrange + let mut snapshot = RoundRobinRoutingSnapshot::new(); + let node_1 = Node::new("api1.com").unwrap(); + let node_2 = Node::new("api2.com").unwrap(); + let node_3 = Node::new("api3.com").unwrap(); + let node_4 = Node::new("api4.com").unwrap(); + let node_5 = Node::new("api5.com").unwrap(); + let nodes = vec![ + node_1.clone(), + node_2.clone(), + node_3.clone(), + node_4.clone(), + node_5.clone(), + ]; + snapshot.healthy_nodes.extend(nodes.clone()); + // First call + let mut n_nodes: Vec<_> = snapshot.next_n_nodes(3).expect("failed to get nodes"); + // Second call + n_nodes.extend(snapshot.next_n_nodes(3).expect("failed to get nodes")); + // Third call + n_nodes.extend(snapshot.next_n_nodes(4).expect("failed to get nodes")); + // Fourth call + n_nodes.extend(snapshot.next_n_nodes(5).expect("failed to get nodes")); + // Assert each node was returned 3 times + let k = 3; + let mut count_map = HashMap::new(); + for item in n_nodes.iter() { + count_map.entry(item).and_modify(|v| *v += 1).or_insert(1); + } + assert_eq!( + count_map.len(), + nodes.len(), + "The number of unique elements is not {}", + nodes.len() + ); + for (item, &count) in &count_map { + assert_eq!( + count, k, + "Element {:?} does not appear exactly {} times", + item, k + ); + } + } } diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/routing_snapshot.rs b/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/routing_snapshot.rs index 155b8eac..242abdfe 100644 --- 
a/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/routing_snapshot.rs +++ b/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/routing_snapshot.rs @@ -6,8 +6,10 @@ use crate::agent::http_transport::dynamic_routing::{health_check::HealthCheckSta pub trait RoutingSnapshot: Send + Sync + Clone + Debug { /// Returns `true` if the snapshot has nodes. fn has_nodes(&self) -> bool; - /// Get the next node in the snapshot. - fn next(&self) -> Option; + /// Get next node from the snapshot. + fn next_node(&self) -> Option; + /// Get up to n different nodes from the snapshot. + fn next_n_nodes(&self, n: usize) -> Option>; /// Syncs the nodes in the snapshot with the provided list of nodes, returning `true` if the snapshot was updated. fn sync_nodes(&mut self, nodes: &[Node]) -> bool; /// Updates the health status of a specific node, returning `true` if the node was found and updated. diff --git a/ic-agent/src/agent/http_transport/route_provider.rs b/ic-agent/src/agent/http_transport/route_provider.rs index 608b2888..d8e7a8e3 100644 --- a/ic-agent/src/agent/http_transport/route_provider.rs +++ b/ic-agent/src/agent/http_transport/route_provider.rs @@ -15,8 +15,19 @@ use crate::agent::{ /// A [`RouteProvider`] for dynamic generation of routing urls. pub trait RouteProvider: std::fmt::Debug + Send + Sync { - /// Generate next routing url + /// Generates the next routing URL based on the internal routing logic. + /// + /// This method returns a single `Url` that can be used for routing. + /// The logic behind determining the next URL can vary depending on the implementation fn route(&self) -> Result; + + /// Generates up to `n` different routing URLs in order of priority. + /// + /// This method returns a vector of `Url` instances, each representing a routing + /// endpoint. The URLs are ordered by priority, with the most preferred route + /// appearing first. The returned vector can contain fewer than `n` URLs if + /// fewer are available. 
+ fn n_ordered_routes(&self, n: usize) -> Result, AgentError>; } /// A simple implementation of the [`RouteProvider`] which produces an even distribution of the urls from the input ones. @@ -38,6 +49,28 @@ impl RouteProvider for RoundRobinRouteProvider { let prev_idx = self.current_idx.fetch_add(1, Ordering::Relaxed); Ok(self.routes[prev_idx % self.routes.len()].clone()) } + + fn n_ordered_routes(&self, n: usize) -> Result, AgentError> { + if n == 0 { + return Ok(Vec::new()); + } + + if n >= self.routes.len() { + return Ok(self.routes.clone()); + } + + let idx = self.current_idx.fetch_add(n, Ordering::Relaxed) % self.routes.len(); + let mut urls = Vec::with_capacity(n); + + if self.routes.len() - idx >= n { + urls.extend_from_slice(&self.routes[idx..idx + n]); + } else { + urls.extend_from_slice(&self.routes[idx..]); + urls.extend_from_slice(&self.routes[..n - urls.len()]); + } + + Ok(urls) + } } impl RoundRobinRouteProvider { @@ -99,4 +132,56 @@ mod tests { .collect(); assert_eq!(expected_urls, urls); } + + #[test] + fn test_n_routes() { + // Test with an empty list of urls + let provider = RoundRobinRouteProvider::new(Vec::<&str>::new()) + .expect("failed to create a route provider"); + let urls_iter = provider.n_ordered_routes(1).expect("failed to get urls"); + assert!(urls_iter.is_empty()); + // Test with non-empty list of urls + let provider = RoundRobinRouteProvider::new(vec![ + "https://url1.com", + "https://url2.com", + "https://url3.com", + "https://url4.com", + "https://url5.com", + ]) + .expect("failed to create a route provider"); + // First call + let urls: Vec<_> = provider.n_ordered_routes(3).expect("failed to get urls"); + let expected_urls: Vec = ["https://url1.com", "https://url2.com", "https://url3.com"] + .iter() + .map(|url_str| Url::parse(url_str).expect("invalid URL")) + .collect(); + assert_eq!(urls, expected_urls); + // Second call + let urls: Vec<_> = provider.n_ordered_routes(3).expect("failed to get urls"); + let expected_urls: Vec = 
["https://url4.com", "https://url5.com", "https://url1.com"] + .iter() + .map(|url_str| Url::parse(url_str).expect("invalid URL")) + .collect(); + assert_eq!(urls, expected_urls); + // Third call + let urls: Vec<_> = provider.n_ordered_routes(2).expect("failed to get urls"); + let expected_urls: Vec = ["https://url2.com", "https://url3.com"] + .iter() + .map(|url_str| Url::parse(url_str).expect("invalid URL")) + .collect(); + assert_eq!(urls, expected_urls); + // Fourth call + let urls: Vec<_> = provider.n_ordered_routes(5).expect("failed to get urls"); + let expected_urls: Vec = [ + "https://url1.com", + "https://url2.com", + "https://url3.com", + "https://url4.com", + "https://url5.com", + ] + .iter() + .map(|url_str| Url::parse(url_str).expect("invalid URL")) + .collect(); + assert_eq!(urls, expected_urls); + } } From 572e555da0006d911e3b5f96cdf00992eb23dd0d Mon Sep 17 00:00:00 2001 From: Nikolay Komarevskiy <90605504+nikolay-komarevskiy@users.noreply.github.com> Date: Tue, 27 Aug 2024 19:33:26 +0300 Subject: [PATCH 3/8] feat: add penalty for nodes unavailability in latency-based routing (#587) --- Cargo.lock | 702 ++++++++++++------ ic-agent/Cargo.toml | 1 - .../dynamic_routing/dynamic_route_provider.rs | 11 +- .../snapshot/latency_based_routing.rs | 478 ++++++++---- .../snapshot/round_robin_routing.rs | 16 +- .../snapshot/routing_snapshot.rs | 2 +- 6 files changed, 805 insertions(+), 405 deletions(-) diff --git a/Cargo.lock b/Cargo.lock index c2e4fb46..f49df689 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -44,11 +44,35 @@ version = "0.2.18" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5c6cb57a04249c6480766f7f7cef5467412af1490f8d1e243141daddada3264f" +[[package]] +name = "android-tzdata" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e999941b234f3131b00bc13c22d06e8c5ff726d1b6318ac7eb276997bbb4fef0" + +[[package]] +name = "android_system_properties" +version = "0.1.5" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "819e7219dbd41043ac279b19830f2efc897156490d7fd6ea916720117ee66311" +dependencies = [ + "libc", +] + +[[package]] +name = "ansi_term" +version = "0.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d52a9bb7ec0cf484c551830a7ce27bd20d67eac647e1befb56b0be4ee39a55d2" +dependencies = [ + "winapi", +] + [[package]] name = "anstream" -version = "0.6.14" +version = "0.6.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "418c75fa768af9c03be99d17643f93f79bbba589895012a80e3452a19ddda15b" +checksum = "64e15c1ab1f89faffbf04a634d5e1962e9074f2741eef6d97f3c4e322426d526" dependencies = [ "anstyle", "anstyle-parse", @@ -61,33 +85,33 @@ dependencies = [ [[package]] name = "anstyle" -version = "1.0.7" +version = "1.0.8" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "038dfcf04a5feb68e9c60b21c9625a54c2c0616e79b72b0fd87075a056ae1d1b" +checksum = "1bec1de6f59aedf83baf9ff929c98f2ad654b97c9510f4e70cf6f661d49fd5b1" [[package]] name = "anstyle-parse" -version = "0.2.4" +version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c03a11a9034d92058ceb6ee011ce58af4a9bf61491aa7e1e59ecd24bd40d22d4" +checksum = "eb47de1e80c2b463c735db5b217a0ddc39d612e7ac9e2e96a5aed1f57616c1cb" dependencies = [ "utf8parse", ] [[package]] name = "anstyle-query" -version = "1.1.0" +version = "1.1.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ad186efb764318d35165f1758e7dcef3b10628e26d41a44bc5550652e6804391" +checksum = "6d36fc52c7f6c869915e99412912f22093507da8d9e942ceaf66fe4b7c14422a" dependencies = [ "windows-sys 0.52.0", ] [[package]] name = "anstyle-wincon" -version = "3.0.3" +version = "3.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "61a38449feb7068f52bb06c12759005cf459ee52bb4adc1d5a7c4322d716fb19" +checksum = 
"5bf74e1b6e971609db8ca7a9ce79fd5768ab6ae46441c572e46cf596f59e57f8" dependencies = [ "anstyle", "windows-sys 0.52.0", @@ -108,6 +132,12 @@ version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7d5a26814d8dcb93b0e5a0ff3c6d80a8843bafb21b39e8e18a6f05471870e110" +[[package]] +name = "arc-swap" +version = "1.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69f7f8c3906b62b754cd5326047894316021dcfe5a194c8ea52bdd94934a3457" + [[package]] name = "arrayvec" version = "0.5.2" @@ -152,7 +182,7 @@ checksum = "6e0c28dcc82d7c8ead5cb13beb15405b57b8546e93215673ff8ca0349a028107" dependencies = [ "proc-macro2", "quote", - "syn 2.0.71", + "syn 2.0.76", ] [[package]] @@ -293,9 +323,9 @@ checksum = "1fd0f2584146f6f2ef48085050886acf353beff7305ebd1ae69500e27c67f64b" [[package]] name = "bytes" -version = "1.6.1" +version = "1.7.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a12916984aab3fa6e39d655a33e09c0071eb36d6ab3aea5c2d78551f1df6d952" +checksum = "8318a53db07bb3f8dca91a600466bdb3f2eaadeedfdbcf02e1accbad9271ba50" [[package]] name = "cached" @@ -312,9 +342,9 @@ dependencies = [ [[package]] name = "candid" -version = "0.10.9" +version = "0.10.10" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7df77a80c72fcd356cf37ff59c812f37ff06dc9a81232b3aff0a308cb5996904" +checksum = "6c30ee7f886f296b6422c0ff017e89dd4f831521dfdcc76f3f71aae1ce817222" dependencies = [ "anyhow", "binread", @@ -342,7 +372,7 @@ dependencies = [ "lazy_static", "proc-macro2", "quote", - "syn 2.0.71", + "syn 2.0.76", ] [[package]] @@ -366,9 +396,12 @@ dependencies = [ [[package]] name = "cc" -version = "1.1.5" +version = "1.1.14" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "324c74f2155653c90b04f25b2a47a8a631360cb908f92a772695f430c7e31052" +checksum = "50d2eb3cd3d1bf4529e31c215ee6f93ec5a3d536d9f578f93d9d33ee19562932" +dependencies = [ + "shlex", +] 
[[package]] name = "cfg-if" @@ -376,11 +409,23 @@ version = "1.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "baf1de4339761588bc0619e3cbc0120ee582ebb74b53b4efbf79117bd2da40fd" +[[package]] +name = "chrono" +version = "0.4.38" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a21f936df1771bf62b77f047b726c4625ff2e8aa607c01ec06e5a05bd8463401" +dependencies = [ + "android-tzdata", + "iana-time-zone", + "num-traits", + "windows-targets 0.52.6", +] + [[package]] name = "clap" -version = "4.5.9" +version = "4.5.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "64acc1846d54c1fe936a78dc189c34e28d3f5afc348403f28ecf53660b9b8462" +checksum = "ed6719fffa43d0d87e5fd8caeab59be1554fb028cd30edc88fc4369b17971019" dependencies = [ "clap_builder", "clap_derive", @@ -388,9 +433,9 @@ dependencies = [ [[package]] name = "clap_builder" -version = "4.5.9" +version = "4.5.15" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "6fb8393d67ba2e7bfaf28a23458e4e2b543cc73a99595511eb207fdb8aede942" +checksum = "216aec2b177652e3846684cbfe25c9964d18ec45234f0f5da5157b207ed1aab6" dependencies = [ "anstream", "anstyle", @@ -400,21 +445,21 @@ dependencies = [ [[package]] name = "clap_derive" -version = "4.5.8" +version = "4.5.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "2bac35c6dafb060fd4d275d9a4ffae97917c13a6327903a8be2153cd964f7085" +checksum = "501d359d5f3dcaf6ecdeee48833ae73ec6e42723a1e52419c79abf9507eec0a0" dependencies = [ "heck", "proc-macro2", "quote", - "syn 2.0.71", + "syn 2.0.76", ] [[package]] name = "clap_lex" -version = "0.7.1" +version = "0.7.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4b82cf0babdbd58558212896d1a4272303a57bdb245c2bf1147185fb45640e70" +checksum = "1462739cb27611015575c0c11df5df7601141071f07518d56fcc1be504cbec97" [[package]] name = "codespan-reporting" @@ -428,9 +473,9 @@ 
dependencies = [ [[package]] name = "colorchoice" -version = "1.0.1" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0b6a852b24ab71dffc585bcb46eaf7959d175cb865a7152e35b348d1b2960422" +checksum = "d3fd119d74b830634cea2a0f58bbd0d54540518a14397557951e79340abc28c0" [[package]] name = "colored" @@ -476,11 +521,17 @@ dependencies = [ "unicode-segmentation", ] +[[package]] +name = "core-foundation-sys" +version = "0.8.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "773648b94d0e5d620f64f280777445740e61fe701025087ec8b57f45c791888b" + [[package]] name = "cpufeatures" -version = "0.2.12" +version = "0.2.13" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "53fe5e26ff1b7aef8bca9c6080520cfb8d9333c7568e1829cef191a9723e5504" +checksum = "51e852e6dc9a5bed1fae92dd2375037bf2b768725bf3be87811edee3249d09ad" dependencies = [ "libc", ] @@ -761,7 +812,7 @@ checksum = "87750cf4b7a4c0625b1529e4c543c2182106e4dedc60a2a6455e00d212c489ac" dependencies = [ "proc-macro2", "quote", - "syn 2.0.71", + "syn 2.0.76", ] [[package]] @@ -836,35 +887,16 @@ dependencies = [ [[package]] name = "h2" -version = "0.3.26" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "81fe527a889e1532da5c525686d96d4c2e74cdd345badf8dfef9f6b39dd5f5e8" -dependencies = [ - "bytes", - "fnv", - "futures-core", - "futures-sink", - "futures-util", - "http 0.2.12", - "indexmap", - "slab", - "tokio", - "tokio-util", - "tracing", -] - -[[package]] -name = "h2" -version = "0.4.5" +version = "0.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa82e28a107a8cc405f0839610bdc9b15f1e25ec7d696aa5cf173edbcb1486ab" +checksum = "524e8ac6999421f49a846c2d4411f337e53497d8ec55d67753beffa43c5d9205" dependencies = [ "atomic-waker", "bytes", "fnv", "futures-core", "futures-sink", - "http 1.1.0", + "http", "indexmap", "slab", "tokio", @@ -915,17 +947,6 @@ dependencies = [ "digest 
0.10.7", ] -[[package]] -name = "http" -version = "0.2.12" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "601cbb57e577e2f5ef5be8e7b83f0f63994f25aa94d673e54a92d5c516d101f1" -dependencies = [ - "bytes", - "fnv", - "itoa", -] - [[package]] name = "http" version = "1.1.0" @@ -937,17 +958,6 @@ dependencies = [ "itoa", ] -[[package]] -name = "http-body" -version = "0.4.6" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "7ceab25649e9960c0311ea418d17bee82c0dcec1bd053b5f9a66e265a693bed2" -dependencies = [ - "bytes", - "http 0.2.12", - "pin-project-lite", -] - [[package]] name = "http-body" version = "1.0.1" @@ -955,7 +965,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1efedce1fb8e6913f23e0c92de8e62cd5b772a67e7b3946df930a62566c93184" dependencies = [ "bytes", - "http 1.1.0", + "http", ] [[package]] @@ -965,7 +975,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "17a08236c6f51c2ee95d840f45acf8fa9e339390e00b4ef640857b2f2a534d70" dependencies = [ "bytes", - "http-body 1.0.1", + "http-body", "http-body-util", ] @@ -977,8 +987,8 @@ checksum = "793429d76616a256bcb62c2a2ec2bed781c8307e797e2598c50010f2bee2544f" dependencies = [ "bytes", "futures-util", - "http 1.1.0", - "http-body 1.0.1", + "http", + "http-body", "pin-project-lite", ] @@ -1000,29 +1010,6 @@ version = "2.1.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "9a3a5bfb195931eeb336b2a7b4d761daec841b97f947d34394601737a7bba5e4" -[[package]] -name = "hyper" -version = "0.14.30" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a152ddd61dfaec7273fe8419ab357f33aee0d914c5f4efbf0d96fa749eea5ec9" -dependencies = [ - "bytes", - "futures-channel", - "futures-core", - "futures-util", - "h2 0.3.26", - "http 0.2.12", - "http-body 0.4.6", - "httparse", - "httpdate", - "itoa", - "pin-project-lite", - "tokio", - "tower-service", - "tracing", - "want", -] 
- [[package]] name = "hyper" version = "1.4.1" @@ -1032,10 +1019,11 @@ dependencies = [ "bytes", "futures-channel", "futures-util", - "h2 0.4.5", - "http 1.1.0", - "http-body 1.0.1", + "h2", + "http", + "http-body", "httparse", + "httpdate", "itoa", "pin-project-lite", "smallvec", @@ -1050,8 +1038,8 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5ee4be2c948921a1a5320b629c4193916ed787a7f7f293fd3f7f5a6c9de74155" dependencies = [ "futures-util", - "http 1.1.0", - "hyper 1.4.1", + "http", + "hyper", "hyper-util", "rustls", "rustls-pki-types", @@ -1063,16 +1051,16 @@ dependencies = [ [[package]] name = "hyper-util" -version = "0.1.6" +version = "0.1.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "3ab92f4f49ee4fb4f997c784b7a2e0fa70050211e0b6a287f898c3c9785ca956" +checksum = "cde7055719c54e36e95e8719f95883f22072a48ede39db7fc17a4e1d5281e9b9" dependencies = [ "bytes", "futures-channel", "futures-util", - "http 1.1.0", - "http-body 1.0.1", - "hyper 1.4.1", + "http", + "http-body", + "hyper", "pin-project-lite", "socket2", "tokio", @@ -1081,11 +1069,36 @@ dependencies = [ "tracing", ] +[[package]] +name = "iana-time-zone" +version = "0.1.60" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e7ffbb5a1b541ea2561f8c41c087286cc091e21e556a4f09a8f6cbf17b69b141" +dependencies = [ + "android_system_properties", + "core-foundation-sys", + "iana-time-zone-haiku", + "js-sys", + "wasm-bindgen", + "windows-core", +] + +[[package]] +name = "iana-time-zone-haiku" +version = "0.1.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f31827a206f56af32e590ba56d5d2d085f558508192593743f16b2306495269f" +dependencies = [ + "cc", +] + [[package]] name = "ic-agent" version = "0.37.1" dependencies = [ + "arc-swap", "async-lock", + "async-trait", "backoff", "cached", "candid", @@ -1093,11 +1106,11 @@ dependencies = [ "futures-util", "getrandom", "hex", - "http 1.1.0", - "http-body 
1.0.1", + "http", + "http-body", "http-body-to-bytes", "http-body-util", - "hyper 1.4.1", + "hyper", "hyper-rustls", "hyper-util", "ic-certification", @@ -1126,7 +1139,10 @@ dependencies = [ "thiserror", "time", "tokio", + "tokio-util", "tower", + "tracing", + "tracing-subscriber", "url", "wasm-bindgen", "wasm-bindgen-futures", @@ -1136,9 +1152,9 @@ dependencies = [ [[package]] name = "ic-certification" -version = "2.5.0" +version = "2.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "20052ce9255fbe2de7041a4f6996fddd095ba1f31ae83b6c0ccdee5be6e7bbcf" +checksum = "e64ee3d8b6e81b51f245716d3e0badb63c283c00f3c9fb5d5219afc30b5bf821" dependencies = [ "hex", "serde", @@ -1285,9 +1301,9 @@ dependencies = [ [[package]] name = "indexmap" -version = "2.2.6" +version = "2.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "168fb715dda47215e360912c096649d23d58bf392ac62f73919e831745e40f26" +checksum = "93ead53efc7ea8ed3cfb0c79fc8023fbb782a5432b52830b6518941cebe6505c" dependencies = [ "equivalent", "hashbrown", @@ -1313,9 +1329,9 @@ checksum = "8f518f335dce6725a761382244631d86cf0ccb2863413590b31338feb467f9c3" [[package]] name = "is_terminal_polyfill" -version = "1.70.0" +version = "1.70.1" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f8478577c03552c21db0e2724ffb8986a5ce7af88107e6be5d2ee6e158c12800" +checksum = "7943c866cc5cd64cbc25b2e01621d07fa8eb2a1a23160ee81ce38704e97b8ecf" [[package]] name = "itertools" @@ -1334,9 +1350,9 @@ checksum = "49f1f14873335454500d59611f1cf4a4b0f786f9ac11f4312a78e4cf2566695b" [[package]] name = "js-sys" -version = "0.3.69" +version = "0.3.70" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "29c15563dc2726973df627357ce0c9ddddbea194836909d655df6a75d2cf296d" +checksum = "1868808506b929d7b0cfa8f75951347aa71bb21144b7791bae35d9bccfcfe37a" dependencies = [ "wasm-bindgen", ] @@ -1383,7 +1399,7 @@ version = "0.20.2" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "507460a910eb7b32ee961886ff48539633b788a36b65692b95f225b844c82553" dependencies = [ - "regex-automata", + "regex-automata 0.4.7", ] [[package]] @@ -1400,9 +1416,9 @@ checksum = "884e2677b40cc8c339eaefcb701c32ef1fd2493d71118dc0ca4b6a736c93bd67" [[package]] name = "libc" -version = "0.2.155" +version = "0.2.158" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "97b3888a4aecf77e811145cadf6eef5901f4782c53886191b2f693f24761847c" +checksum = "d8adc4bb1803a324070e64a98ae98f38934d91957a99cfb3a43dcbc01bc56439" [[package]] name = "libloading" @@ -1460,7 +1476,7 @@ dependencies = [ "proc-macro2", "quote", "regex-syntax 0.6.29", - "syn 2.0.71", + "syn 2.0.76", ] [[package]] @@ -1472,6 +1488,15 @@ dependencies = [ "logos-codegen", ] +[[package]] +name = "matchers" +version = "0.0.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f099785f7595cc4b4553a174ce30dd7589ef93391ff414dbb67f62392b9e0ce1" +dependencies = [ + "regex-automata 0.1.10", +] + [[package]] name = "memchr" version = "2.7.4" @@ -1484,6 +1509,16 @@ version = "0.3.17" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "6877bb514081ee2a7ff5ef9de3281f14a4dd4bceac4c09388074a6b5df8a139a" +[[package]] +name = "minicov" +version = "0.3.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c71e683cd655513b99affab7d317deb690528255a0d5f717f1024093c12b169" +dependencies = [ + "cc", + "walkdir", +] + [[package]] name = "miniz_oxide" version = "0.7.4" @@ -1495,25 +1530,31 @@ dependencies = [ [[package]] name = "mio" -version = "0.8.11" +version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a4a650543ca06a924e8b371db273b2756685faae30f8487da1b56505a8f78b0c" +checksum = "80e04d1dcff3aae0704555fe5fee3bcfaf3d1fdf8a7e521d5b9d2b42acb52cec" dependencies = [ + "hermit-abi", "libc", "wasi", - "windows-sys 0.48.0", + 
"windows-sys 0.52.0", ] [[package]] name = "mockito" -version = "1.4.0" +version = "1.5.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "d2f6e023aa5bdf392aa06c78e4a4e6d498baab5138d0c993503350ebbc37bf1e" +checksum = "09b34bd91b9e5c5b06338d392463e1318d683cf82ec3d3af4014609be6e2108d" dependencies = [ "assert-json-diff", + "bytes", "colored", - "futures-core", - "hyper 0.14.30", + "futures-util", + "http", + "http-body", + "http-body-util", + "hyper", + "hyper-util", "log", "rand", "regex", @@ -1575,21 +1616,11 @@ dependencies = [ "autocfg", ] -[[package]] -name = "num_cpus" -version = "1.16.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4161fcb6d602d4d2081af7c3a45852d875a03dd337a6bfdd6e06407b61342a43" -dependencies = [ - "hermit-abi", - "libc", -] - [[package]] name = "object" -version = "0.36.1" +version = "0.36.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "081b846d1d56ddfc18fdf1a922e4f6e07a11768ea1b92dec44e42b72712ccfce" +checksum = "27b64972346851a39438c60b341ebc01bba47464ae329e55cf343eb93964efd9" dependencies = [ "memchr", ] @@ -1729,7 +1760,7 @@ checksum = "2f38a4412a78282e09a2cf38d195ea5420d15ba0602cb375210efbc877243965" dependencies = [ "proc-macro2", "quote", - "syn 2.0.71", + "syn 2.0.76", ] [[package]] @@ -1772,9 +1803,12 @@ checksum = "439ee305def115ba05938db6eb1644ff94165c5ab5e9420d1c1bcedbba909391" [[package]] name = "ppv-lite86" -version = "0.2.17" +version = "0.2.20" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5b40af805b3121feab8a3c29f04d8ad262fa8e0561883e7653e024ae4479e6de" +checksum = "77957b295656769bb8ad2b6a6b09d897d94f05c41b069aede1fcdaa675eaea04" +dependencies = [ + "zerocopy", +] [[package]] name = "precomputed-hash" @@ -1822,9 +1856,9 @@ dependencies = [ [[package]] name = "quinn" -version = "0.11.2" +version = "0.11.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"e4ceeeeabace7857413798eb1ffa1e9c905a9946a57d81fb69b4b71c4d8eb3ad" +checksum = "b22d8e7369034b9a7132bc2008cac12f2013c8132b45e0554e6e20e2617f2156" dependencies = [ "bytes", "pin-project-lite", @@ -1832,6 +1866,7 @@ dependencies = [ "quinn-udp", "rustc-hash", "rustls", + "socket2", "thiserror", "tokio", "tracing", @@ -1839,9 +1874,9 @@ dependencies = [ [[package]] name = "quinn-proto" -version = "0.11.3" +version = "0.11.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "ddf517c03a109db8100448a4be38d498df8a210a99fe0e1b9eaf39e78c640efe" +checksum = "ba92fb39ec7ad06ca2582c0ca834dfeadcaf06ddfc8e635c80aa7e1c05315fdd" dependencies = [ "bytes", "rand", @@ -1856,9 +1891,9 @@ dependencies = [ [[package]] name = "quinn-udp" -version = "0.5.2" +version = "0.5.4" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "9096629c45860fc7fb143e125eb826b5e721e10be3263160c7d60ca832cf8c46" +checksum = "8bffec3605b73c6f1754535084a85229fa8a30f86014e6c81aeec4abb68b0285" dependencies = [ "libc", "once_cell", @@ -1869,9 +1904,9 @@ dependencies = [ [[package]] name = "quote" -version = "1.0.36" +version = "1.0.37" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "0fa76aaf39101c457836aec0ce2316dbdc3ab723cdda1c6bd4e6ad4208acaca7" +checksum = "b5b9d34b8991d19d98081b46eacdd8eb58c6f2b201139f7c5f643cc155a633af" dependencies = [ "proc-macro2", ] @@ -1923,9 +1958,9 @@ dependencies = [ [[package]] name = "redox_users" -version = "0.4.5" +version = "0.4.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bd283d9651eeda4b2a83a43c1c91b266c40fd76ecd39a50a8c630ae69dc72891" +checksum = "ba009ff324d1fc1b900bd1fdb31564febe58a8ccc8a6fdbb93b543d33b13ca43" dependencies = [ "getrandom", "libredox", @@ -1950,16 +1985,25 @@ dependencies = [ [[package]] name = "regex" -version = "1.10.5" +version = "1.10.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = 
"b91213439dad192326a0d7c6ee3955910425f441d7038e0d6933b0aec5c4517f" +checksum = "4219d74c6b67a3654a9fbebc4b419e22126d13d2f3c4a07ee0cb61ff79a79619" dependencies = [ "aho-corasick", "memchr", - "regex-automata", + "regex-automata 0.4.7", "regex-syntax 0.8.4", ] +[[package]] +name = "regex-automata" +version = "0.1.10" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "6c230d73fb8d8c1b9c0b3135c5142a8acee3a0558fb8db5cf1cb65f8d7862132" +dependencies = [ + "regex-syntax 0.6.29", +] + [[package]] name = "regex-automata" version = "0.4.7" @@ -1985,19 +2029,19 @@ checksum = "7a66a03ae7c801facd77a29370b4faec201768915ac14a721ba36f20bc9c209b" [[package]] name = "reqwest" -version = "0.12.5" +version = "0.12.7" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c7d6d2a27d57148378eb5e111173f4276ad26340ecc5c49a4a2152167a2d6a37" +checksum = "f8f4955649ef5c38cc7f9e8aa41761d48fb9677197daea9984dc54f56aad5e63" dependencies = [ "base64", "bytes", "futures-channel", "futures-core", "futures-util", - "http 1.1.0", - "http-body 1.0.1", + "http", + "http-body", "http-body-util", - "hyper 1.4.1", + "hyper", "hyper-rustls", "hyper-util", "ipnet", @@ -2025,7 +2069,7 @@ dependencies = [ "wasm-streams", "web-sys", "webpki-roots", - "winreg", + "windows-registry", ] [[package]] @@ -2061,15 +2105,15 @@ checksum = "719b953e2095829ee67db738b3bfa9fa368c94900df327b3f07fe6e794d2fe1f" [[package]] name = "rustc-hash" -version = "1.1.0" +version = "2.0.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "08d43f7aa6b08d49f382cde6a7982047c3426db949b1424bc4b7ec9ae12c6ce2" +checksum = "583034fd73374156e66797ed8e5b0d5690409c9226b22d87cb7f19821c05d152" [[package]] name = "rustls" -version = "0.23.11" +version = "0.23.12" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4828ea528154ae444e5a642dbb7d5623354030dc9822b83fd9bb79683c7399d0" +checksum = 
"c58f8c84392efc0a126acce10fa59ff7b3d2ac06ab451a33f2741989b806b044" dependencies = [ "once_cell", "ring", @@ -2081,9 +2125,9 @@ dependencies = [ [[package]] name = "rustls-pemfile" -version = "2.1.2" +version = "2.1.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "29993a25686778eb88d4189742cd713c9bce943bc54251a33509dc63cbacf73d" +checksum = "196fe16b00e106300d3e45ecfcb764fa292a535d7326a29a5875c579c7417425" dependencies = [ "base64", "rustls-pki-types", @@ -2091,15 +2135,15 @@ dependencies = [ [[package]] name = "rustls-pki-types" -version = "1.7.0" +version = "1.8.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "976295e77ce332211c0d24d92c0e83e50f5c5f046d11082cea19f3df13a3562d" +checksum = "fc0a2ce646f8655401bb81e7927b812614bd5d91dbc968696be50603510fcaf0" [[package]] name = "rustls-webpki" -version = "0.102.5" +version = "0.102.6" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f9a6fccd794a42c2c105b513a2f62bc3fd8f3ba57a4593677ceb0bd035164d78" +checksum = "8e6b52d4fda176fd835fdc55a835d4a89b8499cad995885a21149d5ad62f852e" dependencies = [ "ring", "rustls-pki-types", @@ -2161,9 +2205,9 @@ checksum = "61697e0a1c7e512e84a621326239844a24d8207b4669b41bc18b32ea5cbf988b" [[package]] name = "serde" -version = "1.0.204" +version = "1.0.209" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "bc76f558e0cbb2a839d37354c575f1dc3fdc6546b5be373ba43d95f231bf7c12" +checksum = "99fce0ffe7310761ca6bf9faf5115afbc19688edd00171d81b1bb1b116c63e09" dependencies = [ "serde_derive", ] @@ -2189,22 +2233,23 @@ dependencies = [ [[package]] name = "serde_derive" -version = "1.0.204" +version = "1.0.209" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e0cd7e117be63d3c3678776753929474f3b04a43a080c744d6b0ae2a8c28e222" +checksum = "a5831b979fd7b5439637af1752d535ff49f4860c0f341d1baeb6faf0f4242170" dependencies = [ "proc-macro2", "quote", - "syn 2.0.71", + 
"syn 2.0.76", ] [[package]] name = "serde_json" -version = "1.0.120" +version = "1.0.127" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4e0d21c9a8cae1235ad58a00c11cb40d4b1e5c784f1ef2c537876ed6ffd8b7c5" +checksum = "8043c06d9f82bd7271361ed64f415fe5e12a77fdb52e573e7f06a516dea329ad" dependencies = [ "itoa", + "memchr", "ryu", "serde", ] @@ -2217,7 +2262,7 @@ checksum = "6c64451ba24fc7a6a2d60fc75dd9c83c90903b19028d4eff35e88fc1e86564e9" dependencies = [ "proc-macro2", "quote", - "syn 2.0.71", + "syn 2.0.76", ] [[package]] @@ -2256,6 +2301,21 @@ dependencies = [ "digest 0.10.7", ] +[[package]] +name = "sharded-slab" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f40ca3c46823713e0d4209592e8d6e826aa57e928f09752619fc696c499637f6" +dependencies = [ + "lazy_static", +] + +[[package]] +name = "shlex" +version = "1.3.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "0fda2ff0d084019ba4d7c6f371c95d8fd75ce3524c3cb8fb653a3023f6323e64" + [[package]] name = "signal-hook-registry" version = "1.4.2" @@ -2277,9 +2337,9 @@ dependencies = [ [[package]] name = "similar" -version = "2.5.0" +version = "2.6.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "fa42c91313f1d05da9b26f267f931cf178d4aba455b4c4622dd7355eb80c6640" +checksum = "1de1d4f81173b03af4c0cbed3c898f6bff5b870e4a7f5d6f4057d62a7a4b686e" [[package]] name = "simple_asn1" @@ -2342,15 +2402,15 @@ dependencies = [ [[package]] name = "stacker" -version = "0.1.15" +version = "0.1.16" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c886bd4480155fd3ef527d45e9ac8dd7118a898a46530b7b94c3e21866259fce" +checksum = "95a5daa25ea337c85ed954c0496e3bdd2c7308cc3b24cf7b50d04876654c579f" dependencies = [ "cc", "cfg-if", "libc", "psm", - "winapi", + "windows-sys 0.36.1", ] [[package]] @@ -2388,7 +2448,7 @@ dependencies = [ "proc-macro2", "quote", "rustversion", - "syn 2.0.71", + 
"syn 2.0.76", ] [[package]] @@ -2416,9 +2476,9 @@ dependencies = [ [[package]] name = "syn" -version = "2.0.71" +version = "2.0.76" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b146dcf730474b4bcd16c311627b31ede9ab149045db4d6088b3becaea046462" +checksum = "578e081a14e0cefc3279b0472138c513f37b41a08d5a3cca9b6e4e8ceb6cd525" dependencies = [ "proc-macro2", "quote", @@ -2430,6 +2490,9 @@ name = "sync_wrapper" version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "a7065abeca94b6a8a577f9bd45aa0867a2238b74e8eb67cf10d492bc39351394" +dependencies = [ + "futures-core", +] [[package]] name = "term" @@ -2468,7 +2531,17 @@ checksum = "a4558b58466b9ad7ca0f102865eccc95938dca1a74a856f2b57b6629050da261" dependencies = [ "proc-macro2", "quote", - "syn 2.0.71", + "syn 2.0.76", +] + +[[package]] +name = "thread_local" +version = "1.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b9ef9bad013ada3808854ceac7b46812a6465ba368859a37e2100283d2d719c" +dependencies = [ + "cfg-if", + "once_cell", ] [[package]] @@ -2529,32 +2602,31 @@ checksum = "1f3ccbac311fea05f86f61904b462b55fb3df8837a366dfc601a0161d0532f20" [[package]] name = "tokio" -version = "1.38.1" +version = "1.39.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "eb2caba9f80616f438e09748d5acda951967e1ea58508ef53d9c6402485a46df" +checksum = "9babc99b9923bfa4804bd74722ff02c0381021eafa4db9949217e3be8e84fff5" dependencies = [ "backtrace", "bytes", "libc", "mio", - "num_cpus", "parking_lot", "pin-project-lite", "signal-hook-registry", "socket2", "tokio-macros", - "windows-sys 0.48.0", + "windows-sys 0.52.0", ] [[package]] name = "tokio-macros" -version = "2.3.0" +version = "2.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "5f5ae998a069d4b5aba8ee9dad856af7d520c3699e6159b185c2acd48155d39a" +checksum = "693d596312e88961bc67d7f1f97af8a70227d9f90c31bba5806eec004978d752" 
dependencies = [ "proc-macro2", "quote", - "syn 2.0.71", + "syn 2.0.76", ] [[package]] @@ -2577,6 +2649,8 @@ dependencies = [ "bytes", "futures-core", "futures-sink", + "futures-util", + "hashbrown", "pin-project-lite", "tokio", ] @@ -2599,15 +2673,15 @@ dependencies = [ [[package]] name = "tower-layer" -version = "0.3.2" +version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "c20c8dbed6283a09604c3e69b4b7eeb54e298b8a600d4d5ecb5ad39de609f1d0" +checksum = "121c2a6cda46980bb0fcd1647ffaf6cd3fc79a013de288782836f6df9c48780e" [[package]] name = "tower-service" -version = "0.3.2" +version = "0.3.3" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b6bc1c9ce2b5135ac7f93c72918fc37feb872bdc6a5533a8b85eb4b86bfdae52" +checksum = "8df9b6e13f2d32c91b9bd719c00d1958837bc7dec474d94952798cc8e69eeec3" [[package]] name = "tracing" @@ -2629,7 +2703,7 @@ checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" dependencies = [ "proc-macro2", "quote", - "syn 2.0.71", + "syn 2.0.76", ] [[package]] @@ -2639,6 +2713,50 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54" dependencies = [ "once_cell", + "valuable", +] + +[[package]] +name = "tracing-log" +version = "0.1.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f751112709b4e791d8ce53e32c4ed2d353565a795ce84da2285393f41557bdf2" +dependencies = [ + "log", + "once_cell", + "tracing-core", +] + +[[package]] +name = "tracing-serde" +version = "0.1.3" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bc6b213177105856957181934e4920de57730fc69bf42c37ee5bb664d406d9e1" +dependencies = [ + "serde", + "tracing-core", +] + +[[package]] +name = "tracing-subscriber" +version = "0.2.25" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"0e0d2eaa99c3c2e41547cfa109e910a68ea03823cccad4a0525dcbc9b01e8c71" +dependencies = [ + "ansi_term", + "chrono", + "lazy_static", + "matchers", + "regex", + "serde", + "serde_json", + "sharded-slab", + "smallvec", + "thread_local", + "tracing", + "tracing-core", + "tracing-log", + "tracing-serde", ] [[package]] @@ -2694,9 +2812,9 @@ checksum = "0336d538f7abc86d282a4189614dfaa90810dfc2c6f6427eaf88e16311dd225d" [[package]] name = "unicode-xid" -version = "0.2.4" +version = "0.2.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "f962df74c8c05a667b5ee8bcf162993134c104e96440b663c8daa176dc772d8c" +checksum = "229730647fbc343e3a80e463c1db7f78f3855d3f3739bee0dda773c9a037c90a" [[package]] name = "untrusted" @@ -2721,11 +2839,17 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" +[[package]] +name = "valuable" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d" + [[package]] name = "version_check" -version = "0.9.4" +version = "0.9.5" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "49874b5167b65d7193b8aba1567f5c7d93d001cafc34600cee003eda787e483f" +checksum = "0b928f33d975fc6ad9f86c8f283853ad26bdd5b10b7f1542aa2fa15e2289105a" [[package]] name = "walkdir" @@ -2754,34 +2878,35 @@ checksum = "9c8d87e72b64a3b4db28d11ce29237c246188f4f51057d65a7eab63b7987e423" [[package]] name = "wasm-bindgen" -version = "0.2.92" +version = "0.2.93" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4be2531df63900aeb2bca0daaaddec08491ee64ceecbee5076636a3b026795a8" +checksum = "a82edfc16a6c469f5f44dc7b571814045d60404b55a0ee849f9bcfa2e63dd9b5" dependencies = [ "cfg-if", + "once_cell", "wasm-bindgen-macro", ] [[package]] name = "wasm-bindgen-backend" -version = "0.2.92" +version = "0.2.93" source 
= "registry+https://github.com/rust-lang/crates.io-index" -checksum = "614d787b966d3989fa7bb98a654e369c762374fd3213d212cfc0251257e747da" +checksum = "9de396da306523044d3302746f1208fa71d7532227f15e347e2d93e4145dd77b" dependencies = [ "bumpalo", "log", "once_cell", "proc-macro2", "quote", - "syn 2.0.71", + "syn 2.0.76", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-futures" -version = "0.4.42" +version = "0.4.43" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "76bc14366121efc8dbb487ab05bcc9d346b3b5ec0eaa76e46594cabbe51762c0" +checksum = "61e9300f63a621e96ed275155c108eb6f843b6a26d053f122ab69724559dc8ed" dependencies = [ "cfg-if", "js-sys", @@ -2791,9 +2916,9 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro" -version = "0.2.92" +version = "0.2.93" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a1f8823de937b71b9460c0c34e25f3da88250760bec0ebac694b49997550d726" +checksum = "585c4c91a46b072c92e908d99cb1dcdf95c5218eeb6f3bf1efa991ee7a68cccf" dependencies = [ "quote", "wasm-bindgen-macro-support", @@ -2801,31 +2926,32 @@ dependencies = [ [[package]] name = "wasm-bindgen-macro-support" -version = "0.2.92" +version = "0.2.93" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "e94f17b526d0a461a191c78ea52bbce64071ed5c04c9ffe424dcb38f74171bb7" +checksum = "afc340c74d9005395cf9dd098506f7f44e38f2b4a21c6aaacf9a105ea5e1e836" dependencies = [ "proc-macro2", "quote", - "syn 2.0.71", + "syn 2.0.76", "wasm-bindgen-backend", "wasm-bindgen-shared", ] [[package]] name = "wasm-bindgen-shared" -version = "0.2.92" +version = "0.2.93" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "af190c94f2773fdb3729c55b007a722abb5384da03bc0986df4c289bf5567e96" +checksum = "c62a0a307cb4a311d3a07867860911ca130c3494e8c2719593806c08bc5d0484" [[package]] name = "wasm-bindgen-test" -version = "0.3.42" +version = "0.3.43" source = 
"registry+https://github.com/rust-lang/crates.io-index" -checksum = "d9bf62a58e0780af3e852044583deee40983e5886da43a271dd772379987667b" +checksum = "68497a05fb21143a08a7d24fc81763384a3072ee43c44e86aad1744d6adef9d9" dependencies = [ "console_error_panic_hook", "js-sys", + "minicov", "scoped-tls", "wasm-bindgen", "wasm-bindgen-futures", @@ -2834,13 +2960,13 @@ dependencies = [ [[package]] name = "wasm-bindgen-test-macro" -version = "0.3.42" +version = "0.3.43" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "b7f89739351a2e03cb94beb799d47fb2cac01759b40ec441f7de39b00cbf7ef0" +checksum = "4b8220be1fa9e4c889b30fd207d4906657e7e90b12e0e6b0c8b8d8709f5de021" dependencies = [ "proc-macro2", "quote", - "syn 2.0.71", + "syn 2.0.76", ] [[package]] @@ -2858,9 +2984,9 @@ dependencies = [ [[package]] name = "web-sys" -version = "0.3.69" +version = "0.3.70" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "77afa9a11836342370f4817622a2f0f418b134426d91a82dfb48f532d2ec13ef" +checksum = "26fdeaafd9bd129f65e7c031593c24d62186301e0c72c8978fa1678be7d532c0" dependencies = [ "js-sys", "wasm-bindgen", @@ -2893,11 +3019,11 @@ checksum = "ac3b87c63620426dd9b991e5ce0329eff545bccbbb34f3be09ff6fb6ab51b7b6" [[package]] name = "winapi-util" -version = "0.1.8" +version = "0.1.9" source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "4d4cc384e1e73b93bafa6fb4f1df8c41695c8a91cf9c4c64358067d15a7b6c6b" +checksum = "cf221c93e13a30d793f7645a0e7762c55d169dbb0a49671918a2319d289b10bb" dependencies = [ - "windows-sys 0.52.0", + "windows-sys 0.59.0", ] [[package]] @@ -2906,6 +3032,58 @@ version = "0.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "712e227841d057c1ee1cd2fb22fa7e5a5461ae8e48fa2ca79ec42cfc1931183f" +[[package]] +name = "windows-core" +version = "0.52.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = 
"33ab640c8d7e35bf8ba19b884ba838ceb4fba93a4e8c65a9059d08afcfc683d9" +dependencies = [ + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-registry" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e400001bb720a623c1c69032f8e3e4cf09984deec740f007dd2b03ec864804b0" +dependencies = [ + "windows-result", + "windows-strings", + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-result" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1d1043d8214f791817bab27572aaa8af63732e11bf84aa21a45a78d6c317ae0e" +dependencies = [ + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-strings" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4cd9b125c486025df0eabcb585e62173c6c9eddcec5d117d3b6e8c30e2ee4d10" +dependencies = [ + "windows-result", + "windows-targets 0.52.6", +] + +[[package]] +name = "windows-sys" +version = "0.36.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ea04155a16a59f9eab786fe12a4a450e75cdb175f9e0d80da1e17db09f55b8d2" +dependencies = [ + "windows_aarch64_msvc 0.36.1", + "windows_i686_gnu 0.36.1", + "windows_i686_msvc 0.36.1", + "windows_x86_64_gnu 0.36.1", + "windows_x86_64_msvc 0.36.1", +] + [[package]] name = "windows-sys" version = "0.48.0" @@ -2924,6 +3102,15 @@ dependencies = [ "windows-targets 0.52.6", ] +[[package]] +name = "windows-sys" +version = "0.59.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1e38bc4d79ed67fd075bcc251a1c39b32a1776bbe92e5bef1f0bf1f8c531853b" +dependencies = [ + "windows-targets 0.52.6", +] + [[package]] name = "windows-targets" version = "0.48.5" @@ -2967,6 +3154,12 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "32a4622180e7a0ec044bb555404c800bc9fd9ec262ec147edd5989ccd0c02cd3" +[[package]] +name = "windows_aarch64_msvc" +version = "0.36.1" 
+source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "9bb8c3fd39ade2d67e9874ac4f3db21f0d710bee00fe7cab16949ec184eeaa47" + [[package]] name = "windows_aarch64_msvc" version = "0.48.5" @@ -2979,6 +3172,12 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "09ec2a7bb152e2252b53fa7803150007879548bc709c039df7627cabbd05d469" +[[package]] +name = "windows_i686_gnu" +version = "0.36.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "180e6ccf01daf4c426b846dfc66db1fc518f074baa793aa7d9b9aaeffad6a3b6" + [[package]] name = "windows_i686_gnu" version = "0.48.5" @@ -2997,6 +3196,12 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0eee52d38c090b3caa76c563b86c3a4bd71ef1a819287c19d586d7334ae8ed66" +[[package]] +name = "windows_i686_msvc" +version = "0.36.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e2e7917148b2812d1eeafaeb22a97e4813dfa60a3f8f78ebe204bcc88f12f024" + [[package]] name = "windows_i686_msvc" version = "0.48.5" @@ -3009,6 +3214,12 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "240948bc05c5e7c6dabba28bf89d89ffce3e303022809e73deaefe4f6ec56c66" +[[package]] +name = "windows_x86_64_gnu" +version = "0.36.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "4dcd171b8776c41b97521e5da127a2d86ad280114807d0b2ab1e462bc764d9e1" + [[package]] name = "windows_x86_64_gnu" version = "0.48.5" @@ -3033,6 +3244,12 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "24d5b23dc417412679681396f2b49f3de8c1473deb516bd34410872eff51ed0d" +[[package]] +name = "windows_x86_64_msvc" +version = "0.36.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c811ca4a8c853ef420abd8592ba53ddbbac90410fab6903b3e79972a631f7680" + [[package]] name = "windows_x86_64_msvc" 
version = "0.48.5" @@ -3045,22 +3262,13 @@ version = "0.52.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "589f6da84c646204747d1270a2a5661ea66ed1cced2631d546fdfb155959f9ec" -[[package]] -name = "winreg" -version = "0.52.0" -source = "registry+https://github.com/rust-lang/crates.io-index" -checksum = "a277a57398d4bfa075df44f501a17cfdf8542d224f0d36095a2adc7aee4ef0a5" -dependencies = [ - "cfg-if", - "windows-sys 0.48.0", -] - [[package]] name = "zerocopy" version = "0.7.35" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1b9b4fd18abc82b8136838da5d50bae7bdea537c574d8dc1a34ed098d6c166f0" dependencies = [ + "byteorder", "zerocopy-derive", ] @@ -3072,7 +3280,7 @@ checksum = "fa4f8080344d4671fb4e831a13ad1e68092748387dfc4f55e356242fae12ce3e" dependencies = [ "proc-macro2", "quote", - "syn 2.0.71", + "syn 2.0.76", ] [[package]] diff --git a/ic-agent/Cargo.toml b/ic-agent/Cargo.toml index 6be9470f..f455a94d 100644 --- a/ic-agent/Cargo.toml +++ b/ic-agent/Cargo.toml @@ -79,7 +79,6 @@ tower = { version = "0.4.13", optional = true } async-trait = "^0.1.0" tracing = "^0.1.0" arc-swap = "^1.0.0" -simple_moving_average = "^1.0.0" tracing-subscriber = "^0.2.0" tokio-util = { version = "^0.7.0", features = ["rt"] } rustls-webpki = "0.102" diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/dynamic_route_provider.rs b/ic-agent/src/agent/http_transport/dynamic_routing/dynamic_route_provider.rs index 855fe566..e05f6f4d 100644 --- a/ic-agent/src/agent/http_transport/dynamic_routing/dynamic_route_provider.rs +++ b/ic-agent/src/agent/http_transport/dynamic_routing/dynamic_route_provider.rs @@ -42,7 +42,7 @@ const MAINNET_ROOT_SUBNET_ID: &str = const FETCH_PERIOD: Duration = Duration::from_secs(5); const FETCH_RETRY_INTERVAL: Duration = Duration::from_millis(250); const TIMEOUT_AWAIT_HEALTHY_SEED: Duration = Duration::from_millis(1000); -const HEALTH_CHECK_TIMEOUT: Duration = Duration::from_secs(2); +const 
HEALTH_CHECK_TIMEOUT: Duration = Duration::from_secs(1); const HEALTH_CHECK_PERIOD: Duration = Duration::from_secs(1); const DYNAMIC_ROUTE_PROVIDER: &str = "DynamicRouteProvider"; @@ -178,9 +178,12 @@ where fn n_ordered_routes(&self, n: usize) -> Result, AgentError> { let snapshot = self.routing_snapshot.load(); - let nodes = snapshot.next_n_nodes(n).ok_or_else(|| { - AgentError::RouteProviderError("No healthy API nodes found.".to_string()) - })?; + let nodes = snapshot.next_n_nodes(n); + if nodes.is_empty() { + return Err(AgentError::RouteProviderError( + "No healthy API nodes found.".to_string(), + )); + }; let urls = nodes.iter().map(|n| n.to_routing_url()).collect(); Ok(urls) } diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/latency_based_routing.rs b/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/latency_based_routing.rs index 7de1bbc6..6b1ee0b0 100644 --- a/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/latency_based_routing.rs +++ b/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/latency_based_routing.rs @@ -1,29 +1,153 @@ -use std::{collections::HashSet, time::Duration}; +use std::{ + collections::{HashSet, VecDeque}, + time::Duration, +}; use rand::Rng; -use simple_moving_average::{SumTreeSMA, SMA}; use crate::agent::http_transport::dynamic_routing::{ health_check::HealthCheckStatus, node::Node, snapshot::routing_snapshot::RoutingSnapshot, }; -// Some big value implying that node is unhealthy, should be much bigger than node's latency. -const MAX_LATENCY: Duration = Duration::from_secs(500); - +// Determines the size of the sliding window used for storing latencies and availabilities of nodes. const WINDOW_SIZE: usize = 15; +// Determines the decay rate of the exponential decay function, which is used for generating weights over the sliding window. +const LAMBDA_DECAY: f64 = 0.3; + +/// Generates exponentially decaying weights for the sliding window. 
+fn generate_exp_decaying_weights(n: usize, lambda: f64) -> Vec { + let mut weights: Vec = Vec::with_capacity(n); + for i in 0..n { + let weight = (-lambda * i as f64).exp(); + weights.push(weight); + } + weights +} -// Algorithmic complexity: add sample - O(log(N)), get average - O(1). -// Space complexity: O(N) -type LatencyMovAvg = SumTreeSMA; - -/// A node, which stores health check latencies in the form of moving average. +// Node with meta information and metrics (latencies, availabilities). +// Routing URLs a generated based on the score field. #[derive(Clone, Debug)] -struct WeightedNode { +struct NodeWithMetrics { + // Node information. node: Node, - /// Moving mean of latencies measurements. - latency_mov_avg: LatencyMovAvg, - /// Weight of the node (invers of the average latency), used for stochastic weighted random sampling. - weight: f64, + // Size of the sliding window used for store latencies and availabilities of the node. + window_size: usize, + /// Reflects the status of the most recent health check. It should be the same as the last element in `availabilities`. + is_healthy: bool, + /// Sliding window with latency measurements. + latencies: VecDeque, + /// Sliding window with availability measurements. + availabilities: VecDeque, + /// Overall score of the node. Calculated based on latencies and availabilities arrays. This score is used in `next_n_nodes()` and `next_node()` methods. 
+ score: f64, +} + +impl NodeWithMetrics { + pub fn new(node: Node, window_size: usize) -> Self { + Self { + node, + window_size, + is_healthy: false, + latencies: VecDeque::with_capacity(window_size + 1), + availabilities: VecDeque::with_capacity(window_size + 1), + score: 0.0, + } + } + + pub fn add_latency_measurement(&mut self, latency: Option) { + self.is_healthy = latency.is_some(); + if let Some(duration) = latency { + self.latencies.push_back(duration.as_secs_f64()); + while self.latencies.len() > self.window_size { + self.latencies.pop_front(); + } + self.availabilities.push_back(true); + } else { + self.availabilities.push_back(false); + } + while self.availabilities.len() > self.window_size { + self.availabilities.pop_front(); + } + } +} + +/// Computes the score of the node based on the latencies, availabilities and window weights. +/// `window_weights_sum`` is passed for efficiency reasons, as it is pre-calculated. +fn compute_score( + window_weights: &[f64], + window_weights_sum: f64, + availabilities: &VecDeque, + latencies: &VecDeque, + use_availability_penalty: bool, +) -> f64 { + let weights_size = window_weights.len(); + let availabilities_size = availabilities.len(); + let latencies_size = latencies.len(); + + if weights_size < availabilities_size { + panic!( + "Weights array of size {weights_size} is smaller than array of availabilities of size {availabilities_size}", + ); + } else if weights_size < latencies_size { + panic!( + "Weights array of size {weights_size} is smaller than array of latencies of size {latencies_size}", + ); + } + + // Compute normalized availability score [0.0, 1.0]. + let score_a = if !use_availability_penalty { + 1.0 + } else if availabilities.is_empty() { + 0.0 + } else { + let mut score = 0.0; + + // Compute weighted score. Weights are applied in reverse order. 
+ for (idx, availability) in availabilities.iter().rev().enumerate() { + score += window_weights[idx] * (*availability as u8 as f64); + } + + // Normalize the score. + let weights_sum = if availabilities_size < weights_size { + // Use partial sum of weights, if the window is not full. + let partial_weights_sum: f64 = window_weights.iter().take(availabilities_size).sum(); + partial_weights_sum + } else { + // Use pre-calculated sum, if the window is full. + window_weights_sum + }; + + score /= weights_sum; + + score + }; + + // Compute latency score (not normalized). + let score_l = if latencies.is_empty() { + 0.0 + } else { + let mut score = 0.0; + + // Compute weighted score. Weights are applied in reverse order. Latency is inverted, so that smaller latencies have higher score. + for (idx, latency) in latencies.iter().rev().enumerate() { + score += window_weights[idx] / latency; + } + + let weights_sum = if latencies_size < weights_size { + let partial_weights_sum: f64 = window_weights.iter().take(latencies.len()).sum(); + partial_weights_sum + } else { + // Use pre-calculated sum. + window_weights_sum + }; + + score /= weights_sum; + + score + }; + + // Combine availability and latency scores via product to emphasize the importance of both metrics. + score_l * score_a } /// Routing snapshot for latency-based routing. @@ -31,19 +155,44 @@ struct WeightedNode { /// Nodes with smaller average latencies are preferred for routing. #[derive(Default, Debug, Clone)] pub struct LatencyRoutingSnapshot { - weighted_nodes: Vec, + nodes_with_metrics: Vec, existing_nodes: HashSet, + window_weights: Vec, + window_weights_sum: f64, + use_availability_penalty: bool, } /// Implementation of the LatencyRoutingSnapshot. impl LatencyRoutingSnapshot { /// Creates a new LatencyRoutingSnapshot. pub fn new() -> Self { + // Weights are ordered from left to right, where the leftmost weight is for the most recent health check. 
+ let window_weights = generate_exp_decaying_weights(WINDOW_SIZE, LAMBDA_DECAY); + // Pre-calculate the sum of weights for efficiency reasons. + let window_weights_sum: f64 = window_weights.iter().sum(); + Self { - weighted_nodes: vec![], + nodes_with_metrics: vec![], existing_nodes: HashSet::new(), + use_availability_penalty: true, + window_weights, + window_weights_sum, } } + + /// Sets whether to use availability penalty in the score computation. + pub fn set_availability_penalty(mut self, use_penalty: bool) -> Self { + self.use_availability_penalty = use_penalty; + self + } + + /// Sets the weights for the sliding window. + /// The weights are ordered from left to right, where the leftmost weight is for the most recent health check. + pub fn set_window_weights(mut self, weights: &[f64]) -> Self { + self.window_weights_sum = weights.iter().sum(); + self.window_weights = weights.to_vec(); + self + } } /// Helper function to sample nodes based on their weights. @@ -66,43 +215,46 @@ fn weighted_sample(weighted_nodes: &[(f64, &Node)], number: f64) -> Option bool { - !self.weighted_nodes.is_empty() + self.nodes_with_metrics.iter().any(|n| n.is_healthy) } fn next_node(&self) -> Option { - self.next_n_nodes(1).unwrap_or_default().into_iter().next() + self.next_n_nodes(1).into_iter().next() } - // Uses weighted random sampling algorithm without item replacement n times. - fn next_n_nodes(&self, n: usize) -> Option> { + // Uses weighted random sampling algorithm n times. Node can be selected at most once (sampling without replacement). + fn next_n_nodes(&self, n: usize) -> Vec { if n == 0 { - return Some(Vec::new()); + return Vec::new(); } - let n = std::cmp::min(n, self.weighted_nodes.len()); + // Preallocate array for a better efficiency. 
+ let mut healthy_nodes = Vec::with_capacity(self.nodes_with_metrics.len()); + for n in &self.nodes_with_metrics { + if n.is_healthy { + healthy_nodes.push((n.score, &n.node)); + } + } - let mut nodes = Vec::with_capacity(n); + // Limit the number of returned nodes to the number of healthy nodes. + let n = std::cmp::min(n, healthy_nodes.len()); - let mut weighted_nodes: Vec<_> = self - .weighted_nodes - .iter() - .map(|n| (n.weight, &n.node)) - .collect(); + let mut nodes = Vec::with_capacity(n); let mut rng = rand::thread_rng(); for _ in 0..n { // Generate a random float in the range [0, 1) let rand_num = rng.gen::(); - if let Some(idx) = weighted_sample(weighted_nodes.as_slice(), rand_num) { - let node = weighted_nodes[idx].1; + if let Some(idx) = weighted_sample(healthy_nodes.as_slice(), rand_num) { + let node = healthy_nodes[idx].1; nodes.push(node.clone()); // Remove the item, so that it can't be selected anymore. - weighted_nodes.swap_remove(idx); + healthy_nodes.swap_remove(idx); } } - Some(nodes) + nodes } fn sync_nodes(&mut self, nodes: &[Node]) -> bool { @@ -125,38 +277,38 @@ impl RoutingSnapshot for LatencyRoutingSnapshot { // This happens after the first node health check round and a consequent update_node() invocation. for node in nodes_removed.into_iter() { self.existing_nodes.remove(&node); - let idx = self.weighted_nodes.iter().position(|x| x.node == node); - idx.map(|idx| self.weighted_nodes.swap_remove(idx)); + let idx = self.nodes_with_metrics.iter().position(|x| x.node == node); + idx.map(|idx| self.nodes_with_metrics.swap_remove(idx)); } has_added_nodes || has_removed_nodes } fn update_node(&mut self, node: &Node, health: HealthCheckStatus) -> bool { + // Skip the update if the node is not in the existing nodes. 
if !self.existing_nodes.contains(node) { return false; } - // If latency is None (meaning Node is unhealthy), we assign some big value - let latency = health.latency().unwrap_or(MAX_LATENCY); - - if let Some(idx) = self.weighted_nodes.iter().position(|x| &x.node == node) { - // Node is already in the array (it is not the first update_node() call). - self.weighted_nodes[idx].latency_mov_avg.add_sample(latency); - let latency_avg = self.weighted_nodes[idx].latency_mov_avg.get_average(); - // As nodes with smaller average latencies are preferred for routing, we use inverted values for weights. - self.weighted_nodes[idx].weight = 1.0 / latency_avg.as_secs_f64(); - } else { - // Node is not yet in array (first update_node() call). - let mut latency_mov_avg = LatencyMovAvg::from_zero(Duration::ZERO); - latency_mov_avg.add_sample(latency); - let weight = 1.0 / latency_mov_avg.get_average().as_secs_f64(); - self.weighted_nodes.push(WeightedNode { - latency_mov_avg, - node: node.clone(), - weight, - }) - } + let idx = self + .nodes_with_metrics + .iter() + .position(|x| &x.node == node) + .unwrap_or_else(|| { + let node = NodeWithMetrics::new(node.clone(), self.window_weights.len()); + self.nodes_with_metrics.push(node); + self.nodes_with_metrics.len() - 1 + }); + + self.nodes_with_metrics[idx].add_latency_measurement(health.latency()); + + self.nodes_with_metrics[idx].score = compute_score( + self.window_weights.as_slice(), + self.window_weights_sum, + &self.nodes_with_metrics[idx].availabilities, + &self.nodes_with_metrics[idx].latencies, + self.use_availability_penalty, + ); true } @@ -165,18 +317,16 @@ impl RoutingSnapshot for LatencyRoutingSnapshot { #[cfg(test)] mod tests { use std::{ - collections::{HashMap, HashSet}, + collections::{HashMap, HashSet, VecDeque}, time::Duration, }; - use simple_moving_average::SMA; - use crate::agent::http_transport::dynamic_routing::{ health_check::HealthCheckStatus, node::Node, snapshot::{ latency_based_routing::{ - weighted_sample, 
LatencyMovAvg, LatencyRoutingSnapshot, WeightedNode, MAX_LATENCY, + compute_score, weighted_sample, LatencyRoutingSnapshot, NodeWithMetrics, }, routing_snapshot::RoutingSnapshot, }, @@ -187,10 +337,11 @@ mod tests { // Arrange let snapshot = LatencyRoutingSnapshot::new(); // Assert - assert!(snapshot.weighted_nodes.is_empty()); + assert!(snapshot.nodes_with_metrics.is_empty()); assert!(snapshot.existing_nodes.is_empty()); assert!(!snapshot.has_nodes()); assert!(snapshot.next_node().is_none()); + assert!(snapshot.next_n_nodes(1).is_empty()); } #[test] @@ -203,7 +354,7 @@ mod tests { let is_updated = snapshot.update_node(&node, health); // Assert assert!(!is_updated); - assert!(snapshot.weighted_nodes.is_empty()); + assert!(snapshot.nodes_with_metrics.is_empty()); assert!(!snapshot.has_nodes()); assert!(snapshot.next_node().is_none()); } @@ -211,7 +362,9 @@ mod tests { #[test] fn test_update_for_existing_node_succeeds() { // Arrange - let mut snapshot = LatencyRoutingSnapshot::new(); + let mut snapshot = LatencyRoutingSnapshot::new() + .set_window_weights(&[2.0, 1.0]) + .set_availability_penalty(false); let node = Node::new("api1.com").unwrap(); let health = HealthCheckStatus::new(Some(Duration::from_secs(1))); snapshot.existing_nodes.insert(node.clone()); @@ -219,65 +372,51 @@ mod tests { let is_updated = snapshot.update_node(&node, health); assert!(is_updated); assert!(snapshot.has_nodes()); - let weighted_node = snapshot.weighted_nodes.first().unwrap(); - assert_eq!( - weighted_node.latency_mov_avg.get_average(), - Duration::from_secs(1) - ); - assert_eq!(weighted_node.weight, 1.0); + let node_with_metrics = snapshot.nodes_with_metrics.first().unwrap(); + assert_eq!(node_with_metrics.score, (2.0 / 1.0) / 2.0); assert_eq!(snapshot.next_node().unwrap(), node); // Check second update let health = HealthCheckStatus::new(Some(Duration::from_secs(2))); let is_updated = snapshot.update_node(&node, health); assert!(is_updated); - let weighted_node = 
snapshot.weighted_nodes.first().unwrap(); - assert_eq!( - weighted_node.latency_mov_avg.get_average(), - Duration::from_millis(1500) - ); - assert_eq!(weighted_node.weight, 1.0 / 1.5); + let node_with_metrics = snapshot.nodes_with_metrics.first().unwrap(); + assert_eq!(node_with_metrics.score, (2.0 / 2.0 + 1.0 / 1.0) / 3.0); // Check third update let health = HealthCheckStatus::new(Some(Duration::from_secs(3))); let is_updated = snapshot.update_node(&node, health); assert!(is_updated); - let weighted_node = snapshot.weighted_nodes.first().unwrap(); - assert_eq!( - weighted_node.latency_mov_avg.get_average(), - Duration::from_millis(2000) - ); - assert_eq!(weighted_node.weight, 0.5); + let node_with_metrics = snapshot.nodes_with_metrics.first().unwrap(); + assert_eq!(node_with_metrics.score, (2.0 / 3.0 + 1.0 / 2.0) / 3.0); // Check forth update with none let health = HealthCheckStatus::new(None); let is_updated = snapshot.update_node(&node, health); assert!(is_updated); - let weighted_node = snapshot.weighted_nodes.first().unwrap(); - let avg_latency = Duration::from_secs_f64((MAX_LATENCY.as_secs() as f64 + 6.0) / 4.0); - assert_eq!(weighted_node.latency_mov_avg.get_average(), avg_latency); - assert_eq!(weighted_node.weight, 1.0 / avg_latency.as_secs_f64()); - assert_eq!(snapshot.weighted_nodes.len(), 1); + let node_with_metrics = snapshot.nodes_with_metrics.first().unwrap(); + assert_eq!(node_with_metrics.score, (2.0 / 3.0 + 1.0 / 2.0) / 3.0); + assert!(!snapshot.has_nodes()); + assert_eq!(snapshot.nodes_with_metrics.len(), 1); assert_eq!(snapshot.existing_nodes.len(), 1); - assert_eq!(snapshot.next_node().unwrap(), node); + assert!(snapshot.next_node().is_none()); } #[test] fn test_sync_node_scenarios() { // Arrange + let window_size = 1; let mut snapshot = LatencyRoutingSnapshot::new(); let node_1 = Node::new("api1.com").unwrap(); // Sync with node_1 let nodes_changed = snapshot.sync_nodes(&[node_1.clone()]); assert!(nodes_changed); - 
assert!(snapshot.weighted_nodes.is_empty()); + assert!(snapshot.nodes_with_metrics.is_empty()); assert_eq!( snapshot.existing_nodes, HashSet::from_iter(vec![node_1.clone()]) ); // Add node_1 to weighted_nodes manually - snapshot.weighted_nodes.push(WeightedNode { - node: node_1.clone(), - latency_mov_avg: LatencyMovAvg::from_zero(Duration::ZERO), - weight: 0.0, - }); + snapshot + .nodes_with_metrics + .push(NodeWithMetrics::new(node_1.clone(), window_size)); // Sync with node_1 again let nodes_changed = snapshot.sync_nodes(&[node_1.clone()]); assert!(!nodes_changed); @@ -285,7 +424,7 @@ mod tests { snapshot.existing_nodes, HashSet::from_iter(vec![node_1.clone()]) ); - assert_eq!(snapshot.weighted_nodes[0].node, node_1); + assert_eq!(snapshot.nodes_with_metrics[0].node, node_1); // Sync with node_2 let node_2 = Node::new("api2.com").unwrap(); let nodes_changed = snapshot.sync_nodes(&[node_2.clone()]); @@ -295,13 +434,11 @@ mod tests { HashSet::from_iter(vec![node_2.clone()]) ); // Make sure node_1 was removed from weighted_nodes too - assert!(snapshot.weighted_nodes.is_empty()); + assert!(snapshot.nodes_with_metrics.is_empty()); // Add node_2 to weighted_nodes manually - snapshot.weighted_nodes.push(WeightedNode { - node: node_2.clone(), - latency_mov_avg: LatencyMovAvg::from_zero(Duration::ZERO), - weight: 0.0, - }); + snapshot + .nodes_with_metrics + .push(NodeWithMetrics::new(node_2.clone(), window_size)); // Sync with [node_2, node_3] let node_3 = Node::new("api3.com").unwrap(); let nodes_changed = snapshot.sync_nodes(&[node_3.clone(), node_2.clone()]); @@ -310,23 +447,22 @@ mod tests { snapshot.existing_nodes, HashSet::from_iter(vec![node_3.clone(), node_2.clone()]) ); - assert_eq!(snapshot.weighted_nodes[0].node, node_2); + assert_eq!(snapshot.nodes_with_metrics[0].node, node_2); // Add node_3 to weighted_nodes manually - snapshot.weighted_nodes.push(WeightedNode { - node: node_3, - latency_mov_avg: LatencyMovAvg::from_zero(Duration::ZERO), - weight: 0.0, - 
}); + snapshot + .nodes_with_metrics + .push(NodeWithMetrics::new(node_3, window_size)); // Sync with [] let nodes_changed = snapshot.sync_nodes(&[]); assert!(nodes_changed); assert!(snapshot.existing_nodes.is_empty()); // Make sure all nodes were removed from the healthy_nodes - assert!(snapshot.weighted_nodes.is_empty()); + assert!(snapshot.nodes_with_metrics.is_empty()); // Sync with [] again let nodes_changed = snapshot.sync_nodes(&[]); assert!(!nodes_changed); assert!(snapshot.existing_nodes.is_empty()); + assert!(!snapshot.has_nodes()); } #[test] @@ -386,56 +522,110 @@ mod tests { } #[test] - // #[ignore] + fn test_compute_score_with_penalty() { + let use_penalty = true; + + // Test empty arrays + let weights: &[f64] = &[]; + let weights_sum: f64 = weights.iter().sum(); + let availabilities = VecDeque::new(); + let latencies = VecDeque::new(); + + let score = compute_score( + weights, + weights_sum, + &availabilities, + &latencies, + use_penalty, + ); + assert_eq!(score, 0.0); + + // Test arrays with one element. + let weights: &[f64] = &[2.0, 1.0]; + let weights_sum: f64 = weights.iter().sum(); + let availabilities = vec![true].into(); + let latencies = vec![2.0].into(); + let score = compute_score( + weights, + weights_sum, + &availabilities, + &latencies, + use_penalty, + ); + let score_l = (2.0 / 2.0) / 2.0; + let score_a = 1.0; + assert_eq!(score, score_l * score_a); + + // Test arrays with two element. + let weights: &[f64] = &[2.0, 1.0]; + let weights_sum: f64 = weights.iter().sum(); + let availabilities = vec![true, false].into(); + let latencies = vec![1.0, 2.0].into(); + let score = compute_score( + weights, + weights_sum, + &availabilities, + &latencies, + use_penalty, + ); + let score_l = (2.0 / 2.0 + 1.0 / 1.0) / weights_sum; + let score_a = (2.0 * 0.0 + 1.0 * 1.0) / weights_sum; + assert_eq!(score, score_l * score_a); + + // Test arrays of different sizes. 
+ let weights: &[f64] = &[3.0, 2.0, 1.0]; + let weights_sum: f64 = weights.iter().sum(); + let availabilities = vec![true, false, true].into(); + let latencies = vec![1.0, 2.0].into(); + let score = compute_score( + weights, + weights_sum, + &availabilities, + &latencies, + use_penalty, + ); + let score_l = (3.0 / 2.0 + 2.0 / 1.0) / 5.0; + let score_a = (3.0 * 1.0 + 2.0 * 0.0 + 1.0 * 1.0) / weights_sum; + assert_eq!(score, score_l * score_a); + } + + #[test] + #[ignore] // This test is for manual runs to see the statistics for nodes selection probability. fn test_stats_for_next_n_nodes() { // Arrange let mut snapshot = LatencyRoutingSnapshot::new(); + + let window_size = 1; + let node_1 = Node::new("api1.com").unwrap(); let node_2 = Node::new("api2.com").unwrap(); let node_3 = Node::new("api3.com").unwrap(); let node_4 = Node::new("api4.com").unwrap(); - let node_5 = Node::new("api5.com").unwrap(); - let node_6 = Node::new("api6.com").unwrap(); - let latency_mov_avg = LatencyMovAvg::from_zero(Duration::ZERO); - snapshot.weighted_nodes = vec![ - WeightedNode { - node: node_2.clone(), - latency_mov_avg: latency_mov_avg.clone(), - weight: 8.0, - }, - WeightedNode { - node: node_3.clone(), - latency_mov_avg: latency_mov_avg.clone(), - weight: 4.0, - }, - WeightedNode { - node: node_1.clone(), - latency_mov_avg: latency_mov_avg.clone(), - weight: 16.0, - }, - WeightedNode { - node: node_6.clone(), - latency_mov_avg: latency_mov_avg.clone(), - weight: 2.0, - }, - WeightedNode { - node: node_5.clone(), - latency_mov_avg: latency_mov_avg.clone(), - weight: 1.0, - }, - WeightedNode { - node: node_4.clone(), - latency_mov_avg: latency_mov_avg.clone(), - weight: 4.1, - }, - ]; + + let mut node_1 = NodeWithMetrics::new(node_1, window_size); + let mut node_2 = NodeWithMetrics::new(node_2, window_size); + let mut node_3 = NodeWithMetrics::new(node_3, window_size); + let mut node_4 = NodeWithMetrics::new(node_4, window_size); + + node_1.is_healthy = true; + node_2.is_healthy = 
true; + node_3.is_healthy = true; + node_4.is_healthy = false; + + node_1.score = 16.0; + node_2.score = 8.0; + node_3.score = 4.0; + // even though the score is high, this node should never be selected as it is unhealthy + node_4.score = 30.0; + + snapshot.nodes_with_metrics = vec![node_1, node_2, node_3, node_4]; let mut stats = HashMap::new(); let experiments = 30; - let select_nodes_count = 10; + let select_nodes_count = 1; for i in 0..experiments { - let nodes = snapshot.next_n_nodes(select_nodes_count).unwrap(); + let nodes = snapshot.next_n_nodes(select_nodes_count); println!("Experiment {i}: selected nodes {nodes:?}"); for item in nodes.into_iter() { *stats.entry(item).or_insert(1) += 1; diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/round_robin_routing.rs b/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/round_robin_routing.rs index 318eb379..2f3fd421 100644 --- a/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/round_robin_routing.rs +++ b/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/round_robin_routing.rs @@ -45,16 +45,16 @@ impl RoutingSnapshot for RoundRobinRoutingSnapshot { .cloned() } - fn next_n_nodes(&self, n: usize) -> Option> { + fn next_n_nodes(&self, n: usize) -> Vec { if n == 0 { - return Some(Vec::new()); + return Vec::new(); } let healthy_nodes = Vec::from_iter(self.healthy_nodes.clone()); let healthy_count = healthy_nodes.len(); if n >= healthy_count { - return Some(healthy_nodes.clone()); + return healthy_nodes.clone(); } let idx = self.current_idx.fetch_add(n, Ordering::Relaxed) % healthy_count; @@ -67,7 +67,7 @@ impl RoutingSnapshot for RoundRobinRoutingSnapshot { nodes.extend_from_slice(&healthy_nodes[..n - nodes.len()]); } - Some(nodes) + nodes } fn sync_nodes(&mut self, nodes: &[Node]) -> bool { @@ -296,13 +296,13 @@ mod tests { ]; snapshot.healthy_nodes.extend(nodes.clone()); // First call - let mut n_nodes: Vec<_> = snapshot.next_n_nodes(3).expect("failed to get nodes"); + 
let mut n_nodes: Vec<_> = snapshot.next_n_nodes(3); // Second call - n_nodes.extend(snapshot.next_n_nodes(3).expect("failed to get nodes")); + n_nodes.extend(snapshot.next_n_nodes(3)); // Third call - n_nodes.extend(snapshot.next_n_nodes(4).expect("failed to get nodes")); + n_nodes.extend(snapshot.next_n_nodes(4)); // Fourth call - n_nodes.extend(snapshot.next_n_nodes(5).expect("failed to get nodes")); + n_nodes.extend(snapshot.next_n_nodes(5)); // Assert each node was returned 3 times let k = 3; let mut count_map = HashMap::new(); diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/routing_snapshot.rs b/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/routing_snapshot.rs index 242abdfe..5357b271 100644 --- a/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/routing_snapshot.rs +++ b/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/routing_snapshot.rs @@ -9,7 +9,7 @@ pub trait RoutingSnapshot: Send + Sync + Clone + Debug { /// Get next node from the snapshot. fn next_node(&self) -> Option; /// Get up to n different nodes from the snapshot. - fn next_n_nodes(&self, n: usize) -> Option>; + fn next_n_nodes(&self, n: usize) -> Vec; /// Syncs the nodes in the snapshot with the provided list of nodes, returning `true` if the snapshot was updated. fn sync_nodes(&mut self, nodes: &[Node]) -> bool; /// Updates the health status of a specific node, returning `true` if the node was found and updated. 
From aeba12e4c479552296349556e11516fdcb520692 Mon Sep 17 00:00:00 2001 From: Adam Spofford Date: Mon, 23 Sep 2024 12:57:41 -0700 Subject: [PATCH 4/8] Merge dynamic-routing as-is under a feature flag --- Cargo.lock | 134 +++++++++++++++++- ic-agent/Cargo.toml | 18 ++- ic-agent/src/agent/route_provider.rs | 3 + .../dynamic_routing/dynamic_route_provider.rs | 85 ++++------- .../dynamic_routing/health_check.rs | 86 +++++------ .../dynamic_routing/messages.rs | 2 +- .../dynamic_routing/mod.rs | 3 + .../dynamic_routing/node.rs | 2 +- .../dynamic_routing/nodes_fetch.rs | 108 +++++++------- .../snapshot/latency_based_routing.rs | 4 +- .../dynamic_routing/snapshot/mod.rs | 0 .../snapshot/round_robin_routing.rs | 4 +- .../snapshot/routing_snapshot.rs | 2 +- .../dynamic_routing/test_utils.rs | 4 +- .../dynamic_routing/type_aliases.rs | 9 +- ic-agent/src/util.rs | 11 ++ 16 files changed, 304 insertions(+), 171 deletions(-) rename ic-agent/src/agent/{http_transport => route_provider}/dynamic_routing/dynamic_route_provider.rs (92%) rename ic-agent/src/agent/{http_transport => route_provider}/dynamic_routing/health_check.rs (80%) rename ic-agent/src/agent/{http_transport => route_provider}/dynamic_routing/messages.rs (85%) rename ic-agent/src/agent/{http_transport => route_provider}/dynamic_routing/mod.rs (74%) rename ic-agent/src/agent/{http_transport => route_provider}/dynamic_routing/node.rs (96%) rename ic-agent/src/agent/{http_transport => route_provider}/dynamic_routing/nodes_fetch.rs (53%) rename ic-agent/src/agent/{http_transport => route_provider}/dynamic_routing/snapshot/latency_based_routing.rs (99%) rename ic-agent/src/agent/{http_transport => route_provider}/dynamic_routing/snapshot/mod.rs (100%) rename ic-agent/src/agent/{http_transport => route_provider}/dynamic_routing/snapshot/round_robin_routing.rs (99%) rename ic-agent/src/agent/{http_transport => route_provider}/dynamic_routing/snapshot/routing_snapshot.rs (92%) rename ic-agent/src/agent/{http_transport => 
route_provider}/dynamic_routing/test_utils.rs (97%) rename ic-agent/src/agent/{http_transport => route_provider}/dynamic_routing/type_aliases.rs (63%) diff --git a/Cargo.lock b/Cargo.lock index 9d453a98..341c9e56 100644 --- a/Cargo.lock +++ b/Cargo.lock @@ -108,6 +108,12 @@ version = "1.3.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "7d5a26814d8dcb93b0e5a0ff3c6d80a8843bafb21b39e8e18a6f05471870e110" +[[package]] +name = "arc-swap" +version = "1.7.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "69f7f8c3906b62b754cd5326047894316021dcfe5a194c8ea52bdd94934a3457" + [[package]] name = "arrayvec" version = "0.5.2" @@ -133,13 +139,24 @@ dependencies = [ "serde_json", ] +[[package]] +name = "async-channel" +version = "1.9.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "81953c529336010edd6d8e358f886d9581267795c61b19475b71314bffa46d35" +dependencies = [ + "concurrent-queue", + "event-listener 2.5.3", + "futures-core", +] + [[package]] name = "async-lock" version = "3.4.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "ff6e472cdea888a4bd64f342f09b3f50e1886d32afe8df3d663c01140b811b18" dependencies = [ - "event-listener", + "event-listener 5.3.1", "event-listener-strategy", "pin-project-lite", ] @@ -155,6 +172,15 @@ dependencies = [ "syn 2.0.76", ] +[[package]] +name = "async-watch" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "a078faf4e27c0c6cc0efb20e5da59dcccc04968ebf2801d8e0b2195124cdcdb2" +dependencies = [ + "event-listener 2.5.3", +] + [[package]] name = "atomic-waker" version = "1.1.2" @@ -682,6 +708,12 @@ version = "1.0.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "5443807d6dff69373d433ab9ef5378ad8df50ca6298caf15de6e52e24aaf54d5" +[[package]] +name = "event-listener" +version = "2.5.3" +source = "registry+https://github.com/rust-lang/crates.io-index" 
+checksum = "0206175f82b8d6bf6652ff7d71a1e27fd2e4efde587fd368662814d6ec1d9ce0" + [[package]] name = "event-listener" version = "5.3.1" @@ -699,7 +731,7 @@ version = "0.5.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0f214dc438f977e6d4e3500aaa277f5ad94ca83fbbd9b1a15713ce2344ccc5a1" dependencies = [ - "event-listener", + "event-listener 5.3.1", "pin-project-lite", ] @@ -1014,7 +1046,11 @@ dependencies = [ name = "ic-agent" version = "0.37.1" dependencies = [ + "arc-swap", + "async-channel", "async-lock", + "async-trait", + "async-watch", "backoff", "cached", "candid", @@ -1047,9 +1083,12 @@ dependencies = [ "serde_repr", "sha2 0.10.8", "simple_asn1", + "stop-token", "thiserror", "time", "tokio", + "tracing", + "tracing-subscriber", "url", "wasm-bindgen", "wasm-bindgen-futures", @@ -1468,6 +1507,16 @@ version = "1.0.6" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "650eef8c711430f1a879fdd01d4745a7deea475becfb90269c06775983bbf086" +[[package]] +name = "nu-ansi-term" +version = "0.46.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "77a8165726e8236064dbb45459242600304b42a5ea24ee2948e18e023bf7ba84" +dependencies = [ + "overload", + "winapi", +] + [[package]] name = "num-bigint" version = "0.2.6" @@ -1535,6 +1584,12 @@ version = "0.3.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c08d65885ee38876c4f86fa503fb49d7b507c2b62552df7c70b2fce627e06381" +[[package]] +name = "overload" +version = "0.1.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b15813163c1d831bf4a13c3610c05c0d03b39feb07f7e09fa234dac9b15aaf39" + [[package]] name = "p256" version = "0.13.2" @@ -2190,6 +2245,15 @@ dependencies = [ "digest 0.10.7", ] +[[package]] +name = "sharded-slab" +version = "0.1.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f40ca3c46823713e0d4209592e8d6e826aa57e928f09752619fc696c499637f6" 
+dependencies = [ + "lazy_static", +] + [[package]] name = "shlex" version = "1.3.0" @@ -2293,6 +2357,18 @@ dependencies = [ "windows-sys 0.36.1", ] +[[package]] +name = "stop-token" +version = "0.7.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "af91f480ee899ab2d9f8435bfdfc14d08a5754bd9d3fef1f1a1c23336aad6c8b" +dependencies = [ + "async-channel", + "cfg-if", + "futures-core", + "pin-project-lite", +] + [[package]] name = "string_cache" version = "0.8.7" @@ -2414,6 +2490,16 @@ dependencies = [ "syn 2.0.76", ] +[[package]] +name = "thread_local" +version = "1.1.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8b9ef9bad013ada3808854ceac7b46812a6465ba368859a37e2100283d2d719c" +dependencies = [ + "cfg-if", + "once_cell", +] + [[package]] name = "time" version = "0.3.36" @@ -2557,9 +2643,21 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c3523ab5a71916ccf420eebdf5521fcef02141234bbc0b8a49f2fdc4544364ef" dependencies = [ "pin-project-lite", + "tracing-attributes", "tracing-core", ] +[[package]] +name = "tracing-attributes" +version = "0.1.27" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "34704c8d6ebcbc939824180af020566b01a7c01f80641264eba0999f6c2b6be7" +dependencies = [ + "proc-macro2", + "quote", + "syn 2.0.76", +] + [[package]] name = "tracing-core" version = "0.1.32" @@ -2567,6 +2665,32 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "c06d3da6113f116aaee68e4d601191614c9053067f9ab7f6edbcb161237daa54" dependencies = [ "once_cell", + "valuable", +] + +[[package]] +name = "tracing-log" +version = "0.2.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "ee855f1f400bd0e5c02d150ae5de3840039a3f54b025156404e34c23c03f47c3" +dependencies = [ + "log", + "once_cell", + "tracing-core", +] + +[[package]] +name = "tracing-subscriber" +version = "0.3.18" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "ad0f048c97dbd9faa9b7df56362b8ebcaa52adb06b498c050d2f4e32f90a7a8b" +dependencies = [ + "nu-ansi-term", + "sharded-slab", + "smallvec", + "thread_local", + "tracing-core", + "tracing-log", ] [[package]] @@ -2649,6 +2773,12 @@ version = "0.2.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "06abde3611657adf66d383f00b093d7faecc7fa57071cce2578660c9f1010821" +[[package]] +name = "valuable" +version = "0.1.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "830b7e5d4d90034032940e4ace0d9a9a057e7a45cd94e6c007832e39edb82f6d" + [[package]] name = "version_check" version = "0.9.5" diff --git a/ic-agent/Cargo.toml b/ic-agent/Cargo.toml index 9c5462b7..886c54e1 100644 --- a/ic-agent/Cargo.toml +++ b/ic-agent/Cargo.toml @@ -15,7 +15,11 @@ keywords = ["internet-computer", "agent", "icp", "dfinity"] include = ["src", "Cargo.toml", "../LICENSE", "README.md"] [dependencies] +arc-swap = { version = "1.7", optional = true } +async-channel = { version = "1.9", optional = true } async-lock = "3.3" +async-trait = { version = "0.1", optional = true } +async-watch = { version = "0.3", optional = true } backoff = "0.4.0" cached = { version = "0.52", features = ["ahash"], default-features = false } candid = { workspace = true } @@ -41,8 +45,10 @@ serde_cbor = { workspace = true } serde_repr = { workspace = true } sha2 = { workspace = true } simple_asn1 = "0.6.1" +stop-token = { version = "0.7", optional = true } thiserror = { workspace = true } time = { workspace = true } +tracing = { version = "0.1", optional = true } url = "2.1.0" [dependencies.reqwest] @@ -67,6 +73,7 @@ web-sys = { version = "0.3", features = ["Window"], optional = true } [dev-dependencies] serde_json.workspace = true +tracing-subscriber = "0.3" [target.'cfg(not(target_family = "wasm"))'.dev-dependencies] tokio = { workspace = true, features = ["full"] } @@ -99,8 +106,17 @@ wasm-bindgen = [ 
"backoff/wasm-bindgen", "cached/wasm", ] +_internal_dynamic-routing = [ + "dep:arc-swap", + "dep:async-channel", + "dep:async-trait", + "dep:async-watch", + "dep:stop-token", + "tracing", +] +tracing = ["dep:tracing"] # Does very little right now. [package.metadata.docs.rs] targets = ["x86_64-unknown-linux-gnu", "wasm32-unknown-unknown"] rustdoc-args = ["--cfg=docsrs"] -features = ["hyper"] +features = ["_internal_dynamic-routing"] diff --git a/ic-agent/src/agent/route_provider.rs b/ic-agent/src/agent/route_provider.rs index 0db3d2a0..029508e9 100644 --- a/ic-agent/src/agent/route_provider.rs +++ b/ic-agent/src/agent/route_provider.rs @@ -7,6 +7,9 @@ use url::Url; use crate::agent::AgentError; +#[cfg(feature = "_internal_dynamic-routing")] +pub mod dynamic_routing; + const IC0_DOMAIN: &str = "ic0.app"; const ICP0_DOMAIN: &str = "icp0.io"; const ICP_API_DOMAIN: &str = "icp-api.io"; diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/dynamic_route_provider.rs b/ic-agent/src/agent/route_provider/dynamic_routing/dynamic_route_provider.rs similarity index 92% rename from ic-agent/src/agent/http_transport/dynamic_routing/dynamic_route_provider.rs rename to ic-agent/src/agent/route_provider/dynamic_routing/dynamic_route_provider.rs index e05f6f4d..3611fbca 100644 --- a/ic-agent/src/agent/http_transport/dynamic_routing/dynamic_route_provider.rs +++ b/ic-agent/src/agent/route_provider/dynamic_routing/dynamic_route_provider.rs @@ -7,19 +7,15 @@ use std::{ use arc_swap::ArcSwap; use candid::Principal; +use futures_util::FutureExt; use reqwest::Client; +use stop_token::StopSource; use thiserror::Error; -use tokio::{ - runtime::Handle, - sync::{mpsc, watch}, - time::timeout, -}; -use tokio_util::{sync::CancellationToken, task::TaskTracker}; use tracing::{error, info, warn}; use url::Url; use crate::{ - agent::http_transport::{ + agent::route_provider::{ dynamic_routing::{ health_check::{HealthCheck, HealthChecker, HealthManagerActor}, messages::FetchedNodes, @@ -28,7 
+24,7 @@ use crate::{ snapshot::routing_snapshot::RoutingSnapshot, type_aliases::AtomicSwap, }, - route_provider::RouteProvider, + RouteProvider, }, AgentError, }; @@ -64,12 +60,10 @@ pub struct DynamicRouteProvider { check_period: Duration, /// Snapshot of the routing nodes. routing_snapshot: AtomicSwap, - /// Task tracker for managing the spawned tasks. - tracker: TaskTracker, /// Initial seed nodes, which are used for the initial fetching of the nodes. seeds: Vec, /// Cancellation token for stopping the spawned tasks. - token: CancellationToken, + token: StopSource, } /// An error that occurred when the DynamicRouteProvider service was running. @@ -153,9 +147,8 @@ impl DynamicRouteProviderBuilder { checker: self.checker, check_period: self.check_period, routing_snapshot: self.routing_snapshot, - tracker: TaskTracker::new(), seeds: self.seeds, - token: CancellationToken::new(), + token: StopSource::new(), }; route_provider.run().await; @@ -203,10 +196,10 @@ where pub async fn run(&self) { info!("{DYNAMIC_ROUTE_PROVIDER}: started ..."); // Communication channel between NodesFetchActor and HealthManagerActor. - let (fetch_sender, fetch_receiver) = watch::channel(None); + let (fetch_sender, fetch_receiver) = async_watch::channel(None); // Communication channel with HealthManagerActor to receive info about healthy seed nodes (used only once). - let (init_sender, mut init_receiver) = mpsc::channel(1); + let (init_sender, init_receiver) = async_channel::bounded(1); // Start the receiving part first. 
let health_manager_actor = HealthManagerActor::new( @@ -215,10 +208,9 @@ where Arc::clone(&self.routing_snapshot), fetch_receiver, init_sender, - self.token.clone(), + self.token.token(), ); - self.tracker - .spawn(async move { health_manager_actor.run().await }); + crate::util::spawn(async move { health_manager_actor.run().await }); // Dispatch all seed nodes for initial health checks if let Err(err) = fetch_sender.send(Some(FetchedNodes { @@ -229,16 +221,20 @@ where // Try await for healthy seeds. let start = Instant::now(); - match timeout(TIMEOUT_AWAIT_HEALTHY_SEED, init_receiver.recv()).await { - Ok(_) => info!( - "{DYNAMIC_ROUTE_PROVIDER}: found healthy seeds within {:?}", - start.elapsed() - ), - Err(_) => warn!( - "{DYNAMIC_ROUTE_PROVIDER}: no healthy seeds found within {:?}", - start.elapsed() - ), - }; + futures_util::select! { + _ = crate::util::sleep(TIMEOUT_AWAIT_HEALTHY_SEED).fuse() => { + warn!( + "{DYNAMIC_ROUTE_PROVIDER}: no healthy seeds found within {:?}", + start.elapsed() + ); + } + _ = init_receiver.recv().fuse() => { + info!( + "{DYNAMIC_ROUTE_PROVIDER}: found healthy seeds within {:?}", + start.elapsed() + ); + } + } // We can close the channel now. init_receiver.close(); @@ -248,33 +244,15 @@ where self.fetch_retry_interval, fetch_sender, Arc::clone(&self.routing_snapshot), - self.token.clone(), + self.token.token(), ); - self.tracker.spawn(async move { fetch_actor.run().await }); + crate::util::spawn(async move { fetch_actor.run().await }); info!( "{DYNAMIC_ROUTE_PROVIDER}: NodesFetchActor and HealthManagerActor started successfully" ); } } -// Gracefully stop the inner spawned tasks running in the background. -impl Drop for DynamicRouteProvider { - fn drop(&mut self) { - self.token.cancel(); - self.tracker.close(); - let tracker = self.tracker.clone(); - // If no runtime is available do nothing. 
- if let Ok(handle) = Handle::try_current() { - handle.spawn(async move { - tracker.wait().await; - warn!("{DYNAMIC_ROUTE_PROVIDER}: stopped gracefully"); - }); - } else { - error!("{DYNAMIC_ROUTE_PROVIDER}: no runtime available, cannot stop the spawned tasks"); - } - } -} - #[cfg(test)] mod tests { use candid::Principal; @@ -287,7 +265,7 @@ mod tests { use tracing_subscriber::FmtSubscriber; use crate::{ - agent::http_transport::{ + agent::route_provider::{ dynamic_routing::{ dynamic_route_provider::{ DynamicRouteProviderBuilder, IC0_SEED_DOMAIN, MAINNET_ROOT_SUBNET_ID, @@ -301,8 +279,7 @@ mod tests { assert_routed_domains, route_n_times, NodeHealthCheckerMock, NodesFetcherMock, }, }, - route_provider::RouteProvider, - ReqwestTransport, + RouteProvider, }, Agent, AgentError, }; @@ -367,11 +344,9 @@ mod tests { .build() .await; let route_provider = Arc::new(route_provider) as Arc; - let transport = - ReqwestTransport::create_with_client_route(Arc::clone(&route_provider), client) - .expect("failed to create transport"); let agent = Agent::builder() - .with_transport(transport) + .with_arc_route_provider(Arc::clone(&route_provider)) + .with_http_client(client) .build() .expect("failed to create an agent"); let subnet_id = Principal::from_text(MAINNET_ROOT_SUBNET_ID).unwrap(); diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/health_check.rs b/ic-agent/src/agent/route_provider/dynamic_routing/health_check.rs similarity index 80% rename from ic-agent/src/agent/http_transport/dynamic_routing/health_check.rs rename to ic-agent/src/agent/route_provider/dynamic_routing/health_check.rs index 491f010b..daa00e1a 100644 --- a/ic-agent/src/agent/http_transport/dynamic_routing/health_check.rs +++ b/ic-agent/src/agent/route_provider/dynamic_routing/health_check.rs @@ -1,4 +1,5 @@ use async_trait::async_trait; +use futures_util::FutureExt; use http::{Method, StatusCode}; use reqwest::{Client, Request}; use std::{ @@ -6,12 +7,11 @@ use std::{ sync::Arc, time::{Duration, 
Instant}, }; -use tokio::{sync::mpsc, time}; -use tokio_util::{sync::CancellationToken, task::TaskTracker}; +use stop_token::{StopSource, StopToken}; use tracing::{debug, error, info, warn}; use url::Url; -use crate::agent::http_transport::dynamic_routing::{ +use crate::agent::route_provider::dynamic_routing::{ dynamic_route_provider::DynamicRouteProviderError, messages::{FetchedNodes, NodeHealthState}, node::Node, @@ -113,7 +113,7 @@ struct HealthCheckActor { /// The sender channel (listener) to send the health status. sender_channel: SenderMpsc, /// The cancellation token of the actor. - token: CancellationToken, + token: StopToken, } impl HealthCheckActor { @@ -122,7 +122,7 @@ impl HealthCheckActor { period: Duration, node: Node, sender_channel: SenderMpsc, - token: CancellationToken, + token: StopToken, ) -> Self { Self { checker, @@ -135,19 +135,22 @@ impl HealthCheckActor { /// Runs the actor. async fn run(self) { - let mut interval = time::interval(self.period); loop { - tokio::select! { - _ = interval.tick() => { - let health = self.checker.check(&self.node).await.unwrap_or_default(); - let message = NodeHealthState { - node: self.node.clone(), - health, - }; - // Inform the listener about node's health. It can only fail if the listener was closed/dropped. - self.sender_channel.send(message).await.expect("Failed to send node's health state"); + let health = self.checker.check(&self.node).await.unwrap_or_default(); + let message = NodeHealthState { + node: self.node.clone(), + health, + }; + // Inform the listener about node's health. It can only fail if the listener was closed/dropped. + self.sender_channel + .send(message) + .await + .expect("Failed to send node's health state"); + futures_util::select! 
{ + _ = crate::util::sleep(self.period).fuse() => { + continue; } - _ = self.token.cancelled() => { + _ = self.token.clone().fuse() => { info!("{HEALTH_CHECK_ACTOR}: was gracefully cancelled for node {:?}", self.node); break; } @@ -178,11 +181,9 @@ pub(super) struct HealthManagerActor { /// The sender channel to send the initialization status to DynamicRouteProvider (used only once in the init phase). init_sender: SenderMpsc, /// The cancellation token of the actor. - token: CancellationToken, + token: StopToken, /// The cancellation token for all the health checks. - nodes_token: CancellationToken, - /// The task tracker of the health checks, waiting for the tasks to exit (graceful termination). - nodes_tracker: TaskTracker, + nodes_token: StopSource, /// The flag indicating if this actor is initialized with healthy nodes. is_initialized: bool, } @@ -198,9 +199,9 @@ where routing_snapshot: AtomicSwap, fetch_receiver: ReceiverWatch, init_sender: SenderMpsc, - token: CancellationToken, + token: StopToken, ) -> Self { - let (check_sender, check_receiver) = mpsc::channel(CHANNEL_BUFFER); + let (check_sender, check_receiver) = async_channel::bounded(CHANNEL_BUFFER); Self { checker, @@ -211,8 +212,7 @@ where check_receiver, init_sender, token, - nodes_token: CancellationToken::new(), - nodes_tracker: TaskTracker::new(), + nodes_token: StopSource::new(), is_initialized: false, } } @@ -220,23 +220,27 @@ where /// Runs the actor. pub async fn run(mut self) { loop { - tokio::select! { + futures_util::select! { // Process a new array of fetched nodes from NodesFetchActor, if it appeared in the channel. 
- result = self.fetch_receiver.changed() => { - if let Err(err) = result { - error!("{HEALTH_MANAGER_ACTOR}: nodes fetch sender has been dropped: {err:?}"); - self.token.cancel(); - continue; - } + result = self.fetch_receiver.recv().fuse() => { + let value = match result { + Ok(value) => value, + Err(err) => { + error!("{HEALTH_MANAGER_ACTOR}: nodes fetch sender has been dropped: {err:?}"); + continue; + } + }; // Get the latest value from the channel and mark it as seen. - let Some(FetchedNodes { nodes }) = self.fetch_receiver.borrow_and_update().clone() else { continue }; + let Some(FetchedNodes { nodes }) = value else { continue }; self.handle_fetch_update(nodes).await; } // Receive health check messages from all running HealthCheckActor/s. - Some(msg) = self.check_receiver.recv() => { - self.handle_health_update(msg).await; + msg_opt = self.check_receiver.recv().fuse() => { + if let Ok(msg) = msg_opt { + self.handle_health_update(msg).await; + } } - _ = self.token.cancelled() => { + _ = self.token.clone().fuse() => { self.stop_all_checks().await; self.check_receiver.close(); warn!("{HEALTH_MANAGER_ACTOR}: was gracefully cancelled, all nodes health checks stopped"); @@ -279,7 +283,7 @@ where fn start_checks(&mut self, nodes: Vec) { // Create a single cancellation token for all started health checks. 
- self.nodes_token = CancellationToken::new(); + self.nodes_token = StopSource::new(); for node in nodes { debug!("{HEALTH_MANAGER_ACTOR}: starting health check for node {node:?}"); let actor = HealthCheckActor::new( @@ -287,16 +291,14 @@ where self.period, node, self.check_sender.clone(), - self.nodes_token.clone(), + self.nodes_token.token(), ); - self.nodes_tracker.spawn(async move { actor.run().await }); + crate::util::spawn(async move { actor.run().await }); } } - async fn stop_all_checks(&self) { + async fn stop_all_checks(&mut self) { warn!("{HEALTH_MANAGER_ACTOR}: stopping all running health checks"); - self.nodes_token.cancel(); - self.nodes_tracker.close(); - self.nodes_tracker.wait().await; + self.nodes_token = StopSource::new(); } } diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/messages.rs b/ic-agent/src/agent/route_provider/dynamic_routing/messages.rs similarity index 85% rename from ic-agent/src/agent/http_transport/dynamic_routing/messages.rs rename to ic-agent/src/agent/route_provider/dynamic_routing/messages.rs index 5feeae25..61876389 100644 --- a/ic-agent/src/agent/http_transport/dynamic_routing/messages.rs +++ b/ic-agent/src/agent/route_provider/dynamic_routing/messages.rs @@ -1,4 +1,4 @@ -use crate::agent::http_transport::dynamic_routing::{health_check::HealthCheckStatus, node::Node}; +use crate::agent::route_provider::dynamic_routing::{health_check::HealthCheckStatus, node::Node}; /// Represents a message with fetched nodes. #[derive(Debug, Clone)] diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/mod.rs b/ic-agent/src/agent/route_provider/dynamic_routing/mod.rs similarity index 74% rename from ic-agent/src/agent/http_transport/dynamic_routing/mod.rs rename to ic-agent/src/agent/route_provider/dynamic_routing/mod.rs index 07570f0f..ccb29064 100644 --- a/ic-agent/src/agent/http_transport/dynamic_routing/mod.rs +++ b/ic-agent/src/agent/route_provider/dynamic_routing/mod.rs @@ -1,4 +1,7 @@ //! 
Dynamic routing implementation. +//! +//! This is an internal unstable feature. It works, but it's still in the oven; its design will go through drastic changes before it is released. + pub mod dynamic_route_provider; /// Health check implementation. pub mod health_check; diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/node.rs b/ic-agent/src/agent/route_provider/dynamic_routing/node.rs similarity index 96% rename from ic-agent/src/agent/http_transport/dynamic_routing/node.rs rename to ic-agent/src/agent/route_provider/dynamic_routing/node.rs index 37716da3..3aa7ca7f 100644 --- a/ic-agent/src/agent/http_transport/dynamic_routing/node.rs +++ b/ic-agent/src/agent/route_provider/dynamic_routing/node.rs @@ -1,7 +1,7 @@ use url::Url; use crate::agent::{ - http_transport::dynamic_routing::dynamic_route_provider::DynamicRouteProviderError, + route_provider::dynamic_routing::dynamic_route_provider::DynamicRouteProviderError, ApiBoundaryNode, }; diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/nodes_fetch.rs b/ic-agent/src/agent/route_provider/dynamic_routing/nodes_fetch.rs similarity index 53% rename from ic-agent/src/agent/http_transport/dynamic_routing/nodes_fetch.rs rename to ic-agent/src/agent/route_provider/dynamic_routing/nodes_fetch.rs index e887e668..9a47cab3 100644 --- a/ic-agent/src/agent/http_transport/dynamic_routing/nodes_fetch.rs +++ b/ic-agent/src/agent/route_provider/dynamic_routing/nodes_fetch.rs @@ -1,23 +1,20 @@ use async_trait::async_trait; use candid::Principal; +use futures_util::FutureExt; use reqwest::Client; use std::{fmt::Debug, sync::Arc, time::Duration}; -use tokio::time::{self, sleep}; -use tokio_util::sync::CancellationToken; +use stop_token::StopToken; use tracing::{error, warn}; use url::Url; use crate::agent::{ - http_transport::{ - dynamic_routing::{ - dynamic_route_provider::DynamicRouteProviderError, - health_check::HEALTH_MANAGER_ACTOR, - messages::FetchedNodes, - node::Node, - 
snapshot::routing_snapshot::RoutingSnapshot, - type_aliases::{AtomicSwap, SenderWatch}, - }, - reqwest_transport::ReqwestTransport, + route_provider::dynamic_routing::{ + dynamic_route_provider::DynamicRouteProviderError, + health_check::HEALTH_MANAGER_ACTOR, + messages::FetchedNodes, + node::Node, + snapshot::routing_snapshot::RoutingSnapshot, + type_aliases::{AtomicSwap, SenderWatch}, }, Agent, }; @@ -55,14 +52,9 @@ impl NodesFetcher { #[async_trait] impl Fetch for NodesFetcher { async fn fetch(&self, url: Url) -> Result, DynamicRouteProviderError> { - let transport = ReqwestTransport::create_with_client(url, self.http_client.clone()) - .map_err(|err| { - DynamicRouteProviderError::NodesFetchError(format!( - "Failed to build transport: {err}" - )) - })?; let agent = Agent::builder() - .with_transport(transport) + .with_url(url) + .with_http_client(self.http_client.clone()) .build() .map_err(|err| { DynamicRouteProviderError::NodesFetchError(format!( @@ -102,7 +94,7 @@ pub(super) struct NodesFetchActor { /// The snapshot of the routing table. routing_snapshot: AtomicSwap, /// The token to cancel/stop the actor. - token: CancellationToken, + token: StopToken, } impl NodesFetchActor @@ -116,7 +108,7 @@ where retry_interval: Duration, fetch_sender: SenderWatch, snapshot: AtomicSwap, - token: CancellationToken, + token: StopToken, ) -> Self { Self { fetcher, @@ -130,45 +122,47 @@ where /// Runs the actor. pub async fn run(self) { - let mut interval = time::interval(self.period); loop { - tokio::select! 
{ - _ = interval.tick() => { - // Retry until success: - // - try to get a healthy node from the routing snapshot - // - if snapshot is empty, break the cycle and wait for the next fetch cycle - // - using the healthy node, try to fetch nodes from topology - // - if failure, sleep and retry - // - try send fetched nodes to the listener - // - failure should never happen, but we trace it if it does - loop { - let snapshot = self.routing_snapshot.load(); - if let Some(node) = snapshot.next_node() { - match self.fetcher.fetch((&node).into()).await { - Ok(nodes) => { - let msg = Some( - FetchedNodes {nodes}); - match self.fetch_sender.send(msg) { - Ok(()) => break, // message sent successfully, exist the loop - Err(err) => { - error!("{NODES_FETCH_ACTOR}: failed to send results to {HEALTH_MANAGER_ACTOR}: {err:?}"); - } - } - }, - Err(err) => { - error!("{NODES_FETCH_ACTOR}: failed to fetch nodes: {err:?}"); - } - }; - } else { - // No healthy nodes in the snapshot, break the cycle and wait for the next fetch cycle - error!("{NODES_FETCH_ACTOR}: no nodes in the snapshot"); - break; - }; - warn!("Retrying to fetch the nodes in {:?}", self.fetch_retry_interval); - sleep(self.fetch_retry_interval).await; + // Retry until success: + // - try to get a healthy node from the routing snapshot + // - if snapshot is empty, break the cycle and wait for the next fetch cycle + // - using the healthy node, try to fetch nodes from topology + // - if failure, sleep and retry + // - try send fetched nodes to the listener + // - failure should never happen, but we trace it if it does + loop { + let snapshot = self.routing_snapshot.load(); + if let Some(node) = snapshot.next_node() { + match self.fetcher.fetch((&node).into()).await { + Ok(nodes) => { + let msg = Some(FetchedNodes { nodes }); + match self.fetch_sender.send(msg) { + Ok(()) => break, // message sent successfully, exist the loop + Err(err) => { + error!("{NODES_FETCH_ACTOR}: failed to send results to {HEALTH_MANAGER_ACTOR}: 
{err:?}"); + } + } } + Err(err) => { + error!("{NODES_FETCH_ACTOR}: failed to fetch nodes: {err:?}"); + } + }; + } else { + // No healthy nodes in the snapshot, break the cycle and wait for the next fetch cycle + error!("{NODES_FETCH_ACTOR}: no nodes in the snapshot"); + break; + }; + warn!( + "Retrying to fetch the nodes in {:?}", + self.fetch_retry_interval + ); + crate::util::sleep(self.fetch_retry_interval).await; + } + futures_util::select! { + _ = crate::util::sleep(self.period).fuse() => { + continue; } - _ = self.token.cancelled() => { + _ = self.token.clone().fuse() => { warn!("{NODES_FETCH_ACTOR}: was gracefully cancelled"); break; } diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/latency_based_routing.rs b/ic-agent/src/agent/route_provider/dynamic_routing/snapshot/latency_based_routing.rs similarity index 99% rename from ic-agent/src/agent/http_transport/dynamic_routing/snapshot/latency_based_routing.rs rename to ic-agent/src/agent/route_provider/dynamic_routing/snapshot/latency_based_routing.rs index 6b1ee0b0..2aa3fd90 100644 --- a/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/latency_based_routing.rs +++ b/ic-agent/src/agent/route_provider/dynamic_routing/snapshot/latency_based_routing.rs @@ -5,7 +5,7 @@ use std::{ use rand::Rng; -use crate::agent::http_transport::dynamic_routing::{ +use crate::agent::route_provider::dynamic_routing::{ health_check::HealthCheckStatus, node::Node, snapshot::routing_snapshot::RoutingSnapshot, }; @@ -321,7 +321,7 @@ mod tests { time::Duration, }; - use crate::agent::http_transport::dynamic_routing::{ + use crate::agent::route_provider::dynamic_routing::{ health_check::HealthCheckStatus, node::Node, snapshot::{ diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/mod.rs b/ic-agent/src/agent/route_provider/dynamic_routing/snapshot/mod.rs similarity index 100% rename from ic-agent/src/agent/http_transport/dynamic_routing/snapshot/mod.rs rename to 
ic-agent/src/agent/route_provider/dynamic_routing/snapshot/mod.rs diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/round_robin_routing.rs b/ic-agent/src/agent/route_provider/dynamic_routing/snapshot/round_robin_routing.rs similarity index 99% rename from ic-agent/src/agent/http_transport/dynamic_routing/snapshot/round_robin_routing.rs rename to ic-agent/src/agent/route_provider/dynamic_routing/snapshot/round_robin_routing.rs index 2f3fd421..67ce51e0 100644 --- a/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/round_robin_routing.rs +++ b/ic-agent/src/agent/route_provider/dynamic_routing/snapshot/round_robin_routing.rs @@ -6,7 +6,7 @@ use std::{ }, }; -use crate::agent::http_transport::dynamic_routing::{ +use crate::agent::route_provider::dynamic_routing::{ health_check::HealthCheckStatus, node::Node, snapshot::routing_snapshot::RoutingSnapshot, }; @@ -114,7 +114,7 @@ mod tests { use std::time::Duration; use std::{collections::HashSet, sync::atomic::Ordering}; - use crate::agent::http_transport::dynamic_routing::{ + use crate::agent::route_provider::dynamic_routing::{ health_check::HealthCheckStatus, node::Node, snapshot::{ diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/routing_snapshot.rs b/ic-agent/src/agent/route_provider/dynamic_routing/snapshot/routing_snapshot.rs similarity index 92% rename from ic-agent/src/agent/http_transport/dynamic_routing/snapshot/routing_snapshot.rs rename to ic-agent/src/agent/route_provider/dynamic_routing/snapshot/routing_snapshot.rs index 5357b271..9f9331a2 100644 --- a/ic-agent/src/agent/http_transport/dynamic_routing/snapshot/routing_snapshot.rs +++ b/ic-agent/src/agent/route_provider/dynamic_routing/snapshot/routing_snapshot.rs @@ -1,6 +1,6 @@ use std::fmt::Debug; -use crate::agent::http_transport::dynamic_routing::{health_check::HealthCheckStatus, node::Node}; +use crate::agent::route_provider::dynamic_routing::{health_check::HealthCheckStatus, node::Node}; /// A trait 
for interacting with the snapshot of nodes (routing table). pub trait RoutingSnapshot: Send + Sync + Clone + Debug { diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/test_utils.rs b/ic-agent/src/agent/route_provider/dynamic_routing/test_utils.rs similarity index 97% rename from ic-agent/src/agent/http_transport/dynamic_routing/test_utils.rs rename to ic-agent/src/agent/route_provider/dynamic_routing/test_utils.rs index 60004d75..fea43caf 100644 --- a/ic-agent/src/agent/http_transport/dynamic_routing/test_utils.rs +++ b/ic-agent/src/agent/route_provider/dynamic_routing/test_utils.rs @@ -6,7 +6,7 @@ use arc_swap::ArcSwap; use async_trait::async_trait; use url::Url; -use crate::agent::http_transport::{ +use crate::agent::route_provider::{ dynamic_routing::{ dynamic_route_provider::DynamicRouteProviderError, health_check::{HealthCheck, HealthCheckStatus}, @@ -14,7 +14,7 @@ use crate::agent::http_transport::{ nodes_fetch::Fetch, type_aliases::AtomicSwap, }, - route_provider::RouteProvider, + RouteProvider, }; pub(super) fn route_n_times(n: usize, f: Arc) -> Vec { diff --git a/ic-agent/src/agent/http_transport/dynamic_routing/type_aliases.rs b/ic-agent/src/agent/route_provider/dynamic_routing/type_aliases.rs similarity index 63% rename from ic-agent/src/agent/http_transport/dynamic_routing/type_aliases.rs rename to ic-agent/src/agent/route_provider/dynamic_routing/type_aliases.rs index 6be931fb..f4d76fa3 100644 --- a/ic-agent/src/agent/http_transport/dynamic_routing/type_aliases.rs +++ b/ic-agent/src/agent/route_provider/dynamic_routing/type_aliases.rs @@ -1,18 +1,17 @@ use arc_swap::ArcSwap; use std::sync::Arc; -use tokio::sync::{mpsc, watch}; /// A type alias for the sender end of a watch channel. -pub(super) type SenderWatch = watch::Sender>; +pub(super) type SenderWatch = async_watch::Sender>; /// A type alias for the receiver end of a watch channel. 
-pub(super) type ReceiverWatch = watch::Receiver>; +pub(super) type ReceiverWatch = async_watch::Receiver>; /// A type alias for the sender end of a multi-producer, single-consumer channel. -pub(super) type SenderMpsc = mpsc::Sender; +pub(super) type SenderMpsc = async_channel::Sender; /// A type alias for the receiver end of a multi-producer, single-consumer channel. -pub(super) type ReceiverMpsc = mpsc::Receiver; +pub(super) type ReceiverMpsc = async_channel::Receiver; /// A type alias for an atomic swap operation on a shared value. pub(super) type AtomicSwap = Arc>; diff --git a/ic-agent/src/util.rs b/ic-agent/src/util.rs index c33a3485..353d20f0 100644 --- a/ic-agent/src/util.rs +++ b/ic-agent/src/util.rs @@ -1,3 +1,4 @@ +use std::future::Future; use std::time::Duration; pub async fn sleep(d: Duration) { @@ -19,3 +20,13 @@ pub async fn sleep(d: Duration) { const _: () = { panic!("Using ic-agent from WASM requires enabling the `wasm-bindgen` feature") }; } + +#[cfg(all(target_family = "wasm", feature = "wasm-bindgen"))] +pub fn spawn(f: impl Future + 'static) { + wasm_bindgen_futures::spawn_local(f); +} + +#[cfg(not(all(target_family = "wasm", feature = "wasm-bindgen")))] +pub fn spawn(f: impl Future + Send + 'static) { + tokio::spawn(f); +} From 2a34dbcbcd78926996cf39fc34ffae81647830b5 Mon Sep 17 00:00:00 2001 From: Adam Spofford Date: Mon, 23 Sep 2024 13:24:39 -0700 Subject: [PATCH 5/8] clippy --- ic-agent/src/util.rs | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ic-agent/src/util.rs b/ic-agent/src/util.rs index 353d20f0..a61dfda0 100644 --- a/ic-agent/src/util.rs +++ b/ic-agent/src/util.rs @@ -1,3 +1,5 @@ +#![allow(dead_code)] + use std::future::Future; use std::time::Duration; From b8352388dcb749560ac3bac120187017842693ff Mon Sep 17 00:00:00 2001 From: Adam Spofford Date: Mon, 23 Sep 2024 13:33:31 -0700 Subject: [PATCH 6/8] lingering stuff --- ic-agent/src/agent/builder.rs | 30 ++++++++----------- .../dynamic_routing/dynamic_route_provider.rs | 2 +- 2 
files changed, 14 insertions(+), 18 deletions(-) diff --git a/ic-agent/src/agent/builder.rs b/ic-agent/src/agent/builder.rs index c29045a5..1be14e02 100644 --- a/ic-agent/src/agent/builder.rs +++ b/ic-agent/src/agent/builder.rs @@ -1,9 +1,18 @@ use url::Url; use crate::{ - agent::{agent_config::AgentConfig, Agent}, + agent::{ + agent_config::AgentConfig, + route_provider::dynamic_routing::{ + dynamic_route_provider::{DynamicRouteProviderBuilder, IC0_SEED_DOMAIN}, + node::Node, + snapshot::latency_based_routing::LatencyRoutingSnapshot, + }, + Agent, + }, AgentError, Identity, NonceFactory, NonceGenerator, }; +use reqwest::Client; use std::sync::Arc; use super::route_provider::RouteProvider; @@ -20,19 +29,8 @@ impl AgentBuilder { Agent::new(self.config) } - #[cfg(all(feature = "reqwest", not(target_family = "wasm")))] /// Set the dynamic transport layer for the [`Agent`], performing continuos discovery of the API boundary nodes and routing traffic via them based on the latencies. - pub async fn with_discovery_transport(self, client: reqwest::Client) -> Self { - use crate::agent::http_transport::{ - dynamic_routing::{ - dynamic_route_provider::{DynamicRouteProviderBuilder, IC0_SEED_DOMAIN}, - node::Node, - snapshot::latency_based_routing::LatencyRoutingSnapshot, - }, - route_provider::RouteProvider, - ReqwestTransport, - }; - + pub async fn with_discovery_transport(self, client: Client) -> Self { // TODO: This is a temporary solution to get the seed node. let seed = Node::new(IC0_SEED_DOMAIN).unwrap(); @@ -46,10 +44,8 @@ impl AgentBuilder { let route_provider = Arc::new(route_provider) as Arc; - let transport = ReqwestTransport::create_with_client_route(route_provider, client) - .expect("failed to create transport"); - - self.with_transport(transport) + self.with_arc_route_provider(route_provider) + .with_http_client(client) } /// Set the URL of the [Agent]. 
diff --git a/ic-agent/src/agent/route_provider/dynamic_routing/dynamic_route_provider.rs b/ic-agent/src/agent/route_provider/dynamic_routing/dynamic_route_provider.rs index 3611fbca..8c245380 100644 --- a/ic-agent/src/agent/route_provider/dynamic_routing/dynamic_route_provider.rs +++ b/ic-agent/src/agent/route_provider/dynamic_routing/dynamic_route_provider.rs @@ -1,4 +1,4 @@ -//! An implementation of the [`RouteProvider`](crate::agent::http_transport::route_provider::RouteProvider) for dynamic generation of routing urls. +//! An implementation of [`RouteProvider`] for dynamic generation of routing urls. use std::{ sync::Arc, From d8454d24cf0582c2c6f2bf5daac9d0caf2ca8eef Mon Sep 17 00:00:00 2001 From: Adam Spofford Date: Mon, 23 Sep 2024 13:39:08 -0700 Subject: [PATCH 7/8] Update changelog --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3e0af72f..0016f689 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,8 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## Unreleased +* Add support for dynamic routing based on boundary node discovery. This is an internal feature for now, with a feature flag `_internal_dynamic-routing`. + ## [0.38.1] - 2024-09-23 * Fix `ic-agent` manifest so that documentation can be built for docs.rs. 
From adc41dbc59070bb188f708594be2202bb9af1377 Mon Sep 17 00:00:00 2001 From: Adam Spofford Date: Mon, 23 Sep 2024 14:38:18 -0700 Subject: [PATCH 8/8] respect feature flag in `builder.rs` --- ic-agent/src/agent/builder.rs | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/ic-agent/src/agent/builder.rs b/ic-agent/src/agent/builder.rs index 1be14e02..90331145 100644 --- a/ic-agent/src/agent/builder.rs +++ b/ic-agent/src/agent/builder.rs @@ -1,18 +1,9 @@ use url::Url; use crate::{ - agent::{ - agent_config::AgentConfig, - route_provider::dynamic_routing::{ - dynamic_route_provider::{DynamicRouteProviderBuilder, IC0_SEED_DOMAIN}, - node::Node, - snapshot::latency_based_routing::LatencyRoutingSnapshot, - }, - Agent, - }, + agent::{agent_config::AgentConfig, Agent}, AgentError, Identity, NonceFactory, NonceGenerator, }; -use reqwest::Client; use std::sync::Arc; use super::route_provider::RouteProvider; @@ -29,8 +20,14 @@ impl AgentBuilder { Agent::new(self.config) } - /// Set the dynamic transport layer for the [`Agent`], performing continuos discovery of the API boundary nodes and routing traffic via them based on the latencies. - pub async fn with_discovery_transport(self, client: Client) -> Self { + /// Set the dynamic transport layer for the [`Agent`], performing continuous discovery of the API boundary nodes and routing traffic via them based on latency. + #[cfg(feature = "_internal_dynamic-routing")] + pub async fn with_discovery_transport(self, client: reqwest::Client) -> Self { + use crate::agent::route_provider::dynamic_routing::{ + dynamic_route_provider::{DynamicRouteProviderBuilder, IC0_SEED_DOMAIN}, + node::Node, + snapshot::latency_based_routing::LatencyRoutingSnapshot, + }; // TODO: This is a temporary solution to get the seed node. let seed = Node::new(IC0_SEED_DOMAIN).unwrap();