Skip to content

Commit d1d5446

Browse files
committed
perf(runtime-service): use short retry when no peers available
The runtime service tries to download the finalized block's runtime immediately at startup, before any peer connections are established. This always fails with `StorageQueryError { errors: [] }` (there are no peers to query). Previously, this failure triggered the full 4s `retry_after_failed` cooldown, making warm start consistently take ~5-7s. Now, "no peers" errors use a 200ms retry instead of 4s. Peers typically connect within a few hundred milliseconds, so the retry succeeds quickly. Other errors (peer misbehavior, decode failures) still use the full 4s cooldown. Benchmark on Polkadot: warm start drops from ~5.5s to ~600ms.
1 parent 2cf734a commit d1d5446

3 files changed

Lines changed: 39 additions & 8 deletions

File tree

lib/src/chain/async_tree.rs

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -505,8 +505,13 @@ where
505505
/// Panics if the [`AsyncOpId`] is invalid.
506506
///
507507
pub fn async_op_failure(&mut self, async_op_id: AsyncOpId, now: &TNow) {
508-
let new_timeout = now.clone() + self.retry_after_failed;
508+
let retry_after = now.clone() + self.retry_after_failed;
509+
self.async_op_failure_retry_at(async_op_id, &retry_after);
510+
}
509511

512+
/// Similar to [`AsyncTree::async_op_failure`], but retries at the given time
513+
/// instead of `now + retry_after_failed`.
514+
pub fn async_op_failure_retry_at(&mut self, async_op_id: AsyncOpId, retry_after: &TNow) {
510515
// Update the blocks that were performing this operation.
511516
// The blocks are iterated from child to parent, so that we can check, for each node,
512517
// whether its parent has the same asynchronous operation id.
@@ -523,11 +528,11 @@ where
523528
AsyncOpState::InProgress {
524529
async_op_id: id,
525530
timeout: Some(ref timeout),
526-
} if id == async_op_id => Some(cmp::min(timeout.clone(), new_timeout.clone())),
531+
} if id == async_op_id => Some(cmp::min(timeout.clone(), retry_after.clone())),
527532
AsyncOpState::InProgress {
528533
async_op_id: id,
529534
timeout: None,
530-
} if id == async_op_id => Some(new_timeout.clone()),
535+
} if id == async_op_id => Some(retry_after.clone()),
531536
_ => continue,
532537
};
533538

light-base/src/runtime_service.rs

Lines changed: 26 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -2810,12 +2810,26 @@ async fn run_background<TPlat: PlatformRef>(
28102810
);
28112811
}
28122812

2813-
match &mut background.tree {
2814-
Tree::FinalizedBlockRuntimeKnown { tree, .. } => {
2815-
tree.async_op_failure(async_op_id, &background.platform.now());
2813+
if error.is_no_peers() {
2814+
// No peers available yet — use a short retry (200ms) instead of
2815+
// the full 4s cooldown. Peers typically connect within a few hundred milliseconds.
2816+
let short_retry = background.platform.now() + Duration::from_millis(200);
2817+
match &mut background.tree {
2818+
Tree::FinalizedBlockRuntimeKnown { tree, .. } => {
2819+
tree.async_op_failure_retry_at(async_op_id, &short_retry);
2820+
}
2821+
Tree::FinalizedBlockRuntimeUnknown { tree, .. } => {
2822+
tree.async_op_failure_retry_at(async_op_id, &short_retry);
2823+
}
28162824
}
2817-
Tree::FinalizedBlockRuntimeUnknown { tree, .. } => {
2818-
tree.async_op_failure(async_op_id, &background.platform.now());
2825+
} else {
2826+
match &mut background.tree {
2827+
Tree::FinalizedBlockRuntimeKnown { tree, .. } => {
2828+
tree.async_op_failure(async_op_id, &background.platform.now());
2829+
}
2830+
Tree::FinalizedBlockRuntimeUnknown { tree, .. } => {
2831+
tree.async_op_failure(async_op_id, &background.platform.now());
2832+
}
28192833
}
28202834
}
28212835
}
@@ -2832,6 +2846,13 @@ enum RuntimeDownloadError {
28322846
}
28332847

28342848
impl RuntimeDownloadError {
2849+
fn is_no_peers(&self) -> bool {
2850+
match self {
2851+
RuntimeDownloadError::StorageQuery(err) => err.is_no_peers(),
2852+
RuntimeDownloadError::InvalidHeader(_) => false,
2853+
}
2854+
}
2855+
28352856
/// Returns `true` if this is caused by networking issues, as opposed to a consensus-related
28362857
/// issue.
28372858
fn is_network_problem(&self) -> bool {

light-base/src/sync_service.rs

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1040,6 +1040,11 @@ pub struct StorageQueryError {
10401040
}
10411041

10421042
impl StorageQueryError {
1043+
/// Returns `true` if no peers were available to query.
1044+
pub fn is_no_peers(&self) -> bool {
1045+
self.errors.is_empty()
1046+
}
1047+
10431048
/// Returns `true` if this is caused by networking issues, as opposed to a consensus-related
10441049
/// issue.
10451050
pub fn is_network_problem(&self) -> bool {

0 commit comments

Comments
 (0)