Fix echo bounce and initial sync deadlock in peer protocol
Add mpsc channel between live receive and push loops so hashes received from a peer aren't pushed right back (echo prevention). Change initial reconciliation to use tokio::join! for concurrent send/receive, avoiding QUIC flow-control deadlock when both sides have large transfers. Update known_hashes to union-insert so peer-received hashes persist across poll cycles. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
parent
06c01baf44
commit
ccce1474d4
@ -3,8 +3,12 @@
|
|||||||
//! When two sync agents connect, they:
|
//! When two sync agents connect, they:
|
||||||
//! 1. Exchange hash lists (from their local CAN services)
|
//! 1. Exchange hash lists (from their local CAN services)
|
||||||
//! 2. Compute the diff (what each side is missing)
|
//! 2. Compute the diff (what each side is missing)
|
||||||
//! 3. Send missing assets to each other
|
//! 3. Send/receive missing assets concurrently (avoids deadlock)
|
||||||
//! 4. Continue polling for new assets and pushing them
|
//! 4. Continue polling for new assets and pushing them
|
||||||
|
//!
|
||||||
|
//! The live sync uses an unbounded channel to share received hashes
|
||||||
|
//! from the receive loop to the push loop, preventing "echo" where
|
||||||
|
//! an asset received from a peer gets pushed right back to them.
|
||||||
|
|
||||||
use std::collections::{HashMap, HashSet};
|
use std::collections::{HashMap, HashSet};
|
||||||
|
|
||||||
@ -12,6 +16,7 @@ use anyhow::{Context, Result};
|
|||||||
use iroh::endpoint::Connection;
|
use iroh::endpoint::Connection;
|
||||||
use prost::Message;
|
use prost::Message;
|
||||||
use tokio::io::{AsyncReadExt, AsyncWriteExt};
|
use tokio::io::{AsyncReadExt, AsyncWriteExt};
|
||||||
|
use tokio::sync::mpsc;
|
||||||
use tracing::{debug, error, info, warn};
|
use tracing::{debug, error, info, warn};
|
||||||
|
|
||||||
use crate::can_client::CanSyncClient;
|
use crate::can_client::CanSyncClient;
|
||||||
@ -52,8 +57,9 @@ async fn read_frame(recv: &mut iroh::endpoint::RecvStream) -> Result<(u8, Vec<u8
|
|||||||
|
|
||||||
/// Run a full sync session with a connected peer.
|
/// Run a full sync session with a connected peer.
|
||||||
///
|
///
|
||||||
/// This handles both initial reconciliation and ongoing transfer.
|
/// This handles initial reconciliation: exchange hash lists, compute diffs,
|
||||||
/// Called for both outgoing connections (we initiated) and incoming connections.
|
/// then send/receive missing assets **concurrently** to avoid deadlock when
|
||||||
|
/// both sides have large amounts of data to transfer.
|
||||||
pub async fn run_sync_session(
|
pub async fn run_sync_session(
|
||||||
conn: Connection,
|
conn: Connection,
|
||||||
can: CanSyncClient,
|
can: CanSyncClient,
|
||||||
@ -133,18 +139,26 @@ pub async fn run_sync_session(
|
|||||||
they_need.len()
|
they_need.len()
|
||||||
);
|
);
|
||||||
|
|
||||||
// Step 5: Send assets the peer is missing
|
// Step 5+6: Send and receive assets CONCURRENTLY to avoid deadlock.
|
||||||
if !they_need.is_empty() {
|
//
|
||||||
send_assets(&can, &mut send, &they_need, &short_id).await?;
|
// If both sides have many large assets, doing send-then-receive sequentially
|
||||||
}
|
// can deadlock: QUIC flow control stalls the writer because the reader hasn't
|
||||||
|
// drained its buffer (it's busy writing). Running both in parallel avoids this.
|
||||||
|
let send_fut = async {
|
||||||
|
if !they_need.is_empty() {
|
||||||
|
send_assets(&can, &mut send, &they_need, &short_id).await?;
|
||||||
|
}
|
||||||
|
let done_frame = encode_frame(MSG_DONE, &[]);
|
||||||
|
send.write_all(&done_frame).await.context("sending DONE")?;
|
||||||
|
send.flush().await.context("flushing after DONE")?;
|
||||||
|
Ok::<_, anyhow::Error>(())
|
||||||
|
};
|
||||||
|
|
||||||
// Send DONE marker
|
let recv_fut = receive_assets(&can, &mut recv, &short_id);
|
||||||
let done_frame = encode_frame(MSG_DONE, &[]);
|
|
||||||
send.write_all(&done_frame).await?;
|
|
||||||
send.flush().await?;
|
|
||||||
|
|
||||||
// Step 6: Receive assets we're missing
|
let (send_result, recv_result) = tokio::join!(send_fut, recv_fut);
|
||||||
receive_assets(&can, &mut recv, &short_id).await?;
|
send_result.context("sending assets to peer")?;
|
||||||
|
recv_result.context("receiving assets from peer")?;
|
||||||
|
|
||||||
info!("Sync session with {} complete", short_id);
|
info!("Sync session with {} complete", short_id);
|
||||||
Ok(())
|
Ok(())
|
||||||
@ -179,11 +193,14 @@ async fn send_assets(
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Receive assets from peer and push them to local CAN service.
|
/// Receive assets from peer and push them to local CAN service.
|
||||||
|
/// Returns the list of hashes that were successfully ingested.
|
||||||
async fn receive_assets(
|
async fn receive_assets(
|
||||||
can: &CanSyncClient,
|
can: &CanSyncClient,
|
||||||
recv: &mut iroh::endpoint::RecvStream,
|
recv: &mut iroh::endpoint::RecvStream,
|
||||||
peer_short: &str,
|
peer_short: &str,
|
||||||
) -> Result<()> {
|
) -> Result<Vec<String>> {
|
||||||
|
let mut received = Vec::new();
|
||||||
|
|
||||||
loop {
|
loop {
|
||||||
let (msg_type, payload) = read_frame(recv).await.context("reading asset from peer")?;
|
let (msg_type, payload) = read_frame(recv).await.context("reading asset from peer")?;
|
||||||
|
|
||||||
@ -195,7 +212,8 @@ async fn receive_assets(
|
|||||||
MSG_ASSET_BUNDLE => {
|
MSG_ASSET_BUNDLE => {
|
||||||
let bundle =
|
let bundle =
|
||||||
AssetBundle::decode(payload.as_slice()).context("decoding asset bundle")?;
|
AssetBundle::decode(payload.as_slice()).context("decoding asset bundle")?;
|
||||||
let hash_short = bundle.hash[..bundle.hash.len().min(12)].to_string();
|
let hash = bundle.hash.clone();
|
||||||
|
let hash_short = hash[..hash.len().min(12)].to_string();
|
||||||
info!("Received asset {} from peer {}", hash_short, peer_short);
|
info!("Received asset {} from peer {}", hash_short, peer_short);
|
||||||
|
|
||||||
match can.push(bundle).await {
|
match can.push(bundle).await {
|
||||||
@ -205,6 +223,7 @@ async fn receive_assets(
|
|||||||
} else {
|
} else {
|
||||||
info!("Ingested asset {} from peer {}", resp.hash, peer_short);
|
info!("Ingested asset {} from peer {}", resp.hash, peer_short);
|
||||||
}
|
}
|
||||||
|
received.push(hash);
|
||||||
}
|
}
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
error!("Failed to push asset {} to CAN: {:#}", hash_short, e);
|
error!("Failed to push asset {} to CAN: {:#}", hash_short, e);
|
||||||
@ -237,7 +256,7 @@ async fn receive_assets(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Ok(())
|
Ok(received)
|
||||||
}
|
}
|
||||||
|
|
||||||
/// Handle an incoming connection from a peer who connected to us.
|
/// Handle an incoming connection from a peer who connected to us.
|
||||||
@ -263,7 +282,9 @@ pub async fn handle_incoming(
|
|||||||
|
|
||||||
/// Run both live sync loops (push + receive) concurrently.
|
/// Run both live sync loops (push + receive) concurrently.
|
||||||
///
|
///
|
||||||
/// This should be called after initial reconciliation is complete.
|
/// Uses an unbounded channel to prevent the "echo" problem: when we receive
|
||||||
|
/// an asset from the peer, we tell the push loop about it so it doesn't
|
||||||
|
/// push the same asset right back.
|
||||||
pub async fn run_live_sync(
|
pub async fn run_live_sync(
|
||||||
conn: Connection,
|
conn: Connection,
|
||||||
can: CanSyncClient,
|
can: CanSyncClient,
|
||||||
@ -271,14 +292,17 @@ pub async fn run_live_sync(
|
|||||||
) {
|
) {
|
||||||
let short_id = conn.remote_id().fmt_short().to_string();
|
let short_id = conn.remote_id().fmt_short().to_string();
|
||||||
|
|
||||||
|
// Channel for receive loop to notify push loop about received hashes
|
||||||
|
let (received_tx, received_rx) = mpsc::unbounded_channel::<String>();
|
||||||
|
|
||||||
// Run push loop and receive loop concurrently — when either ends, we're done
|
// Run push loop and receive loop concurrently — when either ends, we're done
|
||||||
tokio::select! {
|
tokio::select! {
|
||||||
result = live_push_loop(conn.clone(), can.clone(), poll_interval) => {
|
result = live_push_loop(conn.clone(), can.clone(), poll_interval, received_rx) => {
|
||||||
if let Err(e) = result {
|
if let Err(e) = result {
|
||||||
warn!("Live push loop with {} ended: {:#}", short_id, e);
|
warn!("Live push loop with {} ended: {:#}", short_id, e);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
result = live_receive_loop(conn, can) => {
|
result = live_receive_loop(conn, can, received_tx) => {
|
||||||
if let Err(e) = result {
|
if let Err(e) = result {
|
||||||
warn!("Live receive loop with {} ended: {:#}", short_id, e);
|
warn!("Live receive loop with {} ended: {:#}", short_id, e);
|
||||||
}
|
}
|
||||||
@ -287,16 +311,20 @@ pub async fn run_live_sync(
|
|||||||
}
|
}
|
||||||
|
|
||||||
/// Poll for new local assets and push them to the peer via new QUIC streams.
|
/// Poll for new local assets and push them to the peer via new QUIC streams.
|
||||||
|
///
|
||||||
|
/// Drains the `received_rx` channel each tick to learn about hashes that arrived
|
||||||
|
/// from the peer, so we don't echo them back.
|
||||||
async fn live_push_loop(
|
async fn live_push_loop(
|
||||||
conn: Connection,
|
conn: Connection,
|
||||||
can: CanSyncClient,
|
can: CanSyncClient,
|
||||||
poll_interval: std::time::Duration,
|
poll_interval: std::time::Duration,
|
||||||
|
mut received_rx: mpsc::UnboundedReceiver<String>,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
let peer_id = conn.remote_id();
|
let peer_id = conn.remote_id();
|
||||||
let short_id = peer_id.fmt_short().to_string();
|
let short_id = peer_id.fmt_short().to_string();
|
||||||
info!("Starting live push loop with {}", short_id);
|
info!("Starting live push loop with {}", short_id);
|
||||||
|
|
||||||
// Track what we've already synced
|
// Track what we've already synced (local + received from peer)
|
||||||
let mut known_hashes: HashSet<String> = {
|
let mut known_hashes: HashSet<String> = {
|
||||||
let resp = can.get_hashes().await?;
|
let resp = can.get_hashes().await?;
|
||||||
resp.assets.into_iter().map(|a| a.hash).collect()
|
resp.assets.into_iter().map(|a| a.hash).collect()
|
||||||
@ -307,6 +335,12 @@ async fn live_push_loop(
|
|||||||
loop {
|
loop {
|
||||||
interval.tick().await;
|
interval.tick().await;
|
||||||
|
|
||||||
|
// Drain any hashes that the receive loop got from the peer.
|
||||||
|
// This prevents us from pushing them right back (echo prevention).
|
||||||
|
while let Ok(hash) = received_rx.try_recv() {
|
||||||
|
known_hashes.insert(hash);
|
||||||
|
}
|
||||||
|
|
||||||
// Poll for new assets
|
// Poll for new assets
|
||||||
let resp = match can.get_hashes().await {
|
let resp = match can.get_hashes().await {
|
||||||
Ok(r) => r,
|
Ok(r) => r,
|
||||||
@ -349,8 +383,10 @@ async fn live_push_loop(
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// Update our known set
|
// Update our known set (union with current so we keep peer-received hashes too)
|
||||||
known_hashes = current_hashes;
|
for h in current_hashes {
|
||||||
|
known_hashes.insert(h);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
Ok(())
|
Ok(())
|
||||||
@ -359,10 +395,12 @@ async fn live_push_loop(
|
|||||||
/// Accept incoming QUIC bi-streams from the peer and receive assets.
|
/// Accept incoming QUIC bi-streams from the peer and receive assets.
|
||||||
///
|
///
|
||||||
/// The peer's live_push_loop opens new bi-streams for each batch of assets.
|
/// The peer's live_push_loop opens new bi-streams for each batch of assets.
|
||||||
/// This loop accepts those streams and ingests the assets into the local CAN service.
|
/// This loop accepts those streams, ingests assets into the local CAN service,
|
||||||
|
/// and notifies the push loop via channel to prevent echo.
|
||||||
async fn live_receive_loop(
|
async fn live_receive_loop(
|
||||||
conn: Connection,
|
conn: Connection,
|
||||||
can: CanSyncClient,
|
can: CanSyncClient,
|
||||||
|
received_tx: mpsc::UnboundedSender<String>,
|
||||||
) -> Result<()> {
|
) -> Result<()> {
|
||||||
let peer_id = conn.remote_id();
|
let peer_id = conn.remote_id();
|
||||||
let short_id = peer_id.fmt_short().to_string();
|
let short_id = peer_id.fmt_short().to_string();
|
||||||
@ -372,8 +410,16 @@ async fn live_receive_loop(
|
|||||||
match conn.accept_bi().await {
|
match conn.accept_bi().await {
|
||||||
Ok((_send, mut recv)) => {
|
Ok((_send, mut recv)) => {
|
||||||
info!("Accepted live sync stream from peer {}", short_id);
|
info!("Accepted live sync stream from peer {}", short_id);
|
||||||
if let Err(e) = receive_assets(&can, &mut recv, &short_id).await {
|
match receive_assets(&can, &mut recv, &short_id).await {
|
||||||
warn!("Error receiving live assets from {}: {:#}", short_id, e);
|
Ok(received_hashes) => {
|
||||||
|
// Notify push loop about received hashes to prevent echo
|
||||||
|
for hash in received_hashes {
|
||||||
|
let _ = received_tx.send(hash);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
Err(e) => {
|
||||||
|
warn!("Error receiving live assets from {}: {:#}", short_id, e);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
Err(e) => {
|
Err(e) => {
|
||||||
|
|||||||
Loading…
x
Reference in New Issue
Block a user