From a32609433c40ab1307a28b4ba34d8be2b641cdd3 Mon Sep 17 00:00:00 2001 From: D4ryl00 Date: Tue, 2 Jun 2026 15:29:30 +0200 Subject: [PATCH 1/4] bitswap/network/bsnet: don't mark peer unresponsive on a single send failure streamMessageSender.send() marked a peer unresponsive on every failed attempt to open a stream or send a message. send() is called from multiAttempt(), which already resets the stream, retries, and marks the peer unresponsive only after all retries are exhausted, so the per-attempt marking is both redundant and premature. It is harmful after a connection bounce (a disconnect quickly followed by a reconnect): the inbound disconnect notification is suppressed because the peer is already connected again, so the ConnectEventManager stays in the responsive state and the reconnect Connected event is a no-op. A single send failure on the stale stream then transitions the peer to unresponsive and emits PeerDisconnected, sidelining a peer whose connection is in fact live. There is no recovery path: returning to responsive requires an incoming message (OnMessage), which never arrives because we have stopped sending our wantlist. Fetches from that peer hang until it fully disconnects. Rely on multiAttempt() to mark the peer unresponsive after retries fail. Signed-off-by: D4ryl00 --- bitswap/network/bsnet/ipfs_impl.go | 2 -- 1 file changed, 2 deletions(-) diff --git a/bitswap/network/bsnet/ipfs_impl.go b/bitswap/network/bsnet/ipfs_impl.go index 709838df6..d09d26e16 100644 --- a/bitswap/network/bsnet/ipfs_impl.go +++ b/bitswap/network/bsnet/ipfs_impl.go @@ -244,7 +244,6 @@ func (s *streamMessageSender) send(ctx context.Context, msg bsmsg.BitSwapMessage stream, err := s.Connect(ctx) if err != nil { log.Infof("failed to open stream to %s: %s", s.to, err) - s.bsnet.connectEvtMgr.MarkUnresponsive(s.to) return err } @@ -254,7 +253,6 @@ func (s *streamMessageSender) send(ctx context.Context, msg bsmsg.BitSwapMessage timeout := s.opts.SendTimeout - time.Since(start) if err = s.bsnet.msgToStream(ctx, stream, msg, timeout); err != nil { log.Infof("failed to send message to %s: %s", s.to, err) - s.bsnet.connectEvtMgr.MarkUnresponsive(s.to) return err } From c0d42e7c61b844f5b9e9ea73a4ba1a020387dce5 Mon Sep 17 00:00:00 2001 From: D4ryl00 Date: Tue, 2 Jun 2026 15:42:47 +0200 Subject: [PATCH 2/4] docs: changelog for bitswap unresponsive-on-reconnect fix Signed-off-by: D4ryl00 --- CHANGELOG.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index b2a31c0da..d644e1bfd 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -22,6 +22,8 @@ The following emojis are used to highlight certain changes: ### Fixed +- `bitswap/network/bsnet`: stop marking a peer unresponsive on a single failed send attempt. `send()` is retried by `multiAttempt()`, which already marks the peer once all retries are exhausted; marking on the first failure could permanently sideline a peer that had just reconnected (the disconnect notification being suppressed), hanging fetches from it until it fully disconnected. [#1164](https://github.com/ipfs/boxo/pull/1164) + ### Security - `tracing`: bumped OpenTelemetry OTLP exporters to [v1.43.0](https://github.com/open-telemetry/opentelemetry-go/releases/tag/v1.43.0), which caps the HTTP exporter's response body at 4 MiB. A hostile or man-in-the-middle collector could otherwise exhaust its memory ([CVE-2026-39882](https://github.com/open-telemetry/opentelemetry-go/security/advisories/GHSA-w8rr-5gcm-pp58)). The gRPC exporter is unaffected. From 472492c476ac544bb25564f383042eac5cdf5c12 Mon Sep 17 00:00:00 2001 From: D4ryl00 Date: Tue, 2 Jun 2026 15:55:44 +0200 Subject: [PATCH 3/4] bitswap/network/bsnet: test that a single send failure doesn't mark a peer unresponsive Regression test for the reconnect hang: a recoverable send error (failure on the first attempt, success on retry) must not emit a PeerDisconnected, i.e. must not mark the still-connected peer unresponsive. Signed-off-by: D4ryl00 --- bitswap/network/bsnet/ipfs_impl_test.go | 69 +++++++++++++++++++++++++ 1 file changed, 69 insertions(+) diff --git a/bitswap/network/bsnet/ipfs_impl_test.go b/bitswap/network/bsnet/ipfs_impl_test.go index a714c1a31..39f340f60 100644 --- a/bitswap/network/bsnet/ipfs_impl_test.go +++ b/bitswap/network/bsnet/ipfs_impl_test.go @@ -449,6 +449,75 @@ func TestMessageResendAfterError(t *testing.T) { } } +// TestMessageSendErrorDoesNotMarkUnresponsive checks that a single recoverable +// send error does not mark the peer unresponsive. send() is retried by +// multiAttempt(), which only marks the peer once all retries are exhausted; +// marking on the first failure could permanently sideline a peer whose +// connection is in fact still live (e.g. right after a reconnect, where the +// disconnect notification is suppressed), with no recovery path. +// Regression test for https://github.com/ipfs/boxo/issues/1163. +func TestMessageSendErrorDoesNotMarkUnresponsive(t *testing.T) { + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + + p1 := tnet.RandIdentityOrFatal(t) + r1 := newReceiver() + p2 := tnet.RandIdentityOrFatal(t) + r2 := newReceiver() + + eh, bsnet1, _, _, msg := prepareNetwork(t, ctx, p1, r1, p2, r2) + + testSendErrorBackoff := 100 * time.Millisecond + ms, err := bsnet1.NewMessageSender(ctx, p2.ID(), &network.MessageSenderOpts{ + MaxRetries: 3, + SendTimeout: 100 * time.Millisecond, + SendErrorBackoff: testSendErrorBackoff, + }) + if err != nil { + t.Fatal(err) + } + defer ms.Reset() + + // Fail the next send, then stop failing so the retry succeeds. + eh.setError(errMockNetErr) + go func() { + time.Sleep(testSendErrorBackoff / 2) + eh.setError(nil) + }() + + if err = ms.SendMsg(ctx, msg); err != nil { + t.Fatal(err) + } + + select { + case <-ctx.Done(): + t.Fatal("did not receive message sent") + case <-r2.messageReceived: + } + + // The single, recovered send error must not have marked the peer + // unresponsive. Marking it would emit a PeerDisconnected with no + // recovery (the peer stays connected, so no further event arrives), + // leaving the peer dropped from the receiver's set. Wait for the + // connection state to settle, then assert the peer is still connected. + require.Eventually(t, func() bool { + r1.mu.Lock() + defer r1.mu.Unlock() + _, ok := r1.peers[p2.ID()] + return ok + }, time.Second, 20*time.Millisecond, "peer was marked unresponsive after a single recoverable send error") + + // Give a late erroneous PeerDisconnected a chance to land, and confirm + // the peer remains connected. + time.Sleep(300 * time.Millisecond) + r1.mu.Lock() + _, stillConnected := r1.peers[p2.ID()] + r1.mu.Unlock() + if !stillConnected { + t.Fatal("peer was marked unresponsive after a single recoverable send error") + } +} + func TestMessageSendTimeout(t *testing.T) { ctx, cancel := context.WithTimeout(context.Background(), time.Second) defer cancel() From 9447cbd3b414c3b7d7128276f2a1f8c359dd2fbb Mon Sep 17 00:00:00 2001 From: D4ryl00 Date: Tue, 2 Jun 2026 18:30:39 +0200 Subject: [PATCH 4/4] bitswap/network/bsnet: test that a persistently failing peer is marked unresponsive Signed-off-by: D4ryl00 --- bitswap/network/bsnet/ipfs_impl_test.go | 44 +++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/bitswap/network/bsnet/ipfs_impl_test.go b/bitswap/network/bsnet/ipfs_impl_test.go index 39f340f60..495d266b9 100644 --- a/bitswap/network/bsnet/ipfs_impl_test.go +++ b/bitswap/network/bsnet/ipfs_impl_test.go @@ -518,6 +518,50 @@ func TestMessageSendErrorDoesNotMarkUnresponsive(t *testing.T) { } } +// TestMessageSendErrorMarksUnresponsive checks that a peer whose sends keep +// failing is eventually marked unresponsive once multiAttempt() exhausts all +// retries, emitting a PeerDisconnected. This is the complement of +// TestMessageSendErrorDoesNotMarkUnresponsive: a single recoverable error must +// not mark the peer, but a persistently failing one must. +func TestMessageSendErrorMarksUnresponsive(t *testing.T) { + ctx, cancel := context.WithTimeout(context.Background(), 2*time.Second) + defer cancel() + + p1 := tnet.RandIdentityOrFatal(t) + r1 := newReceiver() + p2 := tnet.RandIdentityOrFatal(t) + r2 := newReceiver() + + eh, bsnet1, _, _, msg := prepareNetwork(t, ctx, p1, r1, p2, r2) + + ms, err := bsnet1.NewMessageSender(ctx, p2.ID(), &network.MessageSenderOpts{ + MaxRetries: 3, + SendTimeout: 100 * time.Millisecond, + SendErrorBackoff: 10 * time.Millisecond, + }) + if err != nil { + t.Fatal(err) + } + defer ms.Reset() + + // Keep failing every send so all retries are exhausted. + eh.setError(errMockNetErr) + + // SendMsg must fail once retries are exhausted. + if err = ms.SendMsg(ctx, msg); err == nil { + t.Fatal("expected SendMsg to fail after exhausting retries") + } + + // The exhausted retries must have marked the peer unresponsive, which + // emits a PeerDisconnected, dropping it from the receiver's set. + require.Eventually(t, func() bool { + r1.mu.Lock() + defer r1.mu.Unlock() + _, ok := r1.peers[p2.ID()] + return !ok + }, time.Second, 20*time.Millisecond, "unresponsive peer was not disconnected after exhausting retries") +} + func TestMessageSendTimeout(t *testing.T) { ctx, cancel := context.WithTimeout(context.Background(), time.Second) defer cancel()