summaryrefslogtreecommitdiff
path: root/net
diff options
context:
space:
mode:
authorJakub Kicinski <kuba@kernel.org>2025-10-29 17:44:30 -0700
committerJakub Kicinski <kuba@kernel.org>2025-10-29 17:44:30 -0700
commitac345c5fff55262a51dad43bf8724c3d5d1fb006 (patch)
tree7a28015ad1fcba5a9441885e332a75235a817a61 /net
parent00764aa5c9bbb2044eb04d6d78584a436666b231 (diff)
parentfe11dfa10919ce594682c76f5f648a0840d80a2b (diff)
Merge branch 'mptcp-various-rare-sending-issues'
Matthieu Baerts says: ==================== mptcp: various rare sending issues Here are various fixes from Paolo, addressing very occasional issues on the sending side: - Patch 1: drop an optimisation that could lead to timeout in case of race conditions. A fix for up to v5.11. - Patch 2: fix stream corruption under very specific conditions. A fix for up to v5.13. - Patch 3: restore MPTCP-level zero window probe after a recent fix. A fix for up to v5.16. - Patch 4: new MIB counter to track MPTCP-level zero windows probe to help catching issues similar to the one fixed by the previous patch. ==================== Link: https://patch.msgid.link/20251028-net-mptcp-send-timeout-v1-0-38ffff5a9ec8@kernel.org Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Diffstat (limited to 'net')
-rw-r--r--net/mptcp/mib.c1
-rw-r--r--net/mptcp/mib.h1
-rw-r--r--net/mptcp/protocol.c57
-rw-r--r--net/mptcp/protocol.h2
4 files changed, 39 insertions, 22 deletions
diff --git a/net/mptcp/mib.c b/net/mptcp/mib.c
index 6003e47c770a..171643815076 100644
--- a/net/mptcp/mib.c
+++ b/net/mptcp/mib.c
@@ -85,6 +85,7 @@ static const struct snmp_mib mptcp_snmp_list[] = {
SNMP_MIB_ITEM("DssFallback", MPTCP_MIB_DSSFALLBACK),
SNMP_MIB_ITEM("SimultConnectFallback", MPTCP_MIB_SIMULTCONNFALLBACK),
SNMP_MIB_ITEM("FallbackFailed", MPTCP_MIB_FALLBACKFAILED),
+ SNMP_MIB_ITEM("WinProbe", MPTCP_MIB_WINPROBE),
};
/* mptcp_mib_alloc - allocate percpu mib counters
diff --git a/net/mptcp/mib.h b/net/mptcp/mib.h
index 309bac6fea32..a1d3e9369fbb 100644
--- a/net/mptcp/mib.h
+++ b/net/mptcp/mib.h
@@ -88,6 +88,7 @@ enum linux_mptcp_mib_field {
MPTCP_MIB_DSSFALLBACK, /* Bad or missing DSS */
MPTCP_MIB_SIMULTCONNFALLBACK, /* Simultaneous connect */
MPTCP_MIB_FALLBACKFAILED, /* Can't fallback due to msk status */
+ MPTCP_MIB_WINPROBE, /* MPTCP-level zero window probe */
__MPTCP_MIB_MAX
};
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 875027b9319c..2d6b8de35c44 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -1007,7 +1007,7 @@ static void __mptcp_clean_una(struct sock *sk)
if (WARN_ON_ONCE(!msk->recovery))
break;
- WRITE_ONCE(msk->first_pending, mptcp_send_next(sk));
+ msk->first_pending = mptcp_send_next(sk);
}
dfrag_clear(sk, dfrag);
@@ -1299,7 +1299,12 @@ alloc_skb:
if (copy == 0) {
u64 snd_una = READ_ONCE(msk->snd_una);
- if (snd_una != msk->snd_nxt || tcp_write_queue_tail(ssk)) {
+ /* No need for zero probe if there are any data pending
+ * either at the msk or ssk level; skb is the current write
+ * queue tail and can be empty at this point.
+ */
+ if (snd_una != msk->snd_nxt || skb->len ||
+ skb != tcp_send_head(ssk)) {
tcp_remove_empty_skb(ssk);
return 0;
}
@@ -1350,6 +1355,7 @@ alloc_skb:
mpext->dsn64);
if (zero_window_probe) {
+ MPTCP_INC_STATS(sock_net(ssk), MPTCP_MIB_WINPROBE);
mptcp_subflow_ctx(ssk)->rel_write_seq += copy;
mpext->frozen = 1;
if (READ_ONCE(msk->csum_enabled))
@@ -1552,7 +1558,7 @@ static int __subflow_push_pending(struct sock *sk, struct sock *ssk,
mptcp_update_post_push(msk, dfrag, ret);
}
- WRITE_ONCE(msk->first_pending, mptcp_send_next(sk));
+ msk->first_pending = mptcp_send_next(sk);
if (msk->snd_burst <= 0 ||
!sk_stream_memory_free(ssk) ||
@@ -1912,7 +1918,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
get_page(dfrag->page);
list_add_tail(&dfrag->list, &msk->rtx_queue);
if (!msk->first_pending)
- WRITE_ONCE(msk->first_pending, dfrag);
+ msk->first_pending = dfrag;
}
pr_debug("msk=%p dfrag at seq=%llu len=%u sent=%u new=%d\n", msk,
dfrag->data_seq, dfrag->data_len, dfrag->already_sent,
@@ -1945,22 +1951,36 @@ do_error:
static void mptcp_rcv_space_adjust(struct mptcp_sock *msk, int copied);
-static int __mptcp_recvmsg_mskq(struct sock *sk,
- struct msghdr *msg,
- size_t len, int flags,
+static int __mptcp_recvmsg_mskq(struct sock *sk, struct msghdr *msg,
+ size_t len, int flags, int copied_total,
struct scm_timestamping_internal *tss,
int *cmsg_flags)
{
struct mptcp_sock *msk = mptcp_sk(sk);
struct sk_buff *skb, *tmp;
+ int total_data_len = 0;
int copied = 0;
skb_queue_walk_safe(&sk->sk_receive_queue, skb, tmp) {
- u32 offset = MPTCP_SKB_CB(skb)->offset;
+ u32 delta, offset = MPTCP_SKB_CB(skb)->offset;
u32 data_len = skb->len - offset;
- u32 count = min_t(size_t, len - copied, data_len);
+ u32 count;
int err;
+ if (flags & MSG_PEEK) {
+ /* skip already peeked skbs */
+ if (total_data_len + data_len <= copied_total) {
+ total_data_len += data_len;
+ continue;
+ }
+
+ /* skip the already peeked data in the current skb */
+ delta = copied_total - total_data_len;
+ offset += delta;
+ data_len -= delta;
+ }
+
+ count = min_t(size_t, len - copied, data_len);
if (!(flags & MSG_TRUNC)) {
err = skb_copy_datagram_msg(skb, offset, msg, count);
if (unlikely(err < 0)) {
@@ -1977,16 +1997,14 @@ static int __mptcp_recvmsg_mskq(struct sock *sk,
copied += count;
- if (count < data_len) {
- if (!(flags & MSG_PEEK)) {
+ if (!(flags & MSG_PEEK)) {
+ msk->bytes_consumed += count;
+ if (count < data_len) {
MPTCP_SKB_CB(skb)->offset += count;
MPTCP_SKB_CB(skb)->map_seq += count;
- msk->bytes_consumed += count;
+ break;
}
- break;
- }
- if (!(flags & MSG_PEEK)) {
/* avoid the indirect call, we know the destructor is sock_rfree */
skb->destructor = NULL;
skb->sk = NULL;
@@ -1994,7 +2012,6 @@ static int __mptcp_recvmsg_mskq(struct sock *sk,
sk_mem_uncharge(sk, skb->truesize);
__skb_unlink(skb, &sk->sk_receive_queue);
skb_attempt_defer_free(skb);
- msk->bytes_consumed += count;
}
if (copied >= len)
@@ -2191,7 +2208,8 @@ static int mptcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
while (copied < len) {
int err, bytes_read;
- bytes_read = __mptcp_recvmsg_mskq(sk, msg, len - copied, flags, &tss, &cmsg_flags);
+ bytes_read = __mptcp_recvmsg_mskq(sk, msg, len - copied, flags,
+ copied, &tss, &cmsg_flags);
if (unlikely(bytes_read < 0)) {
if (!copied)
copied = bytes_read;
@@ -2882,7 +2900,7 @@ static void __mptcp_clear_xmit(struct sock *sk)
struct mptcp_sock *msk = mptcp_sk(sk);
struct mptcp_data_frag *dtmp, *dfrag;
- WRITE_ONCE(msk->first_pending, NULL);
+ msk->first_pending = NULL;
list_for_each_entry_safe(dfrag, dtmp, &msk->rtx_queue, list)
dfrag_clear(sk, dfrag);
}
@@ -3422,9 +3440,6 @@ void __mptcp_data_acked(struct sock *sk)
void __mptcp_check_push(struct sock *sk, struct sock *ssk)
{
- if (!mptcp_send_head(sk))
- return;
-
if (!sock_owned_by_user(sk))
__mptcp_subflow_push_pending(sk, ssk, false);
else
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index 52f9cfa4ce95..379a88e14e8d 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -414,7 +414,7 @@ static inline struct mptcp_data_frag *mptcp_send_head(const struct sock *sk)
{
const struct mptcp_sock *msk = mptcp_sk(sk);
- return READ_ONCE(msk->first_pending);
+ return msk->first_pending;
}
static inline struct mptcp_data_frag *mptcp_send_next(struct sock *sk)