diff options
Diffstat (limited to 'net/mptcp')
-rw-r--r-- | net/mptcp/fastopen.c | 11 | ||||
-rw-r--r-- | net/mptcp/options.c | 14 | ||||
-rw-r--r-- | net/mptcp/pm.c | 4 | ||||
-rw-r--r-- | net/mptcp/pm_netlink.c | 6 | ||||
-rw-r--r-- | net/mptcp/pm_userspace.c | 4 | ||||
-rw-r--r-- | net/mptcp/protocol.c | 183 | ||||
-rw-r--r-- | net/mptcp/protocol.h | 17 | ||||
-rw-r--r-- | net/mptcp/sockopt.c | 46 | ||||
-rw-r--r-- | net/mptcp/subflow.c | 147 |
9 files changed, 275 insertions, 157 deletions
diff --git a/net/mptcp/fastopen.c b/net/mptcp/fastopen.c index d237d142171c..bceaab8dd8e4 100644 --- a/net/mptcp/fastopen.c +++ b/net/mptcp/fastopen.c @@ -9,11 +9,18 @@ void mptcp_fastopen_subflow_synack_set_params(struct mptcp_subflow_context *subflow, struct request_sock *req) { - struct sock *ssk = subflow->tcp_sock; - struct sock *sk = subflow->conn; + struct sock *sk, *ssk; struct sk_buff *skb; struct tcp_sock *tp; + /* on early fallback the subflow context is deleted by + * subflow_syn_recv_sock() + */ + if (!subflow) + return; + + ssk = subflow->tcp_sock; + sk = subflow->conn; tp = tcp_sk(ssk); subflow->is_mptfo = 1; diff --git a/net/mptcp/options.c b/net/mptcp/options.c index b30cea2fbf3f..19a01b6566f1 100644 --- a/net/mptcp/options.c +++ b/net/mptcp/options.c @@ -442,7 +442,6 @@ static void clear_3rdack_retransmission(struct sock *sk) static bool mptcp_established_options_mp(struct sock *sk, struct sk_buff *skb, bool snd_data_fin_enable, unsigned int *size, - unsigned int remaining, struct mptcp_out_options *opts) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); @@ -556,7 +555,6 @@ static void mptcp_write_data_fin(struct mptcp_subflow_context *subflow, static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, bool snd_data_fin_enable, unsigned int *size, - unsigned int remaining, struct mptcp_out_options *opts) { struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk); @@ -580,7 +578,6 @@ static bool mptcp_established_options_dss(struct sock *sk, struct sk_buff *skb, opts->ext_copy = *mpext; } - remaining -= map_size; dss_size = map_size; if (skb && snd_data_fin_enable) mptcp_write_data_fin(subflow, skb, &opts->ext_copy); @@ -851,9 +848,9 @@ bool mptcp_established_options(struct sock *sk, struct sk_buff *skb, } snd_data_fin = mptcp_data_fin_enabled(msk); - if (mptcp_established_options_mp(sk, skb, snd_data_fin, &opt_size, remaining, opts)) + if (mptcp_established_options_mp(sk, skb, snd_data_fin, &opt_size, opts)) ret = true; - else if (mptcp_established_options_dss(sk, skb, snd_data_fin, &opt_size, remaining, opts)) { + else if (mptcp_established_options_dss(sk, skb, snd_data_fin, &opt_size, opts)) { unsigned int mp_fail_size; ret = true; @@ -1001,7 +998,7 @@ check_notify: clear_3rdack_retransmission(ssk); mptcp_pm_subflow_established(msk); } else { - mptcp_pm_fully_established(msk, ssk, GFP_ATOMIC); + mptcp_pm_fully_established(msk, ssk); } return true; @@ -1192,9 +1189,8 @@ bool mptcp_incoming_options(struct sock *sk, struct sk_buff *skb) */ if (TCP_SKB_CB(skb)->seq == TCP_SKB_CB(skb)->end_seq) { if (mp_opt.data_fin && mp_opt.data_len == 1 && - mptcp_update_rcv_data_fin(msk, mp_opt.data_seq, mp_opt.dsn64) && - schedule_work(&msk->work)) - sock_hold(subflow->conn); + mptcp_update_rcv_data_fin(msk, mp_opt.data_seq, mp_opt.dsn64)) + mptcp_schedule_work((struct sock *)msk); return true; } diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c index 70f0ced3ca86..78c924506e83 100644 --- a/net/mptcp/pm.c +++ b/net/mptcp/pm.c @@ -126,7 +126,7 @@ static bool mptcp_pm_schedule_work(struct mptcp_sock *msk, return true; } -void mptcp_pm_fully_established(struct mptcp_sock *msk, const struct sock *ssk, gfp_t gfp) +void mptcp_pm_fully_established(struct mptcp_sock *msk, const struct sock *ssk) { struct mptcp_pm_data *pm = &msk->pm; bool announce = false; @@ -150,7 +150,7 @@ void mptcp_pm_fully_established(struct mptcp_sock *msk, const struct sock *ssk, spin_unlock_bh(&pm->lock); if (announce) - mptcp_event(MPTCP_EVENT_ESTABLISHED, msk, ssk, gfp); + mptcp_event(MPTCP_EVENT_ESTABLISHED, msk, ssk, GFP_ATOMIC); } void mptcp_pm_connection_closed(struct mptcp_sock *msk) diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index 5c8dea49626c..bc343dab5e3f 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -1035,8 +1035,8 @@ static int mptcp_pm_nl_create_listen_socket(struct sock *sk, lock_sock(newsk); ssock = __mptcp_nmpc_socket(mptcp_sk(newsk)); release_sock(newsk); - if (!ssock) - return -EINVAL; + if (IS_ERR(ssock)) + return PTR_ERR(ssock); mptcp_info2sockaddr(&entry->addr, &addr, entry->addr.family); #if IS_ENABLED(CONFIG_MPTCP_IPV6) @@ -2035,7 +2035,7 @@ static int mptcp_event_put_token_and_ssk(struct sk_buff *skb, nla_put_s32(skb, MPTCP_ATTR_IF_IDX, ssk->sk_bound_dev_if)) return -EMSGSIZE; - sk_err = ssk->sk_err; + sk_err = READ_ONCE(ssk->sk_err); if (sk_err && sk->sk_state == TCP_ESTABLISHED && nla_put_u8(skb, MPTCP_ATTR_ERROR, sk_err)) return -EMSGSIZE; diff --git a/net/mptcp/pm_userspace.c b/net/mptcp/pm_userspace.c index a02d3cbf2a1b..27a275805c06 100644 --- a/net/mptcp/pm_userspace.c +++ b/net/mptcp/pm_userspace.c @@ -25,8 +25,8 @@ void mptcp_free_local_addr_list(struct mptcp_sock *msk) } } -int mptcp_userspace_pm_append_new_local_addr(struct mptcp_sock *msk, - struct mptcp_pm_addr_entry *entry) +static int mptcp_userspace_pm_append_new_local_addr(struct mptcp_sock *msk, + struct mptcp_pm_addr_entry *entry) { DECLARE_BITMAP(id_bitmap, MPTCP_PM_MAX_ADDR_ID + 1); struct mptcp_pm_addr_entry *match = NULL; diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index 60b23b2716c4..08dc53f56bc2 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -49,18 +49,6 @@ static void __mptcp_check_send_data_fin(struct sock *sk); DEFINE_PER_CPU(struct mptcp_delegated_action, mptcp_delegated_actions); static struct net_device mptcp_napi_dev; -/* If msk has an initial subflow socket, and the MP_CAPABLE handshake has not - * completed yet or has failed, return the subflow socket. - * Otherwise return NULL. - */ -struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk) -{ - if (!msk->subflow || READ_ONCE(msk->can_ack)) - return NULL; - - return msk->subflow; -} - /* Returns end sequence number of the receiver's advertised window */ static u64 mptcp_wnd_end(const struct mptcp_sock *msk) { @@ -116,6 +104,31 @@ static int __mptcp_socket_create(struct mptcp_sock *msk) return 0; } +/* If the MPC handshake is not started, returns the first subflow, + * eventually allocating it. + */ +struct socket *__mptcp_nmpc_socket(struct mptcp_sock *msk) +{ + struct sock *sk = (struct sock *)msk; + int ret; + + if (!((1 << sk->sk_state) & (TCPF_CLOSE | TCPF_LISTEN))) + return ERR_PTR(-EINVAL); + + if (!msk->subflow) { + if (msk->first) + return ERR_PTR(-EINVAL); + + ret = __mptcp_socket_create(msk); + if (ret) + return ERR_PTR(ret); + + mptcp_sockopt_sync(msk, msk->first); + } + + return msk->subflow; +} + static void mptcp_drop(struct sock *sk, struct sk_buff *skb) { sk_drops_add(sk, skb); @@ -459,7 +472,7 @@ static bool mptcp_pending_data_fin(struct sock *sk, u64 *seq) return false; } -static void mptcp_set_datafin_timeout(const struct sock *sk) +static void mptcp_set_datafin_timeout(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); u32 retransmits; @@ -1662,13 +1675,31 @@ static void mptcp_set_nospace(struct sock *sk) static int mptcp_disconnect(struct sock *sk, int flags); -static int mptcp_sendmsg_fastopen(struct sock *sk, struct sock *ssk, struct msghdr *msg, +static int mptcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, size_t len, int *copied_syn) { unsigned int saved_flags = msg->msg_flags; struct mptcp_sock *msk = mptcp_sk(sk); + struct socket *ssock; + struct sock *ssk; int ret; + /* on flags based fastopen the mptcp is supposed to create the + * first subflow right now. Otherwise we are in the defer_connect + * path, and the first subflow must be already present. + * Since the defer_connect flag is cleared after the first succsful + * fastopen attempt, no need to check for additional subflow status. + */ + if (msg->msg_flags & MSG_FASTOPEN) { + ssock = __mptcp_nmpc_socket(msk); + if (IS_ERR(ssock)) + return PTR_ERR(ssock); + } + if (!msk->first) + return -EINVAL; + + ssk = msk->first; + lock_sock(ssk); msg->msg_flags |= MSG_DONTWAIT; msk->connect_flags = O_NONBLOCK; @@ -1691,6 +1722,7 @@ static int mptcp_sendmsg_fastopen(struct sock *sk, struct sock *ssk, struct msgh } else if (ret && ret != -EINPROGRESS) { mptcp_disconnect(sk, 0); } + inet_sk(sk)->defer_connect = 0; return ret; } @@ -1699,7 +1731,6 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { struct mptcp_sock *msk = mptcp_sk(sk); struct page_frag *pfrag; - struct socket *ssock; size_t copied = 0; int ret = 0; long timeo; @@ -1709,12 +1740,10 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) lock_sock(sk); - ssock = __mptcp_nmpc_socket(msk); - if (unlikely(ssock && (inet_sk(ssock->sk)->defer_connect || - msg->msg_flags & MSG_FASTOPEN))) { + if (unlikely(inet_sk(sk)->defer_connect || msg->msg_flags & MSG_FASTOPEN)) { int copied_syn = 0; - ret = mptcp_sendmsg_fastopen(sk, ssock->sk, msg, len, &copied_syn); + ret = mptcp_sendmsg_fastopen(sk, msg, len, &copied_syn); copied += copied_syn; if (ret == -EINPROGRESS && copied_syn > 0) goto out; @@ -2315,7 +2344,26 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, unsigned int flags) { struct mptcp_sock *msk = mptcp_sk(sk); - bool need_push, dispose_it; + bool dispose_it, need_push = false; + + /* If the first subflow moved to a close state before accept, e.g. due + * to an incoming reset, mptcp either: + * - if either the subflow or the msk are dead, destroy the context + * (the subflow socket is deleted by inet_child_forget) and the msk + * - otherwise do nothing at the moment and take action at accept and/or + * listener shutdown - user-space must be able to accept() the closed + * socket. + */ + if (msk->in_accept_queue && msk->first == ssk) { + if (!sock_flag(sk, SOCK_DEAD) && !sock_flag(ssk, SOCK_DEAD)) + return; + + /* ensure later check in mptcp_worker() will dispose the msk */ + sock_set_flag(sk, SOCK_DEAD); + lock_sock_nested(ssk, SINGLE_DEPTH_NESTING); + mptcp_subflow_drop_ctx(ssk); + goto out_release; + } dispose_it = !msk->subflow || ssk != msk->subflow->sk; if (dispose_it) @@ -2351,28 +2399,22 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, if (!inet_csk(ssk)->icsk_ulp_ops) { WARN_ON_ONCE(!sock_flag(ssk, SOCK_DEAD)); kfree_rcu(subflow, rcu); - } else if (msk->in_accept_queue && msk->first == ssk) { - /* if the first subflow moved to a close state, e.g. due to - * incoming reset and we reach here before inet_child_forget() - * the TCP stack could later try to close it via - * inet_csk_listen_stop(), or deliver it to the user space via - * accept(). - * We can't delete the subflow - or risk a double free - nor let - * the msk survive - or will be leaked in the non accept scenario: - * fallback and let TCP cope with the subflow cleanup. - */ - WARN_ON_ONCE(sock_flag(ssk, SOCK_DEAD)); - mptcp_subflow_drop_ctx(ssk); } else { /* otherwise tcp will dispose of the ssk and subflow ctx */ - if (ssk->sk_state == TCP_LISTEN) + if (ssk->sk_state == TCP_LISTEN) { + tcp_set_state(ssk, TCP_CLOSE); + mptcp_subflow_queue_clean(sk, ssk); + inet_csk_listen_stop(ssk); mptcp_event_pm_listener(ssk, MPTCP_EVENT_LISTENER_CLOSED); + } __tcp_close(ssk, 0); /* close acquired an extra ref */ __sock_put(ssk); } + +out_release: release_sock(ssk); sock_put(ssk); @@ -2427,21 +2469,14 @@ static void __mptcp_close_subflow(struct sock *sk) mptcp_close_ssk(sk, ssk, subflow); } - /* if the MPC subflow has been closed before the msk is accepted, - * msk will never be accept-ed, close it now - */ - if (!msk->first && msk->in_accept_queue) { - sock_set_flag(sk, SOCK_DEAD); - inet_sk_state_store(sk, TCP_CLOSE); - } } -static bool mptcp_check_close_timeout(const struct sock *sk) +static bool mptcp_should_close(const struct sock *sk) { s32 delta = tcp_jiffies32 - inet_csk(sk)->icsk_mtup.probe_timestamp; struct mptcp_subflow_context *subflow; - if (delta >= TCP_TIMEWAIT_LEN) + if (delta >= TCP_TIMEWAIT_LEN || mptcp_sk(sk)->in_accept_queue) return true; /* if all subflows are in closed status don't bother with additional @@ -2480,15 +2515,15 @@ static void mptcp_check_fastclose(struct mptcp_sock *msk) /* Mirror the tcp_reset() error propagation */ switch (sk->sk_state) { case TCP_SYN_SENT: - sk->sk_err = ECONNREFUSED; + WRITE_ONCE(sk->sk_err, ECONNREFUSED); break; case TCP_CLOSE_WAIT: - sk->sk_err = EPIPE; + WRITE_ONCE(sk->sk_err, EPIPE); break; case TCP_CLOSE: return; default: - sk->sk_err = ECONNRESET; + WRITE_ONCE(sk->sk_err, ECONNRESET); } inet_sk_state_store(sk, TCP_CLOSE); @@ -2626,7 +2661,7 @@ static void mptcp_worker(struct work_struct *work) lock_sock(sk); state = sk->sk_state; - if (unlikely(state == TCP_CLOSE)) + if (unlikely((1 << state) & (TCPF_CLOSE | TCPF_LISTEN))) goto unlock; mptcp_check_data_fin_ack(sk); @@ -2649,7 +2684,7 @@ static void mptcp_worker(struct work_struct *work) * even if it is orphaned and in FIN_WAIT2 state */ if (sock_flag(sk, SOCK_DEAD)) { - if (mptcp_check_close_timeout(sk)) { + if (mptcp_should_close(sk)) { inet_sk_state_store(sk, TCP_CLOSE); mptcp_do_fastclose(sk); } @@ -2728,10 +2763,6 @@ static int mptcp_init_sock(struct sock *sk) if (unlikely(!net->mib.mptcp_statistics) && !mptcp_mib_alloc(net)) return -ENOMEM; - ret = __mptcp_socket_create(mptcp_sk(sk)); - if (ret) - return ret; - set_bit(SOCK_CUSTOM_SOCKOPT, &sk->sk_socket->flags); /* fetch the ca name; do it outside __mptcp_init_sock(), so that clone will @@ -2895,6 +2926,14 @@ static void __mptcp_destroy_sock(struct sock *sk) sock_put(sk); } +void __mptcp_unaccepted_force_close(struct sock *sk) +{ + sock_set_flag(sk, SOCK_DEAD); + inet_sk_state_store(sk, TCP_CLOSE); + mptcp_do_fastclose(sk); + __mptcp_destroy_sock(sk); +} + static __poll_t mptcp_check_readable(struct mptcp_sock *msk) { /* Concurrent splices from sk_receive_queue into receive_queue will @@ -2928,10 +2967,13 @@ bool __mptcp_close(struct sock *sk, long timeout) goto cleanup; } - if (mptcp_check_readable(msk)) { - /* the msk has read data, do the MPTCP equivalent of TCP reset */ + if (mptcp_check_readable(msk) || timeout < 0) { + /* If the msk has read data, or the caller explicitly ask it, + * do the MPTCP equivalent of TCP reset, aka MPTCP fastclose + */ inet_sk_state_store(sk, TCP_CLOSE); mptcp_do_fastclose(sk); + timeout = 0; } else if (mptcp_close_state(sk)) { __mptcp_wr_shutdown(sk); } @@ -3143,7 +3185,7 @@ static struct sock *mptcp_accept(struct sock *sk, int flags, int *err, struct socket *listener; struct sock *newsk; - listener = __mptcp_nmpc_socket(msk); + listener = msk->subflow; if (WARN_ON_ONCE(!listener)) { *err = -EINVAL; return NULL; @@ -3363,7 +3405,7 @@ static int mptcp_get_port(struct sock *sk, unsigned short snum) struct mptcp_sock *msk = mptcp_sk(sk); struct socket *ssock; - ssock = __mptcp_nmpc_socket(msk); + ssock = msk->subflow; pr_debug("msk=%p, subflow=%p", msk, ssock); if (WARN_ON_ONCE(!ssock)) return -EINVAL; @@ -3551,8 +3593,8 @@ static int mptcp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) int err = -EINVAL; ssock = __mptcp_nmpc_socket(msk); - if (!ssock) - return -EINVAL; + if (IS_ERR(ssock)) + return PTR_ERR(ssock); mptcp_token_destroy(msk); inet_sk_state_store(sk, TCP_SYN_SENT); @@ -3640,8 +3682,8 @@ static int mptcp_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) lock_sock(sock->sk); ssock = __mptcp_nmpc_socket(msk); - if (!ssock) { - err = -EINVAL; + if (IS_ERR(ssock)) { + err = PTR_ERR(ssock); goto unlock; } @@ -3677,8 +3719,8 @@ static int mptcp_listen(struct socket *sock, int backlog) lock_sock(sk); ssock = __mptcp_nmpc_socket(msk); - if (!ssock) { - err = -EINVAL; + if (IS_ERR(ssock)) { + err = PTR_ERR(ssock); goto unlock; } @@ -3709,7 +3751,10 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, pr_debug("msk=%p", msk); - ssock = __mptcp_nmpc_socket(msk); + /* buggy applications can call accept on socket states other then LISTEN + * but no need to allocate the first subflow just to error out. + */ + ssock = msk->subflow; if (!ssock) return -EINVAL; @@ -3733,6 +3778,18 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, if (!ssk->sk_socket) mptcp_sock_graft(ssk, newsock); } + + /* Do late cleanup for the first subflow as necessary. Also + * deal with bad peers not doing a complete shutdown. + */ + if (msk->first && + unlikely(inet_sk_state_load(msk->first) == TCP_CLOSE)) { + __mptcp_close_ssk(newsk, msk->first, + mptcp_subflow_ctx(msk->first), 0); + if (unlikely(list_empty(&msk->conn_list))) + inet_sk_state_store(newsk, TCP_CLOSE); + } + release_sock(newsk); } @@ -3791,7 +3848,7 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock, /* This barrier is coupled with smp_wmb() in __mptcp_error_report() */ smp_rmb(); - if (sk->sk_err) + if (READ_ONCE(sk->sk_err)) mask |= EPOLLERR; return mask; diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 339a6f072989..2d7b2c80a164 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -334,10 +334,7 @@ static inline void msk_owned_by_me(const struct mptcp_sock *msk) sock_owned_by_me((const struct sock *)msk); } -static inline struct mptcp_sock *mptcp_sk(const struct sock *sk) -{ - return (struct mptcp_sock *)sk; -} +#define mptcp_sk(ptr) container_of_const(ptr, struct mptcp_sock, sk.icsk_inet.sk) /* the msk socket don't use the backlog, also account for the bulk * free memory @@ -371,7 +368,7 @@ static inline struct mptcp_data_frag *mptcp_send_next(struct sock *sk) static inline struct mptcp_data_frag *mptcp_pending_tail(const struct sock *sk) { - struct mptcp_sock *msk = mptcp_sk(sk); + const struct mptcp_sock *msk = mptcp_sk(sk); if (!msk->first_pending) return NULL; @@ -382,7 +379,7 @@ static inline struct mptcp_data_frag *mptcp_pending_tail(const struct sock *sk) return list_last_entry(&msk->rtx_queue, struct mptcp_data_frag, list); } -static inline struct mptcp_data_frag *mptcp_rtx_head(const struct sock *sk) +static inline struct mptcp_data_frag *mptcp_rtx_head(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); @@ -629,10 +626,12 @@ void mptcp_close_ssk(struct sock *sk, struct sock *ssk, struct mptcp_subflow_context *subflow); void __mptcp_subflow_send_ack(struct sock *ssk); void mptcp_subflow_reset(struct sock *ssk); +void mptcp_subflow_queue_clean(struct sock *sk, struct sock *ssk); void mptcp_sock_graft(struct sock *sk, struct socket *parent); -struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk); +struct socket *__mptcp_nmpc_socket(struct mptcp_sock *msk); bool __mptcp_close(struct sock *sk, long timeout); void mptcp_cancel_work(struct sock *sk); +void __mptcp_unaccepted_force_close(struct sock *sk); void mptcp_set_owner_r(struct sk_buff *skb, struct sock *sk); bool mptcp_addresses_equal(const struct mptcp_addr_info *a, @@ -785,7 +784,7 @@ bool mptcp_pm_addr_families_match(const struct sock *sk, void mptcp_pm_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk); void mptcp_pm_nl_subflow_chk_stale(const struct mptcp_sock *msk, struct sock *ssk); void mptcp_pm_new_connection(struct mptcp_sock *msk, const struct sock *ssk, int server_side); -void mptcp_pm_fully_established(struct mptcp_sock *msk, const struct sock *ssk, gfp_t gfp); +void mptcp_pm_fully_established(struct mptcp_sock *msk, const struct sock *ssk); bool mptcp_pm_allow_new_subflow(struct mptcp_sock *msk); void mptcp_pm_connection_closed(struct mptcp_sock *msk); void mptcp_pm_subflow_established(struct mptcp_sock *msk); @@ -833,8 +832,6 @@ int mptcp_pm_remove_subflow(struct mptcp_sock *msk, const struct mptcp_rm_list * void mptcp_pm_remove_addrs_and_subflows(struct mptcp_sock *msk, struct list_head *rm_list); -int mptcp_userspace_pm_append_new_local_addr(struct mptcp_sock *msk, - struct mptcp_pm_addr_entry *entry); void mptcp_free_local_addr_list(struct mptcp_sock *msk); int mptcp_nl_cmd_announce(struct sk_buff *skb, struct genl_info *info); int mptcp_nl_cmd_remove(struct sk_buff *skb, struct genl_info *info); diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index 8a9656248b0f..d4258869ac48 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -301,9 +301,9 @@ static int mptcp_setsockopt_sol_socket(struct mptcp_sock *msk, int optname, case SO_BINDTOIFINDEX: lock_sock(sk); ssock = __mptcp_nmpc_socket(msk); - if (!ssock) { + if (IS_ERR(ssock)) { release_sock(sk); - return -EINVAL; + return PTR_ERR(ssock); } ret = sock_setsockopt(ssock, SOL_SOCKET, optname, optval, optlen); @@ -396,9 +396,9 @@ static int mptcp_setsockopt_v6(struct mptcp_sock *msk, int optname, case IPV6_FREEBIND: lock_sock(sk); ssock = __mptcp_nmpc_socket(msk); - if (!ssock) { + if (IS_ERR(ssock)) { release_sock(sk); - return -EINVAL; + return PTR_ERR(ssock); } ret = tcp_setsockopt(ssock->sk, SOL_IPV6, optname, optval, optlen); @@ -693,9 +693,9 @@ static int mptcp_setsockopt_sol_ip_set_transparent(struct mptcp_sock *msk, int o lock_sock(sk); ssock = __mptcp_nmpc_socket(msk); - if (!ssock) { + if (IS_ERR(ssock)) { release_sock(sk); - return -EINVAL; + return PTR_ERR(ssock); } issk = inet_sk(ssock->sk); @@ -762,13 +762,15 @@ static int mptcp_setsockopt_first_sf_only(struct mptcp_sock *msk, int level, int { struct sock *sk = (struct sock *)msk; struct socket *sock; - int ret = -EINVAL; + int ret; /* Limit to first subflow, before the connection establishment */ lock_sock(sk); sock = __mptcp_nmpc_socket(msk); - if (!sock) + if (IS_ERR(sock)) { + ret = PTR_ERR(sock); goto unlock; + } ret = tcp_setsockopt(sock->sk, level, optname, optval, optlen); @@ -861,7 +863,7 @@ static int mptcp_getsockopt_first_sf_only(struct mptcp_sock *msk, int level, int { struct sock *sk = (struct sock *)msk; struct socket *ssock; - int ret = -EINVAL; + int ret; struct sock *ssk; lock_sock(sk); @@ -872,8 +874,10 @@ static int mptcp_getsockopt_first_sf_only(struct mptcp_sock *msk, int level, int } ssock = __mptcp_nmpc_socket(msk); - if (!ssock) + if (IS_ERR(ssock)) { + ret = PTR_ERR(ssock); goto out; + } ret = tcp_getsockopt(ssock->sk, level, optname, optval, optlen); @@ -885,7 +889,6 @@ out: void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info) { u32 flags = 0; - u8 val; memset(info, 0, sizeof(*info)); @@ -893,12 +896,19 @@ void mptcp_diag_fill_info(struct mptcp_sock *msk, struct mptcp_info *info) info->mptcpi_add_addr_signal = READ_ONCE(msk->pm.add_addr_signaled); info->mptcpi_add_addr_accepted = READ_ONCE(msk->pm.add_addr_accepted); info->mptcpi_local_addr_used = READ_ONCE(msk->pm.local_addr_used); - info->mptcpi_subflows_max = mptcp_pm_get_subflows_max(msk); - val = mptcp_pm_get_add_addr_signal_max(msk); - info->mptcpi_add_addr_signal_max = val; - val = mptcp_pm_get_add_addr_accept_max(msk); - info->mptcpi_add_addr_accepted_max = val; - info->mptcpi_local_addr_max = mptcp_pm_get_local_addr_max(msk); + + /* The following limits only make sense for the in-kernel PM */ + if (mptcp_pm_is_kernel(msk)) { + info->mptcpi_subflows_max = + mptcp_pm_get_subflows_max(msk); + info->mptcpi_add_addr_signal_max = + mptcp_pm_get_add_addr_signal_max(msk); + info->mptcpi_add_addr_accepted_max = + mptcp_pm_get_add_addr_accept_max(msk); + info->mptcpi_local_addr_max = + mptcp_pm_get_local_addr_max(msk); + } + if (test_bit(MPTCP_FALLBACK_DONE, &msk->flags)) flags |= MPTCP_INFO_FLAG_FALLBACK; if (READ_ONCE(msk->can_ack)) @@ -1046,7 +1056,7 @@ static int mptcp_getsockopt_tcpinfo(struct mptcp_sock *msk, char __user *optval, static void mptcp_get_sub_addrs(const struct sock *sk, struct mptcp_subflow_addrs *a) { - struct inet_sock *inet = inet_sk(sk); + const struct inet_sock *inet = inet_sk(sk); memset(a, 0, sizeof(*a)); diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index a0041360ee9d..ba065b66551a 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -408,9 +408,8 @@ void mptcp_subflow_reset(struct sock *ssk) tcp_send_active_reset(ssk, GFP_ATOMIC); tcp_done(ssk); - if (!test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &mptcp_sk(sk)->flags) && - schedule_work(&mptcp_sk(sk)->work)) - return; /* worker will put sk for us */ + if (!test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &mptcp_sk(sk)->flags)) + mptcp_schedule_work(sk); sock_put(sk); } @@ -696,14 +695,6 @@ static bool subflow_hmac_valid(const struct request_sock *req, return !crypto_memneq(hmac, mp_opt->hmac, MPTCPOPT_HMAC_LEN); } -static void mptcp_force_close(struct sock *sk) -{ - /* the msk is not yet exposed to user-space, and refcount is 2 */ - inet_sk_state_store(sk, TCP_CLOSE); - sk_common_release(sk); - sock_put(sk); -} - static void subflow_ulp_fallback(struct sock *sk, struct mptcp_subflow_context *old_ctx) { @@ -724,9 +715,12 @@ void mptcp_subflow_drop_ctx(struct sock *ssk) if (!ctx) return; - subflow_ulp_fallback(ssk, ctx); - if (ctx->conn) - sock_put(ctx->conn); + list_del(&mptcp_subflow_ctx(ssk)->node); + if (inet_csk(ssk)->icsk_ulp_ops) { + subflow_ulp_fallback(ssk, ctx); + if (ctx->conn) + sock_put(ctx->conn); + } kfree_rcu(ctx, rcu); } @@ -755,7 +749,6 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk, struct mptcp_subflow_request_sock *subflow_req; struct mptcp_options_received mp_opt; bool fallback, fallback_is_fatal; - struct sock *new_msk = NULL; struct mptcp_sock *owner; struct sock *child; @@ -784,14 +777,9 @@ static struct sock *subflow_syn_recv_sock(const struct sock *sk, * options. */ mptcp_get_options(skb, &mp_opt); - if (!(mp_opt.suboptions & OPTIONS_MPTCP_MPC)) { + if (!(mp_opt.suboptions & OPTIONS_MPTCP_MPC)) fallback = true; - goto create_child; - } - new_msk = mptcp_sk_clone(listener->conn, &mp_opt, req); - if (!new_msk) - fallback = true; } else if (subflow_req->mp_join) { mptcp_get_options(skb, &mp_opt); if (!(mp_opt.suboptions & OPTIONS_MPTCP_MPJ) || @@ -820,23 +808,23 @@ create_child: subflow_add_reset_reason(skb, MPTCP_RST_EMPTCP); goto dispose_child; } - - if (new_msk) - mptcp_copy_inaddrs(new_msk, child); - mptcp_subflow_drop_ctx(child); - goto out; + goto fallback; } /* ssk inherits options of listener sk */ ctx->setsockopt_seq = listener->setsockopt_seq; if (ctx->mp_capable) { - owner = mptcp_sk(new_msk); + ctx->conn = mptcp_sk_clone(listener->conn, &mp_opt, req); + if (!ctx->conn) + goto fallback; + + owner = mptcp_sk(ctx->conn); /* this can't race with mptcp_close(), as the msk is * not yet exposted to user-space */ - inet_sk_state_store((void *)new_msk, TCP_ESTABLISHED); + inet_sk_state_store(ctx->conn, TCP_ESTABLISHED); /* record the newly created socket as the first msk * subflow, but don't link it yet into conn_list @@ -846,11 +834,9 @@ create_child: /* new mpc subflow takes ownership of the newly * created mptcp socket */ - mptcp_sk(new_msk)->setsockopt_seq = ctx->setsockopt_seq; + owner->setsockopt_seq = ctx->setsockopt_seq; mptcp_pm_new_connection(owner, child, 1); mptcp_token_accept(subflow_req, owner); - ctx->conn = new_msk; - new_msk = NULL; /* set msk addresses early to ensure mptcp_pm_get_local_id() * uses the correct data @@ -867,7 +853,7 @@ create_child: */ if (mp_opt.suboptions & OPTION_MPTCP_MPC_ACK) { mptcp_subflow_fully_established(ctx, &mp_opt); - mptcp_pm_fully_established(owner, child, GFP_ATOMIC); + mptcp_pm_fully_established(owner, child); ctx->pm_notified = 1; } } else if (ctx->mp_join) { @@ -900,11 +886,6 @@ create_child: } } -out: - /* dispose of the left over mptcp master, if any */ - if (unlikely(new_msk)) - mptcp_force_close(new_msk); - /* check for expected invariant - should never trigger, just help * catching eariler subtle bugs */ @@ -922,6 +903,10 @@ dispose_child: /* The last child reference will be released by the caller */ return child; + +fallback: + mptcp_subflow_drop_ctx(child); + return child; } static struct inet_connection_sock_af_ops subflow_specific __ro_after_init; @@ -1118,8 +1103,8 @@ static enum mapping_status get_mapping_status(struct sock *ssk, skb_ext_del(skb, SKB_EXT_MPTCP); return MAPPING_OK; } else { - if (updated && schedule_work(&msk->work)) - sock_hold((struct sock *)msk); + if (updated) + mptcp_schedule_work((struct sock *)msk); return MAPPING_DATA_FIN; } @@ -1222,17 +1207,12 @@ static void mptcp_subflow_discard_data(struct sock *ssk, struct sk_buff *skb, /* sched mptcp worker to remove the subflow if no more data is pending */ static void subflow_sched_work_if_closed(struct mptcp_sock *msk, struct sock *ssk) { - struct sock *sk = (struct sock *)msk; - if (likely(ssk->sk_state != TCP_CLOSE)) return; if (skb_queue_empty(&ssk->sk_receive_queue) && - !test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags)) { - sock_hold(sk); - if (!schedule_work(&msk->work)) - sock_put(sk); - } + !test_and_set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags)) + mptcp_schedule_work((struct sock *)msk); } static bool subflow_can_fallback(struct mptcp_subflow_context *subflow) @@ -1350,7 +1330,7 @@ fallback: subflow->reset_reason = MPTCP_RST_EMPTCP; reset: - ssk->sk_err = EBADMSG; + WRITE_ONCE(ssk->sk_err, EBADMSG); tcp_set_state(ssk, TCP_CLOSE); while ((skb = skb_peek(&ssk->sk_receive_queue))) sk_eat_skb(ssk, skb); @@ -1434,7 +1414,7 @@ void __mptcp_error_report(struct sock *sk) ssk_state = inet_sk_state_load(ssk); if (ssk_state == TCP_CLOSE && !sock_flag(sk, SOCK_DEAD)) inet_sk_state_store(sk, ssk_state); - sk->sk_err = -err; + WRITE_ONCE(sk->sk_err, -err); /* This barrier is coupled with smp_rmb() in mptcp_poll() */ smp_wmb(); @@ -1825,6 +1805,77 @@ static void subflow_state_change(struct sock *sk) } } +void mptcp_subflow_queue_clean(struct sock *listener_sk, struct sock *listener_ssk) +{ + struct request_sock_queue *queue = &inet_csk(listener_ssk)->icsk_accept_queue; + struct mptcp_sock *msk, *next, *head = NULL; + struct request_sock *req; + struct sock *sk; + + /* build a list of all unaccepted mptcp sockets */ + spin_lock_bh(&queue->rskq_lock); + for (req = queue->rskq_accept_head; req; req = req->dl_next) { + struct mptcp_subflow_context *subflow; + struct sock *ssk = req->sk; + + if (!sk_is_mptcp(ssk)) + continue; + + subflow = mptcp_subflow_ctx(ssk); + if (!subflow || !subflow->conn) + continue; + + /* skip if already in list */ + sk = subflow->conn; + msk = mptcp_sk(sk); + if (msk->dl_next || msk == head) + continue; + + sock_hold(sk); + msk->dl_next = head; + head = msk; + } + spin_unlock_bh(&queue->rskq_lock); + if (!head) + return; + + /* can't acquire the msk socket lock under the subflow one, + * or will cause ABBA deadlock + */ + release_sock(listener_ssk); + + for (msk = head; msk; msk = next) { + sk = (struct sock *)msk; + + lock_sock_nested(sk, SINGLE_DEPTH_NESTING); + next = msk->dl_next; + msk->dl_next = NULL; + + __mptcp_unaccepted_force_close(sk); + release_sock(sk); + + /* lockdep will report a false positive ABBA deadlock + * between cancel_work_sync and the listener socket. + * The involved locks belong to different sockets WRT + * the existing AB chain. + * Using a per socket key is problematic as key + * deregistration requires process context and must be + * performed at socket disposal time, in atomic + * context. + * Just tell lockdep to consider the listener socket + * released here. + */ + mutex_release(&listener_sk->sk_lock.dep_map, _RET_IP_); + mptcp_cancel_work(sk); + mutex_acquire(&listener_sk->sk_lock.dep_map, 0, 0, _RET_IP_); + + sock_put(sk); + } + + /* we are still under the listener msk socket lock */ + lock_sock_nested(listener_ssk, SINGLE_DEPTH_NESTING); +} + static int subflow_ulp_init(struct sock *sk) { struct inet_connection_sock *icsk = inet_csk(sk); |