diff options
Diffstat (limited to 'net')
37 files changed, 1372 insertions, 558 deletions
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index 83a6aacfab31..62edbf1b114e 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -508,12 +508,6 @@ static void le_setup(struct hci_request *req) /* Read LE Supported States */ hci_req_add(req, HCI_OP_LE_READ_SUPPORTED_STATES, 0, NULL); - /* Read LE White List Size */ - hci_req_add(req, HCI_OP_LE_READ_WHITE_LIST_SIZE, 0, NULL); - - /* Clear LE White List */ - hci_req_add(req, HCI_OP_LE_CLEAR_WHITE_LIST, 0, NULL); - /* LE-only controllers have LE implicitly enabled */ if (!lmp_bredr_capable(hdev)) hci_dev_set_flag(hdev, HCI_LE_ENABLED); @@ -832,6 +826,17 @@ static void hci_init3_req(struct hci_request *req, unsigned long opt) hci_req_add(req, HCI_OP_LE_READ_ADV_TX_POWER, 0, NULL); } + if (hdev->commands[26] & 0x40) { + /* Read LE White List Size */ + hci_req_add(req, HCI_OP_LE_READ_WHITE_LIST_SIZE, + 0, NULL); + } + + if (hdev->commands[26] & 0x80) { + /* Clear LE White List */ + hci_req_add(req, HCI_OP_LE_CLEAR_WHITE_LIST, 0, NULL); + } + if (hdev->le_features[0] & HCI_LE_DATA_LEN_EXT) { /* Read LE Maximum Data Length */ hci_req_add(req, HCI_OP_LE_READ_MAX_DATA_LEN, 0, NULL); diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 7c65ee200c29..66e8b6ee19a5 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -239,7 +239,7 @@ static u16 l2cap_alloc_cid(struct l2cap_conn *conn) else dyn_end = L2CAP_CID_DYN_END; - for (cid = L2CAP_CID_DYN_START; cid < dyn_end; cid++) { + for (cid = L2CAP_CID_DYN_START; cid <= dyn_end; cid++) { if (!__l2cap_get_chan_by_scid(conn, cid)) return cid; } @@ -5250,7 +5250,9 @@ static int l2cap_le_connect_rsp(struct l2cap_conn *conn, credits = __le16_to_cpu(rsp->credits); result = __le16_to_cpu(rsp->result); - if (result == L2CAP_CR_SUCCESS && (mtu < 23 || mps < 23)) + if (result == L2CAP_CR_SUCCESS && (mtu < 23 || mps < 23 || + dcid < L2CAP_CID_DYN_START || + dcid > L2CAP_CID_LE_DYN_END)) return -EPROTO; BT_DBG("dcid 0x%4.4x mtu %u mps %u credits %u result 0x%2.2x", @@ -5270,6 +5272,11 @@ static int l2cap_le_connect_rsp(struct l2cap_conn *conn, switch (result) { case L2CAP_CR_SUCCESS: + if (__l2cap_get_chan_by_dcid(conn, dcid)) { + err = -EBADSLT; + break; + } + chan->ident = 0; chan->dcid = dcid; chan->omtu = mtu; @@ -5437,9 +5444,16 @@ static int l2cap_le_connect_req(struct l2cap_conn *conn, goto response_unlock; } + /* Check for valid dynamic CID range */ + if (scid < L2CAP_CID_DYN_START || scid > L2CAP_CID_LE_DYN_END) { + result = L2CAP_CR_INVALID_SCID; + chan = NULL; + goto response_unlock; + } + /* Check if we already have channel with that dcid */ if (__l2cap_get_chan_by_dcid(conn, scid)) { - result = L2CAP_CR_NO_MEM; + result = L2CAP_CR_SCID_IN_USE; chan = NULL; goto response_unlock; } diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c index 80c34d70218c..f7e8dee64fc8 100644 --- a/net/bridge/br_stp.c +++ b/net/bridge/br_stp.c @@ -600,12 +600,17 @@ void __br_set_forward_delay(struct net_bridge *br, unsigned long t) int br_set_forward_delay(struct net_bridge *br, unsigned long val) { unsigned long t = clock_t_to_jiffies(val); - - if (t < BR_MIN_FORWARD_DELAY || t > BR_MAX_FORWARD_DELAY) - return -ERANGE; + int err = -ERANGE; spin_lock_bh(&br->lock); + if (br->stp_enabled != BR_NO_STP && + (t < BR_MIN_FORWARD_DELAY || t > BR_MAX_FORWARD_DELAY)) + goto unlock; + __br_set_forward_delay(br, t); + err = 0; + +unlock: spin_unlock_bh(&br->lock); - return 0; + return err; } diff --git a/net/core/dev.c b/net/core/dev.c index 8ce3f74cd6b9..ab9b8d0d115e 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -6402,7 +6402,7 @@ int __netdev_update_features(struct net_device *dev) struct net_device *upper, *lower; netdev_features_t features; struct list_head *iter; - int err = 0; + int err = -1; ASSERT_RTNL(); @@ -6419,7 +6419,7 @@ int __netdev_update_features(struct net_device *dev) features = netdev_sync_upper_features(dev, upper, features); if (dev->features == features) - return 0; + goto sync_lower; netdev_dbg(dev, "Features changed: %pNF -> %pNF\n", &dev->features, &features); @@ -6434,6 +6434,7 @@ int __netdev_update_features(struct net_device *dev) return -1; } +sync_lower: /* some features must be disabled on lower devices when disabled * on an upper device (think: bonding master or bridge) */ @@ -6443,7 +6444,7 @@ int __netdev_update_features(struct net_device *dev) if (!err) dev->features = features; - return 1; + return err < 0 ? 0 : 1; } /** diff --git a/net/core/dst.c b/net/core/dst.c index 2a1818065e12..e6dc77252fe9 100644 --- a/net/core/dst.c +++ b/net/core/dst.c @@ -306,7 +306,7 @@ void dst_release(struct dst_entry *dst) if (unlikely(newrefcnt < 0)) net_warn_ratelimited("%s: dst:%p refcnt:%d\n", __func__, dst, newrefcnt); - if (unlikely(dst->flags & DST_NOCACHE) && !newrefcnt) + if (!newrefcnt && unlikely(dst->flags & DST_NOCACHE)) call_rcu(&dst->rcu_head, dst_destroy_rcu); } } diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 3e87447e65c7..d97268e8ff10 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -923,14 +923,21 @@ static bool fib_valid_prefsrc(struct fib_config *cfg, __be32 fib_prefsrc) if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst || fib_prefsrc != cfg->fc_dst) { u32 tb_id = cfg->fc_table; + int rc; if (tb_id == RT_TABLE_MAIN) tb_id = RT_TABLE_LOCAL; - if (inet_addr_type_table(cfg->fc_nlinfo.nl_net, - fib_prefsrc, tb_id) != RTN_LOCAL) { - return false; + rc = inet_addr_type_table(cfg->fc_nlinfo.nl_net, + fib_prefsrc, tb_id); + + if (rc != RTN_LOCAL && tb_id != RT_TABLE_LOCAL) { + rc = inet_addr_type_table(cfg->fc_nlinfo.nl_net, + fib_prefsrc, RT_TABLE_LOCAL); } + + if (rc != RTN_LOCAL) + return false; } return true; } diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index 64aaf3522a59..6baf36e11808 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -2392,11 +2392,11 @@ int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf, struct ip_sf_socklist *psl; struct net *net = sock_net(sk); + ASSERT_RTNL(); + if (!ipv4_is_multicast(addr)) return -EINVAL; - rtnl_lock(); - imr.imr_multiaddr.s_addr = msf->imsf_multiaddr; imr.imr_address.s_addr = msf->imsf_interface; imr.imr_ifindex = 0; @@ -2417,7 +2417,6 @@ int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf, goto done; msf->imsf_fmode = pmc->sfmode; psl = rtnl_dereference(pmc->sflist); - rtnl_unlock(); if (!psl) { len = 0; count = 0; @@ -2436,7 +2435,6 @@ int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf, return -EFAULT; return 0; done: - rtnl_unlock(); return err; } @@ -2450,6 +2448,8 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf, struct inet_sock *inet = inet_sk(sk); struct ip_sf_socklist *psl; + ASSERT_RTNL(); + psin = (struct sockaddr_in *)&gsf->gf_group; if (psin->sin_family != AF_INET) return -EINVAL; @@ -2457,8 +2457,6 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf, if (!ipv4_is_multicast(addr)) return -EINVAL; - rtnl_lock(); - err = -EADDRNOTAVAIL; for_each_pmc_rtnl(inet, pmc) { @@ -2470,7 +2468,6 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf, goto done; gsf->gf_fmode = pmc->sfmode; psl = rtnl_dereference(pmc->sflist); - rtnl_unlock(); count = psl ? psl->sl_count : 0; copycount = count < gsf->gf_numsrc ? count : gsf->gf_numsrc; gsf->gf_numsrc = count; @@ -2490,7 +2487,6 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf, } return 0; done: - rtnl_unlock(); return err; } diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index c3c359ad66e3..5f73a7c03e27 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -1251,11 +1251,22 @@ EXPORT_SYMBOL(compat_ip_setsockopt); * the _received_ ones. The set sets the _sent_ ones. */ +static bool getsockopt_needs_rtnl(int optname) +{ + switch (optname) { + case IP_MSFILTER: + case MCAST_MSFILTER: + return true; + } + return false; +} + static int do_ip_getsockopt(struct sock *sk, int level, int optname, char __user *optval, int __user *optlen, unsigned int flags) { struct inet_sock *inet = inet_sk(sk); - int val; + bool needs_rtnl = getsockopt_needs_rtnl(optname); + int val, err = 0; int len; if (level != SOL_IP) @@ -1269,6 +1280,8 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, if (len < 0) return -EINVAL; + if (needs_rtnl) + rtnl_lock(); lock_sock(sk); switch (optname) { @@ -1386,39 +1399,35 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, case IP_MSFILTER: { struct ip_msfilter msf; - int err; if (len < IP_MSFILTER_SIZE(0)) { - release_sock(sk); - return -EINVAL; + err = -EINVAL; + goto out; } if (copy_from_user(&msf, optval, IP_MSFILTER_SIZE(0))) { - release_sock(sk); - return -EFAULT; + err = -EFAULT; + goto out; } err = ip_mc_msfget(sk, &msf, (struct ip_msfilter __user *)optval, optlen); - release_sock(sk); - return err; + goto out; } case MCAST_MSFILTER: { struct group_filter gsf; - int err; if (len < GROUP_FILTER_SIZE(0)) { - release_sock(sk); - return -EINVAL; + err = -EINVAL; + goto out; } if (copy_from_user(&gsf, optval, GROUP_FILTER_SIZE(0))) { - release_sock(sk); - return -EFAULT; + err = -EFAULT; + goto out; } err = ip_mc_gsfget(sk, &gsf, (struct group_filter __user *)optval, optlen); - release_sock(sk); - return err; + goto out; } case IP_MULTICAST_ALL: val = inet->mc_all; @@ -1485,6 +1494,12 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, return -EFAULT; } return 0; + +out: + release_sock(sk); + if (needs_rtnl) + rtnl_unlock(); + return err; } int ip_getsockopt(struct sock *sk, int level, diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c index 0e5591c2ee9f..6fb869f646bf 100644 --- a/net/ipv4/netfilter/nf_defrag_ipv4.c +++ b/net/ipv4/netfilter/nf_defrag_ipv4.c @@ -67,10 +67,9 @@ static unsigned int ipv4_conntrack_defrag(void *priv, const struct nf_hook_state *state) { struct sock *sk = skb->sk; - struct inet_sock *inet = inet_sk(skb->sk); - if (sk && (sk->sk_family == PF_INET) && - inet->nodefrag) + if (sk && sk_fullsock(sk) && (sk->sk_family == PF_INET) && + inet_sk(sk)->nodefrag) return NF_ACCEPT; #if IS_ENABLED(CONFIG_NF_CONNTRACK) diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 25300c5e283b..a0bd7a55193e 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -48,14 +48,14 @@ static void set_local_port_range(struct net *net, int range[2]) { bool same_parity = !((range[0] ^ range[1]) & 1); - write_seqlock(&net->ipv4.ip_local_ports.lock); + write_seqlock_bh(&net->ipv4.ip_local_ports.lock); if (same_parity && !net->ipv4.ip_local_ports.warned) { net->ipv4.ip_local_ports.warned = true; pr_err_ratelimited("ip_local_port_range: prefer different parity for start/end values.\n"); } net->ipv4.ip_local_ports.range[0] = range[0]; net->ipv4.ip_local_ports.range[1] = range[1]; - write_sequnlock(&net->ipv4.ip_local_ports.lock); + write_sequnlock_bh(&net->ipv4.ip_local_ports.lock); } /* Validate changes from /proc interface. */ diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 1c2648bbac4b..950e28c0cdf2 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -1326,6 +1326,8 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, if (__inet_inherit_port(sk, newsk) < 0) goto put_and_exit; *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash)); + if (*own_req) + tcp_move_syn(newtp, req); return newsk; diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index 3575dd1e5b67..ac6b1961ffeb 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -551,9 +551,6 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->rack.mstamp.v64 = 0; newtp->rack.advanced = 0; - newtp->saved_syn = req->saved_syn; - req->saved_syn = NULL; - TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS); } return newsk; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index d72fa90d6feb..d84742f003a9 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -418,6 +418,7 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev) if (err) { ipv6_mc_destroy_dev(ndev); del_timer(&ndev->regen_timer); + snmp6_unregister_dev(ndev); goto err_release; } /* protected by rtnl_lock */ diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index ea2f4d5440b5..5baa8e754e41 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -1140,14 +1140,18 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff * goto out; } *own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash)); - /* Clone pktoptions received with SYN, if we own the req */ - if (*own_req && ireq->pktopts) { - newnp->pktoptions = skb_clone(ireq->pktopts, - sk_gfp_atomic(sk, GFP_ATOMIC)); - consume_skb(ireq->pktopts); - ireq->pktopts = NULL; - if (newnp->pktoptions) - skb_set_owner_r(newnp->pktoptions, newsk); + if (*own_req) { + tcp_move_syn(newtp, req); + + /* Clone pktoptions received with SYN, if we own the req */ + if (ireq->pktopts) { + newnp->pktoptions = skb_clone(ireq->pktopts, + sk_gfp_atomic(sk, GFP_ATOMIC)); + consume_skb(ireq->pktopts); + ireq->pktopts = NULL; + if (newnp->pktoptions) + skb_set_owner_r(newnp->pktoptions, newsk); + } } return newsk; diff --git a/net/netfilter/nf_nat_redirect.c b/net/netfilter/nf_nat_redirect.c index 97b75f9bfbcd..d43869879fcf 100644 --- a/net/netfilter/nf_nat_redirect.c +++ b/net/netfilter/nf_nat_redirect.c @@ -55,7 +55,7 @@ nf_nat_redirect_ipv4(struct sk_buff *skb, rcu_read_lock(); indev = __in_dev_get_rcu(skb->dev); - if (indev != NULL) { + if (indev && indev->ifa_list) { ifa = indev->ifa_list; newdst = ifa->ifa_local; } diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c index f1d9e887f5b1..46453ab318db 100644 --- a/net/netfilter/nfnetlink.c +++ b/net/netfilter/nfnetlink.c @@ -492,7 +492,7 @@ static int nfnetlink_bind(struct net *net, int group) type = nfnl_group2type[group]; rcu_read_lock(); - ss = nfnetlink_get_subsys(type); + ss = nfnetlink_get_subsys(type << 8); rcu_read_unlock(); if (!ss) request_module("nfnetlink-subsys-%d", type); diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index e4ad2c24bc41..9dfaf4d55ee0 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -31,6 +31,7 @@ void nft_meta_get_eval(const struct nft_expr *expr, const struct nft_meta *priv = nft_expr_priv(expr); const struct sk_buff *skb = pkt->skb; const struct net_device *in = pkt->in, *out = pkt->out; + struct sock *sk; u32 *dest = ®s->data[priv->dreg]; switch (priv->key) { @@ -86,33 +87,35 @@ void nft_meta_get_eval(const struct nft_expr *expr, *(u16 *)dest = out->type; break; case NFT_META_SKUID: - if (skb->sk == NULL || !sk_fullsock(skb->sk)) + sk = skb_to_full_sk(skb); + if (!sk || !sk_fullsock(sk)) goto err; - read_lock_bh(&skb->sk->sk_callback_lock); - if (skb->sk->sk_socket == NULL || - skb->sk->sk_socket->file == NULL) { - read_unlock_bh(&skb->sk->sk_callback_lock); + read_lock_bh(&sk->sk_callback_lock); + if (sk->sk_socket == NULL || + sk->sk_socket->file == NULL) { + read_unlock_bh(&sk->sk_callback_lock); goto err; } *dest = from_kuid_munged(&init_user_ns, - skb->sk->sk_socket->file->f_cred->fsuid); - read_unlock_bh(&skb->sk->sk_callback_lock); + sk->sk_socket->file->f_cred->fsuid); + read_unlock_bh(&sk->sk_callback_lock); break; case NFT_META_SKGID: - if (skb->sk == NULL || !sk_fullsock(skb->sk)) + sk = skb_to_full_sk(skb); + if (!sk || !sk_fullsock(sk)) goto err; - read_lock_bh(&skb->sk->sk_callback_lock); - if (skb->sk->sk_socket == NULL || - skb->sk->sk_socket->file == NULL) { - read_unlock_bh(&skb->sk->sk_callback_lock); + read_lock_bh(&sk->sk_callback_lock); + if (sk->sk_socket == NULL || + sk->sk_socket->file == NULL) { + read_unlock_bh(&sk->sk_callback_lock); goto err; } *dest = from_kgid_munged(&init_user_ns, - skb->sk->sk_socket->file->f_cred->fsgid); - read_unlock_bh(&skb->sk->sk_callback_lock); + sk->sk_socket->file->f_cred->fsgid); + read_unlock_bh(&sk->sk_callback_lock); break; #ifdef CONFIG_IP_ROUTE_CLASSID case NFT_META_RTCLASSID: { @@ -168,9 +171,10 @@ void nft_meta_get_eval(const struct nft_expr *expr, break; #ifdef CONFIG_CGROUP_NET_CLASSID case NFT_META_CGROUP: - if (skb->sk == NULL || !sk_fullsock(skb->sk)) + sk = skb_to_full_sk(skb); + if (!sk || !sk_fullsock(sk)) goto err; - *dest = skb->sk->sk_classid; + *dest = sk->sk_classid; break; #endif default: diff --git a/net/netfilter/xt_TEE.c b/net/netfilter/xt_TEE.c index 899b06115fc5..3eff7b67cdf2 100644 --- a/net/netfilter/xt_TEE.c +++ b/net/netfilter/xt_TEE.c @@ -31,8 +31,9 @@ static unsigned int tee_tg4(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_tee_tginfo *info = par->targinfo; + int oif = info->priv ? info->priv->oif : 0; - nf_dup_ipv4(par->net, skb, par->hooknum, &info->gw.in, info->priv->oif); + nf_dup_ipv4(par->net, skb, par->hooknum, &info->gw.in, oif); return XT_CONTINUE; } @@ -42,8 +43,9 @@ static unsigned int tee_tg6(struct sk_buff *skb, const struct xt_action_param *par) { const struct xt_tee_tginfo *info = par->targinfo; + int oif = info->priv ? info->priv->oif : 0; - nf_dup_ipv6(par->net, skb, par->hooknum, &info->gw.in6, info->priv->oif); + nf_dup_ipv6(par->net, skb, par->hooknum, &info->gw.in6, oif); return XT_CONTINUE; } diff --git a/net/netfilter/xt_owner.c b/net/netfilter/xt_owner.c index ca2e577ed8ac..1302b475abcb 100644 --- a/net/netfilter/xt_owner.c +++ b/net/netfilter/xt_owner.c @@ -14,6 +14,7 @@ #include <linux/skbuff.h> #include <linux/file.h> #include <net/sock.h> +#include <net/inet_sock.h> #include <linux/netfilter/x_tables.h> #include <linux/netfilter/xt_owner.h> @@ -33,8 +34,9 @@ owner_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_owner_match_info *info = par->matchinfo; const struct file *filp; + struct sock *sk = skb_to_full_sk(skb); - if (skb->sk == NULL || skb->sk->sk_socket == NULL) + if (sk == NULL || sk->sk_socket == NULL) return (info->match ^ info->invert) == 0; else if (info->match & info->invert & XT_OWNER_SOCKET) /* @@ -43,7 +45,7 @@ owner_mt(const struct sk_buff *skb, struct xt_action_param *par) */ return false; - filp = skb->sk->sk_socket->file; + filp = sk->sk_socket->file; if (filp == NULL) return ((info->match ^ info->invert) & (XT_OWNER_UID | XT_OWNER_GID)) == 0; diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 691660b9b7ef..af399cac5205 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2911,22 +2911,40 @@ static int packet_release(struct socket *sock) * Attach a packet hook. */ -static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 proto) +static int packet_do_bind(struct sock *sk, const char *name, int ifindex, + __be16 proto) { struct packet_sock *po = pkt_sk(sk); struct net_device *dev_curr; __be16 proto_curr; bool need_rehook; + struct net_device *dev = NULL; + int ret = 0; + bool unlisted = false; - if (po->fanout) { - if (dev) - dev_put(dev); - + if (po->fanout) return -EINVAL; - } lock_sock(sk); spin_lock(&po->bind_lock); + rcu_read_lock(); + + if (name) { + dev = dev_get_by_name_rcu(sock_net(sk), name); + if (!dev) { + ret = -ENODEV; + goto out_unlock; + } + } else if (ifindex) { + dev = dev_get_by_index_rcu(sock_net(sk), ifindex); + if (!dev) { + ret = -ENODEV; + goto out_unlock; + } + } + + if (dev) + dev_hold(dev); proto_curr = po->prot_hook.type; dev_curr = po->prot_hook.dev; @@ -2934,14 +2952,29 @@ static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 proto) need_rehook = proto_curr != proto || dev_curr != dev; if (need_rehook) { - unregister_prot_hook(sk, true); + if (po->running) { + rcu_read_unlock(); + __unregister_prot_hook(sk, true); + rcu_read_lock(); + dev_curr = po->prot_hook.dev; + if (dev) + unlisted = !dev_get_by_index_rcu(sock_net(sk), + dev->ifindex); + } po->num = proto; po->prot_hook.type = proto; - po->prot_hook.dev = dev; - po->ifindex = dev ? dev->ifindex : 0; - packet_cached_dev_assign(po, dev); + if (unlikely(unlisted)) { + dev_put(dev); + po->prot_hook.dev = NULL; + po->ifindex = -1; + packet_cached_dev_reset(po); + } else { + po->prot_hook.dev = dev; + po->ifindex = dev ? dev->ifindex : 0; + packet_cached_dev_assign(po, dev); + } } if (dev_curr) dev_put(dev_curr); @@ -2949,7 +2982,7 @@ static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 proto) if (proto == 0 || !need_rehook) goto out_unlock; - if (!dev || (dev->flags & IFF_UP)) { + if (!unlisted && (!dev || (dev->flags & IFF_UP))) { register_prot_hook(sk); } else { sk->sk_err = ENETDOWN; @@ -2958,9 +2991,10 @@ static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 proto) } out_unlock: + rcu_read_unlock(); spin_unlock(&po->bind_lock); release_sock(sk); - return 0; + return ret; } /* @@ -2972,8 +3006,6 @@ static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr, { struct sock *sk = sock->sk; char name[15]; - struct net_device *dev; - int err = -ENODEV; /* * Check legality @@ -2983,19 +3015,13 @@ static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr, return -EINVAL; strlcpy(name, uaddr->sa_data, sizeof(name)); - dev = dev_get_by_name(sock_net(sk), name); - if (dev) - err = packet_do_bind(sk, dev, pkt_sk(sk)->num); - return err; + return packet_do_bind(sk, name, 0, pkt_sk(sk)->num); } static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr; struct sock *sk = sock->sk; - struct net_device *dev = NULL; - int err; - /* * Check legality @@ -3006,16 +3032,8 @@ static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len if (sll->sll_family != AF_PACKET) return -EINVAL; - if (sll->sll_ifindex) { - err = -ENODEV; - dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex); - if (dev == NULL) - goto out; - } - err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num); - -out: - return err; + return packet_do_bind(sk, NULL, sll->sll_ifindex, + sll->sll_protocol ? : pkt_sk(sk)->num); } static struct proto packet_proto = { diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index 536838b657bf..fbfec6a18839 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -22,6 +22,7 @@ #include <linux/if_vlan.h> #include <linux/slab.h> #include <linux/module.h> +#include <net/inet_sock.h> #include <net/pkt_cls.h> #include <net/ip.h> @@ -197,8 +198,11 @@ static u32 flow_get_rtclassid(const struct sk_buff *skb) static u32 flow_get_skuid(const struct sk_buff *skb) { - if (skb->sk && skb->sk->sk_socket && skb->sk->sk_socket->file) { - kuid_t skuid = skb->sk->sk_socket->file->f_cred->fsuid; + struct sock *sk = skb_to_full_sk(skb); + + if (sk && sk->sk_socket && sk->sk_socket->file) { + kuid_t skuid = sk->sk_socket->file->f_cred->fsuid; + return from_kuid(&init_user_ns, skuid); } return 0; @@ -206,8 +210,11 @@ static u32 flow_get_skuid(const struct sk_buff *skb) static u32 flow_get_skgid(const struct sk_buff *skb) { - if (skb->sk && skb->sk->sk_socket && skb->sk->sk_socket->file) { - kgid_t skgid = skb->sk->sk_socket->file->f_cred->fsgid; + struct sock *sk = skb_to_full_sk(skb); + + if (sk && sk->sk_socket && sk->sk_socket->file) { + kgid_t skgid = sk->sk_socket->file->f_cred->fsgid; + return from_kgid(&init_user_ns, skgid); } return 0; diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c index b5294ce20cd4..f2aabc0089da 100644 --- a/net/sched/em_meta.c +++ b/net/sched/em_meta.c @@ -343,119 +343,145 @@ META_COLLECTOR(int_sk_refcnt) META_COLLECTOR(int_sk_rcvbuf) { - if (skip_nonlocal(skb)) { + const struct sock *sk = skb_to_full_sk(skb); + + if (!sk) { *err = -1; return; } - dst->value = skb->sk->sk_rcvbuf; + dst->value = sk->sk_rcvbuf; } META_COLLECTOR(int_sk_shutdown) { - if (skip_nonlocal(skb)) { + const struct sock *sk = skb_to_full_sk(skb); + + if (!sk) { *err = -1; return; } - dst->value = skb->sk->sk_shutdown; + dst->value = sk->sk_shutdown; } META_COLLECTOR(int_sk_proto) { - if (skip_nonlocal(skb)) { + const struct sock *sk = skb_to_full_sk(skb); + + if (!sk) { *err = -1; return; } - dst->value = skb->sk->sk_protocol; + dst->value = sk->sk_protocol; } META_COLLECTOR(int_sk_type) { - if (skip_nonlocal(skb)) { + const struct sock *sk = skb_to_full_sk(skb); + + if (!sk) { *err = -1; return; } - dst->value = skb->sk->sk_type; + dst->value = sk->sk_type; } META_COLLECTOR(int_sk_rmem_alloc) { - if (skip_nonlocal(skb)) { + const struct sock *sk = skb_to_full_sk(skb); + + if (!sk) { *err = -1; return; } - dst->value = sk_rmem_alloc_get(skb->sk); + dst->value = sk_rmem_alloc_get(sk); } META_COLLECTOR(int_sk_wmem_alloc) { - if (skip_nonlocal(skb)) { + const struct sock *sk = skb_to_full_sk(skb); + + if (!sk) { *err = -1; return; } - dst->value = sk_wmem_alloc_get(skb->sk); + dst->value = sk_wmem_alloc_get(sk); } META_COLLECTOR(int_sk_omem_alloc) { - if (skip_nonlocal(skb)) { + const struct sock *sk = skb_to_full_sk(skb); + + if (!sk) { *err = -1; return; } - dst->value = atomic_read(&skb->sk->sk_omem_alloc); + dst->value = atomic_read(&sk->sk_omem_alloc); } META_COLLECTOR(int_sk_rcv_qlen) { - if (skip_nonlocal(skb)) { + const struct sock *sk = skb_to_full_sk(skb); + + if (!sk) { *err = -1; return; } - dst->value = skb->sk->sk_receive_queue.qlen; + dst->value = sk->sk_receive_queue.qlen; } META_COLLECTOR(int_sk_snd_qlen) { - if (skip_nonlocal(skb)) { + const struct sock *sk = skb_to_full_sk(skb); + + if (!sk) { *err = -1; return; } - dst->value = skb->sk->sk_write_queue.qlen; + dst->value = sk->sk_write_queue.qlen; } META_COLLECTOR(int_sk_wmem_queued) { - if (skip_nonlocal(skb)) { + const struct sock *sk = skb_to_full_sk(skb); + + if (!sk) { *err = -1; return; } - dst->value = skb->sk->sk_wmem_queued; + dst->value = sk->sk_wmem_queued; } META_COLLECTOR(int_sk_fwd_alloc) { - if (skip_nonlocal(skb)) { + const struct sock *sk = skb_to_full_sk(skb); + + if (!sk) { *err = -1; return; } - dst->value = skb->sk->sk_forward_alloc; + dst->value = sk->sk_forward_alloc; } META_COLLECTOR(int_sk_sndbuf) { - if (skip_nonlocal(skb)) { + const struct sock *sk = skb_to_full_sk(skb); + + if (!sk) { *err = -1; return; } - dst->value = skb->sk->sk_sndbuf; + dst->value = sk->sk_sndbuf; } META_COLLECTOR(int_sk_alloc) { - if (skip_nonlocal(skb)) { + const struct sock *sk = skb_to_full_sk(skb); + + if (!sk) { *err = -1; return; } - dst->value = (__force int) skb->sk->sk_allocation; + dst->value = (__force int) sk->sk_allocation; } META_COLLECTOR(int_sk_hash) @@ -469,92 +495,112 @@ META_COLLECTOR(int_sk_hash) META_COLLECTOR(int_sk_lingertime) { - if (skip_nonlocal(skb)) { + const struct sock *sk = skb_to_full_sk(skb); + + if (!sk) { *err = -1; return; } - dst->value = skb->sk->sk_lingertime / HZ; + dst->value = sk->sk_lingertime / HZ; } META_COLLECTOR(int_sk_err_qlen) { - if (skip_nonlocal(skb)) { + const struct sock *sk = skb_to_full_sk(skb); + + if (!sk) { *err = -1; return; } - dst->value = skb->sk->sk_error_queue.qlen; + dst->value = sk->sk_error_queue.qlen; } META_COLLECTOR(int_sk_ack_bl) { - if (skip_nonlocal(skb)) { + const struct sock *sk = skb_to_full_sk(skb); + + if (!sk) { *err = -1; return; } - dst->value = skb->sk->sk_ack_backlog; + dst->value = sk->sk_ack_backlog; } META_COLLECTOR(int_sk_max_ack_bl) { - if (skip_nonlocal(skb)) { + const struct sock *sk = skb_to_full_sk(skb); + + if (!sk) { *err = -1; return; } - dst->value = skb->sk->sk_max_ack_backlog; + dst->value = sk->sk_max_ack_backlog; } META_COLLECTOR(int_sk_prio) { - if (skip_nonlocal(skb)) { + const struct sock *sk = skb_to_full_sk(skb); + + if (!sk) { *err = -1; return; } - dst->value = skb->sk->sk_priority; + dst->value = sk->sk_priority; } META_COLLECTOR(int_sk_rcvlowat) { - if (skip_nonlocal(skb)) { + const struct sock *sk = skb_to_full_sk(skb); + + if (!sk) { *err = -1; return; } - dst->value = skb->sk->sk_rcvlowat; + dst->value = sk->sk_rcvlowat; } META_COLLECTOR(int_sk_rcvtimeo) { - if (skip_nonlocal(skb)) { + const struct sock *sk = skb_to_full_sk(skb); + + if (!sk) { *err = -1; return; } - dst->value = skb->sk->sk_rcvtimeo / HZ; + dst->value = sk->sk_rcvtimeo / HZ; } META_COLLECTOR(int_sk_sndtimeo) { - if (skip_nonlocal(skb)) { + const struct sock *sk = skb_to_full_sk(skb); + + if (!sk) { *err = -1; return; } - dst->value = skb->sk->sk_sndtimeo / HZ; + dst->value = sk->sk_sndtimeo / HZ; } META_COLLECTOR(int_sk_sendmsg_off) { - if (skip_nonlocal(skb)) { + const struct sock *sk = skb_to_full_sk(skb); + + if (!sk) { *err = -1; return; } - dst->value = skb->sk->sk_frag.offset; + dst->value = sk->sk_frag.offset; } META_COLLECTOR(int_sk_write_pend) { - if (skip_nonlocal(skb)) { + const struct sock *sk = skb_to_full_sk(skb); + + if (!sk) { *err = -1; return; } - dst->value = skb->sk->sk_write_pending; + dst->value = sk->sk_write_pending; } /************************************************************************** diff --git a/net/sctp/transport.c b/net/sctp/transport.c index a0a431824f63..aab9e3f29755 100644 --- a/net/sctp/transport.c +++ b/net/sctp/transport.c @@ -331,7 +331,7 @@ void sctp_transport_update_rto(struct sctp_transport *tp, __u32 rtt) * 1/8, rto_alpha would be expressed as 3. */ tp->rttvar = tp->rttvar - (tp->rttvar >> net->sctp.rto_beta) - + (((__u32)abs64((__s64)tp->srtt - (__s64)rtt)) >> net->sctp.rto_beta); + + (((__u32)abs((__s64)tp->srtt - (__s64)rtt)) >> net->sctp.rto_beta); tp->srtt = tp->srtt - (tp->srtt >> net->sctp.rto_alpha) + (rtt >> net->sctp.rto_alpha); } else { diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c index 6255d141133b..229956bf8457 100644 --- a/net/sunrpc/backchannel_rqst.c +++ b/net/sunrpc/backchannel_rqst.c @@ -138,6 +138,14 @@ out_free: */ int xprt_setup_backchannel(struct rpc_xprt *xprt, unsigned int min_reqs) { + if (!xprt->ops->bc_setup) + return 0; + return xprt->ops->bc_setup(xprt, min_reqs); +} +EXPORT_SYMBOL_GPL(xprt_setup_backchannel); + +int xprt_setup_bc(struct rpc_xprt *xprt, unsigned int min_reqs) +{ struct rpc_rqst *req; struct list_head tmp_list; int i; @@ -192,7 +200,6 @@ out_free: dprintk("RPC: setup backchannel transport failed\n"); return -ENOMEM; } -EXPORT_SYMBOL_GPL(xprt_setup_backchannel); /** * xprt_destroy_backchannel - Destroys the backchannel preallocated structures. @@ -205,6 +212,13 @@ EXPORT_SYMBOL_GPL(xprt_setup_backchannel); */ void xprt_destroy_backchannel(struct rpc_xprt *xprt, unsigned int max_reqs) { + if (xprt->ops->bc_destroy) + xprt->ops->bc_destroy(xprt, max_reqs); +} +EXPORT_SYMBOL_GPL(xprt_destroy_backchannel); + +void xprt_destroy_bc(struct rpc_xprt *xprt, unsigned int max_reqs) +{ struct rpc_rqst *req = NULL, *tmp = NULL; dprintk("RPC: destroy backchannel transport\n"); @@ -227,7 +241,6 @@ out: dprintk("RPC: backchannel list empty= %s\n", list_empty(&xprt->bc_pa_list) ? "true" : "false"); } -EXPORT_SYMBOL_GPL(xprt_destroy_backchannel); static struct rpc_rqst *xprt_alloc_bc_request(struct rpc_xprt *xprt, __be32 xid) { @@ -264,6 +277,13 @@ void xprt_free_bc_request(struct rpc_rqst *req) { struct rpc_xprt *xprt = req->rq_xprt; + xprt->ops->bc_free_rqst(req); +} + +void xprt_free_bc_rqst(struct rpc_rqst *req) +{ + struct rpc_xprt *xprt = req->rq_xprt; + dprintk("RPC: free backchannel req=%p\n", req); req->rq_connect_cookie = xprt->connect_cookie - 1; diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index a8f579df14d8..bc5b7b5032ca 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -1367,11 +1367,6 @@ bc_svc_process(struct svc_serv *serv, struct rpc_rqst *req, /* reset result send buffer "put" position */ resv->iov_len = 0; - if (rqstp->rq_prot != IPPROTO_TCP) { - printk(KERN_ERR "No support for Non-TCP transports!\n"); - BUG(); - } - /* * Skip the next two words because they've already been * processed in the transport diff --git a/net/sunrpc/sysctl.c b/net/sunrpc/sysctl.c index 887f0183b4c6..c88d9bc06f5c 100644 --- a/net/sunrpc/sysctl.c +++ b/net/sunrpc/sysctl.c @@ -76,7 +76,7 @@ static int proc_dodebug(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, loff_t *ppos) { - char tmpbuf[20], c, *s; + char tmpbuf[20], c, *s = NULL; char __user *p; unsigned int value; size_t left, len; @@ -103,23 +103,24 @@ proc_dodebug(struct ctl_table *table, int write, return -EFAULT; tmpbuf[left] = '\0'; - for (s = tmpbuf, value = 0; '0' <= *s && *s <= '9'; s++, left--) - value = 10 * value + (*s - '0'); - if (*s && !isspace(*s)) - return -EINVAL; - while (left && isspace(*s)) - left--, s++; + value = simple_strtol(tmpbuf, &s, 0); + if (s) { + left -= (s - tmpbuf); + if (left && !isspace(*s)) + return -EINVAL; + while (left && isspace(*s)) + left--, s++; + } else + left = 0; *(unsigned int *) table->data = value; /* Display the RPC tasks on writing to rpc_debug */ if (strcmp(table->procname, "rpc_debug") == 0) rpc_show_tasks(&init_net); } else { - if (!access_ok(VERIFY_WRITE, buffer, left)) - return -EFAULT; - len = sprintf(tmpbuf, "%d", *(unsigned int *) table->data); + len = sprintf(tmpbuf, "0x%04x", *(unsigned int *) table->data); if (len > left) len = left; - if (__copy_to_user(buffer, tmpbuf, len)) + if (copy_to_user(buffer, tmpbuf, len)) return -EFAULT; if ((left -= len) > 0) { if (put_user('\n', (char __user *)buffer + len)) diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile index 48913de240bd..33f99d3004f2 100644 --- a/net/sunrpc/xprtrdma/Makefile +++ b/net/sunrpc/xprtrdma/Makefile @@ -5,3 +5,4 @@ rpcrdma-y := transport.o rpc_rdma.o verbs.o \ svc_rdma.o svc_rdma_transport.o \ svc_rdma_marshal.o svc_rdma_sendto.o svc_rdma_recvfrom.o \ module.o +rpcrdma-$(CONFIG_SUNRPC_BACKCHANNEL) += backchannel.o diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c new file mode 100644 index 000000000000..2dcb44f69e53 --- /dev/null +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -0,0 +1,394 @@ +/* + * Copyright (c) 2015 Oracle. All rights reserved. + * + * Support for backward direction RPCs on RPC/RDMA. + */ + +#include <linux/module.h> +#include <linux/sunrpc/xprt.h> +#include <linux/sunrpc/svc.h> +#include <linux/sunrpc/svc_xprt.h> + +#include "xprt_rdma.h" + +#if IS_ENABLED(CONFIG_SUNRPC_DEBUG) +# define RPCDBG_FACILITY RPCDBG_TRANS +#endif + +#define RPCRDMA_BACKCHANNEL_DEBUG + +static void rpcrdma_bc_free_rqst(struct rpcrdma_xprt *r_xprt, + struct rpc_rqst *rqst) +{ + struct rpcrdma_buffer *buf = &r_xprt->rx_buf; + struct rpcrdma_req *req = rpcr_to_rdmar(rqst); + + spin_lock(&buf->rb_reqslock); + list_del(&req->rl_all); + spin_unlock(&buf->rb_reqslock); + + rpcrdma_destroy_req(&r_xprt->rx_ia, req); + + kfree(rqst); +} + +static int rpcrdma_bc_setup_rqst(struct rpcrdma_xprt *r_xprt, + struct rpc_rqst *rqst) +{ + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + struct rpcrdma_regbuf *rb; + struct rpcrdma_req *req; + struct xdr_buf *buf; + size_t size; + + req = rpcrdma_create_req(r_xprt); + if (!req) + return -ENOMEM; + req->rl_backchannel = true; + + size = RPCRDMA_INLINE_WRITE_THRESHOLD(rqst); + rb = rpcrdma_alloc_regbuf(ia, size, GFP_KERNEL); + if (IS_ERR(rb)) + goto out_fail; + req->rl_rdmabuf = rb; + + size += RPCRDMA_INLINE_READ_THRESHOLD(rqst); + rb = rpcrdma_alloc_regbuf(ia, size, GFP_KERNEL); + if (IS_ERR(rb)) + goto out_fail; + rb->rg_owner = req; + req->rl_sendbuf = rb; + /* so that rpcr_to_rdmar works when receiving a request */ + rqst->rq_buffer = (void *)req->rl_sendbuf->rg_base; + + buf = &rqst->rq_snd_buf; + buf->head[0].iov_base = rqst->rq_buffer; + buf->head[0].iov_len = 0; + buf->tail[0].iov_base = NULL; + buf->tail[0].iov_len = 0; + buf->page_len = 0; + buf->len = 0; + buf->buflen = size; + + return 0; + +out_fail: + rpcrdma_bc_free_rqst(r_xprt, rqst); + return -ENOMEM; +} + +/* Allocate and add receive buffers to the rpcrdma_buffer's + * existing list of rep's. These are released when the + * transport is destroyed. + */ +static int rpcrdma_bc_setup_reps(struct rpcrdma_xprt *r_xprt, + unsigned int count) +{ + struct rpcrdma_buffer *buffers = &r_xprt->rx_buf; + struct rpcrdma_rep *rep; + unsigned long flags; + int rc = 0; + + while (count--) { + rep = rpcrdma_create_rep(r_xprt); + if (IS_ERR(rep)) { + pr_err("RPC: %s: reply buffer alloc failed\n", + __func__); + rc = PTR_ERR(rep); + break; + } + + spin_lock_irqsave(&buffers->rb_lock, flags); + list_add(&rep->rr_list, &buffers->rb_recv_bufs); + spin_unlock_irqrestore(&buffers->rb_lock, flags); + } + + return rc; +} + +/** + * xprt_rdma_bc_setup - Pre-allocate resources for handling backchannel requests + * @xprt: transport associated with these backchannel resources + * @reqs: number of concurrent incoming requests to expect + * + * Returns 0 on success; otherwise a negative errno + */ +int xprt_rdma_bc_setup(struct rpc_xprt *xprt, unsigned int reqs) +{ + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + struct rpcrdma_buffer *buffer = &r_xprt->rx_buf; + struct rpc_rqst *rqst; + unsigned int i; + int rc; + + /* The backchannel reply path returns each rpc_rqst to the + * bc_pa_list _after_ the reply is sent. If the server is + * faster than the client, it can send another backward + * direction request before the rpc_rqst is returned to the + * list. The client rejects the request in this case. + * + * Twice as many rpc_rqsts are prepared to ensure there is + * always an rpc_rqst available as soon as a reply is sent. + */ + if (reqs > RPCRDMA_BACKWARD_WRS >> 1) + goto out_err; + + for (i = 0; i < (reqs << 1); i++) { + rqst = kzalloc(sizeof(*rqst), GFP_KERNEL); + if (!rqst) { + pr_err("RPC: %s: Failed to create bc rpc_rqst\n", + __func__); + goto out_free; + } + + rqst->rq_xprt = &r_xprt->rx_xprt; + INIT_LIST_HEAD(&rqst->rq_list); + INIT_LIST_HEAD(&rqst->rq_bc_list); + + if (rpcrdma_bc_setup_rqst(r_xprt, rqst)) + goto out_free; + + spin_lock_bh(&xprt->bc_pa_lock); + list_add(&rqst->rq_bc_pa_list, &xprt->bc_pa_list); + spin_unlock_bh(&xprt->bc_pa_lock); + } + + rc = rpcrdma_bc_setup_reps(r_xprt, reqs); + if (rc) + goto out_free; + + rc = rpcrdma_ep_post_extra_recv(r_xprt, reqs); + if (rc) + goto out_free; + + buffer->rb_bc_srv_max_requests = reqs; + request_module("svcrdma"); + + return 0; + +out_free: + xprt_rdma_bc_destroy(xprt, reqs); + +out_err: + pr_err("RPC: %s: setup backchannel transport failed\n", __func__); + return -ENOMEM; +} + +/** + * xprt_rdma_bc_up - Create transport endpoint for backchannel service + * @serv: server endpoint + * @net: network namespace + * + * The "xprt" is an implied argument: it supplies the name of the + * backchannel transport class. + * + * Returns zero on success, negative errno on failure + */ +int xprt_rdma_bc_up(struct svc_serv *serv, struct net *net) +{ + int ret; + + ret = svc_create_xprt(serv, "rdma-bc", net, PF_INET, 0, 0); + if (ret < 0) + return ret; + return 0; +} + +/** + * rpcrdma_bc_marshal_reply - Send backwards direction reply + * @rqst: buffer containing RPC reply data + * + * Returns zero on success. + */ +int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst) +{ + struct rpc_xprt *xprt = rqst->rq_xprt; + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + struct rpcrdma_req *req = rpcr_to_rdmar(rqst); + struct rpcrdma_msg *headerp; + size_t rpclen; + + headerp = rdmab_to_msg(req->rl_rdmabuf); + headerp->rm_xid = rqst->rq_xid; + headerp->rm_vers = rpcrdma_version; + headerp->rm_credit = + cpu_to_be32(r_xprt->rx_buf.rb_bc_srv_max_requests); + headerp->rm_type = rdma_msg; + headerp->rm_body.rm_chunks[0] = xdr_zero; + headerp->rm_body.rm_chunks[1] = xdr_zero; + headerp->rm_body.rm_chunks[2] = xdr_zero; + + rpclen = rqst->rq_svec[0].iov_len; + + pr_info("RPC: %s: rpclen %zd headerp 0x%p lkey 0x%x\n", + __func__, rpclen, headerp, rdmab_lkey(req->rl_rdmabuf)); + pr_info("RPC: %s: RPC/RDMA: %*ph\n", + __func__, (int)RPCRDMA_HDRLEN_MIN, headerp); + pr_info("RPC: %s: RPC: %*ph\n", + __func__, (int)rpclen, rqst->rq_svec[0].iov_base); + + req->rl_send_iov[0].addr = rdmab_addr(req->rl_rdmabuf); + req->rl_send_iov[0].length = RPCRDMA_HDRLEN_MIN; + req->rl_send_iov[0].lkey = rdmab_lkey(req->rl_rdmabuf); + + req->rl_send_iov[1].addr = rdmab_addr(req->rl_sendbuf); + req->rl_send_iov[1].length = rpclen; + req->rl_send_iov[1].lkey = rdmab_lkey(req->rl_sendbuf); + + req->rl_niovs = 2; + return 0; +} + +/** + * xprt_rdma_bc_destroy - Release resources for handling backchannel requests + * @xprt: transport associated with these backchannel resources + * @reqs: number of incoming requests to destroy; ignored + */ +void xprt_rdma_bc_destroy(struct rpc_xprt *xprt, unsigned int reqs) +{ + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + struct rpc_rqst *rqst, *tmp; + + spin_lock_bh(&xprt->bc_pa_lock); + list_for_each_entry_safe(rqst, tmp, &xprt->bc_pa_list, rq_bc_pa_list) { + list_del(&rqst->rq_bc_pa_list); + spin_unlock_bh(&xprt->bc_pa_lock); + + rpcrdma_bc_free_rqst(r_xprt, rqst); + + spin_lock_bh(&xprt->bc_pa_lock); + } + spin_unlock_bh(&xprt->bc_pa_lock); +} + +/** + * xprt_rdma_bc_free_rqst - Release a backchannel rqst + * @rqst: request to release + */ +void xprt_rdma_bc_free_rqst(struct rpc_rqst *rqst) +{ + struct rpc_xprt *xprt = rqst->rq_xprt; + + smp_mb__before_atomic(); + WARN_ON_ONCE(!test_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state)); + clear_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state); + smp_mb__after_atomic(); + + spin_lock_bh(&xprt->bc_pa_lock); + list_add_tail(&rqst->rq_bc_pa_list, &xprt->bc_pa_list); + spin_unlock_bh(&xprt->bc_pa_lock); +} + +/** + * rpcrdma_bc_receive_call - Handle a backward direction call + * @xprt: transport receiving the call + * @rep: receive buffer containing the call + * + * Called in the RPC reply handler, which runs in a tasklet. + * Be quick about it. + * + * Operational assumptions: + * o Backchannel credits are ignored, just as the NFS server + * forechannel currently does + * o The ULP manages a replay cache (eg, NFSv4.1 sessions). + * No replay detection is done at the transport level + */ +void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt, + struct rpcrdma_rep *rep) +{ + struct rpc_xprt *xprt = &r_xprt->rx_xprt; + struct rpcrdma_msg *headerp; + struct svc_serv *bc_serv; + struct rpcrdma_req *req; + struct rpc_rqst *rqst; + struct xdr_buf *buf; + size_t size; + __be32 *p; + + headerp = rdmab_to_msg(rep->rr_rdmabuf); +#ifdef RPCRDMA_BACKCHANNEL_DEBUG + pr_info("RPC: %s: callback XID %08x, length=%u\n", + __func__, be32_to_cpu(headerp->rm_xid), rep->rr_len); + pr_info("RPC: %s: %*ph\n", __func__, rep->rr_len, headerp); +#endif + + /* Sanity check: + * Need at least enough bytes for RPC/RDMA header, as code + * here references the header fields by array offset. Also, + * backward calls are always inline, so ensure there + * are some bytes beyond the RPC/RDMA header. + */ + if (rep->rr_len < RPCRDMA_HDRLEN_MIN + 24) + goto out_short; + p = (__be32 *)((unsigned char *)headerp + RPCRDMA_HDRLEN_MIN); + size = rep->rr_len - RPCRDMA_HDRLEN_MIN; + + /* Grab a free bc rqst */ + spin_lock(&xprt->bc_pa_lock); + if (list_empty(&xprt->bc_pa_list)) { + spin_unlock(&xprt->bc_pa_lock); + goto out_overflow; + } + rqst = list_first_entry(&xprt->bc_pa_list, + struct rpc_rqst, rq_bc_pa_list); + list_del(&rqst->rq_bc_pa_list); + spin_unlock(&xprt->bc_pa_lock); +#ifdef RPCRDMA_BACKCHANNEL_DEBUG + pr_info("RPC: %s: using rqst %p\n", __func__, rqst); +#endif + + /* Prepare rqst */ + rqst->rq_reply_bytes_recvd = 0; + rqst->rq_bytes_sent = 0; + rqst->rq_xid = headerp->rm_xid; + set_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state); + + buf = &rqst->rq_rcv_buf; + memset(buf, 0, sizeof(*buf)); + buf->head[0].iov_base = p; + buf->head[0].iov_len = size; + buf->len = size; + + /* The receive buffer has to be hooked to the rpcrdma_req + * so that it can be reposted after the server is done + * parsing it but just before sending the backward + * direction reply. + */ + req = rpcr_to_rdmar(rqst); +#ifdef RPCRDMA_BACKCHANNEL_DEBUG + pr_info("RPC: %s: attaching rep %p to req %p\n", + __func__, rep, req); +#endif + req->rl_reply = rep; + + /* Defeat the retransmit detection logic in send_request */ + req->rl_connect_cookie = 0; + + /* Queue rqst for ULP's callback service */ + bc_serv = xprt->bc_serv; + spin_lock(&bc_serv->sv_cb_lock); + list_add(&rqst->rq_bc_list, &bc_serv->sv_cb_list); + spin_unlock(&bc_serv->sv_cb_lock); + + wake_up(&bc_serv->sv_cb_waitq); + + r_xprt->rx_stats.bcall_count++; + return; + +out_overflow: + pr_warn("RPC/RDMA backchannel overflow\n"); + xprt_disconnect_done(xprt); + /* This receive buffer gets reposted automatically + * when the connection is re-established. + */ + return; + +out_short: + pr_warn("RPC/RDMA short backward direction call\n"); + + if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, &r_xprt->rx_ep, rep)) + xprt_disconnect_done(xprt); + else + pr_warn("RPC: %s: reposting rep %p\n", + __func__, rep); +} diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index a1434447b0d6..88cf9e7269c2 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -256,8 +256,11 @@ frwr_sendcompletion(struct ib_wc *wc) /* WARNING: Only wr_id and status are reliable at this point */ r = (struct rpcrdma_mw *)(unsigned long)wc->wr_id; - pr_warn("RPC: %s: frmr %p flushed, status %s (%d)\n", - __func__, r, ib_wc_status_msg(wc->status), wc->status); + if (wc->status == IB_WC_WR_FLUSH_ERR) + dprintk("RPC: %s: frmr %p flushed\n", __func__, r); + else + pr_warn("RPC: %s: frmr %p error, status %s (%d)\n", + __func__, r, ib_wc_status_msg(wc->status), wc->status); r->r.frmr.fr_state = FRMR_IS_STALE; } diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index bc8bd6577467..c10d9699441c 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -441,6 +441,11 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst) enum rpcrdma_chunktype rtype, wtype; struct rpcrdma_msg *headerp; +#if defined(CONFIG_SUNRPC_BACKCHANNEL) + if (test_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state)) + return rpcrdma_bc_marshal_reply(rqst); +#endif + /* * rpclen gets amount of data in first buffer, which is the * pre-registered buffer. @@ -711,6 +716,37 @@ rpcrdma_connect_worker(struct work_struct *work) spin_unlock_bh(&xprt->transport_lock); } +#if defined(CONFIG_SUNRPC_BACKCHANNEL) +/* By convention, backchannel calls arrive via rdma_msg type + * messages, and never populate the chunk lists. This makes + * the RPC/RDMA header small and fixed in size, so it is + * straightforward to check the RPC header's direction field. + */ +static bool +rpcrdma_is_bcall(struct rpcrdma_msg *headerp) +{ + __be32 *p = (__be32 *)headerp; + + if (headerp->rm_type != rdma_msg) + return false; + if (headerp->rm_body.rm_chunks[0] != xdr_zero) + return false; + if (headerp->rm_body.rm_chunks[1] != xdr_zero) + return false; + if (headerp->rm_body.rm_chunks[2] != xdr_zero) + return false; + + /* sanity */ + if (p[7] != headerp->rm_xid) + return false; + /* call direction */ + if (p[8] != cpu_to_be32(RPC_CALL)) + return false; + + return true; +} +#endif /* CONFIG_SUNRPC_BACKCHANNEL */ + /* * This function is called when an async event is posted to * the connection which changes the connection state. All it @@ -723,8 +759,8 @@ rpcrdma_conn_func(struct rpcrdma_ep *ep) schedule_delayed_work(&ep->rep_connect_worker, 0); } -/* - * Called as a tasklet to do req/reply match and complete a request +/* Process received RPC/RDMA messages. + * * Errors must result in the RPC task either being awakened, or * allowed to timeout, to discover the errors at that time. */ @@ -741,52 +777,32 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep) unsigned long cwnd; u32 credits; - /* Check status. If bad, signal disconnect and return rep to pool */ - if (rep->rr_len == ~0U) { - rpcrdma_recv_buffer_put(rep); - if (r_xprt->rx_ep.rep_connected == 1) { - r_xprt->rx_ep.rep_connected = -EIO; - rpcrdma_conn_func(&r_xprt->rx_ep); - } - return; - } - if (rep->rr_len < RPCRDMA_HDRLEN_MIN) { - dprintk("RPC: %s: short/invalid reply\n", __func__); - goto repost; - } + dprintk("RPC: %s: incoming rep %p\n", __func__, rep); + + if (rep->rr_len == RPCRDMA_BAD_LEN) + goto out_badstatus; + if (rep->rr_len < RPCRDMA_HDRLEN_MIN) + goto out_shortreply; + headerp = rdmab_to_msg(rep->rr_rdmabuf); - if (headerp->rm_vers != rpcrdma_version) { - dprintk("RPC: %s: invalid version %d\n", - __func__, be32_to_cpu(headerp->rm_vers)); - goto repost; - } + if (headerp->rm_vers != rpcrdma_version) + goto out_badversion; +#if defined(CONFIG_SUNRPC_BACKCHANNEL) + if (rpcrdma_is_bcall(headerp)) + goto out_bcall; +#endif - /* Get XID and try for a match. */ - spin_lock(&xprt->transport_lock); + /* Match incoming rpcrdma_rep to an rpcrdma_req to + * get context for handling any incoming chunks. + */ + spin_lock_bh(&xprt->transport_lock); rqst = xprt_lookup_rqst(xprt, headerp->rm_xid); - if (rqst == NULL) { - spin_unlock(&xprt->transport_lock); - dprintk("RPC: %s: reply 0x%p failed " - "to match any request xid 0x%08x len %d\n", - __func__, rep, be32_to_cpu(headerp->rm_xid), - rep->rr_len); -repost: - r_xprt->rx_stats.bad_reply_count++; - if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, &r_xprt->rx_ep, rep)) - rpcrdma_recv_buffer_put(rep); + if (!rqst) + goto out_nomatch; - return; - } - - /* get request object */ req = rpcr_to_rdmar(rqst); - if (req->rl_reply) { - spin_unlock(&xprt->transport_lock); - dprintk("RPC: %s: duplicate reply 0x%p to RPC " - "request 0x%p: xid 0x%08x\n", __func__, rep, req, - be32_to_cpu(headerp->rm_xid)); - goto repost; - } + if (req->rl_reply) + goto out_duplicate; dprintk("RPC: %s: reply 0x%p completes request 0x%p\n" " RPC request 0x%p xid 0x%08x\n", @@ -883,8 +899,50 @@ badheader: if (xprt->cwnd > cwnd) xprt_release_rqst_cong(rqst->rq_task); + xprt_complete_rqst(rqst->rq_task, status); + spin_unlock_bh(&xprt->transport_lock); dprintk("RPC: %s: xprt_complete_rqst(0x%p, 0x%p, %d)\n", __func__, xprt, rqst, status); - xprt_complete_rqst(rqst->rq_task, status); - spin_unlock(&xprt->transport_lock); + return; + +out_badstatus: + rpcrdma_recv_buffer_put(rep); + if (r_xprt->rx_ep.rep_connected == 1) { + r_xprt->rx_ep.rep_connected = -EIO; + rpcrdma_conn_func(&r_xprt->rx_ep); + } + return; + +#if defined(CONFIG_SUNRPC_BACKCHANNEL) +out_bcall: + rpcrdma_bc_receive_call(r_xprt, rep); + return; +#endif + +out_shortreply: + dprintk("RPC: %s: short/invalid reply\n", __func__); + goto repost; + +out_badversion: + dprintk("RPC: %s: invalid version %d\n", + __func__, be32_to_cpu(headerp->rm_vers)); + goto repost; + +out_nomatch: + spin_unlock_bh(&xprt->transport_lock); + dprintk("RPC: %s: no match for incoming xid 0x%08x len %d\n", + __func__, be32_to_cpu(headerp->rm_xid), + rep->rr_len); + goto repost; + +out_duplicate: + spin_unlock_bh(&xprt->transport_lock); + dprintk("RPC: %s: " + "duplicate reply %p to RPC request %p: xid 0x%08x\n", + __func__, rep, req, be32_to_cpu(headerp->rm_xid)); + +repost: + r_xprt->rx_stats.bad_reply_count++; + if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, &r_xprt->rx_ep, rep)) + rpcrdma_recv_buffer_put(rep); } diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c index 2cd252f023a5..1b7051bdbdc8 100644 --- a/net/sunrpc/xprtrdma/svc_rdma.c +++ b/net/sunrpc/xprtrdma/svc_rdma.c @@ -239,6 +239,9 @@ void svc_rdma_cleanup(void) unregister_sysctl_table(svcrdma_table_header); svcrdma_table_header = NULL; } +#if defined(CONFIG_SUNRPC_BACKCHANNEL) + svc_unreg_xprt_class(&svc_rdma_bc_class); +#endif svc_unreg_xprt_class(&svc_rdma_class); kmem_cache_destroy(svc_rdma_map_cachep); kmem_cache_destroy(svc_rdma_ctxt_cachep); @@ -286,6 +289,9 @@ int svc_rdma_init(void) /* Register RDMA with the SVC transport switch */ svc_reg_xprt_class(&svc_rdma_class); +#if defined(CONFIG_SUNRPC_BACKCHANNEL) + svc_reg_xprt_class(&svc_rdma_bc_class); +#endif return 0; err1: kmem_cache_destroy(svc_rdma_map_cachep); diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index a266e870d870..b348b4adef29 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -56,6 +56,7 @@ #define RPCDBG_FACILITY RPCDBG_SVCXPRT +static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *, int); static struct svc_xprt *svc_rdma_create(struct svc_serv *serv, struct net *net, struct sockaddr *sa, int salen, @@ -95,6 +96,63 @@ struct svc_xprt_class svc_rdma_class = { .xcl_ident = XPRT_TRANSPORT_RDMA, }; +#if defined(CONFIG_SUNRPC_BACKCHANNEL) +static struct svc_xprt *svc_rdma_bc_create(struct svc_serv *, struct net *, + struct sockaddr *, int, int); +static void svc_rdma_bc_detach(struct svc_xprt *); +static void svc_rdma_bc_free(struct svc_xprt *); + +static struct svc_xprt_ops svc_rdma_bc_ops = { + .xpo_create = svc_rdma_bc_create, + .xpo_detach = svc_rdma_bc_detach, + .xpo_free = svc_rdma_bc_free, + .xpo_prep_reply_hdr = svc_rdma_prep_reply_hdr, + .xpo_secure_port = svc_rdma_secure_port, +}; + +struct svc_xprt_class svc_rdma_bc_class = { + .xcl_name = "rdma-bc", + .xcl_owner = THIS_MODULE, + .xcl_ops = &svc_rdma_bc_ops, + .xcl_max_payload = (1024 - RPCRDMA_HDRLEN_MIN) +}; + +static struct svc_xprt *svc_rdma_bc_create(struct svc_serv *serv, + struct net *net, + struct sockaddr *sa, int salen, + int flags) +{ + struct svcxprt_rdma *cma_xprt; + struct svc_xprt *xprt; + + cma_xprt = rdma_create_xprt(serv, 0); + if (!cma_xprt) + return ERR_PTR(-ENOMEM); + xprt = &cma_xprt->sc_xprt; + + svc_xprt_init(net, &svc_rdma_bc_class, xprt, serv); + serv->sv_bc_xprt = xprt; + + dprintk("svcrdma: %s(%p)\n", __func__, xprt); + return xprt; +} + +static void svc_rdma_bc_detach(struct svc_xprt *xprt) +{ + dprintk("svcrdma: %s(%p)\n", __func__, xprt); +} + +static void svc_rdma_bc_free(struct svc_xprt *xprt) +{ + struct svcxprt_rdma *rdma = + container_of(xprt, struct svcxprt_rdma, sc_xprt); + + dprintk("svcrdma: %s(%p)\n", __func__, xprt); + if (xprt) + kfree(rdma); +} +#endif /* CONFIG_SUNRPC_BACKCHANNEL */ + struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt) { struct svc_rdma_op_ctxt *ctxt; diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 41e452bc580c..8c545f7d7525 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -676,7 +676,7 @@ static void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) static int xprt_rdma_enable_swap(struct rpc_xprt *xprt) { - return -EINVAL; + return 0; } static void @@ -705,7 +705,13 @@ static struct rpc_xprt_ops xprt_rdma_procs = { .print_stats = xprt_rdma_print_stats, .enable_swap = xprt_rdma_enable_swap, .disable_swap = xprt_rdma_disable_swap, - .inject_disconnect = xprt_rdma_inject_disconnect + .inject_disconnect = xprt_rdma_inject_disconnect, +#if defined(CONFIG_SUNRPC_BACKCHANNEL) + .bc_setup = xprt_rdma_bc_setup, + .bc_up = xprt_rdma_bc_up, + .bc_free_rqst = xprt_rdma_bc_free_rqst, + .bc_destroy = xprt_rdma_bc_destroy, +#endif }; static struct xprt_class xprt_rdma = { @@ -732,6 +738,7 @@ void xprt_rdma_cleanup(void) dprintk("RPC: %s: xprt_unregister returned %i\n", __func__, rc); + rpcrdma_destroy_wq(); frwr_destroy_recovery_wq(); } @@ -743,8 +750,15 @@ int xprt_rdma_init(void) if (rc) return rc; + rc = rpcrdma_alloc_wq(); + if (rc) { + frwr_destroy_recovery_wq(); + return rc; + } + rc = xprt_register_transport(&xprt_rdma); if (rc) { + rpcrdma_destroy_wq(); frwr_destroy_recovery_wq(); return rc; } diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index f63369bd01c5..eadd1655145a 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -68,47 +68,33 @@ * internal functions */ -/* - * handle replies in tasklet context, using a single, global list - * rdma tasklet function -- just turn around and call the func - * for all replies on the list - */ - -static DEFINE_SPINLOCK(rpcrdma_tk_lock_g); -static LIST_HEAD(rpcrdma_tasklets_g); +static struct workqueue_struct *rpcrdma_receive_wq; -static void -rpcrdma_run_tasklet(unsigned long data) +int +rpcrdma_alloc_wq(void) { - struct rpcrdma_rep *rep; - unsigned long flags; - - data = data; - spin_lock_irqsave(&rpcrdma_tk_lock_g, flags); - while (!list_empty(&rpcrdma_tasklets_g)) { - rep = list_entry(rpcrdma_tasklets_g.next, - struct rpcrdma_rep, rr_list); - list_del(&rep->rr_list); - spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags); + struct workqueue_struct *recv_wq; - rpcrdma_reply_handler(rep); + recv_wq = alloc_workqueue("xprtrdma_receive", + WQ_MEM_RECLAIM | WQ_UNBOUND | WQ_HIGHPRI, + 0); + if (!recv_wq) + return -ENOMEM; - spin_lock_irqsave(&rpcrdma_tk_lock_g, flags); - } - spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags); + rpcrdma_receive_wq = recv_wq; + return 0; } -static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL); - -static void -rpcrdma_schedule_tasklet(struct list_head *sched_list) +void +rpcrdma_destroy_wq(void) { - unsigned long flags; + struct workqueue_struct *wq; - spin_lock_irqsave(&rpcrdma_tk_lock_g, flags); - list_splice_tail(sched_list, &rpcrdma_tasklets_g); - spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags); - tasklet_schedule(&rpcrdma_tasklet_g); + if (rpcrdma_receive_wq) { + wq = rpcrdma_receive_wq; + rpcrdma_receive_wq = NULL; + destroy_workqueue(wq); + } } static void @@ -158,63 +144,54 @@ rpcrdma_sendcq_process_wc(struct ib_wc *wc) } } -static int -rpcrdma_sendcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep) +/* The common case is a single send completion is waiting. By + * passing two WC entries to ib_poll_cq, a return code of 1 + * means there is exactly one WC waiting and no more. We don't + * have to invoke ib_poll_cq again to know that the CQ has been + * properly drained. + */ +static void +rpcrdma_sendcq_poll(struct ib_cq *cq) { - struct ib_wc *wcs; - int budget, count, rc; + struct ib_wc *pos, wcs[2]; + int count, rc; - budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE; do { - wcs = ep->rep_send_wcs; + pos = wcs; - rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs); - if (rc <= 0) - return rc; + rc = ib_poll_cq(cq, ARRAY_SIZE(wcs), pos); + if (rc < 0) + break; count = rc; while (count-- > 0) - rpcrdma_sendcq_process_wc(wcs++); - } while (rc == RPCRDMA_POLLSIZE && --budget); - return 0; + rpcrdma_sendcq_process_wc(pos++); + } while (rc == ARRAY_SIZE(wcs)); + return; } -/* - * Handle send, fast_reg_mr, and local_inv completions. - * - * Send events are typically suppressed and thus do not result - * in an upcall. Occasionally one is signaled, however. This - * prevents the provider's completion queue from wrapping and - * losing a completion. +/* Handle provider send completion upcalls. */ static void rpcrdma_sendcq_upcall(struct ib_cq *cq, void *cq_context) { - struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context; - int rc; - - rc = rpcrdma_sendcq_poll(cq, ep); - if (rc) { - dprintk("RPC: %s: ib_poll_cq failed: %i\n", - __func__, rc); - return; - } + do { + rpcrdma_sendcq_poll(cq); + } while (ib_req_notify_cq(cq, IB_CQ_NEXT_COMP | + IB_CQ_REPORT_MISSED_EVENTS) > 0); +} - rc = ib_req_notify_cq(cq, - IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); - if (rc == 0) - return; - if (rc < 0) { - dprintk("RPC: %s: ib_req_notify_cq failed: %i\n", - __func__, rc); - return; - } +static void +rpcrdma_receive_worker(struct work_struct *work) +{ + struct rpcrdma_rep *rep = + container_of(work, struct rpcrdma_rep, rr_work); - rpcrdma_sendcq_poll(cq, ep); + rpcrdma_reply_handler(rep); } static void -rpcrdma_recvcq_process_wc(struct ib_wc *wc, struct list_head *sched_list) +rpcrdma_recvcq_process_wc(struct ib_wc *wc) { struct rpcrdma_rep *rep = (struct rpcrdma_rep *)(unsigned long)wc->wr_id; @@ -237,91 +214,60 @@ rpcrdma_recvcq_process_wc(struct ib_wc *wc, struct list_head *sched_list) prefetch(rdmab_to_msg(rep->rr_rdmabuf)); out_schedule: - list_add_tail(&rep->rr_list, sched_list); + queue_work(rpcrdma_receive_wq, &rep->rr_work); return; + out_fail: if (wc->status != IB_WC_WR_FLUSH_ERR) pr_err("RPC: %s: rep %p: %s\n", __func__, rep, ib_wc_status_msg(wc->status)); - rep->rr_len = ~0U; + rep->rr_len = RPCRDMA_BAD_LEN; goto out_schedule; } -static int -rpcrdma_recvcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep) +/* The wc array is on stack: automatic memory is always CPU-local. + * + * struct ib_wc is 64 bytes, making the poll array potentially + * large. But this is at the bottom of the call chain. Further + * substantial work is done in another thread. + */ +static void +rpcrdma_recvcq_poll(struct ib_cq *cq) { - struct list_head sched_list; - struct ib_wc *wcs; - int budget, count, rc; + struct ib_wc *pos, wcs[4]; + int count, rc; - INIT_LIST_HEAD(&sched_list); - budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE; do { - wcs = ep->rep_recv_wcs; + pos = wcs; - rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs); - if (rc <= 0) - goto out_schedule; + rc = ib_poll_cq(cq, ARRAY_SIZE(wcs), pos); + if (rc < 0) + break; count = rc; while (count-- > 0) - rpcrdma_recvcq_process_wc(wcs++, &sched_list); - } while (rc == RPCRDMA_POLLSIZE && --budget); - rc = 0; - -out_schedule: - rpcrdma_schedule_tasklet(&sched_list); - return rc; + rpcrdma_recvcq_process_wc(pos++); + } while (rc == ARRAY_SIZE(wcs)); } -/* - * Handle receive completions. - * - * It is reentrant but processes single events in order to maintain - * ordering of receives to keep server credits. - * - * It is the responsibility of the scheduled tasklet to return - * recv buffers to the pool. NOTE: this affects synchronization of - * connection shutdown. That is, the structures required for - * the completion of the reply handler must remain intact until - * all memory has been reclaimed. +/* Handle provider receive completion upcalls. */ static void rpcrdma_recvcq_upcall(struct ib_cq *cq, void *cq_context) { - struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context; - int rc; - - rc = rpcrdma_recvcq_poll(cq, ep); - if (rc) { - dprintk("RPC: %s: ib_poll_cq failed: %i\n", - __func__, rc); - return; - } - - rc = ib_req_notify_cq(cq, - IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS); - if (rc == 0) - return; - if (rc < 0) { - dprintk("RPC: %s: ib_req_notify_cq failed: %i\n", - __func__, rc); - return; - } - - rpcrdma_recvcq_poll(cq, ep); + do { + rpcrdma_recvcq_poll(cq); + } while (ib_req_notify_cq(cq, IB_CQ_NEXT_COMP | + IB_CQ_REPORT_MISSED_EVENTS) > 0); } static void rpcrdma_flush_cqs(struct rpcrdma_ep *ep) { struct ib_wc wc; - LIST_HEAD(sched_list); while (ib_poll_cq(ep->rep_attr.recv_cq, 1, &wc) > 0) - rpcrdma_recvcq_process_wc(&wc, &sched_list); - if (!list_empty(&sched_list)) - rpcrdma_schedule_tasklet(&sched_list); + rpcrdma_recvcq_process_wc(&wc); while (ib_poll_cq(ep->rep_attr.send_cq, 1, &wc) > 0) rpcrdma_sendcq_process_wc(&wc); } @@ -623,6 +569,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, struct ib_device_attr *devattr = &ia->ri_devattr; struct ib_cq *sendcq, *recvcq; struct ib_cq_init_attr cq_attr = {}; + unsigned int max_qp_wr; int rc, err; if (devattr->max_sge < RPCRDMA_MAX_IOVS) { @@ -631,18 +578,27 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, return -ENOMEM; } + if (devattr->max_qp_wr <= RPCRDMA_BACKWARD_WRS) { + dprintk("RPC: %s: insufficient wqe's available\n", + __func__); + return -ENOMEM; + } + max_qp_wr = devattr->max_qp_wr - RPCRDMA_BACKWARD_WRS; + /* check provider's send/recv wr limits */ - if (cdata->max_requests > devattr->max_qp_wr) - cdata->max_requests = devattr->max_qp_wr; + if (cdata->max_requests > max_qp_wr) + cdata->max_requests = max_qp_wr; ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall; ep->rep_attr.qp_context = ep; ep->rep_attr.srq = NULL; ep->rep_attr.cap.max_send_wr = cdata->max_requests; + ep->rep_attr.cap.max_send_wr += RPCRDMA_BACKWARD_WRS; rc = ia->ri_ops->ro_open(ia, ep, cdata); if (rc) return rc; ep->rep_attr.cap.max_recv_wr = cdata->max_requests; + ep->rep_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS; ep->rep_attr.cap.max_send_sge = RPCRDMA_MAX_IOVS; ep->rep_attr.cap.max_recv_sge = 1; ep->rep_attr.cap.max_inline_data = 0; @@ -670,7 +626,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, cq_attr.cqe = ep->rep_attr.cap.max_send_wr + 1; sendcq = ib_create_cq(ia->ri_device, rpcrdma_sendcq_upcall, - rpcrdma_cq_async_error_upcall, ep, &cq_attr); + rpcrdma_cq_async_error_upcall, NULL, &cq_attr); if (IS_ERR(sendcq)) { rc = PTR_ERR(sendcq); dprintk("RPC: %s: failed to create send CQ: %i\n", @@ -687,7 +643,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, cq_attr.cqe = ep->rep_attr.cap.max_recv_wr + 1; recvcq = ib_create_cq(ia->ri_device, rpcrdma_recvcq_upcall, - rpcrdma_cq_async_error_upcall, ep, &cq_attr); + rpcrdma_cq_async_error_upcall, NULL, &cq_attr); if (IS_ERR(recvcq)) { rc = PTR_ERR(recvcq); dprintk("RPC: %s: failed to create recv CQ: %i\n", @@ -886,7 +842,21 @@ retry: } rc = ep->rep_connected; } else { + struct rpcrdma_xprt *r_xprt; + unsigned int extras; + dprintk("RPC: %s: connected\n", __func__); + + r_xprt = container_of(ia, struct rpcrdma_xprt, rx_ia); + extras = r_xprt->rx_buf.rb_bc_srv_max_requests; + + if (extras) { + rc = rpcrdma_ep_post_extra_recv(r_xprt, extras); + if (rc) + pr_warn("%s: rpcrdma_ep_post_extra_recv: %i\n", + __func__, rc); + rc = 0; + } } out: @@ -923,20 +893,25 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) } } -static struct rpcrdma_req * +struct rpcrdma_req * rpcrdma_create_req(struct rpcrdma_xprt *r_xprt) { + struct rpcrdma_buffer *buffer = &r_xprt->rx_buf; struct rpcrdma_req *req; req = kzalloc(sizeof(*req), GFP_KERNEL); if (req == NULL) return ERR_PTR(-ENOMEM); + INIT_LIST_HEAD(&req->rl_free); + spin_lock(&buffer->rb_reqslock); + list_add(&req->rl_all, &buffer->rb_allreqs); + spin_unlock(&buffer->rb_reqslock); req->rl_buffer = &r_xprt->rx_buf; return req; } -static struct rpcrdma_rep * +struct rpcrdma_rep * rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt) { struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data; @@ -958,6 +933,7 @@ rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt) rep->rr_device = ia->ri_device; rep->rr_rxprt = r_xprt; + INIT_WORK(&rep->rr_work, rpcrdma_receive_worker); return rep; out_free: @@ -971,44 +947,21 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) { struct rpcrdma_buffer *buf = &r_xprt->rx_buf; struct rpcrdma_ia *ia = &r_xprt->rx_ia; - struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data; - char *p; - size_t len; int i, rc; - buf->rb_max_requests = cdata->max_requests; + buf->rb_max_requests = r_xprt->rx_data.max_requests; + buf->rb_bc_srv_max_requests = 0; spin_lock_init(&buf->rb_lock); - /* Need to allocate: - * 1. arrays for send and recv pointers - * 2. arrays of struct rpcrdma_req to fill in pointers - * 3. array of struct rpcrdma_rep for replies - * Send/recv buffers in req/rep need to be registered - */ - len = buf->rb_max_requests * - (sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *)); - - p = kzalloc(len, GFP_KERNEL); - if (p == NULL) { - dprintk("RPC: %s: req_t/rep_t/pad kzalloc(%zd) failed\n", - __func__, len); - rc = -ENOMEM; - goto out; - } - buf->rb_pool = p; /* for freeing it later */ - - buf->rb_send_bufs = (struct rpcrdma_req **) p; - p = (char *) &buf->rb_send_bufs[buf->rb_max_requests]; - buf->rb_recv_bufs = (struct rpcrdma_rep **) p; - p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests]; - rc = ia->ri_ops->ro_init(r_xprt); if (rc) goto out; + INIT_LIST_HEAD(&buf->rb_send_bufs); + INIT_LIST_HEAD(&buf->rb_allreqs); + spin_lock_init(&buf->rb_reqslock); for (i = 0; i < buf->rb_max_requests; i++) { struct rpcrdma_req *req; - struct rpcrdma_rep *rep; req = rpcrdma_create_req(r_xprt); if (IS_ERR(req)) { @@ -1017,7 +970,13 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) rc = PTR_ERR(req); goto out; } - buf->rb_send_bufs[i] = req; + req->rl_backchannel = false; + list_add(&req->rl_free, &buf->rb_send_bufs); + } + + INIT_LIST_HEAD(&buf->rb_recv_bufs); + for (i = 0; i < buf->rb_max_requests + 2; i++) { + struct rpcrdma_rep *rep; rep = rpcrdma_create_rep(r_xprt); if (IS_ERR(rep)) { @@ -1026,7 +985,7 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) rc = PTR_ERR(rep); goto out; } - buf->rb_recv_bufs[i] = rep; + list_add(&rep->rr_list, &buf->rb_recv_bufs); } return 0; @@ -1035,22 +994,38 @@ out: return rc; } +static struct rpcrdma_req * +rpcrdma_buffer_get_req_locked(struct rpcrdma_buffer *buf) +{ + struct rpcrdma_req *req; + + req = list_first_entry(&buf->rb_send_bufs, + struct rpcrdma_req, rl_free); + list_del(&req->rl_free); + return req; +} + +static struct rpcrdma_rep * +rpcrdma_buffer_get_rep_locked(struct rpcrdma_buffer *buf) +{ + struct rpcrdma_rep *rep; + + rep = list_first_entry(&buf->rb_recv_bufs, + struct rpcrdma_rep, rr_list); + list_del(&rep->rr_list); + return rep; +} + static void rpcrdma_destroy_rep(struct rpcrdma_ia *ia, struct rpcrdma_rep *rep) { - if (!rep) - return; - rpcrdma_free_regbuf(ia, rep->rr_rdmabuf); kfree(rep); } -static void +void rpcrdma_destroy_req(struct rpcrdma_ia *ia, struct rpcrdma_req *req) { - if (!req) - return; - rpcrdma_free_regbuf(ia, req->rl_sendbuf); rpcrdma_free_regbuf(ia, req->rl_rdmabuf); kfree(req); @@ -1060,25 +1035,29 @@ void rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) { struct rpcrdma_ia *ia = rdmab_to_ia(buf); - int i; - /* clean up in reverse order from create - * 1. recv mr memory (mr free, then kfree) - * 2. send mr memory (mr free, then kfree) - * 3. MWs - */ - dprintk("RPC: %s: entering\n", __func__); + while (!list_empty(&buf->rb_recv_bufs)) { + struct rpcrdma_rep *rep; - for (i = 0; i < buf->rb_max_requests; i++) { - if (buf->rb_recv_bufs) - rpcrdma_destroy_rep(ia, buf->rb_recv_bufs[i]); - if (buf->rb_send_bufs) - rpcrdma_destroy_req(ia, buf->rb_send_bufs[i]); + rep = rpcrdma_buffer_get_rep_locked(buf); + rpcrdma_destroy_rep(ia, rep); } - ia->ri_ops->ro_destroy(buf); + spin_lock(&buf->rb_reqslock); + while (!list_empty(&buf->rb_allreqs)) { + struct rpcrdma_req *req; + + req = list_first_entry(&buf->rb_allreqs, + struct rpcrdma_req, rl_all); + list_del(&req->rl_all); + + spin_unlock(&buf->rb_reqslock); + rpcrdma_destroy_req(ia, req); + spin_lock(&buf->rb_reqslock); + } + spin_unlock(&buf->rb_reqslock); - kfree(buf->rb_pool); + ia->ri_ops->ro_destroy(buf); } struct rpcrdma_mw * @@ -1110,53 +1089,34 @@ rpcrdma_put_mw(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mw *mw) spin_unlock(&buf->rb_mwlock); } -static void -rpcrdma_buffer_put_sendbuf(struct rpcrdma_req *req, struct rpcrdma_buffer *buf) -{ - buf->rb_send_bufs[--buf->rb_send_index] = req; - req->rl_niovs = 0; - if (req->rl_reply) { - buf->rb_recv_bufs[--buf->rb_recv_index] = req->rl_reply; - req->rl_reply = NULL; - } -} - /* * Get a set of request/reply buffers. * - * Reply buffer (if needed) is attached to send buffer upon return. - * Rule: - * rb_send_index and rb_recv_index MUST always be pointing to the - * *next* available buffer (non-NULL). They are incremented after - * removing buffers, and decremented *before* returning them. + * Reply buffer (if available) is attached to send buffer upon return. */ struct rpcrdma_req * rpcrdma_buffer_get(struct rpcrdma_buffer *buffers) { struct rpcrdma_req *req; - unsigned long flags; - - spin_lock_irqsave(&buffers->rb_lock, flags); - if (buffers->rb_send_index == buffers->rb_max_requests) { - spin_unlock_irqrestore(&buffers->rb_lock, flags); - dprintk("RPC: %s: out of request buffers\n", __func__); - return ((struct rpcrdma_req *)NULL); - } - - req = buffers->rb_send_bufs[buffers->rb_send_index]; - if (buffers->rb_send_index < buffers->rb_recv_index) { - dprintk("RPC: %s: %d extra receives outstanding (ok)\n", - __func__, - buffers->rb_recv_index - buffers->rb_send_index); - req->rl_reply = NULL; - } else { - req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index]; - buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL; - } - buffers->rb_send_bufs[buffers->rb_send_index++] = NULL; + spin_lock(&buffers->rb_lock); + if (list_empty(&buffers->rb_send_bufs)) + goto out_reqbuf; + req = rpcrdma_buffer_get_req_locked(buffers); + if (list_empty(&buffers->rb_recv_bufs)) + goto out_repbuf; + req->rl_reply = rpcrdma_buffer_get_rep_locked(buffers); + spin_unlock(&buffers->rb_lock); + return req; - spin_unlock_irqrestore(&buffers->rb_lock, flags); +out_reqbuf: + spin_unlock(&buffers->rb_lock); + pr_warn("RPC: %s: out of request buffers\n", __func__); + return NULL; +out_repbuf: + spin_unlock(&buffers->rb_lock); + pr_warn("RPC: %s: out of reply buffers\n", __func__); + req->rl_reply = NULL; return req; } @@ -1168,30 +1128,31 @@ void rpcrdma_buffer_put(struct rpcrdma_req *req) { struct rpcrdma_buffer *buffers = req->rl_buffer; - unsigned long flags; + struct rpcrdma_rep *rep = req->rl_reply; - spin_lock_irqsave(&buffers->rb_lock, flags); - rpcrdma_buffer_put_sendbuf(req, buffers); - spin_unlock_irqrestore(&buffers->rb_lock, flags); + req->rl_niovs = 0; + req->rl_reply = NULL; + + spin_lock(&buffers->rb_lock); + list_add_tail(&req->rl_free, &buffers->rb_send_bufs); + if (rep) + list_add_tail(&rep->rr_list, &buffers->rb_recv_bufs); + spin_unlock(&buffers->rb_lock); } /* * Recover reply buffers from pool. - * This happens when recovering from error conditions. - * Post-increment counter/array index. + * This happens when recovering from disconnect. */ void rpcrdma_recv_buffer_get(struct rpcrdma_req *req) { struct rpcrdma_buffer *buffers = req->rl_buffer; - unsigned long flags; - spin_lock_irqsave(&buffers->rb_lock, flags); - if (buffers->rb_recv_index < buffers->rb_max_requests) { - req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index]; - buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL; - } - spin_unlock_irqrestore(&buffers->rb_lock, flags); + spin_lock(&buffers->rb_lock); + if (!list_empty(&buffers->rb_recv_bufs)) + req->rl_reply = rpcrdma_buffer_get_rep_locked(buffers); + spin_unlock(&buffers->rb_lock); } /* @@ -1202,11 +1163,10 @@ void rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep) { struct rpcrdma_buffer *buffers = &rep->rr_rxprt->rx_buf; - unsigned long flags; - spin_lock_irqsave(&buffers->rb_lock, flags); - buffers->rb_recv_bufs[--buffers->rb_recv_index] = rep; - spin_unlock_irqrestore(&buffers->rb_lock, flags); + spin_lock(&buffers->rb_lock); + list_add_tail(&rep->rr_list, &buffers->rb_recv_bufs); + spin_unlock(&buffers->rb_lock); } /* @@ -1363,6 +1323,47 @@ rpcrdma_ep_post_recv(struct rpcrdma_ia *ia, return rc; } +/** + * rpcrdma_ep_post_extra_recv - Post buffers for incoming backchannel requests + * @r_xprt: transport associated with these backchannel resources + * @min_reqs: minimum number of incoming requests expected + * + * Returns zero if all requested buffers were posted, or a negative errno. + */ +int +rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *r_xprt, unsigned int count) +{ + struct rpcrdma_buffer *buffers = &r_xprt->rx_buf; + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + struct rpcrdma_ep *ep = &r_xprt->rx_ep; + struct rpcrdma_rep *rep; + unsigned long flags; + int rc; + + while (count--) { + spin_lock_irqsave(&buffers->rb_lock, flags); + if (list_empty(&buffers->rb_recv_bufs)) + goto out_reqbuf; + rep = rpcrdma_buffer_get_rep_locked(buffers); + spin_unlock_irqrestore(&buffers->rb_lock, flags); + + rc = rpcrdma_ep_post_recv(ia, ep, rep); + if (rc) + goto out_rc; + } + + return 0; + +out_reqbuf: + spin_unlock_irqrestore(&buffers->rb_lock, flags); + pr_warn("%s: no extra receive buffers\n", __func__); + return -ENOMEM; + +out_rc: + rpcrdma_recv_buffer_put(rep); + return rc; +} + /* How many chunk list items fit within our inline buffers? */ unsigned int diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index c82abf44e39d..ac7f8d4f632a 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -77,9 +77,6 @@ struct rpcrdma_ia { * RDMA Endpoint -- one per transport instance */ -#define RPCRDMA_WC_BUDGET (128) -#define RPCRDMA_POLLSIZE (16) - struct rpcrdma_ep { atomic_t rep_cqcount; int rep_cqinit; @@ -89,8 +86,6 @@ struct rpcrdma_ep { struct rdma_conn_param rep_remote_cma; struct sockaddr_storage rep_remote_addr; struct delayed_work rep_connect_worker; - struct ib_wc rep_send_wcs[RPCRDMA_POLLSIZE]; - struct ib_wc rep_recv_wcs[RPCRDMA_POLLSIZE]; }; /* @@ -106,6 +101,16 @@ struct rpcrdma_ep { */ #define RPCRDMA_IGNORE_COMPLETION (0ULL) +/* Pre-allocate extra Work Requests for handling backward receives + * and sends. This is a fixed value because the Work Queues are + * allocated when the forward channel is set up. + */ +#if defined(CONFIG_SUNRPC_BACKCHANNEL) +#define RPCRDMA_BACKWARD_WRS (8) +#else +#define RPCRDMA_BACKWARD_WRS (0) +#endif + /* Registered buffer -- registered kmalloc'd memory for RDMA SEND/RECV * * The below structure appears at the front of a large region of kmalloc'd @@ -169,10 +174,13 @@ struct rpcrdma_rep { unsigned int rr_len; struct ib_device *rr_device; struct rpcrdma_xprt *rr_rxprt; + struct work_struct rr_work; struct list_head rr_list; struct rpcrdma_regbuf *rr_rdmabuf; }; +#define RPCRDMA_BAD_LEN (~0U) + /* * struct rpcrdma_mw - external memory region metadata * @@ -256,6 +264,7 @@ struct rpcrdma_mr_seg { /* chunk descriptors */ #define RPCRDMA_MAX_IOVS (2) struct rpcrdma_req { + struct list_head rl_free; unsigned int rl_niovs; unsigned int rl_nchunks; unsigned int rl_connect_cookie; @@ -265,6 +274,9 @@ struct rpcrdma_req { struct rpcrdma_regbuf *rl_rdmabuf; struct rpcrdma_regbuf *rl_sendbuf; struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS]; + + struct list_head rl_all; + bool rl_backchannel; }; static inline struct rpcrdma_req * @@ -289,12 +301,14 @@ struct rpcrdma_buffer { struct list_head rb_all; char *rb_pool; - spinlock_t rb_lock; /* protect buf arrays */ + spinlock_t rb_lock; /* protect buf lists */ + struct list_head rb_send_bufs; + struct list_head rb_recv_bufs; u32 rb_max_requests; - int rb_send_index; - int rb_recv_index; - struct rpcrdma_req **rb_send_bufs; - struct rpcrdma_rep **rb_recv_bufs; + + u32 rb_bc_srv_max_requests; + spinlock_t rb_reqslock; /* protect rb_allreqs */ + struct list_head rb_allreqs; }; #define rdmab_to_ia(b) (&container_of((b), struct rpcrdma_xprt, rx_buf)->rx_ia) @@ -340,6 +354,7 @@ struct rpcrdma_stats { unsigned long failed_marshal_count; unsigned long bad_reply_count; unsigned long nomsg_call_count; + unsigned long bcall_count; }; /* @@ -415,6 +430,9 @@ int rpcrdma_ep_post_recv(struct rpcrdma_ia *, struct rpcrdma_ep *, /* * Buffer calls - xprtrdma/verbs.c */ +struct rpcrdma_req *rpcrdma_create_req(struct rpcrdma_xprt *); +struct rpcrdma_rep *rpcrdma_create_rep(struct rpcrdma_xprt *); +void rpcrdma_destroy_req(struct rpcrdma_ia *, struct rpcrdma_req *); int rpcrdma_buffer_create(struct rpcrdma_xprt *); void rpcrdma_buffer_destroy(struct rpcrdma_buffer *); @@ -431,10 +449,14 @@ void rpcrdma_free_regbuf(struct rpcrdma_ia *, struct rpcrdma_regbuf *); unsigned int rpcrdma_max_segments(struct rpcrdma_xprt *); +int rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *, unsigned int); int frwr_alloc_recovery_wq(void); void frwr_destroy_recovery_wq(void); +int rpcrdma_alloc_wq(void); +void rpcrdma_destroy_wq(void); + /* * Wrappers for chunk registration, shared by read/write chunk code. */ @@ -495,6 +517,18 @@ int rpcrdma_marshal_req(struct rpc_rqst *); int xprt_rdma_init(void); void xprt_rdma_cleanup(void); +/* Backchannel calls - xprtrdma/backchannel.c + */ +#if defined(CONFIG_SUNRPC_BACKCHANNEL) +int xprt_rdma_bc_setup(struct rpc_xprt *, unsigned int); +int xprt_rdma_bc_up(struct svc_serv *, struct net *); +int rpcrdma_bc_post_recv(struct rpcrdma_xprt *, unsigned int); +void rpcrdma_bc_receive_call(struct rpcrdma_xprt *, struct rpcrdma_rep *); +int rpcrdma_bc_marshal_reply(struct rpc_rqst *); +void xprt_rdma_bc_free_rqst(struct rpc_rqst *); +void xprt_rdma_bc_destroy(struct rpc_xprt *, unsigned int); +#endif /* CONFIG_SUNRPC_BACKCHANNEL */ + /* Temporary NFS request map cache. Created in svc_rdma.c */ extern struct kmem_cache *svc_rdma_map_cachep; /* WR context cache. Created in svc_rdma.c */ diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 1a85e0ed0b48..1d1a70498910 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -360,8 +360,10 @@ static int xs_send_pagedata(struct socket *sock, struct xdr_buf *xdr, unsigned i int flags = XS_SENDMSG_FLAGS; remainder -= len; - if (remainder != 0 || more) + if (more) flags |= MSG_MORE; + if (remainder != 0) + flags |= MSG_SENDPAGE_NOTLAST | MSG_MORE; err = do_sendpage(sock, *ppage, base, len, flags); if (remainder == 0 || err != len) break; @@ -823,6 +825,7 @@ static void xs_reset_transport(struct sock_xprt *transport) kernel_sock_shutdown(sock, SHUT_RDWR); + mutex_lock(&transport->recv_mutex); write_lock_bh(&sk->sk_callback_lock); transport->inet = NULL; transport->sock = NULL; @@ -833,6 +836,7 @@ static void xs_reset_transport(struct sock_xprt *transport) xprt_clear_connected(xprt); write_unlock_bh(&sk->sk_callback_lock); xs_sock_reset_connection_flags(xprt); + mutex_unlock(&transport->recv_mutex); trace_rpc_socket_close(xprt, sock); sock_release(sock); @@ -886,6 +890,7 @@ static void xs_destroy(struct rpc_xprt *xprt) cancel_delayed_work_sync(&transport->connect_worker); xs_close(xprt); + cancel_work_sync(&transport->recv_worker); xs_xprt_free(xprt); module_put(THIS_MODULE); } @@ -906,44 +911,36 @@ static int xs_local_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb) } /** - * xs_local_data_ready - "data ready" callback for AF_LOCAL sockets - * @sk: socket with data to read + * xs_local_data_read_skb + * @xprt: transport + * @sk: socket + * @skb: skbuff * * Currently this assumes we can read the whole reply in a single gulp. */ -static void xs_local_data_ready(struct sock *sk) +static void xs_local_data_read_skb(struct rpc_xprt *xprt, + struct sock *sk, + struct sk_buff *skb) { struct rpc_task *task; - struct rpc_xprt *xprt; struct rpc_rqst *rovr; - struct sk_buff *skb; - int err, repsize, copied; + int repsize, copied; u32 _xid; __be32 *xp; - read_lock_bh(&sk->sk_callback_lock); - dprintk("RPC: %s...\n", __func__); - xprt = xprt_from_sock(sk); - if (xprt == NULL) - goto out; - - skb = skb_recv_datagram(sk, 0, 1, &err); - if (skb == NULL) - goto out; - repsize = skb->len - sizeof(rpc_fraghdr); if (repsize < 4) { dprintk("RPC: impossible RPC reply size %d\n", repsize); - goto dropit; + return; } /* Copy the XID from the skb... */ xp = skb_header_pointer(skb, sizeof(rpc_fraghdr), sizeof(_xid), &_xid); if (xp == NULL) - goto dropit; + return; /* Look up and lock the request corresponding to the given XID */ - spin_lock(&xprt->transport_lock); + spin_lock_bh(&xprt->transport_lock); rovr = xprt_lookup_rqst(xprt, *xp); if (!rovr) goto out_unlock; @@ -961,50 +958,68 @@ static void xs_local_data_ready(struct sock *sk) xprt_complete_rqst(task, copied); out_unlock: - spin_unlock(&xprt->transport_lock); - dropit: - skb_free_datagram(sk, skb); - out: - read_unlock_bh(&sk->sk_callback_lock); + spin_unlock_bh(&xprt->transport_lock); +} + +static void xs_local_data_receive(struct sock_xprt *transport) +{ + struct sk_buff *skb; + struct sock *sk; + int err; + + mutex_lock(&transport->recv_mutex); + sk = transport->inet; + if (sk == NULL) + goto out; + for (;;) { + skb = skb_recv_datagram(sk, 0, 1, &err); + if (skb == NULL) + break; + xs_local_data_read_skb(&transport->xprt, sk, skb); + skb_free_datagram(sk, skb); + } +out: + mutex_unlock(&transport->recv_mutex); +} + +static void xs_local_data_receive_workfn(struct work_struct *work) +{ + struct sock_xprt *transport = + container_of(work, struct sock_xprt, recv_worker); + xs_local_data_receive(transport); } /** - * xs_udp_data_ready - "data ready" callback for UDP sockets - * @sk: socket with data to read + * xs_udp_data_read_skb - receive callback for UDP sockets + * @xprt: transport + * @sk: socket + * @skb: skbuff * */ -static void xs_udp_data_ready(struct sock *sk) +static void xs_udp_data_read_skb(struct rpc_xprt *xprt, + struct sock *sk, + struct sk_buff *skb) { struct rpc_task *task; - struct rpc_xprt *xprt; struct rpc_rqst *rovr; - struct sk_buff *skb; - int err, repsize, copied; + int repsize, copied; u32 _xid; __be32 *xp; - read_lock_bh(&sk->sk_callback_lock); - dprintk("RPC: xs_udp_data_ready...\n"); - if (!(xprt = xprt_from_sock(sk))) - goto out; - - if ((skb = skb_recv_datagram(sk, 0, 1, &err)) == NULL) - goto out; - repsize = skb->len - sizeof(struct udphdr); if (repsize < 4) { dprintk("RPC: impossible RPC reply size %d!\n", repsize); - goto dropit; + return; } /* Copy the XID from the skb... */ xp = skb_header_pointer(skb, sizeof(struct udphdr), sizeof(_xid), &_xid); if (xp == NULL) - goto dropit; + return; /* Look up and lock the request corresponding to the given XID */ - spin_lock(&xprt->transport_lock); + spin_lock_bh(&xprt->transport_lock); rovr = xprt_lookup_rqst(xprt, *xp); if (!rovr) goto out_unlock; @@ -1025,10 +1040,54 @@ static void xs_udp_data_ready(struct sock *sk) xprt_complete_rqst(task, copied); out_unlock: - spin_unlock(&xprt->transport_lock); - dropit: - skb_free_datagram(sk, skb); - out: + spin_unlock_bh(&xprt->transport_lock); +} + +static void xs_udp_data_receive(struct sock_xprt *transport) +{ + struct sk_buff *skb; + struct sock *sk; + int err; + + mutex_lock(&transport->recv_mutex); + sk = transport->inet; + if (sk == NULL) + goto out; + for (;;) { + skb = skb_recv_datagram(sk, 0, 1, &err); + if (skb == NULL) + break; + xs_udp_data_read_skb(&transport->xprt, sk, skb); + skb_free_datagram(sk, skb); + } +out: + mutex_unlock(&transport->recv_mutex); +} + +static void xs_udp_data_receive_workfn(struct work_struct *work) +{ + struct sock_xprt *transport = + container_of(work, struct sock_xprt, recv_worker); + xs_udp_data_receive(transport); +} + +/** + * xs_data_ready - "data ready" callback for UDP sockets + * @sk: socket with data to read + * + */ +static void xs_data_ready(struct sock *sk) +{ + struct rpc_xprt *xprt; + + read_lock_bh(&sk->sk_callback_lock); + dprintk("RPC: xs_data_ready...\n"); + xprt = xprt_from_sock(sk); + if (xprt != NULL) { + struct sock_xprt *transport = container_of(xprt, + struct sock_xprt, xprt); + queue_work(rpciod_workqueue, &transport->recv_worker); + } read_unlock_bh(&sk->sk_callback_lock); } @@ -1243,12 +1302,12 @@ static inline int xs_tcp_read_reply(struct rpc_xprt *xprt, dprintk("RPC: read reply XID %08x\n", ntohl(transport->tcp_xid)); /* Find and lock the request corresponding to this xid */ - spin_lock(&xprt->transport_lock); + spin_lock_bh(&xprt->transport_lock); req = xprt_lookup_rqst(xprt, transport->tcp_xid); if (!req) { dprintk("RPC: XID %08x request not found!\n", ntohl(transport->tcp_xid)); - spin_unlock(&xprt->transport_lock); + spin_unlock_bh(&xprt->transport_lock); return -1; } @@ -1257,7 +1316,7 @@ static inline int xs_tcp_read_reply(struct rpc_xprt *xprt, if (!(transport->tcp_flags & TCP_RCV_COPY_DATA)) xprt_complete_rqst(req->rq_task, transport->tcp_copied); - spin_unlock(&xprt->transport_lock); + spin_unlock_bh(&xprt->transport_lock); return 0; } @@ -1277,10 +1336,10 @@ static int xs_tcp_read_callback(struct rpc_xprt *xprt, struct rpc_rqst *req; /* Look up and lock the request corresponding to the given XID */ - spin_lock(&xprt->transport_lock); + spin_lock_bh(&xprt->transport_lock); req = xprt_lookup_bc_request(xprt, transport->tcp_xid); if (req == NULL) { - spin_unlock(&xprt->transport_lock); + spin_unlock_bh(&xprt->transport_lock); printk(KERN_WARNING "Callback slot table overflowed\n"); xprt_force_disconnect(xprt); return -1; @@ -1291,7 +1350,7 @@ static int xs_tcp_read_callback(struct rpc_xprt *xprt, if (!(transport->tcp_flags & TCP_RCV_COPY_DATA)) xprt_complete_bc_request(req, transport->tcp_copied); - spin_unlock(&xprt->transport_lock); + spin_unlock_bh(&xprt->transport_lock); return 0; } @@ -1306,6 +1365,17 @@ static inline int _xs_tcp_read_data(struct rpc_xprt *xprt, xs_tcp_read_reply(xprt, desc) : xs_tcp_read_callback(xprt, desc); } + +static int xs_tcp_bc_up(struct svc_serv *serv, struct net *net) +{ + int ret; + + ret = svc_create_xprt(serv, "tcp-bc", net, PF_INET, 0, + SVC_SOCK_ANONYMOUS); + if (ret < 0) + return ret; + return 0; +} #else static inline int _xs_tcp_read_data(struct rpc_xprt *xprt, struct xdr_skb_reader *desc) @@ -1391,6 +1461,44 @@ static int xs_tcp_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, uns return len - desc.count; } +static void xs_tcp_data_receive(struct sock_xprt *transport) +{ + struct rpc_xprt *xprt = &transport->xprt; + struct sock *sk; + read_descriptor_t rd_desc = { + .count = 2*1024*1024, + .arg.data = xprt, + }; + unsigned long total = 0; + int read = 0; + + mutex_lock(&transport->recv_mutex); + sk = transport->inet; + if (sk == NULL) + goto out; + + /* We use rd_desc to pass struct xprt to xs_tcp_data_recv */ + for (;;) { + lock_sock(sk); + read = tcp_read_sock(sk, &rd_desc, xs_tcp_data_recv); + release_sock(sk); + if (read <= 0) + break; + total += read; + rd_desc.count = 65536; + } +out: + mutex_unlock(&transport->recv_mutex); + trace_xs_tcp_data_ready(xprt, read, total); +} + +static void xs_tcp_data_receive_workfn(struct work_struct *work) +{ + struct sock_xprt *transport = + container_of(work, struct sock_xprt, recv_worker); + xs_tcp_data_receive(transport); +} + /** * xs_tcp_data_ready - "data ready" callback for TCP sockets * @sk: socket with data to read @@ -1398,34 +1506,24 @@ static int xs_tcp_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, uns */ static void xs_tcp_data_ready(struct sock *sk) { + struct sock_xprt *transport; struct rpc_xprt *xprt; - read_descriptor_t rd_desc; - int read; - unsigned long total = 0; dprintk("RPC: xs_tcp_data_ready...\n"); read_lock_bh(&sk->sk_callback_lock); - if (!(xprt = xprt_from_sock(sk))) { - read = 0; + if (!(xprt = xprt_from_sock(sk))) goto out; - } + transport = container_of(xprt, struct sock_xprt, xprt); + /* Any data means we had a useful conversation, so * the we don't need to delay the next reconnect */ if (xprt->reestablish_timeout) xprt->reestablish_timeout = 0; + queue_work(rpciod_workqueue, &transport->recv_worker); - /* We use rd_desc to pass struct xprt to xs_tcp_data_recv */ - rd_desc.arg.data = xprt; - do { - rd_desc.count = 65536; - read = tcp_read_sock(sk, &rd_desc, xs_tcp_data_recv); - if (read > 0) - total += read; - } while (read > 0); out: - trace_xs_tcp_data_ready(xprt, read, total); read_unlock_bh(&sk->sk_callback_lock); } @@ -1873,7 +1971,7 @@ static int xs_local_finish_connecting(struct rpc_xprt *xprt, xs_save_old_callbacks(transport, sk); sk->sk_user_data = xprt; - sk->sk_data_ready = xs_local_data_ready; + sk->sk_data_ready = xs_data_ready; sk->sk_write_space = xs_udp_write_space; sk->sk_error_report = xs_error_report; sk->sk_allocation = GFP_NOIO; @@ -2059,7 +2157,7 @@ static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) xs_save_old_callbacks(transport, sk); sk->sk_user_data = xprt; - sk->sk_data_ready = xs_udp_data_ready; + sk->sk_data_ready = xs_data_ready; sk->sk_write_space = xs_udp_write_space; sk->sk_allocation = GFP_NOIO; @@ -2472,7 +2570,7 @@ static int bc_send_request(struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; struct svc_xprt *xprt; - u32 len; + int len; dprintk("sending request with xid: %08x\n", ntohl(req->rq_xid)); /* @@ -2580,6 +2678,12 @@ static struct rpc_xprt_ops xs_tcp_ops = { .enable_swap = xs_enable_swap, .disable_swap = xs_disable_swap, .inject_disconnect = xs_inject_disconnect, +#ifdef CONFIG_SUNRPC_BACKCHANNEL + .bc_setup = xprt_setup_bc, + .bc_up = xs_tcp_bc_up, + .bc_free_rqst = xprt_free_bc_rqst, + .bc_destroy = xprt_destroy_bc, +#endif }; /* @@ -2650,6 +2754,7 @@ static struct rpc_xprt *xs_setup_xprt(struct xprt_create *args, } new = container_of(xprt, struct sock_xprt, xprt); + mutex_init(&new->recv_mutex); memcpy(&xprt->addr, args->dstaddr, args->addrlen); xprt->addrlen = args->addrlen; if (args->srcaddr) @@ -2703,6 +2808,7 @@ static struct rpc_xprt *xs_setup_local(struct xprt_create *args) xprt->ops = &xs_local_ops; xprt->timeout = &xs_local_default_timeout; + INIT_WORK(&transport->recv_worker, xs_local_data_receive_workfn); INIT_DELAYED_WORK(&transport->connect_worker, xs_dummy_setup_socket); @@ -2774,21 +2880,20 @@ static struct rpc_xprt *xs_setup_udp(struct xprt_create *args) xprt->timeout = &xs_udp_default_timeout; + INIT_WORK(&transport->recv_worker, xs_udp_data_receive_workfn); + INIT_DELAYED_WORK(&transport->connect_worker, xs_udp_setup_socket); + switch (addr->sa_family) { case AF_INET: if (((struct sockaddr_in *)addr)->sin_port != htons(0)) xprt_set_bound(xprt); - INIT_DELAYED_WORK(&transport->connect_worker, - xs_udp_setup_socket); xs_format_peer_addresses(xprt, "udp", RPCBIND_NETID_UDP); break; case AF_INET6: if (((struct sockaddr_in6 *)addr)->sin6_port != htons(0)) xprt_set_bound(xprt); - INIT_DELAYED_WORK(&transport->connect_worker, - xs_udp_setup_socket); xs_format_peer_addresses(xprt, "udp", RPCBIND_NETID_UDP6); break; default: @@ -2853,21 +2958,20 @@ static struct rpc_xprt *xs_setup_tcp(struct xprt_create *args) xprt->ops = &xs_tcp_ops; xprt->timeout = &xs_tcp_default_timeout; + INIT_WORK(&transport->recv_worker, xs_tcp_data_receive_workfn); + INIT_DELAYED_WORK(&transport->connect_worker, xs_tcp_setup_socket); + switch (addr->sa_family) { case AF_INET: if (((struct sockaddr_in *)addr)->sin_port != htons(0)) xprt_set_bound(xprt); - INIT_DELAYED_WORK(&transport->connect_worker, - xs_tcp_setup_socket); xs_format_peer_addresses(xprt, "tcp", RPCBIND_NETID_TCP); break; case AF_INET6: if (((struct sockaddr_in6 *)addr)->sin6_port != htons(0)) xprt_set_bound(xprt); - INIT_DELAYED_WORK(&transport->connect_worker, - xs_tcp_setup_socket); xs_format_peer_addresses(xprt, "tcp", RPCBIND_NETID_TCP6); break; default: diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c index 400d87294de3..0a369bb440e7 100644 --- a/net/vmw_vsock/vmci_transport.c +++ b/net/vmw_vsock/vmci_transport.c @@ -1234,7 +1234,7 @@ vmci_transport_recv_connecting_server(struct sock *listener, /* Callers of accept() will be be waiting on the listening socket, not * the pending socket. */ - listener->sk_state_change(listener); + listener->sk_data_ready(listener); return 0; |
