diff options
Diffstat (limited to 'net')
57 files changed, 2456 insertions, 2117 deletions
diff --git a/net/9p/Makefile b/net/9p/Makefile index c0486cfc85d9..aa0a5641e5d0 100644 --- a/net/9p/Makefile +++ b/net/9p/Makefile @@ -8,7 +8,6 @@ obj-$(CONFIG_NET_9P_RDMA) += 9pnet_rdma.o mod.o \ client.o \ error.o \ - util.o \ protocol.o \ trans_fd.o \ trans_common.o \ diff --git a/net/9p/client.c b/net/9p/client.c index deae53a7dffc..5f23e18eecc0 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -231,144 +231,170 @@ free_and_return: return ret; } -static struct p9_fcall *p9_fcall_alloc(int alloc_msize) +static int p9_fcall_init(struct p9_client *c, struct p9_fcall *fc, + int alloc_msize) { - struct p9_fcall *fc; - fc = kmalloc(sizeof(struct p9_fcall) + alloc_msize, GFP_NOFS); - if (!fc) - return NULL; + if (likely(c->fcall_cache) && alloc_msize == c->msize) { + fc->sdata = kmem_cache_alloc(c->fcall_cache, GFP_NOFS); + fc->cache = c->fcall_cache; + } else { + fc->sdata = kmalloc(alloc_msize, GFP_NOFS); + fc->cache = NULL; + } + if (!fc->sdata) + return -ENOMEM; fc->capacity = alloc_msize; - fc->sdata = (char *) fc + sizeof(struct p9_fcall); - return fc; + return 0; +} + +void p9_fcall_fini(struct p9_fcall *fc) +{ + /* sdata can be NULL for interrupted requests in trans_rdma, + * and kmem_cache_free does not do NULL-check for us + */ + if (unlikely(!fc->sdata)) + return; + + if (fc->cache) + kmem_cache_free(fc->cache, fc->sdata); + else + kfree(fc->sdata); } +EXPORT_SYMBOL(p9_fcall_fini); + +static struct kmem_cache *p9_req_cache; /** - * p9_tag_alloc - lookup/allocate a request by tag - * @c: client session to lookup tag within - * @tag: numeric id for transaction - * - * this is a simple array lookup, but will grow the - * request_slots as necessary to accommodate transaction - * ids which did not previously have a slot. - * - * this code relies on the client spinlock to manage locks, its - * possible we should switch to something else, but I'd rather - * stick with something low-overhead for the common case. + * p9_req_alloc - Allocate a new request. + * @c: Client session. + * @type: Transaction type. + * @max_size: Maximum packet size for this request. * + * Context: Process context. + * Return: Pointer to new request. */ - static struct p9_req_t * -p9_tag_alloc(struct p9_client *c, u16 tag, unsigned int max_size) +p9_tag_alloc(struct p9_client *c, int8_t type, unsigned int max_size) { - unsigned long flags; - int row, col; - struct p9_req_t *req; + struct p9_req_t *req = kmem_cache_alloc(p9_req_cache, GFP_NOFS); int alloc_msize = min(c->msize, max_size); + int tag; - /* This looks up the original request by tag so we know which - * buffer to read the data into */ - tag++; - - if (tag >= c->max_tag) { - spin_lock_irqsave(&c->lock, flags); - /* check again since original check was outside of lock */ - while (tag >= c->max_tag) { - row = (tag / P9_ROW_MAXTAG); - c->reqs[row] = kcalloc(P9_ROW_MAXTAG, - sizeof(struct p9_req_t), GFP_ATOMIC); - - if (!c->reqs[row]) { - pr_err("Couldn't grow tag array\n"); - spin_unlock_irqrestore(&c->lock, flags); - return ERR_PTR(-ENOMEM); - } - for (col = 0; col < P9_ROW_MAXTAG; col++) { - req = &c->reqs[row][col]; - req->status = REQ_STATUS_IDLE; - init_waitqueue_head(&req->wq); - } - c->max_tag += P9_ROW_MAXTAG; - } - spin_unlock_irqrestore(&c->lock, flags); - } - row = tag / P9_ROW_MAXTAG; - col = tag % P9_ROW_MAXTAG; - - req = &c->reqs[row][col]; - if (!req->tc) - req->tc = p9_fcall_alloc(alloc_msize); - if (!req->rc) - req->rc = p9_fcall_alloc(alloc_msize); - if (!req->tc || !req->rc) - goto grow_failed; + if (!req) + return ERR_PTR(-ENOMEM); - p9pdu_reset(req->tc); - p9pdu_reset(req->rc); + if (p9_fcall_init(c, &req->tc, alloc_msize)) + goto free_req; + if (p9_fcall_init(c, &req->rc, alloc_msize)) + goto free; - req->tc->tag = tag-1; + p9pdu_reset(&req->tc); + p9pdu_reset(&req->rc); req->status = REQ_STATUS_ALLOC; + init_waitqueue_head(&req->wq); + INIT_LIST_HEAD(&req->req_list); + + idr_preload(GFP_NOFS); + spin_lock_irq(&c->lock); + if (type == P9_TVERSION) + tag = idr_alloc(&c->reqs, req, P9_NOTAG, P9_NOTAG + 1, + GFP_NOWAIT); + else + tag = idr_alloc(&c->reqs, req, 0, P9_NOTAG, GFP_NOWAIT); + req->tc.tag = tag; + spin_unlock_irq(&c->lock); + idr_preload_end(); + if (tag < 0) + goto free; + + /* Init ref to two because in the general case there is one ref + * that is put asynchronously by a writer thread, one ref + * temporarily given by p9_tag_lookup and put by p9_client_cb + * in the recv thread, and one ref put by p9_tag_remove in the + * main thread. The only exception is virtio that does not use + * p9_tag_lookup but does not have a writer thread either + * (the write happens synchronously in the request/zc_request + * callback), so p9_client_cb eats the second ref there + * as the pointer is duplicated directly by virtqueue_add_sgs() + */ + refcount_set(&req->refcount.refcount, 2); return req; -grow_failed: - pr_err("Couldn't grow tag array\n"); - kfree(req->tc); - kfree(req->rc); - req->tc = req->rc = NULL; +free: + p9_fcall_fini(&req->tc); + p9_fcall_fini(&req->rc); +free_req: + kmem_cache_free(p9_req_cache, req); return ERR_PTR(-ENOMEM); } /** - * p9_tag_lookup - lookup a request by tag - * @c: client session to lookup tag within - * @tag: numeric id for transaction + * p9_tag_lookup - Look up a request by tag. + * @c: Client session. + * @tag: Transaction ID. * + * Context: Any context. + * Return: A request, or %NULL if there is no request with that tag. */ - struct p9_req_t *p9_tag_lookup(struct p9_client *c, u16 tag) { - int row, col; - - /* This looks up the original request by tag so we know which - * buffer to read the data into */ - tag++; - - if (tag >= c->max_tag) - return NULL; + struct p9_req_t *req; - row = tag / P9_ROW_MAXTAG; - col = tag % P9_ROW_MAXTAG; + rcu_read_lock(); +again: + req = idr_find(&c->reqs, tag); + if (req) { + /* We have to be careful with the req found under rcu_read_lock + * Thanks to SLAB_TYPESAFE_BY_RCU we can safely try to get the + * ref again without corrupting other data, then check again + * that the tag matches once we have the ref + */ + if (!p9_req_try_get(req)) + goto again; + if (req->tc.tag != tag) { + p9_req_put(req); + goto again; + } + } + rcu_read_unlock(); - return &c->reqs[row][col]; + return req; } EXPORT_SYMBOL(p9_tag_lookup); /** - * p9_tag_init - setup tags structure and contents - * @c: v9fs client struct - * - * This initializes the tags structure for each client instance. + * p9_tag_remove - Remove a tag. + * @c: Client session. + * @r: Request of reference. * + * Context: Any context. */ +static int p9_tag_remove(struct p9_client *c, struct p9_req_t *r) +{ + unsigned long flags; + u16 tag = r->tc.tag; + + p9_debug(P9_DEBUG_MUX, "clnt %p req %p tag: %d\n", c, r, tag); + spin_lock_irqsave(&c->lock, flags); + idr_remove(&c->reqs, tag); + spin_unlock_irqrestore(&c->lock, flags); + return p9_req_put(r); +} -static int p9_tag_init(struct p9_client *c) +static void p9_req_free(struct kref *ref) { - int err = 0; + struct p9_req_t *r = container_of(ref, struct p9_req_t, refcount); + p9_fcall_fini(&r->tc); + p9_fcall_fini(&r->rc); + kmem_cache_free(p9_req_cache, r); +} - c->tagpool = p9_idpool_create(); - if (IS_ERR(c->tagpool)) { - err = PTR_ERR(c->tagpool); - goto error; - } - err = p9_idpool_get(c->tagpool); /* reserve tag 0 */ - if (err < 0) { - p9_idpool_destroy(c->tagpool); - goto error; - } - c->max_tag = 0; -error: - return err; +int p9_req_put(struct p9_req_t *r) +{ + return kref_put(&r->refcount, p9_req_free); } +EXPORT_SYMBOL(p9_req_put); /** * p9_tag_cleanup - cleans up tags structure and reclaims resources @@ -379,52 +405,17 @@ error: */ static void p9_tag_cleanup(struct p9_client *c) { - int row, col; - - /* check to insure all requests are idle */ - for (row = 0; row < (c->max_tag/P9_ROW_MAXTAG); row++) { - for (col = 0; col < P9_ROW_MAXTAG; col++) { - if (c->reqs[row][col].status != REQ_STATUS_IDLE) { - p9_debug(P9_DEBUG_MUX, - "Attempting to cleanup non-free tag %d,%d\n", - row, col); - /* TODO: delay execution of cleanup */ - return; - } - } - } - - if (c->tagpool) { - p9_idpool_put(0, c->tagpool); /* free reserved tag 0 */ - p9_idpool_destroy(c->tagpool); - } + struct p9_req_t *req; + int id; - /* free requests associated with tags */ - for (row = 0; row < (c->max_tag/P9_ROW_MAXTAG); row++) { - for (col = 0; col < P9_ROW_MAXTAG; col++) { - kfree(c->reqs[row][col].tc); - kfree(c->reqs[row][col].rc); - } - kfree(c->reqs[row]); + rcu_read_lock(); + idr_for_each_entry(&c->reqs, req, id) { + pr_info("Tag %d still in use\n", id); + if (p9_tag_remove(c, req) == 0) + pr_warn("Packet with tag %d has still references", + req->tc.tag); } - c->max_tag = 0; -} - -/** - * p9_free_req - free a request and clean-up as necessary - * c: client state - * r: request to release - * - */ - -static void p9_free_req(struct p9_client *c, struct p9_req_t *r) -{ - int tag = r->tc->tag; - p9_debug(P9_DEBUG_MUX, "clnt %p req %p tag: %d\n", c, r, tag); - - r->status = REQ_STATUS_IDLE; - if (tag != P9_NOTAG && p9_idpool_check(tag, c->tagpool)) - p9_idpool_put(tag, c->tagpool); + rcu_read_unlock(); } /** @@ -435,7 +426,7 @@ static void p9_free_req(struct p9_client *c, struct p9_req_t *r) */ void p9_client_cb(struct p9_client *c, struct p9_req_t *req, int status) { - p9_debug(P9_DEBUG_MUX, " tag %d\n", req->tc->tag); + p9_debug(P9_DEBUG_MUX, " tag %d\n", req->tc.tag); /* * This barrier is needed to make sure any change made to req before @@ -445,7 +436,8 @@ void p9_client_cb(struct p9_client *c, struct p9_req_t *req, int status) req->status = status; wake_up(&req->wq); - p9_debug(P9_DEBUG_MUX, "wakeup: %d\n", req->tc->tag); + p9_debug(P9_DEBUG_MUX, "wakeup: %d\n", req->tc.tag); + p9_req_put(req); } EXPORT_SYMBOL(p9_client_cb); @@ -516,18 +508,18 @@ static int p9_check_errors(struct p9_client *c, struct p9_req_t *req) int err; int ecode; - err = p9_parse_header(req->rc, NULL, &type, NULL, 0); - if (req->rc->size >= c->msize) { + err = p9_parse_header(&req->rc, NULL, &type, NULL, 0); + if (req->rc.size >= c->msize) { p9_debug(P9_DEBUG_ERROR, "requested packet size too big: %d\n", - req->rc->size); + req->rc.size); return -EIO; } /* * dump the response from server * This should be after check errors which poplulate pdu_fcall. */ - trace_9p_protocol_dump(c, req->rc); + trace_9p_protocol_dump(c, &req->rc); if (err) { p9_debug(P9_DEBUG_ERROR, "couldn't parse header %d\n", err); return err; @@ -537,7 +529,7 @@ static int p9_check_errors(struct p9_client *c, struct p9_req_t *req) if (!p9_is_proto_dotl(c)) { char *ename; - err = p9pdu_readf(req->rc, c->proto_version, "s?d", + err = p9pdu_readf(&req->rc, c->proto_version, "s?d", &ename, &ecode); if (err) goto out_err; @@ -553,7 +545,7 @@ static int p9_check_errors(struct p9_client *c, struct p9_req_t *req) } kfree(ename); } else { - err = p9pdu_readf(req->rc, c->proto_version, "d", &ecode); + err = p9pdu_readf(&req->rc, c->proto_version, "d", &ecode); err = -ecode; p9_debug(P9_DEBUG_9P, "<<< RLERROR (%d)\n", -ecode); @@ -587,12 +579,12 @@ static int p9_check_zc_errors(struct p9_client *c, struct p9_req_t *req, int8_t type; char *ename = NULL; - err = p9_parse_header(req->rc, NULL, &type, NULL, 0); + err = p9_parse_header(&req->rc, NULL, &type, NULL, 0); /* * dump the response from server * This should be after parse_header which poplulate pdu_fcall. */ - trace_9p_protocol_dump(c, req->rc); + trace_9p_protocol_dump(c, &req->rc); if (err) { p9_debug(P9_DEBUG_ERROR, "couldn't parse header %d\n", err); return err; @@ -607,13 +599,13 @@ static int p9_check_zc_errors(struct p9_client *c, struct p9_req_t *req, /* 7 = header size for RERROR; */ int inline_len = in_hdrlen - 7; - len = req->rc->size - req->rc->offset; + len = req->rc.size - req->rc.offset; if (len > (P9_ZC_HDR_SZ - 7)) { err = -EFAULT; goto out_err; } - ename = &req->rc->sdata[req->rc->offset]; + ename = &req->rc.sdata[req->rc.offset]; if (len > inline_len) { /* We have error in external buffer */ if (!copy_from_iter_full(ename + inline_len, @@ -623,7 +615,7 @@ static int p9_check_zc_errors(struct p9_client *c, struct p9_req_t *req, } } ename = NULL; - err = p9pdu_readf(req->rc, c->proto_version, "s?d", + err = p9pdu_readf(&req->rc, c->proto_version, "s?d", &ename, &ecode); if (err) goto out_err; @@ -639,7 +631,7 @@ static int p9_check_zc_errors(struct p9_client *c, struct p9_req_t *req, } kfree(ename); } else { - err = p9pdu_readf(req->rc, c->proto_version, "d", &ecode); + err = p9pdu_readf(&req->rc, c->proto_version, "d", &ecode); err = -ecode; p9_debug(P9_DEBUG_9P, "<<< RLERROR (%d)\n", -ecode); @@ -672,7 +664,7 @@ static int p9_client_flush(struct p9_client *c, struct p9_req_t *oldreq) int16_t oldtag; int err; - err = p9_parse_header(oldreq->tc, NULL, NULL, &oldtag, 1); + err = p9_parse_header(&oldreq->tc, NULL, NULL, &oldtag, 1); if (err) return err; @@ -686,11 +678,12 @@ static int p9_client_flush(struct p9_client *c, struct p9_req_t *oldreq) * if we haven't received a response for oldreq, * remove it from the list */ - if (oldreq->status == REQ_STATUS_SENT) + if (oldreq->status == REQ_STATUS_SENT) { if (c->trans_mod->cancelled) c->trans_mod->cancelled(c, oldreq); + } - p9_free_req(c, req); + p9_tag_remove(c, req); return 0; } @@ -698,7 +691,7 @@ static struct p9_req_t *p9_client_prepare_req(struct p9_client *c, int8_t type, int req_size, const char *fmt, va_list ap) { - int tag, err; + int err; struct p9_req_t *req; p9_debug(P9_DEBUG_MUX, "client %p op %d\n", c, type); @@ -711,27 +704,22 @@ static struct p9_req_t *p9_client_prepare_req(struct p9_client *c, if ((c->status == BeginDisconnect) && (type != P9_TCLUNK)) return ERR_PTR(-EIO); - tag = P9_NOTAG; - if (type != P9_TVERSION) { - tag = p9_idpool_get(c->tagpool); - if (tag < 0) - return ERR_PTR(-ENOMEM); - } - - req = p9_tag_alloc(c, tag, req_size); + req = p9_tag_alloc(c, type, req_size); if (IS_ERR(req)) return req; /* marshall the data */ - p9pdu_prepare(req->tc, tag, type); - err = p9pdu_vwritef(req->tc, c->proto_version, fmt, ap); + p9pdu_prepare(&req->tc, req->tc.tag, type); + err = p9pdu_vwritef(&req->tc, c->proto_version, fmt, ap); if (err) goto reterr; - p9pdu_finalize(c, req->tc); - trace_9p_client_req(c, type, tag); + p9pdu_finalize(c, &req->tc); + trace_9p_client_req(c, type, req->tc.tag); return req; reterr: - p9_free_req(c, req); + p9_tag_remove(c, req); + /* We have to put also the 2nd reference as it won't be used */ + p9_req_put(req); return ERR_PTR(err); } @@ -741,7 +729,7 @@ reterr: * @type: type of request * @fmt: protocol format string (see protocol.c) * - * Returns request structure (which client must free using p9_free_req) + * Returns request structure (which client must free using p9_tag_remove) */ static struct p9_req_t * @@ -766,6 +754,8 @@ p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...) err = c->trans_mod->request(c, req); if (err < 0) { + /* write won't happen */ + p9_req_put(req); if (err != -ERESTARTSYS && err != -EFAULT) c->status = Disconnected; goto recalc_sigpending; @@ -813,11 +803,11 @@ recalc_sigpending: goto reterr; err = p9_check_errors(c, req); - trace_9p_client_res(c, type, req->rc->tag, err); + trace_9p_client_res(c, type, req->rc.tag, err); if (!err) return req; reterr: - p9_free_req(c, req); + p9_tag_remove(c, req); return ERR_PTR(safe_errno(err)); } @@ -832,7 +822,7 @@ reterr: * @hdrlen: reader header size, This is the size of response protocol data * @fmt: protocol format string (see protocol.c) * - * Returns request structure (which client must free using p9_free_req) + * Returns request structure (which client must free using p9_tag_remove) */ static struct p9_req_t *p9_client_zc_rpc(struct p9_client *c, int8_t type, struct iov_iter *uidata, @@ -895,11 +885,11 @@ recalc_sigpending: goto reterr; err = p9_check_zc_errors(c, req, uidata, in_hdrlen); - trace_9p_client_res(c, type, req->rc->tag, err); + trace_9p_client_res(c, type, req->rc.tag, err); if (!err) return req; reterr: - p9_free_req(c, req); + p9_tag_remove(c, req); return ERR_PTR(safe_errno(err)); } @@ -978,10 +968,10 @@ static int p9_client_version(struct p9_client *c) if (IS_ERR(req)) return PTR_ERR(req); - err = p9pdu_readf(req->rc, c->proto_version, "ds", &msize, &version); + err = p9pdu_readf(&req->rc, c->proto_version, "ds", &msize, &version); if (err) { p9_debug(P9_DEBUG_9P, "version error %d\n", err); - trace_9p_protocol_dump(c, req->rc); + trace_9p_protocol_dump(c, &req->rc); goto error; } @@ -1002,7 +992,7 @@ static int p9_client_version(struct p9_client *c) error: kfree(version); - p9_free_req(c, req); + p9_tag_remove(c, req); return err; } @@ -1020,20 +1010,18 @@ struct p9_client *p9_client_create(const char *dev_name, char *options) clnt->trans_mod = NULL; clnt->trans = NULL; + clnt->fcall_cache = NULL; client_id = utsname()->nodename; memcpy(clnt->name, client_id, strlen(client_id) + 1); spin_lock_init(&clnt->lock); idr_init(&clnt->fids); - - err = p9_tag_init(clnt); - if (err < 0) - goto free_client; + idr_init(&clnt->reqs); err = parse_opts(options, clnt); if (err < 0) - goto destroy_tagpool; + goto free_client; if (!clnt->trans_mod) clnt->trans_mod = v9fs_get_default_trans(); @@ -1042,7 +1030,7 @@ struct p9_client *p9_client_create(const char *dev_name, char *options) err = -EPROTONOSUPPORT; p9_debug(P9_DEBUG_ERROR, "No transport defined or default transport\n"); - goto destroy_tagpool; + goto free_client; } p9_debug(P9_DEBUG_MUX, "clnt %p trans %p msize %d protocol %d\n", @@ -1059,14 +1047,21 @@ struct p9_client *p9_client_create(const char *dev_name, char *options) if (err) goto close_trans; + /* P9_HDRSZ + 4 is the smallest packet header we can have that is + * followed by data accessed from userspace by read + */ + clnt->fcall_cache = + kmem_cache_create_usercopy("9p-fcall-cache", clnt->msize, + 0, 0, P9_HDRSZ + 4, + clnt->msize - (P9_HDRSZ + 4), + NULL); + return clnt; close_trans: clnt->trans_mod->close(clnt); put_trans: v9fs_put_trans(clnt->trans_mod); -destroy_tagpool: - p9_idpool_destroy(clnt->tagpool); free_client: kfree(clnt); return ERR_PTR(err); @@ -1092,6 +1087,7 @@ void p9_client_destroy(struct p9_client *clnt) p9_tag_cleanup(clnt); + kmem_cache_destroy(clnt->fcall_cache); kfree(clnt); } EXPORT_SYMBOL(p9_client_destroy); @@ -1135,10 +1131,10 @@ struct p9_fid *p9_client_attach(struct p9_client *clnt, struct p9_fid *afid, goto error; } - err = p9pdu_readf(req->rc, clnt->proto_version, "Q", &qid); + err = p9pdu_readf(&req->rc, clnt->proto_version, "Q", &qid); if (err) { - trace_9p_protocol_dump(clnt, req->rc); - p9_free_req(clnt, req); + trace_9p_protocol_dump(clnt, &req->rc); + p9_tag_remove(clnt, req); goto error; } @@ -1147,7 +1143,7 @@ struct p9_fid *p9_client_attach(struct p9_client *clnt, struct p9_fid *afid, memmove(&fid->qid, &qid, sizeof(struct p9_qid)); - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); return fid; error: @@ -1192,13 +1188,13 @@ struct p9_fid *p9_client_walk(struct p9_fid *oldfid, uint16_t nwname, goto error; } - err = p9pdu_readf(req->rc, clnt->proto_version, "R", &nwqids, &wqids); + err = p9pdu_readf(&req->rc, clnt->proto_version, "R", &nwqids, &wqids); if (err) { - trace_9p_protocol_dump(clnt, req->rc); - p9_free_req(clnt, req); + trace_9p_protocol_dump(clnt, &req->rc); + p9_tag_remove(clnt, req); goto clunk_fid; } - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); p9_debug(P9_DEBUG_9P, "<<< RWALK nwqid %d:\n", nwqids); @@ -1259,9 +1255,9 @@ int p9_client_open(struct p9_fid *fid, int mode) goto error; } - err = p9pdu_readf(req->rc, clnt->proto_version, "Qd", &qid, &iounit); + err = p9pdu_readf(&req->rc, clnt->proto_version, "Qd", &qid, &iounit); if (err) { - trace_9p_protocol_dump(clnt, req->rc); + trace_9p_protocol_dump(clnt, &req->rc); goto free_and_error; } @@ -1273,7 +1269,7 @@ int p9_client_open(struct p9_fid *fid, int mode) fid->iounit = iounit; free_and_error: - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); error: return err; } @@ -1303,9 +1299,9 @@ int p9_client_create_dotl(struct p9_fid *ofid, const char *name, u32 flags, u32 goto error; } - err = p9pdu_readf(req->rc, clnt->proto_version, "Qd", qid, &iounit); + err = p9pdu_readf(&req->rc, clnt->proto_version, "Qd", qid, &iounit); if (err) { - trace_9p_protocol_dump(clnt, req->rc); + trace_9p_protocol_dump(clnt, &req->rc); goto free_and_error; } @@ -1318,7 +1314,7 @@ int p9_client_create_dotl(struct p9_fid *ofid, const char *name, u32 flags, u32 ofid->iounit = iounit; free_and_error: - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); error: return err; } @@ -1348,9 +1344,9 @@ int p9_client_fcreate(struct p9_fid *fid, const char *name, u32 perm, int mode, goto error; } - err = p9pdu_readf(req->rc, clnt->proto_version, "Qd", &qid, &iounit); + err = p9pdu_readf(&req->rc, clnt->proto_version, "Qd", &qid, &iounit); if (err) { - trace_9p_protocol_dump(clnt, req->rc); + trace_9p_protocol_dump(clnt, &req->rc); goto free_and_error; } @@ -1363,7 +1359,7 @@ int p9_client_fcreate(struct p9_fid *fid, const char *name, u32 perm, int mode, fid->iounit = iounit; free_and_error: - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); error: return err; } @@ -1387,9 +1383,9 @@ int p9_client_symlink(struct p9_fid *dfid, const char *name, goto error; } - err = p9pdu_readf(req->rc, clnt->proto_version, "Q", qid); + err = p9pdu_readf(&req->rc, clnt->proto_version, "Q", qid); if (err) { - trace_9p_protocol_dump(clnt, req->rc); + trace_9p_protocol_dump(clnt, &req->rc); goto free_and_error; } @@ -1397,7 +1393,7 @@ int p9_client_symlink(struct p9_fid *dfid, const char *name, qid->type, (unsigned long long)qid->path, qid->version); free_and_error: - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); error: return err; } @@ -1417,7 +1413,7 @@ int p9_client_link(struct p9_fid *dfid, struct p9_fid *oldfid, const char *newna return PTR_ERR(req); p9_debug(P9_DEBUG_9P, "<<< RLINK\n"); - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); return 0; } EXPORT_SYMBOL(p9_client_link); @@ -1441,7 +1437,7 @@ int p9_client_fsync(struct p9_fid *fid, int datasync) p9_debug(P9_DEBUG_9P, "<<< RFSYNC fid %d\n", fid->fid); - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); error: return err; @@ -1476,7 +1472,7 @@ again: p9_debug(P9_DEBUG_9P, "<<< RCLUNK fid %d\n", fid->fid); - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); error: /* * Fid is not valid even after a failed clunk @@ -1510,7 +1506,7 @@ int p9_client_remove(struct p9_fid *fid) p9_debug(P9_DEBUG_9P, "<<< RREMOVE fid %d\n", fid->fid); - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); error: if (err == -ERESTARTSYS) p9_client_clunk(fid); @@ -1537,7 +1533,7 @@ int p9_client_unlinkat(struct p9_fid *dfid, const char *name, int flags) } p9_debug(P9_DEBUG_9P, "<<< RUNLINKAT fid %d %s\n", dfid->fid, name); - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); error: return err; } @@ -1585,11 +1581,11 @@ p9_client_read(struct p9_fid *fid, u64 offset, struct iov_iter *to, int *err) break; } - *err = p9pdu_readf(req->rc, clnt->proto_version, + *err = p9pdu_readf(&req->rc, clnt->proto_version, "D", &count, &dataptr); if (*err) { - trace_9p_protocol_dump(clnt, req->rc); - p9_free_req(clnt, req); + trace_9p_protocol_dump(clnt, &req->rc); + p9_tag_remove(clnt, req); break; } if (rsize < count) { @@ -1599,7 +1595,7 @@ p9_client_read(struct p9_fid *fid, u64 offset, struct iov_iter *to, int *err) p9_debug(P9_DEBUG_9P, "<<< RREAD count %d\n", count); if (!count) { - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); break; } @@ -1609,7 +1605,7 @@ p9_client_read(struct p9_fid *fid, u64 offset, struct iov_iter *to, int *err) offset += n; if (n != count) { *err = -EFAULT; - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); break; } } else { @@ -1617,7 +1613,7 @@ p9_client_read(struct p9_fid *fid, u64 offset, struct iov_iter *to, int *err) total += count; offset += count; } - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); } return total; } @@ -1658,10 +1654,10 @@ p9_client_write(struct p9_fid *fid, u64 offset, struct iov_iter *from, int *err) break; } - *err = p9pdu_readf(req->rc, clnt->proto_version, "d", &count); + *err = p9pdu_readf(&req->rc, clnt->proto_version, "d", &count); if (*err) { - trace_9p_protocol_dump(clnt, req->rc); - p9_free_req(clnt, req); + trace_9p_protocol_dump(clnt, &req->rc); + p9_tag_remove(clnt, req); break; } if (rsize < count) { @@ -1671,7 +1667,7 @@ p9_client_write(struct p9_fid *fid, u64 offset, struct iov_iter *from, int *err) p9_debug(P9_DEBUG_9P, "<<< RWRITE count %d\n", count); - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); iov_iter_advance(from, count); total += count; offset += count; @@ -1702,10 +1698,10 @@ struct p9_wstat *p9_client_stat(struct p9_fid *fid) goto error; } - err = p9pdu_readf(req->rc, clnt->proto_version, "wS", &ignored, ret); + err = p9pdu_readf(&req->rc, clnt->proto_version, "wS", &ignored, ret); if (err) { - trace_9p_protocol_dump(clnt, req->rc); - p9_free_req(clnt, req); + trace_9p_protocol_dump(clnt, &req->rc); + p9_tag_remove(clnt, req); goto error; } @@ -1722,7 +1718,7 @@ struct p9_wstat *p9_client_stat(struct p9_fid *fid) from_kgid(&init_user_ns, ret->n_gid), from_kuid(&init_user_ns, ret->n_muid)); - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); return ret; error: @@ -1755,10 +1751,10 @@ struct p9_stat_dotl *p9_client_getattr_dotl(struct p9_fid *fid, goto error; } - err = p9pdu_readf(req->rc, clnt->proto_version, "A", ret); + err = p9pdu_readf(&req->rc, clnt->proto_version, "A", ret); if (err) { - trace_9p_protocol_dump(clnt, req->rc); - p9_free_req(clnt, req); + trace_9p_protocol_dump(clnt, &req->rc); + p9_tag_remove(clnt, req); goto error; } @@ -1783,7 +1779,7 @@ struct p9_stat_dotl *p9_client_getattr_dotl(struct p9_fid *fid, ret->st_ctime_nsec, ret->st_btime_sec, ret->st_btime_nsec, ret->st_gen, ret->st_data_version); - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); return ret; error: @@ -1852,7 +1848,7 @@ int p9_client_wstat(struct p9_fid *fid, struct p9_wstat *wst) p9_debug(P9_DEBUG_9P, "<<< RWSTAT fid %d\n", fid->fid); - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); error: return err; } @@ -1884,7 +1880,7 @@ int p9_client_setattr(struct p9_fid *fid, struct p9_iattr_dotl *p9attr) goto error; } p9_debug(P9_DEBUG_9P, "<<< RSETATTR fid %d\n", fid->fid); - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); error: return err; } @@ -1907,12 +1903,12 @@ int p9_client_statfs(struct p9_fid *fid, struct p9_rstatfs *sb) goto error; } - err = p9pdu_readf(req->rc, clnt->proto_version, "ddqqqqqqd", &sb->type, - &sb->bsize, &sb->blocks, &sb->bfree, &sb->bavail, - &sb->files, &sb->ffree, &sb->fsid, &sb->namelen); + err = p9pdu_readf(&req->rc, clnt->proto_version, "ddqqqqqqd", &sb->type, + &sb->bsize, &sb->blocks, &sb->bfree, &sb->bavail, + &sb->files, &sb->ffree, &sb->fsid, &sb->namelen); if (err) { - trace_9p_protocol_dump(clnt, req->rc); - p9_free_req(clnt, req); + trace_9p_protocol_dump(clnt, &req->rc); + p9_tag_remove(clnt, req); goto error; } @@ -1923,7 +1919,7 @@ int p9_client_statfs(struct p9_fid *fid, struct p9_rstatfs *sb) sb->blocks, sb->bfree, sb->bavail, sb->files, sb->ffree, sb->fsid, (long int)sb->namelen); - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); error: return err; } @@ -1951,7 +1947,7 @@ int p9_client_rename(struct p9_fid *fid, p9_debug(P9_DEBUG_9P, "<<< RRENAME fid %d\n", fid->fid); - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); error: return err; } @@ -1981,7 +1977,7 @@ int p9_client_renameat(struct p9_fid *olddirfid, const char *old_name, p9_debug(P9_DEBUG_9P, "<<< RRENAMEAT newdirfid %d new name %s\n", newdirfid->fid, new_name); - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); error: return err; } @@ -2015,13 +2011,13 @@ struct p9_fid *p9_client_xattrwalk(struct p9_fid *file_fid, err = PTR_ERR(req); goto error; } - err = p9pdu_readf(req->rc, clnt->proto_version, "q", attr_size); + err = p9pdu_readf(&req->rc, clnt->proto_version, "q", attr_size); if (err) { - trace_9p_protocol_dump(clnt, req->rc); - p9_free_req(clnt, req); + trace_9p_protocol_dump(clnt, &req->rc); + p9_tag_remove(clnt, req); goto clunk_fid; } - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); p9_debug(P9_DEBUG_9P, "<<< RXATTRWALK fid %d size %llu\n", attr_fid->fid, *attr_size); return attr_fid; @@ -2055,7 +2051,7 @@ int p9_client_xattrcreate(struct p9_fid *fid, const char *name, goto error; } p9_debug(P9_DEBUG_9P, "<<< RXATTRCREATE fid %d\n", fid->fid); - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); error: return err; } @@ -2103,9 +2099,9 @@ int p9_client_readdir(struct p9_fid *fid, char *data, u32 count, u64 offset) goto error; } - err = p9pdu_readf(req->rc, clnt->proto_version, "D", &count, &dataptr); + err = p9pdu_readf(&req->rc, clnt->proto_version, "D", &count, &dataptr); if (err) { - trace_9p_protocol_dump(clnt, req->rc); + trace_9p_protocol_dump(clnt, &req->rc); goto free_and_error; } if (rsize < count) { @@ -2118,11 +2114,11 @@ int p9_client_readdir(struct p9_fid *fid, char *data, u32 count, u64 offset) if (non_zc) memmove(data, dataptr, count); - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); return count; free_and_error: - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); error: return err; } @@ -2144,16 +2140,16 @@ int p9_client_mknod_dotl(struct p9_fid *fid, const char *name, int mode, if (IS_ERR(req)) return PTR_ERR(req); - err = p9pdu_readf(req->rc, clnt->proto_version, "Q", qid); + err = p9pdu_readf(&req->rc, clnt->proto_version, "Q", qid); if (err) { - trace_9p_protocol_dump(clnt, req->rc); + trace_9p_protocol_dump(clnt, &req->rc); goto error; } p9_debug(P9_DEBUG_9P, "<<< RMKNOD qid %x.%llx.%x\n", qid->type, (unsigned long long)qid->path, qid->version); error: - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); return err; } @@ -2175,16 +2171,16 @@ int p9_client_mkdir_dotl(struct p9_fid *fid, const char *name, int mode, if (IS_ERR(req)) return PTR_ERR(req); - err = p9pdu_readf(req->rc, clnt->proto_version, "Q", qid); + err = p9pdu_readf(&req->rc, clnt->proto_version, "Q", qid); if (err) { - trace_9p_protocol_dump(clnt, req->rc); + trace_9p_protocol_dump(clnt, &req->rc); goto error; } p9_debug(P9_DEBUG_9P, "<<< RMKDIR qid %x.%llx.%x\n", qid->type, (unsigned long long)qid->path, qid->version); error: - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); return err; } @@ -2210,14 +2206,14 @@ int p9_client_lock_dotl(struct p9_fid *fid, struct p9_flock *flock, u8 *status) if (IS_ERR(req)) return PTR_ERR(req); - err = p9pdu_readf(req->rc, clnt->proto_version, "b", status); + err = p9pdu_readf(&req->rc, clnt->proto_version, "b", status); if (err) { - trace_9p_protocol_dump(clnt, req->rc); + trace_9p_protocol_dump(clnt, &req->rc); goto error; } p9_debug(P9_DEBUG_9P, "<<< RLOCK status %i\n", *status); error: - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); return err; } @@ -2241,18 +2237,18 @@ int p9_client_getlock_dotl(struct p9_fid *fid, struct p9_getlock *glock) if (IS_ERR(req)) return PTR_ERR(req); - err = p9pdu_readf(req->rc, clnt->proto_version, "bqqds", &glock->type, - &glock->start, &glock->length, &glock->proc_id, - &glock->client_id); + err = p9pdu_readf(&req->rc, clnt->proto_version, "bqqds", &glock->type, + &glock->start, &glock->length, &glock->proc_id, + &glock->client_id); if (err) { - trace_9p_protocol_dump(clnt, req->rc); + trace_9p_protocol_dump(clnt, &req->rc); goto error; } p9_debug(P9_DEBUG_9P, "<<< RGETLOCK type %i start %lld length %lld " "proc_id %d client_id %s\n", glock->type, glock->start, glock->length, glock->proc_id, glock->client_id); error: - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); return err; } EXPORT_SYMBOL(p9_client_getlock_dotl); @@ -2271,14 +2267,25 @@ int p9_client_readlink(struct p9_fid *fid, char **target) if (IS_ERR(req)) return PTR_ERR(req); - err = p9pdu_readf(req->rc, clnt->proto_version, "s", target); + err = p9pdu_readf(&req->rc, clnt->proto_version, "s", target); if (err) { - trace_9p_protocol_dump(clnt, req->rc); + trace_9p_protocol_dump(clnt, &req->rc); goto error; } p9_debug(P9_DEBUG_9P, "<<< RREADLINK target %s\n", *target); error: - p9_free_req(clnt, req); + p9_tag_remove(clnt, req); return err; } EXPORT_SYMBOL(p9_client_readlink); + +int __init p9_client_init(void) +{ + p9_req_cache = KMEM_CACHE(p9_req_t, SLAB_TYPESAFE_BY_RCU); + return p9_req_cache ? 0 : -ENOMEM; +} + +void __exit p9_client_exit(void) +{ + kmem_cache_destroy(p9_req_cache); +} diff --git a/net/9p/mod.c b/net/9p/mod.c index 253ba824a325..0da56d6af73b 100644 --- a/net/9p/mod.c +++ b/net/9p/mod.c @@ -171,11 +171,17 @@ void v9fs_put_trans(struct p9_trans_module *m) */ static int __init init_p9(void) { + int ret; + + ret = p9_client_init(); + if (ret) + return ret; + p9_error_init(); pr_info("Installing 9P2000 support\n"); p9_trans_fd_init(); - return 0; + return ret; } /** @@ -188,6 +194,7 @@ static void __exit exit_p9(void) pr_info("Unloading 9P2000 support\n"); p9_trans_fd_exit(); + p9_client_exit(); } module_init(init_p9) diff --git a/net/9p/protocol.c b/net/9p/protocol.c index 4a1e1dd30b52..462ba144cb39 100644 --- a/net/9p/protocol.c +++ b/net/9p/protocol.c @@ -46,10 +46,15 @@ p9pdu_writef(struct p9_fcall *pdu, int proto_version, const char *fmt, ...); void p9stat_free(struct p9_wstat *stbuf) { kfree(stbuf->name); + stbuf->name = NULL; kfree(stbuf->uid); + stbuf->uid = NULL; kfree(stbuf->gid); + stbuf->gid = NULL; kfree(stbuf->muid); + stbuf->muid = NULL; kfree(stbuf->extension); + stbuf->extension = NULL; } EXPORT_SYMBOL(p9stat_free); @@ -566,9 +571,10 @@ int p9stat_read(struct p9_client *clnt, char *buf, int len, struct p9_wstat *st) if (ret) { p9_debug(P9_DEBUG_9P, "<<< p9stat_read failed: %d\n", ret); trace_9p_protocol_dump(clnt, &fake_pdu); + return ret; } - return ret; + return fake_pdu.offset; } EXPORT_SYMBOL(p9stat_read); @@ -617,13 +623,19 @@ int p9dirent_read(struct p9_client *clnt, char *buf, int len, if (ret) { p9_debug(P9_DEBUG_9P, "<<< p9dirent_read failed: %d\n", ret); trace_9p_protocol_dump(clnt, &fake_pdu); - goto out; + return ret; } - strcpy(dirent->d_name, nameptr); + ret = strscpy(dirent->d_name, nameptr, sizeof(dirent->d_name)); + if (ret < 0) { + p9_debug(P9_DEBUG_ERROR, + "On the wire dirent name too long: %s\n", + nameptr); + kfree(nameptr); + return ret; + } kfree(nameptr); -out: return fake_pdu.offset; } EXPORT_SYMBOL(p9dirent_read); diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index e2ef3c782c53..f868cf6fba79 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -131,7 +131,8 @@ struct p9_conn { int err; struct list_head req_list; struct list_head unsent_req_list; - struct p9_req_t *req; + struct p9_req_t *rreq; + struct p9_req_t *wreq; char tmp_buf[7]; struct p9_fcall rc; int wpos; @@ -291,7 +292,6 @@ static void p9_read_work(struct work_struct *work) __poll_t n; int err; struct p9_conn *m; - int status = REQ_STATUS_ERROR; m = container_of(work, struct p9_conn, rq); @@ -322,7 +322,7 @@ static void p9_read_work(struct work_struct *work) m->rc.offset += err; /* header read in */ - if ((!m->req) && (m->rc.offset == m->rc.capacity)) { + if ((!m->rreq) && (m->rc.offset == m->rc.capacity)) { p9_debug(P9_DEBUG_TRANS, "got new header\n"); /* Header size */ @@ -346,23 +346,23 @@ static void p9_read_work(struct work_struct *work) "mux %p pkt: size: %d bytes tag: %d\n", m, m->rc.size, m->rc.tag); - m->req = p9_tag_lookup(m->client, m->rc.tag); - if (!m->req || (m->req->status != REQ_STATUS_SENT)) { + m->rreq = p9_tag_lookup(m->client, m->rc.tag); + if (!m->rreq || (m->rreq->status != REQ_STATUS_SENT)) { p9_debug(P9_DEBUG_ERROR, "Unexpected packet tag %d\n", m->rc.tag); err = -EIO; goto error; } - if (m->req->rc == NULL) { + if (!m->rreq->rc.sdata) { p9_debug(P9_DEBUG_ERROR, "No recv fcall for tag %d (req %p), disconnecting!\n", - m->rc.tag, m->req); - m->req = NULL; + m->rc.tag, m->rreq); + m->rreq = NULL; err = -EIO; goto error; } - m->rc.sdata = (char *)m->req->rc + sizeof(struct p9_fcall); + m->rc.sdata = m->rreq->rc.sdata; memcpy(m->rc.sdata, m->tmp_buf, m->rc.capacity); m->rc.capacity = m->rc.size; } @@ -370,20 +370,27 @@ static void p9_read_work(struct work_struct *work) /* packet is read in * not an else because some packets (like clunk) have no payload */ - if ((m->req) && (m->rc.offset == m->rc.capacity)) { + if ((m->rreq) && (m->rc.offset == m->rc.capacity)) { p9_debug(P9_DEBUG_TRANS, "got new packet\n"); - m->req->rc->size = m->rc.offset; + m->rreq->rc.size = m->rc.offset; spin_lock(&m->client->lock); - if (m->req->status != REQ_STATUS_ERROR) - status = REQ_STATUS_RCVD; - list_del(&m->req->req_list); - /* update req->status while holding client->lock */ - p9_client_cb(m->client, m->req, status); + if (m->rreq->status == REQ_STATUS_SENT) { + list_del(&m->rreq->req_list); + p9_client_cb(m->client, m->rreq, REQ_STATUS_RCVD); + } else { + spin_unlock(&m->client->lock); + p9_debug(P9_DEBUG_ERROR, + "Request tag %d errored out while we were reading the reply\n", + m->rc.tag); + err = -EIO; + goto error; + } spin_unlock(&m->client->lock); m->rc.sdata = NULL; m->rc.offset = 0; m->rc.capacity = 0; - m->req = NULL; + p9_req_put(m->rreq); + m->rreq = NULL; } end_clear: @@ -469,9 +476,11 @@ static void p9_write_work(struct work_struct *work) p9_debug(P9_DEBUG_TRANS, "move req %p\n", req); list_move_tail(&req->req_list, &m->req_list); - m->wbuf = req->tc->sdata; - m->wsize = req->tc->size; + m->wbuf = req->tc.sdata; + m->wsize = req->tc.size; m->wpos = 0; + p9_req_get(req); + m->wreq = req; spin_unlock(&m->client->lock); } @@ -492,8 +501,11 @@ static void p9_write_work(struct work_struct *work) } m->wpos += err; - if (m->wpos == m->wsize) + if (m->wpos == m->wsize) { m->wpos = m->wsize = 0; + p9_req_put(m->wreq); + m->wreq = NULL; + } end_clear: clear_bit(Wworksched, &m->wsched); @@ -663,7 +675,7 @@ static int p9_fd_request(struct p9_client *client, struct p9_req_t *req) struct p9_conn *m = &ts->conn; p9_debug(P9_DEBUG_TRANS, "mux %p task %p tcall %p id %d\n", - m, current, req->tc, req->tc->id); + m, current, &req->tc, req->tc.id); if (m->err < 0) return m->err; @@ -694,6 +706,7 @@ static int p9_fd_cancel(struct p9_client *client, struct p9_req_t *req) if (req->status == REQ_STATUS_UNSENT) { list_del(&req->req_list); req->status = REQ_STATUS_FLSHD; + p9_req_put(req); ret = 0; } spin_unlock(&client->lock); @@ -711,6 +724,7 @@ static int p9_fd_cancelled(struct p9_client *client, struct p9_req_t *req) spin_lock(&client->lock); list_del(&req->req_list); spin_unlock(&client->lock); + p9_req_put(req); return 0; } @@ -862,7 +876,15 @@ static void p9_conn_destroy(struct p9_conn *m) p9_mux_poll_stop(m); cancel_work_sync(&m->rq); + if (m->rreq) { + p9_req_put(m->rreq); + m->rreq = NULL; + } cancel_work_sync(&m->wq); + if (m->wreq) { + p9_req_put(m->wreq); + m->wreq = NULL; + } p9_conn_cancel(m, -ECONNRESET); diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c index b513cffeeb3c..119103bfa82e 100644 --- a/net/9p/trans_rdma.c +++ b/net/9p/trans_rdma.c @@ -122,7 +122,7 @@ struct p9_rdma_context { dma_addr_t busa; union { struct p9_req_t *req; - struct p9_fcall *rc; + struct p9_fcall rc; }; }; @@ -274,8 +274,7 @@ p9_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) case RDMA_CM_EVENT_DISCONNECTED: if (rdma) rdma->state = P9_RDMA_CLOSED; - if (c) - c->status = Disconnected; + c->status = Disconnected; break; case RDMA_CM_EVENT_TIMEWAIT_EXIT: @@ -320,8 +319,8 @@ recv_done(struct ib_cq *cq, struct ib_wc *wc) if (wc->status != IB_WC_SUCCESS) goto err_out; - c->rc->size = wc->byte_len; - err = p9_parse_header(c->rc, NULL, NULL, &tag, 1); + c->rc.size = wc->byte_len; + err = p9_parse_header(&c->rc, NULL, NULL, &tag, 1); if (err) goto err_out; @@ -331,12 +330,13 @@ recv_done(struct ib_cq *cq, struct ib_wc *wc) /* Check that we have not yet received a reply for this request. */ - if (unlikely(req->rc)) { + if (unlikely(req->rc.sdata)) { pr_err("Duplicate reply for request %d", tag); goto err_out; } - req->rc = c->rc; + req->rc.size = c->rc.size; + req->rc.sdata = c->rc.sdata; p9_client_cb(client, req, REQ_STATUS_RCVD); out: @@ -361,9 +361,10 @@ send_done(struct ib_cq *cq, struct ib_wc *wc) container_of(wc->wr_cqe, struct p9_rdma_context, cqe); ib_dma_unmap_single(rdma->cm_id->device, - c->busa, c->req->tc->size, + c->busa, c->req->tc.size, DMA_TO_DEVICE); up(&rdma->sq_sem); + p9_req_put(c->req); kfree(c); } @@ -401,7 +402,7 @@ post_recv(struct p9_client *client, struct p9_rdma_context *c) struct ib_sge sge; c->busa = ib_dma_map_single(rdma->cm_id->device, - c->rc->sdata, client->msize, + c->rc.sdata, client->msize, DMA_FROM_DEVICE); if (ib_dma_mapping_error(rdma->cm_id->device, c->busa)) goto error; @@ -443,9 +444,9 @@ static int rdma_request(struct p9_client *client, struct p9_req_t *req) **/ if (unlikely(atomic_read(&rdma->excess_rc) > 0)) { if ((atomic_sub_return(1, &rdma->excess_rc) >= 0)) { - /* Got one ! */ - kfree(req->rc); - req->rc = NULL; + /* Got one! */ + p9_fcall_fini(&req->rc); + req->rc.sdata = NULL; goto dont_need_post_recv; } else { /* We raced and lost. */ @@ -459,7 +460,7 @@ static int rdma_request(struct p9_client *client, struct p9_req_t *req) err = -ENOMEM; goto recv_error; } - rpl_context->rc = req->rc; + rpl_context->rc.sdata = req->rc.sdata; /* * Post a receive buffer for this request. We need to ensure @@ -475,11 +476,11 @@ static int rdma_request(struct p9_client *client, struct p9_req_t *req) err = post_recv(client, rpl_context); if (err) { - p9_debug(P9_DEBUG_FCALL, "POST RECV failed\n"); + p9_debug(P9_DEBUG_ERROR, "POST RECV failed: %d\n", err); goto recv_error; } /* remove posted receive buffer from request structure */ - req->rc = NULL; + req->rc.sdata = NULL; dont_need_post_recv: /* Post the request */ @@ -491,7 +492,7 @@ dont_need_post_recv: c->req = req; c->busa = ib_dma_map_single(rdma->cm_id->device, - c->req->tc->sdata, c->req->tc->size, + c->req->tc.sdata, c->req->tc.size, DMA_TO_DEVICE); if (ib_dma_mapping_error(rdma->cm_id->device, c->busa)) { err = -EIO; @@ -501,7 +502,7 @@ dont_need_post_recv: c->cqe.done = send_done; sge.addr = c->busa; - sge.length = c->req->tc->size; + sge.length = c->req->tc.size; sge.lkey = rdma->pd->local_dma_lkey; wr.next = NULL; @@ -544,7 +545,7 @@ dont_need_post_recv: recv_error: kfree(rpl_context); spin_lock_irqsave(&rdma->req_lock, flags); - if (rdma->state < P9_RDMA_CLOSING) { + if (err != -EINTR && rdma->state < P9_RDMA_CLOSING) { rdma->state = P9_RDMA_CLOSING; spin_unlock_irqrestore(&rdma->req_lock, flags); rdma_disconnect(rdma->cm_id); diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index 7728b0acde09..eb596c2ed546 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -155,7 +155,7 @@ static void req_done(struct virtqueue *vq) } if (len) { - req->rc->size = len; + req->rc.size = len; p9_client_cb(chan->client, req, REQ_STATUS_RCVD); } } @@ -207,6 +207,13 @@ static int p9_virtio_cancel(struct p9_client *client, struct p9_req_t *req) return 1; } +/* Reply won't come, so drop req ref */ +static int p9_virtio_cancelled(struct p9_client *client, struct p9_req_t *req) +{ + p9_req_put(req); + return 0; +} + /** * pack_sg_list_p - Just like pack_sg_list. Instead of taking a buffer, * this takes a list of pages. @@ -273,12 +280,12 @@ req_retry: out_sgs = in_sgs = 0; /* Handle out VirtIO ring buffers */ out = pack_sg_list(chan->sg, 0, - VIRTQUEUE_NUM, req->tc->sdata, req->tc->size); + VIRTQUEUE_NUM, req->tc.sdata, req->tc.size); if (out) sgs[out_sgs++] = chan->sg; in = pack_sg_list(chan->sg, out, - VIRTQUEUE_NUM, req->rc->sdata, req->rc->capacity); + VIRTQUEUE_NUM, req->rc.sdata, req->rc.capacity); if (in) sgs[out_sgs + in_sgs++] = chan->sg + out; @@ -404,6 +411,7 @@ p9_virtio_zc_request(struct p9_client *client, struct p9_req_t *req, struct scatterlist *sgs[4]; size_t offs; int need_drop = 0; + int kicked = 0; p9_debug(P9_DEBUG_TRANS, "virtio request\n"); @@ -411,29 +419,33 @@ p9_virtio_zc_request(struct p9_client *client, struct p9_req_t *req, __le32 sz; int n = p9_get_mapped_pages(chan, &out_pages, uodata, outlen, &offs, &need_drop); - if (n < 0) - return n; + if (n < 0) { + err = n; + goto err_out; + } out_nr_pages = DIV_ROUND_UP(n + offs, PAGE_SIZE); if (n != outlen) { __le32 v = cpu_to_le32(n); - memcpy(&req->tc->sdata[req->tc->size - 4], &v, 4); + memcpy(&req->tc.sdata[req->tc.size - 4], &v, 4); outlen = n; } /* The size field of the message must include the length of the * header and the length of the data. We didn't actually know * the length of the data until this point so add it in now. */ - sz = cpu_to_le32(req->tc->size + outlen); - memcpy(&req->tc->sdata[0], &sz, sizeof(sz)); + sz = cpu_to_le32(req->tc.size + outlen); + memcpy(&req->tc.sdata[0], &sz, sizeof(sz)); } else if (uidata) { int n = p9_get_mapped_pages(chan, &in_pages, uidata, inlen, &offs, &need_drop); - if (n < 0) - return n; + if (n < 0) { + err = n; + goto err_out; + } in_nr_pages = DIV_ROUND_UP(n + offs, PAGE_SIZE); if (n != inlen) { __le32 v = cpu_to_le32(n); - memcpy(&req->tc->sdata[req->tc->size - 4], &v, 4); + memcpy(&req->tc.sdata[req->tc.size - 4], &v, 4); inlen = n; } } @@ -445,7 +457,7 @@ req_retry_pinned: /* out data */ out = pack_sg_list(chan->sg, 0, - VIRTQUEUE_NUM, req->tc->sdata, req->tc->size); + VIRTQUEUE_NUM, req->tc.sdata, req->tc.size); if (out) sgs[out_sgs++] = chan->sg; @@ -464,7 +476,7 @@ req_retry_pinned: * alloced memory and payload onto the user buffer. */ in = pack_sg_list(chan->sg, out, - VIRTQUEUE_NUM, req->rc->sdata, in_hdr_len); + VIRTQUEUE_NUM, req->rc.sdata, in_hdr_len); if (in) sgs[out_sgs + in_sgs++] = chan->sg + out; @@ -498,6 +510,7 @@ req_retry_pinned: } virtqueue_kick(chan->vq); spin_unlock_irqrestore(&chan->lock, flags); + kicked = 1; p9_debug(P9_DEBUG_TRANS, "virtio request kicked\n"); err = wait_event_killable(req->wq, req->status >= REQ_STATUS_RCVD); /* @@ -518,6 +531,10 @@ err_out: } kvfree(in_pages); kvfree(out_pages); + if (!kicked) { + /* reply won't come */ + p9_req_put(req); + } return err; } @@ -750,6 +767,7 @@ static struct p9_trans_module p9_virtio_trans = { .request = p9_virtio_request, .zc_request = p9_virtio_zc_request, .cancel = p9_virtio_cancel, + .cancelled = p9_virtio_cancelled, /* * We leave one entry for input and one entry for response * headers. We also skip one more entry to accomodate, address diff --git a/net/9p/trans_xen.c b/net/9p/trans_xen.c index c2d54ac76bfd..e2fbf3677b9b 100644 --- a/net/9p/trans_xen.c +++ b/net/9p/trans_xen.c @@ -141,7 +141,7 @@ static int p9_xen_request(struct p9_client *client, struct p9_req_t *p9_req) struct xen_9pfs_front_priv *priv = NULL; RING_IDX cons, prod, masked_cons, masked_prod; unsigned long flags; - u32 size = p9_req->tc->size; + u32 size = p9_req->tc.size; struct xen_9pfs_dataring *ring; int num; @@ -154,7 +154,7 @@ static int p9_xen_request(struct p9_client *client, struct p9_req_t *p9_req) if (!priv || priv->client != client) return -EINVAL; - num = p9_req->tc->tag % priv->num_rings; + num = p9_req->tc.tag % priv->num_rings; ring = &priv->rings[num]; again: @@ -176,7 +176,7 @@ again: masked_prod = xen_9pfs_mask(prod, XEN_9PFS_RING_SIZE); masked_cons = xen_9pfs_mask(cons, XEN_9PFS_RING_SIZE); - xen_9pfs_write_packet(ring->data.out, p9_req->tc->sdata, size, + xen_9pfs_write_packet(ring->data.out, p9_req->tc.sdata, size, &masked_prod, masked_cons, XEN_9PFS_RING_SIZE); p9_req->status = REQ_STATUS_SENT; @@ -185,6 +185,7 @@ again: ring->intf->out_prod = prod; spin_unlock_irqrestore(&ring->lock, flags); notify_remote_via_irq(ring->irq); + p9_req_put(p9_req); return 0; } @@ -229,12 +230,12 @@ static void p9_xen_response(struct work_struct *work) continue; } - memcpy(req->rc, &h, sizeof(h)); - req->rc->offset = 0; + memcpy(&req->rc, &h, sizeof(h)); + req->rc.offset = 0; masked_cons = xen_9pfs_mask(cons, XEN_9PFS_RING_SIZE); /* Then, read the whole packet (including the header) */ - xen_9pfs_read_packet(req->rc->sdata, ring->data.in, h.size, + xen_9pfs_read_packet(req->rc.sdata, ring->data.in, h.size, masked_prod, &masked_cons, XEN_9PFS_RING_SIZE); @@ -391,8 +392,8 @@ static int xen_9pfs_front_probe(struct xenbus_device *dev, unsigned int max_rings, max_ring_order, len = 0; versions = xenbus_read(XBT_NIL, dev->otherend, "versions", &len); - if (!len) - return -EINVAL; + if (IS_ERR(versions)) + return PTR_ERR(versions); if (strcmp(versions, "1")) { kfree(versions); return -EINVAL; diff --git a/net/9p/util.c b/net/9p/util.c deleted file mode 100644 index 55ad98277e85..000000000000 --- a/net/9p/util.c +++ /dev/null @@ -1,140 +0,0 @@ -/* - * net/9p/util.c - * - * This file contains some helper functions - * - * Copyright (C) 2007 by Latchesar Ionkov <lucho@ionkov.net> - * Copyright (C) 2004 by Eric Van Hensbergen <ericvh@gmail.com> - * Copyright (C) 2002 by Ron Minnich <rminnich@lanl.gov> - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write to: - * Free Software Foundation - * 51 Franklin Street, Fifth Floor - * Boston, MA 02111-1301 USA - * - */ - -#include <linux/module.h> -#include <linux/errno.h> -#include <linux/fs.h> -#include <linux/sched.h> -#include <linux/parser.h> -#include <linux/idr.h> -#include <linux/slab.h> -#include <net/9p/9p.h> - -/** - * struct p9_idpool - per-connection accounting for tag idpool - * @lock: protects the pool - * @pool: idr to allocate tag id from - * - */ - -struct p9_idpool { - spinlock_t lock; - struct idr pool; -}; - -/** - * p9_idpool_create - create a new per-connection id pool - * - */ - -struct p9_idpool *p9_idpool_create(void) -{ - struct p9_idpool *p; - - p = kmalloc(sizeof(struct p9_idpool), GFP_KERNEL); - if (!p) - return ERR_PTR(-ENOMEM); - - spin_lock_init(&p->lock); - idr_init(&p->pool); - - return p; -} -EXPORT_SYMBOL(p9_idpool_create); - -/** - * p9_idpool_destroy - create a new per-connection id pool - * @p: idpool to destroy - */ - -void p9_idpool_destroy(struct p9_idpool *p) -{ - idr_destroy(&p->pool); - kfree(p); -} -EXPORT_SYMBOL(p9_idpool_destroy); - -/** - * p9_idpool_get - allocate numeric id from pool - * @p: pool to allocate from - * - * Bugs: This seems to be an awful generic function, should it be in idr.c with - * the lock included in struct idr? - */ - -int p9_idpool_get(struct p9_idpool *p) -{ - int i; - unsigned long flags; - - idr_preload(GFP_NOFS); - spin_lock_irqsave(&p->lock, flags); - - /* no need to store exactly p, we just need something non-null */ - i = idr_alloc(&p->pool, p, 0, 0, GFP_NOWAIT); - - spin_unlock_irqrestore(&p->lock, flags); - idr_preload_end(); - if (i < 0) - return -1; - - p9_debug(P9_DEBUG_MUX, " id %d pool %p\n", i, p); - return i; -} -EXPORT_SYMBOL(p9_idpool_get); - -/** - * p9_idpool_put - release numeric id from pool - * @id: numeric id which is being released - * @p: pool to release id into - * - * Bugs: This seems to be an awful generic function, should it be in idr.c with - * the lock included in struct idr? - */ - -void p9_idpool_put(int id, struct p9_idpool *p) -{ - unsigned long flags; - - p9_debug(P9_DEBUG_MUX, " id %d pool %p\n", id, p); - - spin_lock_irqsave(&p->lock, flags); - idr_remove(&p->pool, id); - spin_unlock_irqrestore(&p->lock, flags); -} -EXPORT_SYMBOL(p9_idpool_put); - -/** - * p9_idpool_check - check if the specified id is available - * @id: id to check - * @p: pool to check - */ - -int p9_idpool_check(int id, struct p9_idpool *p) -{ - return idr_find(&p->pool, id) != NULL; -} -EXPORT_SYMBOL(p9_idpool_check); diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index 024139b51d3a..6bac0d6b7b94 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -1422,7 +1422,14 @@ static void br_multicast_query_received(struct net_bridge *br, return; br_multicast_update_query_timer(br, query, max_delay); - br_multicast_mark_router(br, port); + + /* Based on RFC4541, section 2.1.1 IGMP Forwarding Rules, + * the arrival port for IGMP Queries where the source address + * is 0.0.0.0 should not be added to router port list. + */ + if ((saddr->proto == htons(ETH_P_IP) && saddr->u.ip4) || + saddr->proto == htons(ETH_P_IPV6)) + br_multicast_mark_router(br, port); } static void br_ip4_multicast_query(struct net_bridge *br, diff --git a/net/core/datagram.c b/net/core/datagram.c index 6a034eb538a1..57f3a6fcfc1e 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -808,8 +808,9 @@ int skb_copy_and_csum_datagram_msg(struct sk_buff *skb, return -EINVAL; } - if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE)) - netdev_rx_csum_fault(skb->dev); + if (unlikely(skb->ip_summed == CHECKSUM_COMPLETE) && + !skb->csum_complete_sw) + netdev_rx_csum_fault(NULL); } return 0; fault: diff --git a/net/core/dev.c b/net/core/dev.c index 022ad73d6253..77d43ae2a7bb 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5457,7 +5457,7 @@ static void gro_flush_oldest(struct list_head *head) /* Do not adjust napi->gro_hash[].count, caller is adding a new * SKB to the chain. */ - list_del(&oldest->list); + skb_list_del_init(oldest); napi_gro_complete(oldest); } diff --git a/net/core/filter.c b/net/core/filter.c index 35c6933c2622..e521c5ebc7d1 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -5264,8 +5264,6 @@ sk_msg_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_msg_pull_data_proto; case BPF_FUNC_msg_push_data: return &bpf_msg_push_data_proto; - case BPF_FUNC_get_local_storage: - return &bpf_get_local_storage_proto; default: return bpf_base_func_proto(func_id); } @@ -5296,8 +5294,6 @@ sk_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sk_redirect_map_proto; case BPF_FUNC_sk_redirect_hash: return &bpf_sk_redirect_hash_proto; - case BPF_FUNC_get_local_storage: - return &bpf_get_local_storage_proto; #ifdef CONFIG_INET case BPF_FUNC_sk_lookup_tcp: return &bpf_sk_lookup_tcp_proto; @@ -5496,7 +5492,13 @@ static bool cg_skb_is_valid_access(int off, int size, case bpf_ctx_range(struct __sk_buff, data_meta): case bpf_ctx_range(struct __sk_buff, flow_keys): return false; + case bpf_ctx_range(struct __sk_buff, data): + case bpf_ctx_range(struct __sk_buff, data_end): + if (!capable(CAP_SYS_ADMIN)) + return false; + break; } + if (type == BPF_WRITE) { switch (off) { case bpf_ctx_range(struct __sk_buff, mark): @@ -5638,6 +5640,15 @@ static bool sock_filter_is_valid_access(int off, int size, prog->expected_attach_type); } +static int bpf_noop_prologue(struct bpf_insn *insn_buf, bool direct_write, + const struct bpf_prog *prog) +{ + /* Neither direct read nor direct write requires any preliminary + * action. + */ + return 0; +} + static int bpf_unclone_prologue(struct bpf_insn *insn_buf, bool direct_write, const struct bpf_prog *prog, int drop_verdict) { @@ -7204,6 +7215,7 @@ const struct bpf_verifier_ops xdp_verifier_ops = { .get_func_proto = xdp_func_proto, .is_valid_access = xdp_is_valid_access, .convert_ctx_access = xdp_convert_ctx_access, + .gen_prologue = bpf_noop_prologue, }; const struct bpf_prog_ops xdp_prog_ops = { @@ -7302,6 +7314,7 @@ const struct bpf_verifier_ops sk_msg_verifier_ops = { .get_func_proto = sk_msg_func_proto, .is_valid_access = sk_msg_is_valid_access, .convert_ctx_access = sk_msg_convert_ctx_access, + .gen_prologue = bpf_noop_prologue, }; const struct bpf_prog_ops sk_msg_prog_ops = { diff --git a/net/core/neighbour.c b/net/core/neighbour.c index ee605d9d8bd4..41954e42a2de 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -2364,7 +2364,7 @@ static bool neigh_master_filtered(struct net_device *dev, int master_idx) if (!master_idx) return false; - master = netdev_master_upper_dev_get(dev); + master = dev ? netdev_master_upper_dev_get(dev) : NULL; if (!master || master->ifindex != master_idx) return true; @@ -2373,7 +2373,7 @@ static bool neigh_master_filtered(struct net_device *dev, int master_idx) static bool neigh_ifindex_filtered(struct net_device *dev, int filter_idx) { - if (filter_idx && dev->ifindex != filter_idx) + if (filter_idx && (!dev || dev->ifindex != filter_idx)) return true; return false; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index 0958c7be2c22..f679c7a7d761 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -3333,6 +3333,7 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb) int idx; int s_idx = cb->family; int type = cb->nlh->nlmsg_type - RTM_BASE; + int ret = 0; if (s_idx == 0) s_idx = 1; @@ -3365,12 +3366,13 @@ static int rtnl_dump_all(struct sk_buff *skb, struct netlink_callback *cb) cb->prev_seq = 0; cb->seq = 0; } - if (dumpit(skb, cb)) + ret = dumpit(skb, cb); + if (ret < 0) break; } cb->family = idx; - return skb->len; + return skb->len ? : ret; } struct sk_buff *rtmsg_ifinfo_build_skb(int type, struct net_device *dev, diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index b1a2c5e38530..37b4667128a3 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -279,7 +279,6 @@ static int proc_dointvec_minmax_bpf_enable(struct ctl_table *table, int write, return ret; } -# ifdef CONFIG_HAVE_EBPF_JIT static int proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, @@ -290,7 +289,6 @@ proc_dointvec_minmax_bpf_restricted(struct ctl_table *table, int write, return proc_dointvec_minmax(table, write, buffer, lenp, ppos); } -# endif #endif static struct ctl_table net_core_table[] = { @@ -397,6 +395,14 @@ static struct ctl_table net_core_table[] = { .extra2 = &one, }, # endif + { + .procname = "bpf_jit_limit", + .data = &bpf_jit_limit, + .maxlen = sizeof(int), + .mode = 0600, + .proc_handler = proc_dointvec_minmax_bpf_restricted, + .extra1 = &one, + }, #endif { .procname = "netdev_tstamp_prequeue", diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c index 63d5b58fbfdb..a34602ae27de 100644 --- a/net/ipv4/devinet.c +++ b/net/ipv4/devinet.c @@ -1704,6 +1704,7 @@ static int inet_valid_dump_ifaddr_req(const struct nlmsghdr *nlh, net = rtnl_get_net_ns_capable(sk, fillargs->netnsid); if (IS_ERR(net)) { + fillargs->netnsid = -1; NL_SET_ERR_MSG(extack, "ipv4: Invalid target network namespace id"); return PTR_ERR(net); } @@ -1761,7 +1762,7 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) struct net_device *dev; struct in_device *in_dev; struct hlist_head *head; - int err; + int err = 0; s_h = cb->args[0]; s_idx = idx = cb->args[1]; @@ -1771,12 +1772,15 @@ static int inet_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) err = inet_valid_dump_ifaddr_req(nlh, &fillargs, &tgt_net, skb->sk, cb); if (err < 0) - return err; + goto put_tgt_net; + err = 0; if (fillargs.ifindex) { dev = __dev_get_by_index(tgt_net, fillargs.ifindex); - if (!dev) - return -ENODEV; + if (!dev) { + err = -ENODEV; + goto put_tgt_net; + } in_dev = __in_dev_get_rtnl(dev); if (in_dev) { @@ -1821,7 +1825,7 @@ put_tgt_net: if (fillargs.netnsid >= 0) put_net(tgt_net); - return skb->len; + return err < 0 ? err : skb->len; } static void rtmsg_ifa(int event, struct in_ifaddr *ifa, struct nlmsghdr *nlh, diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 5bf653f36911..6df95be96311 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -829,6 +829,7 @@ int ip_valid_fib_dump_req(struct net *net, const struct nlmsghdr *nlh, return -EINVAL; } + filter->dump_all_families = (rtm->rtm_family == AF_UNSPEC); filter->flags = rtm->rtm_flags; filter->protocol = rtm->rtm_protocol; filter->rt_type = rtm->rtm_type; @@ -899,6 +900,9 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) if (filter.table_id) { tb = fib_get_table(net, filter.table_id); if (!tb) { + if (filter.dump_all_families) + return skb->len; + NL_SET_ERR_MSG(cb->extack, "ipv4: FIB table does not exist"); return -ENOENT; } diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 7a3e2acda94c..a6defbec4f1b 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -2542,6 +2542,9 @@ static int ipmr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb) mrt = ipmr_get_table(sock_net(skb->sk), filter.table_id); if (!mrt) { + if (filter.dump_all_families) + return skb->len; + NL_SET_ERR_MSG(cb->extack, "ipv4: MR table does not exist"); return -ENOENT; } diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index cf8252d05a01..ca3ed931f2a9 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -609,8 +609,8 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) struct net *net = dev_net(skb->dev); sk = __udp4_lib_lookup(net, iph->daddr, uh->dest, - iph->saddr, uh->source, skb->dev->ifindex, 0, - udptable, NULL); + iph->saddr, uh->source, skb->dev->ifindex, + inet_sdif(skb), udptable, NULL); if (!sk) { __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); return; /* No socket for error */ @@ -2120,8 +2120,24 @@ static inline int udp4_csum_init(struct sk_buff *skb, struct udphdr *uh, /* Note, we are only interested in != 0 or == 0, thus the * force to int. */ - return (__force int)skb_checksum_init_zero_check(skb, proto, uh->check, - inet_compute_pseudo); + err = (__force int)skb_checksum_init_zero_check(skb, proto, uh->check, + inet_compute_pseudo); + if (err) + return err; + + if (skb->ip_summed == CHECKSUM_COMPLETE && !skb->csum_valid) { + /* If SW calculated the value, we know it's bad */ + if (skb->csum_complete_sw) + return 1; + + /* HW says the value is bad. Let's validate that. + * skb->csum is no longer the full packet checksum, + * so don't treat it as such. + */ + skb_checksum_complete_unset(skb); + } + + return 0; } /* wrapper for udp_queue_rcv_skb tacking care of csum conversion and diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c index d9ad986c7b2c..5cbb9be05295 100644 --- a/net/ipv4/udp_diag.c +++ b/net/ipv4/udp_diag.c @@ -42,6 +42,7 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb, rcu_read_lock(); if (req->sdiag_family == AF_INET) + /* src and dst are swapped for historical reasons */ sk = __udp4_lib_lookup(net, req->id.idiag_src[0], req->id.idiag_sport, req->id.idiag_dst[0], req->id.idiag_dport, diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 45b84dd5c4eb..63a808d5af15 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -5058,6 +5058,7 @@ static int inet6_valid_dump_ifaddr_req(const struct nlmsghdr *nlh, fillargs->netnsid = nla_get_s32(tb[i]); net = rtnl_get_net_ns_capable(sk, fillargs->netnsid); if (IS_ERR(net)) { + fillargs->netnsid = -1; NL_SET_ERR_MSG_MOD(extack, "Invalid target network namespace id"); return PTR_ERR(net); } @@ -5089,23 +5090,25 @@ static int inet6_dump_addr(struct sk_buff *skb, struct netlink_callback *cb, struct net_device *dev; struct inet6_dev *idev; struct hlist_head *head; + int err = 0; s_h = cb->args[0]; s_idx = idx = cb->args[1]; s_ip_idx = cb->args[2]; if (cb->strict_check) { - int err; - err = inet6_valid_dump_ifaddr_req(nlh, &fillargs, &tgt_net, skb->sk, cb); if (err < 0) - return err; + goto put_tgt_net; + err = 0; if (fillargs.ifindex) { dev = __dev_get_by_index(tgt_net, fillargs.ifindex); - if (!dev) - return -ENODEV; + if (!dev) { + err = -ENODEV; + goto put_tgt_net; + } idev = __in6_dev_get(dev); if (idev) { err = in6_dump_addrs(idev, skb, cb, s_ip_idx, @@ -5144,7 +5147,7 @@ put_tgt_net: if (fillargs.netnsid >= 0) put_net(tgt_net); - return skb->len; + return err < 0 ? err : skb->len; } static int inet6_dump_ifaddr(struct sk_buff *skb, struct netlink_callback *cb) diff --git a/net/ipv6/ip6_checksum.c b/net/ipv6/ip6_checksum.c index 547515e8450a..377717045f8f 100644 --- a/net/ipv6/ip6_checksum.c +++ b/net/ipv6/ip6_checksum.c @@ -88,8 +88,24 @@ int udp6_csum_init(struct sk_buff *skb, struct udphdr *uh, int proto) * Note, we are only interested in != 0 or == 0, thus the * force to int. */ - return (__force int)skb_checksum_init_zero_check(skb, proto, uh->check, - ip6_compute_pseudo); + err = (__force int)skb_checksum_init_zero_check(skb, proto, uh->check, + ip6_compute_pseudo); + if (err) + return err; + + if (skb->ip_summed == CHECKSUM_COMPLETE && !skb->csum_valid) { + /* If SW calculated the value, we know it's bad */ + if (skb->csum_complete_sw) + return 1; + + /* HW says the value is bad. Let's validate that. + * skb->csum is no longer the full packet checksum, + * so don't treat is as such. + */ + skb_checksum_complete_unset(skb); + } + + return 0; } EXPORT_SYMBOL(udp6_csum_init); diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index 2a058b408a6a..1b8bc008b53b 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -620,6 +620,9 @@ static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) if (arg.filter.table_id) { tb = fib6_get_table(net, arg.filter.table_id); if (!tb) { + if (arg.filter.dump_all_families) + return skb->len; + NL_SET_ERR_MSG_MOD(cb->extack, "FIB table does not exist"); return -ENOENT; } diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index c3317ffb09eb..e2ea691e42c6 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -2473,6 +2473,9 @@ static int ip6mr_rtm_dumproute(struct sk_buff *skb, struct netlink_callback *cb) mrt = ip6mr_get_table(sock_net(skb->sk), filter.table_id); if (!mrt) { + if (filter.dump_all_families) + return skb->len; + NL_SET_ERR_MSG_MOD(cb->extack, "MR table does not exist"); return -ENOENT; } diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c index a25cfdd47c89..659ecf4e4b3c 100644 --- a/net/ipv6/ndisc.c +++ b/net/ipv6/ndisc.c @@ -1732,10 +1732,9 @@ int ndisc_rcv(struct sk_buff *skb) return 0; } - memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb)); - switch (msg->icmph.icmp6_type) { case NDISC_NEIGHBOUR_SOLICITATION: + memset(NEIGH_CB(skb), 0, sizeof(struct neighbour_cb)); ndisc_recv_ns(skb); break; diff --git a/net/ipv6/route.c b/net/ipv6/route.c index e3226284e480..2a7423c39456 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -2745,6 +2745,8 @@ static int ip6_route_check_nh_onlink(struct net *net, grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0); if (grt) { if (!grt->dst.error && + /* ignore match if it is the default route */ + grt->from && !ipv6_addr_any(&grt->from->fib6_dst.addr) && (grt->rt6i_flags & flags || dev != grt->dst.dev)) { NL_SET_ERR_MSG(extack, "Nexthop has invalid gateway or device mismatch"); diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 06d17ff3562f..d2d97d07ef27 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -478,7 +478,7 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, struct net *net = dev_net(skb->dev); sk = __udp6_lib_lookup(net, daddr, uh->dest, saddr, uh->source, - inet6_iif(skb), 0, udptable, skb); + inet6_iif(skb), inet6_sdif(skb), udptable, skb); if (!sk) { __ICMP6_INC_STATS(net, __in6_dev_get(skb->dev), ICMP6_MIB_INERRORS); diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 022bca98bde6..ca3b0f46de53 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -1320,7 +1320,6 @@ check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w) const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = { [TCA_KIND] = { .type = NLA_STRING }, - [TCA_OPTIONS] = { .type = NLA_NESTED }, [TCA_RATE] = { .type = NLA_BINARY, .len = sizeof(struct tc_estimator) }, [TCA_STAB] = { .type = NLA_NESTED }, diff --git a/net/sched/sch_gred.c b/net/sched/sch_gred.c index cbe4831f46f4..4a042abf844c 100644 --- a/net/sched/sch_gred.c +++ b/net/sched/sch_gred.c @@ -413,7 +413,7 @@ static int gred_change(struct Qdisc *sch, struct nlattr *opt, if (tb[TCA_GRED_PARMS] == NULL && tb[TCA_GRED_STAB] == NULL) { if (tb[TCA_GRED_LIMIT] != NULL) sch->limit = nla_get_u32(tb[TCA_GRED_LIMIT]); - return gred_change_table_def(sch, opt); + return gred_change_table_def(sch, tb[TCA_GRED_DPS]); } if (tb[TCA_GRED_PARMS] == NULL || diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index e871368500e3..18daebcef181 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -122,22 +122,17 @@ static void __smc_lgr_unregister_conn(struct smc_connection *conn) sock_put(&smc->sk); /* sock_hold in smc_lgr_register_conn() */ } -/* Unregister connection and trigger lgr freeing if applicable +/* Unregister connection from lgr */ static void smc_lgr_unregister_conn(struct smc_connection *conn) { struct smc_link_group *lgr = conn->lgr; - int reduced = 0; write_lock_bh(&lgr->conns_lock); if (conn->alert_token_local) { - reduced = 1; __smc_lgr_unregister_conn(conn); } write_unlock_bh(&lgr->conns_lock); - if (!reduced || lgr->conns_num) - return; - smc_lgr_schedule_free_work(lgr); } /* Send delete link, either as client to request the initiation @@ -291,7 +286,8 @@ out: return rc; } -static void smc_buf_unuse(struct smc_connection *conn) +static void smc_buf_unuse(struct smc_connection *conn, + struct smc_link_group *lgr) { if (conn->sndbuf_desc) conn->sndbuf_desc->used = 0; @@ -301,8 +297,6 @@ static void smc_buf_unuse(struct smc_connection *conn) conn->rmb_desc->used = 0; } else { /* buf registration failed, reuse not possible */ - struct smc_link_group *lgr = conn->lgr; - write_lock_bh(&lgr->rmbs_lock); list_del(&conn->rmb_desc->list); write_unlock_bh(&lgr->rmbs_lock); @@ -315,16 +309,21 @@ static void smc_buf_unuse(struct smc_connection *conn) /* remove a finished connection from its link group */ void smc_conn_free(struct smc_connection *conn) { - if (!conn->lgr) + struct smc_link_group *lgr = conn->lgr; + + if (!lgr) return; - if (conn->lgr->is_smcd) { + if (lgr->is_smcd) { smc_ism_unset_conn(conn); tasklet_kill(&conn->rx_tsklet); } else { smc_cdc_tx_dismiss_slots(conn); } - smc_lgr_unregister_conn(conn); - smc_buf_unuse(conn); + smc_lgr_unregister_conn(conn); /* unsets conn->lgr */ + smc_buf_unuse(conn, lgr); /* allow buffer reuse */ + + if (!lgr->conns_num) + smc_lgr_schedule_free_work(lgr); } static void smc_link_clear(struct smc_link *lnk) diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c index 305ecea92170..ad8ead738981 100644 --- a/net/sunrpc/auth.c +++ b/net/sunrpc/auth.c @@ -30,10 +30,9 @@ struct rpc_cred_cache { static unsigned int auth_hashbits = RPC_CREDCACHE_DEFAULT_HASHBITS; -static DEFINE_SPINLOCK(rpc_authflavor_lock); -static const struct rpc_authops *auth_flavors[RPC_AUTH_MAXFLAVOR] = { - &authnull_ops, /* AUTH_NULL */ - &authunix_ops, /* AUTH_UNIX */ +static const struct rpc_authops __rcu *auth_flavors[RPC_AUTH_MAXFLAVOR] = { + [RPC_AUTH_NULL] = (const struct rpc_authops __force __rcu *)&authnull_ops, + [RPC_AUTH_UNIX] = (const struct rpc_authops __force __rcu *)&authunix_ops, NULL, /* others can be loadable modules */ }; @@ -93,39 +92,65 @@ pseudoflavor_to_flavor(u32 flavor) { int rpcauth_register(const struct rpc_authops *ops) { + const struct rpc_authops *old; rpc_authflavor_t flavor; - int ret = -EPERM; if ((flavor = ops->au_flavor) >= RPC_AUTH_MAXFLAVOR) return -EINVAL; - spin_lock(&rpc_authflavor_lock); - if (auth_flavors[flavor] == NULL) { - auth_flavors[flavor] = ops; - ret = 0; - } - spin_unlock(&rpc_authflavor_lock); - return ret; + old = cmpxchg((const struct rpc_authops ** __force)&auth_flavors[flavor], NULL, ops); + if (old == NULL || old == ops) + return 0; + return -EPERM; } EXPORT_SYMBOL_GPL(rpcauth_register); int rpcauth_unregister(const struct rpc_authops *ops) { + const struct rpc_authops *old; rpc_authflavor_t flavor; - int ret = -EPERM; if ((flavor = ops->au_flavor) >= RPC_AUTH_MAXFLAVOR) return -EINVAL; - spin_lock(&rpc_authflavor_lock); - if (auth_flavors[flavor] == ops) { - auth_flavors[flavor] = NULL; - ret = 0; - } - spin_unlock(&rpc_authflavor_lock); - return ret; + + old = cmpxchg((const struct rpc_authops ** __force)&auth_flavors[flavor], ops, NULL); + if (old == ops || old == NULL) + return 0; + return -EPERM; } EXPORT_SYMBOL_GPL(rpcauth_unregister); +static const struct rpc_authops * +rpcauth_get_authops(rpc_authflavor_t flavor) +{ + const struct rpc_authops *ops; + + if (flavor >= RPC_AUTH_MAXFLAVOR) + return NULL; + + rcu_read_lock(); + ops = rcu_dereference(auth_flavors[flavor]); + if (ops == NULL) { + rcu_read_unlock(); + request_module("rpc-auth-%u", flavor); + rcu_read_lock(); + ops = rcu_dereference(auth_flavors[flavor]); + if (ops == NULL) + goto out; + } + if (!try_module_get(ops->owner)) + ops = NULL; +out: + rcu_read_unlock(); + return ops; +} + +static void +rpcauth_put_authops(const struct rpc_authops *ops) +{ + module_put(ops->owner); +} + /** * rpcauth_get_pseudoflavor - check if security flavor is supported * @flavor: a security flavor @@ -138,25 +163,16 @@ EXPORT_SYMBOL_GPL(rpcauth_unregister); rpc_authflavor_t rpcauth_get_pseudoflavor(rpc_authflavor_t flavor, struct rpcsec_gss_info *info) { - const struct rpc_authops *ops; + const struct rpc_authops *ops = rpcauth_get_authops(flavor); rpc_authflavor_t pseudoflavor; - ops = auth_flavors[flavor]; - if (ops == NULL) - request_module("rpc-auth-%u", flavor); - spin_lock(&rpc_authflavor_lock); - ops = auth_flavors[flavor]; - if (ops == NULL || !try_module_get(ops->owner)) { - spin_unlock(&rpc_authflavor_lock); + if (!ops) return RPC_AUTH_MAXFLAVOR; - } - spin_unlock(&rpc_authflavor_lock); - pseudoflavor = flavor; if (ops->info2flavor != NULL) pseudoflavor = ops->info2flavor(info); - module_put(ops->owner); + rpcauth_put_authops(ops); return pseudoflavor; } EXPORT_SYMBOL_GPL(rpcauth_get_pseudoflavor); @@ -176,25 +192,15 @@ rpcauth_get_gssinfo(rpc_authflavor_t pseudoflavor, struct rpcsec_gss_info *info) const struct rpc_authops *ops; int result; - if (flavor >= RPC_AUTH_MAXFLAVOR) - return -EINVAL; - - ops = auth_flavors[flavor]; + ops = rpcauth_get_authops(flavor); if (ops == NULL) - request_module("rpc-auth-%u", flavor); - spin_lock(&rpc_authflavor_lock); - ops = auth_flavors[flavor]; - if (ops == NULL || !try_module_get(ops->owner)) { - spin_unlock(&rpc_authflavor_lock); return -ENOENT; - } - spin_unlock(&rpc_authflavor_lock); result = -ENOENT; if (ops->flavor2info != NULL) result = ops->flavor2info(pseudoflavor, info); - module_put(ops->owner); + rpcauth_put_authops(ops); return result; } EXPORT_SYMBOL_GPL(rpcauth_get_gssinfo); @@ -212,15 +218,13 @@ EXPORT_SYMBOL_GPL(rpcauth_get_gssinfo); int rpcauth_list_flavors(rpc_authflavor_t *array, int size) { - rpc_authflavor_t flavor; - int result = 0; + const struct rpc_authops *ops; + rpc_authflavor_t flavor, pseudos[4]; + int i, len, result = 0; - spin_lock(&rpc_authflavor_lock); + rcu_read_lock(); for (flavor = 0; flavor < RPC_AUTH_MAXFLAVOR; flavor++) { - const struct rpc_authops *ops = auth_flavors[flavor]; - rpc_authflavor_t pseudos[4]; - int i, len; - + ops = rcu_dereference(auth_flavors[flavor]); if (result >= size) { result = -ENOMEM; break; @@ -245,7 +249,7 @@ rpcauth_list_flavors(rpc_authflavor_t *array, int size) array[result++] = pseudos[i]; } } - spin_unlock(&rpc_authflavor_lock); + rcu_read_unlock(); dprintk("RPC: %s returns %d\n", __func__, result); return result; @@ -255,25 +259,17 @@ EXPORT_SYMBOL_GPL(rpcauth_list_flavors); struct rpc_auth * rpcauth_create(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt) { - struct rpc_auth *auth; + struct rpc_auth *auth = ERR_PTR(-EINVAL); const struct rpc_authops *ops; - u32 flavor = pseudoflavor_to_flavor(args->pseudoflavor); + u32 flavor = pseudoflavor_to_flavor(args->pseudoflavor); - auth = ERR_PTR(-EINVAL); - if (flavor >= RPC_AUTH_MAXFLAVOR) + ops = rpcauth_get_authops(flavor); + if (ops == NULL) goto out; - if ((ops = auth_flavors[flavor]) == NULL) - request_module("rpc-auth-%u", flavor); - spin_lock(&rpc_authflavor_lock); - ops = auth_flavors[flavor]; - if (ops == NULL || !try_module_get(ops->owner)) { - spin_unlock(&rpc_authflavor_lock); - goto out; - } - spin_unlock(&rpc_authflavor_lock); auth = ops->create(args, clnt); - module_put(ops->owner); + + rpcauth_put_authops(ops); if (IS_ERR(auth)) return auth; if (clnt->cl_auth) @@ -288,32 +284,37 @@ EXPORT_SYMBOL_GPL(rpcauth_create); void rpcauth_release(struct rpc_auth *auth) { - if (!atomic_dec_and_test(&auth->au_count)) + if (!refcount_dec_and_test(&auth->au_count)) return; auth->au_ops->destroy(auth); } static DEFINE_SPINLOCK(rpc_credcache_lock); -static void +/* + * On success, the caller is responsible for freeing the reference + * held by the hashtable + */ +static bool rpcauth_unhash_cred_locked(struct rpc_cred *cred) { + if (!test_and_clear_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags)) + return false; hlist_del_rcu(&cred->cr_hash); - smp_mb__before_atomic(); - clear_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags); + return true; } -static int +static bool rpcauth_unhash_cred(struct rpc_cred *cred) { spinlock_t *cache_lock; - int ret; + bool ret; + if (!test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags)) + return false; cache_lock = &cred->cr_auth->au_credcache->lock; spin_lock(cache_lock); - ret = atomic_read(&cred->cr_count) == 0; - if (ret) - rpcauth_unhash_cred_locked(cred); + ret = rpcauth_unhash_cred_locked(cred); spin_unlock(cache_lock); return ret; } @@ -392,6 +393,44 @@ void rpcauth_destroy_credlist(struct list_head *head) } } +static void +rpcauth_lru_add_locked(struct rpc_cred *cred) +{ + if (!list_empty(&cred->cr_lru)) + return; + number_cred_unused++; + list_add_tail(&cred->cr_lru, &cred_unused); +} + +static void +rpcauth_lru_add(struct rpc_cred *cred) +{ + if (!list_empty(&cred->cr_lru)) + return; + spin_lock(&rpc_credcache_lock); + rpcauth_lru_add_locked(cred); + spin_unlock(&rpc_credcache_lock); +} + +static void +rpcauth_lru_remove_locked(struct rpc_cred *cred) +{ + if (list_empty(&cred->cr_lru)) + return; + number_cred_unused--; + list_del_init(&cred->cr_lru); +} + +static void +rpcauth_lru_remove(struct rpc_cred *cred) +{ + if (list_empty(&cred->cr_lru)) + return; + spin_lock(&rpc_credcache_lock); + rpcauth_lru_remove_locked(cred); + spin_unlock(&rpc_credcache_lock); +} + /* * Clear the RPC credential cache, and delete those credentials * that are not referenced. @@ -411,13 +450,10 @@ rpcauth_clear_credcache(struct rpc_cred_cache *cache) head = &cache->hashtable[i]; while (!hlist_empty(head)) { cred = hlist_entry(head->first, struct rpc_cred, cr_hash); - get_rpccred(cred); - if (!list_empty(&cred->cr_lru)) { - list_del(&cred->cr_lru); - number_cred_unused--; - } - list_add_tail(&cred->cr_lru, &free); rpcauth_unhash_cred_locked(cred); + /* Note: We now hold a reference to cred */ + rpcauth_lru_remove_locked(cred); + list_add_tail(&cred->cr_lru, &free); } } spin_unlock(&cache->lock); @@ -451,7 +487,6 @@ EXPORT_SYMBOL_GPL(rpcauth_destroy_credcache); static long rpcauth_prune_expired(struct list_head *free, int nr_to_scan) { - spinlock_t *cache_lock; struct rpc_cred *cred, *next; unsigned long expired = jiffies - RPC_AUTH_EXPIRY_MORATORIUM; long freed = 0; @@ -460,32 +495,24 @@ rpcauth_prune_expired(struct list_head *free, int nr_to_scan) if (nr_to_scan-- == 0) break; + if (refcount_read(&cred->cr_count) > 1) { + rpcauth_lru_remove_locked(cred); + continue; + } /* * Enforce a 60 second garbage collection moratorium * Note that the cred_unused list must be time-ordered. */ - if (time_in_range(cred->cr_expire, expired, jiffies) && - test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags) != 0) { - freed = SHRINK_STOP; - break; - } - - list_del_init(&cred->cr_lru); - number_cred_unused--; - freed++; - if (atomic_read(&cred->cr_count) != 0) + if (!time_in_range(cred->cr_expire, expired, jiffies)) + continue; + if (!rpcauth_unhash_cred(cred)) continue; - cache_lock = &cred->cr_auth->au_credcache->lock; - spin_lock(cache_lock); - if (atomic_read(&cred->cr_count) == 0) { - get_rpccred(cred); - list_add_tail(&cred->cr_lru, free); - rpcauth_unhash_cred_locked(cred); - } - spin_unlock(cache_lock); + rpcauth_lru_remove_locked(cred); + freed++; + list_add_tail(&cred->cr_lru, free); } - return freed; + return freed ? freed : SHRINK_STOP; } static unsigned long @@ -561,19 +588,15 @@ rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred, if (!entry->cr_ops->crmatch(acred, entry, flags)) continue; if (flags & RPCAUTH_LOOKUP_RCU) { - if (test_bit(RPCAUTH_CRED_HASHED, &entry->cr_flags) && - !test_bit(RPCAUTH_CRED_NEW, &entry->cr_flags)) - cred = entry; + if (test_bit(RPCAUTH_CRED_NEW, &entry->cr_flags) || + refcount_read(&entry->cr_count) == 0) + continue; + cred = entry; break; } - spin_lock(&cache->lock); - if (test_bit(RPCAUTH_CRED_HASHED, &entry->cr_flags) == 0) { - spin_unlock(&cache->lock); - continue; - } cred = get_rpccred(entry); - spin_unlock(&cache->lock); - break; + if (cred) + break; } rcu_read_unlock(); @@ -594,11 +617,13 @@ rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred, if (!entry->cr_ops->crmatch(acred, entry, flags)) continue; cred = get_rpccred(entry); - break; + if (cred) + break; } if (cred == NULL) { cred = new; set_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags); + refcount_inc(&cred->cr_count); hlist_add_head_rcu(&cred->cr_hash, &cache->hashtable[nr]); } else list_add_tail(&new->cr_lru, &free); @@ -645,7 +670,7 @@ rpcauth_init_cred(struct rpc_cred *cred, const struct auth_cred *acred, { INIT_HLIST_NODE(&cred->cr_hash); INIT_LIST_HEAD(&cred->cr_lru); - atomic_set(&cred->cr_count, 1); + refcount_set(&cred->cr_count, 1); cred->cr_auth = auth; cred->cr_ops = ops; cred->cr_expire = jiffies; @@ -713,36 +738,29 @@ put_rpccred(struct rpc_cred *cred) { if (cred == NULL) return; - /* Fast path for unhashed credentials */ - if (test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags) == 0) { - if (atomic_dec_and_test(&cred->cr_count)) - cred->cr_ops->crdestroy(cred); - return; - } - - if (!atomic_dec_and_lock(&cred->cr_count, &rpc_credcache_lock)) - return; - if (!list_empty(&cred->cr_lru)) { - number_cred_unused--; - list_del_init(&cred->cr_lru); - } - if (test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags) != 0) { - if (test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) != 0) { - cred->cr_expire = jiffies; - list_add_tail(&cred->cr_lru, &cred_unused); - number_cred_unused++; - goto out_nodestroy; - } - if (!rpcauth_unhash_cred(cred)) { - /* We were hashed and someone looked us up... */ - goto out_nodestroy; - } + rcu_read_lock(); + if (refcount_dec_and_test(&cred->cr_count)) + goto destroy; + if (refcount_read(&cred->cr_count) != 1 || + !test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags)) + goto out; + if (test_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags) != 0) { + cred->cr_expire = jiffies; + rpcauth_lru_add(cred); + /* Race breaker */ + if (unlikely(!test_bit(RPCAUTH_CRED_HASHED, &cred->cr_flags))) + rpcauth_lru_remove(cred); + } else if (rpcauth_unhash_cred(cred)) { + rpcauth_lru_remove(cred); + if (refcount_dec_and_test(&cred->cr_count)) + goto destroy; } - spin_unlock(&rpc_credcache_lock); - cred->cr_ops->crdestroy(cred); +out: + rcu_read_unlock(); return; -out_nodestroy: - spin_unlock(&rpc_credcache_lock); +destroy: + rcu_read_unlock(); + cred->cr_ops->crdestroy(cred); } EXPORT_SYMBOL_GPL(put_rpccred); @@ -817,6 +835,16 @@ rpcauth_unwrap_resp(struct rpc_task *task, kxdrdproc_t decode, void *rqstp, return rpcauth_unwrap_req_decode(decode, rqstp, data, obj); } +bool +rpcauth_xmit_need_reencode(struct rpc_task *task) +{ + struct rpc_cred *cred = task->tk_rqstp->rq_cred; + + if (!cred || !cred->cr_ops->crneed_reencode) + return false; + return cred->cr_ops->crneed_reencode(task); +} + int rpcauth_refreshcred(struct rpc_task *task) { diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c index f1df9837f1ac..d8831b988b1e 100644 --- a/net/sunrpc/auth_generic.c +++ b/net/sunrpc/auth_generic.c @@ -274,7 +274,7 @@ static const struct rpc_authops generic_auth_ops = { static struct rpc_auth generic_auth = { .au_ops = &generic_auth_ops, - .au_count = ATOMIC_INIT(0), + .au_count = REFCOUNT_INIT(1), }; static bool generic_key_to_expire(struct rpc_cred *cred) diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c index 21c0aa0a0d1d..30f970cdc7f6 100644 --- a/net/sunrpc/auth_gss/auth_gss.c +++ b/net/sunrpc/auth_gss/auth_gss.c @@ -1058,7 +1058,7 @@ gss_create_new(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt) auth->au_flavor = flavor; if (gss_pseudoflavor_to_datatouch(gss_auth->mech, flavor)) auth->au_flags |= RPCAUTH_AUTH_DATATOUCH; - atomic_set(&auth->au_count, 1); + refcount_set(&auth->au_count, 1); kref_init(&gss_auth->kref); err = rpcauth_init_credcache(auth); @@ -1187,7 +1187,7 @@ gss_auth_find_or_add_hashed(const struct rpc_auth_create_args *args, if (strcmp(gss_auth->target_name, args->target_name)) continue; } - if (!atomic_inc_not_zero(&gss_auth->rpc_auth.au_count)) + if (!refcount_inc_not_zero(&gss_auth->rpc_auth.au_count)) continue; goto out; } @@ -1984,6 +1984,46 @@ gss_unwrap_req_decode(kxdrdproc_t decode, struct rpc_rqst *rqstp, return decode(rqstp, &xdr, obj); } +static bool +gss_seq_is_newer(u32 new, u32 old) +{ + return (s32)(new - old) > 0; +} + +static bool +gss_xmit_need_reencode(struct rpc_task *task) +{ + struct rpc_rqst *req = task->tk_rqstp; + struct rpc_cred *cred = req->rq_cred; + struct gss_cl_ctx *ctx = gss_cred_get_ctx(cred); + u32 win, seq_xmit; + bool ret = true; + + if (!ctx) + return true; + + if (gss_seq_is_newer(req->rq_seqno, READ_ONCE(ctx->gc_seq))) + goto out; + + seq_xmit = READ_ONCE(ctx->gc_seq_xmit); + while (gss_seq_is_newer(req->rq_seqno, seq_xmit)) { + u32 tmp = seq_xmit; + + seq_xmit = cmpxchg(&ctx->gc_seq_xmit, tmp, req->rq_seqno); + if (seq_xmit == tmp) { + ret = false; + goto out; + } + } + + win = ctx->gc_win; + if (win > 0) + ret = !gss_seq_is_newer(req->rq_seqno, seq_xmit - win); +out: + gss_put_ctx(ctx); + return ret; +} + static int gss_unwrap_resp(struct rpc_task *task, kxdrdproc_t decode, void *rqstp, __be32 *p, void *obj) @@ -2052,6 +2092,7 @@ static const struct rpc_credops gss_credops = { .crunwrap_resp = gss_unwrap_resp, .crkey_timeout = gss_key_timeout, .crstringify_acceptor = gss_stringify_acceptor, + .crneed_reencode = gss_xmit_need_reencode, }; static const struct rpc_credops gss_nullops = { diff --git a/net/sunrpc/auth_gss/gss_krb5_seal.c b/net/sunrpc/auth_gss/gss_krb5_seal.c index eaad9bc7a0bd..b4adeb06660b 100644 --- a/net/sunrpc/auth_gss/gss_krb5_seal.c +++ b/net/sunrpc/auth_gss/gss_krb5_seal.c @@ -63,13 +63,12 @@ #include <linux/sunrpc/gss_krb5.h> #include <linux/random.h> #include <linux/crypto.h> +#include <linux/atomic.h> #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) # define RPCDBG_FACILITY RPCDBG_AUTH #endif -DEFINE_SPINLOCK(krb5_seq_lock); - static void * setup_token(struct krb5_ctx *ctx, struct xdr_netobj *token) { @@ -124,6 +123,30 @@ setup_token_v2(struct krb5_ctx *ctx, struct xdr_netobj *token) return krb5_hdr; } +u32 +gss_seq_send_fetch_and_inc(struct krb5_ctx *ctx) +{ + u32 old, seq_send = READ_ONCE(ctx->seq_send); + + do { + old = seq_send; + seq_send = cmpxchg(&ctx->seq_send, old, old + 1); + } while (old != seq_send); + return seq_send; +} + +u64 +gss_seq_send64_fetch_and_inc(struct krb5_ctx *ctx) +{ + u64 old, seq_send = READ_ONCE(ctx->seq_send); + + do { + old = seq_send; + seq_send = cmpxchg64(&ctx->seq_send64, old, old + 1); + } while (old != seq_send); + return seq_send; +} + static u32 gss_get_mic_v1(struct krb5_ctx *ctx, struct xdr_buf *text, struct xdr_netobj *token) @@ -154,9 +177,7 @@ gss_get_mic_v1(struct krb5_ctx *ctx, struct xdr_buf *text, memcpy(ptr + GSS_KRB5_TOK_HDR_LEN, md5cksum.data, md5cksum.len); - spin_lock(&krb5_seq_lock); - seq_send = ctx->seq_send++; - spin_unlock(&krb5_seq_lock); + seq_send = gss_seq_send_fetch_and_inc(ctx); if (krb5_make_seq_num(ctx, ctx->seq, ctx->initiate ? 0 : 0xff, seq_send, ptr + GSS_KRB5_TOK_HDR_LEN, ptr + 8)) @@ -174,7 +195,6 @@ gss_get_mic_v2(struct krb5_ctx *ctx, struct xdr_buf *text, .data = cksumdata}; void *krb5_hdr; s32 now; - u64 seq_send; u8 *cksumkey; unsigned int cksum_usage; __be64 seq_send_be64; @@ -185,11 +205,7 @@ gss_get_mic_v2(struct krb5_ctx *ctx, struct xdr_buf *text, /* Set up the sequence number. Now 64-bits in clear * text and w/o direction indicator */ - spin_lock(&krb5_seq_lock); - seq_send = ctx->seq_send64++; - spin_unlock(&krb5_seq_lock); - - seq_send_be64 = cpu_to_be64(seq_send); + seq_send_be64 = cpu_to_be64(gss_seq_send64_fetch_and_inc(ctx)); memcpy(krb5_hdr + 8, (char *) &seq_send_be64, 8); if (ctx->initiate) { diff --git a/net/sunrpc/auth_gss/gss_krb5_wrap.c b/net/sunrpc/auth_gss/gss_krb5_wrap.c index 3d975a4013d2..962fa84e6db1 100644 --- a/net/sunrpc/auth_gss/gss_krb5_wrap.c +++ b/net/sunrpc/auth_gss/gss_krb5_wrap.c @@ -228,9 +228,7 @@ gss_wrap_kerberos_v1(struct krb5_ctx *kctx, int offset, memcpy(ptr + GSS_KRB5_TOK_HDR_LEN, md5cksum.data, md5cksum.len); - spin_lock(&krb5_seq_lock); - seq_send = kctx->seq_send++; - spin_unlock(&krb5_seq_lock); + seq_send = gss_seq_send_fetch_and_inc(kctx); /* XXX would probably be more efficient to compute checksum * and encrypt at the same time: */ @@ -477,9 +475,7 @@ gss_wrap_kerberos_v2(struct krb5_ctx *kctx, u32 offset, *be16ptr++ = 0; be64ptr = (__be64 *)be16ptr; - spin_lock(&krb5_seq_lock); - *be64ptr = cpu_to_be64(kctx->seq_send64++); - spin_unlock(&krb5_seq_lock); + *be64ptr = cpu_to_be64(gss_seq_send64_fetch_and_inc(kctx)); err = (*kctx->gk5e->encrypt_v2)(kctx, offset, buf, pages); if (err) diff --git a/net/sunrpc/auth_gss/gss_mech_switch.c b/net/sunrpc/auth_gss/gss_mech_switch.c index 5fec3abbe19b..16ac0f4cb7d8 100644 --- a/net/sunrpc/auth_gss/gss_mech_switch.c +++ b/net/sunrpc/auth_gss/gss_mech_switch.c @@ -117,7 +117,7 @@ int gss_mech_register(struct gss_api_mech *gm) if (status) return status; spin_lock(®istered_mechs_lock); - list_add(&gm->gm_list, ®istered_mechs); + list_add_rcu(&gm->gm_list, ®istered_mechs); spin_unlock(®istered_mechs_lock); dprintk("RPC: registered gss mechanism %s\n", gm->gm_name); return 0; @@ -132,7 +132,7 @@ EXPORT_SYMBOL_GPL(gss_mech_register); void gss_mech_unregister(struct gss_api_mech *gm) { spin_lock(®istered_mechs_lock); - list_del(&gm->gm_list); + list_del_rcu(&gm->gm_list); spin_unlock(®istered_mechs_lock); dprintk("RPC: unregistered gss mechanism %s\n", gm->gm_name); gss_mech_free(gm); @@ -151,15 +151,15 @@ _gss_mech_get_by_name(const char *name) { struct gss_api_mech *pos, *gm = NULL; - spin_lock(®istered_mechs_lock); - list_for_each_entry(pos, ®istered_mechs, gm_list) { + rcu_read_lock(); + list_for_each_entry_rcu(pos, ®istered_mechs, gm_list) { if (0 == strcmp(name, pos->gm_name)) { if (try_module_get(pos->gm_owner)) gm = pos; break; } } - spin_unlock(®istered_mechs_lock); + rcu_read_unlock(); return gm; } @@ -186,8 +186,8 @@ struct gss_api_mech *gss_mech_get_by_OID(struct rpcsec_gss_oid *obj) dprintk("RPC: %s(%s)\n", __func__, buf); request_module("rpc-auth-gss-%s", buf); - spin_lock(®istered_mechs_lock); - list_for_each_entry(pos, ®istered_mechs, gm_list) { + rcu_read_lock(); + list_for_each_entry_rcu(pos, ®istered_mechs, gm_list) { if (obj->len == pos->gm_oid.len) { if (0 == memcmp(obj->data, pos->gm_oid.data, obj->len)) { if (try_module_get(pos->gm_owner)) @@ -196,7 +196,7 @@ struct gss_api_mech *gss_mech_get_by_OID(struct rpcsec_gss_oid *obj) } } } - spin_unlock(®istered_mechs_lock); + rcu_read_unlock(); return gm; } @@ -216,15 +216,15 @@ static struct gss_api_mech *_gss_mech_get_by_pseudoflavor(u32 pseudoflavor) { struct gss_api_mech *gm = NULL, *pos; - spin_lock(®istered_mechs_lock); - list_for_each_entry(pos, ®istered_mechs, gm_list) { + rcu_read_lock(); + list_for_each_entry_rcu(pos, ®istered_mechs, gm_list) { if (!mech_supports_pseudoflavor(pos, pseudoflavor)) continue; if (try_module_get(pos->gm_owner)) gm = pos; break; } - spin_unlock(®istered_mechs_lock); + rcu_read_unlock(); return gm; } @@ -257,8 +257,8 @@ int gss_mech_list_pseudoflavors(rpc_authflavor_t *array_ptr, int size) struct gss_api_mech *pos = NULL; int j, i = 0; - spin_lock(®istered_mechs_lock); - list_for_each_entry(pos, ®istered_mechs, gm_list) { + rcu_read_lock(); + list_for_each_entry_rcu(pos, ®istered_mechs, gm_list) { for (j = 0; j < pos->gm_pf_num; j++) { if (i >= size) { spin_unlock(®istered_mechs_lock); @@ -267,7 +267,7 @@ int gss_mech_list_pseudoflavors(rpc_authflavor_t *array_ptr, int size) array_ptr[i++] = pos->gm_pfs[j].pseudoflavor; } } - spin_unlock(®istered_mechs_lock); + rcu_read_unlock(); return i; } diff --git a/net/sunrpc/auth_gss/gss_rpc_xdr.c b/net/sunrpc/auth_gss/gss_rpc_xdr.c index 444380f968f1..006062ad5f58 100644 --- a/net/sunrpc/auth_gss/gss_rpc_xdr.c +++ b/net/sunrpc/auth_gss/gss_rpc_xdr.c @@ -784,6 +784,7 @@ void gssx_enc_accept_sec_context(struct rpc_rqst *req, xdr_inline_pages(&req->rq_rcv_buf, PAGE_SIZE/2 /* pretty arbitrary */, arg->pages, 0 /* page base */, arg->npages * PAGE_SIZE); + req->rq_rcv_buf.flags |= XDRBUF_SPARSE_PAGES; done: if (err) dprintk("RPC: gssx_enc_accept_sec_context: %d\n", err); diff --git a/net/sunrpc/auth_null.c b/net/sunrpc/auth_null.c index 4b48228ee8c7..2694a1bc026b 100644 --- a/net/sunrpc/auth_null.c +++ b/net/sunrpc/auth_null.c @@ -21,7 +21,7 @@ static struct rpc_cred null_cred; static struct rpc_auth * nul_create(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt) { - atomic_inc(&null_auth.au_count); + refcount_inc(&null_auth.au_count); return &null_auth; } @@ -119,7 +119,7 @@ struct rpc_auth null_auth = { .au_flags = RPCAUTH_AUTH_NO_CRKEY_TIMEOUT, .au_ops = &authnull_ops, .au_flavor = RPC_AUTH_NULL, - .au_count = ATOMIC_INIT(0), + .au_count = REFCOUNT_INIT(1), }; static @@ -138,6 +138,6 @@ struct rpc_cred null_cred = { .cr_lru = LIST_HEAD_INIT(null_cred.cr_lru), .cr_auth = &null_auth, .cr_ops = &null_credops, - .cr_count = ATOMIC_INIT(1), + .cr_count = REFCOUNT_INIT(2), .cr_flags = 1UL << RPCAUTH_CRED_UPTODATE, }; diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c index 185e56d4f9ae..4c1c7e56288f 100644 --- a/net/sunrpc/auth_unix.c +++ b/net/sunrpc/auth_unix.c @@ -34,7 +34,7 @@ unx_create(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt) { dprintk("RPC: creating UNIX authenticator for client %p\n", clnt); - atomic_inc(&unix_auth.au_count); + refcount_inc(&unix_auth.au_count); return &unix_auth; } @@ -239,7 +239,7 @@ struct rpc_auth unix_auth = { .au_flags = RPCAUTH_AUTH_NO_CRKEY_TIMEOUT, .au_ops = &authunix_ops, .au_flavor = RPC_AUTH_UNIX, - .au_count = ATOMIC_INIT(0), + .au_count = REFCOUNT_INIT(1), }; static diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c index 3c15a99b9700..fa5ba6ed3197 100644 --- a/net/sunrpc/backchannel_rqst.c +++ b/net/sunrpc/backchannel_rqst.c @@ -91,7 +91,6 @@ struct rpc_rqst *xprt_alloc_bc_req(struct rpc_xprt *xprt, gfp_t gfp_flags) return NULL; req->rq_xprt = xprt; - INIT_LIST_HEAD(&req->rq_list); INIT_LIST_HEAD(&req->rq_bc_list); /* Preallocate one XDR receive buffer */ diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 8ea2f5fadd96..ae3b8145da35 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -61,6 +61,7 @@ static void call_start(struct rpc_task *task); static void call_reserve(struct rpc_task *task); static void call_reserveresult(struct rpc_task *task); static void call_allocate(struct rpc_task *task); +static void call_encode(struct rpc_task *task); static void call_decode(struct rpc_task *task); static void call_bind(struct rpc_task *task); static void call_bind_status(struct rpc_task *task); @@ -1137,10 +1138,10 @@ EXPORT_SYMBOL_GPL(rpc_call_async); struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req) { struct rpc_task *task; - struct xdr_buf *xbufp = &req->rq_snd_buf; struct rpc_task_setup task_setup_data = { .callback_ops = &rpc_default_ops, - .flags = RPC_TASK_SOFTCONN, + .flags = RPC_TASK_SOFTCONN | + RPC_TASK_NO_RETRANS_TIMEOUT, }; dprintk("RPC: rpc_run_bc_task req= %p\n", req); @@ -1148,14 +1149,7 @@ struct rpc_task *rpc_run_bc_task(struct rpc_rqst *req) * Create an rpc_task to send the data */ task = rpc_new_task(&task_setup_data); - task->tk_rqstp = req; - - /* - * Set up the xdr_buf length. - * This also indicates that the buffer is XDR encoded already. - */ - xbufp->len = xbufp->head[0].iov_len + xbufp->page_len + - xbufp->tail[0].iov_len; + xprt_init_bc_request(req, task); task->tk_action = call_bc_transmit; atomic_inc(&task->tk_count); @@ -1558,7 +1552,6 @@ call_reserveresult(struct rpc_task *task) task->tk_status = 0; if (status >= 0) { if (task->tk_rqstp) { - xprt_request_init(task); task->tk_action = call_refresh; return; } @@ -1680,7 +1673,7 @@ call_allocate(struct rpc_task *task) dprint_status(task); task->tk_status = 0; - task->tk_action = call_bind; + task->tk_action = call_encode; if (req->rq_buffer) return; @@ -1721,22 +1714,15 @@ call_allocate(struct rpc_task *task) rpc_exit(task, -ERESTARTSYS); } -static inline int +static int rpc_task_need_encode(struct rpc_task *task) { - return task->tk_rqstp->rq_snd_buf.len == 0; + return test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate) == 0 && + (!(task->tk_flags & RPC_TASK_SENT) || + !(task->tk_flags & RPC_TASK_NO_RETRANS_TIMEOUT) || + xprt_request_need_retransmit(task)); } -static inline void -rpc_task_force_reencode(struct rpc_task *task) -{ - task->tk_rqstp->rq_snd_buf.len = 0; - task->tk_rqstp->rq_bytes_sent = 0; -} - -/* - * 3. Encode arguments of an RPC call - */ static void rpc_xdr_encode(struct rpc_task *task) { @@ -1752,6 +1738,7 @@ rpc_xdr_encode(struct rpc_task *task) xdr_buf_init(&req->rq_rcv_buf, req->rq_rbuffer, req->rq_rcvsize); + req->rq_bytes_sent = 0; p = rpc_encode_header(task); if (p == NULL) { @@ -1766,6 +1753,36 @@ rpc_xdr_encode(struct rpc_task *task) task->tk_status = rpcauth_wrap_req(task, encode, req, p, task->tk_msg.rpc_argp); + if (task->tk_status == 0) + xprt_request_prepare(req); +} + +/* + * 3. Encode arguments of an RPC call + */ +static void +call_encode(struct rpc_task *task) +{ + if (!rpc_task_need_encode(task)) + goto out; + /* Encode here so that rpcsec_gss can use correct sequence number. */ + rpc_xdr_encode(task); + /* Did the encode result in an error condition? */ + if (task->tk_status != 0) { + /* Was the error nonfatal? */ + if (task->tk_status == -EAGAIN || task->tk_status == -ENOMEM) + rpc_delay(task, HZ >> 4); + else + rpc_exit(task, task->tk_status); + return; + } + + /* Add task to reply queue before transmission to avoid races */ + if (rpc_reply_expected(task)) + xprt_request_enqueue_receive(task); + xprt_request_enqueue_transmit(task); +out: + task->tk_action = call_bind; } /* @@ -1947,43 +1964,16 @@ call_connect_status(struct rpc_task *task) static void call_transmit(struct rpc_task *task) { - int is_retrans = RPC_WAS_SENT(task); - dprint_status(task); - task->tk_action = call_status; - if (task->tk_status < 0) - return; - if (!xprt_prepare_transmit(task)) - return; - task->tk_action = call_transmit_status; - /* Encode here so that rpcsec_gss can use correct sequence number. */ - if (rpc_task_need_encode(task)) { - rpc_xdr_encode(task); - /* Did the encode result in an error condition? */ - if (task->tk_status != 0) { - /* Was the error nonfatal? */ - if (task->tk_status == -EAGAIN) - rpc_delay(task, HZ >> 4); - else - rpc_exit(task, task->tk_status); + task->tk_status = 0; + if (test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate)) { + if (!xprt_prepare_transmit(task)) return; - } + xprt_transmit(task); } - xprt_transmit(task); - if (task->tk_status < 0) - return; - if (is_retrans) - task->tk_client->cl_stats->rpcretrans++; - /* - * On success, ensure that we call xprt_end_transmit() before sleeping - * in order to allow access to the socket to other RPC requests. - */ - call_transmit_status(task); - if (rpc_reply_expected(task)) - return; - task->tk_action = rpc_exit_task; - rpc_wake_up_queued_task(&task->tk_rqstp->rq_xprt->pending, task); + task->tk_action = call_transmit_status; + xprt_end_transmit(task); } /* @@ -1999,19 +1989,17 @@ call_transmit_status(struct rpc_task *task) * test first. */ if (task->tk_status == 0) { - xprt_end_transmit(task); - rpc_task_force_reencode(task); + xprt_request_wait_receive(task); return; } switch (task->tk_status) { - case -EAGAIN: - case -ENOBUFS: - break; default: dprint_status(task); - xprt_end_transmit(task); - rpc_task_force_reencode(task); + break; + case -EBADMSG: + task->tk_status = 0; + task->tk_action = call_encode; break; /* * Special cases: if we've been waiting on the @@ -2019,6 +2007,14 @@ call_transmit_status(struct rpc_task *task) * socket just returned a connection error, * then hold onto the transport lock. */ + case -ENOBUFS: + rpc_delay(task, HZ>>2); + /* fall through */ + case -EBADSLT: + case -EAGAIN: + task->tk_action = call_transmit; + task->tk_status = 0; + break; case -ECONNREFUSED: case -EHOSTDOWN: case -ENETDOWN: @@ -2026,7 +2022,6 @@ call_transmit_status(struct rpc_task *task) case -ENETUNREACH: case -EPERM: if (RPC_IS_SOFTCONN(task)) { - xprt_end_transmit(task); if (!task->tk_msg.rpc_proc->p_proc) trace_xprt_ping(task->tk_xprt, task->tk_status); @@ -2039,7 +2034,7 @@ call_transmit_status(struct rpc_task *task) case -EADDRINUSE: case -ENOTCONN: case -EPIPE: - rpc_task_force_reencode(task); + break; } } @@ -2053,6 +2048,11 @@ call_bc_transmit(struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; + if (rpc_task_need_encode(task)) + xprt_request_enqueue_transmit(task); + if (!test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate)) + goto out_wakeup; + if (!xprt_prepare_transmit(task)) goto out_retry; @@ -2061,14 +2061,9 @@ call_bc_transmit(struct rpc_task *task) "error: %d\n", task->tk_status); goto out_done; } - if (req->rq_connect_cookie != req->rq_xprt->connect_cookie) - req->rq_bytes_sent = 0; xprt_transmit(task); - if (task->tk_status == -EAGAIN) - goto out_nospace; - xprt_end_transmit(task); dprint_status(task); switch (task->tk_status) { @@ -2084,6 +2079,8 @@ call_bc_transmit(struct rpc_task *task) case -ENOTCONN: case -EPIPE: break; + case -EAGAIN: + goto out_retry; case -ETIMEDOUT: /* * Problem reaching the server. Disconnect and let the @@ -2107,12 +2104,11 @@ call_bc_transmit(struct rpc_task *task) "error: %d\n", task->tk_status); break; } +out_wakeup: rpc_wake_up_queued_task(&req->rq_xprt->pending, task); out_done: task->tk_action = rpc_exit_task; return; -out_nospace: - req->rq_connect_cookie = req->rq_xprt->connect_cookie; out_retry: task->tk_status = 0; } @@ -2125,15 +2121,11 @@ static void call_status(struct rpc_task *task) { struct rpc_clnt *clnt = task->tk_client; - struct rpc_rqst *req = task->tk_rqstp; int status; if (!task->tk_msg.rpc_proc->p_proc) trace_xprt_ping(task->tk_xprt, task->tk_status); - if (req->rq_reply_bytes_recvd > 0 && !req->rq_bytes_sent) - task->tk_status = req->rq_reply_bytes_recvd; - dprint_status(task); status = task->tk_status; @@ -2173,13 +2165,8 @@ call_status(struct rpc_task *task) /* fall through */ case -EPIPE: case -ENOTCONN: - task->tk_action = call_bind; - break; - case -ENOBUFS: - rpc_delay(task, HZ>>2); - /* fall through */ case -EAGAIN: - task->tk_action = call_transmit; + task->tk_action = call_encode; break; case -EIO: /* shutdown or soft timeout */ @@ -2244,7 +2231,7 @@ call_timeout(struct rpc_task *task) rpcauth_invalcred(task); retry: - task->tk_action = call_bind; + task->tk_action = call_encode; task->tk_status = 0; } @@ -2261,6 +2248,11 @@ call_decode(struct rpc_task *task) dprint_status(task); + if (!decode) { + task->tk_action = rpc_exit_task; + return; + } + if (task->tk_flags & RPC_CALL_MAJORSEEN) { if (clnt->cl_chatty) { printk(KERN_NOTICE "%s: server %s OK\n", @@ -2283,7 +2275,7 @@ call_decode(struct rpc_task *task) if (req->rq_rcv_buf.len < 12) { if (!RPC_IS_SOFT(task)) { - task->tk_action = call_bind; + task->tk_action = call_encode; goto out_retry; } dprintk("RPC: %s: too small RPC reply size (%d bytes)\n", @@ -2298,13 +2290,11 @@ call_decode(struct rpc_task *task) goto out_retry; return; } - task->tk_action = rpc_exit_task; - if (decode) { - task->tk_status = rpcauth_unwrap_resp(task, decode, req, p, - task->tk_msg.rpc_resp); - } + task->tk_status = rpcauth_unwrap_resp(task, decode, req, p, + task->tk_msg.rpc_resp); + dprintk("RPC: %5u call_decode result %d\n", task->tk_pid, task->tk_status); return; @@ -2416,7 +2406,7 @@ rpc_verify_header(struct rpc_task *task) task->tk_garb_retry--; dprintk("RPC: %5u %s: retry garbled creds\n", task->tk_pid, __func__); - task->tk_action = call_bind; + task->tk_action = call_encode; goto out_retry; case RPC_AUTH_TOOWEAK: printk(KERN_NOTICE "RPC: server %s requires stronger " @@ -2485,7 +2475,7 @@ out_garbage: task->tk_garb_retry--; dprintk("RPC: %5u %s: retrying\n", task->tk_pid, __func__); - task->tk_action = call_bind; + task->tk_action = call_encode; out_retry: return ERR_PTR(-EAGAIN); } diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 3fe5d60ab0e2..57ca5bead1cb 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -99,65 +99,79 @@ __rpc_add_timer(struct rpc_wait_queue *queue, struct rpc_task *task) list_add(&task->u.tk_wait.timer_list, &queue->timer_list.list); } -static void rpc_rotate_queue_owner(struct rpc_wait_queue *queue) -{ - struct list_head *q = &queue->tasks[queue->priority]; - struct rpc_task *task; - - if (!list_empty(q)) { - task = list_first_entry(q, struct rpc_task, u.tk_wait.list); - if (task->tk_owner == queue->owner) - list_move_tail(&task->u.tk_wait.list, q); - } -} - static void rpc_set_waitqueue_priority(struct rpc_wait_queue *queue, int priority) { if (queue->priority != priority) { - /* Fairness: rotate the list when changing priority */ - rpc_rotate_queue_owner(queue); queue->priority = priority; + queue->nr = 1U << priority; } } -static void rpc_set_waitqueue_owner(struct rpc_wait_queue *queue, pid_t pid) -{ - queue->owner = pid; - queue->nr = RPC_BATCH_COUNT; -} - static void rpc_reset_waitqueue_priority(struct rpc_wait_queue *queue) { rpc_set_waitqueue_priority(queue, queue->maxpriority); - rpc_set_waitqueue_owner(queue, 0); } /* - * Add new request to a priority queue. + * Add a request to a queue list */ -static void __rpc_add_wait_queue_priority(struct rpc_wait_queue *queue, - struct rpc_task *task, - unsigned char queue_priority) +static void +__rpc_list_enqueue_task(struct list_head *q, struct rpc_task *task) { - struct list_head *q; struct rpc_task *t; - INIT_LIST_HEAD(&task->u.tk_wait.links); - if (unlikely(queue_priority > queue->maxpriority)) - queue_priority = queue->maxpriority; - if (queue_priority > queue->priority) - rpc_set_waitqueue_priority(queue, queue_priority); - q = &queue->tasks[queue_priority]; list_for_each_entry(t, q, u.tk_wait.list) { if (t->tk_owner == task->tk_owner) { - list_add_tail(&task->u.tk_wait.list, &t->u.tk_wait.links); + list_add_tail(&task->u.tk_wait.links, + &t->u.tk_wait.links); + /* Cache the queue head in task->u.tk_wait.list */ + task->u.tk_wait.list.next = q; + task->u.tk_wait.list.prev = NULL; return; } } + INIT_LIST_HEAD(&task->u.tk_wait.links); list_add_tail(&task->u.tk_wait.list, q); } /* + * Remove request from a queue list + */ +static void +__rpc_list_dequeue_task(struct rpc_task *task) +{ + struct list_head *q; + struct rpc_task *t; + + if (task->u.tk_wait.list.prev == NULL) { + list_del(&task->u.tk_wait.links); + return; + } + if (!list_empty(&task->u.tk_wait.links)) { + t = list_first_entry(&task->u.tk_wait.links, + struct rpc_task, + u.tk_wait.links); + /* Assume __rpc_list_enqueue_task() cached the queue head */ + q = t->u.tk_wait.list.next; + list_add_tail(&t->u.tk_wait.list, q); + list_del(&task->u.tk_wait.links); + } + list_del(&task->u.tk_wait.list); +} + +/* + * Add new request to a priority queue. + */ +static void __rpc_add_wait_queue_priority(struct rpc_wait_queue *queue, + struct rpc_task *task, + unsigned char queue_priority) +{ + if (unlikely(queue_priority > queue->maxpriority)) + queue_priority = queue->maxpriority; + __rpc_list_enqueue_task(&queue->tasks[queue_priority], task); +} + +/* * Add new request to wait queue. * * Swapper tasks always get inserted at the head of the queue. @@ -194,13 +208,7 @@ static void __rpc_add_wait_queue(struct rpc_wait_queue *queue, */ static void __rpc_remove_wait_queue_priority(struct rpc_task *task) { - struct rpc_task *t; - - if (!list_empty(&task->u.tk_wait.links)) { - t = list_entry(task->u.tk_wait.links.next, struct rpc_task, u.tk_wait.list); - list_move(&t->u.tk_wait.list, &task->u.tk_wait.list); - list_splice_init(&task->u.tk_wait.links, &t->u.tk_wait.links); - } + __rpc_list_dequeue_task(task); } /* @@ -212,7 +220,8 @@ static void __rpc_remove_wait_queue(struct rpc_wait_queue *queue, struct rpc_tas __rpc_disable_timer(queue, task); if (RPC_IS_PRIORITY(queue)) __rpc_remove_wait_queue_priority(task); - list_del(&task->u.tk_wait.list); + else + list_del(&task->u.tk_wait.list); queue->qlen--; dprintk("RPC: %5u removed from queue %p \"%s\"\n", task->tk_pid, queue, rpc_qname(queue)); @@ -440,14 +449,28 @@ static void __rpc_do_wake_up_task_on_wq(struct workqueue_struct *wq, /* * Wake up a queued task while the queue lock is being held */ -static void rpc_wake_up_task_on_wq_queue_locked(struct workqueue_struct *wq, - struct rpc_wait_queue *queue, struct rpc_task *task) +static struct rpc_task * +rpc_wake_up_task_on_wq_queue_action_locked(struct workqueue_struct *wq, + struct rpc_wait_queue *queue, struct rpc_task *task, + bool (*action)(struct rpc_task *, void *), void *data) { if (RPC_IS_QUEUED(task)) { smp_rmb(); - if (task->tk_waitqueue == queue) - __rpc_do_wake_up_task_on_wq(wq, queue, task); + if (task->tk_waitqueue == queue) { + if (action == NULL || action(task, data)) { + __rpc_do_wake_up_task_on_wq(wq, queue, task); + return task; + } + } } + return NULL; +} + +static void +rpc_wake_up_task_on_wq_queue_locked(struct workqueue_struct *wq, + struct rpc_wait_queue *queue, struct rpc_task *task) +{ + rpc_wake_up_task_on_wq_queue_action_locked(wq, queue, task, NULL, NULL); } /* @@ -465,6 +488,8 @@ void rpc_wake_up_queued_task_on_wq(struct workqueue_struct *wq, struct rpc_wait_queue *queue, struct rpc_task *task) { + if (!RPC_IS_QUEUED(task)) + return; spin_lock_bh(&queue->lock); rpc_wake_up_task_on_wq_queue_locked(wq, queue, task); spin_unlock_bh(&queue->lock); @@ -475,12 +500,48 @@ void rpc_wake_up_queued_task_on_wq(struct workqueue_struct *wq, */ void rpc_wake_up_queued_task(struct rpc_wait_queue *queue, struct rpc_task *task) { + if (!RPC_IS_QUEUED(task)) + return; spin_lock_bh(&queue->lock); rpc_wake_up_task_queue_locked(queue, task); spin_unlock_bh(&queue->lock); } EXPORT_SYMBOL_GPL(rpc_wake_up_queued_task); +static bool rpc_task_action_set_status(struct rpc_task *task, void *status) +{ + task->tk_status = *(int *)status; + return true; +} + +static void +rpc_wake_up_task_queue_set_status_locked(struct rpc_wait_queue *queue, + struct rpc_task *task, int status) +{ + rpc_wake_up_task_on_wq_queue_action_locked(rpciod_workqueue, queue, + task, rpc_task_action_set_status, &status); +} + +/** + * rpc_wake_up_queued_task_set_status - wake up a task and set task->tk_status + * @queue: pointer to rpc_wait_queue + * @task: pointer to rpc_task + * @status: integer error value + * + * If @task is queued on @queue, then it is woken up, and @task->tk_status is + * set to the value of @status. + */ +void +rpc_wake_up_queued_task_set_status(struct rpc_wait_queue *queue, + struct rpc_task *task, int status) +{ + if (!RPC_IS_QUEUED(task)) + return; + spin_lock_bh(&queue->lock); + rpc_wake_up_task_queue_set_status_locked(queue, task, status); + spin_unlock_bh(&queue->lock); +} + /* * Wake up the next task on a priority queue. */ @@ -493,17 +554,9 @@ static struct rpc_task *__rpc_find_next_queued_priority(struct rpc_wait_queue *q * Service a batch of tasks from a single owner. */ q = &queue->tasks[queue->priority]; - if (!list_empty(q)) { - task = list_entry(q->next, struct rpc_task, u.tk_wait.list); - if (queue->owner == task->tk_owner) { - if (--queue->nr) - goto out; - list_move_tail(&task->u.tk_wait.list, q); - } - /* - * Check if we need to switch queues. - */ - goto new_owner; + if (!list_empty(q) && --queue->nr) { + task = list_first_entry(q, struct rpc_task, u.tk_wait.list); + goto out; } /* @@ -515,7 +568,7 @@ static struct rpc_task *__rpc_find_next_queued_priority(struct rpc_wait_queue *q else q = q - 1; if (!list_empty(q)) { - task = list_entry(q->next, struct rpc_task, u.tk_wait.list); + task = list_first_entry(q, struct rpc_task, u.tk_wait.list); goto new_queue; } } while (q != &queue->tasks[queue->priority]); @@ -525,8 +578,6 @@ static struct rpc_task *__rpc_find_next_queued_priority(struct rpc_wait_queue *q new_queue: rpc_set_waitqueue_priority(queue, (unsigned int)(q - &queue->tasks[0])); -new_owner: - rpc_set_waitqueue_owner(queue, task->tk_owner); out: return task; } @@ -553,12 +604,9 @@ struct rpc_task *rpc_wake_up_first_on_wq(struct workqueue_struct *wq, queue, rpc_qname(queue)); spin_lock_bh(&queue->lock); task = __rpc_find_next_queued(queue); - if (task != NULL) { - if (func(task, data)) - rpc_wake_up_task_on_wq_queue_locked(wq, queue, task); - else - task = NULL; - } + if (task != NULL) + task = rpc_wake_up_task_on_wq_queue_action_locked(wq, queue, + task, func, data); spin_unlock_bh(&queue->lock); return task; diff --git a/net/sunrpc/socklib.c b/net/sunrpc/socklib.c index f217c348b341..9062967575c4 100644 --- a/net/sunrpc/socklib.c +++ b/net/sunrpc/socklib.c @@ -26,7 +26,8 @@ * Possibly called several times to iterate over an sk_buff and copy * data out of it. */ -size_t xdr_skb_read_bits(struct xdr_skb_reader *desc, void *to, size_t len) +static size_t +xdr_skb_read_bits(struct xdr_skb_reader *desc, void *to, size_t len) { if (len > desc->count) len = desc->count; @@ -36,7 +37,6 @@ size_t xdr_skb_read_bits(struct xdr_skb_reader *desc, void *to, size_t len) desc->offset += len; return len; } -EXPORT_SYMBOL_GPL(xdr_skb_read_bits); /** * xdr_skb_read_and_csum_bits - copy and checksum from skb to buffer @@ -69,7 +69,8 @@ static size_t xdr_skb_read_and_csum_bits(struct xdr_skb_reader *desc, void *to, * @copy_actor: virtual method for copying data * */ -ssize_t xdr_partial_copy_from_skb(struct xdr_buf *xdr, unsigned int base, struct xdr_skb_reader *desc, xdr_skb_read_actor copy_actor) +static ssize_t +xdr_partial_copy_from_skb(struct xdr_buf *xdr, unsigned int base, struct xdr_skb_reader *desc, xdr_skb_read_actor copy_actor) { struct page **ppage = xdr->pages; unsigned int len, pglen = xdr->page_len; @@ -104,7 +105,7 @@ ssize_t xdr_partial_copy_from_skb(struct xdr_buf *xdr, unsigned int base, struct /* ACL likes to be lazy in allocating pages - ACLs * are small by default but can get huge. */ - if (unlikely(*ppage == NULL)) { + if ((xdr->flags & XDRBUF_SPARSE_PAGES) && *ppage == NULL) { *ppage = alloc_page(GFP_ATOMIC); if (unlikely(*ppage == NULL)) { if (copied == 0) @@ -140,7 +141,6 @@ copy_tail: out: return copied; } -EXPORT_SYMBOL_GPL(xdr_partial_copy_from_skb); /** * csum_partial_copy_to_xdr - checksum and copy data diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c index 5185efb9027b..87533fbb96cf 100644 --- a/net/sunrpc/svc_xprt.c +++ b/net/sunrpc/svc_xprt.c @@ -171,7 +171,6 @@ void svc_xprt_init(struct net *net, struct svc_xprt_class *xcl, mutex_init(&xprt->xpt_mutex); spin_lock_init(&xprt->xpt_lock); set_bit(XPT_BUSY, &xprt->xpt_flags); - rpc_init_wait_queue(&xprt->xpt_bc_pending, "xpt_bc_pending"); xprt->xpt_net = get_net(net); strcpy(xprt->xpt_remotebuf, "uninitialized"); } @@ -895,7 +894,6 @@ int svc_send(struct svc_rqst *rqstp) else len = xprt->xpt_ops->xpo_sendto(rqstp); mutex_unlock(&xprt->xpt_mutex); - rpc_wake_up(&xprt->xpt_bc_pending); trace_svc_send(rqstp, len); svc_xprt_release(rqstp); diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 5445145e639c..db8bb6b3a2b0 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -1004,7 +1004,7 @@ static int receive_cb_reply(struct svc_sock *svsk, struct svc_rqst *rqstp) if (!bc_xprt) return -EAGAIN; - spin_lock(&bc_xprt->recv_lock); + spin_lock(&bc_xprt->queue_lock); req = xprt_lookup_rqst(bc_xprt, xid); if (!req) goto unlock_notfound; @@ -1022,7 +1022,7 @@ static int receive_cb_reply(struct svc_sock *svsk, struct svc_rqst *rqstp) memcpy(dst->iov_base, src->iov_base, src->iov_len); xprt_complete_rqst(req->rq_task, rqstp->rq_arg.len); rqstp->rq_arg.len = 0; - spin_unlock(&bc_xprt->recv_lock); + spin_unlock(&bc_xprt->queue_lock); return 0; unlock_notfound: printk(KERN_NOTICE @@ -1031,7 +1031,7 @@ unlock_notfound: __func__, ntohl(calldir), bc_xprt, ntohl(xid)); unlock_eagain: - spin_unlock(&bc_xprt->recv_lock); + spin_unlock(&bc_xprt->queue_lock); return -EAGAIN; } diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 30afbd236656..2bbb8d38d2bf 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -15,6 +15,7 @@ #include <linux/errno.h> #include <linux/sunrpc/xdr.h> #include <linux/sunrpc/msg_prot.h> +#include <linux/bvec.h> /* * XDR functions for basic NFS types @@ -128,6 +129,39 @@ xdr_terminate_string(struct xdr_buf *buf, const u32 len) } EXPORT_SYMBOL_GPL(xdr_terminate_string); +size_t +xdr_buf_pagecount(struct xdr_buf *buf) +{ + if (!buf->page_len) + return 0; + return (buf->page_base + buf->page_len + PAGE_SIZE - 1) >> PAGE_SHIFT; +} + +int +xdr_alloc_bvec(struct xdr_buf *buf, gfp_t gfp) +{ + size_t i, n = xdr_buf_pagecount(buf); + + if (n != 0 && buf->bvec == NULL) { + buf->bvec = kmalloc_array(n, sizeof(buf->bvec[0]), gfp); + if (!buf->bvec) + return -ENOMEM; + for (i = 0; i < n; i++) { + buf->bvec[i].bv_page = buf->pages[i]; + buf->bvec[i].bv_len = PAGE_SIZE; + buf->bvec[i].bv_offset = 0; + } + } + return 0; +} + +void +xdr_free_bvec(struct xdr_buf *buf) +{ + kfree(buf->bvec); + buf->bvec = NULL; +} + void xdr_inline_pages(struct xdr_buf *xdr, unsigned int offset, struct page **pages, unsigned int base, unsigned int len) diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index a8db2e3f8904..86bea4520c4d 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -68,8 +68,6 @@ static void xprt_init(struct rpc_xprt *xprt, struct net *net); static __be32 xprt_alloc_xid(struct rpc_xprt *xprt); static void xprt_connect_status(struct rpc_task *task); -static int __xprt_get_cong(struct rpc_xprt *, struct rpc_task *); -static void __xprt_put_cong(struct rpc_xprt *, struct rpc_rqst *); static void xprt_destroy(struct rpc_xprt *xprt); static DEFINE_SPINLOCK(xprt_list_lock); @@ -171,6 +169,17 @@ out: } EXPORT_SYMBOL_GPL(xprt_load_transport); +static void xprt_clear_locked(struct rpc_xprt *xprt) +{ + xprt->snd_task = NULL; + if (!test_bit(XPRT_CLOSE_WAIT, &xprt->state)) { + smp_mb__before_atomic(); + clear_bit(XPRT_LOCKED, &xprt->state); + smp_mb__after_atomic(); + } else + queue_work(xprtiod_workqueue, &xprt->task_cleanup); +} + /** * xprt_reserve_xprt - serialize write access to transports * @task: task that is requesting access to the transport @@ -183,44 +192,53 @@ EXPORT_SYMBOL_GPL(xprt_load_transport); int xprt_reserve_xprt(struct rpc_xprt *xprt, struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; - int priority; if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) { if (task == xprt->snd_task) return 1; goto out_sleep; } + if (test_bit(XPRT_WRITE_SPACE, &xprt->state)) + goto out_unlock; xprt->snd_task = task; - if (req != NULL) - req->rq_ntrans++; return 1; +out_unlock: + xprt_clear_locked(xprt); out_sleep: dprintk("RPC: %5u failed to lock transport %p\n", task->tk_pid, xprt); - task->tk_timeout = 0; + task->tk_timeout = RPC_IS_SOFT(task) ? req->rq_timeout : 0; task->tk_status = -EAGAIN; - if (req == NULL) - priority = RPC_PRIORITY_LOW; - else if (!req->rq_ntrans) - priority = RPC_PRIORITY_NORMAL; - else - priority = RPC_PRIORITY_HIGH; - rpc_sleep_on_priority(&xprt->sending, task, NULL, priority); + rpc_sleep_on(&xprt->sending, task, NULL); return 0; } EXPORT_SYMBOL_GPL(xprt_reserve_xprt); -static void xprt_clear_locked(struct rpc_xprt *xprt) +static bool +xprt_need_congestion_window_wait(struct rpc_xprt *xprt) { - xprt->snd_task = NULL; - if (!test_bit(XPRT_CLOSE_WAIT, &xprt->state)) { - smp_mb__before_atomic(); - clear_bit(XPRT_LOCKED, &xprt->state); - smp_mb__after_atomic(); - } else - queue_work(xprtiod_workqueue, &xprt->task_cleanup); + return test_bit(XPRT_CWND_WAIT, &xprt->state); +} + +static void +xprt_set_congestion_window_wait(struct rpc_xprt *xprt) +{ + if (!list_empty(&xprt->xmit_queue)) { + /* Peek at head of queue to see if it can make progress */ + if (list_first_entry(&xprt->xmit_queue, struct rpc_rqst, + rq_xmit)->rq_cong) + return; + } + set_bit(XPRT_CWND_WAIT, &xprt->state); +} + +static void +xprt_test_and_clear_congestion_window_wait(struct rpc_xprt *xprt) +{ + if (!RPCXPRT_CONGESTED(xprt)) + clear_bit(XPRT_CWND_WAIT, &xprt->state); } /* @@ -230,11 +248,11 @@ static void xprt_clear_locked(struct rpc_xprt *xprt) * Same as xprt_reserve_xprt, but Van Jacobson congestion control is * integrated into the decision of whether a request is allowed to be * woken up and given access to the transport. + * Note that the lock is only granted if we know there are free slots. */ int xprt_reserve_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; - int priority; if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) { if (task == xprt->snd_task) @@ -245,25 +263,19 @@ int xprt_reserve_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task) xprt->snd_task = task; return 1; } - if (__xprt_get_cong(xprt, task)) { + if (test_bit(XPRT_WRITE_SPACE, &xprt->state)) + goto out_unlock; + if (!xprt_need_congestion_window_wait(xprt)) { xprt->snd_task = task; - req->rq_ntrans++; return 1; } +out_unlock: xprt_clear_locked(xprt); out_sleep: - if (req) - __xprt_put_cong(xprt, req); dprintk("RPC: %5u failed to lock transport %p\n", task->tk_pid, xprt); - task->tk_timeout = 0; + task->tk_timeout = RPC_IS_SOFT(task) ? req->rq_timeout : 0; task->tk_status = -EAGAIN; - if (req == NULL) - priority = RPC_PRIORITY_LOW; - else if (!req->rq_ntrans) - priority = RPC_PRIORITY_NORMAL; - else - priority = RPC_PRIORITY_HIGH; - rpc_sleep_on_priority(&xprt->sending, task, NULL, priority); + rpc_sleep_on(&xprt->sending, task, NULL); return 0; } EXPORT_SYMBOL_GPL(xprt_reserve_xprt_cong); @@ -272,6 +284,8 @@ static inline int xprt_lock_write(struct rpc_xprt *xprt, struct rpc_task *task) { int retval; + if (test_bit(XPRT_LOCKED, &xprt->state) && xprt->snd_task == task) + return 1; spin_lock_bh(&xprt->transport_lock); retval = xprt->ops->reserve_xprt(xprt, task); spin_unlock_bh(&xprt->transport_lock); @@ -281,12 +295,8 @@ static inline int xprt_lock_write(struct rpc_xprt *xprt, struct rpc_task *task) static bool __xprt_lock_write_func(struct rpc_task *task, void *data) { struct rpc_xprt *xprt = data; - struct rpc_rqst *req; - req = task->tk_rqstp; xprt->snd_task = task; - if (req) - req->rq_ntrans++; return true; } @@ -294,53 +304,30 @@ static void __xprt_lock_write_next(struct rpc_xprt *xprt) { if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) return; - + if (test_bit(XPRT_WRITE_SPACE, &xprt->state)) + goto out_unlock; if (rpc_wake_up_first_on_wq(xprtiod_workqueue, &xprt->sending, __xprt_lock_write_func, xprt)) return; +out_unlock: xprt_clear_locked(xprt); } -static bool __xprt_lock_write_cong_func(struct rpc_task *task, void *data) -{ - struct rpc_xprt *xprt = data; - struct rpc_rqst *req; - - req = task->tk_rqstp; - if (req == NULL) { - xprt->snd_task = task; - return true; - } - if (__xprt_get_cong(xprt, task)) { - xprt->snd_task = task; - req->rq_ntrans++; - return true; - } - return false; -} - static void __xprt_lock_write_next_cong(struct rpc_xprt *xprt) { if (test_and_set_bit(XPRT_LOCKED, &xprt->state)) return; - if (RPCXPRT_CONGESTED(xprt)) + if (test_bit(XPRT_WRITE_SPACE, &xprt->state)) + goto out_unlock; + if (xprt_need_congestion_window_wait(xprt)) goto out_unlock; if (rpc_wake_up_first_on_wq(xprtiod_workqueue, &xprt->sending, - __xprt_lock_write_cong_func, xprt)) + __xprt_lock_write_func, xprt)) return; out_unlock: xprt_clear_locked(xprt); } -static void xprt_task_clear_bytes_sent(struct rpc_task *task) -{ - if (task != NULL) { - struct rpc_rqst *req = task->tk_rqstp; - if (req != NULL) - req->rq_bytes_sent = 0; - } -} - /** * xprt_release_xprt - allow other requests to use a transport * @xprt: transport with other tasks potentially waiting @@ -351,7 +338,6 @@ static void xprt_task_clear_bytes_sent(struct rpc_task *task) void xprt_release_xprt(struct rpc_xprt *xprt, struct rpc_task *task) { if (xprt->snd_task == task) { - xprt_task_clear_bytes_sent(task); xprt_clear_locked(xprt); __xprt_lock_write_next(xprt); } @@ -369,7 +355,6 @@ EXPORT_SYMBOL_GPL(xprt_release_xprt); void xprt_release_xprt_cong(struct rpc_xprt *xprt, struct rpc_task *task) { if (xprt->snd_task == task) { - xprt_task_clear_bytes_sent(task); xprt_clear_locked(xprt); __xprt_lock_write_next_cong(xprt); } @@ -378,6 +363,8 @@ EXPORT_SYMBOL_GPL(xprt_release_xprt_cong); static inline void xprt_release_write(struct rpc_xprt *xprt, struct rpc_task *task) { + if (xprt->snd_task != task) + return; spin_lock_bh(&xprt->transport_lock); xprt->ops->release_xprt(xprt, task); spin_unlock_bh(&xprt->transport_lock); @@ -388,16 +375,16 @@ static inline void xprt_release_write(struct rpc_xprt *xprt, struct rpc_task *ta * overflowed. Put the task to sleep if this is the case. */ static int -__xprt_get_cong(struct rpc_xprt *xprt, struct rpc_task *task) +__xprt_get_cong(struct rpc_xprt *xprt, struct rpc_rqst *req) { - struct rpc_rqst *req = task->tk_rqstp; - if (req->rq_cong) return 1; dprintk("RPC: %5u xprt_cwnd_limited cong = %lu cwnd = %lu\n", - task->tk_pid, xprt->cong, xprt->cwnd); - if (RPCXPRT_CONGESTED(xprt)) + req->rq_task->tk_pid, xprt->cong, xprt->cwnd); + if (RPCXPRT_CONGESTED(xprt)) { + xprt_set_congestion_window_wait(xprt); return 0; + } req->rq_cong = 1; xprt->cong += RPC_CWNDSCALE; return 1; @@ -414,10 +401,32 @@ __xprt_put_cong(struct rpc_xprt *xprt, struct rpc_rqst *req) return; req->rq_cong = 0; xprt->cong -= RPC_CWNDSCALE; + xprt_test_and_clear_congestion_window_wait(xprt); __xprt_lock_write_next_cong(xprt); } /** + * xprt_request_get_cong - Request congestion control credits + * @xprt: pointer to transport + * @req: pointer to RPC request + * + * Useful for transports that require congestion control. + */ +bool +xprt_request_get_cong(struct rpc_xprt *xprt, struct rpc_rqst *req) +{ + bool ret = false; + + if (req->rq_cong) + return true; + spin_lock_bh(&xprt->transport_lock); + ret = __xprt_get_cong(xprt, req) != 0; + spin_unlock_bh(&xprt->transport_lock); + return ret; +} +EXPORT_SYMBOL_GPL(xprt_request_get_cong); + +/** * xprt_release_rqst_cong - housekeeping when request is complete * @task: RPC request that recently completed * @@ -431,6 +440,20 @@ void xprt_release_rqst_cong(struct rpc_task *task) } EXPORT_SYMBOL_GPL(xprt_release_rqst_cong); +/* + * Clear the congestion window wait flag and wake up the next + * entry on xprt->sending + */ +static void +xprt_clear_congestion_window_wait(struct rpc_xprt *xprt) +{ + if (test_and_clear_bit(XPRT_CWND_WAIT, &xprt->state)) { + spin_lock_bh(&xprt->transport_lock); + __xprt_lock_write_next_cong(xprt); + spin_unlock_bh(&xprt->transport_lock); + } +} + /** * xprt_adjust_cwnd - adjust transport congestion window * @xprt: pointer to xprt @@ -488,39 +511,46 @@ EXPORT_SYMBOL_GPL(xprt_wake_pending_tasks); /** * xprt_wait_for_buffer_space - wait for transport output buffer to clear - * @task: task to be put to sleep - * @action: function pointer to be executed after wait + * @xprt: transport * * Note that we only set the timer for the case of RPC_IS_SOFT(), since * we don't in general want to force a socket disconnection due to * an incomplete RPC call transmission. */ -void xprt_wait_for_buffer_space(struct rpc_task *task, rpc_action action) +void xprt_wait_for_buffer_space(struct rpc_xprt *xprt) { - struct rpc_rqst *req = task->tk_rqstp; - struct rpc_xprt *xprt = req->rq_xprt; - - task->tk_timeout = RPC_IS_SOFT(task) ? req->rq_timeout : 0; - rpc_sleep_on(&xprt->pending, task, action); + set_bit(XPRT_WRITE_SPACE, &xprt->state); } EXPORT_SYMBOL_GPL(xprt_wait_for_buffer_space); +static bool +xprt_clear_write_space_locked(struct rpc_xprt *xprt) +{ + if (test_and_clear_bit(XPRT_WRITE_SPACE, &xprt->state)) { + __xprt_lock_write_next(xprt); + dprintk("RPC: write space: waking waiting task on " + "xprt %p\n", xprt); + return true; + } + return false; +} + /** * xprt_write_space - wake the task waiting for transport output buffer space * @xprt: transport with waiting tasks * * Can be called in a soft IRQ context, so xprt_write_space never sleeps. */ -void xprt_write_space(struct rpc_xprt *xprt) +bool xprt_write_space(struct rpc_xprt *xprt) { + bool ret; + + if (!test_bit(XPRT_WRITE_SPACE, &xprt->state)) + return false; spin_lock_bh(&xprt->transport_lock); - if (xprt->snd_task) { - dprintk("RPC: write space: waking waiting task on " - "xprt %p\n", xprt); - rpc_wake_up_queued_task_on_wq(xprtiod_workqueue, - &xprt->pending, xprt->snd_task); - } + ret = xprt_clear_write_space_locked(xprt); spin_unlock_bh(&xprt->transport_lock); + return ret; } EXPORT_SYMBOL_GPL(xprt_write_space); @@ -631,6 +661,7 @@ void xprt_disconnect_done(struct rpc_xprt *xprt) dprintk("RPC: disconnected transport %p\n", xprt); spin_lock_bh(&xprt->transport_lock); xprt_clear_connected(xprt); + xprt_clear_write_space_locked(xprt); xprt_wake_pending_tasks(xprt, -EAGAIN); spin_unlock_bh(&xprt->transport_lock); } @@ -654,6 +685,22 @@ void xprt_force_disconnect(struct rpc_xprt *xprt) } EXPORT_SYMBOL_GPL(xprt_force_disconnect); +static unsigned int +xprt_connect_cookie(struct rpc_xprt *xprt) +{ + return READ_ONCE(xprt->connect_cookie); +} + +static bool +xprt_request_retransmit_after_disconnect(struct rpc_task *task) +{ + struct rpc_rqst *req = task->tk_rqstp; + struct rpc_xprt *xprt = req->rq_xprt; + + return req->rq_connect_cookie != xprt_connect_cookie(xprt) || + !xprt_connected(xprt); +} + /** * xprt_conditional_disconnect - force a transport to disconnect * @xprt: transport to disconnect @@ -692,7 +739,7 @@ static void xprt_schedule_autodisconnect(struct rpc_xprt *xprt) __must_hold(&xprt->transport_lock) { - if (list_empty(&xprt->recv) && xprt_has_timer(xprt)) + if (RB_EMPTY_ROOT(&xprt->recv_queue) && xprt_has_timer(xprt)) mod_timer(&xprt->timer, xprt->last_used + xprt->idle_timeout); } @@ -702,7 +749,7 @@ xprt_init_autodisconnect(struct timer_list *t) struct rpc_xprt *xprt = from_timer(xprt, t, timer); spin_lock(&xprt->transport_lock); - if (!list_empty(&xprt->recv)) + if (!RB_EMPTY_ROOT(&xprt->recv_queue)) goto out_abort; /* Reset xprt->last_used to avoid connect/autodisconnect cycling */ xprt->last_used = jiffies; @@ -726,7 +773,6 @@ bool xprt_lock_connect(struct rpc_xprt *xprt, goto out; if (xprt->snd_task != task) goto out; - xprt_task_clear_bytes_sent(task); xprt->snd_task = cookie; ret = true; out: @@ -772,7 +818,6 @@ void xprt_connect(struct rpc_task *task) xprt->ops->close(xprt); if (!xprt_connected(xprt)) { - task->tk_rqstp->rq_bytes_sent = 0; task->tk_timeout = task->tk_rqstp->rq_timeout; task->tk_rqstp->rq_connect_cookie = xprt->connect_cookie; rpc_sleep_on(&xprt->pending, task, xprt_connect_status); @@ -789,17 +834,11 @@ void xprt_connect(struct rpc_task *task) static void xprt_connect_status(struct rpc_task *task) { - struct rpc_xprt *xprt = task->tk_rqstp->rq_xprt; - - if (task->tk_status == 0) { - xprt->stat.connect_count++; - xprt->stat.connect_time += (long)jiffies - xprt->stat.connect_start; + switch (task->tk_status) { + case 0: dprintk("RPC: %5u xprt_connect_status: connection established\n", task->tk_pid); - return; - } - - switch (task->tk_status) { + break; case -ECONNREFUSED: case -ECONNRESET: case -ECONNABORTED: @@ -816,28 +855,97 @@ static void xprt_connect_status(struct rpc_task *task) default: dprintk("RPC: %5u xprt_connect_status: error %d connecting to " "server %s\n", task->tk_pid, -task->tk_status, - xprt->servername); + task->tk_rqstp->rq_xprt->servername); task->tk_status = -EIO; } } +enum xprt_xid_rb_cmp { + XID_RB_EQUAL, + XID_RB_LEFT, + XID_RB_RIGHT, +}; +static enum xprt_xid_rb_cmp +xprt_xid_cmp(__be32 xid1, __be32 xid2) +{ + if (xid1 == xid2) + return XID_RB_EQUAL; + if ((__force u32)xid1 < (__force u32)xid2) + return XID_RB_LEFT; + return XID_RB_RIGHT; +} + +static struct rpc_rqst * +xprt_request_rb_find(struct rpc_xprt *xprt, __be32 xid) +{ + struct rb_node *n = xprt->recv_queue.rb_node; + struct rpc_rqst *req; + + while (n != NULL) { + req = rb_entry(n, struct rpc_rqst, rq_recv); + switch (xprt_xid_cmp(xid, req->rq_xid)) { + case XID_RB_LEFT: + n = n->rb_left; + break; + case XID_RB_RIGHT: + n = n->rb_right; + break; + case XID_RB_EQUAL: + return req; + } + } + return NULL; +} + +static void +xprt_request_rb_insert(struct rpc_xprt *xprt, struct rpc_rqst *new) +{ + struct rb_node **p = &xprt->recv_queue.rb_node; + struct rb_node *n = NULL; + struct rpc_rqst *req; + + while (*p != NULL) { + n = *p; + req = rb_entry(n, struct rpc_rqst, rq_recv); + switch(xprt_xid_cmp(new->rq_xid, req->rq_xid)) { + case XID_RB_LEFT: + p = &n->rb_left; + break; + case XID_RB_RIGHT: + p = &n->rb_right; + break; + case XID_RB_EQUAL: + WARN_ON_ONCE(new != req); + return; + } + } + rb_link_node(&new->rq_recv, n, p); + rb_insert_color(&new->rq_recv, &xprt->recv_queue); +} + +static void +xprt_request_rb_remove(struct rpc_xprt *xprt, struct rpc_rqst *req) +{ + rb_erase(&req->rq_recv, &xprt->recv_queue); +} + /** * xprt_lookup_rqst - find an RPC request corresponding to an XID * @xprt: transport on which the original request was transmitted * @xid: RPC XID of incoming reply * - * Caller holds xprt->recv_lock. + * Caller holds xprt->queue_lock. */ struct rpc_rqst *xprt_lookup_rqst(struct rpc_xprt *xprt, __be32 xid) { struct rpc_rqst *entry; - list_for_each_entry(entry, &xprt->recv, rq_list) - if (entry->rq_xid == xid) { - trace_xprt_lookup_rqst(xprt, xid, 0); - entry->rq_rtt = ktime_sub(ktime_get(), entry->rq_xtime); - return entry; - } + entry = xprt_request_rb_find(xprt, xid); + if (entry != NULL) { + trace_xprt_lookup_rqst(xprt, xid, 0); + entry->rq_rtt = ktime_sub(ktime_get(), entry->rq_xtime); + return entry; + } dprintk("RPC: xprt_lookup_rqst did not find xid %08x\n", ntohl(xid)); @@ -847,16 +955,22 @@ struct rpc_rqst *xprt_lookup_rqst(struct rpc_xprt *xprt, __be32 xid) } EXPORT_SYMBOL_GPL(xprt_lookup_rqst); +static bool +xprt_is_pinned_rqst(struct rpc_rqst *req) +{ + return atomic_read(&req->rq_pin) != 0; +} + /** * xprt_pin_rqst - Pin a request on the transport receive list * @req: Request to pin * * Caller must ensure this is atomic with the call to xprt_lookup_rqst() - * so should be holding the xprt transport lock. + * so should be holding the xprt receive lock. */ void xprt_pin_rqst(struct rpc_rqst *req) { - set_bit(RPC_TASK_MSG_RECV, &req->rq_task->tk_runstate); + atomic_inc(&req->rq_pin); } EXPORT_SYMBOL_GPL(xprt_pin_rqst); @@ -864,38 +978,87 @@ EXPORT_SYMBOL_GPL(xprt_pin_rqst); * xprt_unpin_rqst - Unpin a request on the transport receive list * @req: Request to pin * - * Caller should be holding the xprt transport lock. + * Caller should be holding the xprt receive lock. */ void xprt_unpin_rqst(struct rpc_rqst *req) { - struct rpc_task *task = req->rq_task; - - clear_bit(RPC_TASK_MSG_RECV, &task->tk_runstate); - if (test_bit(RPC_TASK_MSG_RECV_WAIT, &task->tk_runstate)) - wake_up_bit(&task->tk_runstate, RPC_TASK_MSG_RECV); + if (!test_bit(RPC_TASK_MSG_PIN_WAIT, &req->rq_task->tk_runstate)) { + atomic_dec(&req->rq_pin); + return; + } + if (atomic_dec_and_test(&req->rq_pin)) + wake_up_var(&req->rq_pin); } EXPORT_SYMBOL_GPL(xprt_unpin_rqst); static void xprt_wait_on_pinned_rqst(struct rpc_rqst *req) -__must_hold(&req->rq_xprt->recv_lock) { - struct rpc_task *task = req->rq_task; + wait_var_event(&req->rq_pin, !xprt_is_pinned_rqst(req)); +} - if (task && test_bit(RPC_TASK_MSG_RECV, &task->tk_runstate)) { - spin_unlock(&req->rq_xprt->recv_lock); - set_bit(RPC_TASK_MSG_RECV_WAIT, &task->tk_runstate); - wait_on_bit(&task->tk_runstate, RPC_TASK_MSG_RECV, - TASK_UNINTERRUPTIBLE); - clear_bit(RPC_TASK_MSG_RECV_WAIT, &task->tk_runstate); - spin_lock(&req->rq_xprt->recv_lock); - } +static bool +xprt_request_data_received(struct rpc_task *task) +{ + return !test_bit(RPC_TASK_NEED_RECV, &task->tk_runstate) && + READ_ONCE(task->tk_rqstp->rq_reply_bytes_recvd) != 0; +} + +static bool +xprt_request_need_enqueue_receive(struct rpc_task *task, struct rpc_rqst *req) +{ + return !test_bit(RPC_TASK_NEED_RECV, &task->tk_runstate) && + READ_ONCE(task->tk_rqstp->rq_reply_bytes_recvd) == 0; +} + +/** + * xprt_request_enqueue_receive - Add an request to the receive queue + * @task: RPC task + * + */ +void +xprt_request_enqueue_receive(struct rpc_task *task) +{ + struct rpc_rqst *req = task->tk_rqstp; + struct rpc_xprt *xprt = req->rq_xprt; + + if (!xprt_request_need_enqueue_receive(task, req)) + return; + spin_lock(&xprt->queue_lock); + + /* Update the softirq receive buffer */ + memcpy(&req->rq_private_buf, &req->rq_rcv_buf, + sizeof(req->rq_private_buf)); + + /* Add request to the receive list */ + xprt_request_rb_insert(xprt, req); + set_bit(RPC_TASK_NEED_RECV, &task->tk_runstate); + spin_unlock(&xprt->queue_lock); + + xprt_reset_majortimeo(req); + /* Turn off autodisconnect */ + del_singleshot_timer_sync(&xprt->timer); +} + +/** + * xprt_request_dequeue_receive_locked - Remove a request from the receive queue + * @task: RPC task + * + * Caller must hold xprt->queue_lock. + */ +static void +xprt_request_dequeue_receive_locked(struct rpc_task *task) +{ + struct rpc_rqst *req = task->tk_rqstp; + + if (test_and_clear_bit(RPC_TASK_NEED_RECV, &task->tk_runstate)) + xprt_request_rb_remove(req->rq_xprt, req); } /** * xprt_update_rtt - Update RPC RTT statistics * @task: RPC request that recently completed * - * Caller holds xprt->recv_lock. + * Caller holds xprt->queue_lock. */ void xprt_update_rtt(struct rpc_task *task) { @@ -917,7 +1080,7 @@ EXPORT_SYMBOL_GPL(xprt_update_rtt); * @task: RPC request that recently completed * @copied: actual number of bytes received from the transport * - * Caller holds xprt->recv_lock. + * Caller holds xprt->queue_lock. */ void xprt_complete_rqst(struct rpc_task *task, int copied) { @@ -930,12 +1093,12 @@ void xprt_complete_rqst(struct rpc_task *task, int copied) xprt->stat.recvs++; - list_del_init(&req->rq_list); req->rq_private_buf.len = copied; /* Ensure all writes are done before we update */ /* req->rq_reply_bytes_recvd */ smp_wmb(); req->rq_reply_bytes_recvd = copied; + xprt_request_dequeue_receive_locked(task); rpc_wake_up_queued_task(&xprt->pending, task); } EXPORT_SYMBOL_GPL(xprt_complete_rqst); @@ -957,6 +1120,172 @@ static void xprt_timer(struct rpc_task *task) } /** + * xprt_request_wait_receive - wait for the reply to an RPC request + * @task: RPC task about to send a request + * + */ +void xprt_request_wait_receive(struct rpc_task *task) +{ + struct rpc_rqst *req = task->tk_rqstp; + struct rpc_xprt *xprt = req->rq_xprt; + + if (!test_bit(RPC_TASK_NEED_RECV, &task->tk_runstate)) + return; + /* + * Sleep on the pending queue if we're expecting a reply. + * The spinlock ensures atomicity between the test of + * req->rq_reply_bytes_recvd, and the call to rpc_sleep_on(). + */ + spin_lock(&xprt->queue_lock); + if (test_bit(RPC_TASK_NEED_RECV, &task->tk_runstate)) { + xprt->ops->set_retrans_timeout(task); + rpc_sleep_on(&xprt->pending, task, xprt_timer); + /* + * Send an extra queue wakeup call if the + * connection was dropped in case the call to + * rpc_sleep_on() raced. + */ + if (xprt_request_retransmit_after_disconnect(task)) + rpc_wake_up_queued_task_set_status(&xprt->pending, + task, -ENOTCONN); + } + spin_unlock(&xprt->queue_lock); +} + +static bool +xprt_request_need_enqueue_transmit(struct rpc_task *task, struct rpc_rqst *req) +{ + return !test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate); +} + +/** + * xprt_request_enqueue_transmit - queue a task for transmission + * @task: pointer to rpc_task + * + * Add a task to the transmission queue. + */ +void +xprt_request_enqueue_transmit(struct rpc_task *task) +{ + struct rpc_rqst *pos, *req = task->tk_rqstp; + struct rpc_xprt *xprt = req->rq_xprt; + + if (xprt_request_need_enqueue_transmit(task, req)) { + spin_lock(&xprt->queue_lock); + /* + * Requests that carry congestion control credits are added + * to the head of the list to avoid starvation issues. + */ + if (req->rq_cong) { + xprt_clear_congestion_window_wait(xprt); + list_for_each_entry(pos, &xprt->xmit_queue, rq_xmit) { + if (pos->rq_cong) + continue; + /* Note: req is added _before_ pos */ + list_add_tail(&req->rq_xmit, &pos->rq_xmit); + INIT_LIST_HEAD(&req->rq_xmit2); + goto out; + } + } else if (RPC_IS_SWAPPER(task)) { + list_for_each_entry(pos, &xprt->xmit_queue, rq_xmit) { + if (pos->rq_cong || pos->rq_bytes_sent) + continue; + if (RPC_IS_SWAPPER(pos->rq_task)) + continue; + /* Note: req is added _before_ pos */ + list_add_tail(&req->rq_xmit, &pos->rq_xmit); + INIT_LIST_HEAD(&req->rq_xmit2); + goto out; + } + } else { + list_for_each_entry(pos, &xprt->xmit_queue, rq_xmit) { + if (pos->rq_task->tk_owner != task->tk_owner) + continue; + list_add_tail(&req->rq_xmit2, &pos->rq_xmit2); + INIT_LIST_HEAD(&req->rq_xmit); + goto out; + } + } + list_add_tail(&req->rq_xmit, &xprt->xmit_queue); + INIT_LIST_HEAD(&req->rq_xmit2); +out: + set_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate); + spin_unlock(&xprt->queue_lock); + } +} + +/** + * xprt_request_dequeue_transmit_locked - remove a task from the transmission queue + * @task: pointer to rpc_task + * + * Remove a task from the transmission queue + * Caller must hold xprt->queue_lock + */ +static void +xprt_request_dequeue_transmit_locked(struct rpc_task *task) +{ + struct rpc_rqst *req = task->tk_rqstp; + + if (!test_and_clear_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate)) + return; + if (!list_empty(&req->rq_xmit)) { + list_del(&req->rq_xmit); + if (!list_empty(&req->rq_xmit2)) { + struct rpc_rqst *next = list_first_entry(&req->rq_xmit2, + struct rpc_rqst, rq_xmit2); + list_del(&req->rq_xmit2); + list_add_tail(&next->rq_xmit, &next->rq_xprt->xmit_queue); + } + } else + list_del(&req->rq_xmit2); +} + +/** + * xprt_request_dequeue_transmit - remove a task from the transmission queue + * @task: pointer to rpc_task + * + * Remove a task from the transmission queue + */ +static void +xprt_request_dequeue_transmit(struct rpc_task *task) +{ + struct rpc_rqst *req = task->tk_rqstp; + struct rpc_xprt *xprt = req->rq_xprt; + + spin_lock(&xprt->queue_lock); + xprt_request_dequeue_transmit_locked(task); + spin_unlock(&xprt->queue_lock); +} + +/** + * xprt_request_prepare - prepare an encoded request for transport + * @req: pointer to rpc_rqst + * + * Calls into the transport layer to do whatever is needed to prepare + * the request for transmission or receive. + */ +void +xprt_request_prepare(struct rpc_rqst *req) +{ + struct rpc_xprt *xprt = req->rq_xprt; + + if (xprt->ops->prepare_request) + xprt->ops->prepare_request(req); +} + +/** + * xprt_request_need_retransmit - Test if a task needs retransmission + * @task: pointer to rpc_task + * + * Test for whether a connection breakage requires the task to retransmit + */ +bool +xprt_request_need_retransmit(struct rpc_task *task) +{ + return xprt_request_retransmit_after_disconnect(task); +} + +/** * xprt_prepare_transmit - reserve the transport before sending a request * @task: RPC task about to send a request * @@ -965,32 +1294,18 @@ bool xprt_prepare_transmit(struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; - bool ret = false; dprintk("RPC: %5u xprt_prepare_transmit\n", task->tk_pid); - spin_lock_bh(&xprt->transport_lock); - if (!req->rq_bytes_sent) { - if (req->rq_reply_bytes_recvd) { - task->tk_status = req->rq_reply_bytes_recvd; - goto out_unlock; - } - if ((task->tk_flags & RPC_TASK_NO_RETRANS_TIMEOUT) - && xprt_connected(xprt) - && req->rq_connect_cookie == xprt->connect_cookie) { - xprt->ops->set_retrans_timeout(task); - rpc_sleep_on(&xprt->pending, task, xprt_timer); - goto out_unlock; - } - } - if (!xprt->ops->reserve_xprt(xprt, task)) { - task->tk_status = -EAGAIN; - goto out_unlock; + if (!xprt_lock_write(xprt, task)) { + /* Race breaker: someone may have transmitted us */ + if (!test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate)) + rpc_wake_up_queued_task_set_status(&xprt->sending, + task, 0); + return false; + } - ret = true; -out_unlock: - spin_unlock_bh(&xprt->transport_lock); - return ret; + return true; } void xprt_end_transmit(struct rpc_task *task) @@ -999,54 +1314,62 @@ void xprt_end_transmit(struct rpc_task *task) } /** - * xprt_transmit - send an RPC request on a transport - * @task: controlling RPC task + * xprt_request_transmit - send an RPC request on a transport + * @req: pointer to request to transmit + * @snd_task: RPC task that owns the transport lock * - * We have to copy the iovec because sendmsg fiddles with its contents. + * This performs the transmission of a single request. + * Note that if the request is not the same as snd_task, then it + * does need to be pinned. + * Returns '0' on success. */ -void xprt_transmit(struct rpc_task *task) +static int +xprt_request_transmit(struct rpc_rqst *req, struct rpc_task *snd_task) { - struct rpc_rqst *req = task->tk_rqstp; - struct rpc_xprt *xprt = req->rq_xprt; + struct rpc_xprt *xprt = req->rq_xprt; + struct rpc_task *task = req->rq_task; unsigned int connect_cookie; + int is_retrans = RPC_WAS_SENT(task); int status; dprintk("RPC: %5u xprt_transmit(%u)\n", task->tk_pid, req->rq_slen); - if (!req->rq_reply_bytes_recvd) { - if (list_empty(&req->rq_list) && rpc_reply_expected(task)) { - /* - * Add to the list only if we're expecting a reply - */ - /* Update the softirq receive buffer */ - memcpy(&req->rq_private_buf, &req->rq_rcv_buf, - sizeof(req->rq_private_buf)); - /* Add request to the receive list */ - spin_lock(&xprt->recv_lock); - list_add_tail(&req->rq_list, &xprt->recv); - spin_unlock(&xprt->recv_lock); - xprt_reset_majortimeo(req); - /* Turn off autodisconnect */ - del_singleshot_timer_sync(&xprt->timer); + if (!req->rq_bytes_sent) { + if (xprt_request_data_received(task)) { + status = 0; + goto out_dequeue; } - } else if (!req->rq_bytes_sent) - return; + /* Verify that our message lies in the RPCSEC_GSS window */ + if (rpcauth_xmit_need_reencode(task)) { + status = -EBADMSG; + goto out_dequeue; + } + } + + /* + * Update req->rq_ntrans before transmitting to avoid races with + * xprt_update_rtt(), which needs to know that it is recording a + * reply to the first transmission. + */ + req->rq_ntrans++; connect_cookie = xprt->connect_cookie; - status = xprt->ops->send_request(task); + status = xprt->ops->send_request(req); trace_xprt_transmit(xprt, req->rq_xid, status); if (status != 0) { - task->tk_status = status; - return; + req->rq_ntrans--; + return status; } + + if (is_retrans) + task->tk_client->cl_stats->rpcretrans++; + xprt_inject_disconnect(xprt); dprintk("RPC: %5u xmit complete\n", task->tk_pid); task->tk_flags |= RPC_TASK_SENT; spin_lock_bh(&xprt->transport_lock); - xprt->ops->set_retrans_timeout(task); - xprt->stat.sends++; xprt->stat.req_u += xprt->stat.sends - xprt->stat.recvs; xprt->stat.bklog_u += xprt->backlog.qlen; @@ -1055,25 +1378,49 @@ void xprt_transmit(struct rpc_task *task) spin_unlock_bh(&xprt->transport_lock); req->rq_connect_cookie = connect_cookie; - if (rpc_reply_expected(task) && !READ_ONCE(req->rq_reply_bytes_recvd)) { - /* - * Sleep on the pending queue if we're expecting a reply. - * The spinlock ensures atomicity between the test of - * req->rq_reply_bytes_recvd, and the call to rpc_sleep_on(). - */ - spin_lock(&xprt->recv_lock); - if (!req->rq_reply_bytes_recvd) { - rpc_sleep_on(&xprt->pending, task, xprt_timer); - /* - * Send an extra queue wakeup call if the - * connection was dropped in case the call to - * rpc_sleep_on() raced. - */ - if (!xprt_connected(xprt)) - xprt_wake_pending_tasks(xprt, -ENOTCONN); - } - spin_unlock(&xprt->recv_lock); +out_dequeue: + xprt_request_dequeue_transmit(task); + rpc_wake_up_queued_task_set_status(&xprt->sending, task, status); + return status; +} + +/** + * xprt_transmit - send an RPC request on a transport + * @task: controlling RPC task + * + * Attempts to drain the transmit queue. On exit, either the transport + * signalled an error that needs to be handled before transmission can + * resume, or @task finished transmitting, and detected that it already + * received a reply. + */ +void +xprt_transmit(struct rpc_task *task) +{ + struct rpc_rqst *next, *req = task->tk_rqstp; + struct rpc_xprt *xprt = req->rq_xprt; + int status; + + spin_lock(&xprt->queue_lock); + while (!list_empty(&xprt->xmit_queue)) { + next = list_first_entry(&xprt->xmit_queue, + struct rpc_rqst, rq_xmit); + xprt_pin_rqst(next); + spin_unlock(&xprt->queue_lock); + status = xprt_request_transmit(next, task); + if (status == -EBADMSG && next != req) + status = 0; + cond_resched(); + spin_lock(&xprt->queue_lock); + xprt_unpin_rqst(next); + if (status == 0) { + if (!xprt_request_data_received(task) || + test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate)) + continue; + } else if (test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate)) + task->tk_status = status; + break; } + spin_unlock(&xprt->queue_lock); } static void xprt_add_backlog(struct rpc_xprt *xprt, struct rpc_task *task) @@ -1170,20 +1517,6 @@ out_init_req: } EXPORT_SYMBOL_GPL(xprt_alloc_slot); -void xprt_lock_and_alloc_slot(struct rpc_xprt *xprt, struct rpc_task *task) -{ - /* Note: grabbing the xprt_lock_write() ensures that we throttle - * new slot allocation if the transport is congested (i.e. when - * reconnecting a stream transport or when out of socket write - * buffer space). - */ - if (xprt_lock_write(xprt, task)) { - xprt_alloc_slot(xprt, task); - xprt_release_write(xprt, task); - } -} -EXPORT_SYMBOL_GPL(xprt_lock_and_alloc_slot); - void xprt_free_slot(struct rpc_xprt *xprt, struct rpc_rqst *req) { spin_lock(&xprt->reserve_lock); @@ -1250,6 +1583,60 @@ void xprt_free(struct rpc_xprt *xprt) } EXPORT_SYMBOL_GPL(xprt_free); +static void +xprt_init_connect_cookie(struct rpc_rqst *req, struct rpc_xprt *xprt) +{ + req->rq_connect_cookie = xprt_connect_cookie(xprt) - 1; +} + +static __be32 +xprt_alloc_xid(struct rpc_xprt *xprt) +{ + __be32 xid; + + spin_lock(&xprt->reserve_lock); + xid = (__force __be32)xprt->xid++; + spin_unlock(&xprt->reserve_lock); + return xid; +} + +static void +xprt_init_xid(struct rpc_xprt *xprt) +{ + xprt->xid = prandom_u32(); +} + +static void +xprt_request_init(struct rpc_task *task) +{ + struct rpc_xprt *xprt = task->tk_xprt; + struct rpc_rqst *req = task->tk_rqstp; + + req->rq_timeout = task->tk_client->cl_timeout->to_initval; + req->rq_task = task; + req->rq_xprt = xprt; + req->rq_buffer = NULL; + req->rq_xid = xprt_alloc_xid(xprt); + xprt_init_connect_cookie(req, xprt); + req->rq_bytes_sent = 0; + req->rq_snd_buf.len = 0; + req->rq_snd_buf.buflen = 0; + req->rq_rcv_buf.len = 0; + req->rq_rcv_buf.buflen = 0; + req->rq_release_snd_buf = NULL; + xprt_reset_majortimeo(req); + dprintk("RPC: %5u reserved req %p xid %08x\n", task->tk_pid, + req, ntohl(req->rq_xid)); +} + +static void +xprt_do_reserve(struct rpc_xprt *xprt, struct rpc_task *task) +{ + xprt->ops->alloc_slot(xprt, task); + if (task->tk_rqstp != NULL) + xprt_request_init(task); +} + /** * xprt_reserve - allocate an RPC request slot * @task: RPC task requesting a slot allocation @@ -1269,7 +1656,7 @@ void xprt_reserve(struct rpc_task *task) task->tk_timeout = 0; task->tk_status = -EAGAIN; if (!xprt_throttle_congested(xprt, task)) - xprt->ops->alloc_slot(xprt, task); + xprt_do_reserve(xprt, task); } /** @@ -1291,45 +1678,29 @@ void xprt_retry_reserve(struct rpc_task *task) task->tk_timeout = 0; task->tk_status = -EAGAIN; - xprt->ops->alloc_slot(xprt, task); -} - -static inline __be32 xprt_alloc_xid(struct rpc_xprt *xprt) -{ - __be32 xid; - - spin_lock(&xprt->reserve_lock); - xid = (__force __be32)xprt->xid++; - spin_unlock(&xprt->reserve_lock); - return xid; + xprt_do_reserve(xprt, task); } -static inline void xprt_init_xid(struct rpc_xprt *xprt) -{ - xprt->xid = prandom_u32(); -} - -void xprt_request_init(struct rpc_task *task) +static void +xprt_request_dequeue_all(struct rpc_task *task, struct rpc_rqst *req) { - struct rpc_xprt *xprt = task->tk_xprt; - struct rpc_rqst *req = task->tk_rqstp; + struct rpc_xprt *xprt = req->rq_xprt; - INIT_LIST_HEAD(&req->rq_list); - req->rq_timeout = task->tk_client->cl_timeout->to_initval; - req->rq_task = task; - req->rq_xprt = xprt; - req->rq_buffer = NULL; - req->rq_xid = xprt_alloc_xid(xprt); - req->rq_connect_cookie = xprt->connect_cookie - 1; - req->rq_bytes_sent = 0; - req->rq_snd_buf.len = 0; - req->rq_snd_buf.buflen = 0; - req->rq_rcv_buf.len = 0; - req->rq_rcv_buf.buflen = 0; - req->rq_release_snd_buf = NULL; - xprt_reset_majortimeo(req); - dprintk("RPC: %5u reserved req %p xid %08x\n", task->tk_pid, - req, ntohl(req->rq_xid)); + if (test_bit(RPC_TASK_NEED_XMIT, &task->tk_runstate) || + test_bit(RPC_TASK_NEED_RECV, &task->tk_runstate) || + xprt_is_pinned_rqst(req)) { + spin_lock(&xprt->queue_lock); + xprt_request_dequeue_transmit_locked(task); + xprt_request_dequeue_receive_locked(task); + while (xprt_is_pinned_rqst(req)) { + set_bit(RPC_TASK_MSG_PIN_WAIT, &task->tk_runstate); + spin_unlock(&xprt->queue_lock); + xprt_wait_on_pinned_rqst(req); + spin_lock(&xprt->queue_lock); + clear_bit(RPC_TASK_MSG_PIN_WAIT, &task->tk_runstate); + } + spin_unlock(&xprt->queue_lock); + } } /** @@ -1345,8 +1716,7 @@ void xprt_release(struct rpc_task *task) if (req == NULL) { if (task->tk_client) { xprt = task->tk_xprt; - if (xprt->snd_task == task) - xprt_release_write(xprt, task); + xprt_release_write(xprt, task); } return; } @@ -1356,12 +1726,7 @@ void xprt_release(struct rpc_task *task) task->tk_ops->rpc_count_stats(task, task->tk_calldata); else if (task->tk_client) rpc_count_iostats(task, task->tk_client->cl_metrics); - spin_lock(&xprt->recv_lock); - if (!list_empty(&req->rq_list)) { - list_del_init(&req->rq_list); - xprt_wait_on_pinned_rqst(req); - } - spin_unlock(&xprt->recv_lock); + xprt_request_dequeue_all(task, req); spin_lock_bh(&xprt->transport_lock); xprt->ops->release_xprt(xprt, task); if (xprt->ops->release_request) @@ -1372,6 +1737,7 @@ void xprt_release(struct rpc_task *task) if (req->rq_buffer) xprt->ops->buf_free(task); xprt_inject_disconnect(xprt); + xdr_free_bvec(&req->rq_rcv_buf); if (req->rq_cred != NULL) put_rpccred(req->rq_cred); task->tk_rqstp = NULL; @@ -1385,16 +1751,36 @@ void xprt_release(struct rpc_task *task) xprt_free_bc_request(req); } +#ifdef CONFIG_SUNRPC_BACKCHANNEL +void +xprt_init_bc_request(struct rpc_rqst *req, struct rpc_task *task) +{ + struct xdr_buf *xbufp = &req->rq_snd_buf; + + task->tk_rqstp = req; + req->rq_task = task; + xprt_init_connect_cookie(req, req->rq_xprt); + /* + * Set up the xdr_buf length. + * This also indicates that the buffer is XDR encoded already. + */ + xbufp->len = xbufp->head[0].iov_len + xbufp->page_len + + xbufp->tail[0].iov_len; + req->rq_bytes_sent = 0; +} +#endif + static void xprt_init(struct rpc_xprt *xprt, struct net *net) { kref_init(&xprt->kref); spin_lock_init(&xprt->transport_lock); spin_lock_init(&xprt->reserve_lock); - spin_lock_init(&xprt->recv_lock); + spin_lock_init(&xprt->queue_lock); INIT_LIST_HEAD(&xprt->free); - INIT_LIST_HEAD(&xprt->recv); + xprt->recv_queue = RB_ROOT; + INIT_LIST_HEAD(&xprt->xmit_queue); #if defined(CONFIG_SUNRPC_BACKCHANNEL) spin_lock_init(&xprt->bc_pa_lock); INIT_LIST_HEAD(&xprt->bc_pa_list); @@ -1407,7 +1793,7 @@ static void xprt_init(struct rpc_xprt *xprt, struct net *net) rpc_init_wait_queue(&xprt->binding, "xprt_binding"); rpc_init_wait_queue(&xprt->pending, "xprt_pending"); - rpc_init_priority_wait_queue(&xprt->sending, "xprt_sending"); + rpc_init_wait_queue(&xprt->sending, "xprt_sending"); rpc_init_priority_wait_queue(&xprt->backlog, "xprt_backlog"); xprt_init_xid(xprt); diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index 90adeff4c06b..e5b367a3e517 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -51,12 +51,11 @@ static int rpcrdma_bc_setup_reqs(struct rpcrdma_xprt *r_xprt, rqst = &req->rl_slot; rqst->rq_xprt = xprt; - INIT_LIST_HEAD(&rqst->rq_list); INIT_LIST_HEAD(&rqst->rq_bc_list); __set_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state); - spin_lock_bh(&xprt->bc_pa_lock); + spin_lock(&xprt->bc_pa_lock); list_add(&rqst->rq_bc_pa_list, &xprt->bc_pa_list); - spin_unlock_bh(&xprt->bc_pa_lock); + spin_unlock(&xprt->bc_pa_lock); size = r_xprt->rx_data.inline_rsize; rb = rpcrdma_alloc_regbuf(size, DMA_TO_DEVICE, GFP_KERNEL); @@ -201,6 +200,9 @@ int xprt_rdma_bc_send_reply(struct rpc_rqst *rqst) if (!xprt_connected(rqst->rq_xprt)) goto drop_connection; + if (!xprt_request_get_cong(rqst->rq_xprt, rqst)) + return -EBADSLT; + rc = rpcrdma_bc_marshal_reply(rqst); if (rc < 0) goto failed_marshal; @@ -228,16 +230,16 @@ void xprt_rdma_bc_destroy(struct rpc_xprt *xprt, unsigned int reqs) struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); struct rpc_rqst *rqst, *tmp; - spin_lock_bh(&xprt->bc_pa_lock); + spin_lock(&xprt->bc_pa_lock); list_for_each_entry_safe(rqst, tmp, &xprt->bc_pa_list, rq_bc_pa_list) { list_del(&rqst->rq_bc_pa_list); - spin_unlock_bh(&xprt->bc_pa_lock); + spin_unlock(&xprt->bc_pa_lock); rpcrdma_bc_free_rqst(r_xprt, rqst); - spin_lock_bh(&xprt->bc_pa_lock); + spin_lock(&xprt->bc_pa_lock); } - spin_unlock_bh(&xprt->bc_pa_lock); + spin_unlock(&xprt->bc_pa_lock); } /** @@ -255,9 +257,9 @@ void xprt_rdma_bc_free_rqst(struct rpc_rqst *rqst) rpcrdma_recv_buffer_put(req->rl_reply); req->rl_reply = NULL; - spin_lock_bh(&xprt->bc_pa_lock); + spin_lock(&xprt->bc_pa_lock); list_add_tail(&rqst->rq_bc_pa_list, &xprt->bc_pa_list); - spin_unlock_bh(&xprt->bc_pa_lock); + spin_unlock(&xprt->bc_pa_lock); } /** diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c index 0f7c465d9a5a..7f5632cd5a48 100644 --- a/net/sunrpc/xprtrdma/fmr_ops.c +++ b/net/sunrpc/xprtrdma/fmr_ops.c @@ -49,46 +49,7 @@ fmr_is_supported(struct rpcrdma_ia *ia) return true; } -static int -fmr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr) -{ - static struct ib_fmr_attr fmr_attr = { - .max_pages = RPCRDMA_MAX_FMR_SGES, - .max_maps = 1, - .page_shift = PAGE_SHIFT - }; - - mr->fmr.fm_physaddrs = kcalloc(RPCRDMA_MAX_FMR_SGES, - sizeof(u64), GFP_KERNEL); - if (!mr->fmr.fm_physaddrs) - goto out_free; - - mr->mr_sg = kcalloc(RPCRDMA_MAX_FMR_SGES, - sizeof(*mr->mr_sg), GFP_KERNEL); - if (!mr->mr_sg) - goto out_free; - - sg_init_table(mr->mr_sg, RPCRDMA_MAX_FMR_SGES); - - mr->fmr.fm_mr = ib_alloc_fmr(ia->ri_pd, RPCRDMA_FMR_ACCESS_FLAGS, - &fmr_attr); - if (IS_ERR(mr->fmr.fm_mr)) - goto out_fmr_err; - - INIT_LIST_HEAD(&mr->mr_list); - return 0; - -out_fmr_err: - dprintk("RPC: %s: ib_alloc_fmr returned %ld\n", __func__, - PTR_ERR(mr->fmr.fm_mr)); - -out_free: - kfree(mr->mr_sg); - kfree(mr->fmr.fm_physaddrs); - return -ENOMEM; -} - -static int +static void __fmr_unmap(struct rpcrdma_mr *mr) { LIST_HEAD(l); @@ -97,13 +58,16 @@ __fmr_unmap(struct rpcrdma_mr *mr) list_add(&mr->fmr.fm_mr->list, &l); rc = ib_unmap_fmr(&l); list_del(&mr->fmr.fm_mr->list); - return rc; + if (rc) + pr_err("rpcrdma: final ib_unmap_fmr for %p failed %i\n", + mr, rc); } +/* Release an MR. + */ static void fmr_op_release_mr(struct rpcrdma_mr *mr) { - LIST_HEAD(unmap_list); int rc; kfree(mr->fmr.fm_physaddrs); @@ -112,10 +76,7 @@ fmr_op_release_mr(struct rpcrdma_mr *mr) /* In case this one was left mapped, try to unmap it * to prevent dealloc_fmr from failing with EBUSY */ - rc = __fmr_unmap(mr); - if (rc) - pr_err("rpcrdma: final ib_unmap_fmr for %p failed %i\n", - mr, rc); + __fmr_unmap(mr); rc = ib_dealloc_fmr(mr->fmr.fm_mr); if (rc) @@ -125,40 +86,68 @@ fmr_op_release_mr(struct rpcrdma_mr *mr) kfree(mr); } -/* Reset of a single FMR. +/* MRs are dynamically allocated, so simply clean up and release the MR. + * A replacement MR will subsequently be allocated on demand. */ static void -fmr_op_recover_mr(struct rpcrdma_mr *mr) +fmr_mr_recycle_worker(struct work_struct *work) { + struct rpcrdma_mr *mr = container_of(work, struct rpcrdma_mr, mr_recycle); struct rpcrdma_xprt *r_xprt = mr->mr_xprt; - int rc; - /* ORDER: invalidate first */ - rc = __fmr_unmap(mr); - if (rc) - goto out_release; - - /* ORDER: then DMA unmap */ - rpcrdma_mr_unmap_and_put(mr); + trace_xprtrdma_mr_recycle(mr); - r_xprt->rx_stats.mrs_recovered++; - return; - -out_release: - pr_err("rpcrdma: FMR reset failed (%d), %p released\n", rc, mr); - r_xprt->rx_stats.mrs_orphaned++; - - trace_xprtrdma_dma_unmap(mr); + trace_xprtrdma_mr_unmap(mr); ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, mr->mr_sg, mr->mr_nents, mr->mr_dir); spin_lock(&r_xprt->rx_buf.rb_mrlock); list_del(&mr->mr_all); + r_xprt->rx_stats.mrs_recycled++; spin_unlock(&r_xprt->rx_buf.rb_mrlock); - fmr_op_release_mr(mr); } +static int +fmr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr) +{ + static struct ib_fmr_attr fmr_attr = { + .max_pages = RPCRDMA_MAX_FMR_SGES, + .max_maps = 1, + .page_shift = PAGE_SHIFT + }; + + mr->fmr.fm_physaddrs = kcalloc(RPCRDMA_MAX_FMR_SGES, + sizeof(u64), GFP_KERNEL); + if (!mr->fmr.fm_physaddrs) + goto out_free; + + mr->mr_sg = kcalloc(RPCRDMA_MAX_FMR_SGES, + sizeof(*mr->mr_sg), GFP_KERNEL); + if (!mr->mr_sg) + goto out_free; + + sg_init_table(mr->mr_sg, RPCRDMA_MAX_FMR_SGES); + + mr->fmr.fm_mr = ib_alloc_fmr(ia->ri_pd, RPCRDMA_FMR_ACCESS_FLAGS, + &fmr_attr); + if (IS_ERR(mr->fmr.fm_mr)) + goto out_fmr_err; + + INIT_LIST_HEAD(&mr->mr_list); + INIT_WORK(&mr->mr_recycle, fmr_mr_recycle_worker); + return 0; + +out_fmr_err: + dprintk("RPC: %s: ib_alloc_fmr returned %ld\n", __func__, + PTR_ERR(mr->fmr.fm_mr)); + +out_free: + kfree(mr->mr_sg); + kfree(mr->fmr.fm_physaddrs); + return -ENOMEM; +} + /* On success, sets: * ep->rep_attr.cap.max_send_wr * ep->rep_attr.cap.max_recv_wr @@ -187,6 +176,7 @@ fmr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, ia->ri_max_segs = max_t(unsigned int, 1, RPCRDMA_MAX_DATA_SEGS / RPCRDMA_MAX_FMR_SGES); + ia->ri_max_segs += 2; /* segments for head and tail buffers */ return 0; } @@ -244,7 +234,7 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, mr->mr_sg, i, mr->mr_dir); if (!mr->mr_nents) goto out_dmamap_err; - trace_xprtrdma_dma_map(mr); + trace_xprtrdma_mr_map(mr); for (i = 0, dma_pages = mr->fmr.fm_physaddrs; i < mr->mr_nents; i++) dma_pages[i] = sg_dma_address(&mr->mr_sg[i]); @@ -305,13 +295,13 @@ fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mrs) list_for_each_entry(mr, mrs, mr_list) { dprintk("RPC: %s: unmapping fmr %p\n", __func__, &mr->fmr); - trace_xprtrdma_localinv(mr); + trace_xprtrdma_mr_localinv(mr); list_add_tail(&mr->fmr.fm_mr->list, &unmap_list); } r_xprt->rx_stats.local_inv_needed++; rc = ib_unmap_fmr(&unmap_list); if (rc) - goto out_reset; + goto out_release; /* ORDER: Now DMA unmap all of the req's MRs, and return * them to the free MW list. @@ -324,13 +314,13 @@ fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mrs) return; -out_reset: +out_release: pr_err("rpcrdma: ib_unmap_fmr failed (%i)\n", rc); while (!list_empty(mrs)) { mr = rpcrdma_mr_pop(mrs); list_del(&mr->fmr.fm_mr->list); - fmr_op_recover_mr(mr); + rpcrdma_mr_recycle(mr); } } @@ -338,7 +328,6 @@ const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = { .ro_map = fmr_op_map, .ro_send = fmr_op_send, .ro_unmap_sync = fmr_op_unmap_sync, - .ro_recover_mr = fmr_op_recover_mr, .ro_open = fmr_op_open, .ro_maxpages = fmr_op_maxpages, .ro_init_mr = fmr_op_init_mr, diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index 1bb00dd6ccdb..fc6378cc0c1c 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -97,6 +97,44 @@ out_not_supported: return false; } +static void +frwr_op_release_mr(struct rpcrdma_mr *mr) +{ + int rc; + + rc = ib_dereg_mr(mr->frwr.fr_mr); + if (rc) + pr_err("rpcrdma: final ib_dereg_mr for %p returned %i\n", + mr, rc); + kfree(mr->mr_sg); + kfree(mr); +} + +/* MRs are dynamically allocated, so simply clean up and release the MR. + * A replacement MR will subsequently be allocated on demand. + */ +static void +frwr_mr_recycle_worker(struct work_struct *work) +{ + struct rpcrdma_mr *mr = container_of(work, struct rpcrdma_mr, mr_recycle); + enum rpcrdma_frwr_state state = mr->frwr.fr_state; + struct rpcrdma_xprt *r_xprt = mr->mr_xprt; + + trace_xprtrdma_mr_recycle(mr); + + if (state != FRWR_FLUSHED_LI) { + trace_xprtrdma_mr_unmap(mr); + ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, + mr->mr_sg, mr->mr_nents, mr->mr_dir); + } + + spin_lock(&r_xprt->rx_buf.rb_mrlock); + list_del(&mr->mr_all); + r_xprt->rx_stats.mrs_recycled++; + spin_unlock(&r_xprt->rx_buf.rb_mrlock); + frwr_op_release_mr(mr); +} + static int frwr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr) { @@ -113,6 +151,7 @@ frwr_op_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr) goto out_list_err; INIT_LIST_HEAD(&mr->mr_list); + INIT_WORK(&mr->mr_recycle, frwr_mr_recycle_worker); sg_init_table(mr->mr_sg, depth); init_completion(&frwr->fr_linv_done); return 0; @@ -131,79 +170,6 @@ out_list_err: return rc; } -static void -frwr_op_release_mr(struct rpcrdma_mr *mr) -{ - int rc; - - rc = ib_dereg_mr(mr->frwr.fr_mr); - if (rc) - pr_err("rpcrdma: final ib_dereg_mr for %p returned %i\n", - mr, rc); - kfree(mr->mr_sg); - kfree(mr); -} - -static int -__frwr_mr_reset(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr) -{ - struct rpcrdma_frwr *frwr = &mr->frwr; - int rc; - - rc = ib_dereg_mr(frwr->fr_mr); - if (rc) { - pr_warn("rpcrdma: ib_dereg_mr status %d, frwr %p orphaned\n", - rc, mr); - return rc; - } - - frwr->fr_mr = ib_alloc_mr(ia->ri_pd, ia->ri_mrtype, - ia->ri_max_frwr_depth); - if (IS_ERR(frwr->fr_mr)) { - pr_warn("rpcrdma: ib_alloc_mr status %ld, frwr %p orphaned\n", - PTR_ERR(frwr->fr_mr), mr); - return PTR_ERR(frwr->fr_mr); - } - - dprintk("RPC: %s: recovered FRWR %p\n", __func__, frwr); - frwr->fr_state = FRWR_IS_INVALID; - return 0; -} - -/* Reset of a single FRWR. Generate a fresh rkey by replacing the MR. - */ -static void -frwr_op_recover_mr(struct rpcrdma_mr *mr) -{ - enum rpcrdma_frwr_state state = mr->frwr.fr_state; - struct rpcrdma_xprt *r_xprt = mr->mr_xprt; - struct rpcrdma_ia *ia = &r_xprt->rx_ia; - int rc; - - rc = __frwr_mr_reset(ia, mr); - if (state != FRWR_FLUSHED_LI) { - trace_xprtrdma_dma_unmap(mr); - ib_dma_unmap_sg(ia->ri_device, - mr->mr_sg, mr->mr_nents, mr->mr_dir); - } - if (rc) - goto out_release; - - rpcrdma_mr_put(mr); - r_xprt->rx_stats.mrs_recovered++; - return; - -out_release: - pr_err("rpcrdma: FRWR reset failed %d, %p released\n", rc, mr); - r_xprt->rx_stats.mrs_orphaned++; - - spin_lock(&r_xprt->rx_buf.rb_mrlock); - list_del(&mr->mr_all); - spin_unlock(&r_xprt->rx_buf.rb_mrlock); - - frwr_op_release_mr(mr); -} - /* On success, sets: * ep->rep_attr.cap.max_send_wr * ep->rep_attr.cap.max_recv_wr @@ -276,6 +242,7 @@ frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, ia->ri_max_segs = max_t(unsigned int, 1, RPCRDMA_MAX_DATA_SEGS / ia->ri_max_frwr_depth); + ia->ri_max_segs += 2; /* segments for head and tail buffers */ return 0; } @@ -384,7 +351,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, mr = NULL; do { if (mr) - rpcrdma_mr_defer_recovery(mr); + rpcrdma_mr_recycle(mr); mr = rpcrdma_mr_get(r_xprt); if (!mr) return ERR_PTR(-EAGAIN); @@ -417,7 +384,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg, mr->mr_nents = ib_dma_map_sg(ia->ri_device, mr->mr_sg, i, mr->mr_dir); if (!mr->mr_nents) goto out_dmamap_err; - trace_xprtrdma_dma_map(mr); + trace_xprtrdma_mr_map(mr); ibmr = frwr->fr_mr; n = ib_map_mr_sg(ibmr, mr->mr_sg, mr->mr_nents, NULL, PAGE_SIZE); @@ -451,7 +418,7 @@ out_dmamap_err: out_mapmr_err: pr_err("rpcrdma: failed to map mr %p (%d/%d)\n", frwr->fr_mr, n, mr->mr_nents); - rpcrdma_mr_defer_recovery(mr); + rpcrdma_mr_recycle(mr); return ERR_PTR(-EIO); } @@ -499,7 +466,7 @@ frwr_op_reminv(struct rpcrdma_rep *rep, struct list_head *mrs) list_for_each_entry(mr, mrs, mr_list) if (mr->mr_handle == rep->rr_inv_rkey) { list_del_init(&mr->mr_list); - trace_xprtrdma_remoteinv(mr); + trace_xprtrdma_mr_remoteinv(mr); mr->frwr.fr_state = FRWR_IS_INVALID; rpcrdma_mr_unmap_and_put(mr); break; /* only one invalidated MR per RPC */ @@ -536,7 +503,7 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mrs) mr->frwr.fr_state = FRWR_IS_INVALID; frwr = &mr->frwr; - trace_xprtrdma_localinv(mr); + trace_xprtrdma_mr_localinv(mr); frwr->fr_cqe.done = frwr_wc_localinv; last = &frwr->fr_invwr; @@ -570,7 +537,7 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct list_head *mrs) if (bad_wr != first) wait_for_completion(&frwr->fr_linv_done); if (rc) - goto reset_mrs; + goto out_release; /* ORDER: Now DMA unmap all of the MRs, and return * them to the free MR list. @@ -582,22 +549,21 @@ unmap: } return; -reset_mrs: +out_release: pr_err("rpcrdma: FRWR invalidate ib_post_send returned %i\n", rc); - /* Find and reset the MRs in the LOCAL_INV WRs that did not + /* Unmap and release the MRs in the LOCAL_INV WRs that did not * get posted. */ while (bad_wr) { frwr = container_of(bad_wr, struct rpcrdma_frwr, fr_invwr); mr = container_of(frwr, struct rpcrdma_mr, frwr); - - __frwr_mr_reset(ia, mr); - bad_wr = bad_wr->next; + + list_del(&mr->mr_list); + frwr_op_release_mr(mr); } - goto unmap; } const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = { @@ -605,7 +571,6 @@ const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = { .ro_send = frwr_op_send, .ro_reminv = frwr_op_reminv, .ro_unmap_sync = frwr_op_unmap_sync, - .ro_recover_mr = frwr_op_recover_mr, .ro_open = frwr_op_open, .ro_maxpages = frwr_op_maxpages, .ro_init_mr = frwr_op_init_mr, diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c index c8ae983c6cc0..9f53e0240035 100644 --- a/net/sunrpc/xprtrdma/rpc_rdma.c +++ b/net/sunrpc/xprtrdma/rpc_rdma.c @@ -71,7 +71,6 @@ static unsigned int rpcrdma_max_call_header_size(unsigned int maxsegs) size = RPCRDMA_HDRLEN_MIN; /* Maximum Read list size */ - maxsegs += 2; /* segment for head and tail buffers */ size = maxsegs * rpcrdma_readchunk_maxsz * sizeof(__be32); /* Minimal Read chunk size */ @@ -97,7 +96,6 @@ static unsigned int rpcrdma_max_reply_header_size(unsigned int maxsegs) size = RPCRDMA_HDRLEN_MIN; /* Maximum Write list size */ - maxsegs += 2; /* segment for head and tail buffers */ size = sizeof(__be32); /* segment count */ size += maxsegs * rpcrdma_segment_maxsz * sizeof(__be32); size += sizeof(__be32); /* list discriminator */ @@ -805,7 +803,7 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) struct rpcrdma_mr *mr; mr = rpcrdma_mr_pop(&req->rl_registered); - rpcrdma_mr_defer_recovery(mr); + rpcrdma_mr_recycle(mr); } /* This implementation supports the following combinations @@ -866,7 +864,7 @@ rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst) out_err: switch (ret) { case -EAGAIN: - xprt_wait_for_buffer_space(rqst->rq_task, NULL); + xprt_wait_for_buffer_space(rqst->rq_xprt); break; case -ENOBUFS: break; @@ -1216,7 +1214,6 @@ void rpcrdma_complete_rqst(struct rpcrdma_rep *rep) struct rpcrdma_xprt *r_xprt = rep->rr_rxprt; struct rpc_xprt *xprt = &r_xprt->rx_xprt; struct rpc_rqst *rqst = rep->rr_rqst; - unsigned long cwnd; int status; xprt->reestablish_timeout = 0; @@ -1238,15 +1235,10 @@ void rpcrdma_complete_rqst(struct rpcrdma_rep *rep) goto out_badheader; out: - spin_lock(&xprt->recv_lock); - cwnd = xprt->cwnd; - xprt->cwnd = r_xprt->rx_buf.rb_credits << RPC_CWNDSHIFT; - if (xprt->cwnd > cwnd) - xprt_release_rqst_cong(rqst->rq_task); - + spin_lock(&xprt->queue_lock); xprt_complete_rqst(rqst->rq_task, status); xprt_unpin_rqst(rqst); - spin_unlock(&xprt->recv_lock); + spin_unlock(&xprt->queue_lock); return; /* If the incoming reply terminated a pending RPC, the next @@ -1345,19 +1337,23 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep) /* Match incoming rpcrdma_rep to an rpcrdma_req to * get context for handling any incoming chunks. */ - spin_lock(&xprt->recv_lock); + spin_lock(&xprt->queue_lock); rqst = xprt_lookup_rqst(xprt, rep->rr_xid); if (!rqst) goto out_norqst; xprt_pin_rqst(rqst); + spin_unlock(&xprt->queue_lock); if (credits == 0) credits = 1; /* don't deadlock */ else if (credits > buf->rb_max_requests) credits = buf->rb_max_requests; - buf->rb_credits = credits; - - spin_unlock(&xprt->recv_lock); + if (buf->rb_credits != credits) { + spin_lock_bh(&xprt->transport_lock); + buf->rb_credits = credits; + xprt->cwnd = credits << RPC_CWNDSHIFT; + spin_unlock_bh(&xprt->transport_lock); + } req = rpcr_to_rdmar(rqst); req->rl_reply = rep; @@ -1378,7 +1374,7 @@ out_badversion: * is corrupt. */ out_norqst: - spin_unlock(&xprt->recv_lock); + spin_unlock(&xprt->queue_lock); trace_xprtrdma_reply_rqst(rep); goto repost; diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c index a68180090554..d3a1a237cee6 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -56,7 +56,7 @@ int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, __be32 *rdma_resp, if (src->iov_len < 24) goto out_shortreply; - spin_lock(&xprt->recv_lock); + spin_lock(&xprt->queue_lock); req = xprt_lookup_rqst(xprt, xid); if (!req) goto out_notfound; @@ -86,7 +86,7 @@ int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, __be32 *rdma_resp, rcvbuf->len = 0; out_unlock: - spin_unlock(&xprt->recv_lock); + spin_unlock(&xprt->queue_lock); out: return ret; @@ -215,9 +215,8 @@ drop_connection: * connection. */ static int -xprt_rdma_bc_send_request(struct rpc_task *task) +xprt_rdma_bc_send_request(struct rpc_rqst *rqst) { - struct rpc_rqst *rqst = task->tk_rqstp; struct svc_xprt *sxprt = rqst->rq_xprt->bc_xprt; struct svcxprt_rdma *rdma; int ret; @@ -225,12 +224,7 @@ xprt_rdma_bc_send_request(struct rpc_task *task) dprintk("svcrdma: sending bc call with xid: %08x\n", be32_to_cpu(rqst->rq_xid)); - if (!mutex_trylock(&sxprt->xpt_mutex)) { - rpc_sleep_on(&sxprt->xpt_bc_pending, task, NULL); - if (!mutex_trylock(&sxprt->xpt_mutex)) - return -EAGAIN; - rpc_wake_up_queued_task(&sxprt->xpt_bc_pending, task); - } + mutex_lock(&sxprt->xpt_mutex); ret = -ENOTCONN; rdma = container_of(sxprt, struct svcxprt_rdma, sc_xprt); @@ -248,6 +242,7 @@ static void xprt_rdma_bc_close(struct rpc_xprt *xprt) { dprintk("svcrdma: %s: xprt %p\n", __func__, xprt); + xprt->cwnd = RPC_CWNDSHIFT; } static void diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 143ce2579ba9..ae2a83828953 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -225,69 +225,59 @@ xprt_rdma_free_addresses(struct rpc_xprt *xprt) } } -void -rpcrdma_conn_func(struct rpcrdma_ep *ep) -{ - schedule_delayed_work(&ep->rep_connect_worker, 0); -} - -void -rpcrdma_connect_worker(struct work_struct *work) -{ - struct rpcrdma_ep *ep = - container_of(work, struct rpcrdma_ep, rep_connect_worker.work); - struct rpcrdma_xprt *r_xprt = - container_of(ep, struct rpcrdma_xprt, rx_ep); - struct rpc_xprt *xprt = &r_xprt->rx_xprt; - - spin_lock_bh(&xprt->transport_lock); - if (ep->rep_connected > 0) { - if (!xprt_test_and_set_connected(xprt)) - xprt_wake_pending_tasks(xprt, 0); - } else { - if (xprt_test_and_clear_connected(xprt)) - xprt_wake_pending_tasks(xprt, -ENOTCONN); - } - spin_unlock_bh(&xprt->transport_lock); -} - +/** + * xprt_rdma_connect_worker - establish connection in the background + * @work: worker thread context + * + * Requester holds the xprt's send lock to prevent activity on this + * transport while a fresh connection is being established. RPC tasks + * sleep on the xprt's pending queue waiting for connect to complete. + */ static void xprt_rdma_connect_worker(struct work_struct *work) { struct rpcrdma_xprt *r_xprt = container_of(work, struct rpcrdma_xprt, rx_connect_worker.work); struct rpc_xprt *xprt = &r_xprt->rx_xprt; - int rc = 0; - - xprt_clear_connected(xprt); + int rc; rc = rpcrdma_ep_connect(&r_xprt->rx_ep, &r_xprt->rx_ia); - if (rc) - xprt_wake_pending_tasks(xprt, rc); - xprt_clear_connecting(xprt); + if (r_xprt->rx_ep.rep_connected > 0) { + if (!xprt_test_and_set_connected(xprt)) { + xprt->stat.connect_count++; + xprt->stat.connect_time += (long)jiffies - + xprt->stat.connect_start; + xprt_wake_pending_tasks(xprt, -EAGAIN); + } + } else { + if (xprt_test_and_clear_connected(xprt)) + xprt_wake_pending_tasks(xprt, rc); + } } +/** + * xprt_rdma_inject_disconnect - inject a connection fault + * @xprt: transport context + * + * If @xprt is connected, disconnect it to simulate spurious connection + * loss. + */ static void xprt_rdma_inject_disconnect(struct rpc_xprt *xprt) { - struct rpcrdma_xprt *r_xprt = container_of(xprt, struct rpcrdma_xprt, - rx_xprt); + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); trace_xprtrdma_inject_dsc(r_xprt); rdma_disconnect(r_xprt->rx_ia.ri_id); } -/* - * xprt_rdma_destroy +/** + * xprt_rdma_destroy - Full tear down of transport + * @xprt: doomed transport context * - * Destroy the xprt. - * Free all memory associated with the object, including its own. - * NOTE: none of the *destroy methods free memory for their top-level - * objects, even though they may have allocated it (they do free - * private memory). It's up to the caller to handle it. In this - * case (RDMA transport), all structure memory is inlined with the - * struct rpcrdma_xprt. + * Caller guarantees there will be no more calls to us with + * this @xprt. */ static void xprt_rdma_destroy(struct rpc_xprt *xprt) @@ -298,8 +288,6 @@ xprt_rdma_destroy(struct rpc_xprt *xprt) cancel_delayed_work_sync(&r_xprt->rx_connect_worker); - xprt_clear_connected(xprt); - rpcrdma_ep_destroy(&r_xprt->rx_ep, &r_xprt->rx_ia); rpcrdma_buffer_destroy(&r_xprt->rx_buf); rpcrdma_ia_close(&r_xprt->rx_ia); @@ -442,11 +430,12 @@ out1: } /** - * xprt_rdma_close - Close down RDMA connection - * @xprt: generic transport to be closed + * xprt_rdma_close - close a transport connection + * @xprt: transport context * - * Called during transport shutdown reconnect, or device - * removal. Caller holds the transport's write lock. + * Called during transport shutdown, reconnect, or device removal. + * Caller holds @xprt's send lock to prevent activity on this + * transport while the connection is torn down. */ static void xprt_rdma_close(struct rpc_xprt *xprt) @@ -468,6 +457,12 @@ xprt_rdma_close(struct rpc_xprt *xprt) xprt->reestablish_timeout = 0; xprt_disconnect_done(xprt); rpcrdma_ep_disconnect(ep, ia); + + /* Prepare @xprt for the next connection by reinitializing + * its credit grant to one (see RFC 8166, Section 3.3.3). + */ + r_xprt->rx_buf.rb_credits = 1; + xprt->cwnd = RPC_CWNDSHIFT; } /** @@ -519,6 +514,12 @@ xprt_rdma_timer(struct rpc_xprt *xprt, struct rpc_task *task) xprt_force_disconnect(xprt); } +/** + * xprt_rdma_connect - try to establish a transport connection + * @xprt: transport state + * @task: RPC scheduler context + * + */ static void xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task) { @@ -638,13 +639,6 @@ rpcrdma_get_recvbuf(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req, * 0: Success; rq_buffer points to RPC buffer to use * ENOMEM: Out of memory, call again later * EIO: A permanent error occurred, do not retry - * - * The RDMA allocate/free functions need the task structure as a place - * to hide the struct rpcrdma_req, which is necessary for the actual - * send/recv sequence. - * - * xprt_rdma_allocate provides buffers that are already mapped for - * DMA, and a local DMA lkey is provided for each. */ static int xprt_rdma_allocate(struct rpc_task *task) @@ -693,7 +687,7 @@ xprt_rdma_free(struct rpc_task *task) /** * xprt_rdma_send_request - marshal and send an RPC request - * @task: RPC task with an RPC message in rq_snd_buf + * @rqst: RPC message in rq_snd_buf * * Caller holds the transport's write lock. * @@ -706,9 +700,8 @@ xprt_rdma_free(struct rpc_task *task) * sent. Do not try to send this message again. */ static int -xprt_rdma_send_request(struct rpc_task *task) +xprt_rdma_send_request(struct rpc_rqst *rqst) { - struct rpc_rqst *rqst = task->tk_rqstp; struct rpc_xprt *xprt = rqst->rq_xprt; struct rpcrdma_req *req = rpcr_to_rdmar(rqst); struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); @@ -722,6 +715,9 @@ xprt_rdma_send_request(struct rpc_task *task) if (!xprt_connected(xprt)) goto drop_connection; + if (!xprt_request_get_cong(xprt, rqst)) + return -EBADSLT; + rc = rpcrdma_marshal_req(r_xprt, rqst); if (rc < 0) goto failed_marshal; @@ -741,7 +737,7 @@ xprt_rdma_send_request(struct rpc_task *task) /* An RPC with no reply will throw off credit accounting, * so drop the connection to reset the credit grant. */ - if (!rpc_reply_expected(task)) + if (!rpc_reply_expected(rqst->rq_task)) goto drop_connection; return 0; @@ -766,7 +762,7 @@ void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) 0, /* need a local port? */ xprt->stat.bind_count, xprt->stat.connect_count, - xprt->stat.connect_time, + xprt->stat.connect_time / HZ, idle_time, xprt->stat.sends, xprt->stat.recvs, @@ -786,7 +782,7 @@ void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) r_xprt->rx_stats.bad_reply_count, r_xprt->rx_stats.nomsg_call_count); seq_printf(seq, "%lu %lu %lu %lu %lu %lu\n", - r_xprt->rx_stats.mrs_recovered, + r_xprt->rx_stats.mrs_recycled, r_xprt->rx_stats.mrs_orphaned, r_xprt->rx_stats.mrs_allocated, r_xprt->rx_stats.local_inv_needed, diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 956a5ea47b58..3ddba94c939f 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -108,20 +108,48 @@ rpcrdma_destroy_wq(void) } } +/** + * rpcrdma_disconnect_worker - Force a disconnect + * @work: endpoint to be disconnected + * + * Provider callbacks can possibly run in an IRQ context. This function + * is invoked in a worker thread to guarantee that disconnect wake-up + * calls are always done in process context. + */ +static void +rpcrdma_disconnect_worker(struct work_struct *work) +{ + struct rpcrdma_ep *ep = container_of(work, struct rpcrdma_ep, + rep_disconnect_worker.work); + struct rpcrdma_xprt *r_xprt = + container_of(ep, struct rpcrdma_xprt, rx_ep); + + xprt_force_disconnect(&r_xprt->rx_xprt); +} + +/** + * rpcrdma_qp_event_handler - Handle one QP event (error notification) + * @event: details of the event + * @context: ep that owns QP where event occurred + * + * Called from the RDMA provider (device driver) possibly in an interrupt + * context. + */ static void -rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context) +rpcrdma_qp_event_handler(struct ib_event *event, void *context) { struct rpcrdma_ep *ep = context; struct rpcrdma_xprt *r_xprt = container_of(ep, struct rpcrdma_xprt, rx_ep); - trace_xprtrdma_qp_error(r_xprt, event); - pr_err("rpcrdma: %s on device %s ep %p\n", - ib_event_msg(event->event), event->device->name, context); + trace_xprtrdma_qp_event(r_xprt, event); + pr_err("rpcrdma: %s on device %s connected to %s:%s\n", + ib_event_msg(event->event), event->device->name, + rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt)); if (ep->rep_connected == 1) { ep->rep_connected = -EIO; - rpcrdma_conn_func(ep); + schedule_delayed_work(&ep->rep_disconnect_worker, 0); wake_up_all(&ep->rep_connect_wait); } } @@ -219,38 +247,48 @@ rpcrdma_update_connect_private(struct rpcrdma_xprt *r_xprt, rpcrdma_set_max_header_sizes(r_xprt); } +/** + * rpcrdma_cm_event_handler - Handle RDMA CM events + * @id: rdma_cm_id on which an event has occurred + * @event: details of the event + * + * Called with @id's mutex held. Returns 1 if caller should + * destroy @id, otherwise 0. + */ static int -rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) +rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event) { - struct rpcrdma_xprt *xprt = id->context; - struct rpcrdma_ia *ia = &xprt->rx_ia; - struct rpcrdma_ep *ep = &xprt->rx_ep; - int connstate = 0; + struct rpcrdma_xprt *r_xprt = id->context; + struct rpcrdma_ia *ia = &r_xprt->rx_ia; + struct rpcrdma_ep *ep = &r_xprt->rx_ep; + struct rpc_xprt *xprt = &r_xprt->rx_xprt; + + might_sleep(); - trace_xprtrdma_conn_upcall(xprt, event); + trace_xprtrdma_cm_event(r_xprt, event); switch (event->event) { case RDMA_CM_EVENT_ADDR_RESOLVED: case RDMA_CM_EVENT_ROUTE_RESOLVED: ia->ri_async_rc = 0; complete(&ia->ri_done); - break; + return 0; case RDMA_CM_EVENT_ADDR_ERROR: ia->ri_async_rc = -EPROTO; complete(&ia->ri_done); - break; + return 0; case RDMA_CM_EVENT_ROUTE_ERROR: ia->ri_async_rc = -ENETUNREACH; complete(&ia->ri_done); - break; + return 0; case RDMA_CM_EVENT_DEVICE_REMOVAL: #if IS_ENABLED(CONFIG_SUNRPC_DEBUG) pr_info("rpcrdma: removing device %s for %s:%s\n", ia->ri_device->name, - rpcrdma_addrstr(xprt), rpcrdma_portstr(xprt)); + rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt)); #endif set_bit(RPCRDMA_IAF_REMOVING, &ia->ri_flags); ep->rep_connected = -ENODEV; - xprt_force_disconnect(&xprt->rx_xprt); + xprt_force_disconnect(xprt); wait_for_completion(&ia->ri_remove_done); ia->ri_id = NULL; @@ -258,41 +296,40 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event) /* Return 1 to ensure the core destroys the id. */ return 1; case RDMA_CM_EVENT_ESTABLISHED: - ++xprt->rx_xprt.connect_cookie; - connstate = 1; - rpcrdma_update_connect_private(xprt, &event->param.conn); - goto connected; + ++xprt->connect_cookie; + ep->rep_connected = 1; + rpcrdma_update_connect_private(r_xprt, &event->param.conn); + wake_up_all(&ep->rep_connect_wait); + break; case RDMA_CM_EVENT_CONNECT_ERROR: - connstate = -ENOTCONN; - goto connected; + ep->rep_connected = -ENOTCONN; + goto disconnected; case RDMA_CM_EVENT_UNREACHABLE: - connstate = -ENETUNREACH; - goto connected; + ep->rep_connected = -ENETUNREACH; + goto disconnected; case RDMA_CM_EVENT_REJECTED: dprintk("rpcrdma: connection to %s:%s rejected: %s\n", - rpcrdma_addrstr(xprt), rpcrdma_portstr(xprt), + rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt), rdma_reject_msg(id, event->status)); - connstate = -ECONNREFUSED; + ep->rep_connected = -ECONNREFUSED; if (event->status == IB_CM_REJ_STALE_CONN) - connstate = -EAGAIN; - goto connected; + ep->rep_connected = -EAGAIN; + goto disconnected; case RDMA_CM_EVENT_DISCONNECTED: - ++xprt->rx_xprt.connect_cookie; - connstate = -ECONNABORTED; -connected: - ep->rep_connected = connstate; - rpcrdma_conn_func(ep); + ++xprt->connect_cookie; + ep->rep_connected = -ECONNABORTED; +disconnected: + xprt_force_disconnect(xprt); wake_up_all(&ep->rep_connect_wait); - /*FALLTHROUGH*/ + break; default: - dprintk("RPC: %s: %s:%s on %s/%s (ep 0x%p): %s\n", - __func__, - rpcrdma_addrstr(xprt), rpcrdma_portstr(xprt), - ia->ri_device->name, ia->ri_ops->ro_displayname, - ep, rdma_event_msg(event->event)); break; } + dprintk("RPC: %s: %s:%s on %s/%s: %s\n", __func__, + rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt), + ia->ri_device->name, ia->ri_ops->ro_displayname, + rdma_event_msg(event->event)); return 0; } @@ -308,7 +345,7 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt, struct rpcrdma_ia *ia) init_completion(&ia->ri_done); init_completion(&ia->ri_remove_done); - id = rdma_create_id(xprt->rx_xprt.xprt_net, rpcrdma_conn_upcall, + id = rdma_create_id(xprt->rx_xprt.xprt_net, rpcrdma_cm_event_handler, xprt, RDMA_PS_TCP, IB_QPT_RC); if (IS_ERR(id)) { rc = PTR_ERR(id); @@ -519,7 +556,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, if (rc) return rc; - ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall; + ep->rep_attr.event_handler = rpcrdma_qp_event_handler; ep->rep_attr.qp_context = ep; ep->rep_attr.srq = NULL; ep->rep_attr.cap.max_send_sge = max_sge; @@ -542,7 +579,8 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, cdata->max_requests >> 2); ep->rep_send_count = ep->rep_send_batch; init_waitqueue_head(&ep->rep_connect_wait); - INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker); + INIT_DELAYED_WORK(&ep->rep_disconnect_worker, + rpcrdma_disconnect_worker); sendcq = ib_alloc_cq(ia->ri_device, NULL, ep->rep_attr.cap.max_send_wr + 1, @@ -615,7 +653,7 @@ out1: void rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) { - cancel_delayed_work_sync(&ep->rep_connect_worker); + cancel_delayed_work_sync(&ep->rep_disconnect_worker); if (ia->ri_id && ia->ri_id->qp) { rpcrdma_ep_disconnect(ep, ia); @@ -728,6 +766,7 @@ rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia) { struct rpcrdma_xprt *r_xprt = container_of(ia, struct rpcrdma_xprt, rx_ia); + struct rpc_xprt *xprt = &r_xprt->rx_xprt; int rc; retry: @@ -754,6 +793,8 @@ retry: } ep->rep_connected = 0; + xprt_clear_connected(xprt); + rpcrdma_post_recvs(r_xprt, true); rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma); @@ -877,7 +918,6 @@ static int rpcrdma_sendctxs_create(struct rpcrdma_xprt *r_xprt) sc->sc_xprt = r_xprt; buf->rb_sc_ctxs[i] = sc; } - buf->rb_flags = 0; return 0; @@ -978,39 +1018,6 @@ rpcrdma_sendctx_put_locked(struct rpcrdma_sendctx *sc) } static void -rpcrdma_mr_recovery_worker(struct work_struct *work) -{ - struct rpcrdma_buffer *buf = container_of(work, struct rpcrdma_buffer, - rb_recovery_worker.work); - struct rpcrdma_mr *mr; - - spin_lock(&buf->rb_recovery_lock); - while (!list_empty(&buf->rb_stale_mrs)) { - mr = rpcrdma_mr_pop(&buf->rb_stale_mrs); - spin_unlock(&buf->rb_recovery_lock); - - trace_xprtrdma_recover_mr(mr); - mr->mr_xprt->rx_ia.ri_ops->ro_recover_mr(mr); - - spin_lock(&buf->rb_recovery_lock); - } - spin_unlock(&buf->rb_recovery_lock); -} - -void -rpcrdma_mr_defer_recovery(struct rpcrdma_mr *mr) -{ - struct rpcrdma_xprt *r_xprt = mr->mr_xprt; - struct rpcrdma_buffer *buf = &r_xprt->rx_buf; - - spin_lock(&buf->rb_recovery_lock); - rpcrdma_mr_push(mr, &buf->rb_stale_mrs); - spin_unlock(&buf->rb_recovery_lock); - - schedule_delayed_work(&buf->rb_recovery_worker, 0); -} - -static void rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt) { struct rpcrdma_buffer *buf = &r_xprt->rx_buf; @@ -1019,7 +1026,7 @@ rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt) LIST_HEAD(free); LIST_HEAD(all); - for (count = 0; count < 3; count++) { + for (count = 0; count < ia->ri_max_segs; count++) { struct rpcrdma_mr *mr; int rc; @@ -1138,18 +1145,15 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) struct rpcrdma_buffer *buf = &r_xprt->rx_buf; int i, rc; + buf->rb_flags = 0; buf->rb_max_requests = r_xprt->rx_data.max_requests; buf->rb_bc_srv_max_requests = 0; spin_lock_init(&buf->rb_mrlock); spin_lock_init(&buf->rb_lock); - spin_lock_init(&buf->rb_recovery_lock); INIT_LIST_HEAD(&buf->rb_mrs); INIT_LIST_HEAD(&buf->rb_all); - INIT_LIST_HEAD(&buf->rb_stale_mrs); INIT_DELAYED_WORK(&buf->rb_refresh_worker, rpcrdma_mr_refresh_worker); - INIT_DELAYED_WORK(&buf->rb_recovery_worker, - rpcrdma_mr_recovery_worker); rpcrdma_mrs_create(r_xprt); @@ -1233,7 +1237,6 @@ rpcrdma_mrs_destroy(struct rpcrdma_buffer *buf) void rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf) { - cancel_delayed_work_sync(&buf->rb_recovery_worker); cancel_delayed_work_sync(&buf->rb_refresh_worker); rpcrdma_sendctxs_destroy(buf); @@ -1326,7 +1329,7 @@ rpcrdma_mr_unmap_and_put(struct rpcrdma_mr *mr) { struct rpcrdma_xprt *r_xprt = mr->mr_xprt; - trace_xprtrdma_dma_unmap(mr); + trace_xprtrdma_mr_unmap(mr); ib_dma_unmap_sg(r_xprt->rx_ia.ri_device, mr->mr_sg, mr->mr_nents, mr->mr_dir); __rpcrdma_mr_put(&r_xprt->rx_buf, mr); @@ -1518,9 +1521,11 @@ rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp) struct ib_recv_wr *wr, *bad_wr; int needed, count, rc; + rc = 0; + count = 0; needed = buf->rb_credits + (buf->rb_bc_srv_max_requests << 1); if (buf->rb_posted_receives > needed) - return; + goto out; needed -= buf->rb_posted_receives; count = 0; @@ -1556,7 +1561,7 @@ rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp) --needed; } if (!count) - return; + goto out; rc = ib_post_recv(r_xprt->rx_ia.ri_id->qp, wr, (const struct ib_recv_wr **)&bad_wr); @@ -1570,5 +1575,6 @@ rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp) } } buf->rb_posted_receives += count; +out: trace_xprtrdma_post_recvs(r_xprt, count, rc); } diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 2ca14f7c2d51..a13ccb643ce0 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -101,7 +101,7 @@ struct rpcrdma_ep { wait_queue_head_t rep_connect_wait; struct rpcrdma_connect_private rep_cm_private; struct rdma_conn_param rep_remote_cma; - struct delayed_work rep_connect_worker; + struct delayed_work rep_disconnect_worker; }; /* Pre-allocate extra Work Requests for handling backward receives @@ -280,6 +280,7 @@ struct rpcrdma_mr { u32 mr_handle; u32 mr_length; u64 mr_offset; + struct work_struct mr_recycle; struct list_head mr_all; }; @@ -411,9 +412,6 @@ struct rpcrdma_buffer { u32 rb_bc_max_requests; - spinlock_t rb_recovery_lock; /* protect rb_stale_mrs */ - struct list_head rb_stale_mrs; - struct delayed_work rb_recovery_worker; struct delayed_work rb_refresh_worker; }; #define rdmab_to_ia(b) (&container_of((b), struct rpcrdma_xprt, rx_buf)->rx_ia) @@ -452,7 +450,7 @@ struct rpcrdma_stats { unsigned long hardway_register_count; unsigned long failed_marshal_count; unsigned long bad_reply_count; - unsigned long mrs_recovered; + unsigned long mrs_recycled; unsigned long mrs_orphaned; unsigned long mrs_allocated; unsigned long empty_sendctx_q; @@ -481,7 +479,6 @@ struct rpcrdma_memreg_ops { struct list_head *mrs); void (*ro_unmap_sync)(struct rpcrdma_xprt *, struct list_head *); - void (*ro_recover_mr)(struct rpcrdma_mr *mr); int (*ro_open)(struct rpcrdma_ia *, struct rpcrdma_ep *, struct rpcrdma_create_data_internal *); @@ -559,7 +556,6 @@ int rpcrdma_ep_create(struct rpcrdma_ep *, struct rpcrdma_ia *, struct rpcrdma_create_data_internal *); void rpcrdma_ep_destroy(struct rpcrdma_ep *, struct rpcrdma_ia *); int rpcrdma_ep_connect(struct rpcrdma_ep *, struct rpcrdma_ia *); -void rpcrdma_conn_func(struct rpcrdma_ep *ep); void rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *); int rpcrdma_ep_post(struct rpcrdma_ia *, struct rpcrdma_ep *, @@ -578,7 +574,12 @@ struct rpcrdma_sendctx *rpcrdma_sendctx_get_locked(struct rpcrdma_buffer *buf); struct rpcrdma_mr *rpcrdma_mr_get(struct rpcrdma_xprt *r_xprt); void rpcrdma_mr_put(struct rpcrdma_mr *mr); void rpcrdma_mr_unmap_and_put(struct rpcrdma_mr *mr); -void rpcrdma_mr_defer_recovery(struct rpcrdma_mr *mr); + +static inline void +rpcrdma_mr_recycle(struct rpcrdma_mr *mr) +{ + schedule_work(&mr->mr_recycle); +} struct rpcrdma_req *rpcrdma_buffer_get(struct rpcrdma_buffer *); void rpcrdma_buffer_put(struct rpcrdma_req *); @@ -652,7 +653,6 @@ static inline void rpcrdma_set_xdrlen(struct xdr_buf *xdr, size_t len) extern unsigned int xprt_rdma_max_inline_read; void xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap); void xprt_rdma_free_addresses(struct rpc_xprt *xprt); -void rpcrdma_connect_worker(struct work_struct *work); void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq); int xprt_rdma_init(void); void xprt_rdma_cleanup(void); diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 6b7539c0466e..1b51e04d3566 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -47,13 +47,13 @@ #include <net/checksum.h> #include <net/udp.h> #include <net/tcp.h> +#include <linux/bvec.h> +#include <linux/uio.h> #include <trace/events/sunrpc.h> #include "sunrpc.h" -#define RPC_TCP_READ_CHUNK_SZ (3*512*1024) - static void xs_close(struct rpc_xprt *xprt); static void xs_tcp_set_socket_timeouts(struct rpc_xprt *xprt, struct socket *sock); @@ -129,7 +129,7 @@ static struct ctl_table xs_tunables_table[] = { .mode = 0644, .proc_handler = proc_dointvec_minmax, .extra1 = &xprt_min_resvport_limit, - .extra2 = &xprt_max_resvport + .extra2 = &xprt_max_resvport_limit }, { .procname = "max_resvport", @@ -137,7 +137,7 @@ static struct ctl_table xs_tunables_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = proc_dointvec_minmax, - .extra1 = &xprt_min_resvport, + .extra1 = &xprt_min_resvport_limit, .extra2 = &xprt_max_resvport_limit }, { @@ -325,6 +325,362 @@ static void xs_free_peer_addresses(struct rpc_xprt *xprt) } } +static size_t +xs_alloc_sparse_pages(struct xdr_buf *buf, size_t want, gfp_t gfp) +{ + size_t i,n; + + if (!(buf->flags & XDRBUF_SPARSE_PAGES)) + return want; + if (want > buf->page_len) + want = buf->page_len; + n = (buf->page_base + want + PAGE_SIZE - 1) >> PAGE_SHIFT; + for (i = 0; i < n; i++) { + if (buf->pages[i]) + continue; + buf->bvec[i].bv_page = buf->pages[i] = alloc_page(gfp); + if (!buf->pages[i]) { + buf->page_len = (i * PAGE_SIZE) - buf->page_base; + return buf->page_len; + } + } + return want; +} + +static ssize_t +xs_sock_recvmsg(struct socket *sock, struct msghdr *msg, int flags, size_t seek) +{ + ssize_t ret; + if (seek != 0) + iov_iter_advance(&msg->msg_iter, seek); + ret = sock_recvmsg(sock, msg, flags); + return ret > 0 ? ret + seek : ret; +} + +static ssize_t +xs_read_kvec(struct socket *sock, struct msghdr *msg, int flags, + struct kvec *kvec, size_t count, size_t seek) +{ + iov_iter_kvec(&msg->msg_iter, READ | ITER_KVEC, kvec, 1, count); + return xs_sock_recvmsg(sock, msg, flags, seek); +} + +static ssize_t +xs_read_bvec(struct socket *sock, struct msghdr *msg, int flags, + struct bio_vec *bvec, unsigned long nr, size_t count, + size_t seek) +{ + iov_iter_bvec(&msg->msg_iter, READ | ITER_BVEC, bvec, nr, count); + return xs_sock_recvmsg(sock, msg, flags, seek); +} + +static ssize_t +xs_read_discard(struct socket *sock, struct msghdr *msg, int flags, + size_t count) +{ + struct kvec kvec = { 0 }; + return xs_read_kvec(sock, msg, flags | MSG_TRUNC, &kvec, count, 0); +} + +static ssize_t +xs_read_xdr_buf(struct socket *sock, struct msghdr *msg, int flags, + struct xdr_buf *buf, size_t count, size_t seek, size_t *read) +{ + size_t want, seek_init = seek, offset = 0; + ssize_t ret; + + if (seek < buf->head[0].iov_len) { + want = min_t(size_t, count, buf->head[0].iov_len); + ret = xs_read_kvec(sock, msg, flags, &buf->head[0], want, seek); + if (ret <= 0) + goto sock_err; + offset += ret; + if (offset == count || msg->msg_flags & (MSG_EOR|MSG_TRUNC)) + goto out; + if (ret != want) + goto eagain; + seek = 0; + } else { + seek -= buf->head[0].iov_len; + offset += buf->head[0].iov_len; + } + if (seek < buf->page_len) { + want = xs_alloc_sparse_pages(buf, + min_t(size_t, count - offset, buf->page_len), + GFP_NOWAIT); + ret = xs_read_bvec(sock, msg, flags, buf->bvec, + xdr_buf_pagecount(buf), + want + buf->page_base, + seek + buf->page_base); + if (ret <= 0) + goto sock_err; + offset += ret - buf->page_base; + if (offset == count || msg->msg_flags & (MSG_EOR|MSG_TRUNC)) + goto out; + if (ret != want) + goto eagain; + seek = 0; + } else { + seek -= buf->page_len; + offset += buf->page_len; + } + if (seek < buf->tail[0].iov_len) { + want = min_t(size_t, count - offset, buf->tail[0].iov_len); + ret = xs_read_kvec(sock, msg, flags, &buf->tail[0], want, seek); + if (ret <= 0) + goto sock_err; + offset += ret; + if (offset == count || msg->msg_flags & (MSG_EOR|MSG_TRUNC)) + goto out; + if (ret != want) + goto eagain; + } else + offset += buf->tail[0].iov_len; + ret = -EMSGSIZE; + msg->msg_flags |= MSG_TRUNC; +out: + *read = offset - seek_init; + return ret; +eagain: + ret = -EAGAIN; + goto out; +sock_err: + offset += seek; + goto out; +} + +static void +xs_read_header(struct sock_xprt *transport, struct xdr_buf *buf) +{ + if (!transport->recv.copied) { + if (buf->head[0].iov_len >= transport->recv.offset) + memcpy(buf->head[0].iov_base, + &transport->recv.xid, + transport->recv.offset); + transport->recv.copied = transport->recv.offset; + } +} + +static bool +xs_read_stream_request_done(struct sock_xprt *transport) +{ + return transport->recv.fraghdr & cpu_to_be32(RPC_LAST_STREAM_FRAGMENT); +} + +static ssize_t +xs_read_stream_request(struct sock_xprt *transport, struct msghdr *msg, + int flags, struct rpc_rqst *req) +{ + struct xdr_buf *buf = &req->rq_private_buf; + size_t want, read; + ssize_t ret; + + xs_read_header(transport, buf); + + want = transport->recv.len - transport->recv.offset; + ret = xs_read_xdr_buf(transport->sock, msg, flags, buf, + transport->recv.copied + want, transport->recv.copied, + &read); + transport->recv.offset += read; + transport->recv.copied += read; + if (transport->recv.offset == transport->recv.len) { + if (xs_read_stream_request_done(transport)) + msg->msg_flags |= MSG_EOR; + return transport->recv.copied; + } + + switch (ret) { + case -EMSGSIZE: + return transport->recv.copied; + case 0: + return -ESHUTDOWN; + default: + if (ret < 0) + return ret; + } + return -EAGAIN; +} + +static size_t +xs_read_stream_headersize(bool isfrag) +{ + if (isfrag) + return sizeof(__be32); + return 3 * sizeof(__be32); +} + +static ssize_t +xs_read_stream_header(struct sock_xprt *transport, struct msghdr *msg, + int flags, size_t want, size_t seek) +{ + struct kvec kvec = { + .iov_base = &transport->recv.fraghdr, + .iov_len = want, + }; + return xs_read_kvec(transport->sock, msg, flags, &kvec, want, seek); +} + +#if defined(CONFIG_SUNRPC_BACKCHANNEL) +static ssize_t +xs_read_stream_call(struct sock_xprt *transport, struct msghdr *msg, int flags) +{ + struct rpc_xprt *xprt = &transport->xprt; + struct rpc_rqst *req; + ssize_t ret; + + /* Look up and lock the request corresponding to the given XID */ + req = xprt_lookup_bc_request(xprt, transport->recv.xid); + if (!req) { + printk(KERN_WARNING "Callback slot table overflowed\n"); + return -ESHUTDOWN; + } + + ret = xs_read_stream_request(transport, msg, flags, req); + if (msg->msg_flags & (MSG_EOR|MSG_TRUNC)) + xprt_complete_bc_request(req, ret); + + return ret; +} +#else /* CONFIG_SUNRPC_BACKCHANNEL */ +static ssize_t +xs_read_stream_call(struct sock_xprt *transport, struct msghdr *msg, int flags) +{ + return -ESHUTDOWN; +} +#endif /* CONFIG_SUNRPC_BACKCHANNEL */ + +static ssize_t +xs_read_stream_reply(struct sock_xprt *transport, struct msghdr *msg, int flags) +{ + struct rpc_xprt *xprt = &transport->xprt; + struct rpc_rqst *req; + ssize_t ret = 0; + + /* Look up and lock the request corresponding to the given XID */ + spin_lock(&xprt->queue_lock); + req = xprt_lookup_rqst(xprt, transport->recv.xid); + if (!req) { + msg->msg_flags |= MSG_TRUNC; + goto out; + } + xprt_pin_rqst(req); + spin_unlock(&xprt->queue_lock); + + ret = xs_read_stream_request(transport, msg, flags, req); + + spin_lock(&xprt->queue_lock); + if (msg->msg_flags & (MSG_EOR|MSG_TRUNC)) + xprt_complete_rqst(req->rq_task, ret); + xprt_unpin_rqst(req); +out: + spin_unlock(&xprt->queue_lock); + return ret; +} + +static ssize_t +xs_read_stream(struct sock_xprt *transport, int flags) +{ + struct msghdr msg = { 0 }; + size_t want, read = 0; + ssize_t ret = 0; + + if (transport->recv.len == 0) { + want = xs_read_stream_headersize(transport->recv.copied != 0); + ret = xs_read_stream_header(transport, &msg, flags, want, + transport->recv.offset); + if (ret <= 0) + goto out_err; + transport->recv.offset = ret; + if (ret != want) { + ret = -EAGAIN; + goto out_err; + } + transport->recv.len = be32_to_cpu(transport->recv.fraghdr) & + RPC_FRAGMENT_SIZE_MASK; + transport->recv.offset -= sizeof(transport->recv.fraghdr); + read = ret; + } + + switch (be32_to_cpu(transport->recv.calldir)) { + case RPC_CALL: + ret = xs_read_stream_call(transport, &msg, flags); + break; + case RPC_REPLY: + ret = xs_read_stream_reply(transport, &msg, flags); + } + if (msg.msg_flags & MSG_TRUNC) { + transport->recv.calldir = cpu_to_be32(-1); + transport->recv.copied = -1; + } + if (ret < 0) + goto out_err; + read += ret; + if (transport->recv.offset < transport->recv.len) { + ret = xs_read_discard(transport->sock, &msg, flags, + transport->recv.len - transport->recv.offset); + if (ret <= 0) + goto out_err; + transport->recv.offset += ret; + read += ret; + if (transport->recv.offset != transport->recv.len) + return -EAGAIN; + } + if (xs_read_stream_request_done(transport)) { + trace_xs_stream_read_request(transport); + transport->recv.copied = 0; + } + transport->recv.offset = 0; + transport->recv.len = 0; + return read; +out_err: + switch (ret) { + case 0: + case -ESHUTDOWN: + xprt_force_disconnect(&transport->xprt); + return -ESHUTDOWN; + } + return ret; +} + +static void xs_stream_data_receive(struct sock_xprt *transport) +{ + size_t read = 0; + ssize_t ret = 0; + + mutex_lock(&transport->recv_mutex); + if (transport->sock == NULL) + goto out; + clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state); + for (;;) { + ret = xs_read_stream(transport, MSG_DONTWAIT); + if (ret <= 0) + break; + read += ret; + cond_resched(); + } +out: + mutex_unlock(&transport->recv_mutex); + trace_xs_stream_read_data(&transport->xprt, ret, read); +} + +static void xs_stream_data_receive_workfn(struct work_struct *work) +{ + struct sock_xprt *transport = + container_of(work, struct sock_xprt, recv_worker); + xs_stream_data_receive(transport); +} + +static void +xs_stream_reset_connect(struct sock_xprt *transport) +{ + transport->recv.offset = 0; + transport->recv.len = 0; + transport->recv.copied = 0; + transport->xmit.offset = 0; + transport->xprt.stat.connect_count++; + transport->xprt.stat.connect_start = jiffies; +} + #define XS_SENDMSG_FLAGS (MSG_DONTWAIT | MSG_NOSIGNAL) static int xs_send_kvec(struct socket *sock, struct sockaddr *addr, int addrlen, struct kvec *vec, unsigned int base, int more) @@ -440,28 +796,21 @@ out: return err; } -static void xs_nospace_callback(struct rpc_task *task) -{ - struct sock_xprt *transport = container_of(task->tk_rqstp->rq_xprt, struct sock_xprt, xprt); - - transport->inet->sk_write_pending--; -} - /** - * xs_nospace - place task on wait queue if transmit was incomplete - * @task: task to put to sleep + * xs_nospace - handle transmit was incomplete + * @req: pointer to RPC request * */ -static int xs_nospace(struct rpc_task *task) +static int xs_nospace(struct rpc_rqst *req) { - struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); struct sock *sk = transport->inet; int ret = -EAGAIN; dprintk("RPC: %5u xmit incomplete (%u left of %u)\n", - task->tk_pid, req->rq_slen - req->rq_bytes_sent, + req->rq_task->tk_pid, + req->rq_slen - transport->xmit.offset, req->rq_slen); /* Protect against races with write_space */ @@ -471,7 +820,7 @@ static int xs_nospace(struct rpc_task *task) if (xprt_connected(xprt)) { /* wait for more buffer space */ sk->sk_write_pending++; - xprt_wait_for_buffer_space(task, xs_nospace_callback); + xprt_wait_for_buffer_space(xprt); } else ret = -ENOTCONN; @@ -491,6 +840,22 @@ static int xs_nospace(struct rpc_task *task) return ret; } +static void +xs_stream_prepare_request(struct rpc_rqst *req) +{ + req->rq_task->tk_status = xdr_alloc_bvec(&req->rq_rcv_buf, GFP_NOIO); +} + +/* + * Determine if the previous message in the stream was aborted before it + * could complete transmission. + */ +static bool +xs_send_request_was_aborted(struct sock_xprt *transport, struct rpc_rqst *req) +{ + return transport->xmit.offset != 0 && req->rq_bytes_sent == 0; +} + /* * Construct a stream transport record marker in @buf. */ @@ -503,7 +868,7 @@ static inline void xs_encode_stream_record_marker(struct xdr_buf *buf) /** * xs_local_send_request - write an RPC request to an AF_LOCAL socket - * @task: RPC task that manages the state of an RPC request + * @req: pointer to RPC request * * Return values: * 0: The request has been sent @@ -512,9 +877,8 @@ static inline void xs_encode_stream_record_marker(struct xdr_buf *buf) * ENOTCONN: Caller needs to invoke connect logic then call again * other: Some other error occured, the request was not sent */ -static int xs_local_send_request(struct rpc_task *task) +static int xs_local_send_request(struct rpc_rqst *req) { - struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); @@ -522,25 +886,34 @@ static int xs_local_send_request(struct rpc_task *task) int status; int sent = 0; + /* Close the stream if the previous transmission was incomplete */ + if (xs_send_request_was_aborted(transport, req)) { + xs_close(xprt); + return -ENOTCONN; + } + xs_encode_stream_record_marker(&req->rq_snd_buf); xs_pktdump("packet data:", req->rq_svec->iov_base, req->rq_svec->iov_len); req->rq_xtime = ktime_get(); - status = xs_sendpages(transport->sock, NULL, 0, xdr, req->rq_bytes_sent, + status = xs_sendpages(transport->sock, NULL, 0, xdr, + transport->xmit.offset, true, &sent); dprintk("RPC: %s(%u) = %d\n", - __func__, xdr->len - req->rq_bytes_sent, status); + __func__, xdr->len - transport->xmit.offset, status); if (status == -EAGAIN && sock_writeable(transport->inet)) status = -ENOBUFS; if (likely(sent > 0) || status == 0) { - req->rq_bytes_sent += sent; - req->rq_xmit_bytes_sent += sent; + transport->xmit.offset += sent; + req->rq_bytes_sent = transport->xmit.offset; if (likely(req->rq_bytes_sent >= req->rq_slen)) { + req->rq_xmit_bytes_sent += transport->xmit.offset; req->rq_bytes_sent = 0; + transport->xmit.offset = 0; return 0; } status = -EAGAIN; @@ -550,7 +923,7 @@ static int xs_local_send_request(struct rpc_task *task) case -ENOBUFS: break; case -EAGAIN: - status = xs_nospace(task); + status = xs_nospace(req); break; default: dprintk("RPC: sendmsg returned unrecognized error %d\n", @@ -566,7 +939,7 @@ static int xs_local_send_request(struct rpc_task *task) /** * xs_udp_send_request - write an RPC request to a UDP socket - * @task: address of RPC task that manages the state of an RPC request + * @req: pointer to RPC request * * Return values: * 0: The request has been sent @@ -575,9 +948,8 @@ static int xs_local_send_request(struct rpc_task *task) * ENOTCONN: Caller needs to invoke connect logic then call again * other: Some other error occurred, the request was not sent */ -static int xs_udp_send_request(struct rpc_task *task) +static int xs_udp_send_request(struct rpc_rqst *req) { - struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); struct xdr_buf *xdr = &req->rq_snd_buf; @@ -590,12 +962,16 @@ static int xs_udp_send_request(struct rpc_task *task) if (!xprt_bound(xprt)) return -ENOTCONN; + + if (!xprt_request_get_cong(xprt, req)) + return -EBADSLT; + req->rq_xtime = ktime_get(); status = xs_sendpages(transport->sock, xs_addr(xprt), xprt->addrlen, - xdr, req->rq_bytes_sent, true, &sent); + xdr, 0, true, &sent); dprintk("RPC: xs_udp_send_request(%u) = %d\n", - xdr->len - req->rq_bytes_sent, status); + xdr->len, status); /* firewall is blocking us, don't return -EAGAIN or we end up looping */ if (status == -EPERM) @@ -619,7 +995,7 @@ process_status: /* Should we call xs_close() here? */ break; case -EAGAIN: - status = xs_nospace(task); + status = xs_nospace(req); break; case -ENETUNREACH: case -ENOBUFS: @@ -639,7 +1015,7 @@ process_status: /** * xs_tcp_send_request - write an RPC request to a TCP socket - * @task: address of RPC task that manages the state of an RPC request + * @req: pointer to RPC request * * Return values: * 0: The request has been sent @@ -651,9 +1027,8 @@ process_status: * XXX: In the case of soft timeouts, should we eventually give up * if sendmsg is not able to make progress? */ -static int xs_tcp_send_request(struct rpc_task *task) +static int xs_tcp_send_request(struct rpc_rqst *req) { - struct rpc_rqst *req = task->tk_rqstp; struct rpc_xprt *xprt = req->rq_xprt; struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); struct xdr_buf *xdr = &req->rq_snd_buf; @@ -662,6 +1037,13 @@ static int xs_tcp_send_request(struct rpc_task *task) int status; int sent; + /* Close the stream if the previous transmission was incomplete */ + if (xs_send_request_was_aborted(transport, req)) { + if (transport->sock != NULL) + kernel_sock_shutdown(transport->sock, SHUT_RDWR); + return -ENOTCONN; + } + xs_encode_stream_record_marker(&req->rq_snd_buf); xs_pktdump("packet data:", @@ -671,7 +1053,7 @@ static int xs_tcp_send_request(struct rpc_task *task) * completes while the socket holds a reference to the pages, * then we may end up resending corrupted data. */ - if (task->tk_flags & RPC_TASK_SENT) + if (req->rq_task->tk_flags & RPC_TASK_SENT) zerocopy = false; if (test_bit(XPRT_SOCK_UPD_TIMEOUT, &transport->sock_state)) @@ -684,17 +1066,20 @@ static int xs_tcp_send_request(struct rpc_task *task) while (1) { sent = 0; status = xs_sendpages(transport->sock, NULL, 0, xdr, - req->rq_bytes_sent, zerocopy, &sent); + transport->xmit.offset, + zerocopy, &sent); dprintk("RPC: xs_tcp_send_request(%u) = %d\n", - xdr->len - req->rq_bytes_sent, status); + xdr->len - transport->xmit.offset, status); /* If we've sent the entire packet, immediately * reset the count of bytes sent. */ - req->rq_bytes_sent += sent; - req->rq_xmit_bytes_sent += sent; + transport->xmit.offset += sent; + req->rq_bytes_sent = transport->xmit.offset; if (likely(req->rq_bytes_sent >= req->rq_slen)) { + req->rq_xmit_bytes_sent += transport->xmit.offset; req->rq_bytes_sent = 0; + transport->xmit.offset = 0; return 0; } @@ -732,7 +1117,7 @@ static int xs_tcp_send_request(struct rpc_task *task) /* Should we call xs_close() here? */ break; case -EAGAIN: - status = xs_nospace(task); + status = xs_nospace(req); break; case -ECONNRESET: case -ECONNREFUSED: @@ -749,35 +1134,6 @@ static int xs_tcp_send_request(struct rpc_task *task) return status; } -/** - * xs_tcp_release_xprt - clean up after a tcp transmission - * @xprt: transport - * @task: rpc task - * - * This cleans up if an error causes us to abort the transmission of a request. - * In this case, the socket may need to be reset in order to avoid confusing - * the server. - */ -static void xs_tcp_release_xprt(struct rpc_xprt *xprt, struct rpc_task *task) -{ - struct rpc_rqst *req; - - if (task != xprt->snd_task) - return; - if (task == NULL) - goto out_release; - req = task->tk_rqstp; - if (req == NULL) - goto out_release; - if (req->rq_bytes_sent == 0) - goto out_release; - if (req->rq_bytes_sent == req->rq_snd_buf.len) - goto out_release; - set_bit(XPRT_CLOSE_WAIT, &xprt->state); -out_release: - xprt_release_xprt(xprt, task); -} - static void xs_save_old_callbacks(struct sock_xprt *transport, struct sock *sk) { transport->old_data_ready = sk->sk_data_ready; @@ -921,114 +1277,6 @@ static void xs_destroy(struct rpc_xprt *xprt) module_put(THIS_MODULE); } -static int xs_local_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb) -{ - struct xdr_skb_reader desc = { - .skb = skb, - .offset = sizeof(rpc_fraghdr), - .count = skb->len - sizeof(rpc_fraghdr), - }; - - if (xdr_partial_copy_from_skb(xdr, 0, &desc, xdr_skb_read_bits) < 0) - return -1; - if (desc.count) - return -1; - return 0; -} - -/** - * xs_local_data_read_skb - * @xprt: transport - * @sk: socket - * @skb: skbuff - * - * Currently this assumes we can read the whole reply in a single gulp. - */ -static void xs_local_data_read_skb(struct rpc_xprt *xprt, - struct sock *sk, - struct sk_buff *skb) -{ - struct rpc_task *task; - struct rpc_rqst *rovr; - int repsize, copied; - u32 _xid; - __be32 *xp; - - repsize = skb->len - sizeof(rpc_fraghdr); - if (repsize < 4) { - dprintk("RPC: impossible RPC reply size %d\n", repsize); - return; - } - - /* Copy the XID from the skb... */ - xp = skb_header_pointer(skb, sizeof(rpc_fraghdr), sizeof(_xid), &_xid); - if (xp == NULL) - return; - - /* Look up and lock the request corresponding to the given XID */ - spin_lock(&xprt->recv_lock); - rovr = xprt_lookup_rqst(xprt, *xp); - if (!rovr) - goto out_unlock; - xprt_pin_rqst(rovr); - spin_unlock(&xprt->recv_lock); - task = rovr->rq_task; - - copied = rovr->rq_private_buf.buflen; - if (copied > repsize) - copied = repsize; - - if (xs_local_copy_to_xdr(&rovr->rq_private_buf, skb)) { - dprintk("RPC: sk_buff copy failed\n"); - spin_lock(&xprt->recv_lock); - goto out_unpin; - } - - spin_lock(&xprt->recv_lock); - xprt_complete_rqst(task, copied); -out_unpin: - xprt_unpin_rqst(rovr); - out_unlock: - spin_unlock(&xprt->recv_lock); -} - -static void xs_local_data_receive(struct sock_xprt *transport) -{ - struct sk_buff *skb; - struct sock *sk; - int err; - -restart: - mutex_lock(&transport->recv_mutex); - sk = transport->inet; - if (sk == NULL) - goto out; - for (;;) { - skb = skb_recv_datagram(sk, 0, 1, &err); - if (skb != NULL) { - xs_local_data_read_skb(&transport->xprt, sk, skb); - skb_free_datagram(sk, skb); - continue; - } - if (!test_and_clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state)) - break; - if (need_resched()) { - mutex_unlock(&transport->recv_mutex); - cond_resched(); - goto restart; - } - } -out: - mutex_unlock(&transport->recv_mutex); -} - -static void xs_local_data_receive_workfn(struct work_struct *work) -{ - struct sock_xprt *transport = - container_of(work, struct sock_xprt, recv_worker); - xs_local_data_receive(transport); -} - /** * xs_udp_data_read_skb - receive callback for UDP sockets * @xprt: transport @@ -1058,13 +1306,13 @@ static void xs_udp_data_read_skb(struct rpc_xprt *xprt, return; /* Look up and lock the request corresponding to the given XID */ - spin_lock(&xprt->recv_lock); + spin_lock(&xprt->queue_lock); rovr = xprt_lookup_rqst(xprt, *xp); if (!rovr) goto out_unlock; xprt_pin_rqst(rovr); xprt_update_rtt(rovr->rq_task); - spin_unlock(&xprt->recv_lock); + spin_unlock(&xprt->queue_lock); task = rovr->rq_task; if ((copied = rovr->rq_private_buf.buflen) > repsize) @@ -1072,7 +1320,7 @@ static void xs_udp_data_read_skb(struct rpc_xprt *xprt, /* Suck it into the iovec, verify checksum if not done by hw. */ if (csum_partial_copy_to_xdr(&rovr->rq_private_buf, skb)) { - spin_lock(&xprt->recv_lock); + spin_lock(&xprt->queue_lock); __UDPX_INC_STATS(sk, UDP_MIB_INERRORS); goto out_unpin; } @@ -1081,13 +1329,13 @@ static void xs_udp_data_read_skb(struct rpc_xprt *xprt, spin_lock_bh(&xprt->transport_lock); xprt_adjust_cwnd(xprt, task, copied); spin_unlock_bh(&xprt->transport_lock); - spin_lock(&xprt->recv_lock); + spin_lock(&xprt->queue_lock); xprt_complete_rqst(task, copied); __UDPX_INC_STATS(sk, UDP_MIB_INDATAGRAMS); out_unpin: xprt_unpin_rqst(rovr); out_unlock: - spin_unlock(&xprt->recv_lock); + spin_unlock(&xprt->queue_lock); } static void xs_udp_data_receive(struct sock_xprt *transport) @@ -1096,25 +1344,18 @@ static void xs_udp_data_receive(struct sock_xprt *transport) struct sock *sk; int err; -restart: mutex_lock(&transport->recv_mutex); sk = transport->inet; if (sk == NULL) goto out; + clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state); for (;;) { skb = skb_recv_udp(sk, 0, 1, &err); - if (skb != NULL) { - xs_udp_data_read_skb(&transport->xprt, sk, skb); - consume_skb(skb); - continue; - } - if (!test_and_clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state)) + if (skb == NULL) break; - if (need_resched()) { - mutex_unlock(&transport->recv_mutex); - cond_resched(); - goto restart; - } + xs_udp_data_read_skb(&transport->xprt, sk, skb); + consume_skb(skb); + cond_resched(); } out: mutex_unlock(&transport->recv_mutex); @@ -1163,263 +1404,7 @@ static void xs_tcp_force_close(struct rpc_xprt *xprt) xprt_force_disconnect(xprt); } -static inline void xs_tcp_read_fraghdr(struct rpc_xprt *xprt, struct xdr_skb_reader *desc) -{ - struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); - size_t len, used; - char *p; - - p = ((char *) &transport->tcp_fraghdr) + transport->tcp_offset; - len = sizeof(transport->tcp_fraghdr) - transport->tcp_offset; - used = xdr_skb_read_bits(desc, p, len); - transport->tcp_offset += used; - if (used != len) - return; - - transport->tcp_reclen = ntohl(transport->tcp_fraghdr); - if (transport->tcp_reclen & RPC_LAST_STREAM_FRAGMENT) - transport->tcp_flags |= TCP_RCV_LAST_FRAG; - else - transport->tcp_flags &= ~TCP_RCV_LAST_FRAG; - transport->tcp_reclen &= RPC_FRAGMENT_SIZE_MASK; - - transport->tcp_flags &= ~TCP_RCV_COPY_FRAGHDR; - transport->tcp_offset = 0; - - /* Sanity check of the record length */ - if (unlikely(transport->tcp_reclen < 8)) { - dprintk("RPC: invalid TCP record fragment length\n"); - xs_tcp_force_close(xprt); - return; - } - dprintk("RPC: reading TCP record fragment of length %d\n", - transport->tcp_reclen); -} - -static void xs_tcp_check_fraghdr(struct sock_xprt *transport) -{ - if (transport->tcp_offset == transport->tcp_reclen) { - transport->tcp_flags |= TCP_RCV_COPY_FRAGHDR; - transport->tcp_offset = 0; - if (transport->tcp_flags & TCP_RCV_LAST_FRAG) { - transport->tcp_flags &= ~TCP_RCV_COPY_DATA; - transport->tcp_flags |= TCP_RCV_COPY_XID; - transport->tcp_copied = 0; - } - } -} - -static inline void xs_tcp_read_xid(struct sock_xprt *transport, struct xdr_skb_reader *desc) -{ - size_t len, used; - char *p; - - len = sizeof(transport->tcp_xid) - transport->tcp_offset; - dprintk("RPC: reading XID (%zu bytes)\n", len); - p = ((char *) &transport->tcp_xid) + transport->tcp_offset; - used = xdr_skb_read_bits(desc, p, len); - transport->tcp_offset += used; - if (used != len) - return; - transport->tcp_flags &= ~TCP_RCV_COPY_XID; - transport->tcp_flags |= TCP_RCV_READ_CALLDIR; - transport->tcp_copied = 4; - dprintk("RPC: reading %s XID %08x\n", - (transport->tcp_flags & TCP_RPC_REPLY) ? "reply for" - : "request with", - ntohl(transport->tcp_xid)); - xs_tcp_check_fraghdr(transport); -} - -static inline void xs_tcp_read_calldir(struct sock_xprt *transport, - struct xdr_skb_reader *desc) -{ - size_t len, used; - u32 offset; - char *p; - - /* - * We want transport->tcp_offset to be 8 at the end of this routine - * (4 bytes for the xid and 4 bytes for the call/reply flag). - * When this function is called for the first time, - * transport->tcp_offset is 4 (after having already read the xid). - */ - offset = transport->tcp_offset - sizeof(transport->tcp_xid); - len = sizeof(transport->tcp_calldir) - offset; - dprintk("RPC: reading CALL/REPLY flag (%zu bytes)\n", len); - p = ((char *) &transport->tcp_calldir) + offset; - used = xdr_skb_read_bits(desc, p, len); - transport->tcp_offset += used; - if (used != len) - return; - transport->tcp_flags &= ~TCP_RCV_READ_CALLDIR; - /* - * We don't yet have the XDR buffer, so we will write the calldir - * out after we get the buffer from the 'struct rpc_rqst' - */ - switch (ntohl(transport->tcp_calldir)) { - case RPC_REPLY: - transport->tcp_flags |= TCP_RCV_COPY_CALLDIR; - transport->tcp_flags |= TCP_RCV_COPY_DATA; - transport->tcp_flags |= TCP_RPC_REPLY; - break; - case RPC_CALL: - transport->tcp_flags |= TCP_RCV_COPY_CALLDIR; - transport->tcp_flags |= TCP_RCV_COPY_DATA; - transport->tcp_flags &= ~TCP_RPC_REPLY; - break; - default: - dprintk("RPC: invalid request message type\n"); - xs_tcp_force_close(&transport->xprt); - } - xs_tcp_check_fraghdr(transport); -} - -static inline void xs_tcp_read_common(struct rpc_xprt *xprt, - struct xdr_skb_reader *desc, - struct rpc_rqst *req) -{ - struct sock_xprt *transport = - container_of(xprt, struct sock_xprt, xprt); - struct xdr_buf *rcvbuf; - size_t len; - ssize_t r; - - rcvbuf = &req->rq_private_buf; - - if (transport->tcp_flags & TCP_RCV_COPY_CALLDIR) { - /* - * Save the RPC direction in the XDR buffer - */ - memcpy(rcvbuf->head[0].iov_base + transport->tcp_copied, - &transport->tcp_calldir, - sizeof(transport->tcp_calldir)); - transport->tcp_copied += sizeof(transport->tcp_calldir); - transport->tcp_flags &= ~TCP_RCV_COPY_CALLDIR; - } - - len = desc->count; - if (len > transport->tcp_reclen - transport->tcp_offset) - desc->count = transport->tcp_reclen - transport->tcp_offset; - r = xdr_partial_copy_from_skb(rcvbuf, transport->tcp_copied, - desc, xdr_skb_read_bits); - - if (desc->count) { - /* Error when copying to the receive buffer, - * usually because we weren't able to allocate - * additional buffer pages. All we can do now - * is turn off TCP_RCV_COPY_DATA, so the request - * will not receive any additional updates, - * and time out. - * Any remaining data from this record will - * be discarded. - */ - transport->tcp_flags &= ~TCP_RCV_COPY_DATA; - dprintk("RPC: XID %08x truncated request\n", - ntohl(transport->tcp_xid)); - dprintk("RPC: xprt = %p, tcp_copied = %lu, " - "tcp_offset = %u, tcp_reclen = %u\n", - xprt, transport->tcp_copied, - transport->tcp_offset, transport->tcp_reclen); - return; - } - - transport->tcp_copied += r; - transport->tcp_offset += r; - desc->count = len - r; - - dprintk("RPC: XID %08x read %zd bytes\n", - ntohl(transport->tcp_xid), r); - dprintk("RPC: xprt = %p, tcp_copied = %lu, tcp_offset = %u, " - "tcp_reclen = %u\n", xprt, transport->tcp_copied, - transport->tcp_offset, transport->tcp_reclen); - - if (transport->tcp_copied == req->rq_private_buf.buflen) - transport->tcp_flags &= ~TCP_RCV_COPY_DATA; - else if (transport->tcp_offset == transport->tcp_reclen) { - if (transport->tcp_flags & TCP_RCV_LAST_FRAG) - transport->tcp_flags &= ~TCP_RCV_COPY_DATA; - } -} - -/* - * Finds the request corresponding to the RPC xid and invokes the common - * tcp read code to read the data. - */ -static inline int xs_tcp_read_reply(struct rpc_xprt *xprt, - struct xdr_skb_reader *desc) -{ - struct sock_xprt *transport = - container_of(xprt, struct sock_xprt, xprt); - struct rpc_rqst *req; - - dprintk("RPC: read reply XID %08x\n", ntohl(transport->tcp_xid)); - - /* Find and lock the request corresponding to this xid */ - spin_lock(&xprt->recv_lock); - req = xprt_lookup_rqst(xprt, transport->tcp_xid); - if (!req) { - dprintk("RPC: XID %08x request not found!\n", - ntohl(transport->tcp_xid)); - spin_unlock(&xprt->recv_lock); - return -1; - } - xprt_pin_rqst(req); - spin_unlock(&xprt->recv_lock); - - xs_tcp_read_common(xprt, desc, req); - - spin_lock(&xprt->recv_lock); - if (!(transport->tcp_flags & TCP_RCV_COPY_DATA)) - xprt_complete_rqst(req->rq_task, transport->tcp_copied); - xprt_unpin_rqst(req); - spin_unlock(&xprt->recv_lock); - return 0; -} - #if defined(CONFIG_SUNRPC_BACKCHANNEL) -/* - * Obtains an rpc_rqst previously allocated and invokes the common - * tcp read code to read the data. The result is placed in the callback - * queue. - * If we're unable to obtain the rpc_rqst we schedule the closing of the - * connection and return -1. - */ -static int xs_tcp_read_callback(struct rpc_xprt *xprt, - struct xdr_skb_reader *desc) -{ - struct sock_xprt *transport = - container_of(xprt, struct sock_xprt, xprt); - struct rpc_rqst *req; - - /* Look up the request corresponding to the given XID */ - req = xprt_lookup_bc_request(xprt, transport->tcp_xid); - if (req == NULL) { - printk(KERN_WARNING "Callback slot table overflowed\n"); - xprt_force_disconnect(xprt); - return -1; - } - - dprintk("RPC: read callback XID %08x\n", ntohl(req->rq_xid)); - xs_tcp_read_common(xprt, desc, req); - - if (!(transport->tcp_flags & TCP_RCV_COPY_DATA)) - xprt_complete_bc_request(req, transport->tcp_copied); - - return 0; -} - -static inline int _xs_tcp_read_data(struct rpc_xprt *xprt, - struct xdr_skb_reader *desc) -{ - struct sock_xprt *transport = - container_of(xprt, struct sock_xprt, xprt); - - return (transport->tcp_flags & TCP_RPC_REPLY) ? - xs_tcp_read_reply(xprt, desc) : - xs_tcp_read_callback(xprt, desc); -} - static int xs_tcp_bc_up(struct svc_serv *serv, struct net *net) { int ret; @@ -1435,145 +1420,8 @@ static size_t xs_tcp_bc_maxpayload(struct rpc_xprt *xprt) { return PAGE_SIZE; } -#else -static inline int _xs_tcp_read_data(struct rpc_xprt *xprt, - struct xdr_skb_reader *desc) -{ - return xs_tcp_read_reply(xprt, desc); -} #endif /* CONFIG_SUNRPC_BACKCHANNEL */ -/* - * Read data off the transport. This can be either an RPC_CALL or an - * RPC_REPLY. Relay the processing to helper functions. - */ -static void xs_tcp_read_data(struct rpc_xprt *xprt, - struct xdr_skb_reader *desc) -{ - struct sock_xprt *transport = - container_of(xprt, struct sock_xprt, xprt); - - if (_xs_tcp_read_data(xprt, desc) == 0) - xs_tcp_check_fraghdr(transport); - else { - /* - * The transport_lock protects the request handling. - * There's no need to hold it to update the tcp_flags. - */ - transport->tcp_flags &= ~TCP_RCV_COPY_DATA; - } -} - -static inline void xs_tcp_read_discard(struct sock_xprt *transport, struct xdr_skb_reader *desc) -{ - size_t len; - - len = transport->tcp_reclen - transport->tcp_offset; - if (len > desc->count) - len = desc->count; - desc->count -= len; - desc->offset += len; - transport->tcp_offset += len; - dprintk("RPC: discarded %zu bytes\n", len); - xs_tcp_check_fraghdr(transport); -} - -static int xs_tcp_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, unsigned int offset, size_t len) -{ - struct rpc_xprt *xprt = rd_desc->arg.data; - struct sock_xprt *transport = container_of(xprt, struct sock_xprt, xprt); - struct xdr_skb_reader desc = { - .skb = skb, - .offset = offset, - .count = len, - }; - size_t ret; - - dprintk("RPC: xs_tcp_data_recv started\n"); - do { - trace_xs_tcp_data_recv(transport); - /* Read in a new fragment marker if necessary */ - /* Can we ever really expect to get completely empty fragments? */ - if (transport->tcp_flags & TCP_RCV_COPY_FRAGHDR) { - xs_tcp_read_fraghdr(xprt, &desc); - continue; - } - /* Read in the xid if necessary */ - if (transport->tcp_flags & TCP_RCV_COPY_XID) { - xs_tcp_read_xid(transport, &desc); - continue; - } - /* Read in the call/reply flag */ - if (transport->tcp_flags & TCP_RCV_READ_CALLDIR) { - xs_tcp_read_calldir(transport, &desc); - continue; - } - /* Read in the request data */ - if (transport->tcp_flags & TCP_RCV_COPY_DATA) { - xs_tcp_read_data(xprt, &desc); - continue; - } - /* Skip over any trailing bytes on short reads */ - xs_tcp_read_discard(transport, &desc); - } while (desc.count); - ret = len - desc.count; - if (ret < rd_desc->count) - rd_desc->count -= ret; - else - rd_desc->count = 0; - trace_xs_tcp_data_recv(transport); - dprintk("RPC: xs_tcp_data_recv done\n"); - return ret; -} - -static void xs_tcp_data_receive(struct sock_xprt *transport) -{ - struct rpc_xprt *xprt = &transport->xprt; - struct sock *sk; - read_descriptor_t rd_desc = { - .arg.data = xprt, - }; - unsigned long total = 0; - int read = 0; - -restart: - mutex_lock(&transport->recv_mutex); - sk = transport->inet; - if (sk == NULL) - goto out; - - /* We use rd_desc to pass struct xprt to xs_tcp_data_recv */ - for (;;) { - rd_desc.count = RPC_TCP_READ_CHUNK_SZ; - lock_sock(sk); - read = tcp_read_sock(sk, &rd_desc, xs_tcp_data_recv); - if (rd_desc.count != 0 || read < 0) { - clear_bit(XPRT_SOCK_DATA_READY, &transport->sock_state); - release_sock(sk); - break; - } - release_sock(sk); - total += read; - if (need_resched()) { - mutex_unlock(&transport->recv_mutex); - cond_resched(); - goto restart; - } - } - if (test_bit(XPRT_SOCK_DATA_READY, &transport->sock_state)) - queue_work(xprtiod_workqueue, &transport->recv_worker); -out: - mutex_unlock(&transport->recv_mutex); - trace_xs_tcp_data_ready(xprt, read, total); -} - -static void xs_tcp_data_receive_workfn(struct work_struct *work) -{ - struct sock_xprt *transport = - container_of(work, struct sock_xprt, recv_worker); - xs_tcp_data_receive(transport); -} - /** * xs_tcp_state_change - callback to handle TCP socket state changes * @sk: socket whose state has changed @@ -1600,17 +1448,13 @@ static void xs_tcp_state_change(struct sock *sk) case TCP_ESTABLISHED: spin_lock(&xprt->transport_lock); if (!xprt_test_and_set_connected(xprt)) { - - /* Reset TCP record info */ - transport->tcp_offset = 0; - transport->tcp_reclen = 0; - transport->tcp_copied = 0; - transport->tcp_flags = - TCP_RCV_COPY_FRAGHDR | TCP_RCV_COPY_XID; xprt->connect_cookie++; clear_bit(XPRT_SOCK_CONNECTING, &transport->sock_state); xprt_clear_connecting(xprt); + xprt->stat.connect_count++; + xprt->stat.connect_time += (long)jiffies - + xprt->stat.connect_start; xprt_wake_pending_tasks(xprt, -EAGAIN); } spin_unlock(&xprt->transport_lock); @@ -1675,7 +1519,8 @@ static void xs_write_space(struct sock *sk) if (!wq || test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &wq->flags) == 0) goto out; - xprt_write_space(xprt); + if (xprt_write_space(xprt)) + sk->sk_write_pending--; out: rcu_read_unlock(); } @@ -1773,11 +1618,17 @@ static void xs_udp_timer(struct rpc_xprt *xprt, struct rpc_task *task) spin_unlock_bh(&xprt->transport_lock); } -static unsigned short xs_get_random_port(void) +static int xs_get_random_port(void) { - unsigned short range = xprt_max_resvport - xprt_min_resvport + 1; - unsigned short rand = (unsigned short) prandom_u32() % range; - return rand + xprt_min_resvport; + unsigned short min = xprt_min_resvport, max = xprt_max_resvport; + unsigned short range; + unsigned short rand; + + if (max < min) + return -EADDRINUSE; + range = max - min + 1; + rand = (unsigned short) prandom_u32() % range; + return rand + min; } /** @@ -1833,9 +1684,9 @@ static void xs_set_srcport(struct sock_xprt *transport, struct socket *sock) transport->srcport = xs_sock_getport(sock); } -static unsigned short xs_get_srcport(struct sock_xprt *transport) +static int xs_get_srcport(struct sock_xprt *transport) { - unsigned short port = transport->srcport; + int port = transport->srcport; if (port == 0 && transport->xprt.resvport) port = xs_get_random_port(); @@ -1856,7 +1707,7 @@ static int xs_bind(struct sock_xprt *transport, struct socket *sock) { struct sockaddr_storage myaddr; int err, nloop = 0; - unsigned short port = xs_get_srcport(transport); + int port = xs_get_srcport(transport); unsigned short last; /* @@ -1874,8 +1725,8 @@ static int xs_bind(struct sock_xprt *transport, struct socket *sock) * transport->xprt.resvport == 1) xs_get_srcport above will * ensure that port is non-zero and we will bind as needed. */ - if (port == 0) - return 0; + if (port <= 0) + return port; memcpy(&myaddr, &transport->srcaddr, transport->xprt.addrlen); do { @@ -2028,9 +1879,8 @@ static int xs_local_finish_connecting(struct rpc_xprt *xprt, write_unlock_bh(&sk->sk_callback_lock); } - /* Tell the socket layer to start connecting... */ - xprt->stat.connect_count++; - xprt->stat.connect_start = jiffies; + xs_stream_reset_connect(transport); + return kernel_connect(sock, xs_addr(xprt), xprt->addrlen, 0); } @@ -2062,6 +1912,9 @@ static int xs_local_setup_socket(struct sock_xprt *transport) case 0: dprintk("RPC: xprt %p connected to %s\n", xprt, xprt->address_strings[RPC_DISPLAY_ADDR]); + xprt->stat.connect_count++; + xprt->stat.connect_time += (long)jiffies - + xprt->stat.connect_start; xprt_set_connected(xprt); case -ENOBUFS: break; @@ -2386,9 +2239,10 @@ static int xs_tcp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock) xs_set_memalloc(xprt); + /* Reset TCP record info */ + xs_stream_reset_connect(transport); + /* Tell the socket layer to start connecting... */ - xprt->stat.connect_count++; - xprt->stat.connect_start = jiffies; set_bit(XPRT_SOCK_CONNECTING, &transport->sock_state); ret = kernel_connect(sock, xs_addr(xprt), xprt->addrlen, O_NONBLOCK); switch (ret) { @@ -2561,7 +2415,7 @@ static void xs_local_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) "%llu %llu %lu %llu %llu\n", xprt->stat.bind_count, xprt->stat.connect_count, - xprt->stat.connect_time, + xprt->stat.connect_time / HZ, idle_time, xprt->stat.sends, xprt->stat.recvs, @@ -2616,7 +2470,7 @@ static void xs_tcp_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) transport->srcport, xprt->stat.bind_count, xprt->stat.connect_count, - xprt->stat.connect_time, + xprt->stat.connect_time / HZ, idle_time, xprt->stat.sends, xprt->stat.recvs, @@ -2704,9 +2558,8 @@ static int bc_sendto(struct rpc_rqst *req) /* * The send routine. Borrows from svc_send */ -static int bc_send_request(struct rpc_task *task) +static int bc_send_request(struct rpc_rqst *req) { - struct rpc_rqst *req = task->tk_rqstp; struct svc_xprt *xprt; int len; @@ -2720,12 +2573,7 @@ static int bc_send_request(struct rpc_task *task) * Grab the mutex to serialize data as the connection is shared * with the fore channel */ - if (!mutex_trylock(&xprt->xpt_mutex)) { - rpc_sleep_on(&xprt->xpt_bc_pending, task, NULL); - if (!mutex_trylock(&xprt->xpt_mutex)) - return -EAGAIN; - rpc_wake_up_queued_task(&xprt->xpt_bc_pending, task); - } + mutex_lock(&xprt->xpt_mutex); if (test_bit(XPT_DEAD, &xprt->xpt_flags)) len = -ENOTCONN; else @@ -2761,7 +2609,7 @@ static void bc_destroy(struct rpc_xprt *xprt) static const struct rpc_xprt_ops xs_local_ops = { .reserve_xprt = xprt_reserve_xprt, - .release_xprt = xs_tcp_release_xprt, + .release_xprt = xprt_release_xprt, .alloc_slot = xprt_alloc_slot, .free_slot = xprt_free_slot, .rpcbind = xs_local_rpcbind, @@ -2769,6 +2617,7 @@ static const struct rpc_xprt_ops xs_local_ops = { .connect = xs_local_connect, .buf_alloc = rpc_malloc, .buf_free = rpc_free, + .prepare_request = xs_stream_prepare_request, .send_request = xs_local_send_request, .set_retrans_timeout = xprt_set_retrans_timeout_def, .close = xs_close, @@ -2803,14 +2652,15 @@ static const struct rpc_xprt_ops xs_udp_ops = { static const struct rpc_xprt_ops xs_tcp_ops = { .reserve_xprt = xprt_reserve_xprt, - .release_xprt = xs_tcp_release_xprt, - .alloc_slot = xprt_lock_and_alloc_slot, + .release_xprt = xprt_release_xprt, + .alloc_slot = xprt_alloc_slot, .free_slot = xprt_free_slot, .rpcbind = rpcb_getport_async, .set_port = xs_set_port, .connect = xs_connect, .buf_alloc = rpc_malloc, .buf_free = rpc_free, + .prepare_request = xs_stream_prepare_request, .send_request = xs_tcp_send_request, .set_retrans_timeout = xprt_set_retrans_timeout_def, .close = xs_tcp_shutdown, @@ -2952,9 +2802,8 @@ static struct rpc_xprt *xs_setup_local(struct xprt_create *args) xprt->ops = &xs_local_ops; xprt->timeout = &xs_local_default_timeout; - INIT_WORK(&transport->recv_worker, xs_local_data_receive_workfn); - INIT_DELAYED_WORK(&transport->connect_worker, - xs_dummy_setup_socket); + INIT_WORK(&transport->recv_worker, xs_stream_data_receive_workfn); + INIT_DELAYED_WORK(&transport->connect_worker, xs_dummy_setup_socket); switch (sun->sun_family) { case AF_LOCAL: @@ -3106,7 +2955,7 @@ static struct rpc_xprt *xs_setup_tcp(struct xprt_create *args) xprt->connect_timeout = xprt->timeout->to_initval * (xprt->timeout->to_retries + 1); - INIT_WORK(&transport->recv_worker, xs_tcp_data_receive_workfn); + INIT_WORK(&transport->recv_worker, xs_stream_data_receive_workfn); INIT_DELAYED_WORK(&transport->connect_worker, xs_tcp_setup_socket); switch (addr->sa_family) { @@ -3317,12 +3166,8 @@ static int param_set_uint_minmax(const char *val, static int param_set_portnr(const char *val, const struct kernel_param *kp) { - if (kp->arg == &xprt_min_resvport) - return param_set_uint_minmax(val, kp, - RPC_MIN_RESVPORT, - xprt_max_resvport); return param_set_uint_minmax(val, kp, - xprt_min_resvport, + RPC_MIN_RESVPORT, RPC_MAX_RESVPORT); } |