diff options
Diffstat (limited to 'net')
406 files changed, 10591 insertions, 6402 deletions
diff --git a/net/802/garp.c b/net/802/garp.c index f6012f8e59f0..fc9eb02a912f 100644 --- a/net/802/garp.c +++ b/net/802/garp.c @@ -407,7 +407,7 @@ static void garp_join_timer_arm(struct garp_applicant *app) { unsigned long delay; - delay = (u64)msecs_to_jiffies(garp_join_time) * prandom_u32() >> 32; + delay = prandom_u32_max(msecs_to_jiffies(garp_join_time)); mod_timer(&app->join_timer, jiffies + delay); } diff --git a/net/802/mrp.c b/net/802/mrp.c index 35e04cc5390c..155f74d8b14f 100644 --- a/net/802/mrp.c +++ b/net/802/mrp.c @@ -592,7 +592,7 @@ static void mrp_join_timer_arm(struct mrp_applicant *app) { unsigned long delay; - delay = (u64)msecs_to_jiffies(mrp_join_time) * prandom_u32() >> 32; + delay = prandom_u32_max(msecs_to_jiffies(mrp_join_time)); mod_timer(&app->join_timer, jiffies + delay); } diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c index e1bb41a443c4..296d0145932f 100644 --- a/net/8021q/vlan_dev.c +++ b/net/8021q/vlan_dev.c @@ -712,13 +712,13 @@ static void vlan_dev_get_stats64(struct net_device *dev, p = per_cpu_ptr(vlan_dev_priv(dev)->vlan_pcpu_stats, i); do { - start = u64_stats_fetch_begin_irq(&p->syncp); + start = u64_stats_fetch_begin(&p->syncp); rxpackets = u64_stats_read(&p->rx_packets); rxbytes = u64_stats_read(&p->rx_bytes); rxmulticast = u64_stats_read(&p->rx_multicast); txpackets = u64_stats_read(&p->tx_packets); txbytes = u64_stats_read(&p->tx_bytes); - } while (u64_stats_fetch_retry_irq(&p->syncp, start)); + } while (u64_stats_fetch_retry(&p->syncp, start)); stats->rx_packets += rxpackets; stats->rx_bytes += rxbytes; diff --git a/net/9p/client.c b/net/9p/client.c index 0a6110e15d0f..aaa37b07e30a 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -255,24 +255,42 @@ static struct kmem_cache *p9_req_cache; * p9_tag_alloc - Allocate a new request. * @c: Client session. * @type: Transaction type. - * @max_size: Maximum packet size for this request. + * @t_size: Buffer size for holding this request + * (automatic calculation by format template if 0). + * @r_size: Buffer size for holding server's reply on this request + * (automatic calculation by format template if 0). + * @fmt: Format template for assembling 9p request message + * (see p9pdu_vwritef). + * @ap: Variable arguments to be fed to passed format template + * (see p9pdu_vwritef). * * Context: Process context. * Return: Pointer to new request. */ static struct p9_req_t * -p9_tag_alloc(struct p9_client *c, int8_t type, unsigned int max_size) +p9_tag_alloc(struct p9_client *c, int8_t type, uint t_size, uint r_size, + const char *fmt, va_list ap) { struct p9_req_t *req = kmem_cache_alloc(p9_req_cache, GFP_NOFS); - int alloc_msize = min(c->msize, max_size); + int alloc_tsize; + int alloc_rsize; int tag; + va_list apc; + + va_copy(apc, ap); + alloc_tsize = min_t(size_t, c->msize, + t_size ?: p9_msg_buf_size(c, type, fmt, apc)); + va_end(apc); + + alloc_rsize = min_t(size_t, c->msize, + r_size ?: p9_msg_buf_size(c, type + 1, fmt, ap)); if (!req) return ERR_PTR(-ENOMEM); - if (p9_fcall_init(c, &req->tc, alloc_msize)) + if (p9_fcall_init(c, &req->tc, alloc_tsize)) goto free_req; - if (p9_fcall_init(c, &req->rc, alloc_msize)) + if (p9_fcall_init(c, &req->rc, alloc_rsize)) goto free; p9pdu_reset(&req->tc); @@ -592,11 +610,12 @@ static int p9_client_flush(struct p9_client *c, struct p9_req_t *oldreq) } static struct p9_req_t *p9_client_prepare_req(struct p9_client *c, - int8_t type, int req_size, + int8_t type, uint t_size, uint r_size, const char *fmt, va_list ap) { int err; struct p9_req_t *req; + va_list apc; p9_debug(P9_DEBUG_MUX, "client %p op %d\n", c, type); @@ -608,7 +627,9 @@ static struct p9_req_t *p9_client_prepare_req(struct p9_client *c, if (c->status == BeginDisconnect && type != P9_TCLUNK) return ERR_PTR(-EIO); - req = p9_tag_alloc(c, type, req_size); + va_copy(apc, ap); + req = p9_tag_alloc(c, type, t_size, r_size, fmt, apc); + va_end(apc); if (IS_ERR(req)) return req; @@ -643,9 +664,18 @@ p9_client_rpc(struct p9_client *c, int8_t type, const char *fmt, ...) int sigpending, err; unsigned long flags; struct p9_req_t *req; + /* Passing zero for tsize/rsize to p9_client_prepare_req() tells it to + * auto determine an appropriate (small) request/response size + * according to actual message data being sent. Currently RDMA + * transport is excluded from this response message size optimization, + * as it would not cope with it, due to its pooled response buffers + * (using an optimized request size for RDMA as well though). + */ + const uint tsize = 0; + const uint rsize = c->trans_mod->pooled_rbuffers ? c->msize : 0; va_start(ap, fmt); - req = p9_client_prepare_req(c, type, c->msize, fmt, ap); + req = p9_client_prepare_req(c, type, tsize, rsize, fmt, ap); va_end(ap); if (IS_ERR(req)) return req; @@ -743,7 +773,7 @@ static struct p9_req_t *p9_client_zc_rpc(struct p9_client *c, int8_t type, /* We allocate a inline protocol data of only 4k bytes. * The actual content is passed in zero-copy fashion. */ - req = p9_client_prepare_req(c, type, P9_ZC_HDR_SZ, fmt, ap); + req = p9_client_prepare_req(c, type, P9_ZC_HDR_SZ, P9_ZC_HDR_SZ, fmt, ap); va_end(ap); if (IS_ERR(req)) return req; diff --git a/net/9p/protocol.c b/net/9p/protocol.c index 83694c631989..4e3a2a1ffcb3 100644 --- a/net/9p/protocol.c +++ b/net/9p/protocol.c @@ -23,6 +23,173 @@ #include <trace/events/9p.h> +/* len[2] text[len] */ +#define P9_STRLEN(s) \ + (2 + min_t(size_t, s ? strlen(s) : 0, USHRT_MAX)) + +/** + * p9_msg_buf_size - Returns a buffer size sufficiently large to hold the + * intended 9p message. + * @c: client + * @type: message type + * @fmt: format template for assembling request message + * (see p9pdu_vwritef) + * @ap: variable arguments to be fed to passed format template + * (see p9pdu_vwritef) + * + * Note: Even for response types (P9_R*) the format template and variable + * arguments must always be for the originating request type (P9_T*). + */ +size_t p9_msg_buf_size(struct p9_client *c, enum p9_msg_t type, + const char *fmt, va_list ap) +{ + /* size[4] type[1] tag[2] */ + const int hdr = 4 + 1 + 2; + /* ename[s] errno[4] */ + const int rerror_size = hdr + P9_ERRMAX + 4; + /* ecode[4] */ + const int rlerror_size = hdr + 4; + const int err_size = + c->proto_version == p9_proto_2000L ? rlerror_size : rerror_size; + + static_assert(NAME_MAX <= 4*1024, "p9_msg_buf_size() currently assumes " + "a max. allowed directory entry name length of 4k"); + + switch (type) { + + /* message types not used at all */ + case P9_TERROR: + case P9_TLERROR: + case P9_TAUTH: + case P9_RAUTH: + BUG(); + + /* variable length & potentially large message types */ + case P9_TATTACH: + BUG_ON(strcmp("ddss?u", fmt)); + va_arg(ap, int32_t); + va_arg(ap, int32_t); + { + const char *uname = va_arg(ap, const char *); + const char *aname = va_arg(ap, const char *); + /* fid[4] afid[4] uname[s] aname[s] n_uname[4] */ + return hdr + 4 + 4 + P9_STRLEN(uname) + P9_STRLEN(aname) + 4; + } + case P9_TWALK: + BUG_ON(strcmp("ddT", fmt)); + va_arg(ap, int32_t); + va_arg(ap, int32_t); + { + uint i, nwname = va_arg(ap, int); + size_t wname_all; + const char **wnames = va_arg(ap, const char **); + for (i = 0, wname_all = 0; i < nwname; ++i) { + wname_all += P9_STRLEN(wnames[i]); + } + /* fid[4] newfid[4] nwname[2] nwname*(wname[s]) */ + return hdr + 4 + 4 + 2 + wname_all; + } + case P9_RWALK: + BUG_ON(strcmp("ddT", fmt)); + va_arg(ap, int32_t); + va_arg(ap, int32_t); + { + uint nwname = va_arg(ap, int); + /* nwqid[2] nwqid*(wqid[13]) */ + return max_t(size_t, hdr + 2 + nwname * 13, err_size); + } + case P9_TCREATE: + BUG_ON(strcmp("dsdb?s", fmt)); + va_arg(ap, int32_t); + { + const char *name = va_arg(ap, const char *); + if (c->proto_version == p9_proto_legacy) { + /* fid[4] name[s] perm[4] mode[1] */ + return hdr + 4 + P9_STRLEN(name) + 4 + 1; + } else { + va_arg(ap, int32_t); + va_arg(ap, int); + { + const char *ext = va_arg(ap, const char *); + /* fid[4] name[s] perm[4] mode[1] extension[s] */ + return hdr + 4 + P9_STRLEN(name) + 4 + 1 + P9_STRLEN(ext); + } + } + } + case P9_TLCREATE: + BUG_ON(strcmp("dsddg", fmt)); + va_arg(ap, int32_t); + { + const char *name = va_arg(ap, const char *); + /* fid[4] name[s] flags[4] mode[4] gid[4] */ + return hdr + 4 + P9_STRLEN(name) + 4 + 4 + 4; + } + case P9_RREAD: + case P9_RREADDIR: + BUG_ON(strcmp("dqd", fmt)); + va_arg(ap, int32_t); + va_arg(ap, int64_t); + { + const int32_t count = va_arg(ap, int32_t); + /* count[4] data[count] */ + return max_t(size_t, hdr + 4 + count, err_size); + } + case P9_TWRITE: + BUG_ON(strcmp("dqV", fmt)); + va_arg(ap, int32_t); + va_arg(ap, int64_t); + { + const int32_t count = va_arg(ap, int32_t); + /* fid[4] offset[8] count[4] data[count] */ + return hdr + 4 + 8 + 4 + count; + } + case P9_TRENAMEAT: + BUG_ON(strcmp("dsds", fmt)); + va_arg(ap, int32_t); + { + const char *oldname, *newname; + oldname = va_arg(ap, const char *); + va_arg(ap, int32_t); + newname = va_arg(ap, const char *); + /* olddirfid[4] oldname[s] newdirfid[4] newname[s] */ + return hdr + 4 + P9_STRLEN(oldname) + 4 + P9_STRLEN(newname); + } + case P9_TSYMLINK: + BUG_ON(strcmp("dssg", fmt)); + va_arg(ap, int32_t); + { + const char *name = va_arg(ap, const char *); + const char *symtgt = va_arg(ap, const char *); + /* fid[4] name[s] symtgt[s] gid[4] */ + return hdr + 4 + P9_STRLEN(name) + P9_STRLEN(symtgt) + 4; + } + + case P9_RERROR: + return rerror_size; + case P9_RLERROR: + return rlerror_size; + + /* small message types */ + case P9_TWSTAT: + case P9_RSTAT: + case P9_RREADLINK: + case P9_TXATTRWALK: + case P9_TXATTRCREATE: + case P9_TLINK: + case P9_TMKDIR: + case P9_TMKNOD: + case P9_TRENAME: + case P9_TUNLINKAT: + case P9_TLOCK: + return 8 * 1024; + + /* tiny message types */ + default: + return 4 * 1024; + + } +} + static int p9pdu_writef(struct p9_fcall *pdu, int proto_version, const char *fmt, ...); diff --git a/net/9p/protocol.h b/net/9p/protocol.h index 6d719c30331a..ad2283d1f96b 100644 --- a/net/9p/protocol.h +++ b/net/9p/protocol.h @@ -8,6 +8,8 @@ * Copyright (C) 2008 by IBM, Corp. */ +size_t p9_msg_buf_size(struct p9_client *c, enum p9_msg_t type, + const char *fmt, va_list ap); int p9pdu_vwritef(struct p9_fcall *pdu, int proto_version, const char *fmt, va_list ap); int p9pdu_readf(struct p9_fcall *pdu, int proto_version, const char *fmt, ...); diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index e758978b44be..56a186768750 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -91,6 +91,7 @@ struct p9_poll_wait { * @mux_list: list link for mux to manage multiple connections (?) * @client: reference to client instance for this connection * @err: error state + * @req_lock: lock protecting req_list and requests statuses * @req_list: accounting for requests which have been sent * @unsent_req_list: accounting for requests that haven't been sent * @rreq: read request @@ -114,6 +115,7 @@ struct p9_conn { struct list_head mux_list; struct p9_client *client; int err; + spinlock_t req_lock; struct list_head req_list; struct list_head unsent_req_list; struct p9_req_t *rreq; @@ -189,10 +191,10 @@ static void p9_conn_cancel(struct p9_conn *m, int err) p9_debug(P9_DEBUG_ERROR, "mux %p err %d\n", m, err); - spin_lock(&m->client->lock); + spin_lock(&m->req_lock); if (m->err) { - spin_unlock(&m->client->lock); + spin_unlock(&m->req_lock); return; } @@ -205,6 +207,8 @@ static void p9_conn_cancel(struct p9_conn *m, int err) list_move(&req->req_list, &cancel_list); } + spin_unlock(&m->req_lock); + list_for_each_entry_safe(req, rtmp, &cancel_list, req_list) { p9_debug(P9_DEBUG_ERROR, "call back req %p\n", req); list_del(&req->req_list); @@ -212,7 +216,6 @@ static void p9_conn_cancel(struct p9_conn *m, int err) req->t_err = err; p9_client_cb(m->client, req, REQ_STATUS_ERROR); } - spin_unlock(&m->client->lock); } static __poll_t @@ -359,7 +362,7 @@ static void p9_read_work(struct work_struct *work) if ((m->rreq) && (m->rc.offset == m->rc.capacity)) { p9_debug(P9_DEBUG_TRANS, "got new packet\n"); m->rreq->rc.size = m->rc.offset; - spin_lock(&m->client->lock); + spin_lock(&m->req_lock); if (m->rreq->status == REQ_STATUS_SENT) { list_del(&m->rreq->req_list); p9_client_cb(m->client, m->rreq, REQ_STATUS_RCVD); @@ -368,14 +371,14 @@ static void p9_read_work(struct work_struct *work) p9_debug(P9_DEBUG_TRANS, "Ignore replies associated with a cancelled request\n"); } else { - spin_unlock(&m->client->lock); + spin_unlock(&m->req_lock); p9_debug(P9_DEBUG_ERROR, "Request tag %d errored out while we were reading the reply\n", m->rc.tag); err = -EIO; goto error; } - spin_unlock(&m->client->lock); + spin_unlock(&m->req_lock); m->rc.sdata = NULL; m->rc.offset = 0; m->rc.capacity = 0; @@ -453,10 +456,10 @@ static void p9_write_work(struct work_struct *work) } if (!m->wsize) { - spin_lock(&m->client->lock); + spin_lock(&m->req_lock); if (list_empty(&m->unsent_req_list)) { clear_bit(Wworksched, &m->wsched); - spin_unlock(&m->client->lock); + spin_unlock(&m->req_lock); return; } @@ -471,7 +474,7 @@ static void p9_write_work(struct work_struct *work) m->wpos = 0; p9_req_get(req); m->wreq = req; - spin_unlock(&m->client->lock); + spin_unlock(&m->req_lock); } p9_debug(P9_DEBUG_TRANS, "mux %p pos %d size %d\n", @@ -588,6 +591,7 @@ static void p9_conn_create(struct p9_client *client) INIT_LIST_HEAD(&m->mux_list); m->client = client; + spin_lock_init(&m->req_lock); INIT_LIST_HEAD(&m->req_list); INIT_LIST_HEAD(&m->unsent_req_list); INIT_WORK(&m->rq, p9_read_work); @@ -669,10 +673,10 @@ static int p9_fd_request(struct p9_client *client, struct p9_req_t *req) if (m->err < 0) return m->err; - spin_lock(&client->lock); + spin_lock(&m->req_lock); req->status = REQ_STATUS_UNSENT; list_add_tail(&req->req_list, &m->unsent_req_list); - spin_unlock(&client->lock); + spin_unlock(&m->req_lock); if (test_and_clear_bit(Wpending, &m->wsched)) n = EPOLLOUT; @@ -687,11 +691,13 @@ static int p9_fd_request(struct p9_client *client, struct p9_req_t *req) static int p9_fd_cancel(struct p9_client *client, struct p9_req_t *req) { + struct p9_trans_fd *ts = client->trans; + struct p9_conn *m = &ts->conn; int ret = 1; p9_debug(P9_DEBUG_TRANS, "client %p req %p\n", client, req); - spin_lock(&client->lock); + spin_lock(&m->req_lock); if (req->status == REQ_STATUS_UNSENT) { list_del(&req->req_list); @@ -699,21 +705,24 @@ static int p9_fd_cancel(struct p9_client *client, struct p9_req_t *req) p9_req_put(client, req); ret = 0; } - spin_unlock(&client->lock); + spin_unlock(&m->req_lock); return ret; } static int p9_fd_cancelled(struct p9_client *client, struct p9_req_t *req) { + struct p9_trans_fd *ts = client->trans; + struct p9_conn *m = &ts->conn; + p9_debug(P9_DEBUG_TRANS, "client %p req %p\n", client, req); - spin_lock(&client->lock); + spin_lock(&m->req_lock); /* Ignore cancelled request if message has been received * before lock. */ if (req->status == REQ_STATUS_RCVD) { - spin_unlock(&client->lock); + spin_unlock(&m->req_lock); return 0; } @@ -722,7 +731,8 @@ static int p9_fd_cancelled(struct p9_client *client, struct p9_req_t *req) */ list_del(&req->req_list); req->status = REQ_STATUS_FLSHD; - spin_unlock(&client->lock); + spin_unlock(&m->req_lock); + p9_req_put(client, req); return 0; @@ -821,11 +831,14 @@ static int p9_fd_open(struct p9_client *client, int rfd, int wfd) goto out_free_ts; if (!(ts->rd->f_mode & FMODE_READ)) goto out_put_rd; + /* prevent workers from hanging on IO when fd is a pipe */ + ts->rd->f_flags |= O_NONBLOCK; ts->wr = fget(wfd); if (!ts->wr) goto out_put_rd; if (!(ts->wr->f_mode & FMODE_WRITE)) goto out_put_wr; + ts->wr->f_flags |= O_NONBLOCK; client->trans = ts; client->status = Connected; @@ -1061,7 +1074,9 @@ p9_fd_create(struct p9_client *client, const char *addr, char *args) int err; struct p9_fd_opts opts; - parse_opts(args, &opts); + err = parse_opts(args, &opts); + if (err < 0) + return err; client->trans_opts.fd.rfd = opts.rfd; client->trans_opts.fd.wfd = opts.wfd; @@ -1082,6 +1097,7 @@ p9_fd_create(struct p9_client *client, const char *addr, char *args) static struct p9_trans_module p9_tcp_trans = { .name = "tcp", .maxsize = MAX_SOCK_BUF, + .pooled_rbuffers = false, .def = 0, .create = p9_fd_create_tcp, .close = p9_fd_close, diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c index d817d3745238..6ff706760676 100644 --- a/net/9p/trans_rdma.c +++ b/net/9p/trans_rdma.c @@ -739,6 +739,7 @@ error: static struct p9_trans_module p9_rdma_trans = { .name = "rdma", .maxsize = P9_RDMA_MAXSIZE, + .pooled_rbuffers = true, .def = 0, .owner = THIS_MODULE, .create = rdma_create_trans, diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index b84d35cf6899..e757f0601304 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -802,6 +802,7 @@ static struct p9_trans_module p9_virtio_trans = { * page in zero copy. */ .maxsize = PAGE_SIZE * (VIRTQUEUE_NUM - 3), + .pooled_rbuffers = false, .def = 1, .owner = THIS_MODULE, }; diff --git a/net/9p/trans_xen.c b/net/9p/trans_xen.c index 227f89cc7237..b15c64128c3e 100644 --- a/net/9p/trans_xen.c +++ b/net/9p/trans_xen.c @@ -246,6 +246,7 @@ static irqreturn_t xen_9pfs_front_event_handler(int irq, void *r) static struct p9_trans_module p9_xen_trans = { .name = "xen", .maxsize = 1 << (XEN_9PFS_RING_ORDER + XEN_PAGE_SHIFT - 2), + .pooled_rbuffers = false, .def = 1, .create = p9_xen_create, .close = p9_xen_close, @@ -510,7 +511,7 @@ static struct xenbus_driver xen_9pfs_front_driver = { .otherend_changed = xen_9pfs_front_changed, }; -static int p9_trans_xen_init(void) +static int __init p9_trans_xen_init(void) { int rc; @@ -529,7 +530,7 @@ static int p9_trans_xen_init(void) module_init(p9_trans_xen_init); MODULE_ALIAS_9P("xen"); -static void p9_trans_xen_exit(void) +static void __exit p9_trans_xen_exit(void) { v9fs_unregister_trans(&p9_xen_trans); return xenbus_unregister_driver(&xen_9pfs_front_driver); diff --git a/net/Kconfig.debug b/net/Kconfig.debug index e6ae11cc2fb7..5e3fffe707dd 100644 --- a/net/Kconfig.debug +++ b/net/Kconfig.debug @@ -2,7 +2,7 @@ config NET_DEV_REFCNT_TRACKER bool "Enable net device refcount tracking" - depends on DEBUG_KERNEL && STACKTRACE_SUPPORT + depends on DEBUG_KERNEL && STACKTRACE_SUPPORT && NET select REF_TRACKER default n help @@ -11,7 +11,7 @@ config NET_DEV_REFCNT_TRACKER config NET_NS_REFCNT_TRACKER bool "Enable networking namespace refcount tracking" - depends on DEBUG_KERNEL && STACKTRACE_SUPPORT + depends on DEBUG_KERNEL && STACKTRACE_SUPPORT && NET select REF_TRACKER default n help diff --git a/net/atm/mpoa_proc.c b/net/atm/mpoa_proc.c index 829db9eba0cb..aaf64b953915 100644 --- a/net/atm/mpoa_proc.c +++ b/net/atm/mpoa_proc.c @@ -219,11 +219,12 @@ static ssize_t proc_mpc_write(struct file *file, const char __user *buff, if (!page) return -ENOMEM; - for (p = page, len = 0; len < nbytes; p++, len++) { + for (p = page, len = 0; len < nbytes; p++) { if (get_user(*p, buff++)) { free_page((unsigned long)page); return -EFAULT; } + len += 1; if (*p == '\0' || *p == '\n') break; } diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c index b6db999abf75..f1741fbfb617 100644 --- a/net/batman-adv/bat_v_elp.c +++ b/net/batman-adv/bat_v_elp.c @@ -125,7 +125,6 @@ static u32 batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh) /* if not a wifi interface, check if this device provides data via * ethtool (e.g. an Ethernet adapter) */ - memset(&link_settings, 0, sizeof(link_settings)); rtnl_lock(); ret = __ethtool_get_link_ksettings(hard_iface->net_dev, &link_settings); rtnl_unlock(); diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c index b8f8da7ee3de..41c1ad33d009 100644 --- a/net/batman-adv/hard-interface.c +++ b/net/batman-adv/hard-interface.c @@ -10,6 +10,7 @@ #include <linux/atomic.h> #include <linux/byteorder/generic.h> #include <linux/container_of.h> +#include <linux/errno.h> #include <linux/gfp.h> #include <linux/if.h> #include <linux/if_arp.h> @@ -700,6 +701,9 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface, int max_header_len = batadv_max_header_len(); int ret; + if (hard_iface->net_dev->mtu < ETH_MIN_MTU + max_header_len) + return -EINVAL; + if (hard_iface->if_status != BATADV_IF_NOT_IN_USE) goto out; diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h index 23f3d53f4b51..c48803b32bb0 100644 --- a/net/batman-adv/main.h +++ b/net/batman-adv/main.h @@ -13,7 +13,7 @@ #define BATADV_DRIVER_DEVICE "batman-adv" #ifndef BATADV_SOURCE_VERSION -#define BATADV_SOURCE_VERSION "2022.2" +#define BATADV_SOURCE_VERSION "2022.3" #endif /* B.A.T.M.A.N. parameters */ diff --git a/net/batman-adv/trace.h b/net/batman-adv/trace.h index 31c8f922651d..5dd52bc5cabb 100644 --- a/net/batman-adv/trace.h +++ b/net/batman-adv/trace.h @@ -9,8 +9,6 @@ #include "main.h" -#include <linux/bug.h> -#include <linux/kernel.h> #include <linux/netdevice.h> #include <linux/percpu.h> #include <linux/printk.h> diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h index 2be5d4a712c5..758cd797a063 100644 --- a/net/batman-adv/types.h +++ b/net/batman-adv/types.h @@ -1740,45 +1740,6 @@ struct batadv_priv { #endif }; -/** - * struct batadv_socket_client - layer2 icmp socket client data - */ -struct batadv_socket_client { - /** - * @queue_list: packet queue for packets destined for this socket client - */ - struct list_head queue_list; - - /** @queue_len: number of packets in the packet queue (queue_list) */ - unsigned int queue_len; - - /** @index: socket client's index in the batadv_socket_client_hash */ - unsigned char index; - - /** @lock: lock protecting queue_list, queue_len & index */ - spinlock_t lock; - - /** @queue_wait: socket client's wait queue */ - wait_queue_head_t queue_wait; - - /** @bat_priv: pointer to soft_iface this client belongs to */ - struct batadv_priv *bat_priv; -}; - -/** - * struct batadv_socket_packet - layer2 icmp packet for socket client - */ -struct batadv_socket_packet { - /** @list: list node for &batadv_socket_client.queue_list */ - struct list_head list; - - /** @icmp_len: size of the layer2 icmp packet */ - size_t icmp_len; - - /** @icmp_packet: layer2 icmp packet */ - u8 icmp_packet[BATADV_ICMP_MAX_PACKET_SIZE]; -}; - #ifdef CONFIG_BATMAN_ADV_BLA /** diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c index 9777e7b109ee..7a59c4487050 100644 --- a/net/bluetooth/hci_conn.c +++ b/net/bluetooth/hci_conn.c @@ -44,6 +44,11 @@ struct sco_param { u8 retrans_effort; }; +struct conn_handle_t { + struct hci_conn *conn; + __u16 handle; +}; + static const struct sco_param esco_param_cvsd[] = { { EDR_ESCO_MASK & ~ESCO_2EV3, 0x000a, 0x01 }, /* S3 */ { EDR_ESCO_MASK & ~ESCO_2EV3, 0x0007, 0x01 }, /* S2 */ @@ -316,17 +321,60 @@ static bool find_next_esco_param(struct hci_conn *conn, return conn->attempt <= size; } -static bool hci_enhanced_setup_sync_conn(struct hci_conn *conn, __u16 handle) +static int configure_datapath_sync(struct hci_dev *hdev, struct bt_codec *codec) { - struct hci_dev *hdev = conn->hdev; + int err; + __u8 vnd_len, *vnd_data = NULL; + struct hci_op_configure_data_path *cmd = NULL; + + err = hdev->get_codec_config_data(hdev, ESCO_LINK, codec, &vnd_len, + &vnd_data); + if (err < 0) + goto error; + + cmd = kzalloc(sizeof(*cmd) + vnd_len, GFP_KERNEL); + if (!cmd) { + err = -ENOMEM; + goto error; + } + + err = hdev->get_data_path_id(hdev, &cmd->data_path_id); + if (err < 0) + goto error; + + cmd->vnd_len = vnd_len; + memcpy(cmd->vnd_data, vnd_data, vnd_len); + + cmd->direction = 0x00; + __hci_cmd_sync_status(hdev, HCI_CONFIGURE_DATA_PATH, + sizeof(*cmd) + vnd_len, cmd, HCI_CMD_TIMEOUT); + + cmd->direction = 0x01; + err = __hci_cmd_sync_status(hdev, HCI_CONFIGURE_DATA_PATH, + sizeof(*cmd) + vnd_len, cmd, + HCI_CMD_TIMEOUT); +error: + + kfree(cmd); + kfree(vnd_data); + return err; +} + +static int hci_enhanced_setup_sync(struct hci_dev *hdev, void *data) +{ + struct conn_handle_t *conn_handle = data; + struct hci_conn *conn = conn_handle->conn; + __u16 handle = conn_handle->handle; struct hci_cp_enhanced_setup_sync_conn cp; const struct sco_param *param; + kfree(conn_handle); + bt_dev_dbg(hdev, "hcon %p", conn); /* for offload use case, codec needs to configured before opening SCO */ if (conn->codec.data_path) - hci_req_configure_datapath(hdev, &conn->codec); + configure_datapath_sync(hdev, &conn->codec); conn->state = BT_CONNECT; conn->out = true; @@ -344,7 +392,7 @@ static bool hci_enhanced_setup_sync_conn(struct hci_conn *conn, __u16 handle) case BT_CODEC_MSBC: if (!find_next_esco_param(conn, esco_param_msbc, ARRAY_SIZE(esco_param_msbc))) - return false; + return -EINVAL; param = &esco_param_msbc[conn->attempt - 1]; cp.tx_coding_format.id = 0x05; @@ -396,11 +444,11 @@ static bool hci_enhanced_setup_sync_conn(struct hci_conn *conn, __u16 handle) if (lmp_esco_capable(conn->link)) { if (!find_next_esco_param(conn, esco_param_cvsd, ARRAY_SIZE(esco_param_cvsd))) - return false; + return -EINVAL; param = &esco_param_cvsd[conn->attempt - 1]; } else { if (conn->attempt > ARRAY_SIZE(sco_param_cvsd)) - return false; + return -EINVAL; param = &sco_param_cvsd[conn->attempt - 1]; } cp.tx_coding_format.id = 2; @@ -423,7 +471,7 @@ static bool hci_enhanced_setup_sync_conn(struct hci_conn *conn, __u16 handle) cp.out_transport_unit_size = 16; break; default: - return false; + return -EINVAL; } cp.retrans_effort = param->retrans_effort; @@ -431,9 +479,9 @@ static bool hci_enhanced_setup_sync_conn(struct hci_conn *conn, __u16 handle) cp.max_latency = __cpu_to_le16(param->max_latency); if (hci_send_cmd(hdev, HCI_OP_ENHANCED_SETUP_SYNC_CONN, sizeof(cp), &cp) < 0) - return false; + return -EIO; - return true; + return 0; } static bool hci_setup_sync_conn(struct hci_conn *conn, __u16 handle) @@ -490,8 +538,24 @@ static bool hci_setup_sync_conn(struct hci_conn *conn, __u16 handle) bool hci_setup_sync(struct hci_conn *conn, __u16 handle) { - if (enhanced_sync_conn_capable(conn->hdev)) - return hci_enhanced_setup_sync_conn(conn, handle); + int result; + struct conn_handle_t *conn_handle; + + if (enhanced_sync_conn_capable(conn->hdev)) { + conn_handle = kzalloc(sizeof(*conn_handle), GFP_KERNEL); + + if (!conn_handle) + return false; + + conn_handle->conn = conn; + conn_handle->handle = handle; + result = hci_cmd_sync_queue(conn->hdev, hci_enhanced_setup_sync, + conn_handle, NULL); + if (result < 0) + kfree(conn_handle); + + return result == 0; + } return hci_setup_sync_conn(conn, handle); } @@ -2696,3 +2760,79 @@ u32 hci_conn_get_phy(struct hci_conn *conn) return phys; } + +int hci_abort_conn(struct hci_conn *conn, u8 reason) +{ + int r = 0; + + switch (conn->state) { + case BT_CONNECTED: + case BT_CONFIG: + if (conn->type == AMP_LINK) { + struct hci_cp_disconn_phy_link cp; + + cp.phy_handle = HCI_PHY_HANDLE(conn->handle); + cp.reason = reason; + r = hci_send_cmd(conn->hdev, HCI_OP_DISCONN_PHY_LINK, + sizeof(cp), &cp); + } else { + struct hci_cp_disconnect dc; + + dc.handle = cpu_to_le16(conn->handle); + dc.reason = reason; + r = hci_send_cmd(conn->hdev, HCI_OP_DISCONNECT, + sizeof(dc), &dc); + } + + conn->state = BT_DISCONN; + + break; + case BT_CONNECT: + if (conn->type == LE_LINK) { + if (test_bit(HCI_CONN_SCANNING, &conn->flags)) + break; + r = hci_send_cmd(conn->hdev, + HCI_OP_LE_CREATE_CONN_CANCEL, 0, NULL); + } else if (conn->type == ACL_LINK) { + if (conn->hdev->hci_ver < BLUETOOTH_VER_1_2) + break; + r = hci_send_cmd(conn->hdev, + HCI_OP_CREATE_CONN_CANCEL, + 6, &conn->dst); + } + break; + case BT_CONNECT2: + if (conn->type == ACL_LINK) { + struct hci_cp_reject_conn_req rej; + + bacpy(&rej.bdaddr, &conn->dst); + rej.reason = reason; + + r = hci_send_cmd(conn->hdev, + HCI_OP_REJECT_CONN_REQ, + sizeof(rej), &rej); + } else if (conn->type == SCO_LINK || conn->type == ESCO_LINK) { + struct hci_cp_reject_sync_conn_req rej; + + bacpy(&rej.bdaddr, &conn->dst); + + /* SCO rejection has its own limited set of + * allowed error values (0x0D-0x0F) which isn't + * compatible with most values passed to this + * function. To be safe hard-code one of the + * values that's suitable for SCO. + */ + rej.reason = HCI_ERROR_REJ_LIMITED_RESOURCES; + + r = hci_send_cmd(conn->hdev, + HCI_OP_REJECT_SYNC_CONN_REQ, + sizeof(rej), &rej); + } + break; + default: + conn->state = BT_CLOSED; + break; + } + + return r; +} diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c index b3a5a3cc9372..0540555b3704 100644 --- a/net/bluetooth/hci_core.c +++ b/net/bluetooth/hci_core.c @@ -597,6 +597,15 @@ static int hci_dev_do_reset(struct hci_dev *hdev) /* Cancel these to avoid queueing non-chained pending work */ hci_dev_set_flag(hdev, HCI_CMD_DRAIN_WORKQUEUE); + /* Wait for + * + * if (!hci_dev_test_flag(hdev, HCI_CMD_DRAIN_WORKQUEUE)) + * queue_delayed_work(&hdev->{cmd,ncmd}_timer) + * + * inside RCU section to see the flag or complete scheduling. + */ + synchronize_rcu(); + /* Explicitly cancel works in case scheduled after setting the flag. */ cancel_delayed_work(&hdev->cmd_timer); cancel_delayed_work(&hdev->ncmd_timer); @@ -714,7 +723,7 @@ static void hci_update_passive_scan_state(struct hci_dev *hdev, u8 scan) hci_dev_set_flag(hdev, HCI_BREDR_ENABLED); if (hci_dev_test_flag(hdev, HCI_LE_ENABLED)) - hci_req_update_adv_data(hdev, hdev->cur_adv_instance); + hci_update_adv_data(hdev, hdev->cur_adv_instance); mgmt_new_settings(hdev); } @@ -1706,7 +1715,8 @@ struct adv_info *hci_add_adv_instance(struct hci_dev *hdev, u8 instance, u32 flags, u16 adv_data_len, u8 *adv_data, u16 scan_rsp_len, u8 *scan_rsp_data, u16 timeout, u16 duration, s8 tx_power, - u32 min_interval, u32 max_interval) + u32 min_interval, u32 max_interval, + u8 mesh_handle) { struct adv_info *adv; @@ -1717,7 +1727,7 @@ struct adv_info *hci_add_adv_instance(struct hci_dev *hdev, u8 instance, memset(adv->per_adv_data, 0, sizeof(adv->per_adv_data)); } else { if (hdev->adv_instance_cnt >= hdev->le_num_of_adv_sets || - instance < 1 || instance > hdev->le_num_of_adv_sets) + instance < 1 || instance > hdev->le_num_of_adv_sets + 1) return ERR_PTR(-EOVERFLOW); adv = kzalloc(sizeof(*adv), GFP_KERNEL); @@ -1734,6 +1744,11 @@ struct adv_info *hci_add_adv_instance(struct hci_dev *hdev, u8 instance, adv->min_interval = min_interval; adv->max_interval = max_interval; adv->tx_power = tx_power; + /* Defining a mesh_handle changes the timing units to ms, + * rather than seconds, and ties the instance to the requested + * mesh_tx queue. + */ + adv->mesh = mesh_handle; hci_set_adv_instance_data(hdev, instance, adv_data_len, adv_data, scan_rsp_len, scan_rsp_data); @@ -1762,7 +1777,7 @@ struct adv_info *hci_add_per_instance(struct hci_dev *hdev, u8 instance, adv = hci_add_adv_instance(hdev, instance, flags, 0, NULL, 0, NULL, 0, 0, HCI_ADV_TX_POWER_NO_PREFERENCE, - min_interval, max_interval); + min_interval, max_interval, 0); if (IS_ERR(adv)) return adv; @@ -2391,6 +2406,10 @@ static int hci_suspend_notifier(struct notifier_block *nb, unsigned long action, container_of(nb, struct hci_dev, suspend_notifier); int ret = 0; + /* Userspace has full control of this device. Do nothing. */ + if (hci_dev_test_flag(hdev, HCI_USER_CHANNEL)) + return NOTIFY_DONE; + if (action == PM_SUSPEND_PREPARE) ret = hci_suspend_dev(hdev); else if (action == PM_POST_SUSPEND) @@ -2486,6 +2505,7 @@ struct hci_dev *hci_alloc_dev_priv(int sizeof_priv) mutex_init(&hdev->lock); mutex_init(&hdev->req_lock); + INIT_LIST_HEAD(&hdev->mesh_pending); INIT_LIST_HEAD(&hdev->mgmt_pending); INIT_LIST_HEAD(&hdev->reject_list); INIT_LIST_HEAD(&hdev->accept_list); @@ -3469,15 +3489,27 @@ static inline int __get_blocks(struct hci_dev *hdev, struct sk_buff *skb) return DIV_ROUND_UP(skb->len - HCI_ACL_HDR_SIZE, hdev->block_len); } -static void __check_timeout(struct hci_dev *hdev, unsigned int cnt) +static void __check_timeout(struct hci_dev *hdev, unsigned int cnt, u8 type) { - if (!hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) { - /* ACL tx timeout must be longer than maximum - * link supervision timeout (40.9 seconds) */ - if (!cnt && time_after(jiffies, hdev->acl_last_tx + - HCI_ACL_TX_TIMEOUT)) - hci_link_tx_to(hdev, ACL_LINK); + unsigned long last_tx; + + if (hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) + return; + + switch (type) { + case LE_LINK: + last_tx = hdev->le_last_tx; + break; + default: + last_tx = hdev->acl_last_tx; + break; } + + /* tx timeout must be longer than maximum link supervision timeout + * (40.9 seconds) + */ + if (!cnt && time_after(jiffies, last_tx + HCI_ACL_TX_TIMEOUT)) + hci_link_tx_to(hdev, type); } /* Schedule SCO */ @@ -3535,7 +3567,7 @@ static void hci_sched_acl_pkt(struct hci_dev *hdev) struct sk_buff *skb; int quote; - __check_timeout(hdev, cnt); + __check_timeout(hdev, cnt, ACL_LINK); while (hdev->acl_cnt && (chan = hci_chan_sent(hdev, ACL_LINK, "e))) { @@ -3578,8 +3610,6 @@ static void hci_sched_acl_blk(struct hci_dev *hdev) int quote; u8 type; - __check_timeout(hdev, cnt); - BT_DBG("%s", hdev->name); if (hdev->dev_type == HCI_AMP) @@ -3587,6 +3617,8 @@ static void hci_sched_acl_blk(struct hci_dev *hdev) else type = ACL_LINK; + __check_timeout(hdev, cnt, type); + while (hdev->block_cnt > 0 && (chan = hci_chan_sent(hdev, type, "e))) { u32 priority = (skb_peek(&chan->data_q))->priority; @@ -3660,7 +3692,7 @@ static void hci_sched_le(struct hci_dev *hdev) cnt = hdev->le_pkts ? hdev->le_cnt : hdev->acl_cnt; - __check_timeout(hdev, cnt); + __check_timeout(hdev, cnt, LE_LINK); tmp = cnt; while (cnt && (chan = hci_chan_sent(hdev, LE_LINK, "e))) { @@ -4056,12 +4088,14 @@ static void hci_cmd_work(struct work_struct *work) if (res < 0) __hci_cmd_sync_cancel(hdev, -res); + rcu_read_lock(); if (test_bit(HCI_RESET, &hdev->flags) || hci_dev_test_flag(hdev, HCI_CMD_DRAIN_WORKQUEUE)) cancel_delayed_work(&hdev->cmd_timer); else - schedule_delayed_work(&hdev->cmd_timer, - HCI_CMD_TIMEOUT); + queue_delayed_work(hdev->workqueue, &hdev->cmd_timer, + HCI_CMD_TIMEOUT); + rcu_read_unlock(); } else { skb_queue_head(&hdev->cmd_q, skb); queue_work(hdev->workqueue, &hdev->cmd_work); diff --git a/net/bluetooth/hci_debugfs.c b/net/bluetooth/hci_debugfs.c index 902b40a90b91..3f401ec5bb0c 100644 --- a/net/bluetooth/hci_debugfs.c +++ b/net/bluetooth/hci_debugfs.c @@ -1245,7 +1245,7 @@ void hci_debugfs_create_conn(struct hci_conn *conn) struct hci_dev *hdev = conn->hdev; char name[6]; - if (IS_ERR_OR_NULL(hdev->debugfs)) + if (IS_ERR_OR_NULL(hdev->debugfs) || conn->debugfs) return; snprintf(name, sizeof(name), "%u", conn->handle); diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c index 6643c9c20fa4..faca701bce2a 100644 --- a/net/bluetooth/hci_event.c +++ b/net/bluetooth/hci_event.c @@ -712,6 +712,47 @@ static u8 hci_cc_read_local_version(struct hci_dev *hdev, void *data, return rp->status; } +static u8 hci_cc_read_enc_key_size(struct hci_dev *hdev, void *data, + struct sk_buff *skb) +{ + struct hci_rp_read_enc_key_size *rp = data; + struct hci_conn *conn; + u16 handle; + u8 status = rp->status; + + bt_dev_dbg(hdev, "status 0x%2.2x", status); + + handle = le16_to_cpu(rp->handle); + + hci_dev_lock(hdev); + + conn = hci_conn_hash_lookup_handle(hdev, handle); + if (!conn) { + status = 0xFF; + goto done; + } + + /* While unexpected, the read_enc_key_size command may fail. The most + * secure approach is to then assume the key size is 0 to force a + * disconnection. + */ + if (status) { + bt_dev_err(hdev, "failed to read key size for handle %u", + handle); + conn->enc_key_size = 0; + } else { + conn->enc_key_size = rp->key_size; + status = 0; + } + + hci_encrypt_cfm(conn, 0); + +done: + hci_dev_unlock(hdev); + + return status; +} + static u8 hci_cc_read_local_commands(struct hci_dev *hdev, void *data, struct sk_buff *skb) { @@ -1715,6 +1756,8 @@ static void le_set_scan_enable_complete(struct hci_dev *hdev, u8 enable) hci_dev_set_flag(hdev, HCI_LE_SCAN); if (hdev->le_scan_type == LE_SCAN_ACTIVE) clear_pending_adv_report(hdev); + if (hci_dev_test_flag(hdev, HCI_MESH)) + hci_discovery_set_state(hdev, DISCOVERY_FINDING); break; case LE_SCAN_DISABLE: @@ -1729,7 +1772,7 @@ static void le_set_scan_enable_complete(struct hci_dev *hdev, u8 enable) d->last_adv_addr_type, NULL, d->last_adv_rssi, d->last_adv_flags, d->last_adv_data, - d->last_adv_data_len, NULL, 0); + d->last_adv_data_len, NULL, 0, 0); } /* Cancel this timer so that we don't try to disable scanning @@ -1745,6 +1788,9 @@ static void le_set_scan_enable_complete(struct hci_dev *hdev, u8 enable) */ if (hci_dev_test_and_clear_flag(hdev, HCI_LE_SCAN_INTERRUPTED)) hci_discovery_set_state(hdev, DISCOVERY_STOPPED); + else if (!hci_dev_test_flag(hdev, HCI_LE_ADV) && + hdev->discovery.state == DISCOVERY_FINDING) + queue_work(hdev->workqueue, &hdev->reenable_adv_work); break; @@ -2152,7 +2198,7 @@ static u8 hci_cc_set_ext_adv_param(struct hci_dev *hdev, void *data, adv_instance->tx_power = rp->tx_power; } /* Update adv data as tx power is known now */ - hci_req_update_adv_data(hdev, cp->handle); + hci_update_adv_data(hdev, cp->handle); hci_dev_unlock(hdev); @@ -3071,7 +3117,7 @@ static void hci_inquiry_result_evt(struct hci_dev *hdev, void *edata, mgmt_device_found(hdev, &info->bdaddr, ACL_LINK, 0x00, info->dev_class, HCI_RSSI_INVALID, - flags, NULL, 0, NULL, 0); + flags, NULL, 0, NULL, 0, 0); } hci_dev_unlock(hdev); @@ -3534,47 +3580,6 @@ unlock: hci_dev_unlock(hdev); } -static void read_enc_key_size_complete(struct hci_dev *hdev, u8 status, - u16 opcode, struct sk_buff *skb) -{ - const struct hci_rp_read_enc_key_size *rp; - struct hci_conn *conn; - u16 handle; - - BT_DBG("%s status 0x%02x", hdev->name, status); - - if (!skb || skb->len < sizeof(*rp)) { - bt_dev_err(hdev, "invalid read key size response"); - return; - } - - rp = (void *)skb->data; - handle = le16_to_cpu(rp->handle); - - hci_dev_lock(hdev); - - conn = hci_conn_hash_lookup_handle(hdev, handle); - if (!conn) - goto unlock; - - /* While unexpected, the read_enc_key_size command may fail. The most - * secure approach is to then assume the key size is 0 to force a - * disconnection. - */ - if (rp->status) { - bt_dev_err(hdev, "failed to read key size for handle %u", - handle); - conn->enc_key_size = 0; - } else { - conn->enc_key_size = rp->key_size; - } - - hci_encrypt_cfm(conn, 0); - -unlock: - hci_dev_unlock(hdev); -} - static void hci_encrypt_change_evt(struct hci_dev *hdev, void *data, struct sk_buff *skb) { @@ -3639,7 +3644,6 @@ static void hci_encrypt_change_evt(struct hci_dev *hdev, void *data, /* Try reading the encryption key size for encrypted ACL links */ if (!ev->status && ev->encrypt && conn->type == ACL_LINK) { struct hci_cp_read_enc_key_size cp; - struct hci_request req; /* Only send HCI_Read_Encryption_Key_Size if the * controller really supports it. If it doesn't, assume @@ -3650,12 +3654,9 @@ static void hci_encrypt_change_evt(struct hci_dev *hdev, void *data, goto notify; } - hci_req_init(&req, hdev); - cp.handle = cpu_to_le16(conn->handle); - hci_req_add(&req, HCI_OP_READ_ENC_KEY_SIZE, sizeof(cp), &cp); - - if (hci_req_run_skb(&req, read_enc_key_size_complete)) { + if (hci_send_cmd(hdev, HCI_OP_READ_ENC_KEY_SIZE, + sizeof(cp), &cp)) { bt_dev_err(hdev, "sending read key size failed"); conn->enc_key_size = HCI_LINK_KEY_SIZE; goto notify; @@ -3766,16 +3767,18 @@ static inline void handle_cmd_cnt_and_timer(struct hci_dev *hdev, u8 ncmd) { cancel_delayed_work(&hdev->cmd_timer); + rcu_read_lock(); if (!test_bit(HCI_RESET, &hdev->flags)) { if (ncmd) { cancel_delayed_work(&hdev->ncmd_timer); atomic_set(&hdev->cmd_cnt, 1); } else { if (!hci_dev_test_flag(hdev, HCI_CMD_DRAIN_WORKQUEUE)) - schedule_delayed_work(&hdev->ncmd_timer, - HCI_NCMD_TIMEOUT); + queue_delayed_work(hdev->workqueue, &hdev->ncmd_timer, + HCI_NCMD_TIMEOUT); } } + rcu_read_unlock(); } static u8 hci_cc_le_read_buffer_size_v2(struct hci_dev *hdev, void *data, @@ -4037,6 +4040,8 @@ static const struct hci_cc { sizeof(struct hci_rp_read_local_amp_info)), HCI_CC(HCI_OP_READ_CLOCK, hci_cc_read_clock, sizeof(struct hci_rp_read_clock)), + HCI_CC(HCI_OP_READ_ENC_KEY_SIZE, hci_cc_read_enc_key_size, + sizeof(struct hci_rp_read_enc_key_size)), HCI_CC(HCI_OP_READ_INQ_RSP_TX_POWER, hci_cc_read_inq_rsp_tx_power, sizeof(struct hci_rp_read_inq_rsp_tx_power)), HCI_CC(HCI_OP_READ_DEF_ERR_DATA_REPORTING, @@ -4829,7 +4834,7 @@ static void hci_inquiry_result_with_rssi_evt(struct hci_dev *hdev, void *edata, mgmt_device_found(hdev, &info->bdaddr, ACL_LINK, 0x00, info->dev_class, info->rssi, - flags, NULL, 0, NULL, 0); + flags, NULL, 0, NULL, 0, 0); } } else if (skb->len == array_size(ev->num, sizeof(struct inquiry_info_rssi))) { @@ -4860,7 +4865,7 @@ static void hci_inquiry_result_with_rssi_evt(struct hci_dev *hdev, void *edata, mgmt_device_found(hdev, &info->bdaddr, ACL_LINK, 0x00, info->dev_class, info->rssi, - flags, NULL, 0, NULL, 0); + flags, NULL, 0, NULL, 0, 0); } } else { bt_dev_err(hdev, "Malformed HCI Event: 0x%2.2x", @@ -5116,7 +5121,7 @@ static void hci_extended_inquiry_result_evt(struct hci_dev *hdev, void *edata, mgmt_device_found(hdev, &info->bdaddr, ACL_LINK, 0x00, info->dev_class, info->rssi, - flags, info->data, eir_len, NULL, 0); + flags, info->data, eir_len, NULL, 0, 0); } hci_dev_unlock(hdev); @@ -6172,7 +6177,7 @@ static struct hci_conn *check_pending_le_conn(struct hci_dev *hdev, static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr, u8 bdaddr_type, bdaddr_t *direct_addr, u8 direct_addr_type, s8 rssi, u8 *data, u8 len, - bool ext_adv) + bool ext_adv, bool ctl_time, u64 instant) { struct discovery_state *d = &hdev->discovery; struct smp_irk *irk; @@ -6220,7 +6225,7 @@ static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr, * important to see if the address is matching the local * controller address. */ - if (direct_addr) { + if (!hci_dev_test_flag(hdev, HCI_MESH) && direct_addr) { direct_addr_type = ev_bdaddr_type(hdev, direct_addr_type, &bdaddr_resolved); @@ -6268,6 +6273,18 @@ static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr, conn->le_adv_data_len = len; } + if (type == LE_ADV_NONCONN_IND || type == LE_ADV_SCAN_IND) + flags = MGMT_DEV_FOUND_NOT_CONNECTABLE; + else + flags = 0; + + /* All scan results should be sent up for Mesh systems */ + if (hci_dev_test_flag(hdev, HCI_MESH)) { + mgmt_device_found(hdev, bdaddr, LE_LINK, bdaddr_type, NULL, + rssi, flags, data, len, NULL, 0, instant); + return; + } + /* Passive scanning shouldn't trigger any device found events, * except for devices marked as CONN_REPORT for which we do send * device found events, or advertisement monitoring requested. @@ -6281,12 +6298,8 @@ static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr, idr_is_empty(&hdev->adv_monitors_idr)) return; - if (type == LE_ADV_NONCONN_IND || type == LE_ADV_SCAN_IND) - flags = MGMT_DEV_FOUND_NOT_CONNECTABLE; - else - flags = 0; mgmt_device_found(hdev, bdaddr, LE_LINK, bdaddr_type, NULL, - rssi, flags, data, len, NULL, 0); + rssi, flags, data, len, NULL, 0, 0); return; } @@ -6305,11 +6318,8 @@ static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr, * and just sends a scan response event, then it is marked as * not connectable as well. */ - if (type == LE_ADV_NONCONN_IND || type == LE_ADV_SCAN_IND || - type == LE_ADV_SCAN_RSP) + if (type == LE_ADV_SCAN_RSP) flags = MGMT_DEV_FOUND_NOT_CONNECTABLE; - else - flags = 0; /* If there's nothing pending either store the data from this * event or send an immediate device found event if the data @@ -6326,7 +6336,7 @@ static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr, } mgmt_device_found(hdev, bdaddr, LE_LINK, bdaddr_type, NULL, - rssi, flags, data, len, NULL, 0); + rssi, flags, data, len, NULL, 0, 0); return; } @@ -6345,7 +6355,7 @@ static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr, d->last_adv_addr_type, NULL, d->last_adv_rssi, d->last_adv_flags, d->last_adv_data, - d->last_adv_data_len, NULL, 0); + d->last_adv_data_len, NULL, 0, 0); /* If the new report will trigger a SCAN_REQ store it for * later merging. @@ -6362,7 +6372,7 @@ static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr, */ clear_pending_adv_report(hdev); mgmt_device_found(hdev, bdaddr, LE_LINK, bdaddr_type, NULL, - rssi, flags, data, len, NULL, 0); + rssi, flags, data, len, NULL, 0, 0); return; } @@ -6372,7 +6382,7 @@ static void process_adv_report(struct hci_dev *hdev, u8 type, bdaddr_t *bdaddr, */ mgmt_device_found(hdev, &d->last_adv_addr, LE_LINK, d->last_adv_addr_type, NULL, rssi, d->last_adv_flags, - d->last_adv_data, d->last_adv_data_len, data, len); + d->last_adv_data, d->last_adv_data_len, data, len, 0); clear_pending_adv_report(hdev); } @@ -6380,6 +6390,7 @@ static void hci_le_adv_report_evt(struct hci_dev *hdev, void *data, struct sk_buff *skb) { struct hci_ev_le_advertising_report *ev = data; + u64 instant = jiffies; if (!ev->num) return; @@ -6404,7 +6415,8 @@ static void hci_le_adv_report_evt(struct hci_dev *hdev, void *data, rssi = info->data[info->length]; process_adv_report(hdev, info->type, &info->bdaddr, info->bdaddr_type, NULL, 0, rssi, - info->data, info->length, false); + info->data, info->length, false, + false, instant); } else { bt_dev_err(hdev, "Dropping invalid advertising data"); } @@ -6461,6 +6473,7 @@ static void hci_le_ext_adv_report_evt(struct hci_dev *hdev, void *data, struct sk_buff *skb) { struct hci_ev_le_ext_adv_report *ev = data; + u64 instant = jiffies; if (!ev->num) return; @@ -6487,7 +6500,8 @@ static void hci_le_ext_adv_report_evt(struct hci_dev *hdev, void *data, process_adv_report(hdev, legacy_evt_type, &info->bdaddr, info->bdaddr_type, NULL, 0, info->rssi, info->data, info->length, - !(evt_type & LE_EXT_ADV_LEGACY_PDU)); + !(evt_type & LE_EXT_ADV_LEGACY_PDU), + false, instant); } } @@ -6710,6 +6724,7 @@ static void hci_le_direct_adv_report_evt(struct hci_dev *hdev, void *data, struct sk_buff *skb) { struct hci_ev_le_direct_adv_report *ev = data; + u64 instant = jiffies; int i; if (!hci_le_ev_skb_pull(hdev, skb, HCI_EV_LE_DIRECT_ADV_REPORT, @@ -6727,7 +6742,7 @@ static void hci_le_direct_adv_report_evt(struct hci_dev *hdev, void *data, process_adv_report(hdev, info->type, &info->bdaddr, info->bdaddr_type, &info->direct_addr, info->direct_addr_type, info->rssi, NULL, 0, - false); + false, false, instant); } hci_dev_unlock(hdev); @@ -6776,6 +6791,13 @@ static void hci_le_cis_estabilished_evt(struct hci_dev *hdev, void *data, goto unlock; } + if (conn->type != ISO_LINK) { + bt_dev_err(hdev, + "Invalid connection link type handle 0x%4.4x", + handle); + goto unlock; + } + if (conn->role == HCI_ROLE_SLAVE) { __le32 interval; @@ -6896,6 +6918,13 @@ static void hci_le_create_big_complete_evt(struct hci_dev *hdev, void *data, if (!conn) goto unlock; + if (conn->type != ISO_LINK) { + bt_dev_err(hdev, + "Invalid connection link type handle 0x%2.2x", + ev->handle); + goto unlock; + } + if (ev->num_bis) conn->handle = __le16_to_cpu(ev->bis_handle[0]); diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c index e64d558e5d69..5a0296a4352e 100644 --- a/net/bluetooth/hci_request.c +++ b/net/bluetooth/hci_request.c @@ -269,42 +269,10 @@ void hci_req_add_ev(struct hci_request *req, u16 opcode, u32 plen, void hci_req_add(struct hci_request *req, u16 opcode, u32 plen, const void *param) { + bt_dev_err(req->hdev, "HCI_REQ-0x%4.4x", opcode); hci_req_add_ev(req, opcode, plen, param, 0); } -void __hci_req_write_fast_connectable(struct hci_request *req, bool enable) -{ - struct hci_dev *hdev = req->hdev; - struct hci_cp_write_page_scan_activity acp; - u8 type; - - if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) - return; - - if (hdev->hci_ver < BLUETOOTH_VER_1_2) - return; - - if (enable) { - type = PAGE_SCAN_TYPE_INTERLACED; - - /* 160 msec page scan interval */ - acp.interval = cpu_to_le16(0x0100); - } else { - type = hdev->def_page_scan_type; - acp.interval = cpu_to_le16(hdev->def_page_scan_int); - } - - acp.window = cpu_to_le16(hdev->def_page_scan_window); - - if (__cpu_to_le16(hdev->page_scan_interval) != acp.interval || - __cpu_to_le16(hdev->page_scan_window) != acp.window) - hci_req_add(req, HCI_OP_WRITE_PAGE_SCAN_ACTIVITY, - sizeof(acp), &acp); - - if (hdev->page_scan_type != type) - hci_req_add(req, HCI_OP_WRITE_PAGE_SCAN_TYPE, 1, &type); -} - static void start_interleave_scan(struct hci_dev *hdev) { hdev->interleave_scan_state = INTERLEAVE_SCAN_NO_FILTER; @@ -357,45 +325,6 @@ static bool __hci_update_interleaved_scan(struct hci_dev *hdev) return false; } -void __hci_req_update_name(struct hci_request *req) -{ - struct hci_dev *hdev = req->hdev; - struct hci_cp_write_local_name cp; - - memcpy(cp.name, hdev->dev_name, sizeof(cp.name)); - - hci_req_add(req, HCI_OP_WRITE_LOCAL_NAME, sizeof(cp), &cp); -} - -void __hci_req_update_eir(struct hci_request *req) -{ - struct hci_dev *hdev = req->hdev; - struct hci_cp_write_eir cp; - - if (!hdev_is_powered(hdev)) - return; - - if (!lmp_ext_inq_capable(hdev)) - return; - - if (!hci_dev_test_flag(hdev, HCI_SSP_ENABLED)) - return; - - if (hci_dev_test_flag(hdev, HCI_SERVICE_CACHE)) - return; - - memset(&cp, 0, sizeof(cp)); - - eir_create(hdev, cp.data); - - if (memcmp(cp.data, hdev->eir, sizeof(cp.data)) == 0) - return; - - memcpy(hdev->eir, cp.data, sizeof(cp.data)); - - hci_req_add(req, HCI_OP_WRITE_EIR, sizeof(cp), &cp); -} - void hci_req_add_le_scan_disable(struct hci_request *req, bool rpa_le_conn) { struct hci_dev *hdev = req->hdev; @@ -721,6 +650,96 @@ static inline bool hci_is_le_conn_scanning(struct hci_dev *hdev) return false; } +static void set_random_addr(struct hci_request *req, bdaddr_t *rpa); +static int hci_update_random_address(struct hci_request *req, + bool require_privacy, bool use_rpa, + u8 *own_addr_type) +{ + struct hci_dev *hdev = req->hdev; + int err; + + /* If privacy is enabled use a resolvable private address. If + * current RPA has expired or there is something else than + * the current RPA in use, then generate a new one. + */ + if (use_rpa) { + /* If Controller supports LL Privacy use own address type is + * 0x03 + */ + if (use_ll_privacy(hdev)) + *own_addr_type = ADDR_LE_DEV_RANDOM_RESOLVED; + else + *own_addr_type = ADDR_LE_DEV_RANDOM; + + if (rpa_valid(hdev)) + return 0; + + err = smp_generate_rpa(hdev, hdev->irk, &hdev->rpa); + if (err < 0) { + bt_dev_err(hdev, "failed to generate new RPA"); + return err; + } + + set_random_addr(req, &hdev->rpa); + + return 0; + } + + /* In case of required privacy without resolvable private address, + * use an non-resolvable private address. This is useful for active + * scanning and non-connectable advertising. + */ + if (require_privacy) { + bdaddr_t nrpa; + + while (true) { + /* The non-resolvable private address is generated + * from random six bytes with the two most significant + * bits cleared. + */ + get_random_bytes(&nrpa, 6); + nrpa.b[5] &= 0x3f; + + /* The non-resolvable private address shall not be + * equal to the public address. + */ + if (bacmp(&hdev->bdaddr, &nrpa)) + break; + } + + *own_addr_type = ADDR_LE_DEV_RANDOM; + set_random_addr(req, &nrpa); + return 0; + } + + /* If forcing static address is in use or there is no public + * address use the static address as random address (but skip + * the HCI command if the current random address is already the + * static one. + * + * In case BR/EDR has been disabled on a dual-mode controller + * and a static address has been configured, then use that + * address instead of the public BR/EDR address. + */ + if (hci_dev_test_flag(hdev, HCI_FORCE_STATIC_ADDR) || + !bacmp(&hdev->bdaddr, BDADDR_ANY) || + (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED) && + bacmp(&hdev->static_addr, BDADDR_ANY))) { + *own_addr_type = ADDR_LE_DEV_RANDOM; + if (bacmp(&hdev->static_addr, &hdev->random_addr)) + hci_req_add(req, HCI_OP_LE_SET_RANDOM_ADDR, 6, + &hdev->static_addr); + return 0; + } + + /* Neither privacy nor static address is being used so use a + * public address. + */ + *own_addr_type = ADDR_LE_DEV_PUBLIC; + + return 0; +} + /* Ensure to call hci_req_add_le_scan_disable() first to disable the * controller based address resolution to be able to reconfigure * resolving list. @@ -810,366 +829,6 @@ void hci_req_add_le_passive_scan(struct hci_request *req) addr_resolv); } -static void cancel_adv_timeout(struct hci_dev *hdev) -{ - if (hdev->adv_instance_timeout) { - hdev->adv_instance_timeout = 0; - cancel_delayed_work(&hdev->adv_instance_expire); - } -} - -static bool adv_cur_instance_is_scannable(struct hci_dev *hdev) -{ - return hci_adv_instance_is_scannable(hdev, hdev->cur_adv_instance); -} - -void __hci_req_disable_advertising(struct hci_request *req) -{ - if (ext_adv_capable(req->hdev)) { - __hci_req_disable_ext_adv_instance(req, 0x00); - } else { - u8 enable = 0x00; - - hci_req_add(req, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable), &enable); - } -} - -static bool adv_use_rpa(struct hci_dev *hdev, uint32_t flags) -{ - /* If privacy is not enabled don't use RPA */ - if (!hci_dev_test_flag(hdev, HCI_PRIVACY)) - return false; - - /* If basic privacy mode is enabled use RPA */ - if (!hci_dev_test_flag(hdev, HCI_LIMITED_PRIVACY)) - return true; - - /* If limited privacy mode is enabled don't use RPA if we're - * both discoverable and bondable. - */ - if ((flags & MGMT_ADV_FLAG_DISCOV) && - hci_dev_test_flag(hdev, HCI_BONDABLE)) - return false; - - /* We're neither bondable nor discoverable in the limited - * privacy mode, therefore use RPA. - */ - return true; -} - -static bool is_advertising_allowed(struct hci_dev *hdev, bool connectable) -{ - /* If there is no connection we are OK to advertise. */ - if (hci_conn_num(hdev, LE_LINK) == 0) - return true; - - /* Check le_states if there is any connection in peripheral role. */ - if (hdev->conn_hash.le_num_peripheral > 0) { - /* Peripheral connection state and non connectable mode bit 20. - */ - if (!connectable && !(hdev->le_states[2] & 0x10)) - return false; - - /* Peripheral connection state and connectable mode bit 38 - * and scannable bit 21. - */ - if (connectable && (!(hdev->le_states[4] & 0x40) || - !(hdev->le_states[2] & 0x20))) - return false; - } - - /* Check le_states if there is any connection in central role. */ - if (hci_conn_num(hdev, LE_LINK) != hdev->conn_hash.le_num_peripheral) { - /* Central connection state and non connectable mode bit 18. */ - if (!connectable && !(hdev->le_states[2] & 0x02)) - return false; - - /* Central connection state and connectable mode bit 35 and - * scannable 19. - */ - if (connectable && (!(hdev->le_states[4] & 0x08) || - !(hdev->le_states[2] & 0x08))) - return false; - } - - return true; -} - -void __hci_req_enable_advertising(struct hci_request *req) -{ - struct hci_dev *hdev = req->hdev; - struct adv_info *adv; - struct hci_cp_le_set_adv_param cp; - u8 own_addr_type, enable = 0x01; - bool connectable; - u16 adv_min_interval, adv_max_interval; - u32 flags; - - flags = hci_adv_instance_flags(hdev, hdev->cur_adv_instance); - adv = hci_find_adv_instance(hdev, hdev->cur_adv_instance); - - /* If the "connectable" instance flag was not set, then choose between - * ADV_IND and ADV_NONCONN_IND based on the global connectable setting. - */ - connectable = (flags & MGMT_ADV_FLAG_CONNECTABLE) || - mgmt_get_connectable(hdev); - - if (!is_advertising_allowed(hdev, connectable)) - return; - - if (hci_dev_test_flag(hdev, HCI_LE_ADV)) - __hci_req_disable_advertising(req); - - /* Clear the HCI_LE_ADV bit temporarily so that the - * hci_update_random_address knows that it's safe to go ahead - * and write a new random address. The flag will be set back on - * as soon as the SET_ADV_ENABLE HCI command completes. - */ - hci_dev_clear_flag(hdev, HCI_LE_ADV); - - /* Set require_privacy to true only when non-connectable - * advertising is used. In that case it is fine to use a - * non-resolvable private address. - */ - if (hci_update_random_address(req, !connectable, - adv_use_rpa(hdev, flags), - &own_addr_type) < 0) - return; - - memset(&cp, 0, sizeof(cp)); - - if (adv) { - adv_min_interval = adv->min_interval; - adv_max_interval = adv->max_interval; - } else { - adv_min_interval = hdev->le_adv_min_interval; - adv_max_interval = hdev->le_adv_max_interval; - } - - if (connectable) { - cp.type = LE_ADV_IND; - } else { - if (adv_cur_instance_is_scannable(hdev)) - cp.type = LE_ADV_SCAN_IND; - else - cp.type = LE_ADV_NONCONN_IND; - - if (!hci_dev_test_flag(hdev, HCI_DISCOVERABLE) || - hci_dev_test_flag(hdev, HCI_LIMITED_DISCOVERABLE)) { - adv_min_interval = DISCOV_LE_FAST_ADV_INT_MIN; - adv_max_interval = DISCOV_LE_FAST_ADV_INT_MAX; - } - } - - cp.min_interval = cpu_to_le16(adv_min_interval); - cp.max_interval = cpu_to_le16(adv_max_interval); - cp.own_address_type = own_addr_type; - cp.channel_map = hdev->le_adv_channel_map; - - hci_req_add(req, HCI_OP_LE_SET_ADV_PARAM, sizeof(cp), &cp); - - hci_req_add(req, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable), &enable); -} - -void __hci_req_update_scan_rsp_data(struct hci_request *req, u8 instance) -{ - struct hci_dev *hdev = req->hdev; - u8 len; - - if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED)) - return; - - if (ext_adv_capable(hdev)) { - struct { - struct hci_cp_le_set_ext_scan_rsp_data cp; - u8 data[HCI_MAX_EXT_AD_LENGTH]; - } pdu; - - memset(&pdu, 0, sizeof(pdu)); - - len = eir_create_scan_rsp(hdev, instance, pdu.data); - - if (hdev->scan_rsp_data_len == len && - !memcmp(pdu.data, hdev->scan_rsp_data, len)) - return; - - memcpy(hdev->scan_rsp_data, pdu.data, len); - hdev->scan_rsp_data_len = len; - - pdu.cp.handle = instance; - pdu.cp.length = len; - pdu.cp.operation = LE_SET_ADV_DATA_OP_COMPLETE; - pdu.cp.frag_pref = LE_SET_ADV_DATA_NO_FRAG; - - hci_req_add(req, HCI_OP_LE_SET_EXT_SCAN_RSP_DATA, - sizeof(pdu.cp) + len, &pdu.cp); - } else { - struct hci_cp_le_set_scan_rsp_data cp; - - memset(&cp, 0, sizeof(cp)); - - len = eir_create_scan_rsp(hdev, instance, cp.data); - - if (hdev->scan_rsp_data_len == len && - !memcmp(cp.data, hdev->scan_rsp_data, len)) - return; - - memcpy(hdev->scan_rsp_data, cp.data, sizeof(cp.data)); - hdev->scan_rsp_data_len = len; - - cp.length = len; - - hci_req_add(req, HCI_OP_LE_SET_SCAN_RSP_DATA, sizeof(cp), &cp); - } -} - -void __hci_req_update_adv_data(struct hci_request *req, u8 instance) -{ - struct hci_dev *hdev = req->hdev; - u8 len; - - if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED)) - return; - - if (ext_adv_capable(hdev)) { - struct { - struct hci_cp_le_set_ext_adv_data cp; - u8 data[HCI_MAX_EXT_AD_LENGTH]; - } pdu; - - memset(&pdu, 0, sizeof(pdu)); - - len = eir_create_adv_data(hdev, instance, pdu.data); - - /* There's nothing to do if the data hasn't changed */ - if (hdev->adv_data_len == len && - memcmp(pdu.data, hdev->adv_data, len) == 0) - return; - - memcpy(hdev->adv_data, pdu.data, len); - hdev->adv_data_len = len; - - pdu.cp.length = len; - pdu.cp.handle = instance; - pdu.cp.operation = LE_SET_ADV_DATA_OP_COMPLETE; - pdu.cp.frag_pref = LE_SET_ADV_DATA_NO_FRAG; - - hci_req_add(req, HCI_OP_LE_SET_EXT_ADV_DATA, - sizeof(pdu.cp) + len, &pdu.cp); - } else { - struct hci_cp_le_set_adv_data cp; - - memset(&cp, 0, sizeof(cp)); - - len = eir_create_adv_data(hdev, instance, cp.data); - - /* There's nothing to do if the data hasn't changed */ - if (hdev->adv_data_len == len && - memcmp(cp.data, hdev->adv_data, len) == 0) - return; - - memcpy(hdev->adv_data, cp.data, sizeof(cp.data)); - hdev->adv_data_len = len; - - cp.length = len; - - hci_req_add(req, HCI_OP_LE_SET_ADV_DATA, sizeof(cp), &cp); - } -} - -int hci_req_update_adv_data(struct hci_dev *hdev, u8 instance) -{ - struct hci_request req; - - hci_req_init(&req, hdev); - __hci_req_update_adv_data(&req, instance); - - return hci_req_run(&req, NULL); -} - -static void enable_addr_resolution_complete(struct hci_dev *hdev, u8 status, - u16 opcode) -{ - BT_DBG("%s status %u", hdev->name, status); -} - -void hci_req_disable_address_resolution(struct hci_dev *hdev) -{ - struct hci_request req; - __u8 enable = 0x00; - - if (!hci_dev_test_flag(hdev, HCI_LL_RPA_RESOLUTION)) - return; - - hci_req_init(&req, hdev); - - hci_req_add(&req, HCI_OP_LE_SET_ADDR_RESOLV_ENABLE, 1, &enable); - - hci_req_run(&req, enable_addr_resolution_complete); -} - -static void adv_enable_complete(struct hci_dev *hdev, u8 status, u16 opcode) -{ - bt_dev_dbg(hdev, "status %u", status); -} - -void hci_req_reenable_advertising(struct hci_dev *hdev) -{ - struct hci_request req; - - if (!hci_dev_test_flag(hdev, HCI_ADVERTISING) && - list_empty(&hdev->adv_instances)) - return; - - hci_req_init(&req, hdev); - - if (hdev->cur_adv_instance) { - __hci_req_schedule_adv_instance(&req, hdev->cur_adv_instance, - true); - } else { - if (ext_adv_capable(hdev)) { - __hci_req_start_ext_adv(&req, 0x00); - } else { - __hci_req_update_adv_data(&req, 0x00); - __hci_req_update_scan_rsp_data(&req, 0x00); - __hci_req_enable_advertising(&req); - } - } - - hci_req_run(&req, adv_enable_complete); -} - -static void adv_timeout_expire(struct work_struct *work) -{ - struct hci_dev *hdev = container_of(work, struct hci_dev, - adv_instance_expire.work); - - struct hci_request req; - u8 instance; - - bt_dev_dbg(hdev, ""); - - hci_dev_lock(hdev); - - hdev->adv_instance_timeout = 0; - - instance = hdev->cur_adv_instance; - if (instance == 0x00) - goto unlock; - - hci_req_init(&req, hdev); - - hci_req_clear_adv_instance(hdev, NULL, &req, instance, false); - - if (list_empty(&hdev->adv_instances)) - __hci_req_disable_advertising(&req); - - hci_req_run(&req, NULL); - -unlock: - hci_dev_unlock(hdev); -} - static int hci_req_add_le_interleaved_scan(struct hci_request *req, unsigned long opt) { @@ -1226,84 +885,6 @@ static void interleave_scan_work(struct work_struct *work) &hdev->interleave_scan, timeout); } -int hci_get_random_address(struct hci_dev *hdev, bool require_privacy, - bool use_rpa, struct adv_info *adv_instance, - u8 *own_addr_type, bdaddr_t *rand_addr) -{ - int err; - - bacpy(rand_addr, BDADDR_ANY); - - /* If privacy is enabled use a resolvable private address. If - * current RPA has expired then generate a new one. - */ - if (use_rpa) { - /* If Controller supports LL Privacy use own address type is - * 0x03 - */ - if (use_ll_privacy(hdev)) - *own_addr_type = ADDR_LE_DEV_RANDOM_RESOLVED; - else - *own_addr_type = ADDR_LE_DEV_RANDOM; - - if (adv_instance) { - if (adv_rpa_valid(adv_instance)) - return 0; - } else { - if (rpa_valid(hdev)) - return 0; - } - - err = smp_generate_rpa(hdev, hdev->irk, &hdev->rpa); - if (err < 0) { - bt_dev_err(hdev, "failed to generate new RPA"); - return err; - } - - bacpy(rand_addr, &hdev->rpa); - - return 0; - } - - /* In case of required privacy without resolvable private address, - * use an non-resolvable private address. This is useful for - * non-connectable advertising. - */ - if (require_privacy) { - bdaddr_t nrpa; - - while (true) { - /* The non-resolvable private address is generated - * from random six bytes with the two most significant - * bits cleared. - */ - get_random_bytes(&nrpa, 6); - nrpa.b[5] &= 0x3f; - - /* The non-resolvable private address shall not be - * equal to the public address. - */ - if (bacmp(&hdev->bdaddr, &nrpa)) - break; - } - - *own_addr_type = ADDR_LE_DEV_RANDOM; - bacpy(rand_addr, &nrpa); - - return 0; - } - - /* No privacy so use a public address. */ - *own_addr_type = ADDR_LE_DEV_PUBLIC; - - return 0; -} - -void __hci_req_clear_ext_adv_sets(struct hci_request *req) -{ - hci_req_add(req, HCI_OP_LE_CLEAR_ADV_SETS, 0, NULL); -} - static void set_random_addr(struct hci_request *req, bdaddr_t *rpa) { struct hci_dev *hdev = req->hdev; @@ -1328,933 +909,8 @@ static void set_random_addr(struct hci_request *req, bdaddr_t *rpa) hci_req_add(req, HCI_OP_LE_SET_RANDOM_ADDR, 6, rpa); } -int __hci_req_setup_ext_adv_instance(struct hci_request *req, u8 instance) -{ - struct hci_cp_le_set_ext_adv_params cp; - struct hci_dev *hdev = req->hdev; - bool connectable; - u32 flags; - bdaddr_t random_addr; - u8 own_addr_type; - int err; - struct adv_info *adv; - bool secondary_adv, require_privacy; - - if (instance > 0) { - adv = hci_find_adv_instance(hdev, instance); - if (!adv) - return -EINVAL; - } else { - adv = NULL; - } - - flags = hci_adv_instance_flags(hdev, instance); - - /* If the "connectable" instance flag was not set, then choose between - * ADV_IND and ADV_NONCONN_IND based on the global connectable setting. - */ - connectable = (flags & MGMT_ADV_FLAG_CONNECTABLE) || - mgmt_get_connectable(hdev); - - if (!is_advertising_allowed(hdev, connectable)) - return -EPERM; - - /* Set require_privacy to true only when non-connectable - * advertising is used. In that case it is fine to use a - * non-resolvable private address. - */ - require_privacy = !connectable; - - /* Don't require privacy for periodic adv? */ - if (adv && adv->periodic) - require_privacy = false; - - err = hci_get_random_address(hdev, require_privacy, - adv_use_rpa(hdev, flags), adv, - &own_addr_type, &random_addr); - if (err < 0) - return err; - - memset(&cp, 0, sizeof(cp)); - - if (adv) { - hci_cpu_to_le24(adv->min_interval, cp.min_interval); - hci_cpu_to_le24(adv->max_interval, cp.max_interval); - cp.tx_power = adv->tx_power; - } else { - hci_cpu_to_le24(hdev->le_adv_min_interval, cp.min_interval); - hci_cpu_to_le24(hdev->le_adv_max_interval, cp.max_interval); - cp.tx_power = HCI_ADV_TX_POWER_NO_PREFERENCE; - } - - secondary_adv = (flags & MGMT_ADV_FLAG_SEC_MASK); - - if (connectable) { - if (secondary_adv) - cp.evt_properties = cpu_to_le16(LE_EXT_ADV_CONN_IND); - else - cp.evt_properties = cpu_to_le16(LE_LEGACY_ADV_IND); - } else if (hci_adv_instance_is_scannable(hdev, instance) || - (flags & MGMT_ADV_PARAM_SCAN_RSP)) { - if (secondary_adv) - cp.evt_properties = cpu_to_le16(LE_EXT_ADV_SCAN_IND); - else - cp.evt_properties = cpu_to_le16(LE_LEGACY_ADV_SCAN_IND); - } else { - /* Secondary and periodic cannot use legacy PDUs */ - if (secondary_adv || (adv && adv->periodic)) - cp.evt_properties = cpu_to_le16(LE_EXT_ADV_NON_CONN_IND); - else - cp.evt_properties = cpu_to_le16(LE_LEGACY_NONCONN_IND); - } - - cp.own_addr_type = own_addr_type; - cp.channel_map = hdev->le_adv_channel_map; - cp.handle = instance; - - if (flags & MGMT_ADV_FLAG_SEC_2M) { - cp.primary_phy = HCI_ADV_PHY_1M; - cp.secondary_phy = HCI_ADV_PHY_2M; - } else if (flags & MGMT_ADV_FLAG_SEC_CODED) { - cp.primary_phy = HCI_ADV_PHY_CODED; - cp.secondary_phy = HCI_ADV_PHY_CODED; - } else { - /* In all other cases use 1M */ - cp.primary_phy = HCI_ADV_PHY_1M; - cp.secondary_phy = HCI_ADV_PHY_1M; - } - - hci_req_add(req, HCI_OP_LE_SET_EXT_ADV_PARAMS, sizeof(cp), &cp); - - if ((own_addr_type == ADDR_LE_DEV_RANDOM || - own_addr_type == ADDR_LE_DEV_RANDOM_RESOLVED) && - bacmp(&random_addr, BDADDR_ANY)) { - struct hci_cp_le_set_adv_set_rand_addr cp; - - /* Check if random address need to be updated */ - if (adv) { - if (!bacmp(&random_addr, &adv->random_addr)) - return 0; - } else { - if (!bacmp(&random_addr, &hdev->random_addr)) - return 0; - /* Instance 0x00 doesn't have an adv_info, instead it - * uses hdev->random_addr to track its address so - * whenever it needs to be updated this also set the - * random address since hdev->random_addr is shared with - * scan state machine. - */ - set_random_addr(req, &random_addr); - } - - memset(&cp, 0, sizeof(cp)); - - cp.handle = instance; - bacpy(&cp.bdaddr, &random_addr); - - hci_req_add(req, - HCI_OP_LE_SET_ADV_SET_RAND_ADDR, - sizeof(cp), &cp); - } - - return 0; -} - -int __hci_req_enable_ext_advertising(struct hci_request *req, u8 instance) -{ - struct hci_dev *hdev = req->hdev; - struct hci_cp_le_set_ext_adv_enable *cp; - struct hci_cp_ext_adv_set *adv_set; - u8 data[sizeof(*cp) + sizeof(*adv_set) * 1]; - struct adv_info *adv_instance; - - if (instance > 0) { - adv_instance = hci_find_adv_instance(hdev, instance); - if (!adv_instance) - return -EINVAL; - } else { - adv_instance = NULL; - } - - cp = (void *) data; - adv_set = (void *) cp->data; - - memset(cp, 0, sizeof(*cp)); - - cp->enable = 0x01; - cp->num_of_sets = 0x01; - - memset(adv_set, 0, sizeof(*adv_set)); - - adv_set->handle = instance; - - /* Set duration per instance since controller is responsible for - * scheduling it. - */ - if (adv_instance && adv_instance->duration) { - u16 duration = adv_instance->timeout * MSEC_PER_SEC; - - /* Time = N * 10 ms */ - adv_set->duration = cpu_to_le16(duration / 10); - } - - hci_req_add(req, HCI_OP_LE_SET_EXT_ADV_ENABLE, - sizeof(*cp) + sizeof(*adv_set) * cp->num_of_sets, - data); - - return 0; -} - -int __hci_req_disable_ext_adv_instance(struct hci_request *req, u8 instance) -{ - struct hci_dev *hdev = req->hdev; - struct hci_cp_le_set_ext_adv_enable *cp; - struct hci_cp_ext_adv_set *adv_set; - u8 data[sizeof(*cp) + sizeof(*adv_set) * 1]; - u8 req_size; - - /* If request specifies an instance that doesn't exist, fail */ - if (instance > 0 && !hci_find_adv_instance(hdev, instance)) - return -EINVAL; - - memset(data, 0, sizeof(data)); - - cp = (void *)data; - adv_set = (void *)cp->data; - - /* Instance 0x00 indicates all advertising instances will be disabled */ - cp->num_of_sets = !!instance; - cp->enable = 0x00; - - adv_set->handle = instance; - - req_size = sizeof(*cp) + sizeof(*adv_set) * cp->num_of_sets; - hci_req_add(req, HCI_OP_LE_SET_EXT_ADV_ENABLE, req_size, data); - - return 0; -} - -int __hci_req_remove_ext_adv_instance(struct hci_request *req, u8 instance) -{ - struct hci_dev *hdev = req->hdev; - - /* If request specifies an instance that doesn't exist, fail */ - if (instance > 0 && !hci_find_adv_instance(hdev, instance)) - return -EINVAL; - - hci_req_add(req, HCI_OP_LE_REMOVE_ADV_SET, sizeof(instance), &instance); - - return 0; -} - -int __hci_req_start_ext_adv(struct hci_request *req, u8 instance) -{ - struct hci_dev *hdev = req->hdev; - struct adv_info *adv_instance = hci_find_adv_instance(hdev, instance); - int err; - - /* If instance isn't pending, the chip knows about it, and it's safe to - * disable - */ - if (adv_instance && !adv_instance->pending) - __hci_req_disable_ext_adv_instance(req, instance); - - err = __hci_req_setup_ext_adv_instance(req, instance); - if (err < 0) - return err; - - __hci_req_update_scan_rsp_data(req, instance); - __hci_req_enable_ext_advertising(req, instance); - - return 0; -} - -int __hci_req_schedule_adv_instance(struct hci_request *req, u8 instance, - bool force) -{ - struct hci_dev *hdev = req->hdev; - struct adv_info *adv_instance = NULL; - u16 timeout; - - if (hci_dev_test_flag(hdev, HCI_ADVERTISING) || - list_empty(&hdev->adv_instances)) - return -EPERM; - - if (hdev->adv_instance_timeout) - return -EBUSY; - - adv_instance = hci_find_adv_instance(hdev, instance); - if (!adv_instance) - return -ENOENT; - - /* A zero timeout means unlimited advertising. As long as there is - * only one instance, duration should be ignored. We still set a timeout - * in case further instances are being added later on. - * - * If the remaining lifetime of the instance is more than the duration - * then the timeout corresponds to the duration, otherwise it will be - * reduced to the remaining instance lifetime. - */ - if (adv_instance->timeout == 0 || - adv_instance->duration <= adv_instance->remaining_time) - timeout = adv_instance->duration; - else - timeout = adv_instance->remaining_time; - - /* The remaining time is being reduced unless the instance is being - * advertised without time limit. - */ - if (adv_instance->timeout) - adv_instance->remaining_time = - adv_instance->remaining_time - timeout; - - /* Only use work for scheduling instances with legacy advertising */ - if (!ext_adv_capable(hdev)) { - hdev->adv_instance_timeout = timeout; - queue_delayed_work(hdev->req_workqueue, - &hdev->adv_instance_expire, - msecs_to_jiffies(timeout * 1000)); - } - - /* If we're just re-scheduling the same instance again then do not - * execute any HCI commands. This happens when a single instance is - * being advertised. - */ - if (!force && hdev->cur_adv_instance == instance && - hci_dev_test_flag(hdev, HCI_LE_ADV)) - return 0; - - hdev->cur_adv_instance = instance; - if (ext_adv_capable(hdev)) { - __hci_req_start_ext_adv(req, instance); - } else { - __hci_req_update_adv_data(req, instance); - __hci_req_update_scan_rsp_data(req, instance); - __hci_req_enable_advertising(req); - } - - return 0; -} - -/* For a single instance: - * - force == true: The instance will be removed even when its remaining - * lifetime is not zero. - * - force == false: the instance will be deactivated but kept stored unless - * the remaining lifetime is zero. - * - * For instance == 0x00: - * - force == true: All instances will be removed regardless of their timeout - * setting. - * - force == false: Only instances that have a timeout will be removed. - */ -void hci_req_clear_adv_instance(struct hci_dev *hdev, struct sock *sk, - struct hci_request *req, u8 instance, - bool force) -{ - struct adv_info *adv_instance, *n, *next_instance = NULL; - int err; - u8 rem_inst; - - /* Cancel any timeout concerning the removed instance(s). */ - if (!instance || hdev->cur_adv_instance == instance) - cancel_adv_timeout(hdev); - - /* Get the next instance to advertise BEFORE we remove - * the current one. This can be the same instance again - * if there is only one instance. - */ - if (instance && hdev->cur_adv_instance == instance) - next_instance = hci_get_next_instance(hdev, instance); - - if (instance == 0x00) { - list_for_each_entry_safe(adv_instance, n, &hdev->adv_instances, - list) { - if (!(force || adv_instance->timeout)) - continue; - - rem_inst = adv_instance->instance; - err = hci_remove_adv_instance(hdev, rem_inst); - if (!err) - mgmt_advertising_removed(sk, hdev, rem_inst); - } - } else { - adv_instance = hci_find_adv_instance(hdev, instance); - - if (force || (adv_instance && adv_instance->timeout && - !adv_instance->remaining_time)) { - /* Don't advertise a removed instance. */ - if (next_instance && - next_instance->instance == instance) - next_instance = NULL; - - err = hci_remove_adv_instance(hdev, instance); - if (!err) - mgmt_advertising_removed(sk, hdev, instance); - } - } - - if (!req || !hdev_is_powered(hdev) || - hci_dev_test_flag(hdev, HCI_ADVERTISING)) - return; - - if (next_instance && !ext_adv_capable(hdev)) - __hci_req_schedule_adv_instance(req, next_instance->instance, - false); -} - -int hci_update_random_address(struct hci_request *req, bool require_privacy, - bool use_rpa, u8 *own_addr_type) -{ - struct hci_dev *hdev = req->hdev; - int err; - - /* If privacy is enabled use a resolvable private address. If - * current RPA has expired or there is something else than - * the current RPA in use, then generate a new one. - */ - if (use_rpa) { - /* If Controller supports LL Privacy use own address type is - * 0x03 - */ - if (use_ll_privacy(hdev)) - *own_addr_type = ADDR_LE_DEV_RANDOM_RESOLVED; - else - *own_addr_type = ADDR_LE_DEV_RANDOM; - - if (rpa_valid(hdev)) - return 0; - - err = smp_generate_rpa(hdev, hdev->irk, &hdev->rpa); - if (err < 0) { - bt_dev_err(hdev, "failed to generate new RPA"); - return err; - } - - set_random_addr(req, &hdev->rpa); - - return 0; - } - - /* In case of required privacy without resolvable private address, - * use an non-resolvable private address. This is useful for active - * scanning and non-connectable advertising. - */ - if (require_privacy) { - bdaddr_t nrpa; - - while (true) { - /* The non-resolvable private address is generated - * from random six bytes with the two most significant - * bits cleared. - */ - get_random_bytes(&nrpa, 6); - nrpa.b[5] &= 0x3f; - - /* The non-resolvable private address shall not be - * equal to the public address. - */ - if (bacmp(&hdev->bdaddr, &nrpa)) - break; - } - - *own_addr_type = ADDR_LE_DEV_RANDOM; - set_random_addr(req, &nrpa); - return 0; - } - - /* If forcing static address is in use or there is no public - * address use the static address as random address (but skip - * the HCI command if the current random address is already the - * static one. - * - * In case BR/EDR has been disabled on a dual-mode controller - * and a static address has been configured, then use that - * address instead of the public BR/EDR address. - */ - if (hci_dev_test_flag(hdev, HCI_FORCE_STATIC_ADDR) || - !bacmp(&hdev->bdaddr, BDADDR_ANY) || - (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED) && - bacmp(&hdev->static_addr, BDADDR_ANY))) { - *own_addr_type = ADDR_LE_DEV_RANDOM; - if (bacmp(&hdev->static_addr, &hdev->random_addr)) - hci_req_add(req, HCI_OP_LE_SET_RANDOM_ADDR, 6, - &hdev->static_addr); - return 0; - } - - /* Neither privacy nor static address is being used so use a - * public address. - */ - *own_addr_type = ADDR_LE_DEV_PUBLIC; - - return 0; -} - -static bool disconnected_accept_list_entries(struct hci_dev *hdev) -{ - struct bdaddr_list *b; - - list_for_each_entry(b, &hdev->accept_list, list) { - struct hci_conn *conn; - - conn = hci_conn_hash_lookup_ba(hdev, ACL_LINK, &b->bdaddr); - if (!conn) - return true; - - if (conn->state != BT_CONNECTED && conn->state != BT_CONFIG) - return true; - } - - return false; -} - -void __hci_req_update_scan(struct hci_request *req) -{ - struct hci_dev *hdev = req->hdev; - u8 scan; - - if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) - return; - - if (!hdev_is_powered(hdev)) - return; - - if (mgmt_powering_down(hdev)) - return; - - if (hdev->scanning_paused) - return; - - if (hci_dev_test_flag(hdev, HCI_CONNECTABLE) || - disconnected_accept_list_entries(hdev)) - scan = SCAN_PAGE; - else - scan = SCAN_DISABLED; - - if (hci_dev_test_flag(hdev, HCI_DISCOVERABLE)) - scan |= SCAN_INQUIRY; - - if (test_bit(HCI_PSCAN, &hdev->flags) == !!(scan & SCAN_PAGE) && - test_bit(HCI_ISCAN, &hdev->flags) == !!(scan & SCAN_INQUIRY)) - return; - - hci_req_add(req, HCI_OP_WRITE_SCAN_ENABLE, 1, &scan); -} - -static u8 get_service_classes(struct hci_dev *hdev) -{ - struct bt_uuid *uuid; - u8 val = 0; - - list_for_each_entry(uuid, &hdev->uuids, list) - val |= uuid->svc_hint; - - return val; -} - -void __hci_req_update_class(struct hci_request *req) -{ - struct hci_dev *hdev = req->hdev; - u8 cod[3]; - - bt_dev_dbg(hdev, ""); - - if (!hdev_is_powered(hdev)) - return; - - if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) - return; - - if (hci_dev_test_flag(hdev, HCI_SERVICE_CACHE)) - return; - - cod[0] = hdev->minor_class; - cod[1] = hdev->major_class; - cod[2] = get_service_classes(hdev); - - if (hci_dev_test_flag(hdev, HCI_LIMITED_DISCOVERABLE)) - cod[1] |= 0x20; - - if (memcmp(cod, hdev->dev_class, 3) == 0) - return; - - hci_req_add(req, HCI_OP_WRITE_CLASS_OF_DEV, sizeof(cod), cod); -} - -void __hci_abort_conn(struct hci_request *req, struct hci_conn *conn, - u8 reason) -{ - switch (conn->state) { - case BT_CONNECTED: - case BT_CONFIG: - if (conn->type == AMP_LINK) { - struct hci_cp_disconn_phy_link cp; - - cp.phy_handle = HCI_PHY_HANDLE(conn->handle); - cp.reason = reason; - hci_req_add(req, HCI_OP_DISCONN_PHY_LINK, sizeof(cp), - &cp); - } else { - struct hci_cp_disconnect dc; - - dc.handle = cpu_to_le16(conn->handle); - dc.reason = reason; - hci_req_add(req, HCI_OP_DISCONNECT, sizeof(dc), &dc); - } - - conn->state = BT_DISCONN; - - break; - case BT_CONNECT: - if (conn->type == LE_LINK) { - if (test_bit(HCI_CONN_SCANNING, &conn->flags)) - break; - hci_req_add(req, HCI_OP_LE_CREATE_CONN_CANCEL, - 0, NULL); - } else if (conn->type == ACL_LINK) { - if (req->hdev->hci_ver < BLUETOOTH_VER_1_2) - break; - hci_req_add(req, HCI_OP_CREATE_CONN_CANCEL, - 6, &conn->dst); - } - break; - case BT_CONNECT2: - if (conn->type == ACL_LINK) { - struct hci_cp_reject_conn_req rej; - - bacpy(&rej.bdaddr, &conn->dst); - rej.reason = reason; - - hci_req_add(req, HCI_OP_REJECT_CONN_REQ, - sizeof(rej), &rej); - } else if (conn->type == SCO_LINK || conn->type == ESCO_LINK) { - struct hci_cp_reject_sync_conn_req rej; - - bacpy(&rej.bdaddr, &conn->dst); - - /* SCO rejection has its own limited set of - * allowed error values (0x0D-0x0F) which isn't - * compatible with most values passed to this - * function. To be safe hard-code one of the - * values that's suitable for SCO. - */ - rej.reason = HCI_ERROR_REJ_LIMITED_RESOURCES; - - hci_req_add(req, HCI_OP_REJECT_SYNC_CONN_REQ, - sizeof(rej), &rej); - } - break; - default: - conn->state = BT_CLOSED; - break; - } -} - -static void abort_conn_complete(struct hci_dev *hdev, u8 status, u16 opcode) -{ - if (status) - bt_dev_dbg(hdev, "Failed to abort connection: status 0x%2.2x", status); -} - -int hci_abort_conn(struct hci_conn *conn, u8 reason) -{ - struct hci_request req; - int err; - - hci_req_init(&req, conn->hdev); - - __hci_abort_conn(&req, conn, reason); - - err = hci_req_run(&req, abort_conn_complete); - if (err && err != -ENODATA) { - bt_dev_err(conn->hdev, "failed to run HCI request: err %d", err); - return err; - } - - return 0; -} - -static int le_scan_disable(struct hci_request *req, unsigned long opt) -{ - hci_req_add_le_scan_disable(req, false); - return 0; -} - -static int bredr_inquiry(struct hci_request *req, unsigned long opt) -{ - u8 length = opt; - const u8 giac[3] = { 0x33, 0x8b, 0x9e }; - const u8 liac[3] = { 0x00, 0x8b, 0x9e }; - struct hci_cp_inquiry cp; - - if (test_bit(HCI_INQUIRY, &req->hdev->flags)) - return 0; - - bt_dev_dbg(req->hdev, ""); - - hci_dev_lock(req->hdev); - hci_inquiry_cache_flush(req->hdev); - hci_dev_unlock(req->hdev); - - memset(&cp, 0, sizeof(cp)); - - if (req->hdev->discovery.limited) - memcpy(&cp.lap, liac, sizeof(cp.lap)); - else - memcpy(&cp.lap, giac, sizeof(cp.lap)); - - cp.length = length; - - hci_req_add(req, HCI_OP_INQUIRY, sizeof(cp), &cp); - - return 0; -} - -static void le_scan_disable_work(struct work_struct *work) -{ - struct hci_dev *hdev = container_of(work, struct hci_dev, - le_scan_disable.work); - u8 status; - - bt_dev_dbg(hdev, ""); - - if (!hci_dev_test_flag(hdev, HCI_LE_SCAN)) - return; - - cancel_delayed_work(&hdev->le_scan_restart); - - hci_req_sync(hdev, le_scan_disable, 0, HCI_CMD_TIMEOUT, &status); - if (status) { - bt_dev_err(hdev, "failed to disable LE scan: status 0x%02x", - status); - return; - } - - hdev->discovery.scan_start = 0; - - /* If we were running LE only scan, change discovery state. If - * we were running both LE and BR/EDR inquiry simultaneously, - * and BR/EDR inquiry is already finished, stop discovery, - * otherwise BR/EDR inquiry will stop discovery when finished. - * If we will resolve remote device name, do not change - * discovery state. - */ - - if (hdev->discovery.type == DISCOV_TYPE_LE) - goto discov_stopped; - - if (hdev->discovery.type != DISCOV_TYPE_INTERLEAVED) - return; - - if (test_bit(HCI_QUIRK_SIMULTANEOUS_DISCOVERY, &hdev->quirks)) { - if (!test_bit(HCI_INQUIRY, &hdev->flags) && - hdev->discovery.state != DISCOVERY_RESOLVING) - goto discov_stopped; - - return; - } - - hci_req_sync(hdev, bredr_inquiry, DISCOV_INTERLEAVED_INQUIRY_LEN, - HCI_CMD_TIMEOUT, &status); - if (status) { - bt_dev_err(hdev, "inquiry failed: status 0x%02x", status); - goto discov_stopped; - } - - return; - -discov_stopped: - hci_dev_lock(hdev); - hci_discovery_set_state(hdev, DISCOVERY_STOPPED); - hci_dev_unlock(hdev); -} - -static int le_scan_restart(struct hci_request *req, unsigned long opt) -{ - struct hci_dev *hdev = req->hdev; - - /* If controller is not scanning we are done. */ - if (!hci_dev_test_flag(hdev, HCI_LE_SCAN)) - return 0; - - if (hdev->scanning_paused) { - bt_dev_dbg(hdev, "Scanning is paused for suspend"); - return 0; - } - - hci_req_add_le_scan_disable(req, false); - - if (use_ext_scan(hdev)) { - struct hci_cp_le_set_ext_scan_enable ext_enable_cp; - - memset(&ext_enable_cp, 0, sizeof(ext_enable_cp)); - ext_enable_cp.enable = LE_SCAN_ENABLE; - ext_enable_cp.filter_dup = LE_SCAN_FILTER_DUP_ENABLE; - - hci_req_add(req, HCI_OP_LE_SET_EXT_SCAN_ENABLE, - sizeof(ext_enable_cp), &ext_enable_cp); - } else { - struct hci_cp_le_set_scan_enable cp; - - memset(&cp, 0, sizeof(cp)); - cp.enable = LE_SCAN_ENABLE; - cp.filter_dup = LE_SCAN_FILTER_DUP_ENABLE; - hci_req_add(req, HCI_OP_LE_SET_SCAN_ENABLE, sizeof(cp), &cp); - } - - return 0; -} - -static void le_scan_restart_work(struct work_struct *work) -{ - struct hci_dev *hdev = container_of(work, struct hci_dev, - le_scan_restart.work); - unsigned long timeout, duration, scan_start, now; - u8 status; - - bt_dev_dbg(hdev, ""); - - hci_req_sync(hdev, le_scan_restart, 0, HCI_CMD_TIMEOUT, &status); - if (status) { - bt_dev_err(hdev, "failed to restart LE scan: status %d", - status); - return; - } - - hci_dev_lock(hdev); - - if (!test_bit(HCI_QUIRK_STRICT_DUPLICATE_FILTER, &hdev->quirks) || - !hdev->discovery.scan_start) - goto unlock; - - /* When the scan was started, hdev->le_scan_disable has been queued - * after duration from scan_start. During scan restart this job - * has been canceled, and we need to queue it again after proper - * timeout, to make sure that scan does not run indefinitely. - */ - duration = hdev->discovery.scan_duration; - scan_start = hdev->discovery.scan_start; - now = jiffies; - if (now - scan_start <= duration) { - int elapsed; - - if (now >= scan_start) - elapsed = now - scan_start; - else - elapsed = ULONG_MAX - scan_start + now; - - timeout = duration - elapsed; - } else { - timeout = 0; - } - - queue_delayed_work(hdev->req_workqueue, - &hdev->le_scan_disable, timeout); - -unlock: - hci_dev_unlock(hdev); -} - -bool hci_req_stop_discovery(struct hci_request *req) -{ - struct hci_dev *hdev = req->hdev; - struct discovery_state *d = &hdev->discovery; - struct hci_cp_remote_name_req_cancel cp; - struct inquiry_entry *e; - bool ret = false; - - bt_dev_dbg(hdev, "state %u", hdev->discovery.state); - - if (d->state == DISCOVERY_FINDING || d->state == DISCOVERY_STOPPING) { - if (test_bit(HCI_INQUIRY, &hdev->flags)) - hci_req_add(req, HCI_OP_INQUIRY_CANCEL, 0, NULL); - - if (hci_dev_test_flag(hdev, HCI_LE_SCAN)) { - cancel_delayed_work(&hdev->le_scan_disable); - cancel_delayed_work(&hdev->le_scan_restart); - hci_req_add_le_scan_disable(req, false); - } - - ret = true; - } else { - /* Passive scanning */ - if (hci_dev_test_flag(hdev, HCI_LE_SCAN)) { - hci_req_add_le_scan_disable(req, false); - ret = true; - } - } - - /* No further actions needed for LE-only discovery */ - if (d->type == DISCOV_TYPE_LE) - return ret; - - if (d->state == DISCOVERY_RESOLVING || d->state == DISCOVERY_STOPPING) { - e = hci_inquiry_cache_lookup_resolve(hdev, BDADDR_ANY, - NAME_PENDING); - if (!e) - return ret; - - bacpy(&cp.bdaddr, &e->data.bdaddr); - hci_req_add(req, HCI_OP_REMOTE_NAME_REQ_CANCEL, sizeof(cp), - &cp); - ret = true; - } - - return ret; -} - -static void config_data_path_complete(struct hci_dev *hdev, u8 status, - u16 opcode) -{ - bt_dev_dbg(hdev, "status %u", status); -} - -int hci_req_configure_datapath(struct hci_dev *hdev, struct bt_codec *codec) -{ - struct hci_request req; - int err; - __u8 vnd_len, *vnd_data = NULL; - struct hci_op_configure_data_path *cmd = NULL; - - hci_req_init(&req, hdev); - - err = hdev->get_codec_config_data(hdev, ESCO_LINK, codec, &vnd_len, - &vnd_data); - if (err < 0) - goto error; - - cmd = kzalloc(sizeof(*cmd) + vnd_len, GFP_KERNEL); - if (!cmd) { - err = -ENOMEM; - goto error; - } - - err = hdev->get_data_path_id(hdev, &cmd->data_path_id); - if (err < 0) - goto error; - - cmd->vnd_len = vnd_len; - memcpy(cmd->vnd_data, vnd_data, vnd_len); - - cmd->direction = 0x00; - hci_req_add(&req, HCI_CONFIGURE_DATA_PATH, sizeof(*cmd) + vnd_len, cmd); - - cmd->direction = 0x01; - hci_req_add(&req, HCI_CONFIGURE_DATA_PATH, sizeof(*cmd) + vnd_len, cmd); - - err = hci_req_run(&req, config_data_path_complete); -error: - - kfree(cmd); - kfree(vnd_data); - return err; -} - void hci_request_setup(struct hci_dev *hdev) { - INIT_DELAYED_WORK(&hdev->le_scan_disable, le_scan_disable_work); - INIT_DELAYED_WORK(&hdev->le_scan_restart, le_scan_restart_work); - INIT_DELAYED_WORK(&hdev->adv_instance_expire, adv_timeout_expire); INIT_DELAYED_WORK(&hdev->interleave_scan, interleave_scan_work); } @@ -2262,13 +918,5 @@ void hci_request_cancel_all(struct hci_dev *hdev) { __hci_cmd_sync_cancel(hdev, ENODEV); - cancel_delayed_work_sync(&hdev->le_scan_disable); - cancel_delayed_work_sync(&hdev->le_scan_restart); - - if (hdev->adv_instance_timeout) { - cancel_delayed_work_sync(&hdev->adv_instance_expire); - hdev->adv_instance_timeout = 0; - } - cancel_interleave_scan(hdev); } diff --git a/net/bluetooth/hci_request.h b/net/bluetooth/hci_request.h index 39d001fa3acf..b9c5a9823837 100644 --- a/net/bluetooth/hci_request.h +++ b/net/bluetooth/hci_request.h @@ -68,63 +68,10 @@ int __hci_req_sync(struct hci_dev *hdev, int (*func)(struct hci_request *req, struct sk_buff *hci_prepare_cmd(struct hci_dev *hdev, u16 opcode, u32 plen, const void *param); -void __hci_req_write_fast_connectable(struct hci_request *req, bool enable); -void __hci_req_update_name(struct hci_request *req); -void __hci_req_update_eir(struct hci_request *req); - void hci_req_add_le_scan_disable(struct hci_request *req, bool rpa_le_conn); void hci_req_add_le_passive_scan(struct hci_request *req); void hci_req_prepare_suspend(struct hci_dev *hdev, enum suspended_state next); -void hci_req_disable_address_resolution(struct hci_dev *hdev); -void hci_req_reenable_advertising(struct hci_dev *hdev); -void __hci_req_enable_advertising(struct hci_request *req); -void __hci_req_disable_advertising(struct hci_request *req); -void __hci_req_update_adv_data(struct hci_request *req, u8 instance); -int hci_req_update_adv_data(struct hci_dev *hdev, u8 instance); -int hci_req_start_per_adv(struct hci_dev *hdev, u8 instance, u32 flags, - u16 min_interval, u16 max_interval, - u16 sync_interval); -void __hci_req_update_scan_rsp_data(struct hci_request *req, u8 instance); - -int __hci_req_schedule_adv_instance(struct hci_request *req, u8 instance, - bool force); -void hci_req_clear_adv_instance(struct hci_dev *hdev, struct sock *sk, - struct hci_request *req, u8 instance, - bool force); - -int __hci_req_setup_ext_adv_instance(struct hci_request *req, u8 instance); -int __hci_req_setup_per_adv_instance(struct hci_request *req, u8 instance, - u16 min_interval, u16 max_interval); -int __hci_req_start_ext_adv(struct hci_request *req, u8 instance); -int __hci_req_start_per_adv(struct hci_request *req, u8 instance, u32 flags, - u16 min_interval, u16 max_interval, - u16 sync_interval); -int __hci_req_enable_ext_advertising(struct hci_request *req, u8 instance); -int __hci_req_enable_per_advertising(struct hci_request *req, u8 instance); -int __hci_req_disable_ext_adv_instance(struct hci_request *req, u8 instance); -int __hci_req_remove_ext_adv_instance(struct hci_request *req, u8 instance); -void __hci_req_clear_ext_adv_sets(struct hci_request *req); -int hci_get_random_address(struct hci_dev *hdev, bool require_privacy, - bool use_rpa, struct adv_info *adv_instance, - u8 *own_addr_type, bdaddr_t *rand_addr); - -void __hci_req_update_class(struct hci_request *req); - -/* Returns true if HCI commands were queued */ -bool hci_req_stop_discovery(struct hci_request *req); - -int hci_req_configure_datapath(struct hci_dev *hdev, struct bt_codec *codec); - -void __hci_req_update_scan(struct hci_request *req); - -int hci_update_random_address(struct hci_request *req, bool require_privacy, - bool use_rpa, u8 *own_addr_type); - -int hci_abort_conn(struct hci_conn *conn, u8 reason); -void __hci_abort_conn(struct hci_request *req, struct hci_conn *conn, - u8 reason); - void hci_request_setup(struct hci_dev *hdev); void hci_request_cancel_all(struct hci_dev *hdev); diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c index 0d015d4a8e41..06581223238c 100644 --- a/net/bluetooth/hci_sock.c +++ b/net/bluetooth/hci_sock.c @@ -887,7 +887,6 @@ static int hci_sock_release(struct socket *sock) */ hci_dev_do_close(hdev); hci_dev_clear_flag(hdev, HCI_USER_CHANNEL); - hci_register_suspend_notifier(hdev); mgmt_index_added(hdev); } @@ -1216,7 +1215,6 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, } mgmt_index_removed(hdev); - hci_unregister_suspend_notifier(hdev); err = hci_dev_open(hdev->id); if (err) { @@ -1231,7 +1229,6 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr, err = 0; } else { hci_dev_clear_flag(hdev, HCI_USER_CHANNEL); - hci_register_suspend_notifier(hdev); mgmt_index_added(hdev); hci_dev_put(hdev); goto done; @@ -2065,6 +2062,7 @@ static int hci_sock_getsockopt(struct socket *sock, int level, int optname, static void hci_sock_destruct(struct sock *sk) { + mgmt_cleanup(sk); skb_queue_purge(&sk->sk_receive_queue); skb_queue_purge(&sk->sk_write_queue); } diff --git a/net/bluetooth/hci_sync.c b/net/bluetooth/hci_sync.c index 187786454d98..76c3107c9f91 100644 --- a/net/bluetooth/hci_sync.c +++ b/net/bluetooth/hci_sync.c @@ -246,7 +246,7 @@ int __hci_cmd_sync_status_sk(struct hci_dev *hdev, u16 opcode, u32 plen, skb = __hci_cmd_sync_sk(hdev, opcode, plen, param, event, timeout, sk); if (IS_ERR(skb)) { bt_dev_err(hdev, "Opcode 0x%4x failed: %ld", opcode, - PTR_ERR(skb)); + PTR_ERR(skb)); return PTR_ERR(skb); } @@ -321,6 +321,307 @@ static void hci_cmd_sync_cancel_work(struct work_struct *work) wake_up_interruptible(&hdev->req_wait_q); } +static int hci_scan_disable_sync(struct hci_dev *hdev); +static int scan_disable_sync(struct hci_dev *hdev, void *data) +{ + return hci_scan_disable_sync(hdev); +} + +static int hci_inquiry_sync(struct hci_dev *hdev, u8 length); +static int interleaved_inquiry_sync(struct hci_dev *hdev, void *data) +{ + return hci_inquiry_sync(hdev, DISCOV_INTERLEAVED_INQUIRY_LEN); +} + +static void le_scan_disable(struct work_struct *work) +{ + struct hci_dev *hdev = container_of(work, struct hci_dev, + le_scan_disable.work); + int status; + + bt_dev_dbg(hdev, ""); + hci_dev_lock(hdev); + + if (!hci_dev_test_flag(hdev, HCI_LE_SCAN)) + goto _return; + + cancel_delayed_work(&hdev->le_scan_restart); + + status = hci_cmd_sync_queue(hdev, scan_disable_sync, NULL, NULL); + if (status) { + bt_dev_err(hdev, "failed to disable LE scan: %d", status); + goto _return; + } + + hdev->discovery.scan_start = 0; + + /* If we were running LE only scan, change discovery state. If + * we were running both LE and BR/EDR inquiry simultaneously, + * and BR/EDR inquiry is already finished, stop discovery, + * otherwise BR/EDR inquiry will stop discovery when finished. + * If we will resolve remote device name, do not change + * discovery state. + */ + + if (hdev->discovery.type == DISCOV_TYPE_LE) + goto discov_stopped; + + if (hdev->discovery.type != DISCOV_TYPE_INTERLEAVED) + goto _return; + + if (test_bit(HCI_QUIRK_SIMULTANEOUS_DISCOVERY, &hdev->quirks)) { + if (!test_bit(HCI_INQUIRY, &hdev->flags) && + hdev->discovery.state != DISCOVERY_RESOLVING) + goto discov_stopped; + + goto _return; + } + + status = hci_cmd_sync_queue(hdev, interleaved_inquiry_sync, NULL, NULL); + if (status) { + bt_dev_err(hdev, "inquiry failed: status %d", status); + goto discov_stopped; + } + + goto _return; + +discov_stopped: + hci_discovery_set_state(hdev, DISCOVERY_STOPPED); + +_return: + hci_dev_unlock(hdev); +} + +static int hci_le_set_scan_enable_sync(struct hci_dev *hdev, u8 val, + u8 filter_dup); +static int hci_le_scan_restart_sync(struct hci_dev *hdev) +{ + /* If controller is not scanning we are done. */ + if (!hci_dev_test_flag(hdev, HCI_LE_SCAN)) + return 0; + + if (hdev->scanning_paused) { + bt_dev_dbg(hdev, "Scanning is paused for suspend"); + return 0; + } + + hci_le_set_scan_enable_sync(hdev, LE_SCAN_DISABLE, 0x00); + return hci_le_set_scan_enable_sync(hdev, LE_SCAN_ENABLE, + LE_SCAN_FILTER_DUP_ENABLE); +} + +static int le_scan_restart_sync(struct hci_dev *hdev, void *data) +{ + return hci_le_scan_restart_sync(hdev); +} + +static void le_scan_restart(struct work_struct *work) +{ + struct hci_dev *hdev = container_of(work, struct hci_dev, + le_scan_restart.work); + unsigned long timeout, duration, scan_start, now; + int status; + + bt_dev_dbg(hdev, ""); + + hci_dev_lock(hdev); + + status = hci_cmd_sync_queue(hdev, le_scan_restart_sync, NULL, NULL); + if (status) { + bt_dev_err(hdev, "failed to restart LE scan: status %d", + status); + goto unlock; + } + + if (!test_bit(HCI_QUIRK_STRICT_DUPLICATE_FILTER, &hdev->quirks) || + !hdev->discovery.scan_start) + goto unlock; + + /* When the scan was started, hdev->le_scan_disable has been queued + * after duration from scan_start. During scan restart this job + * has been canceled, and we need to queue it again after proper + * timeout, to make sure that scan does not run indefinitely. + */ + duration = hdev->discovery.scan_duration; + scan_start = hdev->discovery.scan_start; + now = jiffies; + if (now - scan_start <= duration) { + int elapsed; + + if (now >= scan_start) + elapsed = now - scan_start; + else + elapsed = ULONG_MAX - scan_start + now; + + timeout = duration - elapsed; + } else { + timeout = 0; + } + + queue_delayed_work(hdev->req_workqueue, + &hdev->le_scan_disable, timeout); + +unlock: + hci_dev_unlock(hdev); +} + +static int reenable_adv_sync(struct hci_dev *hdev, void *data) +{ + bt_dev_dbg(hdev, ""); + + if (!hci_dev_test_flag(hdev, HCI_ADVERTISING) && + list_empty(&hdev->adv_instances)) + return 0; + + if (hdev->cur_adv_instance) { + return hci_schedule_adv_instance_sync(hdev, + hdev->cur_adv_instance, + true); + } else { + if (ext_adv_capable(hdev)) { + hci_start_ext_adv_sync(hdev, 0x00); + } else { + hci_update_adv_data_sync(hdev, 0x00); + hci_update_scan_rsp_data_sync(hdev, 0x00); + hci_enable_advertising_sync(hdev); + } + } + + return 0; +} + +static void reenable_adv(struct work_struct *work) +{ + struct hci_dev *hdev = container_of(work, struct hci_dev, + reenable_adv_work); + int status; + + bt_dev_dbg(hdev, ""); + + hci_dev_lock(hdev); + + status = hci_cmd_sync_queue(hdev, reenable_adv_sync, NULL, NULL); + if (status) + bt_dev_err(hdev, "failed to reenable ADV: %d", status); + + hci_dev_unlock(hdev); +} + +static void cancel_adv_timeout(struct hci_dev *hdev) +{ + if (hdev->adv_instance_timeout) { + hdev->adv_instance_timeout = 0; + cancel_delayed_work(&hdev->adv_instance_expire); + } +} + +/* For a single instance: + * - force == true: The instance will be removed even when its remaining + * lifetime is not zero. + * - force == false: the instance will be deactivated but kept stored unless + * the remaining lifetime is zero. + * + * For instance == 0x00: + * - force == true: All instances will be removed regardless of their timeout + * setting. + * - force == false: Only instances that have a timeout will be removed. + */ +int hci_clear_adv_instance_sync(struct hci_dev *hdev, struct sock *sk, + u8 instance, bool force) +{ + struct adv_info *adv_instance, *n, *next_instance = NULL; + int err; + u8 rem_inst; + + /* Cancel any timeout concerning the removed instance(s). */ + if (!instance || hdev->cur_adv_instance == instance) + cancel_adv_timeout(hdev); + + /* Get the next instance to advertise BEFORE we remove + * the current one. This can be the same instance again + * if there is only one instance. + */ + if (instance && hdev->cur_adv_instance == instance) + next_instance = hci_get_next_instance(hdev, instance); + + if (instance == 0x00) { + list_for_each_entry_safe(adv_instance, n, &hdev->adv_instances, + list) { + if (!(force || adv_instance->timeout)) + continue; + + rem_inst = adv_instance->instance; + err = hci_remove_adv_instance(hdev, rem_inst); + if (!err) + mgmt_advertising_removed(sk, hdev, rem_inst); + } + } else { + adv_instance = hci_find_adv_instance(hdev, instance); + + if (force || (adv_instance && adv_instance->timeout && + !adv_instance->remaining_time)) { + /* Don't advertise a removed instance. */ + if (next_instance && + next_instance->instance == instance) + next_instance = NULL; + + err = hci_remove_adv_instance(hdev, instance); + if (!err) + mgmt_advertising_removed(sk, hdev, instance); + } + } + + if (!hdev_is_powered(hdev) || hci_dev_test_flag(hdev, HCI_ADVERTISING)) + return 0; + + if (next_instance && !ext_adv_capable(hdev)) + return hci_schedule_adv_instance_sync(hdev, + next_instance->instance, + false); + + return 0; +} + +static int adv_timeout_expire_sync(struct hci_dev *hdev, void *data) +{ + u8 instance = *(u8 *)data; + + kfree(data); + + hci_clear_adv_instance_sync(hdev, NULL, instance, false); + + if (list_empty(&hdev->adv_instances)) + return hci_disable_advertising_sync(hdev); + + return 0; +} + +static void adv_timeout_expire(struct work_struct *work) +{ + u8 *inst_ptr; + struct hci_dev *hdev = container_of(work, struct hci_dev, + adv_instance_expire.work); + + bt_dev_dbg(hdev, ""); + + hci_dev_lock(hdev); + + hdev->adv_instance_timeout = 0; + + if (hdev->cur_adv_instance == 0x00) + goto unlock; + + inst_ptr = kmalloc(1, GFP_KERNEL); + if (!inst_ptr) + goto unlock; + + *inst_ptr = hdev->cur_adv_instance; + hci_cmd_sync_queue(hdev, adv_timeout_expire_sync, inst_ptr, NULL); + +unlock: + hci_dev_unlock(hdev); +} + void hci_cmd_sync_init(struct hci_dev *hdev) { INIT_WORK(&hdev->cmd_sync_work, hci_cmd_sync_work); @@ -328,6 +629,10 @@ void hci_cmd_sync_init(struct hci_dev *hdev) mutex_init(&hdev->cmd_sync_work_lock); INIT_WORK(&hdev->cmd_sync_cancel_work, hci_cmd_sync_cancel_work); + INIT_WORK(&hdev->reenable_adv_work, reenable_adv); + INIT_DELAYED_WORK(&hdev->le_scan_disable, le_scan_disable); + INIT_DELAYED_WORK(&hdev->le_scan_restart, le_scan_restart); + INIT_DELAYED_WORK(&hdev->adv_instance_expire, adv_timeout_expire); } void hci_cmd_sync_clear(struct hci_dev *hdev) @@ -335,6 +640,7 @@ void hci_cmd_sync_clear(struct hci_dev *hdev) struct hci_cmd_sync_work_entry *entry, *tmp; cancel_work_sync(&hdev->cmd_sync_work); + cancel_work_sync(&hdev->reenable_adv_work); list_for_each_entry_safe(entry, tmp, &hdev->cmd_sync_work_list, list) { if (entry->destroy) @@ -1333,14 +1639,6 @@ int hci_le_terminate_big_sync(struct hci_dev *hdev, u8 handle, u8 reason) sizeof(cp), &cp, HCI_CMD_TIMEOUT); } -static void cancel_adv_timeout(struct hci_dev *hdev) -{ - if (hdev->adv_instance_timeout) { - hdev->adv_instance_timeout = 0; - cancel_delayed_work(&hdev->adv_instance_expire); - } -} - static int hci_set_ext_adv_data_sync(struct hci_dev *hdev, u8 instance) { struct { @@ -1492,10 +1790,13 @@ static int hci_clear_adv_sets_sync(struct hci_dev *hdev, struct sock *sk) static int hci_clear_adv_sync(struct hci_dev *hdev, struct sock *sk, bool force) { struct adv_info *adv, *n; + int err = 0; if (ext_adv_capable(hdev)) /* Remove all existing sets */ - return hci_clear_adv_sets_sync(hdev, sk); + err = hci_clear_adv_sets_sync(hdev, sk); + if (ext_adv_capable(hdev)) + return err; /* This is safe as long as there is no command send while the lock is * held. @@ -1523,11 +1824,13 @@ static int hci_clear_adv_sync(struct hci_dev *hdev, struct sock *sk, bool force) static int hci_remove_adv_sync(struct hci_dev *hdev, u8 instance, struct sock *sk) { - int err; + int err = 0; /* If we use extended advertising, instance has to be removed first. */ if (ext_adv_capable(hdev)) - return hci_remove_ext_adv_instance_sync(hdev, instance, sk); + err = hci_remove_ext_adv_instance_sync(hdev, instance, sk); + if (ext_adv_capable(hdev)) + return err; /* This is safe as long as there is no command send while the lock is * held. @@ -1626,13 +1929,16 @@ int hci_read_tx_power_sync(struct hci_dev *hdev, __le16 handle, u8 type) int hci_disable_advertising_sync(struct hci_dev *hdev) { u8 enable = 0x00; + int err = 0; /* If controller is not advertising we are done. */ if (!hci_dev_test_flag(hdev, HCI_LE_ADV)) return 0; if (ext_adv_capable(hdev)) - return hci_disable_ext_adv_instance_sync(hdev, 0x00); + err = hci_disable_ext_adv_instance_sync(hdev, 0x00); + if (ext_adv_capable(hdev)) + return err; return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable), &enable, HCI_CMD_TIMEOUT); @@ -1645,7 +1951,11 @@ static int hci_le_set_ext_scan_enable_sync(struct hci_dev *hdev, u8 val, memset(&cp, 0, sizeof(cp)); cp.enable = val; - cp.filter_dup = filter_dup; + + if (hci_dev_test_flag(hdev, HCI_MESH)) + cp.filter_dup = LE_SCAN_FILTER_DUP_DISABLE; + else + cp.filter_dup = filter_dup; return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_EXT_SCAN_ENABLE, sizeof(cp), &cp, HCI_CMD_TIMEOUT); @@ -1661,7 +1971,11 @@ static int hci_le_set_scan_enable_sync(struct hci_dev *hdev, u8 val, memset(&cp, 0, sizeof(cp)); cp.enable = val; - cp.filter_dup = filter_dup; + + if (val && hci_dev_test_flag(hdev, HCI_MESH)) + cp.filter_dup = LE_SCAN_FILTER_DUP_DISABLE; + else + cp.filter_dup = filter_dup; return __hci_cmd_sync_status(hdev, HCI_OP_LE_SET_SCAN_ENABLE, sizeof(cp), &cp, HCI_CMD_TIMEOUT); @@ -2300,6 +2614,7 @@ static int hci_passive_scan_sync(struct hci_dev *hdev) u8 own_addr_type; u8 filter_policy; u16 window, interval; + u8 filter_dups = LE_SCAN_FILTER_DUP_ENABLE; int err; if (hdev->scanning_paused) { @@ -2362,11 +2677,16 @@ static int hci_passive_scan_sync(struct hci_dev *hdev) interval = hdev->le_scan_interval; } + /* Disable all filtering for Mesh */ + if (hci_dev_test_flag(hdev, HCI_MESH)) { + filter_policy = 0; + filter_dups = LE_SCAN_FILTER_DUP_DISABLE; + } + bt_dev_dbg(hdev, "LE passive scan with acceptlist = %d", filter_policy); return hci_start_scan_sync(hdev, LE_SCAN_PASSIVE, interval, window, - own_addr_type, filter_policy, - LE_SCAN_FILTER_DUP_ENABLE); + own_addr_type, filter_policy, filter_dups); } /* This function controls the passive scanning based on hdev->pend_le_conns @@ -2416,7 +2736,8 @@ int hci_update_passive_scan_sync(struct hci_dev *hdev) bt_dev_dbg(hdev, "ADV monitoring is %s", hci_is_adv_monitoring(hdev) ? "on" : "off"); - if (list_empty(&hdev->pend_le_conns) && + if (!hci_dev_test_flag(hdev, HCI_MESH) && + list_empty(&hdev->pend_le_conns) && list_empty(&hdev->pend_le_reports) && !hci_is_adv_monitoring(hdev) && !hci_dev_test_flag(hdev, HCI_PA_SYNC)) { @@ -3018,12 +3339,6 @@ static const struct hci_init_stage amp_init2[] = { /* Read Buffer Size (ACL mtu, max pkt, etc.) */ static int hci_read_buffer_size_sync(struct hci_dev *hdev) { - /* Use Read LE Buffer Size V2 if supported */ - if (hdev->commands[41] & 0x20) - return __hci_cmd_sync_status(hdev, - HCI_OP_LE_READ_BUFFER_SIZE_V2, - 0, NULL, HCI_CMD_TIMEOUT); - return __hci_cmd_sync_status(hdev, HCI_OP_READ_BUFFER_SIZE, 0, NULL, HCI_CMD_TIMEOUT); } @@ -3237,6 +3552,12 @@ static const struct hci_init_stage hci_init2[] = { /* Read LE Buffer Size */ static int hci_le_read_buffer_size_sync(struct hci_dev *hdev) { + /* Use Read LE Buffer Size V2 if supported */ + if (hdev->commands[41] & 0x20) + return __hci_cmd_sync_status(hdev, + HCI_OP_LE_READ_BUFFER_SIZE_V2, + 0, NULL, HCI_CMD_TIMEOUT); + return __hci_cmd_sync_status(hdev, HCI_OP_LE_READ_BUFFER_SIZE, 0, NULL, HCI_CMD_TIMEOUT); } @@ -4355,6 +4676,7 @@ int hci_dev_open_sync(struct hci_dev *hdev) hci_dev_test_flag(hdev, HCI_MGMT) && hdev->dev_type == HCI_PRIMARY) { ret = hci_powered_update_sync(hdev); + mgmt_power_on(hdev, ret); } } else { /* Init failed, cleanup */ @@ -4406,6 +4728,31 @@ static void hci_pend_le_actions_clear(struct hci_dev *hdev) BT_DBG("All LE pending actions cleared"); } +static int hci_dev_shutdown(struct hci_dev *hdev) +{ + int err = 0; + /* Similar to how we first do setup and then set the exclusive access + * bit for userspace, we must first unset userchannel and then clean up. + * Otherwise, the kernel can't properly use the hci channel to clean up + * the controller (some shutdown routines require sending additional + * commands to the controller for example). + */ + bool was_userchannel = + hci_dev_test_and_clear_flag(hdev, HCI_USER_CHANNEL); + + if (!hci_dev_test_flag(hdev, HCI_UNREGISTER) && + test_bit(HCI_UP, &hdev->flags)) { + /* Execute vendor specific shutdown routine */ + if (hdev->shutdown) + err = hdev->shutdown(hdev); + } + + if (was_userchannel) + hci_dev_set_flag(hdev, HCI_USER_CHANNEL); + + return err; +} + int hci_dev_close_sync(struct hci_dev *hdev) { bool auto_off; @@ -4415,17 +4762,18 @@ int hci_dev_close_sync(struct hci_dev *hdev) cancel_delayed_work(&hdev->power_off); cancel_delayed_work(&hdev->ncmd_timer); + cancel_delayed_work(&hdev->le_scan_disable); + cancel_delayed_work(&hdev->le_scan_restart); hci_request_cancel_all(hdev); - if (!hci_dev_test_flag(hdev, HCI_UNREGISTER) && - !hci_dev_test_flag(hdev, HCI_USER_CHANNEL) && - test_bit(HCI_UP, &hdev->flags)) { - /* Execute vendor specific shutdown routine */ - if (hdev->shutdown) - err = hdev->shutdown(hdev); + if (hdev->adv_instance_timeout) { + cancel_delayed_work_sync(&hdev->adv_instance_expire); + hdev->adv_instance_timeout = 0; } + err = hci_dev_shutdown(hdev); + if (!test_and_clear_bit(HCI_UP, &hdev->flags)) { cancel_delayed_work_sync(&hdev->cmd_timer); return err; @@ -5023,7 +5371,7 @@ static int hci_active_scan_sync(struct hci_dev *hdev, uint16_t interval) /* Pause advertising since active scanning disables address resolution * which advertising depend on in order to generate its RPAs. */ - if (use_ll_privacy(hdev)) { + if (use_ll_privacy(hdev) && hci_dev_test_flag(hdev, HCI_PRIVACY)) { err = hci_pause_advertising_sync(hdev); if (err) { bt_dev_err(hdev, "pause advertising failed: %d", err); @@ -5737,3 +6085,96 @@ int hci_le_pa_terminate_sync(struct hci_dev *hdev, u16 handle) return __hci_cmd_sync_status(hdev, HCI_OP_LE_PA_TERM_SYNC, sizeof(cp), &cp, HCI_CMD_TIMEOUT); } + +int hci_get_random_address(struct hci_dev *hdev, bool require_privacy, + bool use_rpa, struct adv_info *adv_instance, + u8 *own_addr_type, bdaddr_t *rand_addr) +{ + int err; + + bacpy(rand_addr, BDADDR_ANY); + + /* If privacy is enabled use a resolvable private address. If + * current RPA has expired then generate a new one. + */ + if (use_rpa) { + /* If Controller supports LL Privacy use own address type is + * 0x03 + */ + if (use_ll_privacy(hdev)) + *own_addr_type = ADDR_LE_DEV_RANDOM_RESOLVED; + else + *own_addr_type = ADDR_LE_DEV_RANDOM; + + if (adv_instance) { + if (adv_rpa_valid(adv_instance)) + return 0; + } else { + if (rpa_valid(hdev)) + return 0; + } + + err = smp_generate_rpa(hdev, hdev->irk, &hdev->rpa); + if (err < 0) { + bt_dev_err(hdev, "failed to generate new RPA"); + return err; + } + + bacpy(rand_addr, &hdev->rpa); + + return 0; + } + + /* In case of required privacy without resolvable private address, + * use an non-resolvable private address. This is useful for + * non-connectable advertising. + */ + if (require_privacy) { + bdaddr_t nrpa; + + while (true) { + /* The non-resolvable private address is generated + * from random six bytes with the two most significant + * bits cleared. + */ + get_random_bytes(&nrpa, 6); + nrpa.b[5] &= 0x3f; + + /* The non-resolvable private address shall not be + * equal to the public address. + */ + if (bacmp(&hdev->bdaddr, &nrpa)) + break; + } + + *own_addr_type = ADDR_LE_DEV_RANDOM; + bacpy(rand_addr, &nrpa); + + return 0; + } + + /* No privacy so use a public address. */ + *own_addr_type = ADDR_LE_DEV_PUBLIC; + + return 0; +} + +static int _update_adv_data_sync(struct hci_dev *hdev, void *data) +{ + u8 instance = *(u8 *)data; + + kfree(data); + + return hci_update_adv_data_sync(hdev, instance); +} + +int hci_update_adv_data(struct hci_dev *hdev, u8 instance) +{ + u8 *inst_ptr = kmalloc(1, GFP_KERNEL); + + if (!inst_ptr) + return -ENOMEM; + + *inst_ptr = instance; + return hci_cmd_sync_queue(hdev, _update_adv_data_sync, inst_ptr, NULL); +} diff --git a/net/bluetooth/hci_sysfs.c b/net/bluetooth/hci_sysfs.c index 4e3e0451b08c..08542dfc2dc5 100644 --- a/net/bluetooth/hci_sysfs.c +++ b/net/bluetooth/hci_sysfs.c @@ -48,6 +48,9 @@ void hci_conn_add_sysfs(struct hci_conn *conn) BT_DBG("conn %p", conn); + if (device_is_registered(&conn->dev)) + return; + dev_set_name(&conn->dev, "%s:%d", hdev->name, conn->handle); if (device_add(&conn->dev) < 0) { diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 2c9de67daadc..1f34b82ca0ec 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -61,6 +61,9 @@ static void l2cap_send_disconn_req(struct l2cap_chan *chan, int err); static void l2cap_tx(struct l2cap_chan *chan, struct l2cap_ctrl *control, struct sk_buff_head *skbs, u8 event); +static void l2cap_retrans_timeout(struct work_struct *work); +static void l2cap_monitor_timeout(struct work_struct *work); +static void l2cap_ack_timeout(struct work_struct *work); static inline u8 bdaddr_type(u8 link_type, u8 bdaddr_type) { @@ -476,6 +479,9 @@ struct l2cap_chan *l2cap_chan_create(void) write_unlock(&chan_list_lock); INIT_DELAYED_WORK(&chan->chan_timer, l2cap_chan_timeout); + INIT_DELAYED_WORK(&chan->retrans_timer, l2cap_retrans_timeout); + INIT_DELAYED_WORK(&chan->monitor_timer, l2cap_monitor_timeout); + INIT_DELAYED_WORK(&chan->ack_timer, l2cap_ack_timeout); chan->state = BT_OPEN; @@ -3320,10 +3326,6 @@ int l2cap_ertm_init(struct l2cap_chan *chan) chan->rx_state = L2CAP_RX_STATE_RECV; chan->tx_state = L2CAP_TX_STATE_XMIT; - INIT_DELAYED_WORK(&chan->retrans_timer, l2cap_retrans_timeout); - INIT_DELAYED_WORK(&chan->monitor_timer, l2cap_monitor_timeout); - INIT_DELAYED_WORK(&chan->ack_timer, l2cap_ack_timeout); - skb_queue_head_init(&chan->srej_q); err = l2cap_seq_list_init(&chan->srej_list, chan->tx_win); @@ -4307,6 +4309,12 @@ static int l2cap_connect_create_rsp(struct l2cap_conn *conn, } } + chan = l2cap_chan_hold_unless_zero(chan); + if (!chan) { + err = -EBADSLT; + goto unlock; + } + err = 0; l2cap_chan_lock(chan); @@ -4336,6 +4344,7 @@ static int l2cap_connect_create_rsp(struct l2cap_conn *conn, } l2cap_chan_unlock(chan); + l2cap_chan_put(chan); unlock: mutex_unlock(&conn->chan_lock); diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c index 72e6595a71cc..a92e7e485feb 100644 --- a/net/bluetooth/mgmt.c +++ b/net/bluetooth/mgmt.c @@ -129,6 +129,10 @@ static const u16 mgmt_commands[] = { MGMT_OP_ADD_EXT_ADV_PARAMS, MGMT_OP_ADD_EXT_ADV_DATA, MGMT_OP_ADD_ADV_PATTERNS_MONITOR_RSSI, + MGMT_OP_SET_MESH_RECEIVER, + MGMT_OP_MESH_READ_FEATURES, + MGMT_OP_MESH_SEND, + MGMT_OP_MESH_SEND_CANCEL, }; static const u16 mgmt_events[] = { @@ -1048,9 +1052,66 @@ static void discov_off(struct work_struct *work) hci_dev_unlock(hdev); } +static int send_settings_rsp(struct sock *sk, u16 opcode, struct hci_dev *hdev); + +static void mesh_send_complete(struct hci_dev *hdev, + struct mgmt_mesh_tx *mesh_tx, bool silent) +{ + u8 handle = mesh_tx->handle; + + if (!silent) + mgmt_event(MGMT_EV_MESH_PACKET_CMPLT, hdev, &handle, + sizeof(handle), NULL); + + mgmt_mesh_remove(mesh_tx); +} + +static int mesh_send_done_sync(struct hci_dev *hdev, void *data) +{ + struct mgmt_mesh_tx *mesh_tx; + + hci_dev_clear_flag(hdev, HCI_MESH_SENDING); + hci_disable_advertising_sync(hdev); + mesh_tx = mgmt_mesh_next(hdev, NULL); + + if (mesh_tx) + mesh_send_complete(hdev, mesh_tx, false); + + return 0; +} + +static int mesh_send_sync(struct hci_dev *hdev, void *data); +static void mesh_send_start_complete(struct hci_dev *hdev, void *data, int err); +static void mesh_next(struct hci_dev *hdev, void *data, int err) +{ + struct mgmt_mesh_tx *mesh_tx = mgmt_mesh_next(hdev, NULL); + + if (!mesh_tx) + return; + + err = hci_cmd_sync_queue(hdev, mesh_send_sync, mesh_tx, + mesh_send_start_complete); + + if (err < 0) + mesh_send_complete(hdev, mesh_tx, false); + else + hci_dev_set_flag(hdev, HCI_MESH_SENDING); +} + +static void mesh_send_done(struct work_struct *work) +{ + struct hci_dev *hdev = container_of(work, struct hci_dev, + mesh_send_done.work); + + if (!hci_dev_test_flag(hdev, HCI_MESH_SENDING)) + return; + + hci_cmd_sync_queue(hdev, mesh_send_done_sync, NULL, mesh_next); +} + static void mgmt_init_hdev(struct sock *sk, struct hci_dev *hdev) { - if (hci_dev_test_and_set_flag(hdev, HCI_MGMT)) + if (hci_dev_test_flag(hdev, HCI_MGMT)) return; BT_INFO("MGMT ver %d.%d", MGMT_VERSION, MGMT_REVISION); @@ -1058,6 +1119,7 @@ static void mgmt_init_hdev(struct sock *sk, struct hci_dev *hdev) INIT_DELAYED_WORK(&hdev->discov_off, discov_off); INIT_DELAYED_WORK(&hdev->service_cache, service_cache_off); INIT_DELAYED_WORK(&hdev->rpa_expired, rpa_expired); + INIT_DELAYED_WORK(&hdev->mesh_send_done, mesh_send_done); /* Non-mgmt controlled devices get this bit set * implicitly so that pairing works for them, however @@ -1065,6 +1127,8 @@ static void mgmt_init_hdev(struct sock *sk, struct hci_dev *hdev) * it */ hci_dev_clear_flag(hdev, HCI_BONDABLE); + + hci_dev_set_flag(hdev, HCI_MGMT); } static int read_controller_info(struct sock *sk, struct hci_dev *hdev, @@ -2058,6 +2122,8 @@ static int set_le_sync(struct hci_dev *hdev, void *data) int err; if (!val) { + hci_clear_adv_instance_sync(hdev, NULL, 0x00, true); + if (hci_dev_test_flag(hdev, HCI_LE_ADV)) hci_disable_advertising_sync(hdev); @@ -2092,6 +2158,317 @@ static int set_le_sync(struct hci_dev *hdev, void *data) return err; } +static void set_mesh_complete(struct hci_dev *hdev, void *data, int err) +{ + struct mgmt_pending_cmd *cmd = data; + u8 status = mgmt_status(err); + struct sock *sk = cmd->sk; + + if (status) { + mgmt_pending_foreach(MGMT_OP_SET_MESH_RECEIVER, hdev, + cmd_status_rsp, &status); + return; + } + + mgmt_pending_remove(cmd); + mgmt_cmd_complete(sk, hdev->id, MGMT_OP_SET_MESH_RECEIVER, 0, NULL, 0); +} + +static int set_mesh_sync(struct hci_dev *hdev, void *data) +{ + struct mgmt_pending_cmd *cmd = data; + struct mgmt_cp_set_mesh *cp = cmd->param; + size_t len = cmd->param_len; + + memset(hdev->mesh_ad_types, 0, sizeof(hdev->mesh_ad_types)); + + if (cp->enable) + hci_dev_set_flag(hdev, HCI_MESH); + else + hci_dev_clear_flag(hdev, HCI_MESH); + + len -= sizeof(*cp); + + /* If filters don't fit, forward all adv pkts */ + if (len <= sizeof(hdev->mesh_ad_types)) + memcpy(hdev->mesh_ad_types, cp->ad_types, len); + + hci_update_passive_scan_sync(hdev); + return 0; +} + +static int set_mesh(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) +{ + struct mgmt_cp_set_mesh *cp = data; + struct mgmt_pending_cmd *cmd; + int err = 0; + + bt_dev_dbg(hdev, "sock %p", sk); + + if (!lmp_le_capable(hdev) || + !hci_dev_test_flag(hdev, HCI_MESH_EXPERIMENTAL)) + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_MESH_RECEIVER, + MGMT_STATUS_NOT_SUPPORTED); + + if (cp->enable != 0x00 && cp->enable != 0x01) + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_MESH_RECEIVER, + MGMT_STATUS_INVALID_PARAMS); + + hci_dev_lock(hdev); + + cmd = mgmt_pending_add(sk, MGMT_OP_SET_MESH_RECEIVER, hdev, data, len); + if (!cmd) + err = -ENOMEM; + else + err = hci_cmd_sync_queue(hdev, set_mesh_sync, cmd, + set_mesh_complete); + + if (err < 0) { + err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_MESH_RECEIVER, + MGMT_STATUS_FAILED); + + if (cmd) + mgmt_pending_remove(cmd); + } + + hci_dev_unlock(hdev); + return err; +} + +static void mesh_send_start_complete(struct hci_dev *hdev, void *data, int err) +{ + struct mgmt_mesh_tx *mesh_tx = data; + struct mgmt_cp_mesh_send *send = (void *)mesh_tx->param; + unsigned long mesh_send_interval; + u8 mgmt_err = mgmt_status(err); + + /* Report any errors here, but don't report completion */ + + if (mgmt_err) { + hci_dev_clear_flag(hdev, HCI_MESH_SENDING); + /* Send Complete Error Code for handle */ + mesh_send_complete(hdev, mesh_tx, false); + return; + } + + mesh_send_interval = msecs_to_jiffies((send->cnt) * 25); + queue_delayed_work(hdev->req_workqueue, &hdev->mesh_send_done, + mesh_send_interval); +} + +static int mesh_send_sync(struct hci_dev *hdev, void *data) +{ + struct mgmt_mesh_tx *mesh_tx = data; + struct mgmt_cp_mesh_send *send = (void *)mesh_tx->param; + struct adv_info *adv, *next_instance; + u8 instance = hdev->le_num_of_adv_sets + 1; + u16 timeout, duration; + int err = 0; + + if (hdev->le_num_of_adv_sets <= hdev->adv_instance_cnt) + return MGMT_STATUS_BUSY; + + timeout = 1000; + duration = send->cnt * INTERVAL_TO_MS(hdev->le_adv_max_interval); + adv = hci_add_adv_instance(hdev, instance, 0, + send->adv_data_len, send->adv_data, + 0, NULL, + timeout, duration, + HCI_ADV_TX_POWER_NO_PREFERENCE, + hdev->le_adv_min_interval, + hdev->le_adv_max_interval, + mesh_tx->handle); + + if (!IS_ERR(adv)) + mesh_tx->instance = instance; + else + err = PTR_ERR(adv); + + if (hdev->cur_adv_instance == instance) { + /* If the currently advertised instance is being changed then + * cancel the current advertising and schedule the next + * instance. If there is only one instance then the overridden + * advertising data will be visible right away. + */ + cancel_adv_timeout(hdev); + + next_instance = hci_get_next_instance(hdev, instance); + if (next_instance) + instance = next_instance->instance; + else + instance = 0; + } else if (hdev->adv_instance_timeout) { + /* Immediately advertise the new instance if no other, or + * let it go naturally from queue if ADV is already happening + */ + instance = 0; + } + + if (instance) + return hci_schedule_adv_instance_sync(hdev, instance, true); + + return err; +} + +static void send_count(struct mgmt_mesh_tx *mesh_tx, void *data) +{ + struct mgmt_rp_mesh_read_features *rp = data; + + if (rp->used_handles >= rp->max_handles) + return; + + rp->handles[rp->used_handles++] = mesh_tx->handle; +} + +static int mesh_features(struct sock *sk, struct hci_dev *hdev, + void *data, u16 len) +{ + struct mgmt_rp_mesh_read_features rp; + + if (!lmp_le_capable(hdev) || + !hci_dev_test_flag(hdev, HCI_MESH_EXPERIMENTAL)) + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_MESH_READ_FEATURES, + MGMT_STATUS_NOT_SUPPORTED); + + memset(&rp, 0, sizeof(rp)); + rp.index = cpu_to_le16(hdev->id); + if (hci_dev_test_flag(hdev, HCI_LE_ENABLED)) + rp.max_handles = MESH_HANDLES_MAX; + + hci_dev_lock(hdev); + + if (rp.max_handles) + mgmt_mesh_foreach(hdev, send_count, &rp, sk); + + mgmt_cmd_complete(sk, hdev->id, MGMT_OP_MESH_READ_FEATURES, 0, &rp, + rp.used_handles + sizeof(rp) - MESH_HANDLES_MAX); + + hci_dev_unlock(hdev); + return 0; +} + +static int send_cancel(struct hci_dev *hdev, void *data) +{ + struct mgmt_pending_cmd *cmd = data; + struct mgmt_cp_mesh_send_cancel *cancel = (void *)cmd->param; + struct mgmt_mesh_tx *mesh_tx; + + if (!cancel->handle) { + do { + mesh_tx = mgmt_mesh_next(hdev, cmd->sk); + + if (mesh_tx) + mesh_send_complete(hdev, mesh_tx, false); + } while (mesh_tx); + } else { + mesh_tx = mgmt_mesh_find(hdev, cancel->handle); + + if (mesh_tx && mesh_tx->sk == cmd->sk) + mesh_send_complete(hdev, mesh_tx, false); + } + + mgmt_cmd_complete(cmd->sk, hdev->id, MGMT_OP_MESH_SEND_CANCEL, + 0, NULL, 0); + mgmt_pending_free(cmd); + + return 0; +} + +static int mesh_send_cancel(struct sock *sk, struct hci_dev *hdev, + void *data, u16 len) +{ + struct mgmt_pending_cmd *cmd; + int err; + + if (!lmp_le_capable(hdev) || + !hci_dev_test_flag(hdev, HCI_MESH_EXPERIMENTAL)) + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_MESH_SEND_CANCEL, + MGMT_STATUS_NOT_SUPPORTED); + + if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED)) + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_MESH_SEND_CANCEL, + MGMT_STATUS_REJECTED); + + hci_dev_lock(hdev); + cmd = mgmt_pending_new(sk, MGMT_OP_MESH_SEND_CANCEL, hdev, data, len); + if (!cmd) + err = -ENOMEM; + else + err = hci_cmd_sync_queue(hdev, send_cancel, cmd, NULL); + + if (err < 0) { + err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_MESH_SEND_CANCEL, + MGMT_STATUS_FAILED); + + if (cmd) + mgmt_pending_free(cmd); + } + + hci_dev_unlock(hdev); + return err; +} + +static int mesh_send(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) +{ + struct mgmt_mesh_tx *mesh_tx; + struct mgmt_cp_mesh_send *send = data; + struct mgmt_rp_mesh_read_features rp; + bool sending; + int err = 0; + + if (!lmp_le_capable(hdev) || + !hci_dev_test_flag(hdev, HCI_MESH_EXPERIMENTAL)) + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_MESH_SEND, + MGMT_STATUS_NOT_SUPPORTED); + if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED) || + len <= MGMT_MESH_SEND_SIZE || + len > (MGMT_MESH_SEND_SIZE + 31)) + return mgmt_cmd_status(sk, hdev->id, MGMT_OP_MESH_SEND, + MGMT_STATUS_REJECTED); + + hci_dev_lock(hdev); + + memset(&rp, 0, sizeof(rp)); + rp.max_handles = MESH_HANDLES_MAX; + + mgmt_mesh_foreach(hdev, send_count, &rp, sk); + + if (rp.max_handles <= rp.used_handles) { + err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_MESH_SEND, + MGMT_STATUS_BUSY); + goto done; + } + + sending = hci_dev_test_flag(hdev, HCI_MESH_SENDING); + mesh_tx = mgmt_mesh_add(sk, hdev, send, len); + + if (!mesh_tx) + err = -ENOMEM; + else if (!sending) + err = hci_cmd_sync_queue(hdev, mesh_send_sync, mesh_tx, + mesh_send_start_complete); + + if (err < 0) { + bt_dev_err(hdev, "Send Mesh Failed %d", err); + err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_MESH_SEND, + MGMT_STATUS_FAILED); + + if (mesh_tx) { + if (sending) + mgmt_mesh_remove(mesh_tx); + } + } else { + hci_dev_set_flag(hdev, HCI_MESH_SENDING); + + mgmt_cmd_complete(sk, hdev->id, MGMT_OP_MESH_SEND, 0, + &mesh_tx->handle, 1); + } + +done: + hci_dev_unlock(hdev); + return err; +} + static int set_le(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) { struct mgmt_mode *cp = data; @@ -2131,9 +2508,6 @@ static int set_le(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) val = !!cp->val; enabled = lmp_host_le_capable(hdev); - if (!val) - hci_req_clear_adv_instance(hdev, NULL, NULL, 0x00, true); - if (!hdev_is_powered(hdev) || val == enabled) { bool changed = false; @@ -3186,6 +3560,18 @@ unlock: return err; } +static int abort_conn_sync(struct hci_dev *hdev, void *data) +{ + struct hci_conn *conn; + u16 handle = PTR_ERR(data); + + conn = hci_conn_hash_lookup_handle(hdev, handle); + if (!conn) + return 0; + + return hci_abort_conn_sync(hdev, conn, HCI_ERROR_REMOTE_USER_TERM); +} + static int cancel_pair_device(struct sock *sk, struct hci_dev *hdev, void *data, u16 len) { @@ -3236,7 +3622,8 @@ static int cancel_pair_device(struct sock *sk, struct hci_dev *hdev, void *data, le_addr_type(addr->type)); if (conn->conn_reason == CONN_REASON_PAIR_DEVICE) - hci_abort_conn(conn, HCI_ERROR_REMOTE_USER_TERM); + hci_cmd_sync_queue(hdev, abort_conn_sync, ERR_PTR(conn->handle), + NULL); unlock: hci_dev_unlock(hdev); @@ -3991,17 +4378,28 @@ static const u8 iso_socket_uuid[16] = { 0x6a, 0x49, 0xe0, 0x05, 0x88, 0xf1, 0xba, 0x6f, }; +/* 2ce463d7-7a03-4d8d-bf05-5f24e8f36e76 */ +static const u8 mgmt_mesh_uuid[16] = { + 0x76, 0x6e, 0xf3, 0xe8, 0x24, 0x5f, 0x05, 0xbf, + 0x8d, 0x4d, 0x03, 0x7a, 0xd7, 0x63, 0xe4, 0x2c, +}; + static int read_exp_features_info(struct sock *sk, struct hci_dev *hdev, void *data, u16 data_len) { - char buf[122]; /* Enough space for 6 features: 2 + 20 * 6 */ - struct mgmt_rp_read_exp_features_info *rp = (void *)buf; + struct mgmt_rp_read_exp_features_info *rp; + size_t len; u16 idx = 0; u32 flags; + int status; bt_dev_dbg(hdev, "sock %p", sk); - memset(&buf, 0, sizeof(buf)); + /* Enough space for 7 features */ + len = sizeof(*rp) + (sizeof(rp->features[0]) * 7); + rp = kzalloc(len, GFP_KERNEL); + if (!rp) + return -ENOMEM; #ifdef CONFIG_BT_FEATURE_DEBUG if (!hdev) { @@ -4065,6 +4463,17 @@ static int read_exp_features_info(struct sock *sk, struct hci_dev *hdev, idx++; } + if (hdev && lmp_le_capable(hdev)) { + if (hci_dev_test_flag(hdev, HCI_MESH_EXPERIMENTAL)) + flags = BIT(0); + else + flags = 0; + + memcpy(rp->features[idx].uuid, mgmt_mesh_uuid, 16); + rp->features[idx].flags = cpu_to_le32(flags); + idx++; + } + rp->feature_count = cpu_to_le16(idx); /* After reading the experimental features information, enable @@ -4072,9 +4481,12 @@ static int read_exp_features_info(struct sock *sk, struct hci_dev *hdev, */ hci_sock_set_flag(sk, HCI_MGMT_EXP_FEATURE_EVENTS); - return mgmt_cmd_complete(sk, hdev ? hdev->id : MGMT_INDEX_NONE, - MGMT_OP_READ_EXP_FEATURES_INFO, - 0, rp, sizeof(*rp) + (20 * idx)); + status = mgmt_cmd_complete(sk, hdev ? hdev->id : MGMT_INDEX_NONE, + MGMT_OP_READ_EXP_FEATURES_INFO, + 0, rp, sizeof(*rp) + (20 * idx)); + + kfree(rp); + return status; } static int exp_ll_privacy_feature_changed(bool enabled, struct hci_dev *hdev, @@ -4202,6 +4614,63 @@ static int set_debug_func(struct sock *sk, struct hci_dev *hdev, } #endif +static int set_mgmt_mesh_func(struct sock *sk, struct hci_dev *hdev, + struct mgmt_cp_set_exp_feature *cp, u16 data_len) +{ + struct mgmt_rp_set_exp_feature rp; + bool val, changed; + int err; + + /* Command requires to use the controller index */ + if (!hdev) + return mgmt_cmd_status(sk, MGMT_INDEX_NONE, + MGMT_OP_SET_EXP_FEATURE, + MGMT_STATUS_INVALID_INDEX); + + /* Changes can only be made when controller is powered down */ + if (hdev_is_powered(hdev)) + return mgmt_cmd_status(sk, hdev->id, + MGMT_OP_SET_EXP_FEATURE, + MGMT_STATUS_REJECTED); + + /* Parameters are limited to a single octet */ + if (data_len != MGMT_SET_EXP_FEATURE_SIZE + 1) + return mgmt_cmd_status(sk, hdev->id, + MGMT_OP_SET_EXP_FEATURE, + MGMT_STATUS_INVALID_PARAMS); + + /* Only boolean on/off is supported */ + if (cp->param[0] != 0x00 && cp->param[0] != 0x01) + return mgmt_cmd_status(sk, hdev->id, + MGMT_OP_SET_EXP_FEATURE, + MGMT_STATUS_INVALID_PARAMS); + + val = !!cp->param[0]; + + if (val) { + changed = !hci_dev_test_and_set_flag(hdev, + HCI_MESH_EXPERIMENTAL); + } else { + hci_dev_clear_flag(hdev, HCI_MESH); + changed = hci_dev_test_and_clear_flag(hdev, + HCI_MESH_EXPERIMENTAL); + } + + memcpy(rp.uuid, mgmt_mesh_uuid, 16); + rp.flags = cpu_to_le32(val ? BIT(0) : 0); + + hci_sock_set_flag(sk, HCI_MGMT_EXP_FEATURE_EVENTS); + + err = mgmt_cmd_complete(sk, hdev->id, + MGMT_OP_SET_EXP_FEATURE, 0, + &rp, sizeof(rp)); + + if (changed) + exp_feature_changed(hdev, mgmt_mesh_uuid, val, sk); + + return err; +} + static int set_rpa_resolution_func(struct sock *sk, struct hci_dev *hdev, struct mgmt_cp_set_exp_feature *cp, u16 data_len) @@ -4517,6 +4986,7 @@ static const struct mgmt_exp_feature { #ifdef CONFIG_BT_FEATURE_DEBUG EXP_FEAT(debug_uuid, set_debug_func), #endif + EXP_FEAT(mgmt_mesh_uuid, set_mgmt_mesh_func), EXP_FEAT(rpa_resolution_uuid, set_rpa_resolution_func), EXP_FEAT(quality_report_uuid, set_quality_report_func), EXP_FEAT(offload_codecs_uuid, set_offload_codec_func), @@ -5981,6 +6451,7 @@ static int set_advertising(struct sock *sk, struct hci_dev *hdev, void *data, if (!hdev_is_powered(hdev) || (val == hci_dev_test_flag(hdev, HCI_ADVERTISING) && (cp->val == 0x02) == hci_dev_test_flag(hdev, HCI_ADVERTISING_CONNECTABLE)) || + hci_dev_test_flag(hdev, HCI_MESH) || hci_conn_num(hdev, LE_LINK) > 0 || (hci_dev_test_flag(hdev, HCI_LE_SCAN) && hdev->le_scan_type == LE_SCAN_ACTIVE)) { @@ -7909,8 +8380,7 @@ static u32 get_supported_adv_flags(struct hci_dev *hdev) /* In extended adv TX_POWER returned from Set Adv Param * will be always valid. */ - if ((hdev->adv_tx_power != HCI_TX_POWER_INVALID) || - ext_adv_capable(hdev)) + if (hdev->adv_tx_power != HCI_TX_POWER_INVALID || ext_adv_capable(hdev)) flags |= MGMT_ADV_FLAG_TX_POWER; if (ext_adv_capable(hdev)) { @@ -7963,8 +8433,14 @@ static int read_adv_features(struct sock *sk, struct hci_dev *hdev, instance = rp->instance; list_for_each_entry(adv_instance, &hdev->adv_instances, list) { - *instance = adv_instance->instance; - instance++; + /* Only instances 1-le_num_of_adv_sets are externally visible */ + if (adv_instance->instance <= hdev->adv_instance_cnt) { + *instance = adv_instance->instance; + instance++; + } else { + rp->num_instances--; + rp_len--; + } } hci_dev_unlock(hdev); @@ -8226,7 +8702,7 @@ static int add_advertising(struct sock *sk, struct hci_dev *hdev, timeout, duration, HCI_ADV_TX_POWER_NO_PREFERENCE, hdev->le_adv_min_interval, - hdev->le_adv_max_interval); + hdev->le_adv_max_interval, 0); if (IS_ERR(adv)) { err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_ADVERTISING, MGMT_STATUS_FAILED); @@ -8430,7 +8906,7 @@ static int add_ext_adv_params(struct sock *sk, struct hci_dev *hdev, /* Create advertising instance with no advertising or response data */ adv = hci_add_adv_instance(hdev, cp->instance, flags, 0, NULL, 0, NULL, timeout, duration, tx_power, min_interval, - max_interval); + max_interval, 0); if (IS_ERR(adv)) { err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_EXT_ADV_PARAMS, @@ -8876,8 +9352,13 @@ static const struct hci_mgmt_handler mgmt_handlers[] = { { add_ext_adv_data, MGMT_ADD_EXT_ADV_DATA_SIZE, HCI_MGMT_VAR_LEN }, { add_adv_patterns_monitor_rssi, - MGMT_ADD_ADV_PATTERNS_MONITOR_RSSI_SIZE, + MGMT_ADD_ADV_PATTERNS_MONITOR_RSSI_SIZE }, + { set_mesh, MGMT_SET_MESH_RECEIVER_SIZE, + HCI_MGMT_VAR_LEN }, + { mesh_features, MGMT_MESH_READ_FEATURES_SIZE }, + { mesh_send, MGMT_MESH_SEND_SIZE, HCI_MGMT_VAR_LEN }, + { mesh_send_cancel, MGMT_MESH_SEND_CANCEL_SIZE }, }; void mgmt_index_added(struct hci_dev *hdev) @@ -9817,14 +10298,86 @@ static void mgmt_adv_monitor_device_found(struct hci_dev *hdev, kfree_skb(skb); } +static void mesh_device_found(struct hci_dev *hdev, bdaddr_t *bdaddr, + u8 addr_type, s8 rssi, u32 flags, u8 *eir, + u16 eir_len, u8 *scan_rsp, u8 scan_rsp_len, + u64 instant) +{ + struct sk_buff *skb; + struct mgmt_ev_mesh_device_found *ev; + int i, j; + + if (!hdev->mesh_ad_types[0]) + goto accepted; + + /* Scan for requested AD types */ + if (eir_len > 0) { + for (i = 0; i + 1 < eir_len; i += eir[i] + 1) { + for (j = 0; j < sizeof(hdev->mesh_ad_types); j++) { + if (!hdev->mesh_ad_types[j]) + break; + + if (hdev->mesh_ad_types[j] == eir[i + 1]) + goto accepted; + } + } + } + + if (scan_rsp_len > 0) { + for (i = 0; i + 1 < scan_rsp_len; i += scan_rsp[i] + 1) { + for (j = 0; j < sizeof(hdev->mesh_ad_types); j++) { + if (!hdev->mesh_ad_types[j]) + break; + + if (hdev->mesh_ad_types[j] == scan_rsp[i + 1]) + goto accepted; + } + } + } + + return; + +accepted: + skb = mgmt_alloc_skb(hdev, MGMT_EV_MESH_DEVICE_FOUND, + sizeof(*ev) + eir_len + scan_rsp_len); + if (!skb) + return; + + ev = skb_put(skb, sizeof(*ev)); + + bacpy(&ev->addr.bdaddr, bdaddr); + ev->addr.type = link_to_bdaddr(LE_LINK, addr_type); + ev->rssi = rssi; + ev->flags = cpu_to_le32(flags); + ev->instant = cpu_to_le64(instant); + + if (eir_len > 0) + /* Copy EIR or advertising data into event */ + skb_put_data(skb, eir, eir_len); + + if (scan_rsp_len > 0) + /* Append scan response data to event */ + skb_put_data(skb, scan_rsp, scan_rsp_len); + + ev->eir_len = cpu_to_le16(eir_len + scan_rsp_len); + + mgmt_event_skb(skb, NULL); +} + void mgmt_device_found(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type, u8 addr_type, u8 *dev_class, s8 rssi, u32 flags, - u8 *eir, u16 eir_len, u8 *scan_rsp, u8 scan_rsp_len) + u8 *eir, u16 eir_len, u8 *scan_rsp, u8 scan_rsp_len, + u64 instant) { struct sk_buff *skb; struct mgmt_ev_device_found *ev; bool report_device = hci_discovery_active(hdev); + if (hci_dev_test_flag(hdev, HCI_MESH) && link_type == LE_LINK) + mesh_device_found(hdev, bdaddr, addr_type, rssi, flags, + eir, eir_len, scan_rsp, scan_rsp_len, + instant); + /* Don't send events for a non-kernel initiated discovery. With * LE one exception is if we have pend_le_reports > 0 in which * case we're doing passive scanning and want these events. @@ -9983,3 +10536,22 @@ void mgmt_exit(void) { hci_mgmt_chan_unregister(&chan); } + +void mgmt_cleanup(struct sock *sk) +{ + struct mgmt_mesh_tx *mesh_tx; + struct hci_dev *hdev; + + read_lock(&hci_dev_list_lock); + + list_for_each_entry(hdev, &hci_dev_list, list) { + do { + mesh_tx = mgmt_mesh_next(hdev, sk); + + if (mesh_tx) + mesh_send_complete(hdev, mesh_tx, true); + } while (mesh_tx); + } + + read_unlock(&hci_dev_list_lock); +} diff --git a/net/bluetooth/mgmt_util.c b/net/bluetooth/mgmt_util.c index b69cfed62088..0115f783bde8 100644 --- a/net/bluetooth/mgmt_util.c +++ b/net/bluetooth/mgmt_util.c @@ -314,3 +314,77 @@ void mgmt_pending_remove(struct mgmt_pending_cmd *cmd) list_del(&cmd->list); mgmt_pending_free(cmd); } + +void mgmt_mesh_foreach(struct hci_dev *hdev, + void (*cb)(struct mgmt_mesh_tx *mesh_tx, void *data), + void *data, struct sock *sk) +{ + struct mgmt_mesh_tx *mesh_tx, *tmp; + + list_for_each_entry_safe(mesh_tx, tmp, &hdev->mgmt_pending, list) { + if (!sk || mesh_tx->sk == sk) + cb(mesh_tx, data); + } +} + +struct mgmt_mesh_tx *mgmt_mesh_next(struct hci_dev *hdev, struct sock *sk) +{ + struct mgmt_mesh_tx *mesh_tx; + + if (list_empty(&hdev->mesh_pending)) + return NULL; + + list_for_each_entry(mesh_tx, &hdev->mesh_pending, list) { + if (!sk || mesh_tx->sk == sk) + return mesh_tx; + } + + return NULL; +} + +struct mgmt_mesh_tx *mgmt_mesh_find(struct hci_dev *hdev, u8 handle) +{ + struct mgmt_mesh_tx *mesh_tx; + + if (list_empty(&hdev->mesh_pending)) + return NULL; + + list_for_each_entry(mesh_tx, &hdev->mesh_pending, list) { + if (mesh_tx->handle == handle) + return mesh_tx; + } + + return NULL; +} + +struct mgmt_mesh_tx *mgmt_mesh_add(struct sock *sk, struct hci_dev *hdev, + void *data, u16 len) +{ + struct mgmt_mesh_tx *mesh_tx; + + mesh_tx = kzalloc(sizeof(*mesh_tx), GFP_KERNEL); + if (!mesh_tx) + return NULL; + + hdev->mesh_send_ref++; + if (!hdev->mesh_send_ref) + hdev->mesh_send_ref++; + + mesh_tx->handle = hdev->mesh_send_ref; + mesh_tx->index = hdev->id; + memcpy(mesh_tx->param, data, len); + mesh_tx->param_len = len; + mesh_tx->sk = sk; + sock_hold(sk); + + list_add_tail(&mesh_tx->list, &hdev->mesh_pending); + + return mesh_tx; +} + +void mgmt_mesh_remove(struct mgmt_mesh_tx *mesh_tx) +{ + list_del(&mesh_tx->list); + sock_put(mesh_tx->sk); + kfree(mesh_tx); +} diff --git a/net/bluetooth/mgmt_util.h b/net/bluetooth/mgmt_util.h index 98e40395a383..6a8b7e84293d 100644 --- a/net/bluetooth/mgmt_util.h +++ b/net/bluetooth/mgmt_util.h @@ -20,6 +20,16 @@ SOFTWARE IS DISCLAIMED. */ +struct mgmt_mesh_tx { + struct list_head list; + int index; + size_t param_len; + struct sock *sk; + u8 handle; + u8 instance; + u8 param[sizeof(struct mgmt_cp_mesh_send) + 29]; +}; + struct mgmt_pending_cmd { struct list_head list; u16 opcode; @@ -59,3 +69,11 @@ struct mgmt_pending_cmd *mgmt_pending_new(struct sock *sk, u16 opcode, void *data, u16 len); void mgmt_pending_free(struct mgmt_pending_cmd *cmd); void mgmt_pending_remove(struct mgmt_pending_cmd *cmd); +void mgmt_mesh_foreach(struct hci_dev *hdev, + void (*cb)(struct mgmt_mesh_tx *mesh_tx, void *data), + void *data, struct sock *sk); +struct mgmt_mesh_tx *mgmt_mesh_find(struct hci_dev *hdev, u8 handle); +struct mgmt_mesh_tx *mgmt_mesh_next(struct hci_dev *hdev, struct sock *sk); +struct mgmt_mesh_tx *mgmt_mesh_add(struct sock *sk, struct hci_dev *hdev, + void *data, u16 len); +void mgmt_mesh_remove(struct mgmt_mesh_tx *mesh_tx); diff --git a/net/bluetooth/rfcomm/sock.c b/net/bluetooth/rfcomm/sock.c index 4bf4ea6cbb5e..21e24da4847f 100644 --- a/net/bluetooth/rfcomm/sock.c +++ b/net/bluetooth/rfcomm/sock.c @@ -902,7 +902,10 @@ static int rfcomm_sock_shutdown(struct socket *sock, int how) lock_sock(sk); if (!sk->sk_shutdown) { sk->sk_shutdown = SHUTDOWN_MASK; + + release_sock(sk); __rfcomm_sock_close(sk); + lock_sock(sk); if (sock_flag(sk, SOCK_LINGER) && sk->sk_lingertime && !(current->flags & PF_EXITING)) diff --git a/net/bluetooth/rfcomm/tty.c b/net/bluetooth/rfcomm/tty.c index ebd78fdbd6e8..8009e0e93216 100644 --- a/net/bluetooth/rfcomm/tty.c +++ b/net/bluetooth/rfcomm/tty.c @@ -35,7 +35,6 @@ #include <net/bluetooth/hci_core.h> #include <net/bluetooth/rfcomm.h> -#define RFCOMM_TTY_MAGIC 0x6d02 /* magic number for rfcomm struct */ #define RFCOMM_TTY_PORTS RFCOMM_MAX_DEV /* whole lotta rfcomm devices */ #define RFCOMM_TTY_MAJOR 216 /* device node major id of the usb/bluetooth.c driver */ #define RFCOMM_TTY_MINOR 0 @@ -855,7 +854,8 @@ static int rfcomm_tty_ioctl(struct tty_struct *tty, unsigned int cmd, unsigned l return -ENOIOCTLCMD; } -static void rfcomm_tty_set_termios(struct tty_struct *tty, struct ktermios *old) +static void rfcomm_tty_set_termios(struct tty_struct *tty, + const struct ktermios *old) { struct ktermios *new = &tty->termios; int old_baud_rate = tty_termios_baud_rate(old); diff --git a/net/bpf/test_run.c b/net/bpf/test_run.c index 25d8ecf105aa..13d578ce2a09 100644 --- a/net/bpf/test_run.c +++ b/net/bpf/test_run.c @@ -606,6 +606,38 @@ noinline void bpf_kfunc_call_memb1_release(struct prog_test_member1 *p) WARN_ON_ONCE(1); } +static int *__bpf_kfunc_call_test_get_mem(struct prog_test_ref_kfunc *p, const int size) +{ + if (size > 2 * sizeof(int)) + return NULL; + + return (int *)p; +} + +noinline int *bpf_kfunc_call_test_get_rdwr_mem(struct prog_test_ref_kfunc *p, const int rdwr_buf_size) +{ + return __bpf_kfunc_call_test_get_mem(p, rdwr_buf_size); +} + +noinline int *bpf_kfunc_call_test_get_rdonly_mem(struct prog_test_ref_kfunc *p, const int rdonly_buf_size) +{ + return __bpf_kfunc_call_test_get_mem(p, rdonly_buf_size); +} + +/* the next 2 ones can't be really used for testing expect to ensure + * that the verifier rejects the call. + * Acquire functions must return struct pointers, so these ones are + * failing. + */ +noinline int *bpf_kfunc_call_test_acq_rdonly_mem(struct prog_test_ref_kfunc *p, const int rdonly_buf_size) +{ + return __bpf_kfunc_call_test_get_mem(p, rdonly_buf_size); +} + +noinline void bpf_kfunc_call_int_mem_release(int *p) +{ +} + noinline struct prog_test_ref_kfunc * bpf_kfunc_call_test_kptr_get(struct prog_test_ref_kfunc **pp, int a, int b) { @@ -712,6 +744,10 @@ BTF_ID_FLAGS(func, bpf_kfunc_call_memb_acquire, KF_ACQUIRE | KF_RET_NULL) BTF_ID_FLAGS(func, bpf_kfunc_call_test_release, KF_RELEASE) BTF_ID_FLAGS(func, bpf_kfunc_call_memb_release, KF_RELEASE) BTF_ID_FLAGS(func, bpf_kfunc_call_memb1_release, KF_RELEASE) +BTF_ID_FLAGS(func, bpf_kfunc_call_test_get_rdwr_mem, KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_kfunc_call_test_get_rdonly_mem, KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_kfunc_call_test_acq_rdonly_mem, KF_ACQUIRE | KF_RET_NULL) +BTF_ID_FLAGS(func, bpf_kfunc_call_int_mem_release, KF_RELEASE) BTF_ID_FLAGS(func, bpf_kfunc_call_test_kptr_get, KF_ACQUIRE | KF_RET_NULL | KF_KPTR_GET) BTF_ID_FLAGS(func, bpf_kfunc_call_test_pass_ctx) BTF_ID_FLAGS(func, bpf_kfunc_call_test_pass1) @@ -1634,6 +1670,7 @@ static int __init bpf_prog_test_run_init(void) ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &bpf_prog_test_kfunc_set); ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, &bpf_prog_test_kfunc_set); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, &bpf_prog_test_kfunc_set); return ret ?: register_btf_id_dtor_kfuncs(bpf_prog_test_dtor_kfunc, ARRAY_SIZE(bpf_prog_test_dtor_kfunc), THIS_MODULE); diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c index efbd93e92ce2..228fd5b20f10 100644 --- a/net/bridge/br_if.c +++ b/net/bridge/br_if.c @@ -40,12 +40,21 @@ static int port_cost(struct net_device *dev) switch (ecmd.base.speed) { case SPEED_10000: return 2; - case SPEED_1000: + case SPEED_5000: + return 3; + case SPEED_2500: return 4; + case SPEED_1000: + return 5; case SPEED_100: return 19; case SPEED_10: return 100; + case SPEED_UNKNOWN: + return 100; + default: + if (ecmd.base.speed > SPEED_10000) + return 1; } } diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c index 589ff497d50c..321be94c445a 100644 --- a/net/bridge/br_mdb.c +++ b/net/bridge/br_mdb.c @@ -866,7 +866,6 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port, unsigned long now = jiffies; unsigned char flags = 0; u8 filter_mode; - int err; __mdb_entry_to_br_ip(entry, &group, mdb_attrs); @@ -892,13 +891,9 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port, return -EINVAL; } - mp = br_mdb_ip_get(br, &group); - if (!mp) { - mp = br_multicast_new_group(br, &group); - err = PTR_ERR_OR_ZERO(mp); - if (err) - return err; - } + mp = br_multicast_new_group(br, &group); + if (IS_ERR(mp)) + return PTR_ERR(mp); /* host join */ if (!port) { diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c index db4f2641d1cd..5e988f0ed2c0 100644 --- a/net/bridge/br_multicast.c +++ b/net/bridge/br_multicast.c @@ -2669,7 +2669,7 @@ static int br_ip4_multicast_igmp3_report(struct net_bridge_mcast *brmctx, if (!pmctx || igmpv2) continue; - spin_lock_bh(&brmctx->br->multicast_lock); + spin_lock(&brmctx->br->multicast_lock); if (!br_multicast_ctx_should_use(brmctx, pmctx)) goto unlock_continue; @@ -2717,7 +2717,7 @@ static int br_ip4_multicast_igmp3_report(struct net_bridge_mcast *brmctx, if (changed) br_mdb_notify(brmctx->br->dev, mdst, pg, RTM_NEWMDB); unlock_continue: - spin_unlock_bh(&brmctx->br->multicast_lock); + spin_unlock(&brmctx->br->multicast_lock); } return err; @@ -2807,7 +2807,7 @@ static int br_ip6_multicast_mld2_report(struct net_bridge_mcast *brmctx, if (!pmctx || mldv1) continue; - spin_lock_bh(&brmctx->br->multicast_lock); + spin_lock(&brmctx->br->multicast_lock); if (!br_multicast_ctx_should_use(brmctx, pmctx)) goto unlock_continue; @@ -2859,7 +2859,7 @@ static int br_ip6_multicast_mld2_report(struct net_bridge_mcast *brmctx, if (changed) br_mdb_notify(brmctx->br->dev, mdst, pg, RTM_NEWMDB); unlock_continue: - spin_unlock_bh(&brmctx->br->multicast_lock); + spin_unlock(&brmctx->br->multicast_lock); } return err; @@ -4899,9 +4899,9 @@ void br_multicast_get_stats(const struct net_bridge *br, unsigned int start; do { - start = u64_stats_fetch_begin_irq(&cpu_stats->syncp); + start = u64_stats_fetch_begin(&cpu_stats->syncp); memcpy(&temp, &cpu_stats->mstats, sizeof(temp)); - } while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, start)); + } while (u64_stats_fetch_retry(&cpu_stats->syncp, start)); mcast_stats_add_dir(tdst.igmp_v1queries, temp.igmp_v1queries); mcast_stats_add_dir(tdst.igmp_v2queries, temp.igmp_v2queries); diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index ff4779036649..f20f4373ff40 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -384,6 +384,7 @@ static int br_nf_pre_routing_finish(struct net *net, struct sock *sk, struct sk_ /* - Bridged-and-DNAT'ed traffic doesn't * require ip_forwarding. */ if (rt->dst.dev == dev) { + skb_dst_drop(skb); skb_dst_set(skb, &rt->dst); goto bridged_dnat; } @@ -413,6 +414,7 @@ bridged_dnat: kfree_skb(skb); return 0; } + skb_dst_drop(skb); skb_dst_set_noref(skb, &rt->dst); } diff --git a/net/bridge/br_netfilter_ipv6.c b/net/bridge/br_netfilter_ipv6.c index e4e0c836c3f5..6b07f30675bb 100644 --- a/net/bridge/br_netfilter_ipv6.c +++ b/net/bridge/br_netfilter_ipv6.c @@ -197,6 +197,7 @@ static int br_nf_pre_routing_finish_ipv6(struct net *net, struct sock *sk, struc kfree_skb(skb); return 0; } + skb_dst_drop(skb); skb_dst_set_noref(skb, &rt->dst); } diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c index 6e53dc991409..f2fc284abab3 100644 --- a/net/bridge/br_vlan.c +++ b/net/bridge/br_vlan.c @@ -1378,12 +1378,12 @@ void br_vlan_get_stats(const struct net_bridge_vlan *v, cpu_stats = per_cpu_ptr(v->stats, i); do { - start = u64_stats_fetch_begin_irq(&cpu_stats->syncp); + start = u64_stats_fetch_begin(&cpu_stats->syncp); rxpackets = u64_stats_read(&cpu_stats->rx_packets); rxbytes = u64_stats_read(&cpu_stats->rx_bytes); txbytes = u64_stats_read(&cpu_stats->tx_bytes); txpackets = u64_stats_read(&cpu_stats->tx_packets); - } while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, start)); + } while (u64_stats_fetch_retry(&cpu_stats->syncp, start)); u64_stats_add(&stats->rx_packets, rxpackets); u64_stats_add(&stats->rx_bytes, rxbytes); diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c index 8f6639e095a0..ce5dfa3babd2 100644 --- a/net/bridge/netfilter/ebtables.c +++ b/net/bridge/netfilter/ebtables.c @@ -1040,8 +1040,10 @@ static int do_replace_finish(struct net *net, struct ebt_replace *repl, goto free_iterate; } - if (repl->valid_hooks != t->valid_hooks) + if (repl->valid_hooks != t->valid_hooks) { + ret = -EINVAL; goto free_unlock; + } if (repl->num_counters && repl->num_counters != t->private->nentries) { ret = -EINVAL; diff --git a/net/can/af_can.c b/net/can/af_can.c index 1fb49d51b25d..9503ab10f9b8 100644 --- a/net/can/af_can.c +++ b/net/can/af_can.c @@ -199,27 +199,26 @@ static int can_create(struct net *net, struct socket *sock, int protocol, int can_send(struct sk_buff *skb, int loop) { struct sk_buff *newskb = NULL; - struct canfd_frame *cfd = (struct canfd_frame *)skb->data; struct can_pkg_stats *pkg_stats = dev_net(skb->dev)->can.pkg_stats; int err = -EINVAL; - if (skb->len == CAN_MTU) { + if (can_is_canxl_skb(skb)) { + skb->protocol = htons(ETH_P_CANXL); + } else if (can_is_can_skb(skb)) { skb->protocol = htons(ETH_P_CAN); - if (unlikely(cfd->len > CAN_MAX_DLEN)) - goto inval_skb; - } else if (skb->len == CANFD_MTU) { + } else if (can_is_canfd_skb(skb)) { + struct canfd_frame *cfd = (struct canfd_frame *)skb->data; + skb->protocol = htons(ETH_P_CANFD); - if (unlikely(cfd->len > CANFD_MAX_DLEN)) - goto inval_skb; + + /* set CAN FD flag for CAN FD frames by default */ + cfd->flags |= CANFD_FDF; } else { goto inval_skb; } - /* Make sure the CAN frame can pass the selected CAN netdevice. - * As structs can_frame and canfd_frame are similar, we can provide - * CAN FD frames to legacy CAN drivers as long as the length is <= 8 - */ - if (unlikely(skb->len > skb->dev->mtu && cfd->len > CAN_MAX_DLEN)) { + /* Make sure the CAN frame can pass the selected CAN netdevice. */ + if (unlikely(skb->len > skb->dev->mtu)) { err = -EMSGSIZE; goto inval_skb; } @@ -678,53 +677,46 @@ static void can_receive(struct sk_buff *skb, struct net_device *dev) static int can_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { - struct canfd_frame *cfd = (struct canfd_frame *)skb->data; - - if (unlikely(dev->type != ARPHRD_CAN || skb->len != CAN_MTU)) { + if (unlikely(dev->type != ARPHRD_CAN || (!can_is_can_skb(skb)))) { pr_warn_once("PF_CAN: dropped non conform CAN skbuff: dev type %d, len %d\n", dev->type, skb->len); - goto free_skb; - } - /* This check is made separately since cfd->len would be uninitialized if skb->len = 0. */ - if (unlikely(cfd->len > CAN_MAX_DLEN)) { - pr_warn_once("PF_CAN: dropped non conform CAN skbuff: dev type %d, len %d, datalen %d\n", - dev->type, skb->len, cfd->len); - goto free_skb; + kfree_skb(skb); + return NET_RX_DROP; } can_receive(skb, dev); return NET_RX_SUCCESS; - -free_skb: - kfree_skb(skb); - return NET_RX_DROP; } static int canfd_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, struct net_device *orig_dev) { - struct canfd_frame *cfd = (struct canfd_frame *)skb->data; - - if (unlikely(dev->type != ARPHRD_CAN || skb->len != CANFD_MTU)) { + if (unlikely(dev->type != ARPHRD_CAN || (!can_is_canfd_skb(skb)))) { pr_warn_once("PF_CAN: dropped non conform CAN FD skbuff: dev type %d, len %d\n", dev->type, skb->len); - goto free_skb; - } - /* This check is made separately since cfd->len would be uninitialized if skb->len = 0. */ - if (unlikely(cfd->len > CANFD_MAX_DLEN)) { - pr_warn_once("PF_CAN: dropped non conform CAN FD skbuff: dev type %d, len %d, datalen %d\n", - dev->type, skb->len, cfd->len); - goto free_skb; + kfree_skb(skb); + return NET_RX_DROP; } can_receive(skb, dev); return NET_RX_SUCCESS; +} -free_skb: - kfree_skb(skb); - return NET_RX_DROP; +static int canxl_rcv(struct sk_buff *skb, struct net_device *dev, + struct packet_type *pt, struct net_device *orig_dev) +{ + if (unlikely(dev->type != ARPHRD_CAN || (!can_is_canxl_skb(skb)))) { + pr_warn_once("PF_CAN: dropped non conform CAN XL skbuff: dev type %d, len %d\n", + dev->type, skb->len); + + kfree_skb(skb); + return NET_RX_DROP; + } + + can_receive(skb, dev); + return NET_RX_SUCCESS; } /* af_can protocol functions */ @@ -851,6 +843,11 @@ static struct packet_type canfd_packet __read_mostly = { .func = canfd_rcv, }; +static struct packet_type canxl_packet __read_mostly = { + .type = cpu_to_be16(ETH_P_CANXL), + .func = canxl_rcv, +}; + static const struct net_proto_family can_family_ops = { .family = PF_CAN, .create = can_create, @@ -890,6 +887,7 @@ static __init int can_init(void) dev_add_pack(&can_packet); dev_add_pack(&canfd_packet); + dev_add_pack(&canxl_packet); return 0; diff --git a/net/can/bcm.c b/net/can/bcm.c index e60161bec850..27706f6ace34 100644 --- a/net/can/bcm.c +++ b/net/can/bcm.c @@ -274,6 +274,7 @@ static void bcm_can_tx(struct bcm_op *op) struct sk_buff *skb; struct net_device *dev; struct canfd_frame *cf = op->frames + op->cfsiz * op->currframe; + int err; /* no target device? => exit */ if (!op->ifindex) @@ -298,11 +299,11 @@ static void bcm_can_tx(struct bcm_op *op) /* send with loopback */ skb->dev = dev; can_skb_set_owner(skb, op->sk); - can_send(skb, 1); + err = can_send(skb, 1); + if (!err) + op->frames_abs++; - /* update statistics */ op->currframe++; - op->frames_abs++; /* reached last frame? */ if (op->currframe >= op->nframes) @@ -648,8 +649,13 @@ static void bcm_rx_handler(struct sk_buff *skb, void *data) return; /* make sure to handle the correct frame type (CAN / CAN FD) */ - if (skb->len != op->cfsiz) - return; + if (op->flags & CAN_FD_FRAME) { + if (!can_is_canfd_skb(skb)) + return; + } else { + if (!can_is_can_skb(skb)) + return; + } /* disable timeout */ hrtimer_cancel(&op->timer); @@ -1744,15 +1750,27 @@ static int __init bcm_module_init(void) pr_info("can: broadcast manager protocol\n"); + err = register_pernet_subsys(&canbcm_pernet_ops); + if (err) + return err; + + err = register_netdevice_notifier(&canbcm_notifier); + if (err) + goto register_notifier_failed; + err = can_proto_register(&bcm_can_proto); if (err < 0) { printk(KERN_ERR "can: registration of bcm protocol failed\n"); - return err; + goto register_proto_failed; } - register_pernet_subsys(&canbcm_pernet_ops); - register_netdevice_notifier(&canbcm_notifier); return 0; + +register_proto_failed: + unregister_netdevice_notifier(&canbcm_notifier); +register_notifier_failed: + unregister_pernet_subsys(&canbcm_pernet_ops); + return err; } static void __exit bcm_module_exit(void) diff --git a/net/can/gw.c b/net/can/gw.c index 1ea4cc527db3..23a3d89cad81 100644 --- a/net/can/gw.c +++ b/net/can/gw.c @@ -463,10 +463,10 @@ static void can_can_gw_rcv(struct sk_buff *skb, void *data) /* process strictly Classic CAN or CAN FD frames */ if (gwj->flags & CGW_FLAGS_CAN_FD) { - if (skb->len != CANFD_MTU) + if (!can_is_canfd_skb(skb)) return; } else { - if (skb->len != CAN_MTU) + if (!can_is_can_skb(skb)) return; } diff --git a/net/can/isotp.c b/net/can/isotp.c index 43a27d19cdac..a9d1357f8489 100644 --- a/net/can/isotp.c +++ b/net/can/isotp.c @@ -669,7 +669,7 @@ static void isotp_rcv(struct sk_buff *skb, void *data) if (cf->len <= CAN_MAX_DLEN) { isotp_rcv_sf(sk, cf, SF_PCI_SZ4 + ae, skb, sf_dl); } else { - if (skb->len == CANFD_MTU) { + if (can_is_canfd_skb(skb)) { /* We have a CAN FD frame and CAN_DL is greater than 8: * Only frames with the SF_DL == 0 ESC value are valid. * diff --git a/net/can/j1939/main.c b/net/can/j1939/main.c index 8452b0fbb78c..144c86b0e3ff 100644 --- a/net/can/j1939/main.c +++ b/net/can/j1939/main.c @@ -42,6 +42,10 @@ static void j1939_can_recv(struct sk_buff *iskb, void *data) struct j1939_sk_buff_cb *skcb, *iskcb; struct can_frame *cf; + /* make sure we only get Classical CAN frames */ + if (!can_is_can_skb(iskb)) + return; + /* create a copy of the skb * j1939 only delivers the real data bytes, * the header goes into sockaddr. diff --git a/net/can/j1939/transport.c b/net/can/j1939/transport.c index d7d86c944d76..f26f4cfa9e63 100644 --- a/net/can/j1939/transport.c +++ b/net/can/j1939/transport.c @@ -342,10 +342,12 @@ static void j1939_session_skb_drop_old(struct j1939_session *session) __skb_unlink(do_skb, &session->skb_queue); /* drop ref taken in j1939_session_skb_queue() */ skb_unref(do_skb); + spin_unlock_irqrestore(&session->skb_queue.lock, flags); kfree_skb(do_skb); + } else { + spin_unlock_irqrestore(&session->skb_queue.lock, flags); } - spin_unlock_irqrestore(&session->skb_queue.lock, flags); } void j1939_session_skb_queue(struct j1939_session *session, @@ -985,7 +987,7 @@ static int j1939_session_tx_eoma(struct j1939_session *session) /* wait for the EOMA packet to come in */ j1939_tp_set_rxtimeout(session, 1250); - netdev_dbg(session->priv->ndev, "%p: 0x%p\n", __func__, session); + netdev_dbg(session->priv->ndev, "%s: 0x%p\n", __func__, session); return 0; } diff --git a/net/can/raw.c b/net/can/raw.c index d1bd9cc51ebe..3eb7d3e2b541 100644 --- a/net/can/raw.c +++ b/net/can/raw.c @@ -50,6 +50,7 @@ #include <linux/skbuff.h> #include <linux/can.h> #include <linux/can/core.h> +#include <linux/can/dev.h> /* for can_is_canxl_dev_mtu() */ #include <linux/can/skb.h> #include <linux/can/raw.h> #include <net/sock.h> @@ -87,6 +88,7 @@ struct raw_sock { int loopback; int recv_own_msgs; int fd_frames; + int xl_frames; int join_filters; int count; /* number of active filters */ struct can_filter dfilter; /* default/single filter */ @@ -129,21 +131,21 @@ static void raw_rcv(struct sk_buff *oskb, void *data) if (!ro->recv_own_msgs && oskb->sk == sk) return; - /* do not pass non-CAN2.0 frames to a legacy socket */ - if (!ro->fd_frames && oskb->len != CAN_MTU) + /* make sure to not pass oversized frames to the socket */ + if ((can_is_canfd_skb(oskb) && !ro->fd_frames && !ro->xl_frames) || + (can_is_canxl_skb(oskb) && !ro->xl_frames)) return; /* eliminate multiple filter matches for the same skb */ if (this_cpu_ptr(ro->uniq)->skb == oskb && this_cpu_ptr(ro->uniq)->skbcnt == can_skb_prv(oskb)->skbcnt) { - if (ro->join_filters) { - this_cpu_inc(ro->uniq->join_rx_count); - /* drop frame until all enabled filters matched */ - if (this_cpu_ptr(ro->uniq)->join_rx_count < ro->count) - return; - } else { + if (!ro->join_filters) + return; + + this_cpu_inc(ro->uniq->join_rx_count); + /* drop frame until all enabled filters matched */ + if (this_cpu_ptr(ro->uniq)->join_rx_count < ro->count) return; - } } else { this_cpu_ptr(ro->uniq)->skb = oskb; this_cpu_ptr(ro->uniq)->skbcnt = can_skb_prv(oskb)->skbcnt; @@ -346,6 +348,7 @@ static int raw_init(struct sock *sk) ro->loopback = 1; ro->recv_own_msgs = 0; ro->fd_frames = 0; + ro->xl_frames = 0; ro->join_filters = 0; /* alloc_percpu provides zero'ed memory */ @@ -669,6 +672,15 @@ static int raw_setsockopt(struct socket *sock, int level, int optname, break; + case CAN_RAW_XL_FRAMES: + if (optlen != sizeof(ro->xl_frames)) + return -EINVAL; + + if (copy_from_sockptr(&ro->xl_frames, optval, optlen)) + return -EFAULT; + + break; + case CAN_RAW_JOIN_FILTERS: if (optlen != sizeof(ro->join_filters)) return -EINVAL; @@ -751,6 +763,12 @@ static int raw_getsockopt(struct socket *sock, int level, int optname, val = &ro->fd_frames; break; + case CAN_RAW_XL_FRAMES: + if (len > sizeof(int)) + len = sizeof(int); + val = &ro->xl_frames; + break; + case CAN_RAW_JOIN_FILTERS: if (len > sizeof(int)) len = sizeof(int); @@ -776,7 +794,11 @@ static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) struct sk_buff *skb; struct net_device *dev; int ifindex; - int err; + int err = -EINVAL; + + /* check for valid CAN frame sizes */ + if (size < CANXL_HDR_SIZE + CANXL_MIN_DLEN || size > CANXL_MTU) + return -EINVAL; if (msg->msg_name) { DECLARE_SOCKADDR(struct sockaddr_can *, addr, msg->msg_name); @@ -796,15 +818,6 @@ static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) if (!dev) return -ENXIO; - err = -EINVAL; - if (ro->fd_frames && dev->mtu == CANFD_MTU) { - if (unlikely(size != CANFD_MTU && size != CAN_MTU)) - goto put_dev; - } else { - if (unlikely(size != CAN_MTU)) - goto put_dev; - } - skb = sock_alloc_send_skb(sk, size + sizeof(struct can_skb_priv), msg->msg_flags & MSG_DONTWAIT, &err); if (!skb) @@ -814,10 +827,27 @@ static int raw_sendmsg(struct socket *sock, struct msghdr *msg, size_t size) can_skb_prv(skb)->ifindex = dev->ifindex; can_skb_prv(skb)->skbcnt = 0; + /* fill the skb before testing for valid CAN frames */ err = memcpy_from_msg(skb_put(skb, size), msg, size); if (err < 0) goto free_skb; + err = -EINVAL; + if (ro->xl_frames && can_is_canxl_dev_mtu(dev->mtu)) { + /* CAN XL, CAN FD and Classical CAN */ + if (!can_is_canxl_skb(skb) && !can_is_canfd_skb(skb) && + !can_is_can_skb(skb)) + goto free_skb; + } else if (ro->fd_frames && dev->mtu == CANFD_MTU) { + /* CAN FD and Classical CAN */ + if (!can_is_canfd_skb(skb) && !can_is_can_skb(skb)) + goto free_skb; + } else { + /* Classical CAN */ + if (!can_is_can_skb(skb)) + goto free_skb; + } + sockcm_init(&sockc, sk); if (msg->msg_controllen) { err = sock_cmsg_send(sk, msg, &sockc); @@ -942,12 +972,20 @@ static __init int raw_module_init(void) pr_info("can: raw protocol\n"); + err = register_netdevice_notifier(&canraw_notifier); + if (err) + return err; + err = can_proto_register(&raw_can_proto); - if (err < 0) + if (err < 0) { pr_err("can: registration of raw protocol failed\n"); - else - register_netdevice_notifier(&canraw_notifier); + goto register_proto_failed; + } + + return 0; +register_proto_failed: + unregister_netdevice_notifier(&canraw_notifier); return err; } diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index d3bb656308b4..dfa237fbd5a3 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -728,7 +728,6 @@ static void ceph_msg_data_bio_cursor_init(struct ceph_msg_data_cursor *cursor, it->iter.bi_size = cursor->resid; BUG_ON(cursor->resid < bio_iter_len(it->bio, it->iter)); - cursor->last_piece = cursor->resid == bio_iter_len(it->bio, it->iter); } static struct page *ceph_msg_data_bio_next(struct ceph_msg_data_cursor *cursor, @@ -754,10 +753,8 @@ static bool ceph_msg_data_bio_advance(struct ceph_msg_data_cursor *cursor, cursor->resid -= bytes; bio_advance_iter(it->bio, &it->iter, bytes); - if (!cursor->resid) { - BUG_ON(!cursor->last_piece); + if (!cursor->resid) return false; /* no more data */ - } if (!bytes || (it->iter.bi_size && it->iter.bi_bvec_done && page == bio_iter_page(it->bio, it->iter))) @@ -770,9 +767,7 @@ static bool ceph_msg_data_bio_advance(struct ceph_msg_data_cursor *cursor, it->iter.bi_size = cursor->resid; } - BUG_ON(cursor->last_piece); BUG_ON(cursor->resid < bio_iter_len(it->bio, it->iter)); - cursor->last_piece = cursor->resid == bio_iter_len(it->bio, it->iter); return true; } #endif /* CONFIG_BLOCK */ @@ -788,8 +783,6 @@ static void ceph_msg_data_bvecs_cursor_init(struct ceph_msg_data_cursor *cursor, cursor->bvec_iter.bi_size = cursor->resid; BUG_ON(cursor->resid < bvec_iter_len(bvecs, cursor->bvec_iter)); - cursor->last_piece = - cursor->resid == bvec_iter_len(bvecs, cursor->bvec_iter); } static struct page *ceph_msg_data_bvecs_next(struct ceph_msg_data_cursor *cursor, @@ -815,19 +808,14 @@ static bool ceph_msg_data_bvecs_advance(struct ceph_msg_data_cursor *cursor, cursor->resid -= bytes; bvec_iter_advance(bvecs, &cursor->bvec_iter, bytes); - if (!cursor->resid) { - BUG_ON(!cursor->last_piece); + if (!cursor->resid) return false; /* no more data */ - } if (!bytes || (cursor->bvec_iter.bi_bvec_done && page == bvec_iter_page(bvecs, cursor->bvec_iter))) return false; /* more bytes to process in this segment */ - BUG_ON(cursor->last_piece); BUG_ON(cursor->resid < bvec_iter_len(bvecs, cursor->bvec_iter)); - cursor->last_piece = - cursor->resid == bvec_iter_len(bvecs, cursor->bvec_iter); return true; } @@ -853,7 +841,6 @@ static void ceph_msg_data_pages_cursor_init(struct ceph_msg_data_cursor *cursor, BUG_ON(page_count > (int)USHRT_MAX); cursor->page_count = (unsigned short)page_count; BUG_ON(length > SIZE_MAX - cursor->page_offset); - cursor->last_piece = cursor->page_offset + cursor->resid <= PAGE_SIZE; } static struct page * @@ -868,11 +855,7 @@ ceph_msg_data_pages_next(struct ceph_msg_data_cursor *cursor, BUG_ON(cursor->page_offset >= PAGE_SIZE); *page_offset = cursor->page_offset; - if (cursor->last_piece) - *length = cursor->resid; - else - *length = PAGE_SIZE - *page_offset; - + *length = min_t(size_t, cursor->resid, PAGE_SIZE - *page_offset); return data->pages[cursor->page_index]; } @@ -897,8 +880,6 @@ static bool ceph_msg_data_pages_advance(struct ceph_msg_data_cursor *cursor, BUG_ON(cursor->page_index >= cursor->page_count); cursor->page_index++; - cursor->last_piece = cursor->resid <= PAGE_SIZE; - return true; } @@ -928,7 +909,6 @@ ceph_msg_data_pagelist_cursor_init(struct ceph_msg_data_cursor *cursor, cursor->resid = min(length, pagelist->length); cursor->page = page; cursor->offset = 0; - cursor->last_piece = cursor->resid <= PAGE_SIZE; } static struct page * @@ -948,11 +928,7 @@ ceph_msg_data_pagelist_next(struct ceph_msg_data_cursor *cursor, /* offset of first page in pagelist is always 0 */ *page_offset = cursor->offset & ~PAGE_MASK; - if (cursor->last_piece) - *length = cursor->resid; - else - *length = PAGE_SIZE - *page_offset; - + *length = min_t(size_t, cursor->resid, PAGE_SIZE - *page_offset); return cursor->page; } @@ -985,8 +961,6 @@ static bool ceph_msg_data_pagelist_advance(struct ceph_msg_data_cursor *cursor, BUG_ON(list_is_last(&cursor->page->lru, &pagelist->head)); cursor->page = list_next_entry(cursor->page, lru); - cursor->last_piece = cursor->resid <= PAGE_SIZE; - return true; } @@ -1044,8 +1018,7 @@ void ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor, * Indicate whether this is the last piece in this data item. */ struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor, - size_t *page_offset, size_t *length, - bool *last_piece) + size_t *page_offset, size_t *length) { struct page *page; @@ -1074,8 +1047,6 @@ struct page *ceph_msg_data_next(struct ceph_msg_data_cursor *cursor, BUG_ON(*page_offset + *length > PAGE_SIZE); BUG_ON(!*length); BUG_ON(*length > cursor->resid); - if (last_piece) - *last_piece = cursor->last_piece; return page; } @@ -1112,7 +1083,6 @@ void ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, size_t bytes) cursor->total_resid -= bytes; if (!cursor->resid && cursor->total_resid) { - WARN_ON(!cursor->last_piece); cursor->data++; __ceph_msg_data_cursor_init(cursor); new_piece = true; diff --git a/net/ceph/messenger_v1.c b/net/ceph/messenger_v1.c index 6b014eca3a13..3ddbde87e4d6 100644 --- a/net/ceph/messenger_v1.c +++ b/net/ceph/messenger_v1.c @@ -495,7 +495,7 @@ static int write_partial_message_data(struct ceph_connection *con) continue; } - page = ceph_msg_data_next(cursor, &page_offset, &length, NULL); + page = ceph_msg_data_next(cursor, &page_offset, &length); if (length == cursor->total_resid) more = MSG_MORE; ret = ceph_tcp_sendpage(con->sock, page, page_offset, length, @@ -1008,7 +1008,7 @@ static int read_partial_msg_data(struct ceph_connection *con) continue; } - page = ceph_msg_data_next(cursor, &page_offset, &length, NULL); + page = ceph_msg_data_next(cursor, &page_offset, &length); ret = ceph_tcp_recvpage(con->sock, page, page_offset, length); if (ret <= 0) { if (do_datacrc) @@ -1050,7 +1050,7 @@ static int read_partial_msg_data_bounce(struct ceph_connection *con) continue; } - page = ceph_msg_data_next(cursor, &off, &len, NULL); + page = ceph_msg_data_next(cursor, &off, &len); ret = ceph_tcp_recvpage(con->sock, con->bounce_page, 0, len); if (ret <= 0) { con->in_data_crc = crc; diff --git a/net/ceph/messenger_v2.c b/net/ceph/messenger_v2.c index c6e5bfc717d5..cc8ff81a50b7 100644 --- a/net/ceph/messenger_v2.c +++ b/net/ceph/messenger_v2.c @@ -862,7 +862,7 @@ static void get_bvec_at(struct ceph_msg_data_cursor *cursor, ceph_msg_data_advance(cursor, 0); /* get a piece of data, cursor isn't advanced */ - page = ceph_msg_data_next(cursor, &off, &len, NULL); + page = ceph_msg_data_next(cursor, &off, &len); bv->bv_page = page; bv->bv_offset = off; diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index 6a6898ee4049..db60217f911b 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -222,7 +222,7 @@ static void pick_new_mon(struct ceph_mon_client *monc) max--; } - n = prandom_u32() % max; + n = prandom_u32_max(max); if (o >= 0 && n >= o) n++; diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c index 87b883c7bfd6..4e4f1e4bc265 100644 --- a/net/ceph/osd_client.c +++ b/net/ceph/osd_client.c @@ -1479,7 +1479,7 @@ static bool target_should_be_paused(struct ceph_osd_client *osdc, static int pick_random_replica(const struct ceph_osds *acting) { - int i = prandom_u32() % acting->size; + int i = prandom_u32_max(acting->size); dout("%s picked osd%d, primary osd%d\n", __func__, acting->osds[i], acting->primary); diff --git a/net/compat.c b/net/compat.c index fe9be3c56ef7..385f04a6be2f 100644 --- a/net/compat.c +++ b/net/compat.c @@ -52,6 +52,7 @@ int __get_compat_msghdr(struct msghdr *kmsg, kmsg->msg_namelen = sizeof(struct sockaddr_storage); kmsg->msg_control_is_user = true; + kmsg->msg_get_inq = 0; kmsg->msg_control_user = compat_ptr(msg->msg_control); kmsg->msg_controllen = msg->msg_controllen; diff --git a/net/core/.gitignore b/net/core/.gitignore deleted file mode 100644 index df1e74372cce..000000000000 --- a/net/core/.gitignore +++ /dev/null @@ -1 +0,0 @@ -dropreason_str.c diff --git a/net/core/Makefile b/net/core/Makefile index e8ce3bd283a6..5857cec87b83 100644 --- a/net/core/Makefile +++ b/net/core/Makefile @@ -5,7 +5,7 @@ obj-y := sock.o request_sock.o skbuff.o datagram.o stream.o scm.o \ gen_stats.o gen_estimator.o net_namespace.o secure_seq.o \ - flow_dissector.o dropreason_str.o + flow_dissector.o obj-$(CONFIG_SYSCTL) += sysctl_net_core.o @@ -40,23 +40,3 @@ obj-$(CONFIG_NET_SOCK_MSG) += skmsg.o obj-$(CONFIG_BPF_SYSCALL) += sock_map.o obj-$(CONFIG_BPF_SYSCALL) += bpf_sk_storage.o obj-$(CONFIG_OF) += of_net.o - -clean-files := dropreason_str.c - -quiet_cmd_dropreason_str = GEN $@ -cmd_dropreason_str = awk -F ',' 'BEGIN{ print "\#include <net/dropreason.h>\n"; \ - print "const char * const drop_reasons[] = {" }\ - /^enum skb_drop/ { dr=1; }\ - /^\};/ { dr=0; }\ - /^\tSKB_DROP_REASON_/ {\ - if (dr) {\ - sub(/\tSKB_DROP_REASON_/, "", $$1);\ - printf "\t[SKB_DROP_REASON_%s] = \"%s\",\n", $$1, $$1;\ - }\ - }\ - END{ print "};" }' $< > $@ - -$(obj)/dropreason_str.c: $(srctree)/include/net/dropreason.h - $(call cmd,dropreason_str) - -$(obj)/dropreason_str.o: $(obj)/dropreason_str.c diff --git a/net/core/datagram.c b/net/core/datagram.c index 7255531f63ae..e4ff2db40c98 100644 --- a/net/core/datagram.c +++ b/net/core/datagram.c @@ -677,7 +677,7 @@ int __zerocopy_sg_from_iter(struct msghdr *msg, struct sock *sk, page_ref_sub(last_head, refs); refs = 0; } - skb_fill_page_desc(skb, frag++, head, start, size); + skb_fill_page_desc_noacc(skb, frag++, head, start, size); } if (refs) page_ref_sub(last_head, refs); diff --git a/net/core/dev.c b/net/core/dev.c index d66c73c1c734..cfb68db040a4 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -5136,11 +5136,13 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, case TC_ACT_SHOT: mini_qdisc_qstats_cpu_drop(miniq); kfree_skb_reason(skb, SKB_DROP_REASON_TC_INGRESS); + *ret = NET_RX_DROP; return NULL; case TC_ACT_STOLEN: case TC_ACT_QUEUED: case TC_ACT_TRAP: consume_skb(skb); + *ret = NET_RX_SUCCESS; return NULL; case TC_ACT_REDIRECT: /* skb_mac_header check was done by cls/act_bpf, so @@ -5153,8 +5155,10 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret, *another = true; break; } + *ret = NET_RX_SUCCESS; return NULL; case TC_ACT_CONSUMED: + *ret = NET_RX_SUCCESS; return NULL; default: break; @@ -6358,23 +6362,6 @@ int dev_set_threaded(struct net_device *dev, bool threaded) } EXPORT_SYMBOL(dev_set_threaded); -/* Double check that napi_get_frags() allocates skbs with - * skb->head being backed by slab, not a page fragment. - * This is to make sure bug fixed in 3226b158e67c - * ("net: avoid 32 x truesize under-estimation for tiny skbs") - * does not accidentally come back. - */ -static void napi_get_frags_check(struct napi_struct *napi) -{ - struct sk_buff *skb; - - local_bh_disable(); - skb = napi_get_frags(napi); - WARN_ON_ONCE(skb && skb->head_frag); - napi_free_frags(napi); - local_bh_enable(); -} - void netif_napi_add_weight(struct net_device *dev, struct napi_struct *napi, int (*poll)(struct napi_struct *, int), int weight) { @@ -8835,7 +8822,7 @@ EXPORT_SYMBOL(dev_set_mac_address_user); int dev_get_mac_address(struct sockaddr *sa, struct net *net, char *dev_name) { - size_t size = sizeof(sa->sa_data); + size_t size = sizeof(sa->sa_data_min); struct net_device *dev; int ret = 0; @@ -10490,12 +10477,12 @@ void dev_fetch_sw_netstats(struct rtnl_link_stats64 *s, stats = per_cpu_ptr(netstats, cpu); do { - start = u64_stats_fetch_begin_irq(&stats->syncp); + start = u64_stats_fetch_begin(&stats->syncp); rx_packets = u64_stats_read(&stats->rx_packets); rx_bytes = u64_stats_read(&stats->rx_bytes); tx_packets = u64_stats_read(&stats->tx_packets); tx_bytes = u64_stats_read(&stats->tx_bytes); - } while (u64_stats_fetch_retry_irq(&stats->syncp, start)); + } while (u64_stats_fetch_retry(&stats->syncp, start)); s->rx_packets += rx_packets; s->rx_bytes += rx_bytes; diff --git a/net/core/dev_ioctl.c b/net/core/dev_ioctl.c index 7674bb9f3076..5cdbfbf9a7dc 100644 --- a/net/core/dev_ioctl.c +++ b/net/core/dev_ioctl.c @@ -342,7 +342,7 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, void __user *data, if (ifr->ifr_hwaddr.sa_family != dev->type) return -EINVAL; memcpy(dev->broadcast, ifr->ifr_hwaddr.sa_data, - min(sizeof(ifr->ifr_hwaddr.sa_data), + min(sizeof(ifr->ifr_hwaddr.sa_data_min), (size_t)dev->addr_len)); call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); return 0; diff --git a/net/core/devlink.c b/net/core/devlink.c index 7776dc82f88d..0a16ad45520e 100644 --- a/net/core/devlink.c +++ b/net/core/devlink.c @@ -371,6 +371,13 @@ static struct devlink *devlink_get_from_attrs(struct net *net, return ERR_PTR(-ENODEV); } +#define ASSERT_DEVLINK_PORT_REGISTERED(devlink_port) \ + WARN_ON_ONCE(!(devlink_port)->registered) +#define ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port) \ + WARN_ON_ONCE((devlink_port)->registered) +#define ASSERT_DEVLINK_PORT_INITIALIZED(devlink_port) \ + WARN_ON_ONCE(!(devlink_port)->initialized) + static struct devlink_port *devlink_port_get_by_index(struct devlink *devlink, unsigned int port_index) { @@ -8297,10 +8304,10 @@ static void devlink_trap_stats_read(struct devlink_stats __percpu *trap_stats, cpu_stats = per_cpu_ptr(trap_stats, i); do { - start = u64_stats_fetch_begin_irq(&cpu_stats->syncp); + start = u64_stats_fetch_begin(&cpu_stats->syncp); rx_packets = u64_stats_read(&cpu_stats->rx_packets); rx_bytes = u64_stats_read(&cpu_stats->rx_bytes); - } while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, start)); + } while (u64_stats_fetch_retry(&cpu_stats->syncp, start)); u64_stats_add(&stats->rx_packets, rx_packets); u64_stats_add(&stats->rx_bytes, rx_bytes); @@ -9848,6 +9855,44 @@ static void devlink_port_type_warn_cancel(struct devlink_port *devlink_port) } /** + * devlink_port_init() - Init devlink port + * + * @devlink: devlink + * @devlink_port: devlink port + * + * Initialize essencial stuff that is needed for functions + * that may be called before devlink port registration. + * Call to this function is optional and not needed + * in case the driver does not use such functions. + */ +void devlink_port_init(struct devlink *devlink, + struct devlink_port *devlink_port) +{ + if (devlink_port->initialized) + return; + devlink_port->devlink = devlink; + INIT_LIST_HEAD(&devlink_port->region_list); + devlink_port->initialized = true; +} +EXPORT_SYMBOL_GPL(devlink_port_init); + +/** + * devlink_port_fini() - Deinitialize devlink port + * + * @devlink_port: devlink port + * + * Deinitialize essencial stuff that is in use for functions + * that may be called after devlink port unregistration. + * Call to this function is optional and not needed + * in case the driver does not use such functions. + */ +void devlink_port_fini(struct devlink_port *devlink_port) +{ + WARN_ON(!list_empty(&devlink_port->region_list)); +} +EXPORT_SYMBOL_GPL(devlink_port_fini); + +/** * devl_port_register() - Register devlink port * * @devlink: devlink @@ -9869,14 +9914,15 @@ int devl_port_register(struct devlink *devlink, if (devlink_port_index_exists(devlink, port_index)) return -EEXIST; - WARN_ON(devlink_port->devlink); - devlink_port->devlink = devlink; + ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); + + devlink_port_init(devlink, devlink_port); + devlink_port->registered = true; devlink_port->index = port_index; spin_lock_init(&devlink_port->type_lock); INIT_LIST_HEAD(&devlink_port->reporter_list); mutex_init(&devlink_port->reporters_lock); list_add_tail(&devlink_port->list, &devlink->port_list); - INIT_LIST_HEAD(&devlink_port->region_list); INIT_DELAYED_WORK(&devlink_port->type_warn_dw, &devlink_port_type_warn); devlink_port_type_warn_schedule(devlink_port); @@ -9926,8 +9972,8 @@ void devl_port_unregister(struct devlink_port *devlink_port) devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_DEL); list_del(&devlink_port->list); WARN_ON(!list_empty(&devlink_port->reporter_list)); - WARN_ON(!list_empty(&devlink_port->region_list)); mutex_destroy(&devlink_port->reporters_lock); + devlink_port->registered = false; } EXPORT_SYMBOL_GPL(devl_port_unregister); @@ -9952,8 +9998,8 @@ static void __devlink_port_type_set(struct devlink_port *devlink_port, enum devlink_port_type type, void *type_dev) { - if (WARN_ON(!devlink_port->devlink)) - return; + ASSERT_DEVLINK_PORT_REGISTERED(devlink_port); + devlink_port_type_warn_cancel(devlink_port); spin_lock_bh(&devlink_port->type_lock); devlink_port->type = type; @@ -10072,8 +10118,8 @@ void devlink_port_attrs_set(struct devlink_port *devlink_port, { int ret; - if (WARN_ON(devlink_port->devlink)) - return; + ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); + devlink_port->attrs = *attrs; ret = __devlink_port_attrs_set(devlink_port, attrs->flavour); if (ret) @@ -10096,8 +10142,8 @@ void devlink_port_attrs_pci_pf_set(struct devlink_port *devlink_port, u32 contro struct devlink_port_attrs *attrs = &devlink_port->attrs; int ret; - if (WARN_ON(devlink_port->devlink)) - return; + ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); + ret = __devlink_port_attrs_set(devlink_port, DEVLINK_PORT_FLAVOUR_PCI_PF); if (ret) @@ -10123,8 +10169,8 @@ void devlink_port_attrs_pci_vf_set(struct devlink_port *devlink_port, u32 contro struct devlink_port_attrs *attrs = &devlink_port->attrs; int ret; - if (WARN_ON(devlink_port->devlink)) - return; + ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); + ret = __devlink_port_attrs_set(devlink_port, DEVLINK_PORT_FLAVOUR_PCI_VF); if (ret) @@ -10151,8 +10197,8 @@ void devlink_port_attrs_pci_sf_set(struct devlink_port *devlink_port, u32 contro struct devlink_port_attrs *attrs = &devlink_port->attrs; int ret; - if (WARN_ON(devlink_port->devlink)) - return; + ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); + ret = __devlink_port_attrs_set(devlink_port, DEVLINK_PORT_FLAVOUR_PCI_SF); if (ret) @@ -10267,8 +10313,8 @@ EXPORT_SYMBOL_GPL(devl_rate_nodes_destroy); void devlink_port_linecard_set(struct devlink_port *devlink_port, struct devlink_linecard *linecard) { - if (WARN_ON(devlink_port->devlink)) - return; + ASSERT_DEVLINK_PORT_NOT_REGISTERED(devlink_port); + devlink_port->linecard = linecard; } EXPORT_SYMBOL_GPL(devlink_port_linecard_set); @@ -11339,6 +11385,8 @@ devlink_port_region_create(struct devlink_port *port, struct devlink_region *region; int err = 0; + ASSERT_DEVLINK_PORT_INITIALIZED(port); + if (WARN_ON(!ops) || WARN_ON(!ops->destructor)) return ERR_PTR(-EINVAL); diff --git a/net/core/drop_monitor.c b/net/core/drop_monitor.c index f084a4a6b7ab..11aa6e8a3098 100644 --- a/net/core/drop_monitor.c +++ b/net/core/drop_monitor.c @@ -1432,9 +1432,9 @@ static void net_dm_stats_read(struct net_dm_stats *stats) u64 dropped; do { - start = u64_stats_fetch_begin_irq(&cpu_stats->syncp); + start = u64_stats_fetch_begin(&cpu_stats->syncp); dropped = u64_stats_read(&cpu_stats->dropped); - } while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, start)); + } while (u64_stats_fetch_retry(&cpu_stats->syncp, start)); u64_stats_add(&stats->dropped, dropped); } @@ -1476,9 +1476,9 @@ static void net_dm_hw_stats_read(struct net_dm_stats *stats) u64 dropped; do { - start = u64_stats_fetch_begin_irq(&cpu_stats->syncp); + start = u64_stats_fetch_begin(&cpu_stats->syncp); dropped = u64_stats_read(&cpu_stats->dropped); - } while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, start)); + } while (u64_stats_fetch_retry(&cpu_stats->syncp, start)); u64_stats_add(&stats->dropped, dropped); } diff --git a/net/core/filter.c b/net/core/filter.c index c191db80ce93..bb0136e7a8e4 100644 --- a/net/core/filter.c +++ b/net/core/filter.c @@ -18,6 +18,7 @@ */ #include <linux/atomic.h> +#include <linux/bpf_verifier.h> #include <linux/module.h> #include <linux/types.h> #include <linux/mm.h> @@ -3010,7 +3011,7 @@ BPF_CALL_0(bpf_get_cgroup_classid_curr) return __task_get_classid(current); } -static const struct bpf_func_proto bpf_get_cgroup_classid_curr_proto = { +const struct bpf_func_proto bpf_get_cgroup_classid_curr_proto = { .func = bpf_get_cgroup_classid_curr, .gpl_only = false, .ret_type = RET_INTEGER, @@ -4489,7 +4490,8 @@ BPF_CALL_4(bpf_skb_get_tunnel_key, struct sk_buff *, skb, struct bpf_tunnel_key void *to_orig = to; int err; - if (unlikely(!info || (flags & ~(BPF_F_TUNINFO_IPV6)))) { + if (unlikely(!info || (flags & ~(BPF_F_TUNINFO_IPV6 | + BPF_F_TUNINFO_FLAGS)))) { err = -EINVAL; goto err_clear; } @@ -4521,7 +4523,10 @@ set_compat: to->tunnel_id = be64_to_cpu(info->key.tun_id); to->tunnel_tos = info->key.tos; to->tunnel_ttl = info->key.ttl; - to->tunnel_ext = 0; + if (flags & BPF_F_TUNINFO_FLAGS) + to->tunnel_flags = info->key.tun_flags; + else + to->tunnel_ext = 0; if (flags & BPF_F_TUNINFO_IPV6) { memcpy(to->remote_ipv6, &info->key.u.ipv6.src, @@ -5014,359 +5019,303 @@ static const struct bpf_func_proto bpf_get_socket_uid_proto = { .arg1_type = ARG_PTR_TO_CTX, }; -static int __bpf_setsockopt(struct sock *sk, int level, int optname, - char *optval, int optlen) +static int sol_socket_sockopt(struct sock *sk, int optname, + char *optval, int *optlen, + bool getopt) +{ + switch (optname) { + case SO_REUSEADDR: + case SO_SNDBUF: + case SO_RCVBUF: + case SO_KEEPALIVE: + case SO_PRIORITY: + case SO_REUSEPORT: + case SO_RCVLOWAT: + case SO_MARK: + case SO_MAX_PACING_RATE: + case SO_BINDTOIFINDEX: + case SO_TXREHASH: + if (*optlen != sizeof(int)) + return -EINVAL; + break; + case SO_BINDTODEVICE: + break; + default: + return -EINVAL; + } + + if (getopt) { + if (optname == SO_BINDTODEVICE) + return -EINVAL; + return sk_getsockopt(sk, SOL_SOCKET, optname, + KERNEL_SOCKPTR(optval), + KERNEL_SOCKPTR(optlen)); + } + + return sk_setsockopt(sk, SOL_SOCKET, optname, + KERNEL_SOCKPTR(optval), *optlen); +} + +static int bpf_sol_tcp_setsockopt(struct sock *sk, int optname, + char *optval, int optlen) { - char devname[IFNAMSIZ]; - int val, valbool; - struct net *net; - int ifindex; - int ret = 0; + struct tcp_sock *tp = tcp_sk(sk); + unsigned long timeout; + int val; - if (!sk_fullsock(sk)) + if (optlen != sizeof(int)) return -EINVAL; - if (level == SOL_SOCKET) { - if (optlen != sizeof(int) && optname != SO_BINDTODEVICE) + val = *(int *)optval; + + /* Only some options are supported */ + switch (optname) { + case TCP_BPF_IW: + if (val <= 0 || tp->data_segs_out > tp->syn_data) return -EINVAL; - val = *((int *)optval); - valbool = val ? 1 : 0; - - /* Only some socketops are supported */ - switch (optname) { - case SO_RCVBUF: - val = min_t(u32, val, READ_ONCE(sysctl_rmem_max)); - val = min_t(int, val, INT_MAX / 2); - sk->sk_userlocks |= SOCK_RCVBUF_LOCK; - WRITE_ONCE(sk->sk_rcvbuf, - max_t(int, val * 2, SOCK_MIN_RCVBUF)); - break; - case SO_SNDBUF: - val = min_t(u32, val, READ_ONCE(sysctl_wmem_max)); - val = min_t(int, val, INT_MAX / 2); - sk->sk_userlocks |= SOCK_SNDBUF_LOCK; - WRITE_ONCE(sk->sk_sndbuf, - max_t(int, val * 2, SOCK_MIN_SNDBUF)); - break; - case SO_MAX_PACING_RATE: /* 32bit version */ - if (val != ~0U) - cmpxchg(&sk->sk_pacing_status, - SK_PACING_NONE, - SK_PACING_NEEDED); - sk->sk_max_pacing_rate = (val == ~0U) ? - ~0UL : (unsigned int)val; - sk->sk_pacing_rate = min(sk->sk_pacing_rate, - sk->sk_max_pacing_rate); - break; - case SO_PRIORITY: - sk->sk_priority = val; - break; - case SO_RCVLOWAT: - if (val < 0) - val = INT_MAX; - if (sk->sk_socket && sk->sk_socket->ops->set_rcvlowat) - ret = sk->sk_socket->ops->set_rcvlowat(sk, val); - else - WRITE_ONCE(sk->sk_rcvlowat, val ? : 1); - break; - case SO_MARK: - if (sk->sk_mark != val) { - sk->sk_mark = val; - sk_dst_reset(sk); - } - break; - case SO_BINDTODEVICE: - optlen = min_t(long, optlen, IFNAMSIZ - 1); - strncpy(devname, optval, optlen); - devname[optlen] = 0; + tcp_snd_cwnd_set(tp, val); + break; + case TCP_BPF_SNDCWND_CLAMP: + if (val <= 0) + return -EINVAL; + tp->snd_cwnd_clamp = val; + tp->snd_ssthresh = val; + break; + case TCP_BPF_DELACK_MAX: + timeout = usecs_to_jiffies(val); + if (timeout > TCP_DELACK_MAX || + timeout < TCP_TIMEOUT_MIN) + return -EINVAL; + inet_csk(sk)->icsk_delack_max = timeout; + break; + case TCP_BPF_RTO_MIN: + timeout = usecs_to_jiffies(val); + if (timeout > TCP_RTO_MIN || + timeout < TCP_TIMEOUT_MIN) + return -EINVAL; + inet_csk(sk)->icsk_rto_min = timeout; + break; + default: + return -EINVAL; + } - ifindex = 0; - if (devname[0] != '\0') { - struct net_device *dev; + return 0; +} - ret = -ENODEV; +static int sol_tcp_sockopt_congestion(struct sock *sk, char *optval, + int *optlen, bool getopt) +{ + struct tcp_sock *tp; + int ret; - net = sock_net(sk); - dev = dev_get_by_name(net, devname); - if (!dev) - break; - ifindex = dev->ifindex; - dev_put(dev); - } - fallthrough; - case SO_BINDTOIFINDEX: - if (optname == SO_BINDTOIFINDEX) - ifindex = val; - ret = sock_bindtoindex(sk, ifindex, false); - break; - case SO_KEEPALIVE: - if (sk->sk_prot->keepalive) - sk->sk_prot->keepalive(sk, valbool); - sock_valbool_flag(sk, SOCK_KEEPOPEN, valbool); - break; - case SO_REUSEPORT: - sk->sk_reuseport = valbool; - break; - case SO_TXREHASH: - if (val < -1 || val > 1) { - ret = -EINVAL; - break; - } - sk->sk_txrehash = (u8)val; - break; - default: - ret = -EINVAL; - } -#ifdef CONFIG_INET - } else if (level == SOL_IP) { - if (optlen != sizeof(int) || sk->sk_family != AF_INET) + if (*optlen < 2) + return -EINVAL; + + if (getopt) { + if (!inet_csk(sk)->icsk_ca_ops) return -EINVAL; + /* BPF expects NULL-terminated tcp-cc string */ + optval[--(*optlen)] = '\0'; + return do_tcp_getsockopt(sk, SOL_TCP, TCP_CONGESTION, + KERNEL_SOCKPTR(optval), + KERNEL_SOCKPTR(optlen)); + } - val = *((int *)optval); - /* Only some options are supported */ - switch (optname) { - case IP_TOS: - if (val < -1 || val > 0xff) { - ret = -EINVAL; - } else { - struct inet_sock *inet = inet_sk(sk); + /* "cdg" is the only cc that alloc a ptr + * in inet_csk_ca area. The bpf-tcp-cc may + * overwrite this ptr after switching to cdg. + */ + if (*optlen >= sizeof("cdg") - 1 && !strncmp("cdg", optval, *optlen)) + return -ENOTSUPP; - if (val == -1) - val = 0; - inet->tos = val; - } - break; - default: - ret = -EINVAL; - } -#if IS_ENABLED(CONFIG_IPV6) - } else if (level == SOL_IPV6) { - if (optlen != sizeof(int) || sk->sk_family != AF_INET6) - return -EINVAL; + /* It stops this looping + * + * .init => bpf_setsockopt(tcp_cc) => .init => + * bpf_setsockopt(tcp_cc)" => .init => .... + * + * The second bpf_setsockopt(tcp_cc) is not allowed + * in order to break the loop when both .init + * are the same bpf prog. + * + * This applies even the second bpf_setsockopt(tcp_cc) + * does not cause a loop. This limits only the first + * '.init' can call bpf_setsockopt(TCP_CONGESTION) to + * pick a fallback cc (eg. peer does not support ECN) + * and the second '.init' cannot fallback to + * another. + */ + tp = tcp_sk(sk); + if (tp->bpf_chg_cc_inprogress) + return -EBUSY; + + tp->bpf_chg_cc_inprogress = 1; + ret = do_tcp_setsockopt(sk, SOL_TCP, TCP_CONGESTION, + KERNEL_SOCKPTR(optval), *optlen); + tp->bpf_chg_cc_inprogress = 0; + return ret; +} - val = *((int *)optval); - /* Only some options are supported */ - switch (optname) { - case IPV6_TCLASS: - if (val < -1 || val > 0xff) { - ret = -EINVAL; - } else { - struct ipv6_pinfo *np = inet6_sk(sk); +static int sol_tcp_sockopt(struct sock *sk, int optname, + char *optval, int *optlen, + bool getopt) +{ + if (sk->sk_prot->setsockopt != tcp_setsockopt) + return -EINVAL; - if (val == -1) - val = 0; - np->tclass = val; - } - break; - default: - ret = -EINVAL; - } -#endif - } else if (level == SOL_TCP && - sk->sk_prot->setsockopt == tcp_setsockopt) { - if (optname == TCP_CONGESTION) { - char name[TCP_CA_NAME_MAX]; - - strncpy(name, optval, min_t(long, optlen, - TCP_CA_NAME_MAX-1)); - name[TCP_CA_NAME_MAX-1] = 0; - ret = tcp_set_congestion_control(sk, name, false, true); - } else { - struct inet_connection_sock *icsk = inet_csk(sk); + switch (optname) { + case TCP_NODELAY: + case TCP_MAXSEG: + case TCP_KEEPIDLE: + case TCP_KEEPINTVL: + case TCP_KEEPCNT: + case TCP_SYNCNT: + case TCP_WINDOW_CLAMP: + case TCP_THIN_LINEAR_TIMEOUTS: + case TCP_USER_TIMEOUT: + case TCP_NOTSENT_LOWAT: + case TCP_SAVE_SYN: + if (*optlen != sizeof(int)) + return -EINVAL; + break; + case TCP_CONGESTION: + return sol_tcp_sockopt_congestion(sk, optval, optlen, getopt); + case TCP_SAVED_SYN: + if (*optlen < 1) + return -EINVAL; + break; + default: + if (getopt) + return -EINVAL; + return bpf_sol_tcp_setsockopt(sk, optname, optval, *optlen); + } + + if (getopt) { + if (optname == TCP_SAVED_SYN) { struct tcp_sock *tp = tcp_sk(sk); - unsigned long timeout; - if (optlen != sizeof(int)) + if (!tp->saved_syn || + *optlen > tcp_saved_syn_len(tp->saved_syn)) return -EINVAL; - - val = *((int *)optval); - /* Only some options are supported */ - switch (optname) { - case TCP_BPF_IW: - if (val <= 0 || tp->data_segs_out > tp->syn_data) - ret = -EINVAL; - else - tcp_snd_cwnd_set(tp, val); - break; - case TCP_BPF_SNDCWND_CLAMP: - if (val <= 0) { - ret = -EINVAL; - } else { - tp->snd_cwnd_clamp = val; - tp->snd_ssthresh = val; - } - break; - case TCP_BPF_DELACK_MAX: - timeout = usecs_to_jiffies(val); - if (timeout > TCP_DELACK_MAX || - timeout < TCP_TIMEOUT_MIN) - return -EINVAL; - inet_csk(sk)->icsk_delack_max = timeout; - break; - case TCP_BPF_RTO_MIN: - timeout = usecs_to_jiffies(val); - if (timeout > TCP_RTO_MIN || - timeout < TCP_TIMEOUT_MIN) - return -EINVAL; - inet_csk(sk)->icsk_rto_min = timeout; - break; - case TCP_SAVE_SYN: - if (val < 0 || val > 1) - ret = -EINVAL; - else - tp->save_syn = val; - break; - case TCP_KEEPIDLE: - ret = tcp_sock_set_keepidle_locked(sk, val); - break; - case TCP_KEEPINTVL: - if (val < 1 || val > MAX_TCP_KEEPINTVL) - ret = -EINVAL; - else - tp->keepalive_intvl = val * HZ; - break; - case TCP_KEEPCNT: - if (val < 1 || val > MAX_TCP_KEEPCNT) - ret = -EINVAL; - else - tp->keepalive_probes = val; - break; - case TCP_SYNCNT: - if (val < 1 || val > MAX_TCP_SYNCNT) - ret = -EINVAL; - else - icsk->icsk_syn_retries = val; - break; - case TCP_USER_TIMEOUT: - if (val < 0) - ret = -EINVAL; - else - icsk->icsk_user_timeout = val; - break; - case TCP_NOTSENT_LOWAT: - tp->notsent_lowat = val; - sk->sk_write_space(sk); - break; - case TCP_WINDOW_CLAMP: - ret = tcp_set_window_clamp(sk, val); - break; - default: - ret = -EINVAL; - } + memcpy(optval, tp->saved_syn->data, *optlen); + /* It cannot free tp->saved_syn here because it + * does not know if the user space still needs it. + */ + return 0; } -#endif - } else { - ret = -EINVAL; + + return do_tcp_getsockopt(sk, SOL_TCP, optname, + KERNEL_SOCKPTR(optval), + KERNEL_SOCKPTR(optlen)); } - return ret; + + return do_tcp_setsockopt(sk, SOL_TCP, optname, + KERNEL_SOCKPTR(optval), *optlen); } -static int _bpf_setsockopt(struct sock *sk, int level, int optname, - char *optval, int optlen) +static int sol_ip_sockopt(struct sock *sk, int optname, + char *optval, int *optlen, + bool getopt) { - if (sk_fullsock(sk)) - sock_owned_by_me(sk); - return __bpf_setsockopt(sk, level, optname, optval, optlen); + if (sk->sk_family != AF_INET) + return -EINVAL; + + switch (optname) { + case IP_TOS: + if (*optlen != sizeof(int)) + return -EINVAL; + break; + default: + return -EINVAL; + } + + if (getopt) + return do_ip_getsockopt(sk, SOL_IP, optname, + KERNEL_SOCKPTR(optval), + KERNEL_SOCKPTR(optlen)); + + return do_ip_setsockopt(sk, SOL_IP, optname, + KERNEL_SOCKPTR(optval), *optlen); } -static int __bpf_getsockopt(struct sock *sk, int level, int optname, - char *optval, int optlen) +static int sol_ipv6_sockopt(struct sock *sk, int optname, + char *optval, int *optlen, + bool getopt) { - if (!sk_fullsock(sk)) - goto err_clear; + if (sk->sk_family != AF_INET6) + return -EINVAL; - if (level == SOL_SOCKET) { - if (optlen != sizeof(int)) - goto err_clear; + switch (optname) { + case IPV6_TCLASS: + case IPV6_AUTOFLOWLABEL: + if (*optlen != sizeof(int)) + return -EINVAL; + break; + default: + return -EINVAL; + } - switch (optname) { - case SO_RCVBUF: - *((int *)optval) = sk->sk_rcvbuf; - break; - case SO_SNDBUF: - *((int *)optval) = sk->sk_sndbuf; - break; - case SO_MARK: - *((int *)optval) = sk->sk_mark; - break; - case SO_PRIORITY: - *((int *)optval) = sk->sk_priority; - break; - case SO_BINDTOIFINDEX: - *((int *)optval) = sk->sk_bound_dev_if; - break; - case SO_REUSEPORT: - *((int *)optval) = sk->sk_reuseport; - break; - case SO_TXREHASH: - *((int *)optval) = sk->sk_txrehash; - break; - default: - goto err_clear; - } -#ifdef CONFIG_INET - } else if (level == SOL_TCP && sk->sk_prot->getsockopt == tcp_getsockopt) { - struct inet_connection_sock *icsk; - struct tcp_sock *tp; + if (getopt) + return ipv6_bpf_stub->ipv6_getsockopt(sk, SOL_IPV6, optname, + KERNEL_SOCKPTR(optval), + KERNEL_SOCKPTR(optlen)); - switch (optname) { - case TCP_CONGESTION: - icsk = inet_csk(sk); + return ipv6_bpf_stub->ipv6_setsockopt(sk, SOL_IPV6, optname, + KERNEL_SOCKPTR(optval), *optlen); +} - if (!icsk->icsk_ca_ops || optlen <= 1) - goto err_clear; - strncpy(optval, icsk->icsk_ca_ops->name, optlen); - optval[optlen - 1] = 0; - break; - case TCP_SAVED_SYN: - tp = tcp_sk(sk); +static int __bpf_setsockopt(struct sock *sk, int level, int optname, + char *optval, int optlen) +{ + if (!sk_fullsock(sk)) + return -EINVAL; - if (optlen <= 0 || !tp->saved_syn || - optlen > tcp_saved_syn_len(tp->saved_syn)) - goto err_clear; - memcpy(optval, tp->saved_syn->data, optlen); - break; - default: - goto err_clear; - } - } else if (level == SOL_IP) { - struct inet_sock *inet = inet_sk(sk); + if (level == SOL_SOCKET) + return sol_socket_sockopt(sk, optname, optval, &optlen, false); + else if (IS_ENABLED(CONFIG_INET) && level == SOL_IP) + return sol_ip_sockopt(sk, optname, optval, &optlen, false); + else if (IS_ENABLED(CONFIG_IPV6) && level == SOL_IPV6) + return sol_ipv6_sockopt(sk, optname, optval, &optlen, false); + else if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP) + return sol_tcp_sockopt(sk, optname, optval, &optlen, false); - if (optlen != sizeof(int) || sk->sk_family != AF_INET) - goto err_clear; + return -EINVAL; +} - /* Only some options are supported */ - switch (optname) { - case IP_TOS: - *((int *)optval) = (int)inet->tos; - break; - default: - goto err_clear; - } -#if IS_ENABLED(CONFIG_IPV6) - } else if (level == SOL_IPV6) { - struct ipv6_pinfo *np = inet6_sk(sk); +static int _bpf_setsockopt(struct sock *sk, int level, int optname, + char *optval, int optlen) +{ + if (sk_fullsock(sk)) + sock_owned_by_me(sk); + return __bpf_setsockopt(sk, level, optname, optval, optlen); +} - if (optlen != sizeof(int) || sk->sk_family != AF_INET6) - goto err_clear; +static int __bpf_getsockopt(struct sock *sk, int level, int optname, + char *optval, int optlen) +{ + int err, saved_optlen = optlen; - /* Only some options are supported */ - switch (optname) { - case IPV6_TCLASS: - *((int *)optval) = (int)np->tclass; - break; - default: - goto err_clear; - } -#endif -#endif - } else { - goto err_clear; + if (!sk_fullsock(sk)) { + err = -EINVAL; + goto done; } - return 0; -err_clear: - memset(optval, 0, optlen); - return -EINVAL; + + if (level == SOL_SOCKET) + err = sol_socket_sockopt(sk, optname, optval, &optlen, true); + else if (IS_ENABLED(CONFIG_INET) && level == SOL_TCP) + err = sol_tcp_sockopt(sk, optname, optval, &optlen, true); + else if (IS_ENABLED(CONFIG_INET) && level == SOL_IP) + err = sol_ip_sockopt(sk, optname, optval, &optlen, true); + else if (IS_ENABLED(CONFIG_IPV6) && level == SOL_IPV6) + err = sol_ipv6_sockopt(sk, optname, optval, &optlen, true); + else + err = -EINVAL; + +done: + if (err) + optlen = 0; + if (optlen < saved_optlen) + memset(optval + optlen, 0, saved_optlen - optlen); + return err; } static int _bpf_getsockopt(struct sock *sk, int level, int optname, @@ -5380,12 +5329,6 @@ static int _bpf_getsockopt(struct sock *sk, int level, int optname, BPF_CALL_5(bpf_sk_setsockopt, struct sock *, sk, int, level, int, optname, char *, optval, int, optlen) { - if (level == SOL_TCP && optname == TCP_CONGESTION) { - if (optlen >= sizeof("cdg") - 1 && - !strncmp("cdg", optval, optlen)) - return -ENOTSUPP; - } - return _bpf_setsockopt(sk, level, optname, optval, optlen); } @@ -6469,6 +6412,7 @@ static const struct bpf_func_proto bpf_lwt_seg6_adjust_srh_proto = { static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, int dif, int sdif, u8 family, u8 proto) { + struct inet_hashinfo *hinfo = net->ipv4.tcp_death_row.hashinfo; bool refcounted = false; struct sock *sk = NULL; @@ -6477,7 +6421,7 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, __be32 dst4 = tuple->ipv4.daddr; if (proto == IPPROTO_TCP) - sk = __inet_lookup(net, &tcp_hashinfo, NULL, 0, + sk = __inet_lookup(net, hinfo, NULL, 0, src4, tuple->ipv4.sport, dst4, tuple->ipv4.dport, dif, sdif, &refcounted); @@ -6491,7 +6435,7 @@ static struct sock *sk_lookup(struct net *net, struct bpf_sock_tuple *tuple, struct in6_addr *dst6 = (struct in6_addr *)&tuple->ipv6.daddr; if (proto == IPPROTO_TCP) - sk = __inet6_lookup(net, &tcp_hashinfo, NULL, 0, + sk = __inet6_lookup(net, hinfo, NULL, 0, src6, tuple->ipv6.sport, dst6, ntohs(tuple->ipv6.dport), dif, sdif, &refcounted); @@ -7667,34 +7611,23 @@ const struct bpf_func_proto bpf_sk_storage_get_cg_sock_proto __weak; static const struct bpf_func_proto * sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { + const struct bpf_func_proto *func_proto; + + func_proto = cgroup_common_func_proto(func_id, prog); + if (func_proto) + return func_proto; + + func_proto = cgroup_current_func_proto(func_id, prog); + if (func_proto) + return func_proto; + switch (func_id) { - /* inet and inet6 sockets are created in a process - * context so there is always a valid uid/gid - */ - case BPF_FUNC_get_current_uid_gid: - return &bpf_get_current_uid_gid_proto; - case BPF_FUNC_get_local_storage: - return &bpf_get_local_storage_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_sock_proto; case BPF_FUNC_get_netns_cookie: return &bpf_get_netns_cookie_sock_proto; case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; - case BPF_FUNC_get_current_pid_tgid: - return &bpf_get_current_pid_tgid_proto; - case BPF_FUNC_get_current_comm: - return &bpf_get_current_comm_proto; -#ifdef CONFIG_CGROUPS - case BPF_FUNC_get_current_cgroup_id: - return &bpf_get_current_cgroup_id_proto; - case BPF_FUNC_get_current_ancestor_cgroup_id: - return &bpf_get_current_ancestor_cgroup_id_proto; -#endif -#ifdef CONFIG_CGROUP_NET_CLASSID - case BPF_FUNC_get_cgroup_classid: - return &bpf_get_cgroup_classid_curr_proto; -#endif case BPF_FUNC_sk_storage_get: return &bpf_sk_storage_get_cg_sock_proto; case BPF_FUNC_ktime_get_coarse_ns: @@ -7707,12 +7640,17 @@ sock_filter_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) static const struct bpf_func_proto * sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { + const struct bpf_func_proto *func_proto; + + func_proto = cgroup_common_func_proto(func_id, prog); + if (func_proto) + return func_proto; + + func_proto = cgroup_current_func_proto(func_id, prog); + if (func_proto) + return func_proto; + switch (func_id) { - /* inet and inet6 sockets are created in a process - * context so there is always a valid uid/gid - */ - case BPF_FUNC_get_current_uid_gid: - return &bpf_get_current_uid_gid_proto; case BPF_FUNC_bind: switch (prog->expected_attach_type) { case BPF_CGROUP_INET4_CONNECT: @@ -7725,24 +7663,8 @@ sock_addr_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_get_socket_cookie_sock_addr_proto; case BPF_FUNC_get_netns_cookie: return &bpf_get_netns_cookie_sock_addr_proto; - case BPF_FUNC_get_local_storage: - return &bpf_get_local_storage_proto; case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; - case BPF_FUNC_get_current_pid_tgid: - return &bpf_get_current_pid_tgid_proto; - case BPF_FUNC_get_current_comm: - return &bpf_get_current_comm_proto; -#ifdef CONFIG_CGROUPS - case BPF_FUNC_get_current_cgroup_id: - return &bpf_get_current_cgroup_id_proto; - case BPF_FUNC_get_current_ancestor_cgroup_id: - return &bpf_get_current_ancestor_cgroup_id_proto; -#endif -#ifdef CONFIG_CGROUP_NET_CLASSID - case BPF_FUNC_get_cgroup_classid: - return &bpf_get_cgroup_classid_curr_proto; -#endif #ifdef CONFIG_INET case BPF_FUNC_sk_lookup_tcp: return &bpf_sock_addr_sk_lookup_tcp_proto; @@ -7823,9 +7745,13 @@ const struct bpf_func_proto bpf_sk_storage_delete_proto __weak; static const struct bpf_func_proto * cg_skb_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { + const struct bpf_func_proto *func_proto; + + func_proto = cgroup_common_func_proto(func_id, prog); + if (func_proto) + return func_proto; + switch (func_id) { - case BPF_FUNC_get_local_storage: - return &bpf_get_local_storage_proto; case BPF_FUNC_sk_fullsock: return &bpf_sk_fullsock_proto; case BPF_FUNC_sk_storage_get: @@ -8065,6 +7991,12 @@ const struct bpf_func_proto bpf_sock_hash_update_proto __weak; static const struct bpf_func_proto * sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) { + const struct bpf_func_proto *func_proto; + + func_proto = cgroup_common_func_proto(func_id, prog); + if (func_proto) + return func_proto; + switch (func_id) { case BPF_FUNC_setsockopt: return &bpf_sock_ops_setsockopt_proto; @@ -8078,8 +8010,6 @@ sock_ops_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) return &bpf_sock_hash_update_proto; case BPF_FUNC_get_socket_cookie: return &bpf_get_socket_cookie_sock_ops_proto; - case BPF_FUNC_get_local_storage: - return &bpf_get_local_storage_proto; case BPF_FUNC_perf_event_output: return &bpf_event_output_data_proto; case BPF_FUNC_sk_storage_get: @@ -8714,6 +8644,36 @@ static bool tc_cls_act_is_valid_access(int off, int size, return bpf_skb_is_valid_access(off, size, type, prog, info); } +DEFINE_MUTEX(nf_conn_btf_access_lock); +EXPORT_SYMBOL_GPL(nf_conn_btf_access_lock); + +int (*nfct_btf_struct_access)(struct bpf_verifier_log *log, const struct btf *btf, + const struct btf_type *t, int off, int size, + enum bpf_access_type atype, u32 *next_btf_id, + enum bpf_type_flag *flag); +EXPORT_SYMBOL_GPL(nfct_btf_struct_access); + +static int tc_cls_act_btf_struct_access(struct bpf_verifier_log *log, + const struct btf *btf, + const struct btf_type *t, int off, + int size, enum bpf_access_type atype, + u32 *next_btf_id, + enum bpf_type_flag *flag) +{ + int ret = -EACCES; + + if (atype == BPF_READ) + return btf_struct_access(log, btf, t, off, size, atype, next_btf_id, + flag); + + mutex_lock(&nf_conn_btf_access_lock); + if (nfct_btf_struct_access) + ret = nfct_btf_struct_access(log, btf, t, off, size, atype, next_btf_id, flag); + mutex_unlock(&nf_conn_btf_access_lock); + + return ret; +} + static bool __is_valid_xdp_access(int off, int size) { if (off < 0 || off >= sizeof(struct xdp_md)) @@ -8773,6 +8733,27 @@ void bpf_warn_invalid_xdp_action(struct net_device *dev, struct bpf_prog *prog, } EXPORT_SYMBOL_GPL(bpf_warn_invalid_xdp_action); +static int xdp_btf_struct_access(struct bpf_verifier_log *log, + const struct btf *btf, + const struct btf_type *t, int off, + int size, enum bpf_access_type atype, + u32 *next_btf_id, + enum bpf_type_flag *flag) +{ + int ret = -EACCES; + + if (atype == BPF_READ) + return btf_struct_access(log, btf, t, off, size, atype, next_btf_id, + flag); + + mutex_lock(&nf_conn_btf_access_lock); + if (nfct_btf_struct_access) + ret = nfct_btf_struct_access(log, btf, t, off, size, atype, next_btf_id, flag); + mutex_unlock(&nf_conn_btf_access_lock); + + return ret; +} + static bool sock_addr_is_valid_access(int off, int size, enum bpf_access_type type, const struct bpf_prog *prog, @@ -10667,6 +10648,7 @@ const struct bpf_verifier_ops tc_cls_act_verifier_ops = { .convert_ctx_access = tc_cls_act_convert_ctx_access, .gen_prologue = tc_cls_act_prologue, .gen_ld_abs = bpf_gen_ld_abs, + .btf_struct_access = tc_cls_act_btf_struct_access, }; const struct bpf_prog_ops tc_cls_act_prog_ops = { @@ -10678,6 +10660,7 @@ const struct bpf_verifier_ops xdp_verifier_ops = { .is_valid_access = xdp_is_valid_access, .convert_ctx_access = xdp_convert_ctx_access, .gen_prologue = bpf_noop_prologue, + .btf_struct_access = xdp_btf_struct_access, }; const struct bpf_prog_ops xdp_prog_ops = { @@ -10812,14 +10795,13 @@ int sk_detach_filter(struct sock *sk) } EXPORT_SYMBOL_GPL(sk_detach_filter); -int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf, - unsigned int len) +int sk_get_filter(struct sock *sk, sockptr_t optval, unsigned int len) { struct sock_fprog_kern *fprog; struct sk_filter *filter; int ret = 0; - lock_sock(sk); + sockopt_lock_sock(sk); filter = rcu_dereference_protected(sk->sk_filter, lockdep_sock_is_held(sk)); if (!filter) @@ -10844,7 +10826,7 @@ int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf, goto out; ret = -EFAULT; - if (copy_to_user(ubuf, fprog->filter, bpf_classic_proglen(fprog))) + if (copy_to_sockptr(optval, fprog->filter, bpf_classic_proglen(fprog))) goto out; /* Instead of bytes, the API requests to return the number @@ -10852,7 +10834,7 @@ int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf, */ ret = fprog->len; out: - release_sock(sk); + sockopt_release_sock(sk); return ret; } diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c index 764c4cb3fe8f..25cd35f5922e 100644 --- a/net/core/flow_dissector.c +++ b/net/core/flow_dissector.c @@ -204,6 +204,30 @@ static void __skb_flow_dissect_icmp(const struct sk_buff *skb, skb_flow_get_icmp_tci(skb, key_icmp, data, thoff, hlen); } +static void __skb_flow_dissect_l2tpv3(const struct sk_buff *skb, + struct flow_dissector *flow_dissector, + void *target_container, const void *data, + int nhoff, int hlen) +{ + struct flow_dissector_key_l2tpv3 *key_l2tpv3; + struct { + __be32 session_id; + } *hdr, _hdr; + + if (!dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_L2TPV3)) + return; + + hdr = __skb_header_pointer(skb, nhoff, sizeof(_hdr), data, hlen, &_hdr); + if (!hdr) + return; + + key_l2tpv3 = skb_flow_dissector_target(flow_dissector, + FLOW_DISSECTOR_KEY_L2TPV3, + target_container); + + key_l2tpv3->session_id = hdr->session_id; +} + void skb_flow_dissect_meta(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container) @@ -866,8 +890,8 @@ static void __skb_flow_bpf_to_target(const struct bpf_flow_keys *flow_keys, } } -bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx, - __be16 proto, int nhoff, int hlen, unsigned int flags) +u32 bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx, + __be16 proto, int nhoff, int hlen, unsigned int flags) { struct bpf_flow_keys *flow_keys = ctx->flow_keys; u32 result; @@ -892,7 +916,7 @@ bool bpf_flow_dissect(struct bpf_prog *prog, struct bpf_flow_dissector *ctx, flow_keys->thoff = clamp_t(u16, flow_keys->thoff, flow_keys->nhoff, hlen); - return result == BPF_OK; + return result; } static bool is_pppoe_ses_hdr_valid(const struct pppoe_hdr *hdr) @@ -1008,6 +1032,7 @@ bool __skb_flow_dissect(const struct net *net, }; __be16 n_proto = proto; struct bpf_prog *prog; + u32 result; if (skb) { ctx.skb = skb; @@ -1019,13 +1044,16 @@ bool __skb_flow_dissect(const struct net *net, } prog = READ_ONCE(run_array->items[0].prog); - ret = bpf_flow_dissect(prog, &ctx, n_proto, nhoff, - hlen, flags); + result = bpf_flow_dissect(prog, &ctx, n_proto, nhoff, + hlen, flags); + if (result == BPF_FLOW_DISSECTOR_CONTINUE) + goto dissect_continue; __skb_flow_bpf_to_target(&flow_keys, flow_dissector, target_container); rcu_read_unlock(); - return ret; + return result == BPF_OK; } +dissect_continue: rcu_read_unlock(); } @@ -1173,8 +1201,8 @@ proto_again: nhoff += sizeof(*vlan); } - if (dissector_uses_key(flow_dissector, - FLOW_DISSECTOR_KEY_NUM_OF_VLANS)) { + if (dissector_uses_key(flow_dissector, FLOW_DISSECTOR_KEY_NUM_OF_VLANS) && + !(key_control->flags & FLOW_DIS_ENCAPSULATION)) { struct flow_dissector_key_num_of_vlans *key_nvs; key_nvs = skb_flow_dissector_target(flow_dissector, @@ -1497,6 +1525,10 @@ ip_proto_again: __skb_flow_dissect_icmp(skb, flow_dissector, target_container, data, nhoff, hlen); break; + case IPPROTO_L2TP: + __skb_flow_dissect_l2tpv3(skb, flow_dissector, target_container, + data, nhoff, hlen); + break; default: break; @@ -1611,9 +1643,8 @@ static inline void __flow_hash_consistentify(struct flow_keys *keys) switch (keys->control.addr_type) { case FLOW_DISSECTOR_KEY_IPV4_ADDRS: - addr_diff = (__force u32)keys->addrs.v4addrs.dst - - (__force u32)keys->addrs.v4addrs.src; - if (addr_diff < 0) + if ((__force u32)keys->addrs.v4addrs.dst < + (__force u32)keys->addrs.v4addrs.src) swap(keys->addrs.v4addrs.src, keys->addrs.v4addrs.dst); if ((__force u16)keys->ports.dst < diff --git a/net/core/flow_offload.c b/net/core/flow_offload.c index 8cfb63528d18..abe423fd5736 100644 --- a/net/core/flow_offload.c +++ b/net/core/flow_offload.c @@ -237,6 +237,13 @@ void flow_rule_match_pppoe(const struct flow_rule *rule, } EXPORT_SYMBOL(flow_rule_match_pppoe); +void flow_rule_match_l2tpv3(const struct flow_rule *rule, + struct flow_match_l2tpv3 *out) +{ + FLOW_DISSECTOR_MATCH(rule, FLOW_DISSECTOR_KEY_L2TPV3, out); +} +EXPORT_SYMBOL(flow_rule_match_l2tpv3); + struct flow_block_cb *flow_block_cb_alloc(flow_setup_cb_t *cb, void *cb_ident, void *cb_priv, void (*release)(void *cb_priv)) diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c index c8d137ef5980..b71ccaec0991 100644 --- a/net/core/gen_stats.c +++ b/net/core/gen_stats.c @@ -135,10 +135,10 @@ static void gnet_stats_add_basic_cpu(struct gnet_stats_basic_sync *bstats, u64 bytes, packets; do { - start = u64_stats_fetch_begin_irq(&bcpu->syncp); + start = u64_stats_fetch_begin(&bcpu->syncp); bytes = u64_stats_read(&bcpu->bytes); packets = u64_stats_read(&bcpu->packets); - } while (u64_stats_fetch_retry_irq(&bcpu->syncp, start)); + } while (u64_stats_fetch_retry(&bcpu->syncp, start)); t_bytes += bytes; t_packets += packets; @@ -162,10 +162,10 @@ void gnet_stats_add_basic(struct gnet_stats_basic_sync *bstats, } do { if (running) - start = u64_stats_fetch_begin_irq(&b->syncp); + start = u64_stats_fetch_begin(&b->syncp); bytes = u64_stats_read(&b->bytes); packets = u64_stats_read(&b->packets); - } while (running && u64_stats_fetch_retry_irq(&b->syncp, start)); + } while (running && u64_stats_fetch_retry(&b->syncp, start)); _bstats_update(bstats, bytes, packets); } @@ -187,10 +187,10 @@ static void gnet_stats_read_basic(u64 *ret_bytes, u64 *ret_packets, u64 bytes, packets; do { - start = u64_stats_fetch_begin_irq(&bcpu->syncp); + start = u64_stats_fetch_begin(&bcpu->syncp); bytes = u64_stats_read(&bcpu->bytes); packets = u64_stats_read(&bcpu->packets); - } while (u64_stats_fetch_retry_irq(&bcpu->syncp, start)); + } while (u64_stats_fetch_retry(&bcpu->syncp, start)); t_bytes += bytes; t_packets += packets; @@ -201,10 +201,10 @@ static void gnet_stats_read_basic(u64 *ret_bytes, u64 *ret_packets, } do { if (running) - start = u64_stats_fetch_begin_irq(&b->syncp); + start = u64_stats_fetch_begin(&b->syncp); *ret_bytes = u64_stats_read(&b->bytes); *ret_packets = u64_stats_read(&b->packets); - } while (running && u64_stats_fetch_retry_irq(&b->syncp, start)); + } while (running && u64_stats_fetch_retry(&b->syncp, start)); } static int diff --git a/net/core/gro.c b/net/core/gro.c index b4190eb08467..bc9451743307 100644 --- a/net/core/gro.c +++ b/net/core/gro.c @@ -160,6 +160,7 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) unsigned int gro_max_size; unsigned int new_truesize; struct sk_buff *lp; + int segs; /* pairs with WRITE_ONCE() in netif_set_gro_max_size() */ gro_max_size = READ_ONCE(p->dev->gro_max_size); @@ -175,6 +176,7 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff *skb) return -E2BIG; } + segs = NAPI_GRO_CB(skb)->count; lp = NAPI_GRO_CB(p)->last; pinfo = skb_shinfo(lp); @@ -265,7 +267,7 @@ merge: lp = p; done: - NAPI_GRO_CB(p)->count++; + NAPI_GRO_CB(p)->count += segs; p->data_len += len; p->truesize += delta_truesize; p->len += len; @@ -496,8 +498,15 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff BUILD_BUG_ON(!IS_ALIGNED(offsetof(struct napi_gro_cb, zeroed), sizeof(u32))); /* Avoid slow unaligned acc */ *(u32 *)&NAPI_GRO_CB(skb)->zeroed = 0; - NAPI_GRO_CB(skb)->flush = skb_is_gso(skb) || skb_has_frag_list(skb); + NAPI_GRO_CB(skb)->flush = skb_has_frag_list(skb); NAPI_GRO_CB(skb)->is_atomic = 1; + NAPI_GRO_CB(skb)->count = 1; + if (unlikely(skb_is_gso(skb))) { + NAPI_GRO_CB(skb)->count = skb_shinfo(skb)->gso_segs; + /* Only support TCP at the moment. */ + if (!skb_is_gso_tcp(skb)) + NAPI_GRO_CB(skb)->flush = 1; + } /* Setup for GRO checksum validation */ switch (skb->ip_summed) { @@ -545,10 +554,10 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff else gro_list->count++; - NAPI_GRO_CB(skb)->count = 1; NAPI_GRO_CB(skb)->age = jiffies; NAPI_GRO_CB(skb)->last = skb; - skb_shinfo(skb)->gso_size = skb_gro_len(skb); + if (!skb_is_gso(skb)) + skb_shinfo(skb)->gso_size = skb_gro_len(skb); list_add(&skb->list, &gro_list->list); ret = GRO_HELD; @@ -660,6 +669,7 @@ static void napi_reuse_skb(struct napi_struct *napi, struct sk_buff *skb) skb->encapsulation = 0; skb_shinfo(skb)->gso_type = 0; + skb_shinfo(skb)->gso_size = 0; if (unlikely(skb->slow_gro)) { skb_orphan(skb); skb_ext_reset(skb); diff --git a/net/core/gro_cells.c b/net/core/gro_cells.c index 21619c70a82b..ed5ec5de47f6 100644 --- a/net/core/gro_cells.c +++ b/net/core/gro_cells.c @@ -81,8 +81,7 @@ int gro_cells_init(struct gro_cells *gcells, struct net_device *dev) set_bit(NAPI_STATE_NO_BUSY_POLL, &cell->napi.state); - netif_napi_add(dev, &cell->napi, gro_cell_poll, - NAPI_POLL_WEIGHT); + netif_napi_add(dev, &cell->napi, gro_cell_poll); napi_enable(&cell->napi); } return 0; diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c index 9ccd64e8a666..6fac2f0ef074 100644 --- a/net/core/lwtunnel.c +++ b/net/core/lwtunnel.c @@ -50,6 +50,7 @@ static const char *lwtunnel_encap_str(enum lwtunnel_encap_types encap_type) return "IOAM6"; case LWTUNNEL_ENCAP_IP6: case LWTUNNEL_ENCAP_IP: + case LWTUNNEL_ENCAP_XFRM: case LWTUNNEL_ENCAP_NONE: case __LWTUNNEL_ENCAP_MAX: /* should not have got here */ diff --git a/net/core/neighbour.c b/net/core/neighbour.c index e93edb810103..3c4786b99907 100644 --- a/net/core/neighbour.c +++ b/net/core/neighbour.c @@ -111,7 +111,7 @@ static void neigh_cleanup_and_release(struct neighbour *neigh) unsigned long neigh_rand_reach_time(unsigned long base) { - return base ? (prandom_u32() % base) + (base >> 1) : 0; + return base ? prandom_u32_max(base) + (base >> 1) : 0; } EXPORT_SYMBOL(neigh_rand_reach_time); diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c index d61afd21aab5..8409d41405df 100644 --- a/net/core/net-sysfs.c +++ b/net/core/net-sysfs.c @@ -59,7 +59,7 @@ static ssize_t netdev_show(const struct device *dev, #define NETDEVICE_SHOW(field, format_string) \ static ssize_t format_##field(const struct net_device *dev, char *buf) \ { \ - return sprintf(buf, format_string, dev->field); \ + return sysfs_emit(buf, format_string, dev->field); \ } \ static ssize_t field##_show(struct device *dev, \ struct device_attribute *attr, char *buf) \ @@ -118,13 +118,13 @@ static ssize_t iflink_show(struct device *dev, struct device_attribute *attr, { struct net_device *ndev = to_net_dev(dev); - return sprintf(buf, fmt_dec, dev_get_iflink(ndev)); + return sysfs_emit(buf, fmt_dec, dev_get_iflink(ndev)); } static DEVICE_ATTR_RO(iflink); static ssize_t format_name_assign_type(const struct net_device *dev, char *buf) { - return sprintf(buf, fmt_dec, dev->name_assign_type); + return sysfs_emit(buf, fmt_dec, dev->name_assign_type); } static ssize_t name_assign_type_show(struct device *dev, @@ -194,7 +194,7 @@ static ssize_t carrier_show(struct device *dev, struct net_device *netdev = to_net_dev(dev); if (netif_running(netdev)) - return sprintf(buf, fmt_dec, !!netif_carrier_ok(netdev)); + return sysfs_emit(buf, fmt_dec, !!netif_carrier_ok(netdev)); return -EINVAL; } @@ -219,7 +219,7 @@ static ssize_t speed_show(struct device *dev, struct ethtool_link_ksettings cmd; if (!__ethtool_get_link_ksettings(netdev, &cmd)) - ret = sprintf(buf, fmt_dec, cmd.base.speed); + ret = sysfs_emit(buf, fmt_dec, cmd.base.speed); } rtnl_unlock(); return ret; @@ -258,7 +258,7 @@ static ssize_t duplex_show(struct device *dev, duplex = "unknown"; break; } - ret = sprintf(buf, "%s\n", duplex); + ret = sysfs_emit(buf, "%s\n", duplex); } } rtnl_unlock(); @@ -272,7 +272,7 @@ static ssize_t testing_show(struct device *dev, struct net_device *netdev = to_net_dev(dev); if (netif_running(netdev)) - return sprintf(buf, fmt_dec, !!netif_testing(netdev)); + return sysfs_emit(buf, fmt_dec, !!netif_testing(netdev)); return -EINVAL; } @@ -284,7 +284,7 @@ static ssize_t dormant_show(struct device *dev, struct net_device *netdev = to_net_dev(dev); if (netif_running(netdev)) - return sprintf(buf, fmt_dec, !!netif_dormant(netdev)); + return sysfs_emit(buf, fmt_dec, !!netif_dormant(netdev)); return -EINVAL; } @@ -315,7 +315,7 @@ static ssize_t operstate_show(struct device *dev, if (operstate >= ARRAY_SIZE(operstates)) return -EINVAL; /* should not happen */ - return sprintf(buf, "%s\n", operstates[operstate]); + return sysfs_emit(buf, "%s\n", operstates[operstate]); } static DEVICE_ATTR_RO(operstate); @@ -325,9 +325,9 @@ static ssize_t carrier_changes_show(struct device *dev, { struct net_device *netdev = to_net_dev(dev); - return sprintf(buf, fmt_dec, - atomic_read(&netdev->carrier_up_count) + - atomic_read(&netdev->carrier_down_count)); + return sysfs_emit(buf, fmt_dec, + atomic_read(&netdev->carrier_up_count) + + atomic_read(&netdev->carrier_down_count)); } static DEVICE_ATTR_RO(carrier_changes); @@ -337,7 +337,7 @@ static ssize_t carrier_up_count_show(struct device *dev, { struct net_device *netdev = to_net_dev(dev); - return sprintf(buf, fmt_dec, atomic_read(&netdev->carrier_up_count)); + return sysfs_emit(buf, fmt_dec, atomic_read(&netdev->carrier_up_count)); } static DEVICE_ATTR_RO(carrier_up_count); @@ -347,7 +347,7 @@ static ssize_t carrier_down_count_show(struct device *dev, { struct net_device *netdev = to_net_dev(dev); - return sprintf(buf, fmt_dec, atomic_read(&netdev->carrier_down_count)); + return sysfs_emit(buf, fmt_dec, atomic_read(&netdev->carrier_down_count)); } static DEVICE_ATTR_RO(carrier_down_count); @@ -462,7 +462,7 @@ static ssize_t ifalias_show(struct device *dev, ret = dev_get_alias(netdev, tmp, sizeof(tmp)); if (ret > 0) - ret = sprintf(buf, "%s\n", tmp); + ret = sysfs_emit(buf, "%s\n", tmp); return ret; } static DEVICE_ATTR_RW(ifalias); @@ -514,7 +514,7 @@ static ssize_t phys_port_id_show(struct device *dev, ret = dev_get_phys_port_id(netdev, &ppid); if (!ret) - ret = sprintf(buf, "%*phN\n", ppid.id_len, ppid.id); + ret = sysfs_emit(buf, "%*phN\n", ppid.id_len, ppid.id); } rtnl_unlock(); @@ -543,7 +543,7 @@ static ssize_t phys_port_name_show(struct device *dev, ret = dev_get_phys_port_name(netdev, name, sizeof(name)); if (!ret) - ret = sprintf(buf, "%s\n", name); + ret = sysfs_emit(buf, "%s\n", name); } rtnl_unlock(); @@ -573,7 +573,7 @@ static ssize_t phys_switch_id_show(struct device *dev, ret = dev_get_port_parent_id(netdev, &ppid, false); if (!ret) - ret = sprintf(buf, "%*phN\n", ppid.id_len, ppid.id); + ret = sysfs_emit(buf, "%*phN\n", ppid.id_len, ppid.id); } rtnl_unlock(); @@ -591,7 +591,7 @@ static ssize_t threaded_show(struct device *dev, return restart_syscall(); if (dev_isalive(netdev)) - ret = sprintf(buf, fmt_dec, netdev->threaded); + ret = sysfs_emit(buf, fmt_dec, netdev->threaded); rtnl_unlock(); return ret; @@ -673,7 +673,7 @@ static ssize_t netstat_show(const struct device *d, struct rtnl_link_stats64 temp; const struct rtnl_link_stats64 *stats = dev_get_stats(dev, &temp); - ret = sprintf(buf, fmt_u64, *(u64 *)(((u8 *)stats) + offset)); + ret = sysfs_emit(buf, fmt_u64, *(u64 *)(((u8 *)stats) + offset)); } read_unlock(&dev_base_lock); return ret; @@ -824,7 +824,7 @@ static ssize_t show_rps_map(struct netdev_rx_queue *queue, char *buf) for (i = 0; i < map->len; i++) cpumask_set_cpu(map->cpus[i], mask); - len = snprintf(buf, PAGE_SIZE, "%*pb\n", cpumask_pr_args(mask)); + len = sysfs_emit(buf, "%*pb\n", cpumask_pr_args(mask)); rcu_read_unlock(); free_cpumask_var(mask); @@ -910,7 +910,7 @@ static ssize_t show_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue, val = (unsigned long)flow_table->mask + 1; rcu_read_unlock(); - return sprintf(buf, "%lu\n", val); + return sysfs_emit(buf, "%lu\n", val); } static void rps_dev_flow_table_release(struct rcu_head *rcu) @@ -1208,7 +1208,7 @@ static ssize_t tx_timeout_show(struct netdev_queue *queue, char *buf) { unsigned long trans_timeout = atomic_long_read(&queue->trans_timeout); - return sprintf(buf, fmt_ulong, trans_timeout); + return sysfs_emit(buf, fmt_ulong, trans_timeout); } static unsigned int get_netdev_queue_index(struct netdev_queue *queue) @@ -1255,15 +1255,15 @@ static ssize_t traffic_class_show(struct netdev_queue *queue, * belongs to the root device it will be reported with just the * traffic class, so just "0" for TC 0 for example. */ - return num_tc < 0 ? sprintf(buf, "%d%d\n", tc, num_tc) : - sprintf(buf, "%d\n", tc); + return num_tc < 0 ? sysfs_emit(buf, "%d%d\n", tc, num_tc) : + sysfs_emit(buf, "%d\n", tc); } #ifdef CONFIG_XPS static ssize_t tx_maxrate_show(struct netdev_queue *queue, char *buf) { - return sprintf(buf, "%lu\n", queue->tx_maxrate); + return sysfs_emit(buf, "%lu\n", queue->tx_maxrate); } static ssize_t tx_maxrate_store(struct netdev_queue *queue, @@ -1317,7 +1317,7 @@ static struct netdev_queue_attribute queue_traffic_class __ro_after_init */ static ssize_t bql_show(char *buf, unsigned int value) { - return sprintf(buf, "%u\n", value); + return sysfs_emit(buf, "%u\n", value); } static ssize_t bql_set(const char *buf, const size_t count, @@ -1346,7 +1346,7 @@ static ssize_t bql_show_hold_time(struct netdev_queue *queue, { struct dql *dql = &queue->dql; - return sprintf(buf, "%u\n", jiffies_to_msecs(dql->slack_hold_time)); + return sysfs_emit(buf, "%u\n", jiffies_to_msecs(dql->slack_hold_time)); } static ssize_t bql_set_hold_time(struct netdev_queue *queue, @@ -1374,7 +1374,7 @@ static ssize_t bql_show_inflight(struct netdev_queue *queue, { struct dql *dql = &queue->dql; - return sprintf(buf, "%u\n", dql->num_queued - dql->num_completed); + return sysfs_emit(buf, "%u\n", dql->num_queued - dql->num_completed); } static struct netdev_queue_attribute bql_inflight_attribute __ro_after_init = diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c index 6b9f19122ec1..5581d22cc191 100644 --- a/net/core/net_namespace.c +++ b/net/core/net_namespace.c @@ -18,7 +18,6 @@ #include <linux/user_namespace.h> #include <linux/net_namespace.h> #include <linux/sched/task.h> -#include <linux/sched/mm.h> #include <linux/uidgid.h> #include <linux/cookie.h> @@ -118,6 +117,7 @@ static int net_assign_generic(struct net *net, unsigned int id, void *data) static int ops_init(const struct pernet_operations *ops, struct net *net) { + struct net_generic *ng; int err = -ENOMEM; void *data = NULL; @@ -136,7 +136,13 @@ static int ops_init(const struct pernet_operations *ops, struct net *net) if (!err) return 0; + if (ops->id && ops->size) { cleanup: + ng = rcu_dereference_protected(net->gen, + lockdep_is_held(&pernet_ops_rwsem)); + ng->ptr[*ops->id] = NULL; + } + kfree(data); out: @@ -310,6 +316,7 @@ static __net_init int setup_net(struct net *net, struct user_namespace *user_ns) refcount_set(&net->ns.count, 1); ref_tracker_dir_init(&net->refcnt_tracker, 128); + ref_tracker_dir_init(&net->notrefcnt_tracker, 128); refcount_set(&net->passive, 1); get_random_bytes(&net->hash_mix, sizeof(u32)); @@ -430,6 +437,10 @@ static void net_free(struct net *net) { if (refcount_dec_and_test(&net->passive)) { kfree(rcu_access_pointer(net->gen)); + + /* There should not be any trackers left there. */ + ref_tracker_dir_exit(&net->notrefcnt_tracker); + kmem_cache_free(net_cachep, net); } } @@ -1144,13 +1155,7 @@ static int __register_pernet_operations(struct list_head *list, * setup_net() and cleanup_net() are not possible. */ for_each_net(net) { - struct mem_cgroup *old, *memcg; - - memcg = mem_cgroup_or_root(get_mem_cgroup_from_obj(net)); - old = set_active_memcg(memcg); error = ops_init(ops, net); - set_active_memcg(old); - mem_cgroup_put(memcg); if (error) goto out_undo; list_add_tail(&net->exit_list, &net_exit_list); diff --git a/net/core/pktgen.c b/net/core/pktgen.c index 88906ba6d9a7..c3763056c554 100644 --- a/net/core/pktgen.c +++ b/net/core/pktgen.c @@ -2324,7 +2324,7 @@ static inline int f_pick(struct pktgen_dev *pkt_dev) pkt_dev->curfl = 0; /*reset */ } } else { - flow = prandom_u32() % pkt_dev->cflows; + flow = prandom_u32_max(pkt_dev->cflows); pkt_dev->curfl = flow; if (pkt_dev->flows[flow].count > pkt_dev->lflow) { @@ -2380,10 +2380,9 @@ static void set_cur_queue_map(struct pktgen_dev *pkt_dev) else if (pkt_dev->queue_map_min <= pkt_dev->queue_map_max) { __u16 t; if (pkt_dev->flags & F_QUEUE_MAP_RND) { - t = prandom_u32() % - (pkt_dev->queue_map_max - - pkt_dev->queue_map_min + 1) - + pkt_dev->queue_map_min; + t = prandom_u32_max(pkt_dev->queue_map_max - + pkt_dev->queue_map_min + 1) + + pkt_dev->queue_map_min; } else { t = pkt_dev->cur_queue_map + 1; if (t > pkt_dev->queue_map_max) @@ -2412,7 +2411,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) __u32 tmp; if (pkt_dev->flags & F_MACSRC_RND) - mc = prandom_u32() % pkt_dev->src_mac_count; + mc = prandom_u32_max(pkt_dev->src_mac_count); else { mc = pkt_dev->cur_src_mac_offset++; if (pkt_dev->cur_src_mac_offset >= @@ -2438,7 +2437,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) __u32 tmp; if (pkt_dev->flags & F_MACDST_RND) - mc = prandom_u32() % pkt_dev->dst_mac_count; + mc = prandom_u32_max(pkt_dev->dst_mac_count); else { mc = pkt_dev->cur_dst_mac_offset++; @@ -2465,23 +2464,23 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) for (i = 0; i < pkt_dev->nr_labels; i++) if (pkt_dev->labels[i] & MPLS_STACK_BOTTOM) pkt_dev->labels[i] = MPLS_STACK_BOTTOM | - ((__force __be32)prandom_u32() & + ((__force __be32)get_random_u32() & htonl(0x000fffff)); } if ((pkt_dev->flags & F_VID_RND) && (pkt_dev->vlan_id != 0xffff)) { - pkt_dev->vlan_id = prandom_u32() & (4096 - 1); + pkt_dev->vlan_id = prandom_u32_max(4096); } if ((pkt_dev->flags & F_SVID_RND) && (pkt_dev->svlan_id != 0xffff)) { - pkt_dev->svlan_id = prandom_u32() & (4096 - 1); + pkt_dev->svlan_id = prandom_u32_max(4096); } if (pkt_dev->udp_src_min < pkt_dev->udp_src_max) { if (pkt_dev->flags & F_UDPSRC_RND) - pkt_dev->cur_udp_src = prandom_u32() % - (pkt_dev->udp_src_max - pkt_dev->udp_src_min) - + pkt_dev->udp_src_min; + pkt_dev->cur_udp_src = prandom_u32_max( + pkt_dev->udp_src_max - pkt_dev->udp_src_min) + + pkt_dev->udp_src_min; else { pkt_dev->cur_udp_src++; @@ -2492,9 +2491,9 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) if (pkt_dev->udp_dst_min < pkt_dev->udp_dst_max) { if (pkt_dev->flags & F_UDPDST_RND) { - pkt_dev->cur_udp_dst = prandom_u32() % - (pkt_dev->udp_dst_max - pkt_dev->udp_dst_min) - + pkt_dev->udp_dst_min; + pkt_dev->cur_udp_dst = prandom_u32_max( + pkt_dev->udp_dst_max - pkt_dev->udp_dst_min) + + pkt_dev->udp_dst_min; } else { pkt_dev->cur_udp_dst++; if (pkt_dev->cur_udp_dst >= pkt_dev->udp_dst_max) @@ -2509,7 +2508,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) if (imn < imx) { __u32 t; if (pkt_dev->flags & F_IPSRC_RND) - t = prandom_u32() % (imx - imn) + imn; + t = prandom_u32_max(imx - imn) + imn; else { t = ntohl(pkt_dev->cur_saddr); t++; @@ -2531,8 +2530,8 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) if (pkt_dev->flags & F_IPDST_RND) { do { - t = prandom_u32() % - (imx - imn) + imn; + t = prandom_u32_max(imx - imn) + + imn; s = htonl(t); } while (ipv4_is_loopback(s) || ipv4_is_multicast(s) || @@ -2569,7 +2568,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) for (i = 0; i < 4; i++) { pkt_dev->cur_in6_daddr.s6_addr32[i] = - (((__force __be32)prandom_u32() | + (((__force __be32)get_random_u32() | pkt_dev->min_in6_daddr.s6_addr32[i]) & pkt_dev->max_in6_daddr.s6_addr32[i]); } @@ -2579,9 +2578,9 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) if (pkt_dev->min_pkt_size < pkt_dev->max_pkt_size) { __u32 t; if (pkt_dev->flags & F_TXSIZE_RND) { - t = prandom_u32() % - (pkt_dev->max_pkt_size - pkt_dev->min_pkt_size) - + pkt_dev->min_pkt_size; + t = prandom_u32_max(pkt_dev->max_pkt_size - + pkt_dev->min_pkt_size) + + pkt_dev->min_pkt_size; } else { t = pkt_dev->cur_pkt_size + 1; if (t > pkt_dev->max_pkt_size) @@ -2590,7 +2589,7 @@ static void mod_cur_headers(struct pktgen_dev *pkt_dev) pkt_dev->cur_pkt_size = t; } else if (pkt_dev->n_imix_entries > 0) { struct imix_pkt *entry; - __u32 t = prandom_u32() % IMIX_PRECISION; + __u32 t = prandom_u32_max(IMIX_PRECISION); __u8 entry_index = pkt_dev->imix_distribution[t]; entry = &pkt_dev->imix_entries[entry_index]; diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index f5e87fe57c83..74864dc46a7e 100644 --- a/net/core/rtnetlink.c +++ b/net/core/rtnetlink.c @@ -1057,6 +1057,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev, + nla_total_size(4) /* IFLA_MASTER */ + nla_total_size(1) /* IFLA_CARRIER */ + nla_total_size(4) /* IFLA_PROMISCUITY */ + + nla_total_size(4) /* IFLA_ALLMULTI */ + nla_total_size(4) /* IFLA_NUM_TX_QUEUES */ + nla_total_size(4) /* IFLA_NUM_RX_QUEUES */ + nla_total_size(4) /* IFLA_GSO_MAX_SEGS */ @@ -1765,6 +1766,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, nla_put_u32(skb, IFLA_MAX_MTU, dev->max_mtu) || nla_put_u32(skb, IFLA_GROUP, dev->group) || nla_put_u32(skb, IFLA_PROMISCUITY, dev->promiscuity) || + nla_put_u32(skb, IFLA_ALLMULTI, dev->allmulti) || nla_put_u32(skb, IFLA_NUM_TX_QUEUES, dev->num_tx_queues) || nla_put_u32(skb, IFLA_GSO_MAX_SEGS, dev->gso_max_segs) || nla_put_u32(skb, IFLA_GSO_MAX_SIZE, dev->gso_max_size) || @@ -1926,6 +1928,7 @@ static const struct nla_policy ifla_policy[IFLA_MAX+1] = { [IFLA_GRO_MAX_SIZE] = { .type = NLA_U32 }, [IFLA_TSO_MAX_SIZE] = { .type = NLA_REJECT }, [IFLA_TSO_MAX_SEGS] = { .type = NLA_REJECT }, + [IFLA_ALLMULTI] = { .type = NLA_REJECT }, }; static const struct nla_policy ifla_info_policy[IFLA_INFO_MAX+1] = { @@ -2774,13 +2777,6 @@ static int do_setlink(const struct sk_buff *skb, call_netdevice_notifiers(NETDEV_CHANGEADDR, dev); } - if (ifm->ifi_flags || ifm->ifi_change) { - err = dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm), - extack); - if (err < 0) - goto errout; - } - if (tb[IFLA_MASTER]) { err = do_set_master(dev, nla_get_u32(tb[IFLA_MASTER]), extack); if (err) @@ -2788,6 +2784,13 @@ static int do_setlink(const struct sk_buff *skb, status |= DO_SETLINK_MODIFIED; } + if (ifm->ifi_flags || ifm->ifi_change) { + err = dev_change_flags(dev, rtnl_dev_combine_flags(dev, ifm), + extack); + if (err < 0) + goto errout; + } + if (tb[IFLA_CARRIER]) { err = dev_change_carrier(dev, nla_get_u8(tb[IFLA_CARRIER])); if (err) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index 48ecfbf29174..1d84a17eada5 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -91,7 +91,11 @@ static struct kmem_cache *skbuff_ext_cache __ro_after_init; int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS; EXPORT_SYMBOL(sysctl_max_skb_frags); -/* The array 'drop_reasons' is auto-generated in dropreason_str.c */ +#undef FN +#define FN(reason) [SKB_DROP_REASON_##reason] = #reason, +const char * const drop_reasons[] = { + DEFINE_DROP_REASON(FN, FN) +}; EXPORT_SYMBOL(drop_reasons); /** @@ -130,8 +134,66 @@ static void skb_under_panic(struct sk_buff *skb, unsigned int sz, void *addr) #define NAPI_SKB_CACHE_BULK 16 #define NAPI_SKB_CACHE_HALF (NAPI_SKB_CACHE_SIZE / 2) +#if PAGE_SIZE == SZ_4K + +#define NAPI_HAS_SMALL_PAGE_FRAG 1 +#define NAPI_SMALL_PAGE_PFMEMALLOC(nc) ((nc).pfmemalloc) + +/* specialized page frag allocator using a single order 0 page + * and slicing it into 1K sized fragment. Constrained to systems + * with a very limited amount of 1K fragments fitting a single + * page - to avoid excessive truesize underestimation + */ + +struct page_frag_1k { + void *va; + u16 offset; + bool pfmemalloc; +}; + +static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp) +{ + struct page *page; + int offset; + + offset = nc->offset - SZ_1K; + if (likely(offset >= 0)) + goto use_frag; + + page = alloc_pages_node(NUMA_NO_NODE, gfp, 0); + if (!page) + return NULL; + + nc->va = page_address(page); + nc->pfmemalloc = page_is_pfmemalloc(page); + offset = PAGE_SIZE - SZ_1K; + page_ref_add(page, offset / SZ_1K); + +use_frag: + nc->offset = offset; + return nc->va + offset; +} +#else + +/* the small page is actually unused in this build; add dummy helpers + * to please the compiler and avoid later preprocessor's conditionals + */ +#define NAPI_HAS_SMALL_PAGE_FRAG 0 +#define NAPI_SMALL_PAGE_PFMEMALLOC(nc) false + +struct page_frag_1k { +}; + +static void *page_frag_alloc_1k(struct page_frag_1k *nc, gfp_t gfp_mask) +{ + return NULL; +} + +#endif + struct napi_alloc_cache { struct page_frag_cache page; + struct page_frag_1k page_small; unsigned int skb_count; void *skb_cache[NAPI_SKB_CACHE_SIZE]; }; @@ -139,6 +201,23 @@ struct napi_alloc_cache { static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache); static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache); +/* Double check that napi_get_frags() allocates skbs with + * skb->head being backed by slab, not a page fragment. + * This is to make sure bug fixed in 3226b158e67c + * ("net: avoid 32 x truesize under-estimation for tiny skbs") + * does not accidentally come back. + */ +void napi_get_frags_check(struct napi_struct *napi) +{ + struct sk_buff *skb; + + local_bh_disable(); + skb = napi_get_frags(napi); + WARN_ON_ONCE(!NAPI_HAS_SMALL_PAGE_FRAG && skb && skb->head_frag); + napi_free_frags(napi); + local_bh_enable(); +} + void *__napi_alloc_frag_align(unsigned int fragsz, unsigned int align_mask) { struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache); @@ -427,14 +506,14 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask, */ size = SKB_DATA_ALIGN(size); size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); - data = kmalloc_reserve(size, gfp_mask, node, &pfmemalloc); + osize = kmalloc_size_roundup(size); + data = kmalloc_reserve(osize, gfp_mask, node, &pfmemalloc); if (unlikely(!data)) goto nodata; - /* kmalloc(size) might give us more room than requested. + /* kmalloc_size_roundup() might give us more room than requested. * Put skb_shared_info exactly at the end of allocated zone, * to allow max possible filling before reallocation. */ - osize = ksize(data); size = SKB_WITH_OVERHEAD(osize); prefetchw(data + size); @@ -557,6 +636,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, { struct napi_alloc_cache *nc; struct sk_buff *skb; + bool pfmemalloc; void *data; DEBUG_NET_WARN_ON_ONCE(!in_softirq()); @@ -564,8 +644,10 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, /* If requested length is either too small or too big, * we use kmalloc() for skb->head allocation. + * When the small frag allocator is available, prefer it over kmalloc + * for small fragments */ - if (len <= SKB_WITH_OVERHEAD(1024) || + if ((!NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) || len > SKB_WITH_OVERHEAD(PAGE_SIZE) || (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) { skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX | SKB_ALLOC_NAPI, @@ -576,13 +658,33 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, } nc = this_cpu_ptr(&napi_alloc_cache); - len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); - len = SKB_DATA_ALIGN(len); if (sk_memalloc_socks()) gfp_mask |= __GFP_MEMALLOC; - data = page_frag_alloc(&nc->page, len, gfp_mask); + if (NAPI_HAS_SMALL_PAGE_FRAG && len <= SKB_WITH_OVERHEAD(1024)) { + /* we are artificially inflating the allocation size, but + * that is not as bad as it may look like, as: + * - 'len' less than GRO_MAX_HEAD makes little sense + * - On most systems, larger 'len' values lead to fragment + * size above 512 bytes + * - kmalloc would use the kmalloc-1k slab for such values + * - Builds with smaller GRO_MAX_HEAD will very likely do + * little networking, as that implies no WiFi and no + * tunnels support, and 32 bits arches. + */ + len = SZ_1K; + + data = page_frag_alloc_1k(&nc->page_small, gfp_mask); + pfmemalloc = NAPI_SMALL_PAGE_PFMEMALLOC(nc->page_small); + } else { + len += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + len = SKB_DATA_ALIGN(len); + + data = page_frag_alloc(&nc->page, len, gfp_mask); + pfmemalloc = nc->page.pfmemalloc; + } + if (unlikely(!data)) return NULL; @@ -592,7 +694,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len, return NULL; } - if (nc->page.pfmemalloc) + if (pfmemalloc) skb->pfmemalloc = 1; skb->head_frag = 1; @@ -646,6 +748,13 @@ static void skb_clone_fraglist(struct sk_buff *skb) skb_get(list); } +static bool skb_pp_recycle(struct sk_buff *skb, void *data) +{ + if (!IS_ENABLED(CONFIG_PAGE_POOL) || !skb->pp_recycle) + return false; + return page_pool_return_skb_page(virt_to_page(data)); +} + static void skb_free_head(struct sk_buff *skb) { unsigned char *head = skb->head; @@ -1184,7 +1293,7 @@ EXPORT_SYMBOL_GPL(mm_unaccount_pinned_pages); static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size) { - struct ubuf_info *uarg; + struct ubuf_info_msgzc *uarg; struct sk_buff *skb; WARN_ON_ONCE(!in_task()); @@ -1202,19 +1311,19 @@ static struct ubuf_info *msg_zerocopy_alloc(struct sock *sk, size_t size) return NULL; } - uarg->callback = msg_zerocopy_callback; + uarg->ubuf.callback = msg_zerocopy_callback; uarg->id = ((u32)atomic_inc_return(&sk->sk_zckey)) - 1; uarg->len = 1; uarg->bytelen = size; uarg->zerocopy = 1; - uarg->flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN; - refcount_set(&uarg->refcnt, 1); + uarg->ubuf.flags = SKBFL_ZEROCOPY_FRAG | SKBFL_DONT_ORPHAN; + refcount_set(&uarg->ubuf.refcnt, 1); sock_hold(sk); - return uarg; + return &uarg->ubuf; } -static inline struct sk_buff *skb_from_uarg(struct ubuf_info *uarg) +static inline struct sk_buff *skb_from_uarg(struct ubuf_info_msgzc *uarg) { return container_of((void *)uarg, struct sk_buff, cb); } @@ -1223,6 +1332,7 @@ struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, struct ubuf_info *uarg) { if (uarg) { + struct ubuf_info_msgzc *uarg_zc; const u32 byte_limit = 1 << 19; /* limit to a few TSO */ u32 bytelen, next; @@ -1238,8 +1348,9 @@ struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, return NULL; } - bytelen = uarg->bytelen + size; - if (uarg->len == USHRT_MAX - 1 || bytelen > byte_limit) { + uarg_zc = uarg_to_msgzc(uarg); + bytelen = uarg_zc->bytelen + size; + if (uarg_zc->len == USHRT_MAX - 1 || bytelen > byte_limit) { /* TCP can create new skb to attach new uarg */ if (sk->sk_type == SOCK_STREAM) goto new_alloc; @@ -1247,11 +1358,11 @@ struct ubuf_info *msg_zerocopy_realloc(struct sock *sk, size_t size, } next = (u32)atomic_read(&sk->sk_zckey); - if ((u32)(uarg->id + uarg->len) == next) { - if (mm_account_pinned_pages(&uarg->mmp, size)) + if ((u32)(uarg_zc->id + uarg_zc->len) == next) { + if (mm_account_pinned_pages(&uarg_zc->mmp, size)) return NULL; - uarg->len++; - uarg->bytelen = bytelen; + uarg_zc->len++; + uarg_zc->bytelen = bytelen; atomic_set(&sk->sk_zckey, ++next); /* no extra ref when appending to datagram (MSG_MORE) */ @@ -1287,7 +1398,7 @@ static bool skb_zerocopy_notify_extend(struct sk_buff *skb, u32 lo, u16 len) return true; } -static void __msg_zerocopy_callback(struct ubuf_info *uarg) +static void __msg_zerocopy_callback(struct ubuf_info_msgzc *uarg) { struct sk_buff *tail, *skb = skb_from_uarg(uarg); struct sock_exterr_skb *serr; @@ -1340,19 +1451,21 @@ release: void msg_zerocopy_callback(struct sk_buff *skb, struct ubuf_info *uarg, bool success) { - uarg->zerocopy = uarg->zerocopy & success; + struct ubuf_info_msgzc *uarg_zc = uarg_to_msgzc(uarg); + + uarg_zc->zerocopy = uarg_zc->zerocopy & success; if (refcount_dec_and_test(&uarg->refcnt)) - __msg_zerocopy_callback(uarg); + __msg_zerocopy_callback(uarg_zc); } EXPORT_SYMBOL_GPL(msg_zerocopy_callback); void msg_zerocopy_put_abort(struct ubuf_info *uarg, bool have_uref) { - struct sock *sk = skb_from_uarg(uarg)->sk; + struct sock *sk = skb_from_uarg(uarg_to_msgzc(uarg))->sk; atomic_dec(&sk->sk_zckey); - uarg->len--; + uarg_to_msgzc(uarg)->len--; if (have_uref) msg_zerocopy_callback(NULL, uarg, true); @@ -1708,10 +1821,11 @@ EXPORT_SYMBOL(__pskb_copy_fclone); int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, gfp_t gfp_mask) { - int i, osize = skb_end_offset(skb); - int size = osize + nhead + ntail; + unsigned int osize = skb_end_offset(skb); + unsigned int size = osize + nhead + ntail; long off; u8 *data; + int i; BUG_ON(nhead < 0); @@ -1719,15 +1833,16 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail, skb_zcopy_downgrade_managed(skb); - size = SKB_DATA_ALIGN(size); - if (skb_pfmemalloc(skb)) gfp_mask |= __GFP_MEMALLOC; - data = kmalloc_reserve(size + SKB_DATA_ALIGN(sizeof(struct skb_shared_info)), - gfp_mask, NUMA_NO_NODE, NULL); + + size = SKB_DATA_ALIGN(size); + size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + size = kmalloc_size_roundup(size); + data = kmalloc_reserve(size, gfp_mask, NUMA_NO_NODE, NULL); if (!data) goto nodata; - size = SKB_WITH_OVERHEAD(ksize(data)); + size = SKB_WITH_OVERHEAD(size); /* Copy only real data... and, alas, header. This should be * optimized for the cases when header is void. @@ -3865,7 +3980,7 @@ int skb_append_pagefrags(struct sk_buff *skb, struct page *page, } else if (i < MAX_SKB_FRAGS) { skb_zcopy_downgrade_managed(skb); get_page(page); - skb_fill_page_desc(skb, i, page, offset, size); + skb_fill_page_desc_noacc(skb, i, page, offset, size); } else { return -EMSGSIZE; } @@ -6061,21 +6176,20 @@ static int pskb_carve_inside_header(struct sk_buff *skb, const u32 off, const int headlen, gfp_t gfp_mask) { int i; - int size = skb_end_offset(skb); + unsigned int size = skb_end_offset(skb); int new_hlen = headlen - off; u8 *data; - size = SKB_DATA_ALIGN(size); - if (skb_pfmemalloc(skb)) gfp_mask |= __GFP_MEMALLOC; - data = kmalloc_reserve(size + - SKB_DATA_ALIGN(sizeof(struct skb_shared_info)), - gfp_mask, NUMA_NO_NODE, NULL); + + size = SKB_DATA_ALIGN(size); + size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + size = kmalloc_size_roundup(size); + data = kmalloc_reserve(size, gfp_mask, NUMA_NO_NODE, NULL); if (!data) return -ENOMEM; - - size = SKB_WITH_OVERHEAD(ksize(data)); + size = SKB_WITH_OVERHEAD(size); /* Copy real data, and all frags */ skb_copy_from_linear_data_offset(skb, off, data, new_hlen); @@ -6180,22 +6294,21 @@ static int pskb_carve_inside_nonlinear(struct sk_buff *skb, const u32 off, int pos, gfp_t gfp_mask) { int i, k = 0; - int size = skb_end_offset(skb); + unsigned int size = skb_end_offset(skb); u8 *data; const int nfrags = skb_shinfo(skb)->nr_frags; struct skb_shared_info *shinfo; - size = SKB_DATA_ALIGN(size); - if (skb_pfmemalloc(skb)) gfp_mask |= __GFP_MEMALLOC; - data = kmalloc_reserve(size + - SKB_DATA_ALIGN(sizeof(struct skb_shared_info)), - gfp_mask, NUMA_NO_NODE, NULL); + + size = SKB_DATA_ALIGN(size); + size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info)); + size = kmalloc_size_roundup(size); + data = kmalloc_reserve(size, gfp_mask, NUMA_NO_NODE, NULL); if (!data) return -ENOMEM; - - size = SKB_WITH_OVERHEAD(ksize(data)); + size = SKB_WITH_OVERHEAD(size); memcpy((struct skb_shared_info *)(data + size), skb_shinfo(skb), offsetof(struct skb_shared_info, frags[0])); diff --git a/net/core/skmsg.c b/net/core/skmsg.c index 188f8558d27d..1efdc47a999b 100644 --- a/net/core/skmsg.c +++ b/net/core/skmsg.c @@ -434,8 +434,10 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, if (copied + copy > len) copy = len - copied; copy = copy_page_to_iter(page, sge->offset, copy, iter); - if (!copy) - return copied ? copied : -EFAULT; + if (!copy) { + copied = copied ? copied : -EFAULT; + goto out; + } copied += copy; if (likely(!peek)) { @@ -455,7 +457,7 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, * didn't copy the entire length lets just break. */ if (copy != sge->length) - return copied; + goto out; sk_msg_iter_var_next(i); } @@ -477,7 +479,9 @@ int sk_msg_recvmsg(struct sock *sk, struct sk_psock *psock, struct msghdr *msg, } msg_rx = sk_psock_peek_msg(psock); } - +out: + if (psock->work_state.skb && copied > 0) + schedule_work(&psock->work); return copied; } EXPORT_SYMBOL_GPL(sk_msg_recvmsg); @@ -496,11 +500,11 @@ bool sk_msg_is_readable(struct sock *sk) } EXPORT_SYMBOL_GPL(sk_msg_is_readable); -static struct sk_msg *alloc_sk_msg(void) +static struct sk_msg *alloc_sk_msg(gfp_t gfp) { struct sk_msg *msg; - msg = kzalloc(sizeof(*msg), __GFP_NOWARN | GFP_KERNEL); + msg = kzalloc(sizeof(*msg), gfp | __GFP_NOWARN); if (unlikely(!msg)) return NULL; sg_init_marker(msg->sg.data, NR_MSG_FRAG_IDS); @@ -516,7 +520,7 @@ static struct sk_msg *sk_psock_create_ingress_msg(struct sock *sk, if (!sk_rmem_schedule(sk, skb, skb->truesize)) return NULL; - return alloc_sk_msg(); + return alloc_sk_msg(GFP_KERNEL); } static int sk_psock_skb_ingress_enqueue(struct sk_buff *skb, @@ -593,7 +597,7 @@ static int sk_psock_skb_ingress(struct sk_psock *psock, struct sk_buff *skb, static int sk_psock_skb_ingress_self(struct sk_psock *psock, struct sk_buff *skb, u32 off, u32 len) { - struct sk_msg *msg = alloc_sk_msg(); + struct sk_msg *msg = alloc_sk_msg(GFP_ATOMIC); struct sock *sk = psock->sk; int err; diff --git a/net/core/sock.c b/net/core/sock.c index 788c1372663c..4571914a4aa8 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -703,15 +703,17 @@ static int sock_setbindtodevice(struct sock *sk, sockptr_t optval, int optlen) goto out; } - return sock_bindtoindex(sk, index, true); + sockopt_lock_sock(sk); + ret = sock_bindtoindex_locked(sk, index); + sockopt_release_sock(sk); out: #endif return ret; } -static int sock_getbindtodevice(struct sock *sk, char __user *optval, - int __user *optlen, int len) +static int sock_getbindtodevice(struct sock *sk, sockptr_t optval, + sockptr_t optlen, int len) { int ret = -ENOPROTOOPT; #ifdef CONFIG_NETDEVICES @@ -735,12 +737,12 @@ static int sock_getbindtodevice(struct sock *sk, char __user *optval, len = strlen(devname) + 1; ret = -EFAULT; - if (copy_to_user(optval, devname, len)) + if (copy_to_sockptr(optval, devname, len)) goto out; zero: ret = -EFAULT; - if (put_user(len, optlen)) + if (copy_to_sockptr(optlen, &len, sizeof(int))) goto out; ret = 0; @@ -1036,17 +1038,51 @@ static int sock_reserve_memory(struct sock *sk, int bytes) return 0; } +void sockopt_lock_sock(struct sock *sk) +{ + /* When current->bpf_ctx is set, the setsockopt is called from + * a bpf prog. bpf has ensured the sk lock has been + * acquired before calling setsockopt(). + */ + if (has_current_bpf_ctx()) + return; + + lock_sock(sk); +} +EXPORT_SYMBOL(sockopt_lock_sock); + +void sockopt_release_sock(struct sock *sk) +{ + if (has_current_bpf_ctx()) + return; + + release_sock(sk); +} +EXPORT_SYMBOL(sockopt_release_sock); + +bool sockopt_ns_capable(struct user_namespace *ns, int cap) +{ + return has_current_bpf_ctx() || ns_capable(ns, cap); +} +EXPORT_SYMBOL(sockopt_ns_capable); + +bool sockopt_capable(int cap) +{ + return has_current_bpf_ctx() || capable(cap); +} +EXPORT_SYMBOL(sockopt_capable); + /* * This is meant for all protocols to use and covers goings on * at the socket level. Everything here is generic. */ -int sock_setsockopt(struct socket *sock, int level, int optname, - sockptr_t optval, unsigned int optlen) +int sk_setsockopt(struct sock *sk, int level, int optname, + sockptr_t optval, unsigned int optlen) { struct so_timestamping timestamping; + struct socket *sock = sk->sk_socket; struct sock_txtime sk_txtime; - struct sock *sk = sock->sk; int val; int valbool; struct linger ling; @@ -1067,11 +1103,11 @@ int sock_setsockopt(struct socket *sock, int level, int optname, valbool = val ? 1 : 0; - lock_sock(sk); + sockopt_lock_sock(sk); switch (optname) { case SO_DEBUG: - if (val && !capable(CAP_NET_ADMIN)) + if (val && !sockopt_capable(CAP_NET_ADMIN)) ret = -EACCES; else sock_valbool_flag(sk, SOCK_DBG, valbool); @@ -1115,7 +1151,7 @@ set_sndbuf: break; case SO_SNDBUFFORCE: - if (!capable(CAP_NET_ADMIN)) { + if (!sockopt_capable(CAP_NET_ADMIN)) { ret = -EPERM; break; } @@ -1137,7 +1173,7 @@ set_sndbuf: break; case SO_RCVBUFFORCE: - if (!capable(CAP_NET_ADMIN)) { + if (!sockopt_capable(CAP_NET_ADMIN)) { ret = -EPERM; break; } @@ -1164,8 +1200,8 @@ set_sndbuf: case SO_PRIORITY: if ((val >= 0 && val <= 6) || - ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) || - ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) + sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) || + sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) sk->sk_priority = val; else ret = -EPERM; @@ -1228,7 +1264,7 @@ set_sndbuf: case SO_RCVLOWAT: if (val < 0) val = INT_MAX; - if (sock->ops->set_rcvlowat) + if (sock && sock->ops->set_rcvlowat) ret = sock->ops->set_rcvlowat(sk, val); else WRITE_ONCE(sk->sk_rcvlowat, val ? : 1); @@ -1310,8 +1346,8 @@ set_sndbuf: clear_bit(SOCK_PASSSEC, &sock->flags); break; case SO_MARK: - if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && - !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { + if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && + !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { ret = -EPERM; break; } @@ -1319,8 +1355,8 @@ set_sndbuf: __sock_set_mark(sk, val); break; case SO_RCVMARK: - if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && - !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { + if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && + !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { ret = -EPERM; break; } @@ -1354,7 +1390,7 @@ set_sndbuf: #ifdef CONFIG_NET_RX_BUSY_POLL case SO_BUSY_POLL: /* allow unprivileged users to decrease the value */ - if ((val > sk->sk_ll_usec) && !capable(CAP_NET_ADMIN)) + if ((val > sk->sk_ll_usec) && !sockopt_capable(CAP_NET_ADMIN)) ret = -EPERM; else { if (val < 0) @@ -1364,13 +1400,13 @@ set_sndbuf: } break; case SO_PREFER_BUSY_POLL: - if (valbool && !capable(CAP_NET_ADMIN)) + if (valbool && !sockopt_capable(CAP_NET_ADMIN)) ret = -EPERM; else WRITE_ONCE(sk->sk_prefer_busy_poll, valbool); break; case SO_BUSY_POLL_BUDGET: - if (val > READ_ONCE(sk->sk_busy_poll_budget) && !capable(CAP_NET_ADMIN)) { + if (val > READ_ONCE(sk->sk_busy_poll_budget) && !sockopt_capable(CAP_NET_ADMIN)) { ret = -EPERM; } else { if (val < 0 || val > U16_MAX) @@ -1400,7 +1436,7 @@ set_sndbuf: break; } case SO_INCOMING_CPU: - WRITE_ONCE(sk->sk_incoming_cpu, val); + reuseport_update_incoming_cpu(sk, val); break; case SO_CNX_ADVICE: @@ -1441,7 +1477,7 @@ set_sndbuf: * scheduler has enough safe guards. */ if (sk_txtime.clockid != CLOCK_MONOTONIC && - !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { + !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { ret = -EPERM; break; } @@ -1496,9 +1532,16 @@ set_sndbuf: ret = -ENOPROTOOPT; break; } - release_sock(sk); + sockopt_release_sock(sk); return ret; } + +int sock_setsockopt(struct socket *sock, int level, int optname, + sockptr_t optval, unsigned int optlen) +{ + return sk_setsockopt(sock->sk, level, optname, + optval, optlen); +} EXPORT_SYMBOL(sock_setsockopt); static const struct cred *sk_get_peer_cred(struct sock *sk) @@ -1525,22 +1568,25 @@ static void cred_to_ucred(struct pid *pid, const struct cred *cred, } } -static int groups_to_user(gid_t __user *dst, const struct group_info *src) +static int groups_to_user(sockptr_t dst, const struct group_info *src) { struct user_namespace *user_ns = current_user_ns(); int i; - for (i = 0; i < src->ngroups; i++) - if (put_user(from_kgid_munged(user_ns, src->gid[i]), dst + i)) + for (i = 0; i < src->ngroups; i++) { + gid_t gid = from_kgid_munged(user_ns, src->gid[i]); + + if (copy_to_sockptr_offset(dst, i * sizeof(gid), &gid, sizeof(gid))) return -EFAULT; + } return 0; } -int sock_getsockopt(struct socket *sock, int level, int optname, - char __user *optval, int __user *optlen) +int sk_getsockopt(struct sock *sk, int level, int optname, + sockptr_t optval, sockptr_t optlen) { - struct sock *sk = sock->sk; + struct socket *sock = sk->sk_socket; union { int val; @@ -1557,7 +1603,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname, int lv = sizeof(int); int len; - if (get_user(len, optlen)) + if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; if (len < 0) return -EINVAL; @@ -1692,7 +1738,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname, cred_to_ucred(sk->sk_peer_pid, sk->sk_peer_cred, &peercred); spin_unlock(&sk->sk_peer_lock); - if (copy_to_user(optval, &peercred, len)) + if (copy_to_sockptr(optval, &peercred, len)) return -EFAULT; goto lenout; } @@ -1710,11 +1756,11 @@ int sock_getsockopt(struct socket *sock, int level, int optname, if (len < n * sizeof(gid_t)) { len = n * sizeof(gid_t); put_cred(cred); - return put_user(len, optlen) ? -EFAULT : -ERANGE; + return copy_to_sockptr(optlen, &len, sizeof(int)) ? -EFAULT : -ERANGE; } len = n * sizeof(gid_t); - ret = groups_to_user((gid_t __user *)optval, cred->group_info); + ret = groups_to_user(optval, cred->group_info); put_cred(cred); if (ret) return ret; @@ -1730,7 +1776,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname, return -ENOTCONN; if (lv < len) return -EINVAL; - if (copy_to_user(optval, address, len)) + if (copy_to_sockptr(optval, address, len)) return -EFAULT; goto lenout; } @@ -1747,7 +1793,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname, break; case SO_PEERSEC: - return security_socket_getpeersec_stream(sock, optval, optlen, len); + return security_socket_getpeersec_stream(sock, optval.user, optlen.user, len); case SO_MARK: v.val = sk->sk_mark; @@ -1779,7 +1825,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname, return sock_getbindtodevice(sk, optval, optlen, len); case SO_GET_FILTER: - len = sk_get_filter(sk, (struct sock_filter __user *)optval, len); + len = sk_get_filter(sk, optval, len); if (len < 0) return len; @@ -1827,7 +1873,7 @@ int sock_getsockopt(struct socket *sock, int level, int optname, sk_get_meminfo(sk, meminfo); len = min_t(unsigned int, len, sizeof(meminfo)); - if (copy_to_user(optval, &meminfo, len)) + if (copy_to_sockptr(optval, &meminfo, len)) return -EFAULT; goto lenout; @@ -1896,14 +1942,22 @@ int sock_getsockopt(struct socket *sock, int level, int optname, if (len > lv) len = lv; - if (copy_to_user(optval, &v, len)) + if (copy_to_sockptr(optval, &v, len)) return -EFAULT; lenout: - if (put_user(len, optlen)) + if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; return 0; } +int sock_getsockopt(struct socket *sock, int level, int optname, + char __user *optval, int __user *optlen) +{ + return sk_getsockopt(sock->sk, level, optname, + USER_SOCKPTR(optval), + USER_SOCKPTR(optlen)); +} + /* * Initialize an sk_lock. * @@ -2040,6 +2094,9 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority, if (likely(sk->sk_net_refcnt)) { get_net_track(net, &sk->ns_tracker, priority); sock_inuse_add(net, 1); + } else { + __netns_tracker_alloc(net, &sk->ns_tracker, + false, priority); } sock_net_set(sk, net); @@ -2095,6 +2152,9 @@ static void __sk_destruct(struct rcu_head *head) if (likely(sk->sk_net_refcnt)) put_net_track(sock_net(sk), &sk->ns_tracker); + else + __netns_tracker_free(sock_net(sk), &sk->ns_tracker, false); + sk_prot_free(sk->sk_prot_creator, sk); } @@ -2183,6 +2243,14 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) if (likely(newsk->sk_net_refcnt)) { get_net_track(sock_net(newsk), &newsk->ns_tracker, priority); sock_inuse_add(sock_net(newsk), 1); + } else { + /* Kernel sockets are not elevating the struct net refcount. + * Instead, use a tracker to more easily detect if a layer + * is not properly dismantling its kernel sockets at netns + * destroy time. + */ + __netns_tracker_alloc(sock_net(newsk), &newsk->ns_tracker, + false, priority); } sk_node_init(&newsk->sk_node); sock_lock_init(newsk); @@ -2676,7 +2744,7 @@ failure: } EXPORT_SYMBOL(sock_alloc_send_pskb); -int __sock_cmsg_send(struct sock *sk, struct msghdr *msg, struct cmsghdr *cmsg, +int __sock_cmsg_send(struct sock *sk, struct cmsghdr *cmsg, struct sockcm_cookie *sockc) { u32 tsflags; @@ -2730,7 +2798,7 @@ int sock_cmsg_send(struct sock *sk, struct msghdr *msg, return -EINVAL; if (cmsg->cmsg_level != SOL_SOCKET) continue; - ret = __sock_cmsg_send(sk, msg, cmsg, sockc); + ret = __sock_cmsg_send(sk, cmsg, sockc); if (ret) return ret; } @@ -3556,7 +3624,8 @@ int sock_common_getsockopt(struct socket *sock, int level, int optname, { struct sock *sk = sock->sk; - return sk->sk_prot->getsockopt(sk, level, optname, optval, optlen); + /* IPV6_ADDRFORM can change sk->sk_prot under us. */ + return READ_ONCE(sk->sk_prot)->getsockopt(sk, level, optname, optval, optlen); } EXPORT_SYMBOL(sock_common_getsockopt); @@ -3582,7 +3651,8 @@ int sock_common_setsockopt(struct socket *sock, int level, int optname, { struct sock *sk = sock->sk; - return sk->sk_prot->setsockopt(sk, level, optname, optval, optlen); + /* IPV6_ADDRFORM can change sk->sk_prot under us. */ + return READ_ONCE(sk->sk_prot)->setsockopt(sk, level, optname, optval, optlen); } EXPORT_SYMBOL(sock_common_setsockopt); diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c index 5daa1fa54249..5a165286e4d8 100644 --- a/net/core/sock_reuseport.c +++ b/net/core/sock_reuseport.c @@ -21,6 +21,86 @@ static DEFINE_IDA(reuseport_ida); static int reuseport_resurrect(struct sock *sk, struct sock_reuseport *old_reuse, struct sock_reuseport *reuse, bool bind_inany); +void reuseport_has_conns_set(struct sock *sk) +{ + struct sock_reuseport *reuse; + + if (!rcu_access_pointer(sk->sk_reuseport_cb)) + return; + + spin_lock_bh(&reuseport_lock); + reuse = rcu_dereference_protected(sk->sk_reuseport_cb, + lockdep_is_held(&reuseport_lock)); + if (likely(reuse)) + reuse->has_conns = 1; + spin_unlock_bh(&reuseport_lock); +} +EXPORT_SYMBOL(reuseport_has_conns_set); + +static void __reuseport_get_incoming_cpu(struct sock_reuseport *reuse) +{ + /* Paired with READ_ONCE() in reuseport_select_sock_by_hash(). */ + WRITE_ONCE(reuse->incoming_cpu, reuse->incoming_cpu + 1); +} + +static void __reuseport_put_incoming_cpu(struct sock_reuseport *reuse) +{ + /* Paired with READ_ONCE() in reuseport_select_sock_by_hash(). */ + WRITE_ONCE(reuse->incoming_cpu, reuse->incoming_cpu - 1); +} + +static void reuseport_get_incoming_cpu(struct sock *sk, struct sock_reuseport *reuse) +{ + if (sk->sk_incoming_cpu >= 0) + __reuseport_get_incoming_cpu(reuse); +} + +static void reuseport_put_incoming_cpu(struct sock *sk, struct sock_reuseport *reuse) +{ + if (sk->sk_incoming_cpu >= 0) + __reuseport_put_incoming_cpu(reuse); +} + +void reuseport_update_incoming_cpu(struct sock *sk, int val) +{ + struct sock_reuseport *reuse; + int old_sk_incoming_cpu; + + if (unlikely(!rcu_access_pointer(sk->sk_reuseport_cb))) { + /* Paired with REAE_ONCE() in sk_incoming_cpu_update() + * and compute_score(). + */ + WRITE_ONCE(sk->sk_incoming_cpu, val); + return; + } + + spin_lock_bh(&reuseport_lock); + + /* This must be done under reuseport_lock to avoid a race with + * reuseport_grow(), which accesses sk->sk_incoming_cpu without + * lock_sock() when detaching a shutdown()ed sk. + * + * Paired with READ_ONCE() in reuseport_select_sock_by_hash(). + */ + old_sk_incoming_cpu = sk->sk_incoming_cpu; + WRITE_ONCE(sk->sk_incoming_cpu, val); + + reuse = rcu_dereference_protected(sk->sk_reuseport_cb, + lockdep_is_held(&reuseport_lock)); + + /* reuseport_grow() has detached a closed sk. */ + if (!reuse) + goto out; + + if (old_sk_incoming_cpu < 0 && val >= 0) + __reuseport_get_incoming_cpu(reuse); + else if (old_sk_incoming_cpu >= 0 && val < 0) + __reuseport_put_incoming_cpu(reuse); + +out: + spin_unlock_bh(&reuseport_lock); +} + static int reuseport_sock_index(struct sock *sk, const struct sock_reuseport *reuse, bool closed) @@ -48,6 +128,7 @@ static void __reuseport_add_sock(struct sock *sk, /* paired with smp_rmb() in reuseport_(select|migrate)_sock() */ smp_wmb(); reuse->num_socks++; + reuseport_get_incoming_cpu(sk, reuse); } static bool __reuseport_detach_sock(struct sock *sk, @@ -60,6 +141,7 @@ static bool __reuseport_detach_sock(struct sock *sk, reuse->socks[i] = reuse->socks[reuse->num_socks - 1]; reuse->num_socks--; + reuseport_put_incoming_cpu(sk, reuse); return true; } @@ -70,6 +152,7 @@ static void __reuseport_add_closed_sock(struct sock *sk, reuse->socks[reuse->max_socks - reuse->num_closed_socks - 1] = sk; /* paired with READ_ONCE() in inet_csk_bind_conflict() */ WRITE_ONCE(reuse->num_closed_socks, reuse->num_closed_socks + 1); + reuseport_get_incoming_cpu(sk, reuse); } static bool __reuseport_detach_closed_sock(struct sock *sk, @@ -83,6 +166,7 @@ static bool __reuseport_detach_closed_sock(struct sock *sk, reuse->socks[i] = reuse->socks[reuse->max_socks - reuse->num_closed_socks]; /* paired with READ_ONCE() in inet_csk_bind_conflict() */ WRITE_ONCE(reuse->num_closed_socks, reuse->num_closed_socks - 1); + reuseport_put_incoming_cpu(sk, reuse); return true; } @@ -150,6 +234,7 @@ int reuseport_alloc(struct sock *sk, bool bind_inany) reuse->bind_inany = bind_inany; reuse->socks[0] = sk; reuse->num_socks = 1; + reuseport_get_incoming_cpu(sk, reuse); rcu_assign_pointer(sk->sk_reuseport_cb, reuse); out: @@ -193,6 +278,7 @@ static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse) more_reuse->reuseport_id = reuse->reuseport_id; more_reuse->bind_inany = reuse->bind_inany; more_reuse->has_conns = reuse->has_conns; + more_reuse->incoming_cpu = reuse->incoming_cpu; memcpy(more_reuse->socks, reuse->socks, reuse->num_socks * sizeof(struct sock *)); @@ -442,18 +528,32 @@ static struct sock *run_bpf_filter(struct sock_reuseport *reuse, u16 socks, static struct sock *reuseport_select_sock_by_hash(struct sock_reuseport *reuse, u32 hash, u16 num_socks) { + struct sock *first_valid_sk = NULL; int i, j; i = j = reciprocal_scale(hash, num_socks); - while (reuse->socks[i]->sk_state == TCP_ESTABLISHED) { + do { + struct sock *sk = reuse->socks[i]; + + if (sk->sk_state != TCP_ESTABLISHED) { + /* Paired with WRITE_ONCE() in __reuseport_(get|put)_incoming_cpu(). */ + if (!READ_ONCE(reuse->incoming_cpu)) + return sk; + + /* Paired with WRITE_ONCE() in reuseport_update_incoming_cpu(). */ + if (READ_ONCE(sk->sk_incoming_cpu) == raw_smp_processor_id()) + return sk; + + if (!first_valid_sk) + first_valid_sk = sk; + } + i++; if (i >= num_socks) i = 0; - if (i == j) - return NULL; - } + } while (i != j); - return reuse->socks[i]; + return first_valid_sk; } /** diff --git a/net/core/stream.c b/net/core/stream.c index ccc083cdef23..75fded8495f5 100644 --- a/net/core/stream.c +++ b/net/core/stream.c @@ -123,7 +123,7 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p) DEFINE_WAIT_FUNC(wait, woken_wake_function); if (sk_stream_memory_free(sk)) - current_timeo = vm_wait = (prandom_u32() % (HZ / 5)) + 2; + current_timeo = vm_wait = prandom_u32_max(HZ / 5) + 2; add_wait_queue(sk_sleep(sk), &wait); @@ -159,7 +159,8 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p) *timeo_p = current_timeo; } out: - remove_wait_queue(sk_sleep(sk), &wait); + if (!sock_flag(sk, SOCK_DEAD)) + remove_wait_queue(sk_sleep(sk), &wait); return err; do_error: diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c index 725891527814..5b1ce656baa1 100644 --- a/net/core/sysctl_net_core.c +++ b/net/core/sysctl_net_core.c @@ -29,7 +29,6 @@ static int int_3600 = 3600; static int min_sndbuf = SOCK_MIN_SNDBUF; static int min_rcvbuf = SOCK_MIN_RCVBUF; static int max_skb_frags = MAX_SKB_FRAGS; -static long long_max __maybe_unused = LONG_MAX; static int net_msg_warn; /* Unused, but still a sysctl */ diff --git a/net/core/xdp.c b/net/core/xdp.c index 24420209bf0e..844c9d99dc0e 100644 --- a/net/core/xdp.c +++ b/net/core/xdp.c @@ -375,19 +375,17 @@ EXPORT_SYMBOL_GPL(xdp_rxq_info_reg_mem_model); void __xdp_return(void *data, struct xdp_mem_info *mem, bool napi_direct, struct xdp_buff *xdp) { - struct xdp_mem_allocator *xa; struct page *page; switch (mem->type) { case MEM_TYPE_PAGE_POOL: - rcu_read_lock(); - /* mem->id is valid, checked in xdp_rxq_info_reg_mem_model() */ - xa = rhashtable_lookup(mem_id_ht, &mem->id, mem_id_rht_params); page = virt_to_head_page(data); if (napi_direct && xdp_return_frame_no_direct()) napi_direct = false; - page_pool_put_full_page(xa->page_pool, page, napi_direct); - rcu_read_unlock(); + /* No need to check ((page->pp_magic & ~0x3UL) == PP_SIGNATURE) + * as mem->type knows this a page_pool page + */ + page_pool_put_full_page(page->pp, page, napi_direct); break; case MEM_TYPE_PAGE_SHARED: page_frag_free(data); diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h index 7dfc00c9fb32..9ddc3a9e89e4 100644 --- a/net/dccp/dccp.h +++ b/net/dccp/dccp.h @@ -278,6 +278,7 @@ int dccp_rcv_state_process(struct sock *sk, struct sk_buff *skb, int dccp_rcv_established(struct sock *sk, struct sk_buff *skb, const struct dccp_hdr *dh, const unsigned int len); +void dccp_destruct_common(struct sock *sk); int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized); void dccp_destroy_sock(struct sock *sk); diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c index 6a6e121dc00c..713b7b8dad7e 100644 --- a/net/dccp/ipv4.c +++ b/net/dccp/ipv4.c @@ -144,7 +144,7 @@ int dccp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) inet->inet_daddr, inet->inet_sport, inet->inet_dport); - inet->inet_id = prandom_u32(); + inet->inet_id = get_random_u16(); err = dccp_connect(sk); rt = NULL; @@ -443,7 +443,7 @@ struct sock *dccp_v4_request_recv_sock(const struct sock *sk, RCU_INIT_POINTER(newinet->inet_opt, rcu_dereference(ireq->ireq_opt)); newinet->mc_index = inet_iif(skb); newinet->mc_ttl = ip_hdr(skb)->ttl; - newinet->inet_id = prandom_u32(); + newinet->inet_id = get_random_u16(); if (dst == NULL && (dst = inet_csk_route_child_sock(sk, newsk, req)) == NULL) goto put_and_exit; diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c index e57b43006074..ae62b1591dea 100644 --- a/net/dccp/ipv6.c +++ b/net/dccp/ipv6.c @@ -1021,6 +1021,12 @@ static const struct inet_connection_sock_af_ops dccp_ipv6_mapped = { .sockaddr_len = sizeof(struct sockaddr_in6), }; +static void dccp_v6_sk_destruct(struct sock *sk) +{ + dccp_destruct_common(sk); + inet6_sock_destruct(sk); +} + /* NOTE: A lot of things set to zero explicitly by call to * sk_alloc() so need not be done here. */ @@ -1033,17 +1039,12 @@ static int dccp_v6_init_sock(struct sock *sk) if (unlikely(!dccp_v6_ctl_sock_initialized)) dccp_v6_ctl_sock_initialized = 1; inet_csk(sk)->icsk_af_ops = &dccp_ipv6_af_ops; + sk->sk_destruct = dccp_v6_sk_destruct; } return err; } -static void dccp_v6_destroy_sock(struct sock *sk) -{ - dccp_destroy_sock(sk); - inet6_destroy_sock(sk); -} - static struct timewait_sock_ops dccp6_timewait_sock_ops = { .twsk_obj_size = sizeof(struct dccp6_timewait_sock), }; @@ -1066,7 +1067,7 @@ static struct proto dccp_v6_prot = { .accept = inet_csk_accept, .get_port = inet_csk_get_port, .shutdown = dccp_shutdown, - .destroy = dccp_v6_destroy_sock, + .destroy = dccp_destroy_sock, .orphan_count = &dccp_orphan_count, .max_header = MAX_DCCP_HEADER, .obj_size = sizeof(struct dccp6_sock), diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 7cd4a6cc99fc..9494b0d224f9 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -171,12 +171,18 @@ const char *dccp_packet_name(const int type) EXPORT_SYMBOL_GPL(dccp_packet_name); -static void dccp_sk_destruct(struct sock *sk) +void dccp_destruct_common(struct sock *sk) { struct dccp_sock *dp = dccp_sk(sk); ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk); dp->dccps_hc_tx_ccid = NULL; +} +EXPORT_SYMBOL_GPL(dccp_destruct_common); + +static void dccp_sk_destruct(struct sock *sk) +{ + dccp_destruct_common(sk); inet_sock_destruct(sk); } @@ -1197,6 +1203,8 @@ static int __init dccp_init(void) INIT_HLIST_HEAD(&dccp_hashinfo.bhash2[i].chain); } + dccp_hashinfo.pernet = false; + rc = dccp_mib_init(); if (rc) goto out_free_dccp_bhash2; diff --git a/net/dsa/Makefile b/net/dsa/Makefile index af28c24ead18..bf57ef3bce2a 100644 --- a/net/dsa/Makefile +++ b/net/dsa/Makefile @@ -1,7 +1,15 @@ # SPDX-License-Identifier: GPL-2.0 # the core obj-$(CONFIG_NET_DSA) += dsa_core.o -dsa_core-y += dsa.o dsa2.o master.o port.o slave.o switch.o tag_8021q.o +dsa_core-y += \ + dsa.o \ + dsa2.o \ + master.o \ + netlink.o \ + port.o \ + slave.o \ + switch.o \ + tag_8021q.o # tagging formats obj-$(CONFIG_NET_DSA_TAG_AR9331) += tag_ar9331.o diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c index be7b320cda76..64b14f655b23 100644 --- a/net/dsa/dsa.c +++ b/net/dsa/dsa.c @@ -536,8 +536,16 @@ static int __init dsa_init_module(void) dsa_tag_driver_register(&DSA_TAG_DRIVER_NAME(none_ops), THIS_MODULE); + rc = rtnl_link_register(&dsa_link_ops); + if (rc) + goto netlink_register_fail; + return 0; +netlink_register_fail: + dsa_tag_driver_unregister(&DSA_TAG_DRIVER_NAME(none_ops)); + dsa_slave_unregister_notifier(); + dev_remove_pack(&dsa_pack_type); register_notifier_fail: destroy_workqueue(dsa_owq); @@ -547,6 +555,7 @@ module_init(dsa_init_module); static void __exit dsa_cleanup_module(void) { + rtnl_link_unregister(&dsa_link_ops); dsa_tag_driver_unregister(&DSA_TAG_DRIVER_NAME(none_ops)); dsa_slave_unregister_notifier(); diff --git a/net/dsa/dsa2.c b/net/dsa/dsa2.c index ed56c7a554b8..af0e2c0394ac 100644 --- a/net/dsa/dsa2.c +++ b/net/dsa/dsa2.c @@ -387,6 +387,20 @@ static struct dsa_port *dsa_tree_find_first_cpu(struct dsa_switch_tree *dst) return NULL; } +struct net_device *dsa_tree_find_first_master(struct dsa_switch_tree *dst) +{ + struct device_node *ethernet; + struct net_device *master; + struct dsa_port *cpu_dp; + + cpu_dp = dsa_tree_find_first_cpu(dst); + ethernet = of_parse_phandle(cpu_dp->dn, "ethernet", 0); + master = of_find_net_device_by_node(ethernet); + of_node_put(ethernet); + + return master; +} + /* Assign the default CPU port (the first one in the tree) to all ports of the * fabric which don't already have one as part of their own switch. */ @@ -447,6 +461,72 @@ static void dsa_tree_teardown_cpu_ports(struct dsa_switch_tree *dst) dp->cpu_dp = NULL; } +static int dsa_port_devlink_setup(struct dsa_port *dp) +{ + struct devlink_port *dlp = &dp->devlink_port; + struct dsa_switch_tree *dst = dp->ds->dst; + struct devlink_port_attrs attrs = {}; + struct devlink *dl = dp->ds->devlink; + struct dsa_switch *ds = dp->ds; + const unsigned char *id; + unsigned char len; + int err; + + memset(dlp, 0, sizeof(*dlp)); + devlink_port_init(dl, dlp); + + if (ds->ops->port_setup) { + err = ds->ops->port_setup(ds, dp->index); + if (err) + return err; + } + + id = (const unsigned char *)&dst->index; + len = sizeof(dst->index); + + attrs.phys.port_number = dp->index; + memcpy(attrs.switch_id.id, id, len); + attrs.switch_id.id_len = len; + + switch (dp->type) { + case DSA_PORT_TYPE_UNUSED: + attrs.flavour = DEVLINK_PORT_FLAVOUR_UNUSED; + break; + case DSA_PORT_TYPE_CPU: + attrs.flavour = DEVLINK_PORT_FLAVOUR_CPU; + break; + case DSA_PORT_TYPE_DSA: + attrs.flavour = DEVLINK_PORT_FLAVOUR_DSA; + break; + case DSA_PORT_TYPE_USER: + attrs.flavour = DEVLINK_PORT_FLAVOUR_PHYSICAL; + break; + } + + devlink_port_attrs_set(dlp, &attrs); + err = devlink_port_register(dl, dlp, dp->index); + if (err) { + if (ds->ops->port_teardown) + ds->ops->port_teardown(ds, dp->index); + return err; + } + + return 0; +} + +static void dsa_port_devlink_teardown(struct dsa_port *dp) +{ + struct devlink_port *dlp = &dp->devlink_port; + struct dsa_switch *ds = dp->ds; + + devlink_port_unregister(dlp); + + if (ds->ops->port_teardown) + ds->ops->port_teardown(ds, dp->index); + + devlink_port_fini(dlp); +} + static int dsa_port_setup(struct dsa_port *dp) { struct devlink_port *dlp = &dp->devlink_port; @@ -458,11 +538,9 @@ static int dsa_port_setup(struct dsa_port *dp) if (dp->setup) return 0; - if (ds->ops->port_setup) { - err = ds->ops->port_setup(ds, dp->index); - if (err) - return err; - } + err = dsa_port_devlink_setup(dp); + if (err) + return err; switch (dp->type) { case DSA_PORT_TYPE_UNUSED: @@ -519,8 +597,7 @@ static int dsa_port_setup(struct dsa_port *dp) if (err && dsa_port_link_registered) dsa_shared_port_link_unregister_of(dp); if (err) { - if (ds->ops->port_teardown) - ds->ops->port_teardown(ds, dp->index); + dsa_port_devlink_teardown(dp); return err; } @@ -529,59 +606,13 @@ static int dsa_port_setup(struct dsa_port *dp) return 0; } -static int dsa_port_devlink_setup(struct dsa_port *dp) -{ - struct devlink_port *dlp = &dp->devlink_port; - struct dsa_switch_tree *dst = dp->ds->dst; - struct devlink_port_attrs attrs = {}; - struct devlink *dl = dp->ds->devlink; - const unsigned char *id; - unsigned char len; - int err; - - id = (const unsigned char *)&dst->index; - len = sizeof(dst->index); - - attrs.phys.port_number = dp->index; - memcpy(attrs.switch_id.id, id, len); - attrs.switch_id.id_len = len; - memset(dlp, 0, sizeof(*dlp)); - - switch (dp->type) { - case DSA_PORT_TYPE_UNUSED: - attrs.flavour = DEVLINK_PORT_FLAVOUR_UNUSED; - break; - case DSA_PORT_TYPE_CPU: - attrs.flavour = DEVLINK_PORT_FLAVOUR_CPU; - break; - case DSA_PORT_TYPE_DSA: - attrs.flavour = DEVLINK_PORT_FLAVOUR_DSA; - break; - case DSA_PORT_TYPE_USER: - attrs.flavour = DEVLINK_PORT_FLAVOUR_PHYSICAL; - break; - } - - devlink_port_attrs_set(dlp, &attrs); - err = devlink_port_register(dl, dlp, dp->index); - - if (!err) - dp->devlink_port_setup = true; - - return err; -} - static void dsa_port_teardown(struct dsa_port *dp) { struct devlink_port *dlp = &dp->devlink_port; - struct dsa_switch *ds = dp->ds; if (!dp->setup) return; - if (ds->ops->port_teardown) - ds->ops->port_teardown(ds, dp->index); - devlink_port_type_clear(dlp); switch (dp->type) { @@ -605,46 +636,15 @@ static void dsa_port_teardown(struct dsa_port *dp) break; } - dp->setup = false; -} - -static void dsa_port_devlink_teardown(struct dsa_port *dp) -{ - struct devlink_port *dlp = &dp->devlink_port; + dsa_port_devlink_teardown(dp); - if (dp->devlink_port_setup) - devlink_port_unregister(dlp); - dp->devlink_port_setup = false; + dp->setup = false; } -/* Destroy the current devlink port, and create a new one which has the UNUSED - * flavour. At this point, any call to ds->ops->port_setup has been already - * balanced out by a call to ds->ops->port_teardown, so we know that any - * devlink port regions the driver had are now unregistered. We then call its - * ds->ops->port_setup again, in order for the driver to re-create them on the - * new devlink port. - */ -static int dsa_port_reinit_as_unused(struct dsa_port *dp) +static int dsa_port_setup_as_unused(struct dsa_port *dp) { - struct dsa_switch *ds = dp->ds; - int err; - - dsa_port_devlink_teardown(dp); dp->type = DSA_PORT_TYPE_UNUSED; - err = dsa_port_devlink_setup(dp); - if (err) - return err; - - if (ds->ops->port_setup) { - /* On error, leave the devlink port registered, - * dsa_switch_teardown will clean it up later. - */ - err = ds->ops->port_setup(ds, dp->index); - if (err) - return err; - } - - return 0; + return dsa_port_setup(dp); } static int dsa_devlink_info_get(struct devlink *dl, @@ -868,7 +868,6 @@ static int dsa_switch_setup(struct dsa_switch *ds) { struct dsa_devlink_priv *dl_priv; struct device_node *dn; - struct dsa_port *dp; int err; if (ds->setup) @@ -891,18 +890,9 @@ static int dsa_switch_setup(struct dsa_switch *ds) dl_priv = devlink_priv(ds->devlink); dl_priv->ds = ds; - /* Setup devlink port instances now, so that the switch - * setup() can register regions etc, against the ports - */ - dsa_switch_for_each_port(dp, ds) { - err = dsa_port_devlink_setup(dp); - if (err) - goto unregister_devlink_ports; - } - err = dsa_switch_register_notifier(ds); if (err) - goto unregister_devlink_ports; + goto devlink_free; ds->configure_vlan_while_not_filtering = true; @@ -943,9 +933,7 @@ teardown: ds->ops->teardown(ds); unregister_notifier: dsa_switch_unregister_notifier(ds); -unregister_devlink_ports: - dsa_switch_for_each_port(dp, ds) - dsa_port_devlink_teardown(dp); +devlink_free: devlink_free(ds->devlink); ds->devlink = NULL; return err; @@ -953,8 +941,6 @@ unregister_devlink_ports: static void dsa_switch_teardown(struct dsa_switch *ds) { - struct dsa_port *dp; - if (!ds->setup) return; @@ -973,8 +959,6 @@ static void dsa_switch_teardown(struct dsa_switch *ds) dsa_switch_unregister_notifier(ds); if (ds->devlink) { - dsa_switch_for_each_port(dp, ds) - dsa_port_devlink_teardown(dp); devlink_free(ds->devlink); ds->devlink = NULL; } @@ -1027,7 +1011,7 @@ static int dsa_tree_setup_ports(struct dsa_switch_tree *dst) if (dsa_port_is_user(dp) || dsa_port_is_unused(dp)) { err = dsa_port_setup(dp); if (err) { - err = dsa_port_reinit_as_unused(dp); + err = dsa_port_setup_as_unused(dp); if (err) goto teardown; } @@ -1263,11 +1247,11 @@ int dsa_tree_change_tag_proto(struct dsa_switch_tree *dst, * attempts to change the tagging protocol. If we ever lift the IFF_UP * restriction, there needs to be another mutex which serializes this. */ - list_for_each_entry(dp, &dst->ports, list) { - if (dsa_port_is_cpu(dp) && (dp->master->flags & IFF_UP)) + dsa_tree_for_each_user_port(dp, dst) { + if (dsa_port_to_master(dp)->flags & IFF_UP) goto out_unlock; - if (dsa_port_is_user(dp) && (dp->slave->flags & IFF_UP)) + if (dp->slave->flags & IFF_UP) goto out_unlock; } @@ -1312,6 +1296,12 @@ void dsa_tree_master_admin_state_change(struct dsa_switch_tree *dst, struct dsa_port *cpu_dp = master->dsa_ptr; bool notify = false; + /* Don't keep track of admin state on LAG DSA masters, + * but rather just of physical DSA masters + */ + if (netif_is_lag_master(master)) + return; + if ((dsa_port_master_is_operational(cpu_dp)) != (up && cpu_dp->master_oper_up)) notify = true; @@ -1329,6 +1319,12 @@ void dsa_tree_master_oper_state_change(struct dsa_switch_tree *dst, struct dsa_port *cpu_dp = master->dsa_ptr; bool notify = false; + /* Don't keep track of oper state on LAG DSA masters, + * but rather just of physical DSA masters + */ + if (netif_is_lag_master(master)) + return; + if ((dsa_port_master_is_operational(cpu_dp)) != (cpu_dp->master_admin_up && up)) notify = true; @@ -1797,7 +1793,7 @@ void dsa_switch_shutdown(struct dsa_switch *ds) rtnl_lock(); dsa_switch_for_each_user_port(dp, ds) { - master = dp->cpu_dp->master; + master = dsa_port_to_master(dp); slave_dev = dp->slave; netdev_upper_dev_unlink(master, slave_dev); diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h index 614fbba8fe39..6e65c7ffd6f3 100644 --- a/net/dsa/dsa_priv.h +++ b/net/dsa/dsa_priv.h @@ -88,6 +88,7 @@ struct dsa_notifier_lag_info { const struct dsa_port *dp; struct dsa_lag lag; struct netdev_lag_upper_info *info; + struct netlink_ext_ack *extack; }; /* DSA_NOTIFIER_VLAN_* */ @@ -184,6 +185,11 @@ static inline int dsa_tag_protocol_overhead(const struct dsa_device_ops *ops) /* master.c */ int dsa_master_setup(struct net_device *dev, struct dsa_port *cpu_dp); void dsa_master_teardown(struct net_device *dev); +int dsa_master_lag_setup(struct net_device *lag_dev, struct dsa_port *cpu_dp, + struct netdev_lag_upper_info *uinfo, + struct netlink_ext_ack *extack); +void dsa_master_lag_teardown(struct net_device *lag_dev, + struct dsa_port *cpu_dp); static inline struct net_device *dsa_master_find_slave(struct net_device *dev, int device, int port) @@ -200,6 +206,9 @@ static inline struct net_device *dsa_master_find_slave(struct net_device *dev, return NULL; } +/* netlink.c */ +extern struct rtnl_link_ops dsa_link_ops __read_mostly; + /* port.c */ void dsa_port_set_tag_protocol(struct dsa_port *cpu_dp, const struct dsa_device_ops *tag_ops); @@ -285,6 +294,7 @@ int dsa_port_mrp_add_ring_role(const struct dsa_port *dp, int dsa_port_mrp_del_ring_role(const struct dsa_port *dp, const struct switchdev_obj_ring_role_mrp *mrp); int dsa_port_phylink_create(struct dsa_port *dp); +void dsa_port_phylink_destroy(struct dsa_port *dp); int dsa_shared_port_link_register_of(struct dsa_port *dp); void dsa_shared_port_link_unregister_of(struct dsa_port *dp); int dsa_port_hsr_join(struct dsa_port *dp, struct net_device *hsr); @@ -292,6 +302,8 @@ void dsa_port_hsr_leave(struct dsa_port *dp, struct net_device *hsr); int dsa_port_tag_8021q_vlan_add(struct dsa_port *dp, u16 vid, bool broadcast); void dsa_port_tag_8021q_vlan_del(struct dsa_port *dp, u16 vid, bool broadcast); void dsa_port_set_host_flood(struct dsa_port *dp, bool uc, bool mc); +int dsa_port_change_master(struct dsa_port *dp, struct net_device *master, + struct netlink_ext_ack *extack); /* slave.c */ extern const struct dsa_device_ops notag_netdev_ops; @@ -305,8 +317,12 @@ int dsa_slave_suspend(struct net_device *slave_dev); int dsa_slave_resume(struct net_device *slave_dev); int dsa_slave_register_notifier(void); void dsa_slave_unregister_notifier(void); +void dsa_slave_sync_ha(struct net_device *dev); +void dsa_slave_unsync_ha(struct net_device *dev); void dsa_slave_setup_tagger(struct net_device *slave); int dsa_slave_change_mtu(struct net_device *dev, int new_mtu); +int dsa_slave_change_master(struct net_device *dev, struct net_device *master, + struct netlink_ext_ack *extack); int dsa_slave_manage_vlan_filtering(struct net_device *dev, bool vlan_filtering); @@ -322,7 +338,7 @@ dsa_slave_to_master(const struct net_device *dev) { struct dsa_port *dp = dsa_slave_to_port(dev); - return dp->cpu_dp->master; + return dsa_port_to_master(dp); } /* If under a bridge with vlan_filtering=0, make sure to send pvid-tagged @@ -542,6 +558,7 @@ void dsa_lag_map(struct dsa_switch_tree *dst, struct dsa_lag *lag); void dsa_lag_unmap(struct dsa_switch_tree *dst, struct dsa_lag *lag); struct dsa_lag *dsa_tree_lag_find(struct dsa_switch_tree *dst, const struct net_device *lag_dev); +struct net_device *dsa_tree_find_first_master(struct dsa_switch_tree *dst); int dsa_tree_notify(struct dsa_switch_tree *dst, unsigned long e, void *v); int dsa_broadcast(unsigned long e, void *v); int dsa_tree_change_tag_proto(struct dsa_switch_tree *dst, diff --git a/net/dsa/master.c b/net/dsa/master.c index fb810edc8281..40367ab41cf8 100644 --- a/net/dsa/master.c +++ b/net/dsa/master.c @@ -226,6 +226,9 @@ static int dsa_master_ethtool_setup(struct net_device *dev) struct dsa_switch *ds = cpu_dp->ds; struct ethtool_ops *ops; + if (netif_is_lag_master(dev)) + return 0; + ops = devm_kzalloc(ds->dev, sizeof(*ops), GFP_KERNEL); if (!ops) return -ENOMEM; @@ -250,6 +253,9 @@ static void dsa_master_ethtool_teardown(struct net_device *dev) { struct dsa_port *cpu_dp = dev->dsa_ptr; + if (netif_is_lag_master(dev)) + return; + dev->ethtool_ops = cpu_dp->orig_ethtool_ops; cpu_dp->orig_ethtool_ops = NULL; } @@ -257,6 +263,9 @@ static void dsa_master_ethtool_teardown(struct net_device *dev) static void dsa_netdev_ops_set(struct net_device *dev, const struct dsa_netdevice_ops *ops) { + if (netif_is_lag_master(dev)) + return; + dev->dsa_ptr->netdev_ops = ops; } @@ -355,12 +364,14 @@ int dsa_master_setup(struct net_device *dev, struct dsa_port *cpu_dp) mtu = ETH_DATA_LEN + dsa_tag_protocol_overhead(tag_ops); /* The DSA master must use SET_NETDEV_DEV for this to work. */ - consumer_link = device_link_add(ds->dev, dev->dev.parent, - DL_FLAG_AUTOREMOVE_CONSUMER); - if (!consumer_link) - netdev_err(dev, - "Failed to create a device link to DSA switch %s\n", - dev_name(ds->dev)); + if (!netif_is_lag_master(dev)) { + consumer_link = device_link_add(ds->dev, dev->dev.parent, + DL_FLAG_AUTOREMOVE_CONSUMER); + if (!consumer_link) + netdev_err(dev, + "Failed to create a device link to DSA switch %s\n", + dev_name(ds->dev)); + } /* The switch driver may not implement ->port_change_mtu(), case in * which dsa_slave_change_mtu() will not update the master MTU either, @@ -417,3 +428,52 @@ void dsa_master_teardown(struct net_device *dev) */ wmb(); } + +int dsa_master_lag_setup(struct net_device *lag_dev, struct dsa_port *cpu_dp, + struct netdev_lag_upper_info *uinfo, + struct netlink_ext_ack *extack) +{ + bool master_setup = false; + int err; + + if (!netdev_uses_dsa(lag_dev)) { + err = dsa_master_setup(lag_dev, cpu_dp); + if (err) + return err; + + master_setup = true; + } + + err = dsa_port_lag_join(cpu_dp, lag_dev, uinfo, extack); + if (err) { + if (extack && !extack->_msg) + NL_SET_ERR_MSG_MOD(extack, + "CPU port failed to join LAG"); + goto out_master_teardown; + } + + return 0; + +out_master_teardown: + if (master_setup) + dsa_master_teardown(lag_dev); + return err; +} + +/* Tear down a master if there isn't any other user port on it, + * optionally also destroying LAG information. + */ +void dsa_master_lag_teardown(struct net_device *lag_dev, + struct dsa_port *cpu_dp) +{ + struct net_device *upper; + struct list_head *iter; + + dsa_port_lag_leave(cpu_dp, lag_dev); + + netdev_for_each_upper_dev_rcu(lag_dev, upper, iter) + if (dsa_slave_dev_check(upper)) + return; + + dsa_master_teardown(lag_dev); +} diff --git a/net/dsa/netlink.c b/net/dsa/netlink.c new file mode 100644 index 000000000000..ecf9ed1de185 --- /dev/null +++ b/net/dsa/netlink.c @@ -0,0 +1,63 @@ +// SPDX-License-Identifier: GPL-2.0 +/* Copyright 2022 NXP + */ +#include <linux/netdevice.h> +#include <net/rtnetlink.h> + +#include "dsa_priv.h" + +static const struct nla_policy dsa_policy[IFLA_DSA_MAX + 1] = { + [IFLA_DSA_MASTER] = { .type = NLA_U32 }, +}; + +static int dsa_changelink(struct net_device *dev, struct nlattr *tb[], + struct nlattr *data[], + struct netlink_ext_ack *extack) +{ + int err; + + if (!data) + return 0; + + if (data[IFLA_DSA_MASTER]) { + u32 ifindex = nla_get_u32(data[IFLA_DSA_MASTER]); + struct net_device *master; + + master = __dev_get_by_index(dev_net(dev), ifindex); + if (!master) + return -EINVAL; + + err = dsa_slave_change_master(dev, master, extack); + if (err) + return err; + } + + return 0; +} + +static size_t dsa_get_size(const struct net_device *dev) +{ + return nla_total_size(sizeof(u32)) + /* IFLA_DSA_MASTER */ + 0; +} + +static int dsa_fill_info(struct sk_buff *skb, const struct net_device *dev) +{ + struct net_device *master = dsa_slave_to_master(dev); + + if (nla_put_u32(skb, IFLA_DSA_MASTER, master->ifindex)) + return -EMSGSIZE; + + return 0; +} + +struct rtnl_link_ops dsa_link_ops __read_mostly = { + .kind = "dsa", + .priv_size = sizeof(struct dsa_port), + .maxtype = IFLA_DSA_MAX, + .policy = dsa_policy, + .changelink = dsa_changelink, + .get_size = dsa_get_size, + .fill_info = dsa_fill_info, + .netns_refund = true, +}; diff --git a/net/dsa/port.c b/net/dsa/port.c index 7afc35db0c29..208168276995 100644 --- a/net/dsa/port.c +++ b/net/dsa/port.c @@ -7,6 +7,7 @@ */ #include <linux/if_bridge.h> +#include <linux/netdevice.h> #include <linux/notifier.h> #include <linux/of_mdio.h> #include <linux/of_net.h> @@ -634,6 +635,7 @@ int dsa_port_lag_join(struct dsa_port *dp, struct net_device *lag_dev, struct dsa_notifier_lag_info info = { .dp = dp, .info = uinfo, + .extack = extack, }; struct net_device *bridge_dev; int err; @@ -1026,7 +1028,7 @@ int dsa_port_standalone_host_fdb_add(struct dsa_port *dp, int dsa_port_bridge_host_fdb_add(struct dsa_port *dp, const unsigned char *addr, u16 vid) { - struct dsa_port *cpu_dp = dp->cpu_dp; + struct net_device *master = dsa_port_to_master(dp); struct dsa_db db = { .type = DSA_DB_BRIDGE, .bridge = *dp->bridge, @@ -1037,8 +1039,8 @@ int dsa_port_bridge_host_fdb_add(struct dsa_port *dp, * requires rtnl_lock(), since we can't guarantee that is held here, * and we can't take it either. */ - if (cpu_dp->master->priv_flags & IFF_UNICAST_FLT) { - err = dev_uc_add(cpu_dp->master, addr); + if (master->priv_flags & IFF_UNICAST_FLT) { + err = dev_uc_add(master, addr); if (err) return err; } @@ -1077,15 +1079,15 @@ int dsa_port_standalone_host_fdb_del(struct dsa_port *dp, int dsa_port_bridge_host_fdb_del(struct dsa_port *dp, const unsigned char *addr, u16 vid) { - struct dsa_port *cpu_dp = dp->cpu_dp; + struct net_device *master = dsa_port_to_master(dp); struct dsa_db db = { .type = DSA_DB_BRIDGE, .bridge = *dp->bridge, }; int err; - if (cpu_dp->master->priv_flags & IFF_UNICAST_FLT) { - err = dev_uc_del(cpu_dp->master, addr); + if (master->priv_flags & IFF_UNICAST_FLT) { + err = dev_uc_del(master, addr); if (err) return err; } @@ -1208,14 +1210,14 @@ int dsa_port_standalone_host_mdb_add(const struct dsa_port *dp, int dsa_port_bridge_host_mdb_add(const struct dsa_port *dp, const struct switchdev_obj_port_mdb *mdb) { - struct dsa_port *cpu_dp = dp->cpu_dp; + struct net_device *master = dsa_port_to_master(dp); struct dsa_db db = { .type = DSA_DB_BRIDGE, .bridge = *dp->bridge, }; int err; - err = dev_mc_add(cpu_dp->master, mdb->addr); + err = dev_mc_add(master, mdb->addr); if (err) return err; @@ -1252,14 +1254,14 @@ int dsa_port_standalone_host_mdb_del(const struct dsa_port *dp, int dsa_port_bridge_host_mdb_del(const struct dsa_port *dp, const struct switchdev_obj_port_mdb *mdb) { - struct dsa_port *cpu_dp = dp->cpu_dp; + struct net_device *master = dsa_port_to_master(dp); struct dsa_db db = { .type = DSA_DB_BRIDGE, .bridge = *dp->bridge, }; int err; - err = dev_mc_del(cpu_dp->master, mdb->addr); + err = dev_mc_del(master, mdb->addr); if (err) return err; @@ -1294,19 +1296,19 @@ int dsa_port_host_vlan_add(struct dsa_port *dp, const struct switchdev_obj_port_vlan *vlan, struct netlink_ext_ack *extack) { + struct net_device *master = dsa_port_to_master(dp); struct dsa_notifier_vlan_info info = { .dp = dp, .vlan = vlan, .extack = extack, }; - struct dsa_port *cpu_dp = dp->cpu_dp; int err; err = dsa_port_notify(dp, DSA_NOTIFIER_HOST_VLAN_ADD, &info); if (err && err != -EOPNOTSUPP) return err; - vlan_vid_add(cpu_dp->master, htons(ETH_P_8021Q), vlan->vid); + vlan_vid_add(master, htons(ETH_P_8021Q), vlan->vid); return err; } @@ -1314,18 +1316,18 @@ int dsa_port_host_vlan_add(struct dsa_port *dp, int dsa_port_host_vlan_del(struct dsa_port *dp, const struct switchdev_obj_port_vlan *vlan) { + struct net_device *master = dsa_port_to_master(dp); struct dsa_notifier_vlan_info info = { .dp = dp, .vlan = vlan, }; - struct dsa_port *cpu_dp = dp->cpu_dp; int err; err = dsa_port_notify(dp, DSA_NOTIFIER_HOST_VLAN_DEL, &info); if (err && err != -EOPNOTSUPP) return err; - vlan_vid_del(cpu_dp->master, htons(ETH_P_8021Q), vlan->vid); + vlan_vid_del(master, htons(ETH_P_8021Q), vlan->vid); return err; } @@ -1374,6 +1376,136 @@ int dsa_port_mrp_del_ring_role(const struct dsa_port *dp, return ds->ops->port_mrp_del_ring_role(ds, dp->index, mrp); } +static int dsa_port_assign_master(struct dsa_port *dp, + struct net_device *master, + struct netlink_ext_ack *extack, + bool fail_on_err) +{ + struct dsa_switch *ds = dp->ds; + int port = dp->index, err; + + err = ds->ops->port_change_master(ds, port, master, extack); + if (err && !fail_on_err) + dev_err(ds->dev, "port %d failed to assign master %s: %pe\n", + port, master->name, ERR_PTR(err)); + + if (err && fail_on_err) + return err; + + dp->cpu_dp = master->dsa_ptr; + dp->cpu_port_in_lag = netif_is_lag_master(master); + + return 0; +} + +/* Change the dp->cpu_dp affinity for a user port. Note that both cross-chip + * notifiers and drivers have implicit assumptions about user-to-CPU-port + * mappings, so we unfortunately cannot delay the deletion of the objects + * (switchdev, standalone addresses, standalone VLANs) on the old CPU port + * until the new CPU port has been set up. So we need to completely tear down + * the old CPU port before changing it, and restore it on errors during the + * bringup of the new one. + */ +int dsa_port_change_master(struct dsa_port *dp, struct net_device *master, + struct netlink_ext_ack *extack) +{ + struct net_device *bridge_dev = dsa_port_bridge_dev_get(dp); + struct net_device *old_master = dsa_port_to_master(dp); + struct net_device *dev = dp->slave; + struct dsa_switch *ds = dp->ds; + bool vlan_filtering; + int err, tmp; + + /* Bridges may hold host FDB, MDB and VLAN objects. These need to be + * migrated, so dynamically unoffload and later reoffload the bridge + * port. + */ + if (bridge_dev) { + dsa_port_pre_bridge_leave(dp, bridge_dev); + dsa_port_bridge_leave(dp, bridge_dev); + } + + /* The port might still be VLAN filtering even if it's no longer + * under a bridge, either due to ds->vlan_filtering_is_global or + * ds->needs_standalone_vlan_filtering. In turn this means VLANs + * on the CPU port. + */ + vlan_filtering = dsa_port_is_vlan_filtering(dp); + if (vlan_filtering) { + err = dsa_slave_manage_vlan_filtering(dev, false); + if (err) { + NL_SET_ERR_MSG_MOD(extack, + "Failed to remove standalone VLANs"); + goto rewind_old_bridge; + } + } + + /* Standalone addresses, and addresses of upper interfaces like + * VLAN, LAG, HSR need to be migrated. + */ + dsa_slave_unsync_ha(dev); + + err = dsa_port_assign_master(dp, master, extack, true); + if (err) + goto rewind_old_addrs; + + dsa_slave_sync_ha(dev); + + if (vlan_filtering) { + err = dsa_slave_manage_vlan_filtering(dev, true); + if (err) { + NL_SET_ERR_MSG_MOD(extack, + "Failed to restore standalone VLANs"); + goto rewind_new_addrs; + } + } + + if (bridge_dev) { + err = dsa_port_bridge_join(dp, bridge_dev, extack); + if (err && err == -EOPNOTSUPP) { + NL_SET_ERR_MSG_MOD(extack, + "Failed to reoffload bridge"); + goto rewind_new_vlan; + } + } + + return 0; + +rewind_new_vlan: + if (vlan_filtering) + dsa_slave_manage_vlan_filtering(dev, false); + +rewind_new_addrs: + dsa_slave_unsync_ha(dev); + + dsa_port_assign_master(dp, old_master, NULL, false); + +/* Restore the objects on the old CPU port */ +rewind_old_addrs: + dsa_slave_sync_ha(dev); + + if (vlan_filtering) { + tmp = dsa_slave_manage_vlan_filtering(dev, true); + if (tmp) { + dev_err(ds->dev, + "port %d failed to restore standalone VLANs: %pe\n", + dp->index, ERR_PTR(tmp)); + } + } + +rewind_old_bridge: + if (bridge_dev) { + tmp = dsa_port_bridge_join(dp, bridge_dev, extack); + if (tmp) { + dev_err(ds->dev, + "port %d failed to rejoin bridge %s: %pe\n", + dp->index, bridge_dev->name, ERR_PTR(tmp)); + } + } + + return err; +} + void dsa_port_set_tag_protocol(struct dsa_port *cpu_dp, const struct dsa_device_ops *tag_ops) { @@ -1529,6 +1661,7 @@ int dsa_port_phylink_create(struct dsa_port *dp) { struct dsa_switch *ds = dp->ds; phy_interface_t mode; + struct phylink *pl; int err; err = of_get_phy_mode(dp->dn, &mode); @@ -1545,16 +1678,24 @@ int dsa_port_phylink_create(struct dsa_port *dp) if (ds->ops->phylink_get_caps) ds->ops->phylink_get_caps(ds, dp->index, &dp->pl_config); - dp->pl = phylink_create(&dp->pl_config, of_fwnode_handle(dp->dn), - mode, &dsa_port_phylink_mac_ops); - if (IS_ERR(dp->pl)) { - pr_err("error creating PHYLINK: %ld\n", PTR_ERR(dp->pl)); - return PTR_ERR(dp->pl); + pl = phylink_create(&dp->pl_config, of_fwnode_handle(dp->dn), + mode, &dsa_port_phylink_mac_ops); + if (IS_ERR(pl)) { + pr_err("error creating PHYLINK: %ld\n", PTR_ERR(pl)); + return PTR_ERR(pl); } + dp->pl = pl; + return 0; } +void dsa_port_phylink_destroy(struct dsa_port *dp) +{ + phylink_destroy(dp->pl); + dp->pl = NULL; +} + static int dsa_shared_port_setup_phy_of(struct dsa_port *dp, bool enable) { struct dsa_switch *ds = dp->ds; @@ -1649,7 +1790,7 @@ static int dsa_shared_port_phylink_register(struct dsa_port *dp) return 0; err_phy_connect: - phylink_destroy(dp->pl); + dsa_port_phylink_destroy(dp); return err; } @@ -1851,8 +1992,7 @@ void dsa_shared_port_link_unregister_of(struct dsa_port *dp) rtnl_lock(); phylink_disconnect_phy(dp->pl); rtnl_unlock(); - phylink_destroy(dp->pl); - dp->pl = NULL; + dsa_port_phylink_destroy(dp); return; } diff --git a/net/dsa/slave.c b/net/dsa/slave.c index 345106b1ed78..83e419afa89e 100644 --- a/net/dsa/slave.c +++ b/net/dsa/slave.c @@ -164,6 +164,48 @@ static int dsa_slave_unsync_mc(struct net_device *dev, return dsa_slave_schedule_standalone_work(dev, DSA_MC_DEL, addr, 0); } +void dsa_slave_sync_ha(struct net_device *dev) +{ + struct dsa_port *dp = dsa_slave_to_port(dev); + struct dsa_switch *ds = dp->ds; + struct netdev_hw_addr *ha; + + netif_addr_lock_bh(dev); + + netdev_for_each_synced_mc_addr(ha, dev) + dsa_slave_sync_mc(dev, ha->addr); + + netdev_for_each_synced_uc_addr(ha, dev) + dsa_slave_sync_uc(dev, ha->addr); + + netif_addr_unlock_bh(dev); + + if (dsa_switch_supports_uc_filtering(ds) || + dsa_switch_supports_mc_filtering(ds)) + dsa_flush_workqueue(); +} + +void dsa_slave_unsync_ha(struct net_device *dev) +{ + struct dsa_port *dp = dsa_slave_to_port(dev); + struct dsa_switch *ds = dp->ds; + struct netdev_hw_addr *ha; + + netif_addr_lock_bh(dev); + + netdev_for_each_synced_uc_addr(ha, dev) + dsa_slave_unsync_uc(dev, ha->addr); + + netdev_for_each_synced_mc_addr(ha, dev) + dsa_slave_unsync_mc(dev, ha->addr); + + netif_addr_unlock_bh(dev); + + if (dsa_switch_supports_uc_filtering(ds) || + dsa_switch_supports_mc_filtering(ds)) + dsa_flush_workqueue(); +} + /* slave mii_bus handling ***************************************************/ static int dsa_slave_phy_read(struct mii_bus *bus, int addr, int reg) { @@ -934,12 +976,12 @@ static void dsa_slave_get_ethtool_stats(struct net_device *dev, s = per_cpu_ptr(dev->tstats, i); do { - start = u64_stats_fetch_begin_irq(&s->syncp); + start = u64_stats_fetch_begin(&s->syncp); tx_packets = u64_stats_read(&s->tx_packets); tx_bytes = u64_stats_read(&s->tx_bytes); rx_packets = u64_stats_read(&s->rx_packets); rx_bytes = u64_stats_read(&s->rx_bytes); - } while (u64_stats_fetch_retry_irq(&s->syncp, start)); + } while (u64_stats_fetch_retry(&s->syncp, start)); data[0] += tx_packets; data[1] += tx_bytes; data[2] += rx_packets; @@ -1503,8 +1545,7 @@ static int dsa_slave_setup_tc_block(struct net_device *dev, static int dsa_slave_setup_ft_block(struct dsa_switch *ds, int port, void *type_data) { - struct dsa_port *cpu_dp = dsa_to_port(ds, port)->cpu_dp; - struct net_device *master = cpu_dp->master; + struct net_device *master = dsa_port_to_master(dsa_to_port(ds, port)); if (!master->netdev_ops->ndo_setup_tc) return -EOPNOTSUPP; @@ -2147,13 +2188,14 @@ static int dsa_slave_fill_forward_path(struct net_device_path_ctx *ctx, struct net_device_path *path) { struct dsa_port *dp = dsa_slave_to_port(ctx->dev); + struct net_device *master = dsa_port_to_master(dp); struct dsa_port *cpu_dp = dp->cpu_dp; path->dev = ctx->dev; path->type = DEV_PATH_DSA; path->dsa.proto = cpu_dp->tag_ops->proto; path->dsa.port = dp->index; - ctx->dev = cpu_dp->master; + ctx->dev = master; return 0; } @@ -2262,7 +2304,7 @@ static int dsa_slave_phy_setup(struct net_device *slave_dev) if (ret) { netdev_err(slave_dev, "failed to connect to PHY: %pe\n", ERR_PTR(ret)); - phylink_destroy(dp->pl); + dsa_port_phylink_destroy(dp); } return ret; @@ -2271,9 +2313,9 @@ static int dsa_slave_phy_setup(struct net_device *slave_dev) void dsa_slave_setup_tagger(struct net_device *slave) { struct dsa_port *dp = dsa_slave_to_port(slave); + struct net_device *master = dsa_port_to_master(dp); struct dsa_slave_priv *p = netdev_priv(slave); const struct dsa_port *cpu_dp = dp->cpu_dp; - struct net_device *master = cpu_dp->master; const struct dsa_switch *ds = dp->ds; slave->needed_headroom = cpu_dp->tag_ops->needed_headroom; @@ -2330,8 +2372,7 @@ int dsa_slave_resume(struct net_device *slave_dev) int dsa_slave_create(struct dsa_port *port) { - const struct dsa_port *cpu_dp = port->cpu_dp; - struct net_device *master = cpu_dp->master; + struct net_device *master = dsa_port_to_master(port); struct dsa_switch *ds = port->ds; const char *name = port->name; struct net_device *slave_dev; @@ -2347,6 +2388,7 @@ int dsa_slave_create(struct dsa_port *port) if (slave_dev == NULL) return -ENOMEM; + slave_dev->rtnl_link_ops = &dsa_link_ops; slave_dev->ethtool_ops = &dsa_slave_ethtool_ops; #if IS_ENABLED(CONFIG_DCB) slave_dev->dcbnl_ops = &dsa_slave_dcbnl_ops; @@ -2434,7 +2476,7 @@ out_phy: rtnl_lock(); phylink_disconnect_phy(p->dp->pl); rtnl_unlock(); - phylink_destroy(p->dp->pl); + dsa_port_phylink_destroy(p->dp); out_gcells: gro_cells_destroy(&p->gcells); out_free: @@ -2457,12 +2499,89 @@ void dsa_slave_destroy(struct net_device *slave_dev) phylink_disconnect_phy(dp->pl); rtnl_unlock(); - phylink_destroy(dp->pl); + dsa_port_phylink_destroy(dp); gro_cells_destroy(&p->gcells); free_percpu(slave_dev->tstats); free_netdev(slave_dev); } +int dsa_slave_change_master(struct net_device *dev, struct net_device *master, + struct netlink_ext_ack *extack) +{ + struct net_device *old_master = dsa_slave_to_master(dev); + struct dsa_port *dp = dsa_slave_to_port(dev); + struct dsa_switch *ds = dp->ds; + struct net_device *upper; + struct list_head *iter; + int err; + + if (master == old_master) + return 0; + + if (!ds->ops->port_change_master) { + NL_SET_ERR_MSG_MOD(extack, + "Driver does not support changing DSA master"); + return -EOPNOTSUPP; + } + + if (!netdev_uses_dsa(master)) { + NL_SET_ERR_MSG_MOD(extack, + "Interface not eligible as DSA master"); + return -EOPNOTSUPP; + } + + netdev_for_each_upper_dev_rcu(master, upper, iter) { + if (dsa_slave_dev_check(upper)) + continue; + if (netif_is_bridge_master(upper)) + continue; + NL_SET_ERR_MSG_MOD(extack, "Cannot join master with unknown uppers"); + return -EOPNOTSUPP; + } + + /* Since we allow live-changing the DSA master, plus we auto-open the + * DSA master when the user port opens => we need to ensure that the + * new DSA master is open too. + */ + if (dev->flags & IFF_UP) { + err = dev_open(master, extack); + if (err) + return err; + } + + netdev_upper_dev_unlink(old_master, dev); + + err = netdev_upper_dev_link(master, dev, extack); + if (err) + goto out_revert_old_master_unlink; + + err = dsa_port_change_master(dp, master, extack); + if (err) + goto out_revert_master_link; + + /* Update the MTU of the new CPU port through cross-chip notifiers */ + err = dsa_slave_change_mtu(dev, dev->mtu); + if (err && err != -EOPNOTSUPP) { + netdev_warn(dev, + "nonfatal error updating MTU with new master: %pe\n", + ERR_PTR(err)); + } + + /* If the port doesn't have its own MAC address and relies on the DSA + * master's one, inherit it again from the new DSA master. + */ + if (is_zero_ether_addr(dp->mac)) + eth_hw_addr_inherit(dev, master); + + return 0; + +out_revert_master_link: + netdev_upper_dev_unlink(master, dev); +out_revert_old_master_unlink: + netdev_upper_dev_link(old_master, dev, NULL); + return err; +} + bool dsa_slave_dev_check(const struct net_device *dev) { return dev->netdev_ops == &dsa_slave_netdev_ops; @@ -2699,11 +2818,45 @@ dsa_slave_prechangeupper_sanity_check(struct net_device *dev, return NOTIFY_DONE; } +/* To be eligible as a DSA master, a LAG must have all lower interfaces be + * eligible DSA masters. Additionally, all LAG slaves must be DSA masters of + * switches in the same switch tree. + */ +static int dsa_lag_master_validate(struct net_device *lag_dev, + struct netlink_ext_ack *extack) +{ + struct net_device *lower1, *lower2; + struct list_head *iter1, *iter2; + + netdev_for_each_lower_dev(lag_dev, lower1, iter1) { + netdev_for_each_lower_dev(lag_dev, lower2, iter2) { + if (!netdev_uses_dsa(lower1) || + !netdev_uses_dsa(lower2)) { + NL_SET_ERR_MSG_MOD(extack, + "All LAG ports must be eligible as DSA masters"); + return notifier_from_errno(-EINVAL); + } + + if (lower1 == lower2) + continue; + + if (!dsa_port_tree_same(lower1->dsa_ptr, + lower2->dsa_ptr)) { + NL_SET_ERR_MSG_MOD(extack, + "LAG contains DSA masters of disjoint switch trees"); + return notifier_from_errno(-EINVAL); + } + } + } + + return NOTIFY_DONE; +} + static int dsa_master_prechangeupper_sanity_check(struct net_device *master, struct netdev_notifier_changeupper_info *info) { - struct netlink_ext_ack *extack; + struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(&info->info); if (!netdev_uses_dsa(master)) return NOTIFY_DONE; @@ -2721,13 +2874,51 @@ dsa_master_prechangeupper_sanity_check(struct net_device *master, if (netif_is_bridge_master(info->upper_dev)) return NOTIFY_DONE; - extack = netdev_notifier_info_to_extack(&info->info); + /* Allow LAG uppers, subject to further restrictions in + * dsa_lag_master_prechangelower_sanity_check() + */ + if (netif_is_lag_master(info->upper_dev)) + return dsa_lag_master_validate(info->upper_dev, extack); NL_SET_ERR_MSG_MOD(extack, "DSA master cannot join unknown upper interfaces"); return notifier_from_errno(-EBUSY); } +static int +dsa_lag_master_prechangelower_sanity_check(struct net_device *dev, + struct netdev_notifier_changeupper_info *info) +{ + struct netlink_ext_ack *extack = netdev_notifier_info_to_extack(&info->info); + struct net_device *lag_dev = info->upper_dev; + struct net_device *lower; + struct list_head *iter; + + if (!netdev_uses_dsa(lag_dev) || !netif_is_lag_master(lag_dev)) + return NOTIFY_DONE; + + if (!info->linking) + return NOTIFY_DONE; + + if (!netdev_uses_dsa(dev)) { + NL_SET_ERR_MSG(extack, + "Only DSA masters can join a LAG DSA master"); + return notifier_from_errno(-EINVAL); + } + + netdev_for_each_lower_dev(lag_dev, lower, iter) { + if (!dsa_port_tree_same(dev->dsa_ptr, lower->dsa_ptr)) { + NL_SET_ERR_MSG(extack, + "Interface is DSA master for a different switch tree than this LAG"); + return notifier_from_errno(-EINVAL); + } + + break; + } + + return NOTIFY_DONE; +} + /* Don't allow bridging of DSA masters, since the bridge layer rx_handler * prevents the DSA fake ethertype handler to be invoked, so we don't get the * chance to strip off and parse the DSA switch tag protocol header (the bridge @@ -2768,6 +2959,136 @@ dsa_bridge_prechangelower_sanity_check(struct net_device *new_lower, return NOTIFY_DONE; } +static void dsa_tree_migrate_ports_from_lag_master(struct dsa_switch_tree *dst, + struct net_device *lag_dev) +{ + struct net_device *new_master = dsa_tree_find_first_master(dst); + struct dsa_port *dp; + int err; + + dsa_tree_for_each_user_port(dp, dst) { + if (dsa_port_to_master(dp) != lag_dev) + continue; + + err = dsa_slave_change_master(dp->slave, new_master, NULL); + if (err) { + netdev_err(dp->slave, + "failed to restore master to %s: %pe\n", + new_master->name, ERR_PTR(err)); + } + } +} + +static int dsa_master_lag_join(struct net_device *master, + struct net_device *lag_dev, + struct netdev_lag_upper_info *uinfo, + struct netlink_ext_ack *extack) +{ + struct dsa_port *cpu_dp = master->dsa_ptr; + struct dsa_switch_tree *dst = cpu_dp->dst; + struct dsa_port *dp; + int err; + + err = dsa_master_lag_setup(lag_dev, cpu_dp, uinfo, extack); + if (err) + return err; + + dsa_tree_for_each_user_port(dp, dst) { + if (dsa_port_to_master(dp) != master) + continue; + + err = dsa_slave_change_master(dp->slave, lag_dev, extack); + if (err) + goto restore; + } + + return 0; + +restore: + dsa_tree_for_each_user_port_continue_reverse(dp, dst) { + if (dsa_port_to_master(dp) != lag_dev) + continue; + + err = dsa_slave_change_master(dp->slave, master, NULL); + if (err) { + netdev_err(dp->slave, + "failed to restore master to %s: %pe\n", + master->name, ERR_PTR(err)); + } + } + + dsa_master_lag_teardown(lag_dev, master->dsa_ptr); + + return err; +} + +static void dsa_master_lag_leave(struct net_device *master, + struct net_device *lag_dev) +{ + struct dsa_port *dp, *cpu_dp = lag_dev->dsa_ptr; + struct dsa_switch_tree *dst = cpu_dp->dst; + struct dsa_port *new_cpu_dp = NULL; + struct net_device *lower; + struct list_head *iter; + + netdev_for_each_lower_dev(lag_dev, lower, iter) { + if (netdev_uses_dsa(lower)) { + new_cpu_dp = lower->dsa_ptr; + break; + } + } + + if (new_cpu_dp) { + /* Update the CPU port of the user ports still under the LAG + * so that dsa_port_to_master() continues to work properly + */ + dsa_tree_for_each_user_port(dp, dst) + if (dsa_port_to_master(dp) == lag_dev) + dp->cpu_dp = new_cpu_dp; + + /* Update the index of the virtual CPU port to match the lowest + * physical CPU port + */ + lag_dev->dsa_ptr = new_cpu_dp; + wmb(); + } else { + /* If the LAG DSA master has no ports left, migrate back all + * user ports to the first physical CPU port + */ + dsa_tree_migrate_ports_from_lag_master(dst, lag_dev); + } + + /* This DSA master has left its LAG in any case, so let + * the CPU port leave the hardware LAG as well + */ + dsa_master_lag_teardown(lag_dev, master->dsa_ptr); +} + +static int dsa_master_changeupper(struct net_device *dev, + struct netdev_notifier_changeupper_info *info) +{ + struct netlink_ext_ack *extack; + int err = NOTIFY_DONE; + + if (!netdev_uses_dsa(dev)) + return err; + + extack = netdev_notifier_info_to_extack(&info->info); + + if (netif_is_lag_master(info->upper_dev)) { + if (info->linking) { + err = dsa_master_lag_join(dev, info->upper_dev, + info->upper_info, extack); + err = notifier_from_errno(err); + } else { + dsa_master_lag_leave(dev, info->upper_dev); + err = NOTIFY_OK; + } + } + + return err; +} + static int dsa_slave_netdevice_event(struct notifier_block *nb, unsigned long event, void *ptr) { @@ -2786,6 +3107,10 @@ static int dsa_slave_netdevice_event(struct notifier_block *nb, if (notifier_to_errno(err)) return err; + err = dsa_lag_master_prechangelower_sanity_check(dev, info); + if (notifier_to_errno(err)) + return err; + err = dsa_bridge_prechangelower_sanity_check(dev, info); if (notifier_to_errno(err)) return err; @@ -2811,19 +3136,32 @@ static int dsa_slave_netdevice_event(struct notifier_block *nb, if (notifier_to_errno(err)) return err; + err = dsa_master_changeupper(dev, ptr); + if (notifier_to_errno(err)) + return err; + break; } case NETDEV_CHANGELOWERSTATE: { struct netdev_notifier_changelowerstate_info *info = ptr; struct dsa_port *dp; - int err; + int err = 0; - if (!dsa_slave_dev_check(dev)) - break; + if (dsa_slave_dev_check(dev)) { + dp = dsa_slave_to_port(dev); - dp = dsa_slave_to_port(dev); + err = dsa_port_lag_change(dp, info->lower_state_info); + } + + /* Mirror LAG port events on DSA masters that are in + * a LAG towards their respective switch CPU ports + */ + if (netdev_uses_dsa(dev)) { + dp = dev->dsa_ptr; + + err = dsa_port_lag_change(dp, info->lower_state_info); + } - err = dsa_port_lag_change(dp, info->lower_state_info); return notifier_from_errno(err); } case NETDEV_CHANGE: diff --git a/net/dsa/switch.c b/net/dsa/switch.c index 4dfd68cf61c5..ce56acdba203 100644 --- a/net/dsa/switch.c +++ b/net/dsa/switch.c @@ -398,8 +398,15 @@ static int dsa_switch_host_fdb_add(struct dsa_switch *ds, dsa_switch_for_each_port(dp, ds) { if (dsa_port_host_address_match(dp, info->dp)) { - err = dsa_port_do_fdb_add(dp, info->addr, info->vid, - info->db); + if (dsa_port_is_cpu(dp) && info->dp->cpu_port_in_lag) { + err = dsa_switch_do_lag_fdb_add(ds, dp->lag, + info->addr, + info->vid, + info->db); + } else { + err = dsa_port_do_fdb_add(dp, info->addr, + info->vid, info->db); + } if (err) break; } @@ -419,8 +426,15 @@ static int dsa_switch_host_fdb_del(struct dsa_switch *ds, dsa_switch_for_each_port(dp, ds) { if (dsa_port_host_address_match(dp, info->dp)) { - err = dsa_port_do_fdb_del(dp, info->addr, info->vid, - info->db); + if (dsa_port_is_cpu(dp) && info->dp->cpu_port_in_lag) { + err = dsa_switch_do_lag_fdb_del(ds, dp->lag, + info->addr, + info->vid, + info->db); + } else { + err = dsa_port_do_fdb_del(dp, info->addr, + info->vid, info->db); + } if (err) break; } @@ -507,12 +521,12 @@ static int dsa_switch_lag_join(struct dsa_switch *ds, { if (info->dp->ds == ds && ds->ops->port_lag_join) return ds->ops->port_lag_join(ds, info->dp->index, info->lag, - info->info); + info->info, info->extack); if (info->dp->ds != ds && ds->ops->crosschip_lag_join) return ds->ops->crosschip_lag_join(ds, info->dp->ds->index, info->dp->index, info->lag, - info->info); + info->info, info->extack); return -EOPNOTSUPP; } diff --git a/net/dsa/tag_8021q.c b/net/dsa/tag_8021q.c index b5f80bc45ceb..34e5ec5d3e23 100644 --- a/net/dsa/tag_8021q.c +++ b/net/dsa/tag_8021q.c @@ -330,7 +330,7 @@ static int dsa_tag_8021q_port_setup(struct dsa_switch *ds, int port) if (!dsa_port_is_user(dp)) return 0; - master = dp->cpu_dp->master; + master = dsa_port_to_master(dp); err = dsa_port_tag_8021q_vlan_add(dp, vid, false); if (err) { @@ -359,7 +359,7 @@ static void dsa_tag_8021q_port_teardown(struct dsa_switch *ds, int port) if (!dsa_port_is_user(dp)) return; - master = dp->cpu_dp->master; + master = dsa_port_to_master(dp); dsa_port_tag_8021q_vlan_del(dp, vid, false); diff --git a/net/ethtool/Makefile b/net/ethtool/Makefile index b76432e70e6b..72ab0944262a 100644 --- a/net/ethtool/Makefile +++ b/net/ethtool/Makefile @@ -7,4 +7,5 @@ obj-$(CONFIG_ETHTOOL_NETLINK) += ethtool_nl.o ethtool_nl-y := netlink.o bitset.o strset.o linkinfo.o linkmodes.o \ linkstate.o debug.o wol.o features.o privflags.o rings.o \ channels.o coalesce.o pause.o eee.o tsinfo.o cabletest.o \ - tunnels.o fec.o eeprom.o stats.o phc_vclocks.o module.o + tunnels.o fec.o eeprom.o stats.o phc_vclocks.o module.o \ + pse-pd.o diff --git a/net/ethtool/common.c b/net/ethtool/common.c index 566adf85e658..ee3e02da0013 100644 --- a/net/ethtool/common.c +++ b/net/ethtool/common.c @@ -202,6 +202,12 @@ const char link_mode_names[][ETH_GSTRING_LEN] = { __DEFINE_LINK_MODE_NAME(100, FX, Half), __DEFINE_LINK_MODE_NAME(100, FX, Full), __DEFINE_LINK_MODE_NAME(10, T1L, Full), + __DEFINE_LINK_MODE_NAME(800000, CR8, Full), + __DEFINE_LINK_MODE_NAME(800000, KR8, Full), + __DEFINE_LINK_MODE_NAME(800000, DR8, Full), + __DEFINE_LINK_MODE_NAME(800000, DR8_2, Full), + __DEFINE_LINK_MODE_NAME(800000, SR8, Full), + __DEFINE_LINK_MODE_NAME(800000, VR8, Full), }; static_assert(ARRAY_SIZE(link_mode_names) == __ETHTOOL_LINK_MODE_MASK_NBITS); @@ -238,6 +244,8 @@ static_assert(ARRAY_SIZE(link_mode_names) == __ETHTOOL_LINK_MODE_MASK_NBITS); #define __LINK_MODE_LANES_X 1 #define __LINK_MODE_LANES_FX 1 #define __LINK_MODE_LANES_T1L 1 +#define __LINK_MODE_LANES_VR8 8 +#define __LINK_MODE_LANES_DR8_2 8 #define __DEFINE_LINK_MODE_PARAMS(_speed, _type, _duplex) \ [ETHTOOL_LINK_MODE(_speed, _type, _duplex)] = { \ @@ -352,6 +360,12 @@ const struct link_mode_info link_mode_params[] = { __DEFINE_LINK_MODE_PARAMS(100, FX, Half), __DEFINE_LINK_MODE_PARAMS(100, FX, Full), __DEFINE_LINK_MODE_PARAMS(10, T1L, Full), + __DEFINE_LINK_MODE_PARAMS(800000, CR8, Full), + __DEFINE_LINK_MODE_PARAMS(800000, KR8, Full), + __DEFINE_LINK_MODE_PARAMS(800000, DR8, Full), + __DEFINE_LINK_MODE_PARAMS(800000, DR8_2, Full), + __DEFINE_LINK_MODE_PARAMS(800000, SR8, Full), + __DEFINE_LINK_MODE_PARAMS(800000, VR8, Full), }; static_assert(ARRAY_SIZE(link_mode_params) == __ETHTOOL_LINK_MODE_MASK_NBITS); diff --git a/net/ethtool/common.h b/net/ethtool/common.h index 2dc2b80aea5f..c1779657e074 100644 --- a/net/ethtool/common.h +++ b/net/ethtool/common.h @@ -46,6 +46,7 @@ int ethtool_get_max_rxfh_channel(struct net_device *dev, u32 *max); int __ethtool_get_ts_info(struct net_device *dev, struct ethtool_ts_info *info); extern const struct ethtool_phy_ops *ethtool_phy_ops; +extern const struct ethtool_pse_ops *ethtool_pse_ops; int ethtool_get_module_info_call(struct net_device *dev, struct ethtool_modinfo *modinfo); diff --git a/net/ethtool/eeprom.c b/net/ethtool/eeprom.c index 1c94bb8ea03f..49c0a2a77f02 100644 --- a/net/ethtool/eeprom.c +++ b/net/ethtool/eeprom.c @@ -124,7 +124,7 @@ static int eeprom_prepare_data(const struct ethnl_req_info *req_base, if (ret) goto err_free; - ret = get_module_eeprom_by_page(dev, &page_data, info->extack); + ret = get_module_eeprom_by_page(dev, &page_data, info ? info->extack : NULL); if (ret < 0) goto err_ops; diff --git a/net/ethtool/ioctl.c b/net/ethtool/ioctl.c index 9298eb3251cb..57e7238a4136 100644 --- a/net/ethtool/ioctl.c +++ b/net/ethtool/ioctl.c @@ -571,6 +571,7 @@ static int ethtool_get_link_ksettings(struct net_device *dev, = __ETHTOOL_LINK_MODE_MASK_NU32; link_ksettings.base.master_slave_cfg = MASTER_SLAVE_CFG_UNSUPPORTED; link_ksettings.base.master_slave_state = MASTER_SLAVE_STATE_UNSUPPORTED; + link_ksettings.base.rate_matching = RATE_MATCH_NONE; return store_link_ksettings_for_user(useraddr, &link_ksettings); } diff --git a/net/ethtool/linkmodes.c b/net/ethtool/linkmodes.c index 99b29b4fe947..126e06c713a3 100644 --- a/net/ethtool/linkmodes.c +++ b/net/ethtool/linkmodes.c @@ -70,6 +70,7 @@ static int linkmodes_reply_size(const struct ethnl_req_info *req_base, + nla_total_size(sizeof(u32)) /* LINKMODES_SPEED */ + nla_total_size(sizeof(u32)) /* LINKMODES_LANES */ + nla_total_size(sizeof(u8)) /* LINKMODES_DUPLEX */ + + nla_total_size(sizeof(u8)) /* LINKMODES_RATE_MATCHING */ + 0; ret = ethnl_bitset_size(ksettings->link_modes.advertising, ksettings->link_modes.supported, @@ -143,6 +144,10 @@ static int linkmodes_fill_reply(struct sk_buff *skb, lsettings->master_slave_state)) return -EMSGSIZE; + if (nla_put_u8(skb, ETHTOOL_A_LINKMODES_RATE_MATCHING, + lsettings->rate_matching)) + return -EMSGSIZE; + return 0; } diff --git a/net/ethtool/netlink.c b/net/ethtool/netlink.c index f4e41a6e0163..1a4c11356c96 100644 --- a/net/ethtool/netlink.c +++ b/net/ethtool/netlink.c @@ -286,6 +286,7 @@ ethnl_default_requests[__ETHTOOL_MSG_USER_CNT] = { [ETHTOOL_MSG_STATS_GET] = ðnl_stats_request_ops, [ETHTOOL_MSG_PHC_VCLOCKS_GET] = ðnl_phc_vclocks_request_ops, [ETHTOOL_MSG_MODULE_GET] = ðnl_module_request_ops, + [ETHTOOL_MSG_PSE_GET] = ðnl_pse_request_ops, }; static struct ethnl_dump_ctx *ethnl_dump_context(struct netlink_callback *cb) @@ -1023,6 +1024,22 @@ static const struct genl_ops ethtool_genl_ops[] = { .policy = ethnl_module_set_policy, .maxattr = ARRAY_SIZE(ethnl_module_set_policy) - 1, }, + { + .cmd = ETHTOOL_MSG_PSE_GET, + .doit = ethnl_default_doit, + .start = ethnl_default_start, + .dumpit = ethnl_default_dumpit, + .done = ethnl_default_done, + .policy = ethnl_pse_get_policy, + .maxattr = ARRAY_SIZE(ethnl_pse_get_policy) - 1, + }, + { + .cmd = ETHTOOL_MSG_PSE_SET, + .flags = GENL_UNS_ADMIN_PERM, + .doit = ethnl_set_pse, + .policy = ethnl_pse_set_policy, + .maxattr = ARRAY_SIZE(ethnl_pse_set_policy) - 1, + }, }; static const struct genl_multicast_group ethtool_nl_mcgrps[] = { diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index c0d587611854..1bfd374f9718 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -345,6 +345,7 @@ extern const struct ethnl_request_ops ethnl_module_eeprom_request_ops; extern const struct ethnl_request_ops ethnl_stats_request_ops; extern const struct ethnl_request_ops ethnl_phc_vclocks_request_ops; extern const struct ethnl_request_ops ethnl_module_request_ops; +extern const struct ethnl_request_ops ethnl_pse_request_ops; extern const struct nla_policy ethnl_header_policy[ETHTOOL_A_HEADER_FLAGS + 1]; extern const struct nla_policy ethnl_header_policy_stats[ETHTOOL_A_HEADER_FLAGS + 1]; @@ -383,6 +384,8 @@ extern const struct nla_policy ethnl_stats_get_policy[ETHTOOL_A_STATS_GROUPS + 1 extern const struct nla_policy ethnl_phc_vclocks_get_policy[ETHTOOL_A_PHC_VCLOCKS_HEADER + 1]; extern const struct nla_policy ethnl_module_get_policy[ETHTOOL_A_MODULE_HEADER + 1]; extern const struct nla_policy ethnl_module_set_policy[ETHTOOL_A_MODULE_POWER_MODE_POLICY + 1]; +extern const struct nla_policy ethnl_pse_get_policy[ETHTOOL_A_PSE_HEADER + 1]; +extern const struct nla_policy ethnl_pse_set_policy[ETHTOOL_A_PSE_MAX + 1]; int ethnl_set_linkinfo(struct sk_buff *skb, struct genl_info *info); int ethnl_set_linkmodes(struct sk_buff *skb, struct genl_info *info); @@ -402,6 +405,7 @@ int ethnl_tunnel_info_start(struct netlink_callback *cb); int ethnl_tunnel_info_dumpit(struct sk_buff *skb, struct netlink_callback *cb); int ethnl_set_fec(struct sk_buff *skb, struct genl_info *info); int ethnl_set_module(struct sk_buff *skb, struct genl_info *info); +int ethnl_set_pse(struct sk_buff *skb, struct genl_info *info); extern const char stats_std_names[__ETHTOOL_STATS_CNT][ETH_GSTRING_LEN]; extern const char stats_eth_phy_names[__ETHTOOL_A_STATS_ETH_PHY_CNT][ETH_GSTRING_LEN]; diff --git a/net/ethtool/pse-pd.c b/net/ethtool/pse-pd.c new file mode 100644 index 000000000000..e8683e485dc9 --- /dev/null +++ b/net/ethtool/pse-pd.c @@ -0,0 +1,185 @@ +// SPDX-License-Identifier: GPL-2.0-only +// +// ethtool interface for for Ethernet PSE (Power Sourcing Equipment) +// and PD (Powered Device) +// +// Copyright (c) 2022 Pengutronix, Oleksij Rempel <kernel@pengutronix.de> +// + +#include "common.h" +#include "linux/pse-pd/pse.h" +#include "netlink.h" +#include <linux/ethtool_netlink.h> +#include <linux/ethtool.h> +#include <linux/phy.h> + +struct pse_req_info { + struct ethnl_req_info base; +}; + +struct pse_reply_data { + struct ethnl_reply_data base; + struct pse_control_status status; +}; + +#define PSE_REPDATA(__reply_base) \ + container_of(__reply_base, struct pse_reply_data, base) + +/* PSE_GET */ + +const struct nla_policy ethnl_pse_get_policy[ETHTOOL_A_PSE_HEADER + 1] = { + [ETHTOOL_A_PSE_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), +}; + +static int pse_get_pse_attributes(struct net_device *dev, + struct netlink_ext_ack *extack, + struct pse_reply_data *data) +{ + struct phy_device *phydev = dev->phydev; + + if (!phydev) { + NL_SET_ERR_MSG(extack, "No PHY is attached"); + return -EOPNOTSUPP; + } + + if (!phydev->psec) { + NL_SET_ERR_MSG(extack, "No PSE is attached"); + return -EOPNOTSUPP; + } + + memset(&data->status, 0, sizeof(data->status)); + + return pse_ethtool_get_status(phydev->psec, extack, &data->status); +} + +static int pse_prepare_data(const struct ethnl_req_info *req_base, + struct ethnl_reply_data *reply_base, + struct genl_info *info) +{ + struct pse_reply_data *data = PSE_REPDATA(reply_base); + struct net_device *dev = reply_base->dev; + int ret; + + ret = ethnl_ops_begin(dev); + if (ret < 0) + return ret; + + ret = pse_get_pse_attributes(dev, info ? info->extack : NULL, data); + + ethnl_ops_complete(dev); + + return ret; +} + +static int pse_reply_size(const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + const struct pse_reply_data *data = PSE_REPDATA(reply_base); + const struct pse_control_status *st = &data->status; + int len = 0; + + if (st->podl_admin_state > 0) + len += nla_total_size(sizeof(u32)); /* _PODL_PSE_ADMIN_STATE */ + if (st->podl_pw_status > 0) + len += nla_total_size(sizeof(u32)); /* _PODL_PSE_PW_D_STATUS */ + + return len; +} + +static int pse_fill_reply(struct sk_buff *skb, + const struct ethnl_req_info *req_base, + const struct ethnl_reply_data *reply_base) +{ + const struct pse_reply_data *data = PSE_REPDATA(reply_base); + const struct pse_control_status *st = &data->status; + + if (st->podl_admin_state > 0 && + nla_put_u32(skb, ETHTOOL_A_PODL_PSE_ADMIN_STATE, + st->podl_admin_state)) + return -EMSGSIZE; + + if (st->podl_pw_status > 0 && + nla_put_u32(skb, ETHTOOL_A_PODL_PSE_PW_D_STATUS, + st->podl_pw_status)) + return -EMSGSIZE; + + return 0; +} + +const struct ethnl_request_ops ethnl_pse_request_ops = { + .request_cmd = ETHTOOL_MSG_PSE_GET, + .reply_cmd = ETHTOOL_MSG_PSE_GET_REPLY, + .hdr_attr = ETHTOOL_A_PSE_HEADER, + .req_info_size = sizeof(struct pse_req_info), + .reply_data_size = sizeof(struct pse_reply_data), + + .prepare_data = pse_prepare_data, + .reply_size = pse_reply_size, + .fill_reply = pse_fill_reply, +}; + +/* PSE_SET */ + +const struct nla_policy ethnl_pse_set_policy[ETHTOOL_A_PSE_MAX + 1] = { + [ETHTOOL_A_PSE_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), + [ETHTOOL_A_PODL_PSE_ADMIN_CONTROL] = + NLA_POLICY_RANGE(NLA_U32, ETHTOOL_PODL_PSE_ADMIN_STATE_DISABLED, + ETHTOOL_PODL_PSE_ADMIN_STATE_ENABLED), +}; + +static int pse_set_pse_config(struct net_device *dev, + struct netlink_ext_ack *extack, + struct nlattr **tb) +{ + struct phy_device *phydev = dev->phydev; + struct pse_control_config config = {}; + + /* Optional attribute. Do not return error if not set. */ + if (!tb[ETHTOOL_A_PODL_PSE_ADMIN_CONTROL]) + return 0; + + /* this values are already validated by the ethnl_pse_set_policy */ + config.admin_cotrol = nla_get_u32(tb[ETHTOOL_A_PODL_PSE_ADMIN_CONTROL]); + + if (!phydev) { + NL_SET_ERR_MSG(extack, "No PHY is attached"); + return -EOPNOTSUPP; + } + + if (!phydev->psec) { + NL_SET_ERR_MSG(extack, "No PSE is attached"); + return -EOPNOTSUPP; + } + + return pse_ethtool_set_config(phydev->psec, extack, &config); +} + +int ethnl_set_pse(struct sk_buff *skb, struct genl_info *info) +{ + struct ethnl_req_info req_info = {}; + struct nlattr **tb = info->attrs; + struct net_device *dev; + int ret; + + ret = ethnl_parse_header_dev_get(&req_info, tb[ETHTOOL_A_PSE_HEADER], + genl_info_net(info), info->extack, + true); + if (ret < 0) + return ret; + + dev = req_info.dev; + + rtnl_lock(); + ret = ethnl_ops_begin(dev); + if (ret < 0) + goto out_rtnl; + + ret = pse_set_pse_config(dev, info->extack, tb); + ethnl_ops_complete(dev); +out_rtnl: + rtnl_unlock(); + + ethnl_parse_header_dev_put(&req_info); + + return ret; +} diff --git a/net/ethtool/tunnels.c b/net/ethtool/tunnels.c index efde33536687..67fb414ca859 100644 --- a/net/ethtool/tunnels.c +++ b/net/ethtool/tunnels.c @@ -136,6 +136,8 @@ ethnl_tunnel_info_fill_reply(const struct ethnl_req_info *req_base, goto err_cancel_table; entry = nla_nest_start(skb, ETHTOOL_A_TUNNEL_UDP_TABLE_ENTRY); + if (!entry) + goto err_cancel_entry; if (nla_put_be16(skb, ETHTOOL_A_TUNNEL_UDP_ENTRY_PORT, htons(IANA_VXLAN_UDP_PORT)) || diff --git a/net/hsr/hsr_forward.c b/net/hsr/hsr_forward.c index 5bf357734b11..a50429a62f74 100644 --- a/net/hsr/hsr_forward.c +++ b/net/hsr/hsr_forward.c @@ -150,15 +150,15 @@ struct sk_buff *hsr_get_untagged_frame(struct hsr_frame_info *frame, struct hsr_port *port) { if (!frame->skb_std) { - if (frame->skb_hsr) { + if (frame->skb_hsr) frame->skb_std = create_stripped_skb_hsr(frame->skb_hsr, frame); - } else { - /* Unexpected */ - WARN_ONCE(1, "%s:%d: Unexpected frame received (port_src %s)\n", - __FILE__, __LINE__, port->dev->name); + else + netdev_warn_once(port->dev, + "Unexpected frame received in hsr_get_untagged_frame()\n"); + + if (!frame->skb_std) return NULL; - } } return skb_clone(frame->skb_std, GFP_ATOMIC); diff --git a/net/ieee802154/core.c b/net/ieee802154/core.c index de259b5170ab..57546e07e06a 100644 --- a/net/ieee802154/core.c +++ b/net/ieee802154/core.c @@ -129,6 +129,9 @@ wpan_phy_new(const struct cfg802154_ops *ops, size_t priv_size) wpan_phy_net_set(&rdev->wpan_phy, &init_net); init_waitqueue_head(&rdev->dev_wait); + init_waitqueue_head(&rdev->wpan_phy.sync_txq); + + spin_lock_init(&rdev->wpan_phy.queue_lock); return &rdev->wpan_phy; } diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c index 718fb77bb372..1fa2fe041ec0 100644 --- a/net/ieee802154/socket.c +++ b/net/ieee802154/socket.c @@ -200,8 +200,9 @@ static int raw_bind(struct sock *sk, struct sockaddr *_uaddr, int len) int err = 0; struct net_device *dev = NULL; - if (len < sizeof(*uaddr)) - return -EINVAL; + err = ieee802154_sockaddr_check_size(uaddr, len); + if (err < 0) + return err; uaddr = (struct sockaddr_ieee802154 *)_uaddr; if (uaddr->family != AF_IEEE802154) @@ -271,6 +272,10 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) err = -EMSGSIZE; goto out_dev; } + if (!size) { + err = 0; + goto out_dev; + } hlen = LL_RESERVED_SPACE(dev); tlen = dev->needed_tailroom; @@ -493,11 +498,14 @@ static int dgram_bind(struct sock *sk, struct sockaddr *uaddr, int len) ro->bound = 0; - if (len < sizeof(*addr)) + err = ieee802154_sockaddr_check_size(addr, len); + if (err < 0) goto out; - if (addr->family != AF_IEEE802154) + if (addr->family != AF_IEEE802154) { + err = -EINVAL; goto out; + } ieee802154_addr_from_sa(&haddr, &addr->addr); dev = ieee802154_get_dev(sock_net(sk), &haddr); @@ -564,8 +572,9 @@ static int dgram_connect(struct sock *sk, struct sockaddr *uaddr, struct dgram_sock *ro = dgram_sk(sk); int err = 0; - if (len < sizeof(*addr)) - return -EINVAL; + err = ieee802154_sockaddr_check_size(addr, len); + if (err < 0) + return err; if (addr->family != AF_IEEE802154) return -EINVAL; @@ -604,6 +613,7 @@ static int dgram_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) struct ieee802154_mac_cb *cb; struct dgram_sock *ro = dgram_sk(sk); struct ieee802154_addr dst_addr; + DECLARE_SOCKADDR(struct sockaddr_ieee802154*, daddr, msg->msg_name); int hlen, tlen; int err; @@ -612,10 +622,20 @@ static int dgram_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) return -EOPNOTSUPP; } - if (!ro->connected && !msg->msg_name) - return -EDESTADDRREQ; - else if (ro->connected && msg->msg_name) - return -EISCONN; + if (msg->msg_name) { + if (ro->connected) + return -EISCONN; + if (msg->msg_namelen < IEEE802154_MIN_NAMELEN) + return -EINVAL; + err = ieee802154_sockaddr_check_size(daddr, msg->msg_namelen); + if (err < 0) + return err; + ieee802154_addr_from_sa(&dst_addr, &daddr->addr); + } else { + if (!ro->connected) + return -EDESTADDRREQ; + dst_addr = ro->dst_addr; + } if (!ro->bound) dev = dev_getfirstbyhwtype(sock_net(sk), ARPHRD_IEEE802154); @@ -651,16 +671,6 @@ static int dgram_sendmsg(struct sock *sk, struct msghdr *msg, size_t size) cb = mac_cb_init(skb); cb->type = IEEE802154_FC_TYPE_DATA; cb->ackreq = ro->want_ack; - - if (msg->msg_name) { - DECLARE_SOCKADDR(struct sockaddr_ieee802154*, - daddr, msg->msg_name); - - ieee802154_addr_from_sa(&dst_addr, &daddr->addr); - } else { - dst_addr = ro->dst_addr; - } - cb->secen = ro->secen; cb->secen_override = ro->secen_override; cb->seclevel = ro->seclevel; diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile index bbdd9c44f14e..af7d2cf490fb 100644 --- a/net/ipv4/Makefile +++ b/net/ipv4/Makefile @@ -10,7 +10,7 @@ obj-y := route.o inetpeer.o protocol.o \ tcp.o tcp_input.o tcp_output.o tcp_timer.o tcp_ipv4.o \ tcp_minisocks.o tcp_cong.o tcp_metrics.o tcp_fastopen.o \ tcp_rate.o tcp_recovery.o tcp_ulp.o \ - tcp_offload.o datagram.o raw.o udp.o udplite.o \ + tcp_offload.o tcp_plb.o datagram.o raw.o udp.o udplite.o \ udp_offload.o arp.o icmp.o devinet.o af_inet.o igmp.o \ fib_frontend.o fib_semantics.o fib_trie.o fib_notifier.o \ inet_fragment.o ping.o ip_tunnel_core.o gre_offload.o \ diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index d3ab1ae32ef5..585f13b6fef6 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -558,22 +558,27 @@ int inet_dgram_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags) { struct sock *sk = sock->sk; + const struct proto *prot; int err; if (addr_len < sizeof(uaddr->sa_family)) return -EINVAL; + + /* IPV6_ADDRFORM can change sk->sk_prot under us. */ + prot = READ_ONCE(sk->sk_prot); + if (uaddr->sa_family == AF_UNSPEC) - return sk->sk_prot->disconnect(sk, flags); + return prot->disconnect(sk, flags); if (BPF_CGROUP_PRE_CONNECT_ENABLED(sk)) { - err = sk->sk_prot->pre_connect(sk, uaddr, addr_len); + err = prot->pre_connect(sk, uaddr, addr_len); if (err) return err; } if (data_race(!inet_sk(sk)->inet_num) && inet_autobind(sk)) return -EAGAIN; - return sk->sk_prot->connect(sk, uaddr, addr_len); + return prot->connect(sk, uaddr, addr_len); } EXPORT_SYMBOL(inet_dgram_connect); @@ -734,10 +739,11 @@ EXPORT_SYMBOL(inet_stream_connect); int inet_accept(struct socket *sock, struct socket *newsock, int flags, bool kern) { - struct sock *sk1 = sock->sk; + struct sock *sk1 = sock->sk, *sk2; int err = -EINVAL; - struct sock *sk2 = sk1->sk_prot->accept(sk1, flags, &err, kern); + /* IPV6_ADDRFORM can change sk->sk_prot under us. */ + sk2 = READ_ONCE(sk1->sk_prot)->accept(sk1, flags, &err, kern); if (!sk2) goto do_err; @@ -825,12 +831,15 @@ ssize_t inet_sendpage(struct socket *sock, struct page *page, int offset, size_t size, int flags) { struct sock *sk = sock->sk; + const struct proto *prot; if (unlikely(inet_send_prepare(sk))) return -EAGAIN; - if (sk->sk_prot->sendpage) - return sk->sk_prot->sendpage(sk, page, offset, size, flags); + /* IPV6_ADDRFORM can change sk->sk_prot under us. */ + prot = READ_ONCE(sk->sk_prot); + if (prot->sendpage) + return prot->sendpage(sk, page, offset, size, flags); return sock_no_sendpage(sock, page, offset, size, flags); } EXPORT_SYMBOL(inet_sendpage); @@ -1250,7 +1259,7 @@ static int inet_sk_reselect_saddr(struct sock *sk) } prev_addr_hashbucket = - inet_bhashfn_portaddr(sk->sk_prot->h.hashinfo, sk, + inet_bhashfn_portaddr(tcp_or_dccp_get_hashinfo(sk), sk, sock_net(sk), inet->inet_num); inet->inet_saddr = inet->inet_rcv_saddr = new_saddr; @@ -1697,9 +1706,9 @@ u64 snmp_get_cpu_field64(void __percpu *mib, int cpu, int offt, bhptr = per_cpu_ptr(mib, cpu); syncp = (struct u64_stats_sync *)(bhptr + syncp_offset); do { - start = u64_stats_fetch_begin_irq(syncp); + start = u64_stats_fetch_begin(syncp); v = *(((u64 *)bhptr) + offt); - } while (u64_stats_fetch_retry_irq(syncp, start)); + } while (u64_stats_fetch_retry(syncp, start)); return v; } diff --git a/net/ipv4/ah4.c b/net/ipv4/ah4.c index f8ad04470d3a..ee4e578c7f20 100644 --- a/net/ipv4/ah4.c +++ b/net/ipv4/ah4.c @@ -471,30 +471,38 @@ static int ah4_err(struct sk_buff *skb, u32 info) return 0; } -static int ah_init_state(struct xfrm_state *x) +static int ah_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack) { struct ah_data *ahp = NULL; struct xfrm_algo_desc *aalg_desc; struct crypto_ahash *ahash; - if (!x->aalg) + if (!x->aalg) { + NL_SET_ERR_MSG(extack, "AH requires a state with an AUTH algorithm"); goto error; + } - if (x->encap) + if (x->encap) { + NL_SET_ERR_MSG(extack, "AH is not compatible with encapsulation"); goto error; + } ahp = kzalloc(sizeof(*ahp), GFP_KERNEL); if (!ahp) return -ENOMEM; ahash = crypto_alloc_ahash(x->aalg->alg_name, 0, 0); - if (IS_ERR(ahash)) + if (IS_ERR(ahash)) { + NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations"); goto error; + } ahp->ahash = ahash; if (crypto_ahash_setkey(ahash, x->aalg->alg_key, - (x->aalg->alg_key_len + 7) / 8)) + (x->aalg->alg_key_len + 7) / 8)) { + NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations"); goto error; + } /* * Lookup the algorithm description maintained by xfrm_algo, @@ -507,10 +515,7 @@ static int ah_init_state(struct xfrm_state *x) if (aalg_desc->uinfo.auth.icv_fullbits/8 != crypto_ahash_digestsize(ahash)) { - pr_info("%s: %s digestsize %u != %u\n", - __func__, x->aalg->alg_name, - crypto_ahash_digestsize(ahash), - aalg_desc->uinfo.auth.icv_fullbits / 8); + NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations"); goto error; } diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c index 85a9e500c42d..6da16ae6a962 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -124,7 +124,7 @@ static int bpf_tcp_ca_btf_struct_access(struct bpf_verifier_log *log, return -EACCES; } - return NOT_INIT; + return 0; } BPF_CALL_2(bpf_tcp_send_ack, struct tcp_sock *, tp, u32, rcv_nxt) diff --git a/net/ipv4/datagram.c b/net/ipv4/datagram.c index 405a8c2aea64..4d1af0cd7d99 100644 --- a/net/ipv4/datagram.c +++ b/net/ipv4/datagram.c @@ -70,10 +70,10 @@ int __ip4_datagram_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len } inet->inet_daddr = fl4->daddr; inet->inet_dport = usin->sin_port; - reuseport_has_conns(sk, true); + reuseport_has_conns_set(sk); sk->sk_state = TCP_ESTABLISHED; sk_set_txhash(sk); - inet->inet_id = prandom_u32(); + inet->inet_id = get_random_u16(); sk_dst_set(sk, &rt->dst); err = 0; diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c index 5c03eba787e5..52c8047efedb 100644 --- a/net/ipv4/esp4.c +++ b/net/ipv4/esp4.c @@ -134,6 +134,7 @@ static void esp_free_tcp_sk(struct rcu_head *head) static struct sock *esp_find_tcp_sk(struct xfrm_state *x) { struct xfrm_encap_tmpl *encap = x->encap; + struct net *net = xs_net(x); struct esp_tcp_sk *esk; __be16 sport, dport; struct sock *nsk; @@ -160,7 +161,7 @@ static struct sock *esp_find_tcp_sk(struct xfrm_state *x) } spin_unlock_bh(&x->lock); - sk = inet_lookup_established(xs_net(x), &tcp_hashinfo, x->id.daddr.a4, + sk = inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, x->id.daddr.a4, dport, x->props.saddr.a4, sport, 0); if (!sk) return ERR_PTR(-ENOENT); @@ -1007,16 +1008,17 @@ static void esp_destroy(struct xfrm_state *x) crypto_free_aead(aead); } -static int esp_init_aead(struct xfrm_state *x) +static int esp_init_aead(struct xfrm_state *x, struct netlink_ext_ack *extack) { char aead_name[CRYPTO_MAX_ALG_NAME]; struct crypto_aead *aead; int err; - err = -ENAMETOOLONG; if (snprintf(aead_name, CRYPTO_MAX_ALG_NAME, "%s(%s)", - x->geniv, x->aead->alg_name) >= CRYPTO_MAX_ALG_NAME) - goto error; + x->geniv, x->aead->alg_name) >= CRYPTO_MAX_ALG_NAME) { + NL_SET_ERR_MSG(extack, "Algorithm name is too long"); + return -ENAMETOOLONG; + } aead = crypto_alloc_aead(aead_name, 0, 0); err = PTR_ERR(aead); @@ -1034,11 +1036,15 @@ static int esp_init_aead(struct xfrm_state *x) if (err) goto error; + return 0; + error: + NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations"); return err; } -static int esp_init_authenc(struct xfrm_state *x) +static int esp_init_authenc(struct xfrm_state *x, + struct netlink_ext_ack *extack) { struct crypto_aead *aead; struct crypto_authenc_key_param *param; @@ -1049,10 +1055,6 @@ static int esp_init_authenc(struct xfrm_state *x) unsigned int keylen; int err; - err = -EINVAL; - if (!x->ealg) - goto error; - err = -ENAMETOOLONG; if ((x->props.flags & XFRM_STATE_ESN)) { @@ -1061,22 +1063,28 @@ static int esp_init_authenc(struct xfrm_state *x) x->geniv ?: "", x->geniv ? "(" : "", x->aalg ? x->aalg->alg_name : "digest_null", x->ealg->alg_name, - x->geniv ? ")" : "") >= CRYPTO_MAX_ALG_NAME) + x->geniv ? ")" : "") >= CRYPTO_MAX_ALG_NAME) { + NL_SET_ERR_MSG(extack, "Algorithm name is too long"); goto error; + } } else { if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME, "%s%sauthenc(%s,%s)%s", x->geniv ?: "", x->geniv ? "(" : "", x->aalg ? x->aalg->alg_name : "digest_null", x->ealg->alg_name, - x->geniv ? ")" : "") >= CRYPTO_MAX_ALG_NAME) + x->geniv ? ")" : "") >= CRYPTO_MAX_ALG_NAME) { + NL_SET_ERR_MSG(extack, "Algorithm name is too long"); goto error; + } } aead = crypto_alloc_aead(authenc_name, 0, 0); err = PTR_ERR(aead); - if (IS_ERR(aead)) + if (IS_ERR(aead)) { + NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations"); goto error; + } x->data = aead; @@ -1106,17 +1114,16 @@ static int esp_init_authenc(struct xfrm_state *x) err = -EINVAL; if (aalg_desc->uinfo.auth.icv_fullbits / 8 != crypto_aead_authsize(aead)) { - pr_info("ESP: %s digestsize %u != %u\n", - x->aalg->alg_name, - crypto_aead_authsize(aead), - aalg_desc->uinfo.auth.icv_fullbits / 8); + NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations"); goto free_key; } err = crypto_aead_setauthsize( aead, x->aalg->alg_trunc_len / 8); - if (err) + if (err) { + NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations"); goto free_key; + } } param->enckeylen = cpu_to_be32((x->ealg->alg_key_len + 7) / 8); @@ -1131,7 +1138,7 @@ error: return err; } -static int esp_init_state(struct xfrm_state *x) +static int esp_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack) { struct crypto_aead *aead; u32 align; @@ -1139,10 +1146,14 @@ static int esp_init_state(struct xfrm_state *x) x->data = NULL; - if (x->aead) - err = esp_init_aead(x); - else - err = esp_init_authenc(x); + if (x->aead) { + err = esp_init_aead(x, extack); + } else if (x->ealg) { + err = esp_init_authenc(x, extack); + } else { + NL_SET_ERR_MSG(extack, "ESP: AEAD or CRYPT must be provided"); + err = -EINVAL; + } if (err) goto error; @@ -1160,6 +1171,7 @@ static int esp_init_state(struct xfrm_state *x) switch (encap->encap_type) { default: + NL_SET_ERR_MSG(extack, "Unsupported encapsulation type for ESP"); err = -EINVAL; goto error; case UDP_ENCAP_ESPINUDP: diff --git a/net/ipv4/esp4_offload.c b/net/ipv4/esp4_offload.c index 935026f4c807..170152772d33 100644 --- a/net/ipv4/esp4_offload.c +++ b/net/ipv4/esp4_offload.c @@ -110,7 +110,10 @@ static struct sk_buff *xfrm4_tunnel_gso_segment(struct xfrm_state *x, struct sk_buff *skb, netdev_features_t features) { - return skb_eth_gso_segment(skb, features, htons(ETH_P_IP)); + __be16 type = x->inner_mode.family == AF_INET6 ? htons(ETH_P_IPV6) + : htons(ETH_P_IP); + + return skb_eth_gso_segment(skb, features, type); } static struct sk_buff *xfrm4_transport_gso_segment(struct xfrm_state *x, diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 943edf4ad4db..f361d3d56be2 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -389,7 +389,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, dev_match = dev_match || (res.type == RTN_LOCAL && dev == net->loopback_dev); if (dev_match) { - ret = FIB_RES_NHC(res)->nhc_scope >= RT_SCOPE_LINK; + ret = FIB_RES_NHC(res)->nhc_scope >= RT_SCOPE_HOST; return ret; } if (no_addr) @@ -401,7 +401,7 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, ret = 0; if (fib_lookup(net, &fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE) == 0) { if (res.type == RTN_UNICAST) - ret = FIB_RES_NHC(res)->nhc_scope >= RT_SCOPE_LINK; + ret = FIB_RES_NHC(res)->nhc_scope >= RT_SCOPE_HOST; } return ret; diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 2dc97583d279..f721c308248b 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -888,13 +888,13 @@ int fib_nh_match(struct net *net, struct fib_config *cfg, struct fib_info *fi, return 1; } + /* cannot match on nexthop object attributes */ + if (fi->nh) + return 1; + if (cfg->fc_oif || cfg->fc_gw_family) { struct fib_nh *nh; - /* cannot match on nexthop object attributes */ - if (fi->nh) - return 1; - nh = fib_info_nh(fi, 0); if (cfg->fc_encap) { if (fib_encap_match(net, cfg->fc_encap_type, @@ -1231,7 +1231,7 @@ static int fib_check_nh_nongw(struct net *net, struct fib_nh *nh, nh->fib_nh_dev = in_dev->dev; netdev_hold(nh->fib_nh_dev, &nh->fib_nh_dev_tracker, GFP_ATOMIC); - nh->fib_nh_scope = RT_SCOPE_LINK; + nh->fib_nh_scope = RT_SCOPE_HOST; if (!netif_carrier_ok(nh->fib_nh_dev)) nh->fib_nh_flags |= RTNH_F_LINKDOWN; err = 0; diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c index e3ab0cb61624..81be3e0f0e70 100644 --- a/net/ipv4/igmp.c +++ b/net/ipv4/igmp.c @@ -213,7 +213,7 @@ static void igmp_stop_timer(struct ip_mc_list *im) /* It must be called with locked im->lock */ static void igmp_start_timer(struct ip_mc_list *im, int max_delay) { - int tv = prandom_u32() % max_delay; + int tv = prandom_u32_max(max_delay); im->tm_running = 1; if (!mod_timer(&im->timer, jiffies+tv+2)) @@ -222,7 +222,7 @@ static void igmp_start_timer(struct ip_mc_list *im, int max_delay) static void igmp_gq_start_timer(struct in_device *in_dev) { - int tv = prandom_u32() % in_dev->mr_maxdelay; + int tv = prandom_u32_max(in_dev->mr_maxdelay); unsigned long exp = jiffies + tv + 2; if (in_dev->mr_gq_running && @@ -236,7 +236,7 @@ static void igmp_gq_start_timer(struct in_device *in_dev) static void igmp_ifc_start_timer(struct in_device *in_dev, int delay) { - int tv = prandom_u32() % delay; + int tv = prandom_u32_max(delay); if (!mod_timer(&in_dev->mr_ifc_timer, jiffies+tv+2)) in_dev_hold(in_dev); @@ -2529,11 +2529,10 @@ done: err = ip_mc_leave_group(sk, &imr); return err; } - int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf, - struct ip_msfilter __user *optval, int __user *optlen) + sockptr_t optval, sockptr_t optlen) { - int err, len, count, copycount; + int err, len, count, copycount, msf_size; struct ip_mreqn imr; __be32 addr = msf->imsf_multiaddr; struct ip_mc_socklist *pmc; @@ -2575,12 +2574,15 @@ int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf, copycount = count < msf->imsf_numsrc ? count : msf->imsf_numsrc; len = flex_array_size(psl, sl_addr, copycount); msf->imsf_numsrc = count; - if (put_user(IP_MSFILTER_SIZE(copycount), optlen) || - copy_to_user(optval, msf, IP_MSFILTER_SIZE(0))) { + msf_size = IP_MSFILTER_SIZE(copycount); + if (copy_to_sockptr(optlen, &msf_size, sizeof(int)) || + copy_to_sockptr(optval, msf, IP_MSFILTER_SIZE(0))) { return -EFAULT; } if (len && - copy_to_user(&optval->imsf_slist_flex[0], psl->sl_addr, len)) + copy_to_sockptr_offset(optval, + offsetof(struct ip_msfilter, imsf_slist_flex), + psl->sl_addr, len)) return -EFAULT; return 0; done: @@ -2588,7 +2590,7 @@ done: } int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf, - struct sockaddr_storage __user *p) + sockptr_t optval, size_t ss_offset) { int i, count, copycount; struct sockaddr_in *psin; @@ -2618,15 +2620,17 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf, count = psl ? psl->sl_count : 0; copycount = count < gsf->gf_numsrc ? count : gsf->gf_numsrc; gsf->gf_numsrc = count; - for (i = 0; i < copycount; i++, p++) { + for (i = 0; i < copycount; i++) { struct sockaddr_storage ss; psin = (struct sockaddr_in *)&ss; memset(&ss, 0, sizeof(ss)); psin->sin_family = AF_INET; psin->sin_addr.s_addr = psl->sl_addr[i]; - if (copy_to_user(p, &ss, sizeof(ss))) + if (copy_to_sockptr_offset(optval, ss_offset, + &ss, sizeof(ss))) return -EFAULT; + ss_offset += sizeof(ss); } return 0; } diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c index f0038043b661..4e84ed21d16f 100644 --- a/net/ipv4/inet_connection_sock.c +++ b/net/ipv4/inet_connection_sock.c @@ -285,16 +285,14 @@ inet_csk_find_open_port(const struct sock *sk, struct inet_bind_bucket **tb_ret, struct inet_bind2_bucket **tb2_ret, struct inet_bind_hashbucket **head2_ret, int *port_ret) { - struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo; - int port = 0; + struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk); + int i, low, high, attempt_half, port, l3mdev; struct inet_bind_hashbucket *head, *head2; struct net *net = sock_net(sk); - bool relax = false; - int i, low, high, attempt_half; struct inet_bind2_bucket *tb2; struct inet_bind_bucket *tb; u32 remaining, offset; - int l3mdev; + bool relax = false; l3mdev = inet_sk_bound_l3mdev(sk); ports_exhausted: @@ -316,7 +314,7 @@ other_half_scan: if (likely(remaining > 1)) remaining &= ~1U; - offset = prandom_u32() % remaining; + offset = prandom_u32_max(remaining); /* __inet_hash_connect() favors ports having @low parity * We do the opposite to not pollute connect() users. */ @@ -469,17 +467,16 @@ void inet_csk_update_fastreuse(struct inet_bind_bucket *tb, */ int inet_csk_get_port(struct sock *sk, unsigned short snum) { + struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk); bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN; - struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo; - int ret = 1, port = snum; - struct net *net = sock_net(sk); bool found_port = false, check_bind_conflict = true; bool bhash_created = false, bhash2_created = false; struct inet_bind_hashbucket *head, *head2; struct inet_bind2_bucket *tb2 = NULL; struct inet_bind_bucket *tb = NULL; bool head2_lock_acquired = false; - int l3mdev; + int ret = 1, port = snum, l3mdev; + struct net *net = sock_net(sk); l3mdev = inet_sk_bound_l3mdev(sk); @@ -909,14 +906,15 @@ static void reqsk_migrate_reset(struct request_sock *req) /* return true if req was found in the ehash table */ static bool reqsk_queue_unlink(struct request_sock *req) { - struct inet_hashinfo *hashinfo = req_to_sk(req)->sk_prot->h.hashinfo; + struct sock *sk = req_to_sk(req); bool found = false; - if (sk_hashed(req_to_sk(req))) { + if (sk_hashed(sk)) { + struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk); spinlock_t *lock = inet_ehash_lockp(hashinfo, req->rsk_hash); spin_lock(lock); - found = __sk_nulls_del_node_init_rcu(req_to_sk(req)); + found = __sk_nulls_del_node_init_rcu(sk); spin_unlock(lock); } if (timer_pending(&req->rsk_timer) && del_timer_sync(&req->rsk_timer)) diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c index 60d77e234a68..d3dc28156622 100644 --- a/net/ipv4/inet_hashtables.c +++ b/net/ipv4/inet_hashtables.c @@ -109,6 +109,7 @@ static void inet_bind2_bucket_init(struct inet_bind2_bucket *tb, tb->l3mdev = l3mdev; tb->port = port; #if IS_ENABLED(CONFIG_IPV6) + tb->family = sk->sk_family; if (sk->sk_family == AF_INET6) tb->v6_rcv_saddr = sk->sk_v6_rcv_saddr; else @@ -146,6 +147,9 @@ static bool inet_bind2_bucket_addr_match(const struct inet_bind2_bucket *tb2, const struct sock *sk) { #if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family != tb2->family) + return false; + if (sk->sk_family == AF_INET6) return ipv6_addr_equal(&tb2->v6_rcv_saddr, &sk->sk_v6_rcv_saddr); @@ -168,14 +172,15 @@ void inet_bind_hash(struct sock *sk, struct inet_bind_bucket *tb, */ static void __inet_put_port(struct sock *sk) { - struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; - const int bhash = inet_bhashfn(sock_net(sk), inet_sk(sk)->inet_num, - hashinfo->bhash_size); - struct inet_bind_hashbucket *head = &hashinfo->bhash[bhash]; - struct inet_bind_hashbucket *head2 = - inet_bhashfn_portaddr(hashinfo, sk, sock_net(sk), - inet_sk(sk)->inet_num); + struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk); + struct inet_bind_hashbucket *head, *head2; + struct net *net = sock_net(sk); struct inet_bind_bucket *tb; + int bhash; + + bhash = inet_bhashfn(net, inet_sk(sk)->inet_num, hashinfo->bhash_size); + head = &hashinfo->bhash[bhash]; + head2 = inet_bhashfn_portaddr(hashinfo, sk, net, inet_sk(sk)->inet_num); spin_lock(&head->lock); tb = inet_csk(sk)->icsk_bind_hash; @@ -207,19 +212,19 @@ EXPORT_SYMBOL(inet_put_port); int __inet_inherit_port(const struct sock *sk, struct sock *child) { - struct inet_hashinfo *table = sk->sk_prot->h.hashinfo; + struct inet_hashinfo *table = tcp_or_dccp_get_hashinfo(sk); unsigned short port = inet_sk(child)->inet_num; - const int bhash = inet_bhashfn(sock_net(sk), port, - table->bhash_size); - struct inet_bind_hashbucket *head = &table->bhash[bhash]; - struct inet_bind_hashbucket *head2 = - inet_bhashfn_portaddr(table, child, sock_net(sk), port); + struct inet_bind_hashbucket *head, *head2; bool created_inet_bind_bucket = false; - bool update_fastreuse = false; struct net *net = sock_net(sk); + bool update_fastreuse = false; struct inet_bind2_bucket *tb2; struct inet_bind_bucket *tb; - int l3mdev; + int bhash, l3mdev; + + bhash = inet_bhashfn(net, port, table->bhash_size); + head = &table->bhash[bhash]; + head2 = inet_bhashfn_portaddr(table, child, net, port); spin_lock(&head->lock); spin_lock(&head2->lock); @@ -385,7 +390,7 @@ static inline struct sock *inet_lookup_run_bpf(struct net *net, struct sock *sk, *reuse_sk; bool no_reuseport; - if (hashinfo != &tcp_hashinfo) + if (hashinfo != net->ipv4.tcp_death_row.hashinfo) return NULL; /* only TCP is supported */ no_reuseport = bpf_sk_lookup_run_v4(net, IPPROTO_TCP, saddr, sport, @@ -628,9 +633,9 @@ static bool inet_ehash_lookup_by_sk(struct sock *sk, */ bool inet_ehash_insert(struct sock *sk, struct sock *osk, bool *found_dup_sk) { - struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; - struct hlist_nulls_head *list; + struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk); struct inet_ehash_bucket *head; + struct hlist_nulls_head *list; spinlock_t *lock; bool ret = true; @@ -700,7 +705,7 @@ static int inet_reuseport_add_sock(struct sock *sk, int __inet_hash(struct sock *sk, struct sock *osk) { - struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; + struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk); struct inet_listen_hashbucket *ilb2; int err = 0; @@ -746,7 +751,7 @@ EXPORT_SYMBOL_GPL(inet_hash); void inet_unhash(struct sock *sk) { - struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo; + struct inet_hashinfo *hashinfo = tcp_or_dccp_get_hashinfo(sk); if (sk_unhashed(sk)) return; @@ -790,6 +795,9 @@ static bool inet_bind2_bucket_match(const struct inet_bind2_bucket *tb, int l3mdev, const struct sock *sk) { #if IS_ENABLED(CONFIG_IPV6) + if (sk->sk_family != tb->family) + return false; + if (sk->sk_family == AF_INET6) return net_eq(ib2_net(tb), net) && tb->port == port && tb->l3mdev == l3mdev && @@ -806,6 +814,9 @@ bool inet_bind2_bucket_match_addr_any(const struct inet_bind2_bucket *tb, const #if IS_ENABLED(CONFIG_IPV6) struct in6_addr addr_any = {}; + if (sk->sk_family != tb->family) + return false; + if (sk->sk_family == AF_INET6) return net_eq(ib2_net(tb), net) && tb->port == port && tb->l3mdev == l3mdev && @@ -833,7 +844,7 @@ inet_bind2_bucket_find(const struct inet_bind_hashbucket *head, const struct net struct inet_bind_hashbucket * inet_bhash2_addr_any_hashbucket(const struct sock *sk, const struct net *net, int port) { - struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo; + struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk); u32 hash; #if IS_ENABLED(CONFIG_IPV6) struct in6_addr addr_any = {}; @@ -849,7 +860,7 @@ inet_bhash2_addr_any_hashbucket(const struct sock *sk, const struct net *net, in int inet_bhash2_update_saddr(struct inet_bind_hashbucket *prev_saddr, struct sock *sk) { - struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo; + struct inet_hashinfo *hinfo = tcp_or_dccp_get_hashinfo(sk); struct inet_bind2_bucket *tb2, *new_tb2; int l3mdev = inet_sk_bound_l3mdev(sk); struct inet_bind_hashbucket *head2; @@ -947,8 +958,8 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row, if (likely(remaining > 1)) remaining &= ~1U; - net_get_random_once(table_perturb, - INET_TABLE_PERTURB_SIZE * sizeof(*table_perturb)); + get_random_sleepable_once(table_perturb, + INET_TABLE_PERTURB_SIZE * sizeof(*table_perturb)); index = port_offset & (INET_TABLE_PERTURB_SIZE - 1); offset = READ_ONCE(table_perturb[index]) + (port_offset >> 32); @@ -1026,7 +1037,7 @@ ok: * on low contention the randomness is maximal and on high contention * it may be inexistent. */ - i = max_t(int, i, (prandom_u32() & 7) * 2); + i = max_t(int, i, prandom_u32_max(8) * 2); WRITE_ONCE(table_perturb[index], READ_ONCE(table_perturb[index]) + i + 2); /* Head lock still held and bh's disabled */ @@ -1144,3 +1155,50 @@ int inet_ehash_locks_alloc(struct inet_hashinfo *hashinfo) return 0; } EXPORT_SYMBOL_GPL(inet_ehash_locks_alloc); + +struct inet_hashinfo *inet_pernet_hashinfo_alloc(struct inet_hashinfo *hashinfo, + unsigned int ehash_entries) +{ + struct inet_hashinfo *new_hashinfo; + int i; + + new_hashinfo = kmemdup(hashinfo, sizeof(*hashinfo), GFP_KERNEL); + if (!new_hashinfo) + goto err; + + new_hashinfo->ehash = vmalloc_huge(ehash_entries * sizeof(struct inet_ehash_bucket), + GFP_KERNEL_ACCOUNT); + if (!new_hashinfo->ehash) + goto free_hashinfo; + + new_hashinfo->ehash_mask = ehash_entries - 1; + + if (inet_ehash_locks_alloc(new_hashinfo)) + goto free_ehash; + + for (i = 0; i < ehash_entries; i++) + INIT_HLIST_NULLS_HEAD(&new_hashinfo->ehash[i].chain, i); + + new_hashinfo->pernet = true; + + return new_hashinfo; + +free_ehash: + vfree(new_hashinfo->ehash); +free_hashinfo: + kfree(new_hashinfo); +err: + return NULL; +} +EXPORT_SYMBOL_GPL(inet_pernet_hashinfo_alloc); + +void inet_pernet_hashinfo_free(struct inet_hashinfo *hashinfo) +{ + if (!hashinfo->pernet) + return; + + inet_ehash_locks_free(hashinfo); + vfree(hashinfo->ehash); + kfree(hashinfo); +} +EXPORT_SYMBOL_GPL(inet_pernet_hashinfo_free); diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c index 47ccc343c9fb..66fc940f9521 100644 --- a/net/ipv4/inet_timewait_sock.c +++ b/net/ipv4/inet_timewait_sock.c @@ -59,9 +59,7 @@ static void inet_twsk_kill(struct inet_timewait_sock *tw) inet_twsk_bind_unhash(tw, hashinfo); spin_unlock(&bhead->lock); - if (refcount_dec_and_test(&tw->tw_dr->tw_refcount)) - kfree(tw->tw_dr); - + refcount_dec(&tw->tw_dr->tw_refcount); inet_twsk_put(tw); } @@ -270,8 +268,21 @@ restart_rcu: rcu_read_lock(); restart: sk_nulls_for_each_rcu(sk, node, &head->chain) { - if (sk->sk_state != TCP_TIME_WAIT) + if (sk->sk_state != TCP_TIME_WAIT) { + /* A kernel listener socket might not hold refcnt for net, + * so reqsk_timer_handler() could be fired after net is + * freed. Userspace listener and reqsk never exist here. + */ + if (unlikely(sk->sk_state == TCP_NEW_SYN_RECV && + hashinfo->pernet)) { + struct request_sock *req = inet_reqsk(sk); + + inet_csk_reqsk_queue_drop_and_put(req->rsk_listener, req); + } + continue; + } + tw = inet_twsk(sk); if ((tw->tw_family != family) || refcount_read(&twsk_net(tw)->ns.count)) diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c index 8201cd423ff9..922c87ef1ab5 100644 --- a/net/ipv4/ip_output.c +++ b/net/ipv4/ip_output.c @@ -172,7 +172,7 @@ int ip_build_and_send_pkt(struct sk_buff *skb, const struct sock *sk, * Avoid using the hashed IP ident generator. */ if (sk->sk_protocol == IPPROTO_TCP) - iph->id = (__force __be16)prandom_u32(); + iph->id = (__force __be16)get_random_u16(); else __ip_select_ident(net, iph, 1); } @@ -1043,7 +1043,7 @@ static int __ip_append_data(struct sock *sk, paged = true; zc = true; } else { - uarg->zerocopy = 0; + uarg_to_msgzc(uarg)->zerocopy = 0; skb_zcopy_set(skb, uarg, &extra_uref); } } diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c index e49a61a053a6..5f16807d3235 100644 --- a/net/ipv4/ip_sockglue.c +++ b/net/ipv4/ip_sockglue.c @@ -267,7 +267,7 @@ int ip_cmsg_send(struct sock *sk, struct msghdr *msg, struct ipcm_cookie *ipc, } #endif if (cmsg->cmsg_level == SOL_SOCKET) { - err = __sock_cmsg_send(sk, msg, cmsg, &ipc->sockc); + err = __sock_cmsg_send(sk, cmsg, &ipc->sockc); if (err) return err; continue; @@ -888,8 +888,8 @@ static int compat_ip_mcast_join_leave(struct sock *sk, int optname, DEFINE_STATIC_KEY_FALSE(ip4_min_ttl); -static int do_ip_setsockopt(struct sock *sk, int level, int optname, - sockptr_t optval, unsigned int optlen) +int do_ip_setsockopt(struct sock *sk, int level, int optname, + sockptr_t optval, unsigned int optlen) { struct inet_sock *inet = inet_sk(sk); struct net *net = sock_net(sk); @@ -944,7 +944,7 @@ static int do_ip_setsockopt(struct sock *sk, int level, int optname, err = 0; if (needs_rtnl) rtnl_lock(); - lock_sock(sk); + sockopt_lock_sock(sk); switch (optname) { case IP_OPTIONS: @@ -1333,14 +1333,14 @@ static int do_ip_setsockopt(struct sock *sk, int level, int optname, case IP_IPSEC_POLICY: case IP_XFRM_POLICY: err = -EPERM; - if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) + if (!sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) break; err = xfrm_user_policy(sk, optname, optval, optlen); break; case IP_TRANSPARENT: - if (!!val && !ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && - !ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { + if (!!val && !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_RAW) && + !sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN)) { err = -EPERM; break; } @@ -1368,13 +1368,13 @@ static int do_ip_setsockopt(struct sock *sk, int level, int optname, err = -ENOPROTOOPT; break; } - release_sock(sk); + sockopt_release_sock(sk); if (needs_rtnl) rtnl_unlock(); return err; e_inval: - release_sock(sk); + sockopt_release_sock(sk); if (needs_rtnl) rtnl_unlock(); return -EINVAL; @@ -1462,37 +1462,37 @@ static bool getsockopt_needs_rtnl(int optname) return false; } -static int ip_get_mcast_msfilter(struct sock *sk, void __user *optval, - int __user *optlen, int len) +static int ip_get_mcast_msfilter(struct sock *sk, sockptr_t optval, + sockptr_t optlen, int len) { const int size0 = offsetof(struct group_filter, gf_slist_flex); - struct group_filter __user *p = optval; struct group_filter gsf; - int num; + int num, gsf_size; int err; if (len < size0) return -EINVAL; - if (copy_from_user(&gsf, p, size0)) + if (copy_from_sockptr(&gsf, optval, size0)) return -EFAULT; num = gsf.gf_numsrc; - err = ip_mc_gsfget(sk, &gsf, p->gf_slist_flex); + err = ip_mc_gsfget(sk, &gsf, optval, + offsetof(struct group_filter, gf_slist_flex)); if (err) return err; if (gsf.gf_numsrc < num) num = gsf.gf_numsrc; - if (put_user(GROUP_FILTER_SIZE(num), optlen) || - copy_to_user(p, &gsf, size0)) + gsf_size = GROUP_FILTER_SIZE(num); + if (copy_to_sockptr(optlen, &gsf_size, sizeof(int)) || + copy_to_sockptr(optval, &gsf, size0)) return -EFAULT; return 0; } -static int compat_ip_get_mcast_msfilter(struct sock *sk, void __user *optval, - int __user *optlen, int len) +static int compat_ip_get_mcast_msfilter(struct sock *sk, sockptr_t optval, + sockptr_t optlen, int len) { const int size0 = offsetof(struct compat_group_filter, gf_slist_flex); - struct compat_group_filter __user *p = optval; struct compat_group_filter gf32; struct group_filter gf; int num; @@ -1500,7 +1500,7 @@ static int compat_ip_get_mcast_msfilter(struct sock *sk, void __user *optval, if (len < size0) return -EINVAL; - if (copy_from_user(&gf32, p, size0)) + if (copy_from_sockptr(&gf32, optval, size0)) return -EFAULT; gf.gf_interface = gf32.gf_interface; @@ -1508,21 +1508,24 @@ static int compat_ip_get_mcast_msfilter(struct sock *sk, void __user *optval, num = gf.gf_numsrc = gf32.gf_numsrc; gf.gf_group = gf32.gf_group; - err = ip_mc_gsfget(sk, &gf, p->gf_slist_flex); + err = ip_mc_gsfget(sk, &gf, optval, + offsetof(struct compat_group_filter, gf_slist_flex)); if (err) return err; if (gf.gf_numsrc < num) num = gf.gf_numsrc; len = GROUP_FILTER_SIZE(num) - (sizeof(gf) - sizeof(gf32)); - if (put_user(len, optlen) || - put_user(gf.gf_fmode, &p->gf_fmode) || - put_user(gf.gf_numsrc, &p->gf_numsrc)) + if (copy_to_sockptr(optlen, &len, sizeof(int)) || + copy_to_sockptr_offset(optval, offsetof(struct compat_group_filter, gf_fmode), + &gf.gf_fmode, sizeof(gf.gf_fmode)) || + copy_to_sockptr_offset(optval, offsetof(struct compat_group_filter, gf_numsrc), + &gf.gf_numsrc, sizeof(gf.gf_numsrc))) return -EFAULT; return 0; } -static int do_ip_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *optlen) +int do_ip_getsockopt(struct sock *sk, int level, int optname, + sockptr_t optval, sockptr_t optlen) { struct inet_sock *inet = inet_sk(sk); bool needs_rtnl = getsockopt_needs_rtnl(optname); @@ -1535,14 +1538,14 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, if (ip_mroute_opt(optname)) return ip_mroute_getsockopt(sk, optname, optval, optlen); - if (get_user(len, optlen)) + if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; if (len < 0) return -EINVAL; if (needs_rtnl) rtnl_lock(); - lock_sock(sk); + sockopt_lock_sock(sk); switch (optname) { case IP_OPTIONS: @@ -1558,17 +1561,19 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, memcpy(optbuf, &inet_opt->opt, sizeof(struct ip_options) + inet_opt->opt.optlen); - release_sock(sk); + sockopt_release_sock(sk); - if (opt->optlen == 0) - return put_user(0, optlen); + if (opt->optlen == 0) { + len = 0; + return copy_to_sockptr(optlen, &len, sizeof(int)); + } ip_options_undo(opt); len = min_t(unsigned int, len, opt->optlen); - if (put_user(len, optlen)) + if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; - if (copy_to_user(optval, opt->__data, len)) + if (copy_to_sockptr(optval, opt->__data, len)) return -EFAULT; return 0; } @@ -1632,7 +1637,7 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, dst_release(dst); } if (!val) { - release_sock(sk); + sockopt_release_sock(sk); return -ENOTCONN; } break; @@ -1657,11 +1662,11 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, struct in_addr addr; len = min_t(unsigned int, len, sizeof(struct in_addr)); addr.s_addr = inet->mc_addr; - release_sock(sk); + sockopt_release_sock(sk); - if (put_user(len, optlen)) + if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; - if (copy_to_user(optval, &addr, len)) + if (copy_to_sockptr(optval, &addr, len)) return -EFAULT; return 0; } @@ -1673,12 +1678,11 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, err = -EINVAL; goto out; } - if (copy_from_user(&msf, optval, IP_MSFILTER_SIZE(0))) { + if (copy_from_sockptr(&msf, optval, IP_MSFILTER_SIZE(0))) { err = -EFAULT; goto out; } - err = ip_mc_msfget(sk, &msf, - (struct ip_msfilter __user *)optval, optlen); + err = ip_mc_msfget(sk, &msf, optval, optlen); goto out; } case MCAST_MSFILTER: @@ -1695,13 +1699,18 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, { struct msghdr msg; - release_sock(sk); + sockopt_release_sock(sk); if (sk->sk_type != SOCK_STREAM) return -ENOPROTOOPT; - msg.msg_control_is_user = true; - msg.msg_control_user = optval; + if (optval.is_kernel) { + msg.msg_control_is_user = false; + msg.msg_control = optval.kernel; + } else { + msg.msg_control_is_user = true; + msg.msg_control_user = optval.user; + } msg.msg_controllen = len; msg.msg_flags = in_compat_syscall() ? MSG_CMSG_COMPAT : 0; @@ -1722,7 +1731,7 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, put_cmsg(&msg, SOL_IP, IP_TOS, sizeof(tos), &tos); } len -= msg.msg_controllen; - return put_user(len, optlen); + return copy_to_sockptr(optlen, &len, sizeof(int)); } case IP_FREEBIND: val = inet->freebind; @@ -1734,29 +1743,29 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname, val = inet->min_ttl; break; default: - release_sock(sk); + sockopt_release_sock(sk); return -ENOPROTOOPT; } - release_sock(sk); + sockopt_release_sock(sk); if (len < sizeof(int) && len > 0 && val >= 0 && val <= 255) { unsigned char ucval = (unsigned char)val; len = 1; - if (put_user(len, optlen)) + if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; - if (copy_to_user(optval, &ucval, 1)) + if (copy_to_sockptr(optval, &ucval, 1)) return -EFAULT; } else { len = min_t(unsigned int, sizeof(int), len); - if (put_user(len, optlen)) + if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; - if (copy_to_user(optval, &val, len)) + if (copy_to_sockptr(optval, &val, len)) return -EFAULT; } return 0; out: - release_sock(sk); + sockopt_release_sock(sk); if (needs_rtnl) rtnl_unlock(); return err; @@ -1767,7 +1776,8 @@ int ip_getsockopt(struct sock *sk, int level, { int err; - err = do_ip_getsockopt(sk, level, optname, optval, optlen); + err = do_ip_getsockopt(sk, level, optname, + USER_SOCKPTR(optval), USER_SOCKPTR(optlen)); #if IS_ENABLED(CONFIG_BPFILTER_UMH) if (optname >= BPFILTER_IPT_SO_GET_INFO && diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c index cc1caab4a654..92c02c886fe7 100644 --- a/net/ipv4/ip_tunnel_core.c +++ b/net/ipv4/ip_tunnel_core.c @@ -1079,3 +1079,70 @@ EXPORT_SYMBOL(ip_tunnel_parse_protocol); const struct header_ops ip_tunnel_header_ops = { .parse_protocol = ip_tunnel_parse_protocol }; EXPORT_SYMBOL(ip_tunnel_header_ops); + +/* This function returns true when ENCAP attributes are present in the nl msg */ +bool ip_tunnel_netlink_encap_parms(struct nlattr *data[], + struct ip_tunnel_encap *encap) +{ + bool ret = false; + + memset(encap, 0, sizeof(*encap)); + + if (!data) + return ret; + + if (data[IFLA_IPTUN_ENCAP_TYPE]) { + ret = true; + encap->type = nla_get_u16(data[IFLA_IPTUN_ENCAP_TYPE]); + } + + if (data[IFLA_IPTUN_ENCAP_FLAGS]) { + ret = true; + encap->flags = nla_get_u16(data[IFLA_IPTUN_ENCAP_FLAGS]); + } + + if (data[IFLA_IPTUN_ENCAP_SPORT]) { + ret = true; + encap->sport = nla_get_be16(data[IFLA_IPTUN_ENCAP_SPORT]); + } + + if (data[IFLA_IPTUN_ENCAP_DPORT]) { + ret = true; + encap->dport = nla_get_be16(data[IFLA_IPTUN_ENCAP_DPORT]); + } + + return ret; +} +EXPORT_SYMBOL_GPL(ip_tunnel_netlink_encap_parms); + +void ip_tunnel_netlink_parms(struct nlattr *data[], + struct ip_tunnel_parm *parms) +{ + if (data[IFLA_IPTUN_LINK]) + parms->link = nla_get_u32(data[IFLA_IPTUN_LINK]); + + if (data[IFLA_IPTUN_LOCAL]) + parms->iph.saddr = nla_get_be32(data[IFLA_IPTUN_LOCAL]); + + if (data[IFLA_IPTUN_REMOTE]) + parms->iph.daddr = nla_get_be32(data[IFLA_IPTUN_REMOTE]); + + if (data[IFLA_IPTUN_TTL]) { + parms->iph.ttl = nla_get_u8(data[IFLA_IPTUN_TTL]); + if (parms->iph.ttl) + parms->iph.frag_off = htons(IP_DF); + } + + if (data[IFLA_IPTUN_TOS]) + parms->iph.tos = nla_get_u8(data[IFLA_IPTUN_TOS]); + + if (!data[IFLA_IPTUN_PMTUDISC] || nla_get_u8(data[IFLA_IPTUN_PMTUDISC])) + parms->iph.frag_off = htons(IP_DF); + + if (data[IFLA_IPTUN_FLAGS]) + parms->i_flags = nla_get_be16(data[IFLA_IPTUN_FLAGS]); + + if (data[IFLA_IPTUN_PROTO]) + parms->iph.protocol = nla_get_u8(data[IFLA_IPTUN_PROTO]); +} +EXPORT_SYMBOL_GPL(ip_tunnel_netlink_parms); diff --git a/net/ipv4/ipcomp.c b/net/ipv4/ipcomp.c index 366094c1ce6c..5a4fb2539b08 100644 --- a/net/ipv4/ipcomp.c +++ b/net/ipv4/ipcomp.c @@ -117,7 +117,8 @@ out: return err; } -static int ipcomp4_init_state(struct xfrm_state *x) +static int ipcomp4_init_state(struct xfrm_state *x, + struct netlink_ext_ack *extack) { int err = -EINVAL; @@ -129,17 +130,20 @@ static int ipcomp4_init_state(struct xfrm_state *x) x->props.header_len += sizeof(struct iphdr); break; default: + NL_SET_ERR_MSG(extack, "Unsupported XFRM mode for IPcomp"); goto out; } - err = ipcomp_init_state(x); + err = ipcomp_init_state(x, extack); if (err) goto out; if (x->props.mode == XFRM_MODE_TUNNEL) { err = ipcomp_tunnel_attach(x); - if (err) + if (err) { + NL_SET_ERR_MSG(extack, "Kernel error: failed to initialize the associated state"); goto out; + } } err = 0; diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c index 123ea63a04cb..180f9daf5bec 100644 --- a/net/ipv4/ipip.c +++ b/net/ipv4/ipip.c @@ -417,29 +417,7 @@ static void ipip_netlink_parms(struct nlattr *data[], if (!data) return; - if (data[IFLA_IPTUN_LINK]) - parms->link = nla_get_u32(data[IFLA_IPTUN_LINK]); - - if (data[IFLA_IPTUN_LOCAL]) - parms->iph.saddr = nla_get_in_addr(data[IFLA_IPTUN_LOCAL]); - - if (data[IFLA_IPTUN_REMOTE]) - parms->iph.daddr = nla_get_in_addr(data[IFLA_IPTUN_REMOTE]); - - if (data[IFLA_IPTUN_TTL]) { - parms->iph.ttl = nla_get_u8(data[IFLA_IPTUN_TTL]); - if (parms->iph.ttl) - parms->iph.frag_off = htons(IP_DF); - } - - if (data[IFLA_IPTUN_TOS]) - parms->iph.tos = nla_get_u8(data[IFLA_IPTUN_TOS]); - - if (data[IFLA_IPTUN_PROTO]) - parms->iph.protocol = nla_get_u8(data[IFLA_IPTUN_PROTO]); - - if (!data[IFLA_IPTUN_PMTUDISC] || nla_get_u8(data[IFLA_IPTUN_PMTUDISC])) - parms->iph.frag_off = htons(IP_DF); + ip_tunnel_netlink_parms(data, parms); if (data[IFLA_IPTUN_COLLECT_METADATA]) *collect_md = true; @@ -448,40 +426,6 @@ static void ipip_netlink_parms(struct nlattr *data[], *fwmark = nla_get_u32(data[IFLA_IPTUN_FWMARK]); } -/* This function returns true when ENCAP attributes are present in the nl msg */ -static bool ipip_netlink_encap_parms(struct nlattr *data[], - struct ip_tunnel_encap *ipencap) -{ - bool ret = false; - - memset(ipencap, 0, sizeof(*ipencap)); - - if (!data) - return ret; - - if (data[IFLA_IPTUN_ENCAP_TYPE]) { - ret = true; - ipencap->type = nla_get_u16(data[IFLA_IPTUN_ENCAP_TYPE]); - } - - if (data[IFLA_IPTUN_ENCAP_FLAGS]) { - ret = true; - ipencap->flags = nla_get_u16(data[IFLA_IPTUN_ENCAP_FLAGS]); - } - - if (data[IFLA_IPTUN_ENCAP_SPORT]) { - ret = true; - ipencap->sport = nla_get_be16(data[IFLA_IPTUN_ENCAP_SPORT]); - } - - if (data[IFLA_IPTUN_ENCAP_DPORT]) { - ret = true; - ipencap->dport = nla_get_be16(data[IFLA_IPTUN_ENCAP_DPORT]); - } - - return ret; -} - static int ipip_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) @@ -491,7 +435,7 @@ static int ipip_newlink(struct net *src_net, struct net_device *dev, struct ip_tunnel_encap ipencap; __u32 fwmark = 0; - if (ipip_netlink_encap_parms(data, &ipencap)) { + if (ip_tunnel_netlink_encap_parms(data, &ipencap)) { int err = ip_tunnel_encap_setup(t, &ipencap); if (err < 0) @@ -512,7 +456,7 @@ static int ipip_changelink(struct net_device *dev, struct nlattr *tb[], bool collect_md; __u32 fwmark = t->fwmark; - if (ipip_netlink_encap_parms(data, &ipencap)) { + if (ip_tunnel_netlink_encap_parms(data, &ipencap)) { int err = ip_tunnel_encap_setup(t, &ipencap); if (err < 0) diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c index 73651d17e51f..e04544ac4b45 100644 --- a/net/ipv4/ipmr.c +++ b/net/ipv4/ipmr.c @@ -1004,7 +1004,9 @@ static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt, rtnl_unicast(skb, net, NETLINK_CB(skb).portid); } else { + rcu_read_lock(); ip_mr_forward(net, mrt, skb->dev, skb, c, 0); + rcu_read_unlock(); } } } @@ -1546,7 +1548,8 @@ out: } /* Getsock opt support for the multicast routing system. */ -int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int __user *optlen) +int ip_mroute_getsockopt(struct sock *sk, int optname, sockptr_t optval, + sockptr_t optlen) { int olr; int val; @@ -1577,14 +1580,14 @@ int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int return -ENOPROTOOPT; } - if (get_user(olr, optlen)) + if (copy_from_sockptr(&olr, optlen, sizeof(int))) return -EFAULT; olr = min_t(unsigned int, olr, sizeof(int)); if (olr < 0) return -EINVAL; - if (put_user(olr, optlen)) + if (copy_to_sockptr(optlen, &olr, sizeof(int))) return -EFAULT; - if (copy_to_user(optval, &val, olr)) + if (copy_to_sockptr(optval, &val, olr)) return -EFAULT; return 0; } diff --git a/net/ipv4/netfilter/ipt_rpfilter.c b/net/ipv4/netfilter/ipt_rpfilter.c index 8cd3224d913e..ded5bef02f77 100644 --- a/net/ipv4/netfilter/ipt_rpfilter.c +++ b/net/ipv4/netfilter/ipt_rpfilter.c @@ -33,7 +33,6 @@ static bool rpfilter_lookup_reverse(struct net *net, struct flowi4 *fl4, const struct net_device *dev, u8 flags) { struct fib_result res; - int ret __maybe_unused; if (fib_lookup(net, fl4, &res, FIB_LOOKUP_IGNORE_LINKSTATE)) return false; @@ -78,7 +77,8 @@ static bool rpfilter_mt(const struct sk_buff *skb, struct xt_action_param *par) flow.flowi4_mark = info->flags & XT_RPFILTER_VALID_MARK ? skb->mark : 0; flow.flowi4_tos = iph->tos & IPTOS_RT_MASK; flow.flowi4_scope = RT_SCOPE_UNIVERSE; - flow.flowi4_oif = l3mdev_master_ifindex_rcu(xt_in(par)); + flow.flowi4_l3mdev = l3mdev_master_ifindex_rcu(xt_in(par)); + flow.flowi4_uid = sock_net_uid(xt_net(par), NULL); return rpfilter_lookup_reverse(xt_net(par), &flow, xt_in(par), info->flags) ^ invert; } diff --git a/net/ipv4/netfilter/nf_nat_h323.c b/net/ipv4/netfilter/nf_nat_h323.c index a334f0dcc2d0..faee20af4856 100644 --- a/net/ipv4/netfilter/nf_nat_h323.c +++ b/net/ipv4/netfilter/nf_nat_h323.c @@ -291,20 +291,7 @@ static int nat_t120(struct sk_buff *skb, struct nf_conn *ct, exp->expectfn = nf_nat_follow_master; exp->dir = !dir; - /* Try to get same port: if not, try to change it. */ - for (; nated_port != 0; nated_port++) { - int ret; - - exp->tuple.dst.u.tcp.port = htons(nated_port); - ret = nf_ct_expect_related(exp, 0); - if (ret == 0) - break; - else if (ret != -EBUSY) { - nated_port = 0; - break; - } - } - + nated_port = nf_nat_exp_find_port(exp, nated_port); if (nated_port == 0) { /* No port available */ net_notice_ratelimited("nf_nat_h323: out of TCP ports\n"); return 0; @@ -347,20 +334,7 @@ static int nat_h245(struct sk_buff *skb, struct nf_conn *ct, if (info->sig_port[dir] == port) nated_port = ntohs(info->sig_port[!dir]); - /* Try to get same port: if not, try to change it. */ - for (; nated_port != 0; nated_port++) { - int ret; - - exp->tuple.dst.u.tcp.port = htons(nated_port); - ret = nf_ct_expect_related(exp, 0); - if (ret == 0) - break; - else if (ret != -EBUSY) { - nated_port = 0; - break; - } - } - + nated_port = nf_nat_exp_find_port(exp, nated_port); if (nated_port == 0) { /* No port available */ net_notice_ratelimited("nf_nat_q931: out of TCP ports\n"); return 0; @@ -439,20 +413,7 @@ static int nat_q931(struct sk_buff *skb, struct nf_conn *ct, if (info->sig_port[dir] == port) nated_port = ntohs(info->sig_port[!dir]); - /* Try to get same port: if not, try to change it. */ - for (; nated_port != 0; nated_port++) { - int ret; - - exp->tuple.dst.u.tcp.port = htons(nated_port); - ret = nf_ct_expect_related(exp, 0); - if (ret == 0) - break; - else if (ret != -EBUSY) { - nated_port = 0; - break; - } - } - + nated_port = nf_nat_exp_find_port(exp, nated_port); if (nated_port == 0) { /* No port available */ net_notice_ratelimited("nf_nat_ras: out of TCP ports\n"); return 0; @@ -532,20 +493,7 @@ static int nat_callforwarding(struct sk_buff *skb, struct nf_conn *ct, exp->expectfn = ip_nat_callforwarding_expect; exp->dir = !dir; - /* Try to get same port: if not, try to change it. */ - for (nated_port = ntohs(port); nated_port != 0; nated_port++) { - int ret; - - exp->tuple.dst.u.tcp.port = htons(nated_port); - ret = nf_ct_expect_related(exp, 0); - if (ret == 0) - break; - else if (ret != -EBUSY) { - nated_port = 0; - break; - } - } - + nated_port = nf_nat_exp_find_port(exp, ntohs(port)); if (nated_port == 0) { /* No port available */ net_notice_ratelimited("nf_nat_q931: out of TCP ports\n"); return 0; diff --git a/net/ipv4/netfilter/nf_socket_ipv4.c b/net/ipv4/netfilter/nf_socket_ipv4.c index 2d42e4c35a20..a1350fc25838 100644 --- a/net/ipv4/netfilter/nf_socket_ipv4.c +++ b/net/ipv4/netfilter/nf_socket_ipv4.c @@ -71,8 +71,8 @@ nf_socket_get_sock_v4(struct net *net, struct sk_buff *skb, const int doff, { switch (protocol) { case IPPROTO_TCP: - return inet_lookup(net, &tcp_hashinfo, skb, doff, - saddr, sport, daddr, dport, + return inet_lookup(net, net->ipv4.tcp_death_row.hashinfo, + skb, doff, saddr, sport, daddr, dport, in->ifindex); case IPPROTO_UDP: return udp4_lib_lookup(net, saddr, sport, daddr, dport, diff --git a/net/ipv4/netfilter/nf_tproxy_ipv4.c b/net/ipv4/netfilter/nf_tproxy_ipv4.c index b2bae0b0e42a..b22b2c745c76 100644 --- a/net/ipv4/netfilter/nf_tproxy_ipv4.c +++ b/net/ipv4/netfilter/nf_tproxy_ipv4.c @@ -79,6 +79,7 @@ nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, const struct net_device *in, const enum nf_tproxy_lookup_t lookup_type) { + struct inet_hashinfo *hinfo = net->ipv4.tcp_death_row.hashinfo; struct sock *sk; switch (protocol) { @@ -92,12 +93,10 @@ nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, switch (lookup_type) { case NF_TPROXY_LOOKUP_LISTENER: - sk = inet_lookup_listener(net, &tcp_hashinfo, skb, - ip_hdrlen(skb) + - __tcp_hdrlen(hp), - saddr, sport, - daddr, dport, - in->ifindex, 0); + sk = inet_lookup_listener(net, hinfo, skb, + ip_hdrlen(skb) + __tcp_hdrlen(hp), + saddr, sport, daddr, dport, + in->ifindex, 0); if (sk && !refcount_inc_not_zero(&sk->sk_refcnt)) sk = NULL; @@ -108,9 +107,8 @@ nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, */ break; case NF_TPROXY_LOOKUP_ESTABLISHED: - sk = inet_lookup_established(net, &tcp_hashinfo, - saddr, sport, daddr, dport, - in->ifindex); + sk = inet_lookup_established(net, hinfo, saddr, sport, + daddr, dport, in->ifindex); break; default: BUG(); diff --git a/net/ipv4/netfilter/nft_fib_ipv4.c b/net/ipv4/netfilter/nft_fib_ipv4.c index b75cac69bd7e..fc65d69f23e1 100644 --- a/net/ipv4/netfilter/nft_fib_ipv4.c +++ b/net/ipv4/netfilter/nft_fib_ipv4.c @@ -65,6 +65,7 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs, struct flowi4 fl4 = { .flowi4_scope = RT_SCOPE_UNIVERSE, .flowi4_iif = LOOPBACK_IFINDEX, + .flowi4_uid = sock_net_uid(nft_net(pkt), NULL), }; const struct net_device *oif; const struct net_device *found; @@ -83,6 +84,9 @@ void nft_fib4_eval(const struct nft_expr *expr, struct nft_regs *regs, else oif = NULL; + if (priv->flags & NFTA_FIB_F_IIF) + fl4.flowi4_l3mdev = l3mdev_master_ifindex_rcu(oif); + if (nft_hook(pkt) == NF_INET_PRE_ROUTING && nft_fib_is_loopback(pkt->skb, nft_in(pkt))) { nft_fib_store_result(dest, priv, nft_in(pkt)); diff --git a/net/ipv4/nexthop.c b/net/ipv4/nexthop.c index 853a75a8fbaf..d8ef05347fd9 100644 --- a/net/ipv4/nexthop.c +++ b/net/ipv4/nexthop.c @@ -2534,7 +2534,7 @@ static int nh_create_ipv4(struct net *net, struct nexthop *nh, if (!err) { nh->nh_flags = fib_nh->fib_nh_flags; fib_info_update_nhc_saddr(net, &fib_nh->nh_common, - fib_nh->fib_nh_scope); + !fib_nh->fib_nh_scope ? 0 : fib_nh->fib_nh_scope - 1); } else { fib_nh_release(net, fib_nh); } diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c index b83c2bd9d722..bde333b24837 100644 --- a/net/ipv4/ping.c +++ b/net/ipv4/ping.c @@ -33,6 +33,7 @@ #include <linux/skbuff.h> #include <linux/proc_fs.h> #include <linux/export.h> +#include <linux/bpf-cgroup.h> #include <net/sock.h> #include <net/ping.h> #include <net/udp.h> @@ -295,6 +296,19 @@ void ping_close(struct sock *sk, long timeout) } EXPORT_SYMBOL_GPL(ping_close); +static int ping_pre_connect(struct sock *sk, struct sockaddr *uaddr, + int addr_len) +{ + /* This check is replicated from __ip4_datagram_connect() and + * intended to prevent BPF program called below from accessing bytes + * that are out of the bound specified by user in addr_len. + */ + if (addr_len < sizeof(struct sockaddr_in)) + return -EINVAL; + + return BPF_CGROUP_RUN_PROG_INET4_CONNECT_LOCK(sk, uaddr); +} + /* Checks the bind address and possibly modifies sk->sk_bound_dev_if. */ static int ping_check_bind_addr(struct sock *sk, struct inet_sock *isk, struct sockaddr *uaddr, int addr_len) @@ -603,21 +617,9 @@ int ping_getfrag(void *from, char *to, { struct pingfakehdr *pfh = from; - if (offset == 0) { - fraglen -= sizeof(struct icmphdr); - if (fraglen < 0) - BUG(); - if (!csum_and_copy_from_iter_full(to + sizeof(struct icmphdr), - fraglen, &pfh->wcheck, - &pfh->msg->msg_iter)) - return -EFAULT; - } else if (offset < sizeof(struct icmphdr)) { - BUG(); - } else { - if (!csum_and_copy_from_iter_full(to, fraglen, &pfh->wcheck, - &pfh->msg->msg_iter)) - return -EFAULT; - } + if (!csum_and_copy_from_iter_full(to, fraglen, &pfh->wcheck, + &pfh->msg->msg_iter)) + return -EFAULT; #if IS_ENABLED(CONFIG_IPV6) /* For IPv6, checksum each skb as we go along, as expected by @@ -625,7 +627,7 @@ int ping_getfrag(void *from, char *to, * wcheck, it will be finalized in ping_v4_push_pending_frames. */ if (pfh->family == AF_INET6) { - skb->csum = pfh->wcheck; + skb->csum = csum_block_add(skb->csum, pfh->wcheck, odd); skb->ip_summed = CHECKSUM_NONE; pfh->wcheck = 0; } @@ -828,7 +830,8 @@ back_from_confirm: pfh.family = AF_INET; err = ip_append_data(sk, &fl4, ping_getfrag, &pfh, len, - 0, &ipc, &rt, msg->msg_flags); + sizeof(struct icmphdr), &ipc, &rt, + msg->msg_flags); if (err) ip_flush_pending_frames(sk); else @@ -1009,6 +1012,7 @@ struct proto ping_prot = { .owner = THIS_MODULE, .init = ping_init_sock, .close = ping_close, + .pre_connect = ping_pre_connect, .connect = ip4_datagram_connect, .disconnect = __udp_disconnect, .setsockopt = ip_setsockopt, diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c index 0088a4c64d77..f88daace9de3 100644 --- a/net/ipv4/proc.c +++ b/net/ipv4/proc.c @@ -59,7 +59,7 @@ static int sockstat_seq_show(struct seq_file *seq, void *v) socket_seq_show(seq); seq_printf(seq, "TCP: inuse %d orphan %d tw %d alloc %d mem %ld\n", sock_prot_inuse_get(net, &tcp_prot), orphans, - refcount_read(&net->ipv4.tcp_death_row->tw_refcount) - 1, + refcount_read(&net->ipv4.tcp_death_row.tw_refcount) - 1, sockets, proto_memory_allocated(&tcp_prot)); seq_printf(seq, "UDP: inuse %d mem %ld\n", sock_prot_inuse_get(net, &udp_prot), @@ -297,6 +297,7 @@ static const struct snmp_mib snmp4_net_list[] = { SNMP_MIB_ITEM("TCPDSACKIgnoredDubious", LINUX_MIB_TCPDSACKIGNOREDDUBIOUS), SNMP_MIB_ITEM("TCPMigrateReqSuccess", LINUX_MIB_TCPMIGRATEREQSUCCESS), SNMP_MIB_ITEM("TCPMigrateReqFailure", LINUX_MIB_TCPMIGRATEREQFAILURE), + SNMP_MIB_ITEM("TCPPLBRehash", LINUX_MIB_TCPPLBREHASH), SNMP_MIB_SENTINEL }; diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 795cbe1de912..cd1fa9f70f1a 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -3664,7 +3664,7 @@ static __net_init int rt_genid_init(struct net *net) { atomic_set(&net->ipv4.rt_genid, 0); atomic_set(&net->fnhe_genid, 0); - atomic_set(&net->ipv4.dev_addr_genid, get_random_int()); + atomic_set(&net->ipv4.dev_addr_genid, get_random_u32()); return 0; } @@ -3719,7 +3719,7 @@ int __init ip_rt_init(void) ip_idents = idents_hash; - prandom_bytes(ip_idents, (ip_idents_mask + 1) * sizeof(*ip_idents)); + get_random_bytes(ip_idents, (ip_idents_mask + 1) * sizeof(*ip_idents)); ip_tstamps = idents_hash + (ip_idents_mask + 1) * sizeof(*ip_idents); diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c index 5490c285668b..0af28cedd071 100644 --- a/net/ipv4/sysctl_net_ipv4.c +++ b/net/ipv4/sysctl_net_ipv4.c @@ -39,6 +39,9 @@ static u32 u32_max_div_HZ = UINT_MAX / HZ; static int one_day_secs = 24 * 3600; static u32 fib_multipath_hash_fields_all_mask __maybe_unused = FIB_MULTIPATH_HASH_FIELD_ALL_MASK; +static unsigned int tcp_child_ehash_entries_max = 16 * 1024 * 1024; +static int tcp_plb_max_rounds = 31; +static int tcp_plb_max_cong_thresh = 256; /* obsolete */ static int sysctl_tcp_low_latency __read_mostly; @@ -382,6 +385,29 @@ static int proc_tcp_available_ulp(struct ctl_table *ctl, return ret; } +static int proc_tcp_ehash_entries(struct ctl_table *table, int write, + void *buffer, size_t *lenp, loff_t *ppos) +{ + struct net *net = container_of(table->data, struct net, + ipv4.sysctl_tcp_child_ehash_entries); + struct inet_hashinfo *hinfo = net->ipv4.tcp_death_row.hashinfo; + int tcp_ehash_entries; + struct ctl_table tbl; + + tcp_ehash_entries = hinfo->ehash_mask + 1; + + /* A negative number indicates that the child netns + * shares the global ehash. + */ + if (!net_eq(net, &init_net) && !hinfo->pernet) + tcp_ehash_entries *= -1; + + tbl.data = &tcp_ehash_entries; + tbl.maxlen = sizeof(int); + + return proc_dointvec(&tbl, write, buffer, lenp, ppos); +} + #ifdef CONFIG_IP_ROUTE_MULTIPATH static int proc_fib_multipath_hash_policy(struct ctl_table *table, int write, void *buffer, size_t *lenp, @@ -530,10 +556,9 @@ static struct ctl_table ipv4_table[] = { }; static struct ctl_table ipv4_net_table[] = { - /* tcp_max_tw_buckets must be first in this table. */ { .procname = "tcp_max_tw_buckets", -/* .data = &init_net.ipv4.tcp_death_row.sysctl_max_tw_buckets, */ + .data = &init_net.ipv4.tcp_death_row.sysctl_max_tw_buckets, .maxlen = sizeof(int), .mode = 0644, .proc_handler = proc_dointvec @@ -1322,6 +1347,21 @@ static struct ctl_table ipv4_net_table[] = { .extra2 = SYSCTL_ONE, }, { + .procname = "tcp_ehash_entries", + .data = &init_net.ipv4.sysctl_tcp_child_ehash_entries, + .mode = 0444, + .proc_handler = proc_tcp_ehash_entries, + }, + { + .procname = "tcp_child_ehash_entries", + .data = &init_net.ipv4.sysctl_tcp_child_ehash_entries, + .maxlen = sizeof(unsigned int), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &tcp_child_ehash_entries_max, + }, + { .procname = "udp_rmem_min", .data = &init_net.ipv4.sysctl_udp_rmem_min, .maxlen = sizeof(init_net.ipv4.sysctl_udp_rmem_min), @@ -1346,6 +1386,47 @@ static struct ctl_table ipv4_net_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, + { + .procname = "tcp_plb_enabled", + .data = &init_net.ipv4.sysctl_tcp_plb_enabled, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "tcp_plb_idle_rehash_rounds", + .data = &init_net.ipv4.sysctl_tcp_plb_idle_rehash_rounds, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra2 = &tcp_plb_max_rounds, + }, + { + .procname = "tcp_plb_rehash_rounds", + .data = &init_net.ipv4.sysctl_tcp_plb_rehash_rounds, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + .extra2 = &tcp_plb_max_rounds, + }, + { + .procname = "tcp_plb_suspend_rto_sec", + .data = &init_net.ipv4.sysctl_tcp_plb_suspend_rto_sec, + .maxlen = sizeof(u8), + .mode = 0644, + .proc_handler = proc_dou8vec_minmax, + }, + { + .procname = "tcp_plb_cong_thresh", + .data = &init_net.ipv4.sysctl_tcp_plb_cong_thresh, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &tcp_plb_max_cong_thresh, + }, { } }; @@ -1361,8 +1442,7 @@ static __net_init int ipv4_sysctl_init_net(struct net *net) if (!table) goto err_alloc; - /* skip first entry (sysctl_max_tw_buckets) */ - for (i = 1; i < ARRAY_SIZE(ipv4_net_table) - 1; i++) { + for (i = 0; i < ARRAY_SIZE(ipv4_net_table) - 1; i++) { if (table[i].data) { /* Update the variables to point into * the current struct net @@ -1377,8 +1457,6 @@ static __net_init int ipv4_sysctl_init_net(struct net *net) } } - table[0].data = &net->ipv4.tcp_death_row->sysctl_max_tw_buckets; - net->ipv4.ipv4_hdr = register_net_sysctl(net, "net/ipv4", table); if (!net->ipv4.ipv4_hdr) goto err_reg; diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 306b94dedc8d..de8f0cd7cb32 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -457,6 +457,7 @@ void tcp_init_sock(struct sock *sk) WRITE_ONCE(sk->sk_sndbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_wmem[1])); WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_rmem[1])); + set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags); sk_sockets_allocated_inc(sk); } EXPORT_SYMBOL(tcp_init_sock); @@ -1015,7 +1016,7 @@ new_segment: skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy); } else { get_page(page); - skb_fill_page_desc(skb, i, page, offset, copy); + skb_fill_page_desc_noacc(skb, i, page, offset, copy); } if (!(flags & MSG_NO_SHARED_FRAGS)) @@ -1162,9 +1163,8 @@ void tcp_free_fastopen_req(struct tcp_sock *tp) } } -static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, - int *copied, size_t size, - struct ubuf_info *uarg) +int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg, int *copied, + size_t size, struct ubuf_info *uarg) { struct tcp_sock *tp = tcp_sk(sk); struct inet_sock *inet = inet_sk(sk); @@ -1239,7 +1239,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size) } zc = sk->sk_route_caps & NETIF_F_SG; if (!zc) - uarg->zerocopy = 0; + uarg_to_msgzc(uarg)->zerocopy = 0; } } @@ -1761,19 +1761,28 @@ int tcp_read_skb(struct sock *sk, skb_read_actor_t recv_actor) if (sk->sk_state == TCP_LISTEN) return -ENOTCONN; - skb = tcp_recv_skb(sk, seq, &offset); - if (!skb) - return 0; + while ((skb = tcp_recv_skb(sk, seq, &offset)) != NULL) { + u8 tcp_flags; + int used; - __skb_unlink(skb, &sk->sk_receive_queue); - WARN_ON(!skb_set_owner_sk_safe(skb, sk)); - copied = recv_actor(sk, skb); - if (copied >= 0) { - seq += copied; - if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN) + __skb_unlink(skb, &sk->sk_receive_queue); + WARN_ON_ONCE(!skb_set_owner_sk_safe(skb, sk)); + tcp_flags = TCP_SKB_CB(skb)->tcp_flags; + used = recv_actor(sk, skb); + consume_skb(skb); + if (used < 0) { + if (!copied) + copied = used; + break; + } + seq += used; + copied += used; + + if (tcp_flags & TCPHDR_FIN) { ++seq; + break; + } } - consume_skb(skb); WRITE_ONCE(tp->copied_seq, seq); tcp_rcv_space_adjust(sk); @@ -3128,6 +3137,8 @@ int tcp_disconnect(struct sock *sk, int flags) tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; tcp_snd_cwnd_set(tp, TCP_INIT_CWND); tp->snd_cwnd_cnt = 0; + tp->is_cwnd_limited = 0; + tp->max_packets_out = 0; tp->window_clamp = 0; tp->delivered = 0; tp->delivered_ce = 0; @@ -3165,6 +3176,7 @@ int tcp_disconnect(struct sock *sk, int flags) tp->sacked_out = 0; tp->tlp_high_seq = 0; tp->last_oow_ack_time = 0; + tp->plb_rehash = 0; /* There's a bubble in the pipe until at least the first ACK. */ tp->app_limited = ~0U; tp->rack.mstamp = 0; @@ -3199,7 +3211,7 @@ EXPORT_SYMBOL(tcp_disconnect); static inline bool tcp_can_repair_sock(const struct sock *sk) { - return ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) && + return sockopt_ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN) && (sk->sk_state != TCP_LISTEN); } @@ -3476,8 +3488,8 @@ int tcp_set_window_clamp(struct sock *sk, int val) /* * Socket option code for TCP. */ -static int do_tcp_setsockopt(struct sock *sk, int level, int optname, - sockptr_t optval, unsigned int optlen) +int do_tcp_setsockopt(struct sock *sk, int level, int optname, + sockptr_t optval, unsigned int optlen) { struct tcp_sock *tp = tcp_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); @@ -3499,11 +3511,11 @@ static int do_tcp_setsockopt(struct sock *sk, int level, int optname, return -EFAULT; name[val] = 0; - lock_sock(sk); - err = tcp_set_congestion_control(sk, name, true, - ns_capable(sock_net(sk)->user_ns, - CAP_NET_ADMIN)); - release_sock(sk); + sockopt_lock_sock(sk); + err = tcp_set_congestion_control(sk, name, !has_current_bpf_ctx(), + sockopt_ns_capable(sock_net(sk)->user_ns, + CAP_NET_ADMIN)); + sockopt_release_sock(sk); return err; } case TCP_ULP: { @@ -3519,9 +3531,9 @@ static int do_tcp_setsockopt(struct sock *sk, int level, int optname, return -EFAULT; name[val] = 0; - lock_sock(sk); + sockopt_lock_sock(sk); err = tcp_set_ulp(sk, name); - release_sock(sk); + sockopt_release_sock(sk); return err; } case TCP_FASTOPEN_KEY: { @@ -3554,7 +3566,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, int optname, if (copy_from_sockptr(&val, optval, sizeof(val))) return -EFAULT; - lock_sock(sk); + sockopt_lock_sock(sk); switch (optname) { case TCP_MAXSEG: @@ -3776,7 +3788,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, int optname, break; } - release_sock(sk); + sockopt_release_sock(sk); return err; } @@ -3786,8 +3798,9 @@ int tcp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, const struct inet_connection_sock *icsk = inet_csk(sk); if (level != SOL_TCP) - return icsk->icsk_af_ops->setsockopt(sk, level, optname, - optval, optlen); + /* Paired with WRITE_ONCE() in do_ipv6_setsockopt() and tcp_v6_connect() */ + return READ_ONCE(icsk->icsk_af_ops)->setsockopt(sk, level, optname, + optval, optlen); return do_tcp_setsockopt(sk, level, optname, optval, optlen); } EXPORT_SYMBOL(tcp_setsockopt); @@ -3927,6 +3940,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_reord_seen = tp->reord_seen; info->tcpi_rcv_ooopack = tp->rcv_ooopack; info->tcpi_snd_wnd = tp->snd_wnd; + info->tcpi_rcv_wnd = tp->rcv_wnd; + info->tcpi_rehash = tp->plb_rehash + tp->timeout_rehash; info->tcpi_fastopen_client_fail = tp->fastopen_client_fail; unlock_sock_fast(sk, slow); } @@ -3961,6 +3976,7 @@ static size_t tcp_opt_stats_get_size(void) nla_total_size(sizeof(u32)) + /* TCP_NLA_BYTES_NOTSENT */ nla_total_size_64bit(sizeof(u64)) + /* TCP_NLA_EDT */ nla_total_size(sizeof(u8)) + /* TCP_NLA_TTL */ + nla_total_size(sizeof(u32)) + /* TCP_NLA_REHASH */ 0; } @@ -4037,18 +4053,19 @@ struct sk_buff *tcp_get_timestamping_opt_stats(const struct sock *sk, nla_put_u8(stats, TCP_NLA_TTL, tcp_skb_ttl_or_hop_limit(ack_skb)); + nla_put_u32(stats, TCP_NLA_REHASH, tp->plb_rehash + tp->timeout_rehash); return stats; } -static int do_tcp_getsockopt(struct sock *sk, int level, - int optname, char __user *optval, int __user *optlen) +int do_tcp_getsockopt(struct sock *sk, int level, + int optname, sockptr_t optval, sockptr_t optlen) { struct inet_connection_sock *icsk = inet_csk(sk); struct tcp_sock *tp = tcp_sk(sk); struct net *net = sock_net(sk); int val, len; - if (get_user(len, optlen)) + if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; len = min_t(unsigned int, len, sizeof(int)); @@ -4098,15 +4115,15 @@ static int do_tcp_getsockopt(struct sock *sk, int level, case TCP_INFO: { struct tcp_info info; - if (get_user(len, optlen)) + if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; tcp_get_info(sk, &info); len = min_t(unsigned int, len, sizeof(info)); - if (put_user(len, optlen)) + if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; - if (copy_to_user(optval, &info, len)) + if (copy_to_sockptr(optval, &info, len)) return -EFAULT; return 0; } @@ -4116,7 +4133,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level, size_t sz = 0; int attr; - if (get_user(len, optlen)) + if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; ca_ops = icsk->icsk_ca_ops; @@ -4124,9 +4141,9 @@ static int do_tcp_getsockopt(struct sock *sk, int level, sz = ca_ops->get_info(sk, ~0U, &attr, &info); len = min_t(unsigned int, len, sz); - if (put_user(len, optlen)) + if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; - if (copy_to_user(optval, &info, len)) + if (copy_to_sockptr(optval, &info, len)) return -EFAULT; return 0; } @@ -4135,27 +4152,28 @@ static int do_tcp_getsockopt(struct sock *sk, int level, break; case TCP_CONGESTION: - if (get_user(len, optlen)) + if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; len = min_t(unsigned int, len, TCP_CA_NAME_MAX); - if (put_user(len, optlen)) + if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; - if (copy_to_user(optval, icsk->icsk_ca_ops->name, len)) + if (copy_to_sockptr(optval, icsk->icsk_ca_ops->name, len)) return -EFAULT; return 0; case TCP_ULP: - if (get_user(len, optlen)) + if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; len = min_t(unsigned int, len, TCP_ULP_NAME_MAX); if (!icsk->icsk_ulp_ops) { - if (put_user(0, optlen)) + len = 0; + if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; return 0; } - if (put_user(len, optlen)) + if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; - if (copy_to_user(optval, icsk->icsk_ulp_ops->name, len)) + if (copy_to_sockptr(optval, icsk->icsk_ulp_ops->name, len)) return -EFAULT; return 0; @@ -4163,15 +4181,15 @@ static int do_tcp_getsockopt(struct sock *sk, int level, u64 key[TCP_FASTOPEN_KEY_BUF_LENGTH / sizeof(u64)]; unsigned int key_len; - if (get_user(len, optlen)) + if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; key_len = tcp_fastopen_get_cipher(net, icsk, key) * TCP_FASTOPEN_KEY_LENGTH; len = min_t(unsigned int, len, key_len); - if (put_user(len, optlen)) + if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; - if (copy_to_user(optval, key, len)) + if (copy_to_sockptr(optval, key, len)) return -EFAULT; return 0; } @@ -4197,7 +4215,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level, case TCP_REPAIR_WINDOW: { struct tcp_repair_window opt; - if (get_user(len, optlen)) + if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; if (len != sizeof(opt)) @@ -4212,7 +4230,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level, opt.rcv_wnd = tp->rcv_wnd; opt.rcv_wup = tp->rcv_wup; - if (copy_to_user(optval, &opt, len)) + if (copy_to_sockptr(optval, &opt, len)) return -EFAULT; return 0; } @@ -4258,35 +4276,35 @@ static int do_tcp_getsockopt(struct sock *sk, int level, val = tp->save_syn; break; case TCP_SAVED_SYN: { - if (get_user(len, optlen)) + if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; - lock_sock(sk); + sockopt_lock_sock(sk); if (tp->saved_syn) { if (len < tcp_saved_syn_len(tp->saved_syn)) { - if (put_user(tcp_saved_syn_len(tp->saved_syn), - optlen)) { - release_sock(sk); + len = tcp_saved_syn_len(tp->saved_syn); + if (copy_to_sockptr(optlen, &len, sizeof(int))) { + sockopt_release_sock(sk); return -EFAULT; } - release_sock(sk); + sockopt_release_sock(sk); return -EINVAL; } len = tcp_saved_syn_len(tp->saved_syn); - if (put_user(len, optlen)) { - release_sock(sk); + if (copy_to_sockptr(optlen, &len, sizeof(int))) { + sockopt_release_sock(sk); return -EFAULT; } - if (copy_to_user(optval, tp->saved_syn->data, len)) { - release_sock(sk); + if (copy_to_sockptr(optval, tp->saved_syn->data, len)) { + sockopt_release_sock(sk); return -EFAULT; } tcp_saved_syn_free(tp); - release_sock(sk); + sockopt_release_sock(sk); } else { - release_sock(sk); + sockopt_release_sock(sk); len = 0; - if (put_user(len, optlen)) + if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; } return 0; @@ -4297,31 +4315,31 @@ static int do_tcp_getsockopt(struct sock *sk, int level, struct tcp_zerocopy_receive zc = {}; int err; - if (get_user(len, optlen)) + if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; if (len < 0 || len < offsetofend(struct tcp_zerocopy_receive, length)) return -EINVAL; if (unlikely(len > sizeof(zc))) { - err = check_zeroed_user(optval + sizeof(zc), - len - sizeof(zc)); + err = check_zeroed_sockptr(optval, sizeof(zc), + len - sizeof(zc)); if (err < 1) return err == 0 ? -EINVAL : err; len = sizeof(zc); - if (put_user(len, optlen)) + if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; } - if (copy_from_user(&zc, optval, len)) + if (copy_from_sockptr(&zc, optval, len)) return -EFAULT; if (zc.reserved) return -EINVAL; if (zc.msg_flags & ~(TCP_VALID_ZC_MSG_FLAGS)) return -EINVAL; - lock_sock(sk); + sockopt_lock_sock(sk); err = tcp_zerocopy_receive(sk, &zc, &tss); err = BPF_CGROUP_RUN_PROG_GETSOCKOPT_KERN(sk, level, optname, &zc, &len, err); - release_sock(sk); + sockopt_release_sock(sk); if (len >= offsetofend(struct tcp_zerocopy_receive, msg_flags)) goto zerocopy_rcv_cmsg; switch (len) { @@ -4351,7 +4369,7 @@ zerocopy_rcv_sk_err: zerocopy_rcv_inq: zc.inq = tcp_inq_hint(sk); zerocopy_rcv_out: - if (!err && copy_to_user(optval, &zc, len)) + if (!err && copy_to_sockptr(optval, &zc, len)) err = -EFAULT; return err; } @@ -4360,9 +4378,9 @@ zerocopy_rcv_out: return -ENOPROTOOPT; } - if (put_user(len, optlen)) + if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; - if (copy_to_user(optval, &val, len)) + if (copy_to_sockptr(optval, &val, len)) return -EFAULT; return 0; } @@ -4385,9 +4403,11 @@ int tcp_getsockopt(struct sock *sk, int level, int optname, char __user *optval, struct inet_connection_sock *icsk = inet_csk(sk); if (level != SOL_TCP) - return icsk->icsk_af_ops->getsockopt(sk, level, optname, - optval, optlen); - return do_tcp_getsockopt(sk, level, optname, optval, optlen); + /* Paired with WRITE_ONCE() in do_ipv6_setsockopt() and tcp_v6_connect() */ + return READ_ONCE(icsk->icsk_af_ops)->getsockopt(sk, level, optname, + optval, optlen); + return do_tcp_getsockopt(sk, level, optname, USER_SOCKPTR(optval), + USER_SOCKPTR(optlen)); } EXPORT_SYMBOL(tcp_getsockopt); @@ -4788,6 +4808,7 @@ void __init tcp_init(void) INIT_HLIST_HEAD(&tcp_hashinfo.bhash2[i].chain); } + tcp_hashinfo.pernet = false; cnt = tcp_hashinfo.ehash_mask + 1; sysctl_tcp_max_orphans = cnt / 2; diff --git a/net/ipv4/tcp_cdg.c b/net/ipv4/tcp_cdg.c index ddc7ba0554bd..ba4d98e510e0 100644 --- a/net/ipv4/tcp_cdg.c +++ b/net/ipv4/tcp_cdg.c @@ -243,7 +243,7 @@ static bool tcp_cdg_backoff(struct sock *sk, u32 grad) struct cdg *ca = inet_csk_ca(sk); struct tcp_sock *tp = tcp_sk(sk); - if (prandom_u32() <= nexp_u32(grad * backoff_factor)) + if (get_random_u32() <= nexp_u32(grad * backoff_factor)) return false; if (use_ineff) { @@ -375,6 +375,7 @@ static void tcp_cdg_init(struct sock *sk) struct cdg *ca = inet_csk_ca(sk); struct tcp_sock *tp = tcp_sk(sk); + ca->gradients = NULL; /* We silently fall back to window = 1 if allocation fails. */ if (window > 1) ca->gradients = kcalloc(window, sizeof(ca->gradients[0]), @@ -388,6 +389,7 @@ static void tcp_cdg_release(struct sock *sk) struct cdg *ca = inet_csk_ca(sk); kfree(ca->gradients); + ca->gradients = NULL; } static struct tcp_congestion_ops tcp_cdg __read_mostly = { diff --git a/net/ipv4/tcp_dctcp.c b/net/ipv4/tcp_dctcp.c index 2a6c0dd665a4..e0a2ca7456ff 100644 --- a/net/ipv4/tcp_dctcp.c +++ b/net/ipv4/tcp_dctcp.c @@ -54,6 +54,7 @@ struct dctcp { u32 next_seq; u32 ce_state; u32 loss_cwnd; + struct tcp_plb_state plb; }; static unsigned int dctcp_shift_g __read_mostly = 4; /* g = 1/2^4 */ @@ -91,6 +92,8 @@ static void dctcp_init(struct sock *sk) ca->ce_state = 0; dctcp_reset(tp, ca); + tcp_plb_init(sk, &ca->plb); + return; } @@ -117,14 +120,28 @@ static void dctcp_update_alpha(struct sock *sk, u32 flags) /* Expired RTT */ if (!before(tp->snd_una, ca->next_seq)) { + u32 delivered = tp->delivered - ca->old_delivered; u32 delivered_ce = tp->delivered_ce - ca->old_delivered_ce; u32 alpha = ca->dctcp_alpha; + u32 ce_ratio = 0; + + if (delivered > 0) { + /* dctcp_alpha keeps EWMA of fraction of ECN marked + * packets. Because of EWMA smoothing, PLB reaction can + * be slow so we use ce_ratio which is an instantaneous + * measure of congestion. ce_ratio is the fraction of + * ECN marked packets in the previous RTT. + */ + if (delivered_ce > 0) + ce_ratio = (delivered_ce << TCP_PLB_SCALE) / delivered; + tcp_plb_update_state(sk, &ca->plb, (int)ce_ratio); + tcp_plb_check_rehash(sk, &ca->plb); + } /* alpha = (1 - g) * alpha + g * F */ alpha -= min_not_zero(alpha, alpha >> dctcp_shift_g); if (delivered_ce) { - u32 delivered = tp->delivered - ca->old_delivered; /* If dctcp_shift_g == 1, a 32bit value would overflow * after 8 M packets. @@ -172,8 +189,12 @@ static void dctcp_cwnd_event(struct sock *sk, enum tcp_ca_event ev) dctcp_ece_ack_update(sk, ev, &ca->prior_rcv_nxt, &ca->ce_state); break; case CA_EVENT_LOSS: + tcp_plb_update_state_upon_rto(sk, &ca->plb); dctcp_react_to_loss(sk); break; + case CA_EVENT_TX_START: + tcp_plb_check_rehash(sk, &ca->plb); /* Maybe rehash when inflight is 0 */ + break; default: /* Don't care for the rest. */ break; diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c index 75a1c985f49a..01b50fa79189 100644 --- a/net/ipv4/tcp_diag.c +++ b/net/ipv4/tcp_diag.c @@ -181,13 +181,21 @@ static size_t tcp_diag_get_aux_size(struct sock *sk, bool net_admin) static void tcp_diag_dump(struct sk_buff *skb, struct netlink_callback *cb, const struct inet_diag_req_v2 *r) { - inet_diag_dump_icsk(&tcp_hashinfo, skb, cb, r); + struct inet_hashinfo *hinfo; + + hinfo = sock_net(cb->skb->sk)->ipv4.tcp_death_row.hashinfo; + + inet_diag_dump_icsk(hinfo, skb, cb, r); } static int tcp_diag_dump_one(struct netlink_callback *cb, const struct inet_diag_req_v2 *req) { - return inet_diag_dump_one_icsk(&tcp_hashinfo, cb, req); + struct inet_hashinfo *hinfo; + + hinfo = sock_net(cb->skb->sk)->ipv4.tcp_death_row.hashinfo; + + return inet_diag_dump_one_icsk(hinfo, cb, req); } #ifdef CONFIG_INET_DIAG_DESTROY @@ -195,9 +203,13 @@ static int tcp_diag_destroy(struct sk_buff *in_skb, const struct inet_diag_req_v2 *req) { struct net *net = sock_net(in_skb->sk); - struct sock *sk = inet_diag_find_one_icsk(net, &tcp_hashinfo, req); + struct inet_hashinfo *hinfo; + struct sock *sk; int err; + hinfo = net->ipv4.tcp_death_row.hashinfo; + sk = inet_diag_find_one_icsk(net, hinfo, req); + if (IS_ERR(sk)) return PTR_ERR(sk); diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index b85a9f755da4..0640453fce54 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -2192,7 +2192,8 @@ void tcp_enter_loss(struct sock *sk) */ static bool tcp_check_sack_reneging(struct sock *sk, int flag) { - if (flag & FLAG_SACK_RENEGING) { + if (flag & FLAG_SACK_RENEGING && + flag & FLAG_SND_UNA_ADVANCED) { struct tcp_sock *tp = tcp_sk(sk); unsigned long delay = max(usecs_to_jiffies(tp->srtt_us >> 4), msecs_to_jiffies(10)); @@ -2513,6 +2514,21 @@ static inline bool tcp_may_undo(const struct tcp_sock *tp) return tp->undo_marker && (!tp->undo_retrans || tcp_packet_delayed(tp)); } +static bool tcp_is_non_sack_preventing_reopen(struct sock *sk) +{ + struct tcp_sock *tp = tcp_sk(sk); + + if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) { + /* Hold old state until something *above* high_seq + * is ACKed. For Reno it is MUST to prevent false + * fast retransmits (RFC2582). SACK TCP is safe. */ + if (!tcp_any_retrans_done(sk)) + tp->retrans_stamp = 0; + return true; + } + return false; +} + /* People celebrate: "We love our President!" */ static bool tcp_try_undo_recovery(struct sock *sk) { @@ -2535,14 +2551,8 @@ static bool tcp_try_undo_recovery(struct sock *sk) } else if (tp->rack.reo_wnd_persist) { tp->rack.reo_wnd_persist--; } - if (tp->snd_una == tp->high_seq && tcp_is_reno(tp)) { - /* Hold old state until something *above* high_seq - * is ACKed. For Reno it is MUST to prevent false - * fast retransmits (RFC2582). SACK TCP is safe. */ - if (!tcp_any_retrans_done(sk)) - tp->retrans_stamp = 0; + if (tcp_is_non_sack_preventing_reopen(sk)) return true; - } tcp_set_ca_state(sk, TCP_CA_Open); tp->is_sack_reneg = 0; return false; @@ -2578,6 +2588,8 @@ static bool tcp_try_undo_loss(struct sock *sk, bool frto_undo) NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPSPURIOUSRTOS); inet_csk(sk)->icsk_retransmits = 0; + if (tcp_is_non_sack_preventing_reopen(sk)) + return true; if (frto_undo || tcp_is_sack(tp)) { tcp_set_ca_state(sk, TCP_CA_Open); tp->is_sack_reneg = 0; diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 01b31f5c7aba..ebab9e8b184c 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -201,15 +201,16 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct inet_bind_hashbucket *prev_addr_hashbucket = NULL; struct sockaddr_in *usin = (struct sockaddr_in *)uaddr; + struct inet_timewait_death_row *tcp_death_row; __be32 daddr, nexthop, prev_sk_rcv_saddr; struct inet_sock *inet = inet_sk(sk); struct tcp_sock *tp = tcp_sk(sk); + struct ip_options_rcu *inet_opt; + struct net *net = sock_net(sk); __be16 orig_sport, orig_dport; struct flowi4 *fl4; struct rtable *rt; int err; - struct ip_options_rcu *inet_opt; - struct inet_timewait_death_row *tcp_death_row = sock_net(sk)->ipv4.tcp_death_row; if (addr_len < sizeof(struct sockaddr_in)) return -EINVAL; @@ -235,7 +236,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) if (IS_ERR(rt)) { err = PTR_ERR(rt); if (err == -ENETUNREACH) - IP_INC_STATS(sock_net(sk), IPSTATS_MIB_OUTNOROUTES); + IP_INC_STATS(net, IPSTATS_MIB_OUTNOROUTES); return err; } @@ -247,11 +248,12 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) if (!inet_opt || !inet_opt->opt.srr) daddr = fl4->daddr; + tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row; + if (!inet->inet_saddr) { if (inet_csk(sk)->icsk_bind2_hash) { - prev_addr_hashbucket = inet_bhashfn_portaddr(&tcp_hashinfo, - sk, sock_net(sk), - inet->inet_num); + prev_addr_hashbucket = inet_bhashfn_portaddr(tcp_death_row->hashinfo, + sk, net, inet->inet_num); prev_sk_rcv_saddr = sk->sk_rcv_saddr; } inet->inet_saddr = fl4->saddr; @@ -317,12 +319,11 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) inet->inet_daddr, inet->inet_sport, usin->sin_port)); - tp->tsoffset = secure_tcp_ts_off(sock_net(sk), - inet->inet_saddr, + tp->tsoffset = secure_tcp_ts_off(net, inet->inet_saddr, inet->inet_daddr); } - inet->inet_id = prandom_u32(); + inet->inet_id = get_random_u16(); if (tcp_fastopen_defer_connect(sk, &err)) return err; @@ -494,9 +495,9 @@ int tcp_v4_err(struct sk_buff *skb, u32 info) int err; struct net *net = dev_net(skb->dev); - sk = __inet_lookup_established(net, &tcp_hashinfo, iph->daddr, - th->dest, iph->saddr, ntohs(th->source), - inet_iif(skb), 0); + sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, + iph->daddr, th->dest, iph->saddr, + ntohs(th->source), inet_iif(skb), 0); if (!sk) { __ICMP_INC_STATS(net, ICMP_MIB_INERRORS); return -ENOENT; @@ -759,8 +760,8 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb) * Incoming packet is checked with md5 hash with finding key, * no RST generated if md5 hash doesn't match. */ - sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0, - ip_hdr(skb)->saddr, + sk1 = __inet_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo, + NULL, 0, ip_hdr(skb)->saddr, th->source, ip_hdr(skb)->daddr, ntohs(th->source), dif, sdif); /* don't send rst if it can't find key */ @@ -1542,7 +1543,7 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb, inet_csk(newsk)->icsk_ext_hdr_len = 0; if (inet_opt) inet_csk(newsk)->icsk_ext_hdr_len = inet_opt->opt.optlen; - newinet->inet_id = prandom_u32(); + newinet->inet_id = get_random_u16(); /* Set ToS of the new socket based upon the value of incoming SYN. * ECT bits are set later in tcp_init_transfer(). @@ -1728,6 +1729,7 @@ EXPORT_SYMBOL(tcp_v4_do_rcv); int tcp_v4_early_demux(struct sk_buff *skb) { + struct net *net = dev_net(skb->dev); const struct iphdr *iph; const struct tcphdr *th; struct sock *sk; @@ -1744,7 +1746,7 @@ int tcp_v4_early_demux(struct sk_buff *skb) if (th->doff < sizeof(struct tcphdr) / 4) return 0; - sk = __inet_lookup_established(dev_net(skb->dev), &tcp_hashinfo, + sk = __inet_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, iph->saddr, th->source, iph->daddr, ntohs(th->dest), skb->skb_iif, inet_sdif(skb)); @@ -1872,11 +1874,13 @@ bool tcp_add_backlog(struct sock *sk, struct sk_buff *skb, __skb_push(skb, hdrlen); no_coalesce: + limit = (u32)READ_ONCE(sk->sk_rcvbuf) + (u32)(READ_ONCE(sk->sk_sndbuf) >> 1); + /* Only socket owner can try to collapse/prune rx queues * to reduce memory overhead, so add a little headroom here. * Few sockets backlog are possibly concurrently non empty. */ - limit = READ_ONCE(sk->sk_rcvbuf) + READ_ONCE(sk->sk_sndbuf) + 64*1024; + limit += 64 * 1024; if (unlikely(sk_add_backlog(sk, skb, limit))) { bh_unlock_sock(sk); @@ -1970,7 +1974,8 @@ int tcp_v4_rcv(struct sk_buff *skb) th = (const struct tcphdr *)skb->data; iph = ip_hdr(skb); lookup: - sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source, + sk = __inet_lookup_skb(net->ipv4.tcp_death_row.hashinfo, + skb, __tcp_hdrlen(th), th->source, th->dest, sdif, &refcounted); if (!sk) goto no_tcp_socket; @@ -2152,9 +2157,9 @@ do_time_wait: } switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) { case TCP_TW_SYN: { - struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev), - &tcp_hashinfo, skb, - __tcp_hdrlen(th), + struct sock *sk2 = inet_lookup_listener(net, + net->ipv4.tcp_death_row.hashinfo, + skb, __tcp_hdrlen(th), iph->saddr, th->source, iph->daddr, th->dest, inet_iif(skb), @@ -2304,15 +2309,16 @@ static bool seq_sk_match(struct seq_file *seq, const struct sock *sk) */ static void *listening_get_first(struct seq_file *seq) { + struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; struct tcp_iter_state *st = seq->private; st->offset = 0; - for (; st->bucket <= tcp_hashinfo.lhash2_mask; st->bucket++) { + for (; st->bucket <= hinfo->lhash2_mask; st->bucket++) { struct inet_listen_hashbucket *ilb2; struct hlist_nulls_node *node; struct sock *sk; - ilb2 = &tcp_hashinfo.lhash2[st->bucket]; + ilb2 = &hinfo->lhash2[st->bucket]; if (hlist_nulls_empty(&ilb2->nulls_head)) continue; @@ -2337,6 +2343,7 @@ static void *listening_get_next(struct seq_file *seq, void *cur) struct tcp_iter_state *st = seq->private; struct inet_listen_hashbucket *ilb2; struct hlist_nulls_node *node; + struct inet_hashinfo *hinfo; struct sock *sk = cur; ++st->num; @@ -2348,7 +2355,8 @@ static void *listening_get_next(struct seq_file *seq, void *cur) return sk; } - ilb2 = &tcp_hashinfo.lhash2[st->bucket]; + hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; + ilb2 = &hinfo->lhash2[st->bucket]; spin_unlock(&ilb2->lock); ++st->bucket; return listening_get_first(seq); @@ -2370,9 +2378,10 @@ static void *listening_get_idx(struct seq_file *seq, loff_t *pos) return rc; } -static inline bool empty_bucket(const struct tcp_iter_state *st) +static inline bool empty_bucket(struct inet_hashinfo *hinfo, + const struct tcp_iter_state *st) { - return hlist_nulls_empty(&tcp_hashinfo.ehash[st->bucket].chain); + return hlist_nulls_empty(&hinfo->ehash[st->bucket].chain); } /* @@ -2381,20 +2390,21 @@ static inline bool empty_bucket(const struct tcp_iter_state *st) */ static void *established_get_first(struct seq_file *seq) { + struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; struct tcp_iter_state *st = seq->private; st->offset = 0; - for (; st->bucket <= tcp_hashinfo.ehash_mask; ++st->bucket) { + for (; st->bucket <= hinfo->ehash_mask; ++st->bucket) { struct sock *sk; struct hlist_nulls_node *node; - spinlock_t *lock = inet_ehash_lockp(&tcp_hashinfo, st->bucket); + spinlock_t *lock = inet_ehash_lockp(hinfo, st->bucket); /* Lockless fast path for the common case of empty buckets */ - if (empty_bucket(st)) + if (empty_bucket(hinfo, st)) continue; spin_lock_bh(lock); - sk_nulls_for_each(sk, node, &tcp_hashinfo.ehash[st->bucket].chain) { + sk_nulls_for_each(sk, node, &hinfo->ehash[st->bucket].chain) { if (seq_sk_match(seq, sk)) return sk; } @@ -2406,9 +2416,10 @@ static void *established_get_first(struct seq_file *seq) static void *established_get_next(struct seq_file *seq, void *cur) { - struct sock *sk = cur; - struct hlist_nulls_node *node; + struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; struct tcp_iter_state *st = seq->private; + struct hlist_nulls_node *node; + struct sock *sk = cur; ++st->num; ++st->offset; @@ -2420,7 +2431,7 @@ static void *established_get_next(struct seq_file *seq, void *cur) return sk; } - spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); + spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); ++st->bucket; return established_get_first(seq); } @@ -2458,6 +2469,7 @@ static void *tcp_get_idx(struct seq_file *seq, loff_t pos) static void *tcp_seek_last_pos(struct seq_file *seq) { + struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; struct tcp_iter_state *st = seq->private; int bucket = st->bucket; int offset = st->offset; @@ -2466,7 +2478,7 @@ static void *tcp_seek_last_pos(struct seq_file *seq) switch (st->state) { case TCP_SEQ_STATE_LISTENING: - if (st->bucket > tcp_hashinfo.lhash2_mask) + if (st->bucket > hinfo->lhash2_mask) break; st->state = TCP_SEQ_STATE_LISTENING; rc = listening_get_first(seq); @@ -2478,7 +2490,7 @@ static void *tcp_seek_last_pos(struct seq_file *seq) st->state = TCP_SEQ_STATE_ESTABLISHED; fallthrough; case TCP_SEQ_STATE_ESTABLISHED: - if (st->bucket > tcp_hashinfo.ehash_mask) + if (st->bucket > hinfo->ehash_mask) break; rc = established_get_first(seq); while (offset-- && rc && bucket == st->bucket) @@ -2546,16 +2558,17 @@ EXPORT_SYMBOL(tcp_seq_next); void tcp_seq_stop(struct seq_file *seq, void *v) { + struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; struct tcp_iter_state *st = seq->private; switch (st->state) { case TCP_SEQ_STATE_LISTENING: if (v != SEQ_START_TOKEN) - spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock); + spin_unlock(&hinfo->lhash2[st->bucket].lock); break; case TCP_SEQ_STATE_ESTABLISHED: if (v) - spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); + spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); break; } } @@ -2750,6 +2763,7 @@ static int bpf_iter_tcp_realloc_batch(struct bpf_tcp_iter_state *iter, static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq, struct sock *start_sk) { + struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; struct bpf_tcp_iter_state *iter = seq->private; struct tcp_iter_state *st = &iter->state; struct hlist_nulls_node *node; @@ -2769,7 +2783,7 @@ static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq, expected++; } } - spin_unlock(&tcp_hashinfo.lhash2[st->bucket].lock); + spin_unlock(&hinfo->lhash2[st->bucket].lock); return expected; } @@ -2777,6 +2791,7 @@ static unsigned int bpf_iter_tcp_listening_batch(struct seq_file *seq, static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq, struct sock *start_sk) { + struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; struct bpf_tcp_iter_state *iter = seq->private; struct tcp_iter_state *st = &iter->state; struct hlist_nulls_node *node; @@ -2796,13 +2811,14 @@ static unsigned int bpf_iter_tcp_established_batch(struct seq_file *seq, expected++; } } - spin_unlock_bh(inet_ehash_lockp(&tcp_hashinfo, st->bucket)); + spin_unlock_bh(inet_ehash_lockp(hinfo, st->bucket)); return expected; } static struct sock *bpf_iter_tcp_batch(struct seq_file *seq) { + struct inet_hashinfo *hinfo = seq_file_net(seq)->ipv4.tcp_death_row.hashinfo; struct bpf_tcp_iter_state *iter = seq->private; struct tcp_iter_state *st = &iter->state; unsigned int expected; @@ -2818,7 +2834,7 @@ static struct sock *bpf_iter_tcp_batch(struct seq_file *seq) st->offset = 0; st->bucket++; if (st->state == TCP_SEQ_STATE_LISTENING && - st->bucket > tcp_hashinfo.lhash2_mask) { + st->bucket > hinfo->lhash2_mask) { st->state = TCP_SEQ_STATE_ESTABLISHED; st->bucket = 0; } @@ -3083,7 +3099,7 @@ struct proto tcp_prot = { .slab_flags = SLAB_TYPESAFE_BY_RCU, .twsk_prot = &tcp_timewait_sock_ops, .rsk_prot = &tcp_request_sock_ops, - .h.hashinfo = &tcp_hashinfo, + .h.hashinfo = NULL, .no_autobind = true, .diag_destroy = tcp_abort, }; @@ -3091,19 +3107,43 @@ EXPORT_SYMBOL(tcp_prot); static void __net_exit tcp_sk_exit(struct net *net) { - struct inet_timewait_death_row *tcp_death_row = net->ipv4.tcp_death_row; - if (net->ipv4.tcp_congestion_control) bpf_module_put(net->ipv4.tcp_congestion_control, net->ipv4.tcp_congestion_control->owner); - if (refcount_dec_and_test(&tcp_death_row->tw_refcount)) - kfree(tcp_death_row); } -static int __net_init tcp_sk_init(struct net *net) +static void __net_init tcp_set_hashinfo(struct net *net) { - int cnt; + struct inet_hashinfo *hinfo; + unsigned int ehash_entries; + struct net *old_net; + if (net_eq(net, &init_net)) + goto fallback; + + old_net = current->nsproxy->net_ns; + ehash_entries = READ_ONCE(old_net->ipv4.sysctl_tcp_child_ehash_entries); + if (!ehash_entries) + goto fallback; + + ehash_entries = roundup_pow_of_two(ehash_entries); + hinfo = inet_pernet_hashinfo_alloc(&tcp_hashinfo, ehash_entries); + if (!hinfo) { + pr_warn("Failed to allocate TCP ehash (entries: %u) " + "for a netns, fallback to the global one\n", + ehash_entries); +fallback: + hinfo = &tcp_hashinfo; + ehash_entries = tcp_hashinfo.ehash_mask + 1; + } + + net->ipv4.tcp_death_row.hashinfo = hinfo; + net->ipv4.tcp_death_row.sysctl_max_tw_buckets = ehash_entries / 2; + net->ipv4.sysctl_max_syn_backlog = max(128U, ehash_entries / 128); +} + +static int __net_init tcp_sk_init(struct net *net) +{ net->ipv4.sysctl_tcp_ecn = 2; net->ipv4.sysctl_tcp_ecn_fallback = 1; @@ -3129,15 +3169,9 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_tw_reuse = 2; net->ipv4.sysctl_tcp_no_ssthresh_metrics_save = 1; - net->ipv4.tcp_death_row = kzalloc(sizeof(struct inet_timewait_death_row), GFP_KERNEL); - if (!net->ipv4.tcp_death_row) - return -ENOMEM; - refcount_set(&net->ipv4.tcp_death_row->tw_refcount, 1); - cnt = tcp_hashinfo.ehash_mask + 1; - net->ipv4.tcp_death_row->sysctl_max_tw_buckets = cnt / 2; - net->ipv4.tcp_death_row->hashinfo = &tcp_hashinfo; + refcount_set(&net->ipv4.tcp_death_row.tw_refcount, 1); + tcp_set_hashinfo(net); - net->ipv4.sysctl_max_syn_backlog = max(128, cnt / 128); net->ipv4.sysctl_tcp_sack = 1; net->ipv4.sysctl_tcp_window_scaling = 1; net->ipv4.sysctl_tcp_timestamps = 1; @@ -3184,6 +3218,14 @@ static int __net_init tcp_sk_init(struct net *net) net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 0; atomic_set(&net->ipv4.tfo_active_disable_times, 0); + /* Set default values for PLB */ + net->ipv4.sysctl_tcp_plb_enabled = 0; /* Disabled by default */ + net->ipv4.sysctl_tcp_plb_idle_rehash_rounds = 3; + net->ipv4.sysctl_tcp_plb_rehash_rounds = 12; + net->ipv4.sysctl_tcp_plb_suspend_rto_sec = 60; + /* Default congestion threshold for PLB to mark a round is 50% */ + net->ipv4.sysctl_tcp_plb_cong_thresh = (1 << TCP_PLB_SCALE) / 2; + /* Reno is always built in */ if (!net_eq(net, &init_net) && bpf_try_module_get(init_net.ipv4.tcp_congestion_control, @@ -3199,10 +3241,13 @@ static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list) { struct net *net; - inet_twsk_purge(&tcp_hashinfo, AF_INET); + tcp_twsk_purge(net_exit_list, AF_INET); - list_for_each_entry(net, net_exit_list, exit_list) + list_for_each_entry(net, net_exit_list, exit_list) { + inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo); + WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount)); tcp_fastopen_ctx_destroy(net); + } } static struct pernet_operations __net_initdata tcp_sk_ops = { diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c index cb95d88497ae..c375f603a16c 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c @@ -247,10 +247,10 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) { const struct inet_connection_sock *icsk = inet_csk(sk); const struct tcp_sock *tp = tcp_sk(sk); + struct net *net = sock_net(sk); struct inet_timewait_sock *tw; - struct inet_timewait_death_row *tcp_death_row = sock_net(sk)->ipv4.tcp_death_row; - tw = inet_twsk_alloc(sk, tcp_death_row, state); + tw = inet_twsk_alloc(sk, &net->ipv4.tcp_death_row, state); if (tw) { struct tcp_timewait_sock *tcptw = tcp_twsk((struct sock *)tw); @@ -319,14 +319,14 @@ void tcp_time_wait(struct sock *sk, int state, int timeo) /* Linkage updates. * Note that access to tw after this point is illegal. */ - inet_twsk_hashdance(tw, sk, &tcp_hashinfo); + inet_twsk_hashdance(tw, sk, net->ipv4.tcp_death_row.hashinfo); local_bh_enable(); } else { /* Sorry, if we're out of memory, just CLOSE this * socket up. We've got bigger problems than * non-graceful socket closings. */ - NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPTIMEWAITOVERFLOW); + NET_INC_STATS(net, LINUX_MIB_TCPTIMEWAITOVERFLOW); } tcp_update_metrics(sk); @@ -347,6 +347,27 @@ void tcp_twsk_destructor(struct sock *sk) } EXPORT_SYMBOL_GPL(tcp_twsk_destructor); +void tcp_twsk_purge(struct list_head *net_exit_list, int family) +{ + bool purged_once = false; + struct net *net; + + list_for_each_entry(net, net_exit_list, exit_list) { + if (net->ipv4.tcp_death_row.hashinfo->pernet) { + /* Even if tw_refcount == 1, we must clean up kernel reqsk */ + inet_twsk_purge(net->ipv4.tcp_death_row.hashinfo, family); + } else if (!purged_once) { + /* The last refcount is decremented in tcp_sk_exit_batch() */ + if (refcount_read(&net->ipv4.tcp_death_row.tw_refcount) == 1) + continue; + + inet_twsk_purge(&tcp_hashinfo, family); + purged_once = true; + } + } +} +EXPORT_SYMBOL_GPL(tcp_twsk_purge); + /* Warning : This function is called without sk_listener being locked. * Be sure to read socket fields once, as their value could change under us. */ @@ -541,6 +562,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk, newtp->fastopen_req = NULL; RCU_INIT_POINTER(newtp->fastopen_rsk, NULL); + newtp->bpf_chg_cc_inprogress = 0; tcp_bpf_clone(sk, newsk); __TCP_INC_STATS(sock_net(sk), TCP_MIB_PASSIVEOPENS); diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c index a844a0d38482..45dda7889387 100644 --- a/net/ipv4/tcp_offload.c +++ b/net/ipv4/tcp_offload.c @@ -255,7 +255,15 @@ found: mss = skb_shinfo(p)->gso_size; - flush |= (len - 1) >= mss; + /* If skb is a GRO packet, make sure its gso_size matches prior packet mss. + * If it is a single frame, do not aggregate it if its length + * is bigger than our mss. + */ + if (unlikely(skb_is_gso(skb))) + flush |= (mss != skb_shinfo(skb)->gso_size); + else + flush |= (len - 1) >= mss; + flush |= (ntohl(th2->seq) + skb_gro_len(p)) ^ ntohl(th->seq); #ifdef CONFIG_TLS_DEVICE flush |= p->decrypted ^ skb->decrypted; @@ -269,7 +277,12 @@ found: tcp_flag_word(th2) |= flags & (TCP_FLAG_FIN | TCP_FLAG_PSH); out_check_final: - flush = len < mss; + /* Force a flush if last segment is smaller than mss. */ + if (unlikely(skb_is_gso(skb))) + flush = len != NAPI_GRO_CB(skb)->count * skb_shinfo(skb)->gso_size; + else + flush = len < mss; + flush |= (__force int)(flags & (TCP_FLAG_URG | TCP_FLAG_PSH | TCP_FLAG_RST | TCP_FLAG_SYN | TCP_FLAG_FIN)); diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c index 290019de766d..c69f4d966024 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -1875,15 +1875,20 @@ static void tcp_cwnd_validate(struct sock *sk, bool is_cwnd_limited) const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; struct tcp_sock *tp = tcp_sk(sk); - /* Track the maximum number of outstanding packets in each - * window, and remember whether we were cwnd-limited then. + /* Track the strongest available signal of the degree to which the cwnd + * is fully utilized. If cwnd-limited then remember that fact for the + * current window. If not cwnd-limited then track the maximum number of + * outstanding packets in the current window. (If cwnd-limited then we + * chose to not update tp->max_packets_out to avoid an extra else + * clause with no functional impact.) */ - if (!before(tp->snd_una, tp->max_packets_seq) || - tp->packets_out > tp->max_packets_out || - is_cwnd_limited) { - tp->max_packets_out = tp->packets_out; - tp->max_packets_seq = tp->snd_nxt; + if (!before(tp->snd_una, tp->cwnd_usage_seq) || + is_cwnd_limited || + (!tp->is_cwnd_limited && + tp->packets_out > tp->max_packets_out)) { tp->is_cwnd_limited = is_cwnd_limited; + tp->max_packets_out = tp->packets_out; + tp->cwnd_usage_seq = tp->snd_nxt; } if (tcp_is_cwnd_limited(sk)) { diff --git a/net/ipv4/tcp_plb.c b/net/ipv4/tcp_plb.c new file mode 100644 index 000000000000..bb1a08fda113 --- /dev/null +++ b/net/ipv4/tcp_plb.c @@ -0,0 +1,109 @@ +/* Protective Load Balancing (PLB) + * + * PLB was designed to reduce link load imbalance across datacenter + * switches. PLB is a host-based optimization; it leverages congestion + * signals from the transport layer to randomly change the path of the + * connection experiencing sustained congestion. PLB prefers to repath + * after idle periods to minimize packet reordering. It repaths by + * changing the IPv6 Flow Label on the packets of a connection, which + * datacenter switches include as part of ECMP/WCMP hashing. + * + * PLB is described in detail in: + * + * Mubashir Adnan Qureshi, Yuchung Cheng, Qianwen Yin, Qiaobin Fu, + * Gautam Kumar, Masoud Moshref, Junhua Yan, Van Jacobson, + * David Wetherall,Abdul Kabbani: + * "PLB: Congestion Signals are Simple and Effective for + * Network Load Balancing" + * In ACM SIGCOMM 2022, Amsterdam Netherlands. + * + */ + +#include <net/tcp.h> + +/* Called once per round-trip to update PLB state for a connection. */ +void tcp_plb_update_state(const struct sock *sk, struct tcp_plb_state *plb, + const int cong_ratio) +{ + struct net *net = sock_net(sk); + + if (!READ_ONCE(net->ipv4.sysctl_tcp_plb_enabled)) + return; + + if (cong_ratio >= 0) { + if (cong_ratio < READ_ONCE(net->ipv4.sysctl_tcp_plb_cong_thresh)) + plb->consec_cong_rounds = 0; + else if (plb->consec_cong_rounds < + READ_ONCE(net->ipv4.sysctl_tcp_plb_rehash_rounds)) + plb->consec_cong_rounds++; + } +} +EXPORT_SYMBOL_GPL(tcp_plb_update_state); + +/* Check whether recent congestion has been persistent enough to warrant + * a load balancing decision that switches the connection to another path. + */ +void tcp_plb_check_rehash(struct sock *sk, struct tcp_plb_state *plb) +{ + struct net *net = sock_net(sk); + u32 max_suspend; + bool forced_rehash = false, idle_rehash = false; + + if (!READ_ONCE(net->ipv4.sysctl_tcp_plb_enabled)) + return; + + forced_rehash = plb->consec_cong_rounds >= + READ_ONCE(net->ipv4.sysctl_tcp_plb_rehash_rounds); + /* If sender goes idle then we check whether to rehash. */ + idle_rehash = READ_ONCE(net->ipv4.sysctl_tcp_plb_idle_rehash_rounds) && + !tcp_sk(sk)->packets_out && + plb->consec_cong_rounds >= + READ_ONCE(net->ipv4.sysctl_tcp_plb_idle_rehash_rounds); + + if (!forced_rehash && !idle_rehash) + return; + + /* Note that tcp_jiffies32 can wrap; we detect wraps by checking for + * cases where the max suspension end is before the actual suspension + * end. We clear pause_until to 0 to indicate there is no recent + * RTO event that constrains PLB rehashing. + */ + max_suspend = 2 * READ_ONCE(net->ipv4.sysctl_tcp_plb_suspend_rto_sec) * HZ; + if (plb->pause_until && + (!before(tcp_jiffies32, plb->pause_until) || + before(tcp_jiffies32 + max_suspend, plb->pause_until))) + plb->pause_until = 0; + + if (plb->pause_until) + return; + + sk_rethink_txhash(sk); + plb->consec_cong_rounds = 0; + tcp_sk(sk)->plb_rehash++; + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPLBREHASH); +} +EXPORT_SYMBOL_GPL(tcp_plb_check_rehash); + +/* Upon RTO, disallow load balancing for a while, to avoid having load + * balancing decisions switch traffic to a black-holed path that was + * previously avoided with a sk_rethink_txhash() call at RTO time. + */ +void tcp_plb_update_state_upon_rto(struct sock *sk, struct tcp_plb_state *plb) +{ + struct net *net = sock_net(sk); + u32 pause; + + if (!READ_ONCE(net->ipv4.sysctl_tcp_plb_enabled)) + return; + + pause = READ_ONCE(net->ipv4.sysctl_tcp_plb_suspend_rto_sec) * HZ; + pause += prandom_u32_max(pause); + plb->pause_until = tcp_jiffies32 + pause; + + /* Reset PLB state upon RTO, since an RTO causes a sk_rethink_txhash() call + * that may switch this connection to a path with completely different + * congestion characteristics. + */ + plb->consec_cong_rounds = 0; +} +EXPORT_SYMBOL_GPL(tcp_plb_update_state_upon_rto); diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c index 34eda973bbf1..89accc3c8bb3 100644 --- a/net/ipv4/udp.c +++ b/net/ipv4/udp.c @@ -246,7 +246,7 @@ int udp_lib_get_port(struct sock *sk, unsigned short snum, inet_get_local_port_range(net, &low, &high); remaining = (high - low) + 1; - rand = prandom_u32(); + rand = get_random_u32(); first = reciprocal_scale(rand, remaining) + low; /* * force rand to be an odd multiple of UDP_HTABLE_SIZE @@ -448,7 +448,7 @@ static struct sock *udp4_lib_lookup2(struct net *net, result = lookup_reuseport(net, sk, skb, saddr, sport, daddr, hnum); /* Fall back to scoring if group has connections */ - if (result && !reuseport_has_conns(sk, false)) + if (result && !reuseport_has_conns(sk)) return result; result = result ? : sk; @@ -783,6 +783,8 @@ int __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable) */ if (tunnel) { /* ...not for tunnels though: we don't have a sending socket */ + if (udp_sk(sk)->encap_err_rcv) + udp_sk(sk)->encap_err_rcv(sk, skb, iph->ihl << 2); goto out; } if (!inet->recverr) { @@ -1446,7 +1448,7 @@ static void udp_rmem_release(struct sock *sk, int size, int partial, if (likely(partial)) { up->forward_deficit += size; size = up->forward_deficit; - if (size < (sk->sk_rcvbuf >> 2) && + if (size < READ_ONCE(up->forward_threshold) && !skb_queue_empty(&up->reader_queue)) return; } else { @@ -1596,7 +1598,7 @@ drop: } EXPORT_SYMBOL_GPL(__udp_enqueue_schedule_skb); -void udp_destruct_sock(struct sock *sk) +void udp_destruct_common(struct sock *sk) { /* reclaim completely the forward allocated memory */ struct udp_sock *up = udp_sk(sk); @@ -1609,18 +1611,22 @@ void udp_destruct_sock(struct sock *sk) kfree_skb(skb); } udp_rmem_release(sk, total, 0, true); +} +EXPORT_SYMBOL_GPL(udp_destruct_common); +static void udp_destruct_sock(struct sock *sk) +{ + udp_destruct_common(sk); inet_sock_destruct(sk); } -EXPORT_SYMBOL_GPL(udp_destruct_sock); int udp_init_sock(struct sock *sk) { - skb_queue_head_init(&udp_sk(sk)->reader_queue); + udp_lib_init_sock(sk); sk->sk_destruct = udp_destruct_sock; + set_bit(SOCK_SUPPORT_ZC, &sk->sk_socket->flags); return 0; } -EXPORT_SYMBOL_GPL(udp_init_sock); void skb_consume_udp(struct sock *sk, struct sk_buff *skb, int len) { @@ -1799,41 +1805,29 @@ EXPORT_SYMBOL(__skb_recv_udp); int udp_read_skb(struct sock *sk, skb_read_actor_t recv_actor) { - int copied = 0; - - while (1) { - struct sk_buff *skb; - int err, used; - - skb = skb_recv_udp(sk, MSG_DONTWAIT, &err); - if (!skb) - return err; + struct sk_buff *skb; + int err, copied; - if (udp_lib_checksum_complete(skb)) { - __UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, - IS_UDPLITE(sk)); - __UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, - IS_UDPLITE(sk)); - atomic_inc(&sk->sk_drops); - kfree_skb(skb); - continue; - } +try_again: + skb = skb_recv_udp(sk, MSG_DONTWAIT, &err); + if (!skb) + return err; - WARN_ON(!skb_set_owner_sk_safe(skb, sk)); - used = recv_actor(sk, skb); - if (used <= 0) { - if (!copied) - copied = used; - kfree_skb(skb); - break; - } else if (used <= skb->len) { - copied += used; - } + if (udp_lib_checksum_complete(skb)) { + int is_udplite = IS_UDPLITE(sk); + struct net *net = sock_net(sk); + __UDP_INC_STATS(net, UDP_MIB_CSUMERRORS, is_udplite); + __UDP_INC_STATS(net, UDP_MIB_INERRORS, is_udplite); + atomic_inc(&sk->sk_drops); kfree_skb(skb); - break; + goto try_again; } + WARN_ON_ONCE(!skb_set_owner_sk_safe(skb, sk)); + copied = recv_actor(sk, skb); + kfree_skb(skb); + return copied; } EXPORT_SYMBOL(udp_read_skb); @@ -2678,6 +2672,18 @@ int udp_lib_setsockopt(struct sock *sk, int level, int optname, int err = 0; int is_udplite = IS_UDPLITE(sk); + if (level == SOL_SOCKET) { + err = sk_setsockopt(sk, level, optname, optval, optlen); + + if (optname == SO_RCVBUF || optname == SO_RCVBUFFORCE) { + sockopt_lock_sock(sk); + /* paired with READ_ONCE in udp_rmem_release() */ + WRITE_ONCE(up->forward_threshold, sk->sk_rcvbuf >> 2); + sockopt_release_sock(sk); + } + return err; + } + if (optlen < sizeof(int)) return -EINVAL; @@ -2791,7 +2797,7 @@ EXPORT_SYMBOL(udp_lib_setsockopt); int udp_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { - if (level == SOL_UDP || level == SOL_UDPLITE) + if (level == SOL_UDP || level == SOL_UDPLITE || level == SOL_SOCKET) return udp_lib_setsockopt(sk, level, optname, optval, optlen, udp_push_pending_frames); diff --git a/net/ipv4/udp_tunnel_core.c b/net/ipv4/udp_tunnel_core.c index 8efaf8c3fe2a..8242c8947340 100644 --- a/net/ipv4/udp_tunnel_core.c +++ b/net/ipv4/udp_tunnel_core.c @@ -72,6 +72,7 @@ void setup_udp_tunnel_sock(struct net *net, struct socket *sock, udp_sk(sk)->encap_type = cfg->encap_type; udp_sk(sk)->encap_rcv = cfg->encap_rcv; + udp_sk(sk)->encap_err_rcv = cfg->encap_err_rcv; udp_sk(sk)->encap_err_lookup = cfg->encap_err_lookup; udp_sk(sk)->encap_destroy = cfg->encap_destroy; udp_sk(sk)->gro_receive = cfg->gro_receive; diff --git a/net/ipv4/udplite.c b/net/ipv4/udplite.c index 6e08a76ae1e7..e0c9cc39b81e 100644 --- a/net/ipv4/udplite.c +++ b/net/ipv4/udplite.c @@ -17,6 +17,14 @@ struct udp_table udplite_table __read_mostly; EXPORT_SYMBOL(udplite_table); +/* Designate sk as UDP-Lite socket */ +static int udplite_sk_init(struct sock *sk) +{ + udp_init_sock(sk); + udp_sk(sk)->pcflag = UDPLITE_BIT; + return 0; +} + static int udplite_rcv(struct sk_buff *skb) { return __udp4_lib_rcv(skb, &udplite_table, IPPROTO_UDPLITE); diff --git a/net/ipv4/xfrm4_tunnel.c b/net/ipv4/xfrm4_tunnel.c index 9d4f418f1bf8..8489fa106583 100644 --- a/net/ipv4/xfrm4_tunnel.c +++ b/net/ipv4/xfrm4_tunnel.c @@ -22,13 +22,17 @@ static int ipip_xfrm_rcv(struct xfrm_state *x, struct sk_buff *skb) return ip_hdr(skb)->protocol; } -static int ipip_init_state(struct xfrm_state *x) +static int ipip_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack) { - if (x->props.mode != XFRM_MODE_TUNNEL) + if (x->props.mode != XFRM_MODE_TUNNEL) { + NL_SET_ERR_MSG(extack, "IPv4 tunnel can only be used with tunnel mode"); return -EINVAL; + } - if (x->encap) + if (x->encap) { + NL_SET_ERR_MSG(extack, "IPv4 tunnel is not compatible with encapsulation"); return -EINVAL; + } x->props.header_len = sizeof(struct iphdr); diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index e15f64f22fa8..9c3f5202a97b 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -104,7 +104,7 @@ static inline u32 cstamp_delta(unsigned long cstamp) static inline s32 rfc3315_s14_backoff_init(s32 irt) { /* multiply 'initial retransmission time' by 0.9 .. 1.1 */ - u64 tmp = (900000 + prandom_u32() % 200001) * (u64)irt; + u64 tmp = (900000 + prandom_u32_max(200001)) * (u64)irt; do_div(tmp, 1000000); return (s32)tmp; } @@ -112,11 +112,11 @@ static inline s32 rfc3315_s14_backoff_init(s32 irt) static inline s32 rfc3315_s14_backoff_update(s32 rt, s32 mrt) { /* multiply 'retransmission timeout' by 1.9 .. 2.1 */ - u64 tmp = (1900000 + prandom_u32() % 200001) * (u64)rt; + u64 tmp = (1900000 + prandom_u32_max(200001)) * (u64)rt; do_div(tmp, 1000000); if ((s32)tmp > mrt) { /* multiply 'maximum retransmission time' by 0.9 .. 1.1 */ - tmp = (900000 + prandom_u32() % 200001) * (u64)mrt; + tmp = (900000 + prandom_u32_max(200001)) * (u64)mrt; do_div(tmp, 1000000); } return (s32)tmp; @@ -3557,11 +3557,15 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event, fallthrough; case NETDEV_UP: case NETDEV_CHANGE: - if (dev->flags & IFF_SLAVE) + if (idev && idev->cnf.disable_ipv6) break; - if (idev && idev->cnf.disable_ipv6) + if (dev->flags & IFF_SLAVE) { + if (event == NETDEV_UP && !IS_ERR_OR_NULL(idev) && + dev->flags & IFF_UP && dev->flags & IFF_MULTICAST) + ipv6_mc_up(idev); break; + } if (event == NETDEV_UP) { /* restore routes for permanent addresses */ @@ -3963,7 +3967,7 @@ static void addrconf_dad_kick(struct inet6_ifaddr *ifp) if (ifp->flags & IFA_F_OPTIMISTIC) rand_num = 0; else - rand_num = prandom_u32() % (idev->cnf.rtr_solicit_delay ? : 1); + rand_num = prandom_u32_max(idev->cnf.rtr_solicit_delay ?: 1); nonce = 0; if (idev->cnf.enhanced_dad || @@ -7210,9 +7214,11 @@ err_reg_dflt: __addrconf_sysctl_unregister(net, all, NETCONFA_IFINDEX_ALL); err_reg_all: kfree(dflt); + net->ipv6.devconf_dflt = NULL; #endif err_alloc_dflt: kfree(all); + net->ipv6.devconf_all = NULL; err_alloc_all: kfree(net->ipv6.inet6_addr_lst); err_alloc_addr: diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c index 2ce0c44d0081..68075295d587 100644 --- a/net/ipv6/af_inet6.c +++ b/net/ipv6/af_inet6.c @@ -109,6 +109,13 @@ static __inline__ struct ipv6_pinfo *inet6_sk_generic(struct sock *sk) return (struct ipv6_pinfo *)(((u8 *)sk) + offset); } +void inet6_sock_destruct(struct sock *sk) +{ + inet6_cleanup_sock(sk); + inet_sock_destruct(sk); +} +EXPORT_SYMBOL_GPL(inet6_sock_destruct); + static int inet6_create(struct net *net, struct socket *sock, int protocol, int kern) { @@ -201,7 +208,7 @@ lookup_protocol: inet->hdrincl = 1; } - sk->sk_destruct = inet_sock_destruct; + sk->sk_destruct = inet6_sock_destruct; sk->sk_family = PF_INET6; sk->sk_protocol = protocol; @@ -483,7 +490,7 @@ int inet6_release(struct socket *sock) } EXPORT_SYMBOL(inet6_release); -void inet6_destroy_sock(struct sock *sk) +void inet6_cleanup_sock(struct sock *sk) { struct ipv6_pinfo *np = inet6_sk(sk); struct sk_buff *skb; @@ -508,7 +515,7 @@ void inet6_destroy_sock(struct sock *sk) txopt_put(opt); } } -EXPORT_SYMBOL_GPL(inet6_destroy_sock); +EXPORT_SYMBOL_GPL(inet6_cleanup_sock); /* * This does both peername and sockname. @@ -1057,6 +1064,8 @@ static const struct ipv6_stub ipv6_stub_impl = { static const struct ipv6_bpf_stub ipv6_bpf_stub_impl = { .inet6_bind = __inet6_bind, .udp6_lib_lookup = __udp6_lib_lookup, + .ipv6_setsockopt = do_ipv6_setsockopt, + .ipv6_getsockopt = do_ipv6_getsockopt, }; static int __init inet6_init(void) @@ -1070,13 +1079,13 @@ static int __init inet6_init(void) for (r = &inetsw6[0]; r < &inetsw6[SOCK_MAX]; ++r) INIT_LIST_HEAD(r); + raw_hashinfo_init(&raw_v6_hashinfo); + if (disable_ipv6_mod) { pr_info("Loaded, but administratively disabled, reboot required to enable\n"); goto out; } - raw_hashinfo_init(&raw_v6_hashinfo); - err = proto_register(&tcpv6_prot, 1); if (err) goto out; diff --git a/net/ipv6/ah6.c b/net/ipv6/ah6.c index b5995c1f4d7a..5228d2716289 100644 --- a/net/ipv6/ah6.c +++ b/net/ipv6/ah6.c @@ -666,30 +666,38 @@ static int ah6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, return 0; } -static int ah6_init_state(struct xfrm_state *x) +static int ah6_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack) { struct ah_data *ahp = NULL; struct xfrm_algo_desc *aalg_desc; struct crypto_ahash *ahash; - if (!x->aalg) + if (!x->aalg) { + NL_SET_ERR_MSG(extack, "AH requires a state with an AUTH algorithm"); goto error; + } - if (x->encap) + if (x->encap) { + NL_SET_ERR_MSG(extack, "AH is not compatible with encapsulation"); goto error; + } ahp = kzalloc(sizeof(*ahp), GFP_KERNEL); if (!ahp) return -ENOMEM; ahash = crypto_alloc_ahash(x->aalg->alg_name, 0, 0); - if (IS_ERR(ahash)) + if (IS_ERR(ahash)) { + NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations"); goto error; + } ahp->ahash = ahash; if (crypto_ahash_setkey(ahash, x->aalg->alg_key, - (x->aalg->alg_key_len + 7) / 8)) + (x->aalg->alg_key_len + 7) / 8)) { + NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations"); goto error; + } /* * Lookup the algorithm description maintained by xfrm_algo, @@ -702,9 +710,7 @@ static int ah6_init_state(struct xfrm_state *x) if (aalg_desc->uinfo.auth.icv_fullbits/8 != crypto_ahash_digestsize(ahash)) { - pr_info("AH: %s digestsize %u != %u\n", - x->aalg->alg_name, crypto_ahash_digestsize(ahash), - aalg_desc->uinfo.auth.icv_fullbits/8); + NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations"); goto error; } @@ -721,6 +727,7 @@ static int ah6_init_state(struct xfrm_state *x) x->props.header_len += sizeof(struct ipv6hdr); break; default: + NL_SET_ERR_MSG(extack, "Invalid mode requested for AH, must be one of TRANSPORT, TUNNEL, BEET"); goto error; } x->data = ahp; diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c index df665d4e8f0f..df7e032ce87d 100644 --- a/net/ipv6/datagram.c +++ b/net/ipv6/datagram.c @@ -256,7 +256,7 @@ ipv4_connected: goto out; } - reuseport_has_conns(sk, true); + reuseport_has_conns_set(sk); sk->sk_state = TCP_ESTABLISHED; sk_set_txhash(sk); out: @@ -771,7 +771,7 @@ int ip6_datagram_send_ctl(struct net *net, struct sock *sk, } if (cmsg->cmsg_level == SOL_SOCKET) { - err = __sock_cmsg_send(sk, msg, cmsg, &ipc6->sockc); + err = __sock_cmsg_send(sk, cmsg, &ipc6->sockc); if (err) return err; continue; diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c index 8220923a12f7..14ed868680c6 100644 --- a/net/ipv6/esp6.c +++ b/net/ipv6/esp6.c @@ -151,6 +151,7 @@ static void esp_free_tcp_sk(struct rcu_head *head) static struct sock *esp6_find_tcp_sk(struct xfrm_state *x) { struct xfrm_encap_tmpl *encap = x->encap; + struct net *net = xs_net(x); struct esp_tcp_sk *esk; __be16 sport, dport; struct sock *nsk; @@ -177,7 +178,7 @@ static struct sock *esp6_find_tcp_sk(struct xfrm_state *x) } spin_unlock_bh(&x->lock); - sk = __inet6_lookup_established(xs_net(x), &tcp_hashinfo, &x->id.daddr.in6, + sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, &x->id.daddr.in6, dport, &x->props.saddr.in6, ntohs(sport), 0, 0); if (!sk) return ERR_PTR(-ENOENT); @@ -1050,16 +1051,17 @@ static void esp6_destroy(struct xfrm_state *x) crypto_free_aead(aead); } -static int esp_init_aead(struct xfrm_state *x) +static int esp_init_aead(struct xfrm_state *x, struct netlink_ext_ack *extack) { char aead_name[CRYPTO_MAX_ALG_NAME]; struct crypto_aead *aead; int err; - err = -ENAMETOOLONG; if (snprintf(aead_name, CRYPTO_MAX_ALG_NAME, "%s(%s)", - x->geniv, x->aead->alg_name) >= CRYPTO_MAX_ALG_NAME) - goto error; + x->geniv, x->aead->alg_name) >= CRYPTO_MAX_ALG_NAME) { + NL_SET_ERR_MSG(extack, "Algorithm name is too long"); + return -ENAMETOOLONG; + } aead = crypto_alloc_aead(aead_name, 0, 0); err = PTR_ERR(aead); @@ -1077,11 +1079,15 @@ static int esp_init_aead(struct xfrm_state *x) if (err) goto error; + return 0; + error: + NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations"); return err; } -static int esp_init_authenc(struct xfrm_state *x) +static int esp_init_authenc(struct xfrm_state *x, + struct netlink_ext_ack *extack) { struct crypto_aead *aead; struct crypto_authenc_key_param *param; @@ -1092,10 +1098,6 @@ static int esp_init_authenc(struct xfrm_state *x) unsigned int keylen; int err; - err = -EINVAL; - if (!x->ealg) - goto error; - err = -ENAMETOOLONG; if ((x->props.flags & XFRM_STATE_ESN)) { @@ -1104,22 +1106,28 @@ static int esp_init_authenc(struct xfrm_state *x) x->geniv ?: "", x->geniv ? "(" : "", x->aalg ? x->aalg->alg_name : "digest_null", x->ealg->alg_name, - x->geniv ? ")" : "") >= CRYPTO_MAX_ALG_NAME) + x->geniv ? ")" : "") >= CRYPTO_MAX_ALG_NAME) { + NL_SET_ERR_MSG(extack, "Algorithm name is too long"); goto error; + } } else { if (snprintf(authenc_name, CRYPTO_MAX_ALG_NAME, "%s%sauthenc(%s,%s)%s", x->geniv ?: "", x->geniv ? "(" : "", x->aalg ? x->aalg->alg_name : "digest_null", x->ealg->alg_name, - x->geniv ? ")" : "") >= CRYPTO_MAX_ALG_NAME) + x->geniv ? ")" : "") >= CRYPTO_MAX_ALG_NAME) { + NL_SET_ERR_MSG(extack, "Algorithm name is too long"); goto error; + } } aead = crypto_alloc_aead(authenc_name, 0, 0); err = PTR_ERR(aead); - if (IS_ERR(aead)) + if (IS_ERR(aead)) { + NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations"); goto error; + } x->data = aead; @@ -1149,17 +1157,16 @@ static int esp_init_authenc(struct xfrm_state *x) err = -EINVAL; if (aalg_desc->uinfo.auth.icv_fullbits / 8 != crypto_aead_authsize(aead)) { - pr_info("ESP: %s digestsize %u != %u\n", - x->aalg->alg_name, - crypto_aead_authsize(aead), - aalg_desc->uinfo.auth.icv_fullbits / 8); + NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations"); goto free_key; } err = crypto_aead_setauthsize( aead, x->aalg->alg_trunc_len / 8); - if (err) + if (err) { + NL_SET_ERR_MSG(extack, "Kernel was unable to initialize cryptographic operations"); goto free_key; + } } param->enckeylen = cpu_to_be32((x->ealg->alg_key_len + 7) / 8); @@ -1174,7 +1181,7 @@ error: return err; } -static int esp6_init_state(struct xfrm_state *x) +static int esp6_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack) { struct crypto_aead *aead; u32 align; @@ -1182,10 +1189,14 @@ static int esp6_init_state(struct xfrm_state *x) x->data = NULL; - if (x->aead) - err = esp_init_aead(x); - else - err = esp_init_authenc(x); + if (x->aead) { + err = esp_init_aead(x, extack); + } else if (x->ealg) { + err = esp_init_authenc(x, extack); + } else { + NL_SET_ERR_MSG(extack, "ESP: AEAD or CRYPT must be provided"); + err = -EINVAL; + } if (err) goto error; @@ -1213,6 +1224,7 @@ static int esp6_init_state(struct xfrm_state *x) switch (encap->encap_type) { default: + NL_SET_ERR_MSG(extack, "Unsupported encapsulation type for ESP"); err = -EINVAL; goto error; case UDP_ENCAP_ESPINUDP: diff --git a/net/ipv6/esp6_offload.c b/net/ipv6/esp6_offload.c index 3a293838a91d..79d43548279c 100644 --- a/net/ipv6/esp6_offload.c +++ b/net/ipv6/esp6_offload.c @@ -145,7 +145,10 @@ static struct sk_buff *xfrm6_tunnel_gso_segment(struct xfrm_state *x, struct sk_buff *skb, netdev_features_t features) { - return skb_eth_gso_segment(skb, features, htons(ETH_P_IPV6)); + __be16 type = x->inner_mode.family == AF_INET ? htons(ETH_P_IP) + : htons(ETH_P_IPV6); + + return skb_eth_gso_segment(skb, features, type); } static struct sk_buff *xfrm6_transport_gso_segment(struct xfrm_state *x, diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c index 7d53d62783b1..b64b49012655 100644 --- a/net/ipv6/inet6_hashtables.c +++ b/net/ipv6/inet6_hashtables.c @@ -21,8 +21,6 @@ #include <net/ip.h> #include <net/sock_reuseport.h> -extern struct inet_hashinfo tcp_hashinfo; - u32 inet6_ehashfn(const struct net *net, const struct in6_addr *laddr, const u16 lport, const struct in6_addr *faddr, const __be16 fport) @@ -169,7 +167,7 @@ static inline struct sock *inet6_lookup_run_bpf(struct net *net, struct sock *sk, *reuse_sk; bool no_reuseport; - if (hashinfo != &tcp_hashinfo) + if (hashinfo != net->ipv4.tcp_death_row.hashinfo) return NULL; /* only TCP is supported */ no_reuseport = bpf_sk_lookup_run_v6(net, IPPROTO_TCP, saddr, sport, diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c index ceb85c67ce39..18481eb76a0a 100644 --- a/net/ipv6/ip6_flowlabel.c +++ b/net/ipv6/ip6_flowlabel.c @@ -220,7 +220,7 @@ static struct ip6_flowlabel *fl_intern(struct net *net, spin_lock_bh(&ip6_fl_lock); if (label == 0) { for (;;) { - fl->label = htonl(prandom_u32())&IPV6_FLOWLABEL_MASK; + fl->label = htonl(get_random_u32())&IPV6_FLOWLABEL_MASK; if (fl->label) { lfl = __fl_lookup(net, fl->label); if (!lfl) diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c index 48b4ff0294f6..7673e1dd1147 100644 --- a/net/ipv6/ip6_gre.c +++ b/net/ipv6/ip6_gre.c @@ -870,26 +870,6 @@ static inline int ip6gre_xmit_ipv6(struct sk_buff *skb, struct net_device *dev) return 0; } -/** - * ip6gre_tnl_addr_conflict - compare packet addresses to tunnel's own - * @t: the outgoing tunnel device - * @hdr: IPv6 header from the incoming packet - * - * Description: - * Avoid trivial tunneling loop by checking that tunnel exit-point - * doesn't match source of incoming packet. - * - * Return: - * 1 if conflict, - * 0 else - **/ - -static inline bool ip6gre_tnl_addr_conflict(const struct ip6_tnl *t, - const struct ipv6hdr *hdr) -{ - return ipv6_addr_equal(&t->parms.raddr, &hdr->saddr); -} - static int ip6gre_xmit_other(struct sk_buff *skb, struct net_device *dev) { struct ip6_tnl *t = netdev_priv(dev); @@ -1175,14 +1155,16 @@ static void ip6gre_tnl_link_config_route(struct ip6_tnl *t, int set_mtu, dev->needed_headroom = dst_len; if (set_mtu) { - dev->mtu = rt->dst.dev->mtu - t_hlen; + int mtu = rt->dst.dev->mtu - t_hlen; + if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) - dev->mtu -= 8; + mtu -= 8; if (dev->type == ARPHRD_ETHER) - dev->mtu -= ETH_HLEN; + mtu -= ETH_HLEN; - if (dev->mtu < IPV6_MIN_MTU) - dev->mtu = IPV6_MIN_MTU; + if (mtu < IPV6_MIN_MTU) + mtu = IPV6_MIN_MTU; + WRITE_ONCE(dev->mtu, mtu); } } ip6_rt_put(rt); diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c index d37a8c97e6de..3ee345672849 100644 --- a/net/ipv6/ip6_offload.c +++ b/net/ipv6/ip6_offload.c @@ -219,7 +219,7 @@ INDIRECT_CALLABLE_SCOPE struct sk_buff *ipv6_gro_receive(struct list_head *head, off = skb_gro_offset(skb); hlen = off + sizeof(*iph); - iph = skb_gro_header_slow(skb, hlen, off); + iph = skb_gro_header(skb, hlen, off); if (unlikely(!iph)) goto out; @@ -232,7 +232,7 @@ INDIRECT_CALLABLE_SCOPE struct sk_buff *ipv6_gro_receive(struct list_head *head, proto = iph->nexthdr; ops = rcu_dereference(inet6_offloads[proto]); if (!ops || !ops->callbacks.gro_receive) { - __pskb_pull(skb, skb_gro_offset(skb)); + pskb_pull(skb, skb_gro_offset(skb)); skb_gro_frag0_invalidate(skb); proto = ipv6_gso_pull_exthdrs(skb, proto); skb_gro_pull(skb, -skb_transport_offset(skb)); diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c index a60176e913a8..e19507614f64 100644 --- a/net/ipv6/ip6_output.c +++ b/net/ipv6/ip6_output.c @@ -1567,7 +1567,7 @@ emsgsize: paged = true; zc = true; } else { - uarg->zerocopy = 0; + uarg_to_msgzc(uarg)->zerocopy = 0; skb_zcopy_set(skb, uarg, &extra_uref); } } diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c index 9e97f3b4c7e8..2fb4c6ad7243 100644 --- a/net/ipv6/ip6_tunnel.c +++ b/net/ipv6/ip6_tunnel.c @@ -1450,8 +1450,8 @@ static void ip6_tnl_link_config(struct ip6_tnl *t) struct net_device *tdev = NULL; struct __ip6_tnl_parm *p = &t->parms; struct flowi6 *fl6 = &t->fl.u.ip6; - unsigned int mtu; int t_hlen; + int mtu; __dev_addr_set(dev, &p->laddr, sizeof(struct in6_addr)); memcpy(dev->broadcast, &p->raddr, sizeof(struct in6_addr)); @@ -1498,12 +1498,13 @@ static void ip6_tnl_link_config(struct ip6_tnl *t) dev->hard_header_len = tdev->hard_header_len + t_hlen; mtu = min_t(unsigned int, tdev->mtu, IP6_MAX_MTU); - dev->mtu = mtu - t_hlen; + mtu = mtu - t_hlen; if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT)) - dev->mtu -= 8; + mtu -= 8; - if (dev->mtu < IPV6_MIN_MTU) - dev->mtu = IPV6_MIN_MTU; + if (mtu < IPV6_MIN_MTU) + mtu = IPV6_MIN_MTU; + WRITE_ONCE(dev->mtu, mtu); } } } @@ -1988,39 +1989,6 @@ static void ip6_tnl_netlink_parms(struct nlattr *data[], parms->fwmark = nla_get_u32(data[IFLA_IPTUN_FWMARK]); } -static bool ip6_tnl_netlink_encap_parms(struct nlattr *data[], - struct ip_tunnel_encap *ipencap) -{ - bool ret = false; - - memset(ipencap, 0, sizeof(*ipencap)); - - if (!data) - return ret; - - if (data[IFLA_IPTUN_ENCAP_TYPE]) { - ret = true; - ipencap->type = nla_get_u16(data[IFLA_IPTUN_ENCAP_TYPE]); - } - - if (data[IFLA_IPTUN_ENCAP_FLAGS]) { - ret = true; - ipencap->flags = nla_get_u16(data[IFLA_IPTUN_ENCAP_FLAGS]); - } - - if (data[IFLA_IPTUN_ENCAP_SPORT]) { - ret = true; - ipencap->sport = nla_get_be16(data[IFLA_IPTUN_ENCAP_SPORT]); - } - - if (data[IFLA_IPTUN_ENCAP_DPORT]) { - ret = true; - ipencap->dport = nla_get_be16(data[IFLA_IPTUN_ENCAP_DPORT]); - } - - return ret; -} - static int ip6_tnl_newlink(struct net *src_net, struct net_device *dev, struct nlattr *tb[], struct nlattr *data[], struct netlink_ext_ack *extack) @@ -2033,7 +2001,7 @@ static int ip6_tnl_newlink(struct net *src_net, struct net_device *dev, nt = netdev_priv(dev); - if (ip6_tnl_netlink_encap_parms(data, &ipencap)) { + if (ip_tunnel_netlink_encap_parms(data, &ipencap)) { err = ip6_tnl_encap_setup(nt, &ipencap); if (err < 0) return err; @@ -2070,7 +2038,7 @@ static int ip6_tnl_changelink(struct net_device *dev, struct nlattr *tb[], if (dev == ip6n->fb_tnl_dev) return -EINVAL; - if (ip6_tnl_netlink_encap_parms(data, &ipencap)) { + if (ip_tunnel_netlink_encap_parms(data, &ipencap)) { int err = ip6_tnl_encap_setup(t, &ipencap); if (err < 0) diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c index 1ddd60d06830..151337d7f67b 100644 --- a/net/ipv6/ip6_vti.c +++ b/net/ipv6/ip6_vti.c @@ -154,7 +154,7 @@ vti6_tnl_link(struct vti6_net *ip6n, struct ip6_tnl *t) { struct ip6_tnl __rcu **tp = vti6_tnl_bucket(ip6n, &t->parms); - rcu_assign_pointer(t->next , rtnl_dereference(*tp)); + rcu_assign_pointer(t->next, rtnl_dereference(*tp)); rcu_assign_pointer(*tp, t); } diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c index a9ba41648e36..facdc78a43e5 100644 --- a/net/ipv6/ip6mr.c +++ b/net/ipv6/ip6mr.c @@ -1028,8 +1028,11 @@ static void ip6mr_cache_resolve(struct net *net, struct mr_table *mrt, ((struct nlmsgerr *)nlmsg_data(nlh))->error = -EMSGSIZE; } rtnl_unicast(skb, net, NETLINK_CB(skb).portid); - } else + } else { + rcu_read_lock(); ip6_mr_forward(net, mrt, skb->dev, skb, c); + rcu_read_unlock(); + } } } @@ -1827,8 +1830,8 @@ int ip6_mroute_setsockopt(struct sock *sk, int optname, sockptr_t optval, * Getsock opt support for the multicast routing system. */ -int ip6_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, - int __user *optlen) +int ip6_mroute_getsockopt(struct sock *sk, int optname, sockptr_t optval, + sockptr_t optlen) { int olr; int val; @@ -1859,16 +1862,16 @@ int ip6_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, return -ENOPROTOOPT; } - if (get_user(olr, optlen)) + if (copy_from_sockptr(&olr, optlen, sizeof(int))) return -EFAULT; olr = min_t(int, olr, sizeof(int)); if (olr < 0) return -EINVAL; - if (put_user(olr, optlen)) + if (copy_to_sockptr(optlen, &olr, sizeof(int))) return -EFAULT; - if (copy_to_user(optval, &val, olr)) + if (copy_to_sockptr(optval, &val, olr)) return -EFAULT; return 0; } diff --git a/net/ipv6/ipcomp6.c b/net/ipv6/ipcomp6.c index 15f984be3570..72d4858dec18 100644 --- a/net/ipv6/ipcomp6.c +++ b/net/ipv6/ipcomp6.c @@ -136,7 +136,8 @@ out: return err; } -static int ipcomp6_init_state(struct xfrm_state *x) +static int ipcomp6_init_state(struct xfrm_state *x, + struct netlink_ext_ack *extack) { int err = -EINVAL; @@ -148,17 +149,20 @@ static int ipcomp6_init_state(struct xfrm_state *x) x->props.header_len += sizeof(struct ipv6hdr); break; default: + NL_SET_ERR_MSG(extack, "Unsupported XFRM mode for IPcomp"); goto out; } - err = ipcomp_init_state(x); + err = ipcomp_init_state(x, extack); if (err) goto out; if (x->props.mode == XFRM_MODE_TUNNEL) { err = ipcomp6_tunnel_attach(x); - if (err) + if (err) { + NL_SET_ERR_MSG(extack, "Kernel error: failed to initialize the associated state"); goto out; + } } err = 0; diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c index e0dcc7a193df..9ce51680290b 100644 --- a/net/ipv6/ipv6_sockglue.c +++ b/net/ipv6/ipv6_sockglue.c @@ -327,7 +327,7 @@ static int ipv6_set_opt_hdr(struct sock *sk, int optname, sockptr_t optval, int err; /* hop-by-hop / destination options are privileged option */ - if (optname != IPV6_RTHDR && !ns_capable(net->user_ns, CAP_NET_RAW)) + if (optname != IPV6_RTHDR && !sockopt_ns_capable(net->user_ns, CAP_NET_RAW)) return -EPERM; /* remove any sticky options header with a zero option @@ -391,8 +391,8 @@ sticky_done: return err; } -static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, - sockptr_t optval, unsigned int optlen) +int do_ipv6_setsockopt(struct sock *sk, int level, int optname, + sockptr_t optval, unsigned int optlen) { struct ipv6_pinfo *np = inet6_sk(sk); struct net *net = sock_net(sk); @@ -417,7 +417,13 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, if (needs_rtnl) rtnl_lock(); - lock_sock(sk); + sockopt_lock_sock(sk); + + /* Another thread has converted the socket into IPv4 with + * IPV6_ADDRFORM concurrently. + */ + if (unlikely(sk->sk_family != AF_INET6)) + goto unlock; switch (optname) { @@ -425,9 +431,6 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, if (optlen < sizeof(int)) goto e_inval; if (val == PF_INET) { - struct ipv6_txoptions *opt; - struct sk_buff *pktopt; - if (sk->sk_type == SOCK_RAW) break; @@ -458,7 +461,6 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, break; } - fl6_free_socklist(sk); __ipv6_sock_mc_close(sk); __ipv6_sock_ac_close(sk); @@ -475,9 +477,10 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, sock_prot_inuse_add(net, sk->sk_prot, -1); sock_prot_inuse_add(net, &tcp_prot, 1); - /* Paired with READ_ONCE(sk->sk_prot) in net/ipv6/af_inet6.c */ + /* Paired with READ_ONCE(sk->sk_prot) in inet6_stream_ops */ WRITE_ONCE(sk->sk_prot, &tcp_prot); - icsk->icsk_af_ops = &ipv4_specific; + /* Paired with READ_ONCE() in tcp_(get|set)sockopt() */ + WRITE_ONCE(icsk->icsk_af_ops, &ipv4_specific); sk->sk_socket->ops = &inet_stream_ops; sk->sk_family = PF_INET; tcp_sync_mss(sk, icsk->icsk_pmtu_cookie); @@ -490,19 +493,19 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, sock_prot_inuse_add(net, sk->sk_prot, -1); sock_prot_inuse_add(net, prot, 1); - /* Paired with READ_ONCE(sk->sk_prot) in net/ipv6/af_inet6.c */ + /* Paired with READ_ONCE(sk->sk_prot) in inet6_dgram_ops */ WRITE_ONCE(sk->sk_prot, prot); sk->sk_socket->ops = &inet_dgram_ops; sk->sk_family = PF_INET; } - opt = xchg((__force struct ipv6_txoptions **)&np->opt, - NULL); - if (opt) { - atomic_sub(opt->tot_len, &sk->sk_omem_alloc); - txopt_put(opt); - } - pktopt = xchg(&np->pktoptions, NULL); - kfree_skb(pktopt); + + /* Disable all options not to allocate memory anymore, + * but there is still a race. See the lockless path + * in udpv6_sendmsg() and ipv6_local_rxpmtu(). + */ + np->rxopt.all = 0; + + inet6_cleanup_sock(sk); /* * ... and add it to the refcnt debug socks count @@ -634,8 +637,8 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname, break; case IPV6_TRANSPARENT: - if (valbool && !ns_capable(net->user_ns, CAP_NET_RAW) && - !ns_capable(net->user_ns, CAP_NET_ADMIN)) { + if (valbool && !sockopt_ns_capable(net->user_ns, CAP_NET_RAW) && + !sockopt_ns_capable(net->user_ns, CAP_NET_ADMIN)) { retv = -EPERM; break; } @@ -946,7 +949,7 @@ done: case IPV6_IPSEC_POLICY: case IPV6_XFRM_POLICY: retv = -EPERM; - if (!ns_capable(net->user_ns, CAP_NET_ADMIN)) + if (!sockopt_ns_capable(net->user_ns, CAP_NET_ADMIN)) break; retv = xfrm_user_policy(sk, optname, optval, optlen); break; @@ -994,17 +997,16 @@ done: break; } - release_sock(sk); +unlock: + sockopt_release_sock(sk); if (needs_rtnl) rtnl_unlock(); return retv; e_inval: - release_sock(sk); - if (needs_rtnl) - rtnl_unlock(); - return -EINVAL; + retv = -EINVAL; + goto unlock; } int ipv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, @@ -1030,7 +1032,7 @@ int ipv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, EXPORT_SYMBOL(ipv6_setsockopt); static int ipv6_getsockopt_sticky(struct sock *sk, struct ipv6_txoptions *opt, - int optname, char __user *optval, int len) + int optname, sockptr_t optval, int len) { struct ipv6_opt_hdr *hdr; @@ -1058,56 +1060,53 @@ static int ipv6_getsockopt_sticky(struct sock *sk, struct ipv6_txoptions *opt, return 0; len = min_t(unsigned int, len, ipv6_optlen(hdr)); - if (copy_to_user(optval, hdr, len)) + if (copy_to_sockptr(optval, hdr, len)) return -EFAULT; return len; } -static int ipv6_get_msfilter(struct sock *sk, void __user *optval, - int __user *optlen, int len) +static int ipv6_get_msfilter(struct sock *sk, sockptr_t optval, + sockptr_t optlen, int len) { const int size0 = offsetof(struct group_filter, gf_slist_flex); - struct group_filter __user *p = optval; struct group_filter gsf; int num; int err; if (len < size0) return -EINVAL; - if (copy_from_user(&gsf, p, size0)) + if (copy_from_sockptr(&gsf, optval, size0)) return -EFAULT; if (gsf.gf_group.ss_family != AF_INET6) return -EADDRNOTAVAIL; num = gsf.gf_numsrc; - lock_sock(sk); - err = ip6_mc_msfget(sk, &gsf, p->gf_slist_flex); + sockopt_lock_sock(sk); + err = ip6_mc_msfget(sk, &gsf, optval, size0); if (!err) { if (num > gsf.gf_numsrc) num = gsf.gf_numsrc; - if (put_user(GROUP_FILTER_SIZE(num), optlen) || - copy_to_user(p, &gsf, size0)) + len = GROUP_FILTER_SIZE(num); + if (copy_to_sockptr(optlen, &len, sizeof(int)) || + copy_to_sockptr(optval, &gsf, size0)) err = -EFAULT; } - release_sock(sk); + sockopt_release_sock(sk); return err; } -static int compat_ipv6_get_msfilter(struct sock *sk, void __user *optval, - int __user *optlen) +static int compat_ipv6_get_msfilter(struct sock *sk, sockptr_t optval, + sockptr_t optlen, int len) { const int size0 = offsetof(struct compat_group_filter, gf_slist_flex); - struct compat_group_filter __user *p = optval; struct compat_group_filter gf32; struct group_filter gf; - int len, err; + int err; int num; - if (get_user(len, optlen)) - return -EFAULT; if (len < size0) return -EINVAL; - if (copy_from_user(&gf32, p, size0)) + if (copy_from_sockptr(&gf32, optval, size0)) return -EFAULT; gf.gf_interface = gf32.gf_interface; gf.gf_fmode = gf32.gf_fmode; @@ -1117,23 +1116,25 @@ static int compat_ipv6_get_msfilter(struct sock *sk, void __user *optval, if (gf.gf_group.ss_family != AF_INET6) return -EADDRNOTAVAIL; - lock_sock(sk); - err = ip6_mc_msfget(sk, &gf, p->gf_slist_flex); - release_sock(sk); + sockopt_lock_sock(sk); + err = ip6_mc_msfget(sk, &gf, optval, size0); + sockopt_release_sock(sk); if (err) return err; if (num > gf.gf_numsrc) num = gf.gf_numsrc; len = GROUP_FILTER_SIZE(num) - (sizeof(gf)-sizeof(gf32)); - if (put_user(len, optlen) || - put_user(gf.gf_fmode, &p->gf_fmode) || - put_user(gf.gf_numsrc, &p->gf_numsrc)) + if (copy_to_sockptr(optlen, &len, sizeof(int)) || + copy_to_sockptr_offset(optval, offsetof(struct compat_group_filter, gf_fmode), + &gf.gf_fmode, sizeof(gf32.gf_fmode)) || + copy_to_sockptr_offset(optval, offsetof(struct compat_group_filter, gf_numsrc), + &gf.gf_numsrc, sizeof(gf32.gf_numsrc))) return -EFAULT; return 0; } -static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, - char __user *optval, int __user *optlen, unsigned int flags) +int do_ipv6_getsockopt(struct sock *sk, int level, int optname, + sockptr_t optval, sockptr_t optlen) { struct ipv6_pinfo *np = inet6_sk(sk); int len; @@ -1142,7 +1143,7 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, if (ip6_mroute_opt(optname)) return ip6_mroute_getsockopt(sk, optname, optval, optlen); - if (get_user(len, optlen)) + if (copy_from_sockptr(&len, optlen, sizeof(int))) return -EFAULT; switch (optname) { case IPV6_ADDRFORM: @@ -1156,7 +1157,7 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, break; case MCAST_MSFILTER: if (in_compat_syscall()) - return compat_ipv6_get_msfilter(sk, optval, optlen); + return compat_ipv6_get_msfilter(sk, optval, optlen, len); return ipv6_get_msfilter(sk, optval, optlen, len); case IPV6_2292PKTOPTIONS: { @@ -1166,16 +1167,21 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, if (sk->sk_type != SOCK_STREAM) return -ENOPROTOOPT; - msg.msg_control_user = optval; + if (optval.is_kernel) { + msg.msg_control_is_user = false; + msg.msg_control = optval.kernel; + } else { + msg.msg_control_is_user = true; + msg.msg_control_user = optval.user; + } msg.msg_controllen = len; - msg.msg_flags = flags; - msg.msg_control_is_user = true; + msg.msg_flags = 0; - lock_sock(sk); + sockopt_lock_sock(sk); skb = np->pktoptions; if (skb) ip6_datagram_recv_ctl(sk, &msg, skb); - release_sock(sk); + sockopt_release_sock(sk); if (!skb) { if (np->rxopt.bits.rxinfo) { struct in6_pktinfo src_info; @@ -1212,7 +1218,7 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, } } len -= msg.msg_controllen; - return put_user(len, optlen); + return copy_to_sockptr(optlen, &len, sizeof(int)); } case IPV6_MTU: { @@ -1264,15 +1270,15 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, { struct ipv6_txoptions *opt; - lock_sock(sk); + sockopt_lock_sock(sk); opt = rcu_dereference_protected(np->opt, lockdep_sock_is_held(sk)); len = ipv6_getsockopt_sticky(sk, opt, optname, optval, len); - release_sock(sk); + sockopt_release_sock(sk); /* check if ipv6_getsockopt_sticky() returns err code */ if (len < 0) return len; - return put_user(len, optlen); + return copy_to_sockptr(optlen, &len, sizeof(int)); } case IPV6_RECVHOPOPTS: @@ -1326,9 +1332,9 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, if (!mtuinfo.ip6m_mtu) return -ENOTCONN; - if (put_user(len, optlen)) + if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; - if (copy_to_user(optval, &mtuinfo, len)) + if (copy_to_sockptr(optval, &mtuinfo, len)) return -EFAULT; return 0; @@ -1405,7 +1411,7 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, if (len < sizeof(freq)) return -EINVAL; - if (copy_from_user(&freq, optval, sizeof(freq))) + if (copy_from_sockptr(&freq, optval, sizeof(freq))) return -EFAULT; if (freq.flr_action != IPV6_FL_A_GET) @@ -1420,9 +1426,9 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, if (val < 0) return val; - if (put_user(len, optlen)) + if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; - if (copy_to_user(optval, &freq, len)) + if (copy_to_sockptr(optval, &freq, len)) return -EFAULT; return 0; @@ -1474,9 +1480,9 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname, return -ENOPROTOOPT; } len = min_t(unsigned int, sizeof(int), len); - if (put_user(len, optlen)) + if (copy_to_sockptr(optlen, &len, sizeof(int))) return -EFAULT; - if (copy_to_user(optval, &val, len)) + if (copy_to_sockptr(optval, &val, len)) return -EFAULT; return 0; } @@ -1492,7 +1498,8 @@ int ipv6_getsockopt(struct sock *sk, int level, int optname, if (level != SOL_IPV6) return -ENOPROTOOPT; - err = do_ipv6_getsockopt(sk, level, optname, optval, optlen, 0); + err = do_ipv6_getsockopt(sk, level, optname, + USER_SOCKPTR(optval), USER_SOCKPTR(optlen)); #ifdef CONFIG_NETFILTER /* we need to exclude all possible ENOPROTOOPTs except default case */ if (err == -ENOPROTOOPT && optname != IPV6_2292PKTOPTIONS) { diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c index 87c699d57b36..7860383295d8 100644 --- a/net/ipv6/mcast.c +++ b/net/ipv6/mcast.c @@ -580,7 +580,7 @@ done: } int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf, - struct sockaddr_storage __user *p) + sockptr_t optval, size_t ss_offset) { struct ipv6_pinfo *inet6 = inet6_sk(sk); const struct in6_addr *group; @@ -612,8 +612,7 @@ int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf, copycount = count < gsf->gf_numsrc ? count : gsf->gf_numsrc; gsf->gf_numsrc = count; - - for (i = 0; i < copycount; i++, p++) { + for (i = 0; i < copycount; i++) { struct sockaddr_in6 *psin6; struct sockaddr_storage ss; @@ -621,8 +620,9 @@ int ip6_mc_msfget(struct sock *sk, struct group_filter *gsf, memset(&ss, 0, sizeof(ss)); psin6->sin6_family = AF_INET6; psin6->sin6_addr = psl->sl_addr[i]; - if (copy_to_user(p, &ss, sizeof(ss))) + if (copy_to_sockptr_offset(optval, ss_offset, &ss, sizeof(ss))) return -EFAULT; + ss_offset += sizeof(ss); } return 0; } @@ -1050,7 +1050,7 @@ bool ipv6_chk_mcast_addr(struct net_device *dev, const struct in6_addr *group, /* called with mc_lock */ static void mld_gq_start_work(struct inet6_dev *idev) { - unsigned long tv = prandom_u32() % idev->mc_maxdelay; + unsigned long tv = prandom_u32_max(idev->mc_maxdelay); idev->mc_gq_running = 1; if (!mod_delayed_work(mld_wq, &idev->mc_gq_work, tv + 2)) @@ -1068,7 +1068,7 @@ static void mld_gq_stop_work(struct inet6_dev *idev) /* called with mc_lock */ static void mld_ifc_start_work(struct inet6_dev *idev, unsigned long delay) { - unsigned long tv = prandom_u32() % delay; + unsigned long tv = prandom_u32_max(delay); if (!mod_delayed_work(mld_wq, &idev->mc_ifc_work, tv + 2)) in6_dev_hold(idev); @@ -1085,7 +1085,7 @@ static void mld_ifc_stop_work(struct inet6_dev *idev) /* called with mc_lock */ static void mld_dad_start_work(struct inet6_dev *idev, unsigned long delay) { - unsigned long tv = prandom_u32() % delay; + unsigned long tv = prandom_u32_max(delay); if (!mod_delayed_work(mld_wq, &idev->mc_dad_work, tv + 2)) in6_dev_hold(idev); @@ -1130,7 +1130,7 @@ static void igmp6_group_queried(struct ifmcaddr6 *ma, unsigned long resptime) } if (delay >= resptime) - delay = prandom_u32() % resptime; + delay = prandom_u32_max(resptime); if (!mod_delayed_work(mld_wq, &ma->mca_work, delay)) refcount_inc(&ma->mca_refcnt); @@ -2574,7 +2574,7 @@ static void igmp6_join_group(struct ifmcaddr6 *ma) igmp6_send(&ma->mca_addr, ma->idev->dev, ICMPV6_MGM_REPORT); - delay = prandom_u32() % unsolicited_report_interval(ma->idev); + delay = prandom_u32_max(unsolicited_report_interval(ma->idev)); if (cancel_delayed_work(&ma->mca_work)) { refcount_dec(&ma->mca_refcnt); diff --git a/net/ipv6/mip6.c b/net/ipv6/mip6.c index aeb35d26e474..83d2a8be263f 100644 --- a/net/ipv6/mip6.c +++ b/net/ipv6/mip6.c @@ -247,15 +247,14 @@ static int mip6_destopt_reject(struct xfrm_state *x, struct sk_buff *skb, return err; } -static int mip6_destopt_init_state(struct xfrm_state *x) +static int mip6_destopt_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack) { if (x->id.spi) { - pr_info("%s: spi is not 0: %u\n", __func__, x->id.spi); + NL_SET_ERR_MSG(extack, "SPI must be 0"); return -EINVAL; } if (x->props.mode != XFRM_MODE_ROUTEOPTIMIZATION) { - pr_info("%s: state's mode is not %u: %u\n", - __func__, XFRM_MODE_ROUTEOPTIMIZATION, x->props.mode); + NL_SET_ERR_MSG(extack, "XFRM mode must be XFRM_MODE_ROUTEOPTIMIZATION"); return -EINVAL; } @@ -333,15 +332,14 @@ static int mip6_rthdr_output(struct xfrm_state *x, struct sk_buff *skb) return 0; } -static int mip6_rthdr_init_state(struct xfrm_state *x) +static int mip6_rthdr_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack) { if (x->id.spi) { - pr_info("%s: spi is not 0: %u\n", __func__, x->id.spi); + NL_SET_ERR_MSG(extack, "SPI must be 0"); return -EINVAL; } if (x->props.mode != XFRM_MODE_ROUTEOPTIMIZATION) { - pr_info("%s: state's mode is not %u: %u\n", - __func__, XFRM_MODE_ROUTEOPTIMIZATION, x->props.mode); + NL_SET_ERR_MSG(extack, "XFRM mode must be XFRM_MODE_ROUTEOPTIMIZATION"); return -EINVAL; } diff --git a/net/ipv6/netfilter/ip6t_rpfilter.c b/net/ipv6/netfilter/ip6t_rpfilter.c index d800801a5dd2..a01d9b842bd0 100644 --- a/net/ipv6/netfilter/ip6t_rpfilter.c +++ b/net/ipv6/netfilter/ip6t_rpfilter.c @@ -37,8 +37,10 @@ static bool rpfilter_lookup_reverse6(struct net *net, const struct sk_buff *skb, bool ret = false; struct flowi6 fl6 = { .flowi6_iif = LOOPBACK_IFINDEX, + .flowi6_l3mdev = l3mdev_master_ifindex_rcu(dev), .flowlabel = (* (__be32 *) iph) & IPV6_FLOWINFO_MASK, .flowi6_proto = iph->nexthdr, + .flowi6_uid = sock_net_uid(net, NULL), .daddr = iph->saddr, }; int lookup_flags; @@ -55,9 +57,7 @@ static bool rpfilter_lookup_reverse6(struct net *net, const struct sk_buff *skb, if (rpfilter_addr_linklocal(&iph->saddr)) { lookup_flags |= RT6_LOOKUP_F_IFACE; fl6.flowi6_oif = dev->ifindex; - /* Set flowi6_oif for vrf devices to lookup route in l3mdev domain. */ - } else if (netif_is_l3_master(dev) || netif_is_l3_slave(dev) || - (flags & XT_RPFILTER_LOOSE) == 0) + } else if ((flags & XT_RPFILTER_LOOSE) == 0) fl6.flowi6_oif = dev->ifindex; rt = (void *)ip6_route_lookup(net, &fl6, skb, lookup_flags); @@ -72,9 +72,7 @@ static bool rpfilter_lookup_reverse6(struct net *net, const struct sk_buff *skb, goto out; } - if (rt->rt6i_idev->dev == dev || - l3mdev_master_ifindex_rcu(rt->rt6i_idev->dev) == dev->ifindex || - (flags & XT_RPFILTER_LOOSE)) + if (rt->rt6i_idev->dev == dev || (flags & XT_RPFILTER_LOOSE)) ret = true; out: ip6_rt_put(rt); diff --git a/net/ipv6/netfilter/nf_socket_ipv6.c b/net/ipv6/netfilter/nf_socket_ipv6.c index aa5bb8789ba0..a7690ec62325 100644 --- a/net/ipv6/netfilter/nf_socket_ipv6.c +++ b/net/ipv6/netfilter/nf_socket_ipv6.c @@ -83,8 +83,8 @@ nf_socket_get_sock_v6(struct net *net, struct sk_buff *skb, int doff, { switch (protocol) { case IPPROTO_TCP: - return inet6_lookup(net, &tcp_hashinfo, skb, doff, - saddr, sport, daddr, dport, + return inet6_lookup(net, net->ipv4.tcp_death_row.hashinfo, + skb, doff, saddr, sport, daddr, dport, in->ifindex); case IPPROTO_UDP: return udp6_lib_lookup(net, saddr, sport, daddr, dport, diff --git a/net/ipv6/netfilter/nf_tproxy_ipv6.c b/net/ipv6/netfilter/nf_tproxy_ipv6.c index 6bac68fb27a3..929502e51203 100644 --- a/net/ipv6/netfilter/nf_tproxy_ipv6.c +++ b/net/ipv6/netfilter/nf_tproxy_ipv6.c @@ -80,6 +80,7 @@ nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, const struct net_device *in, const enum nf_tproxy_lookup_t lookup_type) { + struct inet_hashinfo *hinfo = net->ipv4.tcp_death_row.hashinfo; struct sock *sk; switch (protocol) { @@ -93,7 +94,7 @@ nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, switch (lookup_type) { case NF_TPROXY_LOOKUP_LISTENER: - sk = inet6_lookup_listener(net, &tcp_hashinfo, skb, + sk = inet6_lookup_listener(net, hinfo, skb, thoff + __tcp_hdrlen(hp), saddr, sport, daddr, ntohs(dport), @@ -108,9 +109,8 @@ nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, */ break; case NF_TPROXY_LOOKUP_ESTABLISHED: - sk = __inet6_lookup_established(net, &tcp_hashinfo, - saddr, sport, daddr, ntohs(dport), - in->ifindex, 0); + sk = __inet6_lookup_established(net, hinfo, saddr, sport, daddr, + ntohs(dport), in->ifindex, 0); break; default: BUG(); diff --git a/net/ipv6/netfilter/nft_fib_ipv6.c b/net/ipv6/netfilter/nft_fib_ipv6.c index 8970d0b4faeb..36dc14b34388 100644 --- a/net/ipv6/netfilter/nft_fib_ipv6.c +++ b/net/ipv6/netfilter/nft_fib_ipv6.c @@ -41,6 +41,8 @@ static int nft_fib6_flowi_init(struct flowi6 *fl6, const struct nft_fib *priv, if (ipv6_addr_type(&fl6->daddr) & IPV6_ADDR_LINKLOCAL) { lookup_flags |= RT6_LOOKUP_F_IFACE; fl6->flowi6_oif = get_ifindex(dev ? dev : pkt->skb->dev); + } else if (priv->flags & NFTA_FIB_F_IIF) { + fl6->flowi6_l3mdev = l3mdev_master_ifindex_rcu(dev); } if (ipv6_addr_type(&fl6->saddr) & IPV6_ADDR_UNICAST) @@ -64,6 +66,7 @@ static u32 __nft_fib6_eval_type(const struct nft_fib *priv, struct flowi6 fl6 = { .flowi6_iif = LOOPBACK_IFINDEX, .flowi6_proto = pkt->tprot, + .flowi6_uid = sock_net_uid(nft_net(pkt), NULL), }; u32 ret = 0; @@ -161,6 +164,7 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs, struct flowi6 fl6 = { .flowi6_iif = LOOPBACK_IFINDEX, .flowi6_proto = pkt->tprot, + .flowi6_uid = sock_net_uid(nft_net(pkt), NULL), }; struct rt6_info *rt; int lookup_flags; @@ -197,7 +201,8 @@ void nft_fib6_eval(const struct nft_expr *expr, struct nft_regs *regs, if (rt->rt6i_flags & (RTF_REJECT | RTF_ANYCAST | RTF_LOCAL)) goto put_rt_err; - if (oif && oif != rt->rt6i_idev->dev) + if (oif && oif != rt->rt6i_idev->dev && + l3mdev_master_ifindex_rcu(rt->rt6i_idev->dev) != oif->ifindex) goto put_rt_err; nft_fib_store_result(dest, priv, rt->rt6i_idev->dev); diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c index 2880dc7d9a49..2685c3f15e9d 100644 --- a/net/ipv6/output_core.c +++ b/net/ipv6/output_core.c @@ -18,7 +18,7 @@ static u32 __ipv6_select_ident(struct net *net, u32 id; do { - id = prandom_u32(); + id = get_random_u32(); } while (!id); return id; diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c index 91b840514656..808983bc2ec9 100644 --- a/net/ipv6/ping.c +++ b/net/ipv6/ping.c @@ -20,13 +20,9 @@ #include <net/udp.h> #include <net/transp_v6.h> #include <linux/proc_fs.h> +#include <linux/bpf-cgroup.h> #include <net/ping.h> -static void ping_v6_destroy(struct sock *sk) -{ - inet6_destroy_sock(sk); -} - /* Compatibility glue so we can support IPv6 when it's compiled as a module */ static int dummy_ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len) @@ -49,6 +45,20 @@ static int dummy_ipv6_chk_addr(struct net *net, const struct in6_addr *addr, return 0; } +static int ping_v6_pre_connect(struct sock *sk, struct sockaddr *uaddr, + int addr_len) +{ + /* This check is replicated from __ip6_datagram_connect() and + * intended to prevent BPF program called below from accessing + * bytes that are out of the bound specified by user in addr_len. + */ + + if (addr_len < SIN6_LEN_RFC2133) + return -EINVAL; + + return BPF_CGROUP_RUN_PROG_INET6_CONNECT_LOCK(sk, uaddr); +} + static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { struct inet_sock *inet = inet_sk(sk); @@ -164,7 +174,7 @@ static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) lock_sock(sk); err = ip6_append_data(sk, ping_getfrag, &pfh, len, - 0, &ipc6, &fl6, rt, + sizeof(struct icmp6hdr), &ipc6, &fl6, rt, MSG_DONTWAIT); if (err) { @@ -190,7 +200,7 @@ struct proto pingv6_prot = { .owner = THIS_MODULE, .init = ping_init_sock, .close = ping_close, - .destroy = ping_v6_destroy, + .pre_connect = ping_v6_pre_connect, .connect = ip6_datagram_connect_v6_only, .disconnect = __udp_disconnect, .setsockopt = ipv6_setsockopt, diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c index 722de9dd0ff7..a06a9f847db5 100644 --- a/net/ipv6/raw.c +++ b/net/ipv6/raw.c @@ -1173,8 +1173,6 @@ static void raw6_destroy(struct sock *sk) lock_sock(sk); ip6_flush_pending_frames(sk); release_sock(sk); - - inet6_destroy_sock(sk); } static int rawv6_init_sk(struct sock *sk) diff --git a/net/ipv6/seg6.c b/net/ipv6/seg6.c index 5421cc7c935f..29346a6eec9f 100644 --- a/net/ipv6/seg6.c +++ b/net/ipv6/seg6.c @@ -191,6 +191,11 @@ static int seg6_genl_sethmac(struct sk_buff *skb, struct genl_info *info) goto out_unlock; } + if (slen > nla_len(info->attrs[SEG6_ATTR_SECRET])) { + err = -EINVAL; + goto out_unlock; + } + if (hinfo) { err = seg6_hmac_info_del(net, hmackeyid); if (err) diff --git a/net/ipv6/seg6_local.c b/net/ipv6/seg6_local.c index b7de5e46fdd8..487f8e98deaa 100644 --- a/net/ipv6/seg6_local.c +++ b/net/ipv6/seg6_local.c @@ -73,6 +73,55 @@ struct bpf_lwt_prog { char *name; }; +/* default length values (expressed in bits) for both Locator-Block and + * Locator-Node Function. + * + * Both SEG6_LOCAL_LCBLOCK_DBITS and SEG6_LOCAL_LCNODE_FN_DBITS *must* be: + * i) greater than 0; + * ii) evenly divisible by 8. In other terms, the lengths of the + * Locator-Block and Locator-Node Function must be byte-aligned (we can + * relax this constraint in the future if really needed). + * + * Moreover, a third condition must hold: + * iii) SEG6_LOCAL_LCBLOCK_DBITS + SEG6_LOCAL_LCNODE_FN_DBITS <= 128. + * + * The correctness of SEG6_LOCAL_LCBLOCK_DBITS and SEG6_LOCAL_LCNODE_FN_DBITS + * values are checked during the kernel compilation. If the compilation stops, + * check the value of these parameters to see if they meet conditions (i), (ii) + * and (iii). + */ +#define SEG6_LOCAL_LCBLOCK_DBITS 32 +#define SEG6_LOCAL_LCNODE_FN_DBITS 16 + +/* The following next_csid_chk_{cntr,lcblock,lcblock_fn}_bits macros can be + * used directly to check whether the lengths (in bits) of Locator-Block and + * Locator-Node Function are valid according to (i), (ii), (iii). + */ +#define next_csid_chk_cntr_bits(blen, flen) \ + ((blen) + (flen) > 128) + +#define next_csid_chk_lcblock_bits(blen) \ +({ \ + typeof(blen) __tmp = blen; \ + (!__tmp || __tmp > 120 || (__tmp & 0x07)); \ +}) + +#define next_csid_chk_lcnode_fn_bits(flen) \ + next_csid_chk_lcblock_bits(flen) + +/* Supported Flavor operations are reported in this bitmask */ +#define SEG6_LOCAL_FLV_SUPP_OPS (BIT(SEG6_LOCAL_FLV_OP_NEXT_CSID)) + +struct seg6_flavors_info { + /* Flavor operations */ + __u32 flv_ops; + + /* Locator-Block length, expressed in bits */ + __u8 lcblock_bits; + /* Locator-Node Function length, expressed in bits*/ + __u8 lcnode_func_bits; +}; + enum seg6_end_dt_mode { DT_INVALID_MODE = -EINVAL, DT_LEGACY_MODE = 0, @@ -136,6 +185,8 @@ struct seg6_local_lwt { #ifdef CONFIG_NET_L3_MASTER_DEV struct seg6_end_dt_info dt_info; #endif + struct seg6_flavors_info flv_info; + struct pcpu_seg6_local_counters __percpu *pcpu_counters; int headroom; @@ -271,8 +322,50 @@ int seg6_lookup_nexthop(struct sk_buff *skb, return seg6_lookup_any_nexthop(skb, nhaddr, tbl_id, false); } -/* regular endpoint function */ -static int input_action_end(struct sk_buff *skb, struct seg6_local_lwt *slwt) +static __u8 seg6_flv_lcblock_octects(const struct seg6_flavors_info *finfo) +{ + return finfo->lcblock_bits >> 3; +} + +static __u8 seg6_flv_lcnode_func_octects(const struct seg6_flavors_info *finfo) +{ + return finfo->lcnode_func_bits >> 3; +} + +static bool seg6_next_csid_is_arg_zero(const struct in6_addr *addr, + const struct seg6_flavors_info *finfo) +{ + __u8 fnc_octects = seg6_flv_lcnode_func_octects(finfo); + __u8 blk_octects = seg6_flv_lcblock_octects(finfo); + __u8 arg_octects; + int i; + + arg_octects = 16 - blk_octects - fnc_octects; + for (i = 0; i < arg_octects; ++i) { + if (addr->s6_addr[blk_octects + fnc_octects + i] != 0x00) + return false; + } + + return true; +} + +/* assume that DA.Argument length > 0 */ +static void seg6_next_csid_advance_arg(struct in6_addr *addr, + const struct seg6_flavors_info *finfo) +{ + __u8 fnc_octects = seg6_flv_lcnode_func_octects(finfo); + __u8 blk_octects = seg6_flv_lcblock_octects(finfo); + + /* advance DA.Argument */ + memmove(&addr->s6_addr[blk_octects], + &addr->s6_addr[blk_octects + fnc_octects], + 16 - blk_octects - fnc_octects); + + memset(&addr->s6_addr[16 - fnc_octects], 0x00, fnc_octects); +} + +static int input_action_end_core(struct sk_buff *skb, + struct seg6_local_lwt *slwt) { struct ipv6_sr_hdr *srh; @@ -291,6 +384,38 @@ drop: return -EINVAL; } +static int end_next_csid_core(struct sk_buff *skb, struct seg6_local_lwt *slwt) +{ + const struct seg6_flavors_info *finfo = &slwt->flv_info; + struct in6_addr *daddr = &ipv6_hdr(skb)->daddr; + + if (seg6_next_csid_is_arg_zero(daddr, finfo)) + return input_action_end_core(skb, slwt); + + /* update DA */ + seg6_next_csid_advance_arg(daddr, finfo); + + seg6_lookup_nexthop(skb, NULL, 0); + + return dst_input(skb); +} + +static bool seg6_next_csid_enabled(__u32 fops) +{ + return fops & BIT(SEG6_LOCAL_FLV_OP_NEXT_CSID); +} + +/* regular endpoint function */ +static int input_action_end(struct sk_buff *skb, struct seg6_local_lwt *slwt) +{ + const struct seg6_flavors_info *finfo = &slwt->flv_info; + + if (seg6_next_csid_enabled(finfo->flv_ops)) + return end_next_csid_core(skb, slwt); + + return input_action_end_core(skb, slwt); +} + /* regular endpoint, and forward to specified nexthop */ static int input_action_end_x(struct sk_buff *skb, struct seg6_local_lwt *slwt) { @@ -951,7 +1076,8 @@ static struct seg6_action_desc seg6_action_table[] = { { .action = SEG6_LOCAL_ACTION_END, .attrs = 0, - .optattrs = SEG6_F_LOCAL_COUNTERS, + .optattrs = SEG6_F_LOCAL_COUNTERS | + SEG6_F_ATTR(SEG6_LOCAL_FLAVORS), .input = input_action_end, }, { @@ -1132,9 +1258,11 @@ static const struct nla_policy seg6_local_policy[SEG6_LOCAL_MAX + 1] = { [SEG6_LOCAL_OIF] = { .type = NLA_U32 }, [SEG6_LOCAL_BPF] = { .type = NLA_NESTED }, [SEG6_LOCAL_COUNTERS] = { .type = NLA_NESTED }, + [SEG6_LOCAL_FLAVORS] = { .type = NLA_NESTED }, }; -static int parse_nla_srh(struct nlattr **attrs, struct seg6_local_lwt *slwt) +static int parse_nla_srh(struct nlattr **attrs, struct seg6_local_lwt *slwt, + struct netlink_ext_ack *extack) { struct ipv6_sr_hdr *srh; int len; @@ -1191,7 +1319,8 @@ static void destroy_attr_srh(struct seg6_local_lwt *slwt) kfree(slwt->srh); } -static int parse_nla_table(struct nlattr **attrs, struct seg6_local_lwt *slwt) +static int parse_nla_table(struct nlattr **attrs, struct seg6_local_lwt *slwt, + struct netlink_ext_ack *extack) { slwt->table = nla_get_u32(attrs[SEG6_LOCAL_TABLE]); @@ -1225,7 +1354,8 @@ seg6_end_dt_info *seg6_possible_end_dt_info(struct seg6_local_lwt *slwt) } static int parse_nla_vrftable(struct nlattr **attrs, - struct seg6_local_lwt *slwt) + struct seg6_local_lwt *slwt, + struct netlink_ext_ack *extack) { struct seg6_end_dt_info *info = seg6_possible_end_dt_info(slwt); @@ -1261,7 +1391,8 @@ static int cmp_nla_vrftable(struct seg6_local_lwt *a, struct seg6_local_lwt *b) return 0; } -static int parse_nla_nh4(struct nlattr **attrs, struct seg6_local_lwt *slwt) +static int parse_nla_nh4(struct nlattr **attrs, struct seg6_local_lwt *slwt, + struct netlink_ext_ack *extack) { memcpy(&slwt->nh4, nla_data(attrs[SEG6_LOCAL_NH4]), sizeof(struct in_addr)); @@ -1287,7 +1418,8 @@ static int cmp_nla_nh4(struct seg6_local_lwt *a, struct seg6_local_lwt *b) return memcmp(&a->nh4, &b->nh4, sizeof(struct in_addr)); } -static int parse_nla_nh6(struct nlattr **attrs, struct seg6_local_lwt *slwt) +static int parse_nla_nh6(struct nlattr **attrs, struct seg6_local_lwt *slwt, + struct netlink_ext_ack *extack) { memcpy(&slwt->nh6, nla_data(attrs[SEG6_LOCAL_NH6]), sizeof(struct in6_addr)); @@ -1313,7 +1445,8 @@ static int cmp_nla_nh6(struct seg6_local_lwt *a, struct seg6_local_lwt *b) return memcmp(&a->nh6, &b->nh6, sizeof(struct in6_addr)); } -static int parse_nla_iif(struct nlattr **attrs, struct seg6_local_lwt *slwt) +static int parse_nla_iif(struct nlattr **attrs, struct seg6_local_lwt *slwt, + struct netlink_ext_ack *extack) { slwt->iif = nla_get_u32(attrs[SEG6_LOCAL_IIF]); @@ -1336,7 +1469,8 @@ static int cmp_nla_iif(struct seg6_local_lwt *a, struct seg6_local_lwt *b) return 0; } -static int parse_nla_oif(struct nlattr **attrs, struct seg6_local_lwt *slwt) +static int parse_nla_oif(struct nlattr **attrs, struct seg6_local_lwt *slwt, + struct netlink_ext_ack *extack) { slwt->oif = nla_get_u32(attrs[SEG6_LOCAL_OIF]); @@ -1366,7 +1500,8 @@ static const struct nla_policy bpf_prog_policy[SEG6_LOCAL_BPF_PROG_MAX + 1] = { .len = MAX_PROG_NAME }, }; -static int parse_nla_bpf(struct nlattr **attrs, struct seg6_local_lwt *slwt) +static int parse_nla_bpf(struct nlattr **attrs, struct seg6_local_lwt *slwt, + struct netlink_ext_ack *extack) { struct nlattr *tb[SEG6_LOCAL_BPF_PROG_MAX + 1]; struct bpf_prog *p; @@ -1444,7 +1579,8 @@ nla_policy seg6_local_counters_policy[SEG6_LOCAL_CNT_MAX + 1] = { }; static int parse_nla_counters(struct nlattr **attrs, - struct seg6_local_lwt *slwt) + struct seg6_local_lwt *slwt, + struct netlink_ext_ack *extack) { struct pcpu_seg6_local_counters __percpu *pcounters; struct nlattr *tb[SEG6_LOCAL_CNT_MAX + 1]; @@ -1508,13 +1644,13 @@ static int put_nla_counters(struct sk_buff *skb, struct seg6_local_lwt *slwt) pcounters = per_cpu_ptr(slwt->pcpu_counters, i); do { - start = u64_stats_fetch_begin_irq(&pcounters->syncp); + start = u64_stats_fetch_begin(&pcounters->syncp); packets = u64_stats_read(&pcounters->packets); bytes = u64_stats_read(&pcounters->bytes); errors = u64_stats_read(&pcounters->errors); - } while (u64_stats_fetch_retry_irq(&pcounters->syncp, start)); + } while (u64_stats_fetch_retry(&pcounters->syncp, start)); counters.packets += packets; counters.bytes += bytes; @@ -1542,8 +1678,195 @@ static void destroy_attr_counters(struct seg6_local_lwt *slwt) free_percpu(slwt->pcpu_counters); } +static const +struct nla_policy seg6_local_flavors_policy[SEG6_LOCAL_FLV_MAX + 1] = { + [SEG6_LOCAL_FLV_OPERATION] = { .type = NLA_U32 }, + [SEG6_LOCAL_FLV_LCBLOCK_BITS] = { .type = NLA_U8 }, + [SEG6_LOCAL_FLV_LCNODE_FN_BITS] = { .type = NLA_U8 }, +}; + +/* check whether the lengths of the Locator-Block and Locator-Node Function + * are compatible with the dimension of a C-SID container. + */ +static int seg6_chk_next_csid_cfg(__u8 block_len, __u8 func_len) +{ + /* Locator-Block and Locator-Node Function cannot exceed 128 bits + * (i.e. C-SID container lenghts). + */ + if (next_csid_chk_cntr_bits(block_len, func_len)) + return -EINVAL; + + /* Locator-Block length must be greater than zero and evenly divisible + * by 8. There must be room for a Locator-Node Function, at least. + */ + if (next_csid_chk_lcblock_bits(block_len)) + return -EINVAL; + + /* Locator-Node Function length must be greater than zero and evenly + * divisible by 8. There must be room for the Locator-Block. + */ + if (next_csid_chk_lcnode_fn_bits(func_len)) + return -EINVAL; + + return 0; +} + +static int seg6_parse_nla_next_csid_cfg(struct nlattr **tb, + struct seg6_flavors_info *finfo, + struct netlink_ext_ack *extack) +{ + __u8 func_len = SEG6_LOCAL_LCNODE_FN_DBITS; + __u8 block_len = SEG6_LOCAL_LCBLOCK_DBITS; + int rc; + + if (tb[SEG6_LOCAL_FLV_LCBLOCK_BITS]) + block_len = nla_get_u8(tb[SEG6_LOCAL_FLV_LCBLOCK_BITS]); + + if (tb[SEG6_LOCAL_FLV_LCNODE_FN_BITS]) + func_len = nla_get_u8(tb[SEG6_LOCAL_FLV_LCNODE_FN_BITS]); + + rc = seg6_chk_next_csid_cfg(block_len, func_len); + if (rc < 0) { + NL_SET_ERR_MSG(extack, + "Invalid Locator Block/Node Function lengths"); + return rc; + } + + finfo->lcblock_bits = block_len; + finfo->lcnode_func_bits = func_len; + + return 0; +} + +static int parse_nla_flavors(struct nlattr **attrs, struct seg6_local_lwt *slwt, + struct netlink_ext_ack *extack) +{ + struct seg6_flavors_info *finfo = &slwt->flv_info; + struct nlattr *tb[SEG6_LOCAL_FLV_MAX + 1]; + unsigned long fops; + int rc; + + rc = nla_parse_nested_deprecated(tb, SEG6_LOCAL_FLV_MAX, + attrs[SEG6_LOCAL_FLAVORS], + seg6_local_flavors_policy, NULL); + if (rc < 0) + return rc; + + /* this attribute MUST always be present since it represents the Flavor + * operation(s) to be carried out. + */ + if (!tb[SEG6_LOCAL_FLV_OPERATION]) + return -EINVAL; + + fops = nla_get_u32(tb[SEG6_LOCAL_FLV_OPERATION]); + if (fops & ~SEG6_LOCAL_FLV_SUPP_OPS) { + NL_SET_ERR_MSG(extack, "Unsupported Flavor operation(s)"); + return -EOPNOTSUPP; + } + + finfo->flv_ops = fops; + + if (seg6_next_csid_enabled(fops)) { + /* Locator-Block and Locator-Node Function lengths can be + * provided by the user space. Otherwise, default values are + * applied. + */ + rc = seg6_parse_nla_next_csid_cfg(tb, finfo, extack); + if (rc < 0) + return rc; + } + + return 0; +} + +static int seg6_fill_nla_next_csid_cfg(struct sk_buff *skb, + struct seg6_flavors_info *finfo) +{ + if (nla_put_u8(skb, SEG6_LOCAL_FLV_LCBLOCK_BITS, finfo->lcblock_bits)) + return -EMSGSIZE; + + if (nla_put_u8(skb, SEG6_LOCAL_FLV_LCNODE_FN_BITS, + finfo->lcnode_func_bits)) + return -EMSGSIZE; + + return 0; +} + +static int put_nla_flavors(struct sk_buff *skb, struct seg6_local_lwt *slwt) +{ + struct seg6_flavors_info *finfo = &slwt->flv_info; + __u32 fops = finfo->flv_ops; + struct nlattr *nest; + int rc; + + nest = nla_nest_start(skb, SEG6_LOCAL_FLAVORS); + if (!nest) + return -EMSGSIZE; + + if (nla_put_u32(skb, SEG6_LOCAL_FLV_OPERATION, fops)) { + rc = -EMSGSIZE; + goto err; + } + + if (seg6_next_csid_enabled(fops)) { + rc = seg6_fill_nla_next_csid_cfg(skb, finfo); + if (rc < 0) + goto err; + } + + return nla_nest_end(skb, nest); + +err: + nla_nest_cancel(skb, nest); + return rc; +} + +static int seg6_cmp_nla_next_csid_cfg(struct seg6_flavors_info *finfo_a, + struct seg6_flavors_info *finfo_b) +{ + if (finfo_a->lcblock_bits != finfo_b->lcblock_bits) + return 1; + + if (finfo_a->lcnode_func_bits != finfo_b->lcnode_func_bits) + return 1; + + return 0; +} + +static int cmp_nla_flavors(struct seg6_local_lwt *a, struct seg6_local_lwt *b) +{ + struct seg6_flavors_info *finfo_a = &a->flv_info; + struct seg6_flavors_info *finfo_b = &b->flv_info; + + if (finfo_a->flv_ops != finfo_b->flv_ops) + return 1; + + if (seg6_next_csid_enabled(finfo_a->flv_ops)) { + if (seg6_cmp_nla_next_csid_cfg(finfo_a, finfo_b)) + return 1; + } + + return 0; +} + +static int encap_size_flavors(struct seg6_local_lwt *slwt) +{ + struct seg6_flavors_info *finfo = &slwt->flv_info; + int nlsize; + + nlsize = nla_total_size(0) + /* nest SEG6_LOCAL_FLAVORS */ + nla_total_size(4); /* SEG6_LOCAL_FLV_OPERATION */ + + if (seg6_next_csid_enabled(finfo->flv_ops)) + nlsize += nla_total_size(1) + /* SEG6_LOCAL_FLV_LCBLOCK_BITS */ + nla_total_size(1); /* SEG6_LOCAL_FLV_LCNODE_FN_BITS */ + + return nlsize; +} + struct seg6_action_param { - int (*parse)(struct nlattr **attrs, struct seg6_local_lwt *slwt); + int (*parse)(struct nlattr **attrs, struct seg6_local_lwt *slwt, + struct netlink_ext_ack *extack); int (*put)(struct sk_buff *skb, struct seg6_local_lwt *slwt); int (*cmp)(struct seg6_local_lwt *a, struct seg6_local_lwt *b); @@ -1593,6 +1916,10 @@ static struct seg6_action_param seg6_action_params[SEG6_LOCAL_MAX + 1] = { .put = put_nla_counters, .cmp = cmp_nla_counters, .destroy = destroy_attr_counters }, + + [SEG6_LOCAL_FLAVORS] = { .parse = parse_nla_flavors, + .put = put_nla_flavors, + .cmp = cmp_nla_flavors }, }; /* call the destroy() callback (if available) for each set attribute in @@ -1636,7 +1963,8 @@ static void destroy_attrs(struct seg6_local_lwt *slwt) } static int parse_nla_optional_attrs(struct nlattr **attrs, - struct seg6_local_lwt *slwt) + struct seg6_local_lwt *slwt, + struct netlink_ext_ack *extack) { struct seg6_action_desc *desc = slwt->desc; unsigned long parsed_optattrs = 0; @@ -1652,7 +1980,7 @@ static int parse_nla_optional_attrs(struct nlattr **attrs, */ param = &seg6_action_params[i]; - err = param->parse(attrs, slwt); + err = param->parse(attrs, slwt, extack); if (err < 0) goto parse_optattrs_err; @@ -1705,7 +2033,8 @@ static void seg6_local_lwtunnel_destroy_state(struct seg6_local_lwt *slwt) ops->destroy_state(slwt); } -static int parse_nla_action(struct nlattr **attrs, struct seg6_local_lwt *slwt) +static int parse_nla_action(struct nlattr **attrs, struct seg6_local_lwt *slwt, + struct netlink_ext_ack *extack) { struct seg6_action_param *param; struct seg6_action_desc *desc; @@ -1749,14 +2078,14 @@ static int parse_nla_action(struct nlattr **attrs, struct seg6_local_lwt *slwt) param = &seg6_action_params[i]; - err = param->parse(attrs, slwt); + err = param->parse(attrs, slwt, extack); if (err < 0) goto parse_attrs_err; } } /* parse the optional attributes, if any */ - err = parse_nla_optional_attrs(attrs, slwt); + err = parse_nla_optional_attrs(attrs, slwt, extack); if (err < 0) goto parse_attrs_err; @@ -1800,7 +2129,7 @@ static int seg6_local_build_state(struct net *net, struct nlattr *nla, slwt = seg6_local_lwtunnel(newts); slwt->action = nla_get_u32(tb[SEG6_LOCAL_ACTION]); - err = parse_nla_action(tb, slwt); + err = parse_nla_action(tb, slwt, extack); if (err < 0) goto out_free; @@ -1904,6 +2233,9 @@ static int seg6_local_get_encap_size(struct lwtunnel_state *lwt) /* SEG6_LOCAL_CNT_ERRORS */ nla_total_size_64bit(sizeof(__u64)); + if (attrs & SEG6_F_ATTR(SEG6_LOCAL_FLAVORS)) + nlsize += encap_size_flavors(slwt); + return nlsize; } @@ -1959,6 +2291,15 @@ int __init seg6_local_init(void) */ BUILD_BUG_ON(SEG6_LOCAL_MAX + 1 > BITS_PER_TYPE(unsigned long)); + /* If the default NEXT-C-SID Locator-Block/Node Function lengths (in + * bits) have been changed with invalid values, kernel build stops + * here. + */ + BUILD_BUG_ON(next_csid_chk_cntr_bits(SEG6_LOCAL_LCBLOCK_DBITS, + SEG6_LOCAL_LCNODE_FN_DBITS)); + BUILD_BUG_ON(next_csid_chk_lcblock_bits(SEG6_LOCAL_LCBLOCK_DBITS)); + BUILD_BUG_ON(next_csid_chk_lcnode_fn_bits(SEG6_LOCAL_LCNODE_FN_DBITS)); + return lwtunnel_encap_add_ops(&seg6_local_ops, LWTUNNEL_ENCAP_SEG6_LOCAL); } diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c index 98f1cf40746f..5703d3cbea9b 100644 --- a/net/ipv6/sit.c +++ b/net/ipv6/sit.c @@ -1124,10 +1124,12 @@ static void ipip6_tunnel_bind_dev(struct net_device *dev) if (tdev && !netif_is_l3_master(tdev)) { int t_hlen = tunnel->hlen + sizeof(struct iphdr); + int mtu; - dev->mtu = tdev->mtu - t_hlen; - if (dev->mtu < IPV6_MIN_MTU) - dev->mtu = IPV6_MIN_MTU; + mtu = tdev->mtu - t_hlen; + if (mtu < IPV6_MIN_MTU) + mtu = IPV6_MIN_MTU; + WRITE_ONCE(dev->mtu, mtu); } } @@ -1503,71 +1505,12 @@ static void ipip6_netlink_parms(struct nlattr *data[], if (!data) return; - if (data[IFLA_IPTUN_LINK]) - parms->link = nla_get_u32(data[IFLA_IPTUN_LINK]); - - if (data[IFLA_IPTUN_LOCAL]) - parms->iph.saddr = nla_get_be32(data[IFLA_IPTUN_LOCAL]); - - if (data[IFLA_IPTUN_REMOTE]) - parms->iph.daddr = nla_get_be32(data[IFLA_IPTUN_REMOTE]); - - if (data[IFLA_IPTUN_TTL]) { - parms->iph.ttl = nla_get_u8(data[IFLA_IPTUN_TTL]); - if (parms->iph.ttl) - parms->iph.frag_off = htons(IP_DF); - } - - if (data[IFLA_IPTUN_TOS]) - parms->iph.tos = nla_get_u8(data[IFLA_IPTUN_TOS]); - - if (!data[IFLA_IPTUN_PMTUDISC] || nla_get_u8(data[IFLA_IPTUN_PMTUDISC])) - parms->iph.frag_off = htons(IP_DF); - - if (data[IFLA_IPTUN_FLAGS]) - parms->i_flags = nla_get_be16(data[IFLA_IPTUN_FLAGS]); - - if (data[IFLA_IPTUN_PROTO]) - parms->iph.protocol = nla_get_u8(data[IFLA_IPTUN_PROTO]); + ip_tunnel_netlink_parms(data, parms); if (data[IFLA_IPTUN_FWMARK]) *fwmark = nla_get_u32(data[IFLA_IPTUN_FWMARK]); } -/* This function returns true when ENCAP attributes are present in the nl msg */ -static bool ipip6_netlink_encap_parms(struct nlattr *data[], - struct ip_tunnel_encap *ipencap) -{ - bool ret = false; - - memset(ipencap, 0, sizeof(*ipencap)); - - if (!data) - return ret; - - if (data[IFLA_IPTUN_ENCAP_TYPE]) { - ret = true; - ipencap->type = nla_get_u16(data[IFLA_IPTUN_ENCAP_TYPE]); - } - - if (data[IFLA_IPTUN_ENCAP_FLAGS]) { - ret = true; - ipencap->flags = nla_get_u16(data[IFLA_IPTUN_ENCAP_FLAGS]); - } - - if (data[IFLA_IPTUN_ENCAP_SPORT]) { - ret = true; - ipencap->sport = nla_get_be16(data[IFLA_IPTUN_ENCAP_SPORT]); - } - - if (data[IFLA_IPTUN_ENCAP_DPORT]) { - ret = true; - ipencap->dport = nla_get_be16(data[IFLA_IPTUN_ENCAP_DPORT]); - } - - return ret; -} - #ifdef CONFIG_IPV6_SIT_6RD /* This function returns true when 6RD attributes are present in the nl msg */ static bool ipip6_netlink_6rd_parms(struct nlattr *data[], @@ -1619,7 +1562,7 @@ static int ipip6_newlink(struct net *src_net, struct net_device *dev, nt = netdev_priv(dev); - if (ipip6_netlink_encap_parms(data, &ipencap)) { + if (ip_tunnel_netlink_encap_parms(data, &ipencap)) { err = ip_tunnel_encap_setup(nt, &ipencap); if (err < 0) return err; @@ -1671,7 +1614,7 @@ static int ipip6_changelink(struct net_device *dev, struct nlattr *tb[], if (dev == sitn->fb_tunnel_dev) return -EINVAL; - if (ipip6_netlink_encap_parms(data, &ipencap)) { + if (ip_tunnel_netlink_encap_parms(data, &ipencap)) { err = ip_tunnel_encap_setup(t, &ipencap); if (err < 0) return err; diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c index 35013497e407..f676be14e6b6 100644 --- a/net/ipv6/tcp_ipv6.c +++ b/net/ipv6/tcp_ipv6.c @@ -146,15 +146,16 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) { struct sockaddr_in6 *usin = (struct sockaddr_in6 *) uaddr; - struct inet_sock *inet = inet_sk(sk); struct inet_connection_sock *icsk = inet_csk(sk); + struct in6_addr *saddr = NULL, *final_p, final; struct inet_timewait_death_row *tcp_death_row; struct ipv6_pinfo *np = tcp_inet6_sk(sk); + struct inet_sock *inet = inet_sk(sk); struct tcp_sock *tp = tcp_sk(sk); - struct in6_addr *saddr = NULL, *final_p, final; + struct net *net = sock_net(sk); struct ipv6_txoptions *opt; - struct flowi6 fl6; struct dst_entry *dst; + struct flowi6 fl6; int addr_type; int err; @@ -237,7 +238,8 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, sin.sin_port = usin->sin6_port; sin.sin_addr.s_addr = usin->sin6_addr.s6_addr32[3]; - icsk->icsk_af_ops = &ipv6_mapped; + /* Paired with READ_ONCE() in tcp_(get|set)sockopt() */ + WRITE_ONCE(icsk->icsk_af_ops, &ipv6_mapped); if (sk_is_mptcp(sk)) mptcpv6_handle_mapped(sk, true); sk->sk_backlog_rcv = tcp_v4_do_rcv; @@ -249,7 +251,8 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, if (err) { icsk->icsk_ext_hdr_len = exthdrlen; - icsk->icsk_af_ops = &ipv6_specific; + /* Paired with READ_ONCE() in tcp_(get|set)sockopt() */ + WRITE_ONCE(icsk->icsk_af_ops, &ipv6_specific); if (sk_is_mptcp(sk)) mptcpv6_handle_mapped(sk, false); sk->sk_backlog_rcv = tcp_v6_do_rcv; @@ -280,20 +283,21 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, security_sk_classify_flow(sk, flowi6_to_flowi_common(&fl6)); - dst = ip6_dst_lookup_flow(sock_net(sk), sk, &fl6, final_p); + dst = ip6_dst_lookup_flow(net, sk, &fl6, final_p); if (IS_ERR(dst)) { err = PTR_ERR(dst); goto failure; } + tcp_death_row = &sock_net(sk)->ipv4.tcp_death_row; + if (!saddr) { struct inet_bind_hashbucket *prev_addr_hashbucket = NULL; struct in6_addr prev_v6_rcv_saddr; if (icsk->icsk_bind2_hash) { - prev_addr_hashbucket = inet_bhashfn_portaddr(&tcp_hashinfo, - sk, sock_net(sk), - inet->inet_num); + prev_addr_hashbucket = inet_bhashfn_portaddr(tcp_death_row->hashinfo, + sk, net, inet->inet_num); prev_v6_rcv_saddr = sk->sk_v6_rcv_saddr; } saddr = &fl6.saddr; @@ -325,7 +329,6 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, inet->inet_dport = usin->sin6_port; tcp_set_state(sk, TCP_SYN_SENT); - tcp_death_row = sock_net(sk)->ipv4.tcp_death_row; err = inet6_hash_connect(tcp_death_row, sk); if (err) goto late_failure; @@ -339,8 +342,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr, sk->sk_v6_daddr.s6_addr32, inet->inet_sport, inet->inet_dport)); - tp->tsoffset = secure_tcpv6_ts_off(sock_net(sk), - np->saddr.s6_addr32, + tp->tsoffset = secure_tcpv6_ts_off(net, np->saddr.s6_addr32, sk->sk_v6_daddr.s6_addr32); } @@ -403,7 +405,7 @@ static int tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt, bool fatal; int err; - sk = __inet6_lookup_established(net, &tcp_hashinfo, + sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, &hdr->daddr, th->dest, &hdr->saddr, ntohs(th->source), skb->dev->ifindex, inet6_sdif(skb)); @@ -1001,6 +1003,7 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) __be32 label = 0; u32 priority = 0; struct net *net; + u32 txhash = 0; int oif = 0; if (th->rst) @@ -1036,11 +1039,10 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) * Incoming packet is checked with md5 hash with finding key, * no RST generated if md5 hash doesn't match. */ - sk1 = inet6_lookup_listener(net, - &tcp_hashinfo, NULL, 0, - &ipv6h->saddr, - th->source, &ipv6h->daddr, - ntohs(th->source), dif, sdif); + sk1 = inet6_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo, + NULL, 0, &ipv6h->saddr, th->source, + &ipv6h->daddr, ntohs(th->source), + dif, sdif); if (!sk1) goto out; @@ -1074,10 +1076,12 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) if (np->repflow) label = ip6_flowlabel(ipv6h); priority = sk->sk_priority; + txhash = sk->sk_hash; } if (sk->sk_state == TCP_TIME_WAIT) { label = cpu_to_be32(inet_twsk(sk)->tw_flowlabel); priority = inet_twsk(sk)->tw_priority; + txhash = inet_twsk(sk)->tw_txhash; } } else { if (net->ipv6.sysctl.flowlabel_reflect & FLOWLABEL_REFLECT_TCP_RESET) @@ -1085,7 +1089,7 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb) } tcp_v6_send_response(sk, skb, seq, ack_seq, 0, 0, 0, oif, key, 1, - ipv6_get_dsfield(ipv6h), label, priority, 0); + ipv6_get_dsfield(ipv6h), label, priority, txhash); #ifdef CONFIG_TCP_MD5SIG out: @@ -1638,7 +1642,7 @@ INDIRECT_CALLABLE_SCOPE int tcp_v6_rcv(struct sk_buff *skb) hdr = ipv6_hdr(skb); lookup: - sk = __inet6_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), + sk = __inet6_lookup_skb(net->ipv4.tcp_death_row.hashinfo, skb, __tcp_hdrlen(th), th->source, th->dest, inet6_iif(skb), sdif, &refcounted); if (!sk) @@ -1813,7 +1817,7 @@ do_time_wait: { struct sock *sk2; - sk2 = inet6_lookup_listener(dev_net(skb->dev), &tcp_hashinfo, + sk2 = inet6_lookup_listener(net, net->ipv4.tcp_death_row.hashinfo, skb, __tcp_hdrlen(th), &ipv6_hdr(skb)->saddr, th->source, &ipv6_hdr(skb)->daddr, @@ -1846,6 +1850,7 @@ do_time_wait: void tcp_v6_early_demux(struct sk_buff *skb) { + struct net *net = dev_net(skb->dev); const struct ipv6hdr *hdr; const struct tcphdr *th; struct sock *sk; @@ -1863,7 +1868,7 @@ void tcp_v6_early_demux(struct sk_buff *skb) return; /* Note : We use inet6_iif() here, not tcp_v6_iif() */ - sk = __inet6_lookup_established(dev_net(skb->dev), &tcp_hashinfo, + sk = __inet6_lookup_established(net, net->ipv4.tcp_death_row.hashinfo, &hdr->saddr, th->source, &hdr->daddr, ntohs(th->dest), inet6_iif(skb), inet6_sdif(skb)); @@ -1961,12 +1966,6 @@ static int tcp_v6_init_sock(struct sock *sk) return 0; } -static void tcp_v6_destroy_sock(struct sock *sk) -{ - tcp_v4_destroy_sock(sk); - inet6_destroy_sock(sk); -} - #ifdef CONFIG_PROC_FS /* Proc filesystem TCPv6 sock list dumping. */ static void get_openreq6(struct seq_file *seq, @@ -2159,7 +2158,7 @@ struct proto tcpv6_prot = { .accept = inet_csk_accept, .ioctl = tcp_ioctl, .init = tcp_v6_init_sock, - .destroy = tcp_v6_destroy_sock, + .destroy = tcp_v4_destroy_sock, .shutdown = tcp_shutdown, .setsockopt = tcp_setsockopt, .getsockopt = tcp_getsockopt, @@ -2195,7 +2194,7 @@ struct proto tcpv6_prot = { .slab_flags = SLAB_TYPESAFE_BY_RCU, .twsk_prot = &tcp6_timewait_sock_ops, .rsk_prot = &tcp6_request_sock_ops, - .h.hashinfo = &tcp_hashinfo, + .h.hashinfo = NULL, .no_autobind = true, .diag_destroy = tcp_abort, }; @@ -2229,7 +2228,7 @@ static void __net_exit tcpv6_net_exit(struct net *net) static void __net_exit tcpv6_net_exit_batch(struct list_head *net_exit_list) { - inet_twsk_purge(&tcp_hashinfo, AF_INET6); + tcp_twsk_purge(net_exit_list, AF_INET6); } static struct pernet_operations tcpv6_net_ops = { diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c index 16c176e7c69a..297f7cc06044 100644 --- a/net/ipv6/udp.c +++ b/net/ipv6/udp.c @@ -56,6 +56,19 @@ #include <trace/events/skb.h> #include "udp_impl.h" +static void udpv6_destruct_sock(struct sock *sk) +{ + udp_destruct_common(sk); + inet6_sock_destruct(sk); +} + +int udpv6_init_sock(struct sock *sk) +{ + udp_lib_init_sock(sk); + sk->sk_destruct = udpv6_destruct_sock; + return 0; +} + static u32 udp6_ehashfn(const struct net *net, const struct in6_addr *laddr, const u16 lport, @@ -182,7 +195,7 @@ static struct sock *udp6_lib_lookup2(struct net *net, result = lookup_reuseport(net, sk, skb, saddr, sport, daddr, hnum); /* Fall back to scoring if group has connections */ - if (result && !reuseport_has_conns(sk, false)) + if (result && !reuseport_has_conns(sk)) return result; result = result ? : sk; @@ -616,8 +629,11 @@ int __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt, } /* Tunnels don't have an application socket: don't pass errors back */ - if (tunnel) + if (tunnel) { + if (udp_sk(sk)->encap_err_rcv) + udp_sk(sk)->encap_err_rcv(sk, skb, offset); goto out; + } if (!np->recverr) { if (!harderr || sk->sk_state != TCP_ESTABLISHED) @@ -647,16 +663,20 @@ static int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb) rc = __udp_enqueue_schedule_skb(sk, skb); if (rc < 0) { int is_udplite = IS_UDPLITE(sk); + enum skb_drop_reason drop_reason; /* Note that an ENOMEM error is charged twice */ - if (rc == -ENOMEM) + if (rc == -ENOMEM) { UDP6_INC_STATS(sock_net(sk), UDP_MIB_RCVBUFERRORS, is_udplite); - else + drop_reason = SKB_DROP_REASON_SOCKET_RCVBUFF; + } else { UDP6_INC_STATS(sock_net(sk), UDP_MIB_MEMERRORS, is_udplite); + drop_reason = SKB_DROP_REASON_PROTO_MEM; + } UDP6_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); - kfree_skb(skb); + kfree_skb_reason(skb, drop_reason); return -1; } @@ -672,11 +692,14 @@ static __inline__ int udpv6_err(struct sk_buff *skb, static int udpv6_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb) { + enum skb_drop_reason drop_reason = SKB_DROP_REASON_NOT_SPECIFIED; struct udp_sock *up = udp_sk(sk); int is_udplite = IS_UDPLITE(sk); - if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) + if (!xfrm6_policy_check(sk, XFRM_POLICY_IN, skb)) { + drop_reason = SKB_DROP_REASON_XFRM_POLICY; goto drop; + } if (static_branch_unlikely(&udpv6_encap_needed_key) && up->encap_type) { int (*encap_rcv)(struct sock *sk, struct sk_buff *skb); @@ -735,8 +758,10 @@ static int udpv6_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb) udp_lib_checksum_complete(skb)) goto csum_error; - if (sk_filter_trim_cap(sk, skb, sizeof(struct udphdr))) + if (sk_filter_trim_cap(sk, skb, sizeof(struct udphdr))) { + drop_reason = SKB_DROP_REASON_SOCKET_FILTER; goto drop; + } udp_csum_pull_header(skb); @@ -745,11 +770,12 @@ static int udpv6_queue_rcv_one_skb(struct sock *sk, struct sk_buff *skb) return __udpv6_queue_rcv_skb(sk, skb); csum_error: + drop_reason = SKB_DROP_REASON_UDP_CSUM; __UDP6_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite); drop: __UDP6_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite); atomic_inc(&sk->sk_drops); - kfree_skb(skb); + kfree_skb_reason(skb, drop_reason); return -1; } @@ -1635,8 +1661,6 @@ void udpv6_destroy_sock(struct sock *sk) udp_encap_disable(); } } - - inet6_destroy_sock(sk); } /* @@ -1645,7 +1669,7 @@ void udpv6_destroy_sock(struct sock *sk) int udpv6_setsockopt(struct sock *sk, int level, int optname, sockptr_t optval, unsigned int optlen) { - if (level == SOL_UDP || level == SOL_UDPLITE) + if (level == SOL_UDP || level == SOL_UDPLITE || level == SOL_SOCKET) return udp_lib_setsockopt(sk, level, optname, optval, optlen, udp_v6_push_pending_frames); @@ -1720,7 +1744,7 @@ struct proto udpv6_prot = { .connect = ip6_datagram_connect, .disconnect = udp_disconnect, .ioctl = udp_ioctl, - .init = udp_init_sock, + .init = udpv6_init_sock, .destroy = udpv6_destroy_sock, .setsockopt = udpv6_setsockopt, .getsockopt = udpv6_getsockopt, diff --git a/net/ipv6/udp_impl.h b/net/ipv6/udp_impl.h index 4251e49d32a0..0590f566379d 100644 --- a/net/ipv6/udp_impl.h +++ b/net/ipv6/udp_impl.h @@ -12,6 +12,7 @@ int __udp6_lib_rcv(struct sk_buff *, struct udp_table *, int); int __udp6_lib_err(struct sk_buff *, struct inet6_skb_parm *, u8, u8, int, __be32, struct udp_table *); +int udpv6_init_sock(struct sock *sk); int udp_v6_get_port(struct sock *sk, unsigned short snum); void udp_v6_rehash(struct sock *sk); diff --git a/net/ipv6/udplite.c b/net/ipv6/udplite.c index b70725856259..67eaf3ca14ce 100644 --- a/net/ipv6/udplite.c +++ b/net/ipv6/udplite.c @@ -12,6 +12,13 @@ #include <linux/proc_fs.h> #include "udp_impl.h" +static int udplitev6_sk_init(struct sock *sk) +{ + udpv6_init_sock(sk); + udp_sk(sk)->pcflag = UDPLITE_BIT; + return 0; +} + static int udplitev6_rcv(struct sk_buff *skb) { return __udp6_lib_rcv(skb, &udplite_table, IPPROTO_UDPLITE); @@ -38,7 +45,7 @@ struct proto udplitev6_prot = { .connect = ip6_datagram_connect, .disconnect = udp_disconnect, .ioctl = udp_ioctl, - .init = udplite_sk_init, + .init = udplitev6_sk_init, .destroy = udpv6_destroy_sock, .setsockopt = udpv6_setsockopt, .getsockopt = udpv6_getsockopt, diff --git a/net/ipv6/xfrm6_tunnel.c b/net/ipv6/xfrm6_tunnel.c index 2b31112c0856..1323f2f6928e 100644 --- a/net/ipv6/xfrm6_tunnel.c +++ b/net/ipv6/xfrm6_tunnel.c @@ -270,13 +270,17 @@ static int xfrm6_tunnel_err(struct sk_buff *skb, struct inet6_skb_parm *opt, return 0; } -static int xfrm6_tunnel_init_state(struct xfrm_state *x) +static int xfrm6_tunnel_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack) { - if (x->props.mode != XFRM_MODE_TUNNEL) + if (x->props.mode != XFRM_MODE_TUNNEL) { + NL_SET_ERR_MSG(extack, "IPv6 tunnel can only be used with tunnel mode"); return -EINVAL; + } - if (x->encap) + if (x->encap) { + NL_SET_ERR_MSG(extack, "IPv6 tunnel is not compatible with encapsulation"); return -EINVAL; + } x->props.header_len = sizeof(struct ipv6hdr); diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c index 1215c863e1c4..a5004228111d 100644 --- a/net/kcm/kcmsock.c +++ b/net/kcm/kcmsock.c @@ -162,7 +162,8 @@ static void kcm_rcv_ready(struct kcm_sock *kcm) /* Buffer limit is okay now, add to ready list */ list_add_tail(&kcm->wait_rx_list, &kcm->mux->kcm_rx_waiters); - kcm->rx_wait = true; + /* paired with lockless reads in kcm_rfree() */ + WRITE_ONCE(kcm->rx_wait, true); } static void kcm_rfree(struct sk_buff *skb) @@ -178,7 +179,7 @@ static void kcm_rfree(struct sk_buff *skb) /* For reading rx_wait and rx_psock without holding lock */ smp_mb__after_atomic(); - if (!kcm->rx_wait && !kcm->rx_psock && + if (!READ_ONCE(kcm->rx_wait) && !READ_ONCE(kcm->rx_psock) && sk_rmem_alloc_get(sk) < sk->sk_rcvlowat) { spin_lock_bh(&mux->rx_lock); kcm_rcv_ready(kcm); @@ -237,7 +238,8 @@ try_again: if (kcm_queue_rcv_skb(&kcm->sk, skb)) { /* Should mean socket buffer full */ list_del(&kcm->wait_rx_list); - kcm->rx_wait = false; + /* paired with lockless reads in kcm_rfree() */ + WRITE_ONCE(kcm->rx_wait, false); /* Commit rx_wait to read in kcm_free */ smp_wmb(); @@ -280,10 +282,12 @@ static struct kcm_sock *reserve_rx_kcm(struct kcm_psock *psock, kcm = list_first_entry(&mux->kcm_rx_waiters, struct kcm_sock, wait_rx_list); list_del(&kcm->wait_rx_list); - kcm->rx_wait = false; + /* paired with lockless reads in kcm_rfree() */ + WRITE_ONCE(kcm->rx_wait, false); psock->rx_kcm = kcm; - kcm->rx_psock = psock; + /* paired with lockless reads in kcm_rfree() */ + WRITE_ONCE(kcm->rx_psock, psock); spin_unlock_bh(&mux->rx_lock); @@ -310,7 +314,8 @@ static void unreserve_rx_kcm(struct kcm_psock *psock, spin_lock_bh(&mux->rx_lock); psock->rx_kcm = NULL; - kcm->rx_psock = NULL; + /* paired with lockless reads in kcm_rfree() */ + WRITE_ONCE(kcm->rx_psock, NULL); /* Commit kcm->rx_psock before sk_rmem_alloc_get to sync with * kcm_rfree @@ -834,7 +839,7 @@ static ssize_t kcm_sendpage(struct socket *sock, struct page *page, } get_page(page); - skb_fill_page_desc(skb, i, page, offset, size); + skb_fill_page_desc_noacc(skb, i, page, offset, size); skb_shinfo(skb)->flags |= SKBFL_SHARED_FRAG; coalesced: @@ -1240,7 +1245,8 @@ static void kcm_recv_disable(struct kcm_sock *kcm) if (!kcm->rx_psock) { if (kcm->rx_wait) { list_del(&kcm->wait_rx_list); - kcm->rx_wait = false; + /* paired with lockless reads in kcm_rfree() */ + WRITE_ONCE(kcm->rx_wait, false); } requeue_rx_msgs(mux, &kcm->sk.sk_receive_queue); @@ -1793,7 +1799,8 @@ static void kcm_done(struct kcm_sock *kcm) if (kcm->rx_wait) { list_del(&kcm->wait_rx_list); - kcm->rx_wait = false; + /* paired with lockless reads in kcm_rfree() */ + WRITE_ONCE(kcm->rx_wait, false); } /* Move any pending receive messages to other kcm sockets */ requeue_rx_msgs(mux, &sk->sk_receive_queue); @@ -1838,10 +1845,10 @@ static int kcm_release(struct socket *sock) kcm = kcm_sk(sk); mux = kcm->mux; + lock_sock(sk); sock_orphan(sk); kfree_skb(kcm->seq_skb); - lock_sock(sk); /* Purge queue under lock to avoid race condition with tx_work trying * to act when queue is nonempty. If tx_work runs after this point * it will just return. diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c index 9dbd801ddb98..2478aa60145f 100644 --- a/net/l2tp/l2tp_ip6.c +++ b/net/l2tp/l2tp_ip6.c @@ -257,8 +257,6 @@ static void l2tp_ip6_destroy_sock(struct sock *sk) if (tunnel) l2tp_tunnel_delete(tunnel); - - inet6_destroy_sock(sk); } static int l2tp_ip6_bind(struct sock *sk, struct sockaddr *uaddr, int addr_len) diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c index 9414d3bbd65f..c6fa53230450 100644 --- a/net/mac80211/agg-rx.c +++ b/net/mac80211/agg-rx.c @@ -183,34 +183,15 @@ static void ieee80211_add_addbaext(struct ieee80211_sub_if_data *sdata, const struct ieee80211_addba_ext_ie *req, u16 buf_size) { - struct ieee80211_supported_band *sband; struct ieee80211_addba_ext_ie *resp; - const struct ieee80211_sta_he_cap *he_cap; - u8 frag_level, cap_frag_level; u8 *pos; - sband = ieee80211_get_sband(sdata); - if (!sband) - return; - he_cap = ieee80211_get_he_iftype_cap(sband, - ieee80211_vif_type_p2p(&sdata->vif)); - if (!he_cap) - return; - pos = skb_put_zero(skb, 2 + sizeof(struct ieee80211_addba_ext_ie)); *pos++ = WLAN_EID_ADDBA_EXT; *pos++ = sizeof(struct ieee80211_addba_ext_ie); resp = (struct ieee80211_addba_ext_ie *)pos; resp->data = req->data & IEEE80211_ADDBA_EXT_NO_FRAG; - frag_level = u32_get_bits(req->data, - IEEE80211_ADDBA_EXT_FRAG_LEVEL_MASK); - cap_frag_level = u32_get_bits(he_cap->he_cap_elem.mac_cap_info[0], - IEEE80211_HE_MAC_CAP0_DYNAMIC_FRAG_MASK); - if (frag_level > cap_frag_level) - frag_level = cap_frag_level; - resp->data |= u8_encode_bits(frag_level, - IEEE80211_ADDBA_EXT_FRAG_LEVEL_MASK); resp->data |= u8_encode_bits(buf_size >> IEEE80211_ADDBA_EXT_BUF_SIZE_SHIFT, IEEE80211_ADDBA_EXT_BUF_SIZE_MASK); } @@ -242,7 +223,7 @@ static void ieee80211_send_addba_resp(struct sta_info *sta, u8 *da, u16 tid, sdata->vif.type == NL80211_IFTYPE_MESH_POINT) memcpy(mgmt->bssid, sdata->vif.addr, ETH_ALEN); else if (sdata->vif.type == NL80211_IFTYPE_STATION) - memcpy(mgmt->bssid, sdata->deflink.u.mgd.bssid, ETH_ALEN); + memcpy(mgmt->bssid, sdata->vif.cfg.ap_addr, ETH_ALEN); else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) memcpy(mgmt->bssid, sdata->u.ibss.bssid, ETH_ALEN); @@ -297,9 +278,9 @@ void ___ieee80211_start_rx_ba_session(struct sta_info *sta, } if (!sta->sta.deflink.ht_cap.ht_supported && - sta->sdata->vif.bss_conf.chandef.chan->band != NL80211_BAND_6GHZ) { + !sta->sta.deflink.he_cap.has_he) { ht_dbg(sta->sdata, - "STA %pM erroneously requests BA session on tid %d w/o QoS\n", + "STA %pM erroneously requests BA session on tid %d w/o HT\n", sta->sta.addr, tid); /* send a response anyway, it's an error case if we get here */ goto end; diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c index 07c892aa8c73..9c40f8d3bce8 100644 --- a/net/mac80211/agg-tx.c +++ b/net/mac80211/agg-tx.c @@ -82,7 +82,7 @@ static void ieee80211_send_addba_request(struct ieee80211_sub_if_data *sdata, sdata->vif.type == NL80211_IFTYPE_MESH_POINT) memcpy(mgmt->bssid, sdata->vif.addr, ETH_ALEN); else if (sdata->vif.type == NL80211_IFTYPE_STATION) - memcpy(mgmt->bssid, sdata->deflink.u.mgd.bssid, ETH_ALEN); + memcpy(mgmt->bssid, sdata->vif.cfg.ap_addr, ETH_ALEN); else if (sdata->vif.type == NL80211_IFTYPE_ADHOC) memcpy(mgmt->bssid, sdata->u.ibss.bssid, ETH_ALEN); diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c index 687b4c878d4a..c848fe04dd44 100644 --- a/net/mac80211/cfg.c +++ b/net/mac80211/cfg.c @@ -2554,47 +2554,50 @@ static int ieee80211_change_bss(struct wiphy *wiphy, struct bss_parameters *params) { struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev); + struct ieee80211_link_data *link; struct ieee80211_supported_band *sband; u32 changed = 0; - if (!sdata_dereference(sdata->deflink.u.ap.beacon, sdata)) + link = ieee80211_link_or_deflink(sdata, params->link_id, true); + if (IS_ERR(link)) + return PTR_ERR(link); + + if (!sdata_dereference(link->u.ap.beacon, sdata)) return -ENOENT; - sband = ieee80211_get_sband(sdata); + sband = ieee80211_get_link_sband(link); if (!sband) return -EINVAL; if (params->use_cts_prot >= 0) { - sdata->vif.bss_conf.use_cts_prot = params->use_cts_prot; + link->conf->use_cts_prot = params->use_cts_prot; changed |= BSS_CHANGED_ERP_CTS_PROT; } if (params->use_short_preamble >= 0) { - sdata->vif.bss_conf.use_short_preamble = - params->use_short_preamble; + link->conf->use_short_preamble = params->use_short_preamble; changed |= BSS_CHANGED_ERP_PREAMBLE; } - if (!sdata->vif.bss_conf.use_short_slot && + if (!link->conf->use_short_slot && (sband->band == NL80211_BAND_5GHZ || sband->band == NL80211_BAND_6GHZ)) { - sdata->vif.bss_conf.use_short_slot = true; + link->conf->use_short_slot = true; changed |= BSS_CHANGED_ERP_SLOT; } if (params->use_short_slot_time >= 0) { - sdata->vif.bss_conf.use_short_slot = - params->use_short_slot_time; + link->conf->use_short_slot = params->use_short_slot_time; changed |= BSS_CHANGED_ERP_SLOT; } if (params->basic_rates) { - ieee80211_parse_bitrates(sdata->vif.bss_conf.chandef.width, + ieee80211_parse_bitrates(link->conf->chandef.width, wiphy->bands[sband->band], params->basic_rates, params->basic_rates_len, - &sdata->vif.bss_conf.basic_rates); + &link->conf->basic_rates); changed |= BSS_CHANGED_BASIC_RATES; - ieee80211_check_rate_mask(&sdata->deflink); + ieee80211_check_rate_mask(link); } if (params->ap_isolate >= 0) { @@ -2606,30 +2609,29 @@ static int ieee80211_change_bss(struct wiphy *wiphy, } if (params->ht_opmode >= 0) { - sdata->vif.bss_conf.ht_operation_mode = - (u16) params->ht_opmode; + link->conf->ht_operation_mode = (u16)params->ht_opmode; changed |= BSS_CHANGED_HT; } if (params->p2p_ctwindow >= 0) { - sdata->vif.bss_conf.p2p_noa_attr.oppps_ctwindow &= + link->conf->p2p_noa_attr.oppps_ctwindow &= ~IEEE80211_P2P_OPPPS_CTWINDOW_MASK; - sdata->vif.bss_conf.p2p_noa_attr.oppps_ctwindow |= + link->conf->p2p_noa_attr.oppps_ctwindow |= params->p2p_ctwindow & IEEE80211_P2P_OPPPS_CTWINDOW_MASK; changed |= BSS_CHANGED_P2P_PS; } if (params->p2p_opp_ps > 0) { - sdata->vif.bss_conf.p2p_noa_attr.oppps_ctwindow |= + link->conf->p2p_noa_attr.oppps_ctwindow |= IEEE80211_P2P_OPPPS_ENABLE_BIT; changed |= BSS_CHANGED_P2P_PS; } else if (params->p2p_opp_ps == 0) { - sdata->vif.bss_conf.p2p_noa_attr.oppps_ctwindow &= + link->conf->p2p_noa_attr.oppps_ctwindow &= ~IEEE80211_P2P_OPPPS_ENABLE_BIT; changed |= BSS_CHANGED_P2P_PS; } - ieee80211_link_info_change_notify(sdata, &sdata->deflink, changed); + ieee80211_link_info_change_notify(sdata, link, changed); return 0; } @@ -4338,9 +4340,6 @@ static int ieee80211_get_txq_stats(struct wiphy *wiphy, struct ieee80211_sub_if_data *sdata; int ret = 0; - if (!local->ops->wake_tx_queue) - return 1; - spin_lock_bh(&local->fq.lock); rcu_read_lock(); diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c index 78c7d60e8667..dfb9f55e2685 100644 --- a/net/mac80211/debugfs.c +++ b/net/mac80211/debugfs.c @@ -663,9 +663,7 @@ void debugfs_hw_add(struct ieee80211_local *local) DEBUGFS_ADD_MODE(force_tx_status, 0600); DEBUGFS_ADD_MODE(aql_enable, 0600); DEBUGFS_ADD(aql_pending); - - if (local->ops->wake_tx_queue) - DEBUGFS_ADD_MODE(aqm, 0600); + DEBUGFS_ADD_MODE(aqm, 0600); DEBUGFS_ADD_MODE(airtime_flags, 0600); diff --git a/net/mac80211/debugfs_netdev.c b/net/mac80211/debugfs_netdev.c index 5b014786fd2d..c87e1137e5da 100644 --- a/net/mac80211/debugfs_netdev.c +++ b/net/mac80211/debugfs_netdev.c @@ -677,8 +677,7 @@ static void add_common_files(struct ieee80211_sub_if_data *sdata) DEBUGFS_ADD(rc_rateidx_vht_mcs_mask_5ghz); DEBUGFS_ADD(hw_queues); - if (sdata->local->ops->wake_tx_queue && - sdata->vif.type != NL80211_IFTYPE_P2P_DEVICE && + if (sdata->vif.type != NL80211_IFTYPE_P2P_DEVICE && sdata->vif.type != NL80211_IFTYPE_NAN) DEBUGFS_ADD(aqm); } diff --git a/net/mac80211/debugfs_sta.c b/net/mac80211/debugfs_sta.c index d3397c1248d3..7a3d7893e19d 100644 --- a/net/mac80211/debugfs_sta.c +++ b/net/mac80211/debugfs_sta.c @@ -5,7 +5,7 @@ * Copyright 2007 Johannes Berg <johannes@sipsolutions.net> * Copyright 2013-2014 Intel Mobile Communications GmbH * Copyright(c) 2016 Intel Deutschland GmbH - * Copyright (C) 2018 - 2021 Intel Corporation + * Copyright (C) 2018 - 2022 Intel Corporation */ #include <linux/debugfs.h> @@ -435,8 +435,29 @@ static ssize_t sta_agg_status_write(struct file *file, const char __user *userbu } STA_OPS_RW(agg_status); -static ssize_t sta_ht_capa_read(struct file *file, char __user *userbuf, - size_t count, loff_t *ppos) +/* link sta attributes */ +#define LINK_STA_OPS(name) \ +static const struct file_operations link_sta_ ##name## _ops = { \ + .read = link_sta_##name##_read, \ + .open = simple_open, \ + .llseek = generic_file_llseek, \ +} + +static ssize_t link_sta_addr_read(struct file *file, char __user *userbuf, + size_t count, loff_t *ppos) +{ + struct link_sta_info *link_sta = file->private_data; + u8 mac[3 * ETH_ALEN + 1]; + + snprintf(mac, sizeof(mac), "%pM\n", link_sta->pub->addr); + + return simple_read_from_buffer(userbuf, count, ppos, mac, 3 * ETH_ALEN); +} + +LINK_STA_OPS(addr); + +static ssize_t link_sta_ht_capa_read(struct file *file, char __user *userbuf, + size_t count, loff_t *ppos) { #define PRINT_HT_CAP(_cond, _str) \ do { \ @@ -446,8 +467,8 @@ static ssize_t sta_ht_capa_read(struct file *file, char __user *userbuf, char *buf, *p; int i; ssize_t bufsz = 512; - struct sta_info *sta = file->private_data; - struct ieee80211_sta_ht_cap *htc = &sta->sta.deflink.ht_cap; + struct link_sta_info *link_sta = file->private_data; + struct ieee80211_sta_ht_cap *htc = &link_sta->pub->ht_cap; ssize_t ret; buf = kzalloc(bufsz, GFP_KERNEL); @@ -524,14 +545,14 @@ static ssize_t sta_ht_capa_read(struct file *file, char __user *userbuf, kfree(buf); return ret; } -STA_OPS(ht_capa); +LINK_STA_OPS(ht_capa); -static ssize_t sta_vht_capa_read(struct file *file, char __user *userbuf, - size_t count, loff_t *ppos) +static ssize_t link_sta_vht_capa_read(struct file *file, char __user *userbuf, + size_t count, loff_t *ppos) { char *buf, *p; - struct sta_info *sta = file->private_data; - struct ieee80211_sta_vht_cap *vhtc = &sta->sta.deflink.vht_cap; + struct link_sta_info *link_sta = file->private_data; + struct ieee80211_sta_vht_cap *vhtc = &link_sta->pub->vht_cap; ssize_t ret; ssize_t bufsz = 512; @@ -638,15 +659,15 @@ static ssize_t sta_vht_capa_read(struct file *file, char __user *userbuf, kfree(buf); return ret; } -STA_OPS(vht_capa); +LINK_STA_OPS(vht_capa); -static ssize_t sta_he_capa_read(struct file *file, char __user *userbuf, - size_t count, loff_t *ppos) +static ssize_t link_sta_he_capa_read(struct file *file, char __user *userbuf, + size_t count, loff_t *ppos) { char *buf, *p; size_t buf_sz = PAGE_SIZE; - struct sta_info *sta = file->private_data; - struct ieee80211_sta_he_cap *hec = &sta->sta.deflink.he_cap; + struct link_sta_info *link_sta = file->private_data; + struct ieee80211_sta_he_cap *hec = &link_sta->pub->he_cap; struct ieee80211_he_mcs_nss_supp *nss = &hec->he_mcs_nss_supp; u8 ppe_size; u8 *cap; @@ -1011,7 +1032,7 @@ out: kfree(buf); return ret; } -STA_OPS(he_capa); +LINK_STA_OPS(he_capa); #define DEBUGFS_ADD(name) \ debugfs_create_file(#name, 0400, \ @@ -1048,18 +1069,11 @@ void ieee80211_sta_debugfs_add(struct sta_info *sta) DEBUGFS_ADD(num_ps_buf_frames); DEBUGFS_ADD(last_seq_ctrl); DEBUGFS_ADD(agg_status); - DEBUGFS_ADD(ht_capa); - DEBUGFS_ADD(vht_capa); - DEBUGFS_ADD(he_capa); - - DEBUGFS_ADD_COUNTER(rx_duplicates, deflink.rx_stats.num_duplicates); - DEBUGFS_ADD_COUNTER(rx_fragments, deflink.rx_stats.fragments); + /* FIXME: Kept here as the statistics are only done on the deflink */ DEBUGFS_ADD_COUNTER(tx_filtered, deflink.status_stats.filtered); - if (local->ops->wake_tx_queue) { - DEBUGFS_ADD(aqm); - DEBUGFS_ADD(airtime); - } + DEBUGFS_ADD(aqm); + DEBUGFS_ADD(airtime); if (wiphy_ext_feature_isset(local->hw.wiphy, NL80211_EXT_FEATURE_AQL)) @@ -1076,3 +1090,85 @@ void ieee80211_sta_debugfs_remove(struct sta_info *sta) debugfs_remove_recursive(sta->debugfs_dir); sta->debugfs_dir = NULL; } + +#undef DEBUGFS_ADD +#undef DEBUGFS_ADD_COUNTER + +#define DEBUGFS_ADD(name) \ + debugfs_create_file(#name, 0400, \ + link_sta->debugfs_dir, link_sta, &link_sta_ ##name## _ops) +#define DEBUGFS_ADD_COUNTER(name, field) \ + debugfs_create_ulong(#name, 0400, link_sta->debugfs_dir, &link_sta->field) + +void ieee80211_link_sta_debugfs_add(struct link_sta_info *link_sta) +{ + if (WARN_ON(!link_sta->sta->debugfs_dir)) + return; + + /* For non-MLO, leave the files in the main directory. */ + if (link_sta->sta->sta.valid_links) { + char link_dir_name[10]; + + snprintf(link_dir_name, sizeof(link_dir_name), + "link-%d", link_sta->link_id); + + link_sta->debugfs_dir = + debugfs_create_dir(link_dir_name, + link_sta->sta->debugfs_dir); + + DEBUGFS_ADD(addr); + } else { + if (WARN_ON(link_sta != &link_sta->sta->deflink)) + return; + + link_sta->debugfs_dir = link_sta->sta->debugfs_dir; + } + + DEBUGFS_ADD(ht_capa); + DEBUGFS_ADD(vht_capa); + DEBUGFS_ADD(he_capa); + + DEBUGFS_ADD_COUNTER(rx_duplicates, rx_stats.num_duplicates); + DEBUGFS_ADD_COUNTER(rx_fragments, rx_stats.fragments); +} + +void ieee80211_link_sta_debugfs_remove(struct link_sta_info *link_sta) +{ + if (!link_sta->debugfs_dir || !link_sta->sta->debugfs_dir) { + link_sta->debugfs_dir = NULL; + return; + } + + if (link_sta->debugfs_dir == link_sta->sta->debugfs_dir) { + WARN_ON(link_sta != &link_sta->sta->deflink); + link_sta->sta->debugfs_dir = NULL; + return; + } + + debugfs_remove_recursive(link_sta->debugfs_dir); + link_sta->debugfs_dir = NULL; +} + +void ieee80211_link_sta_debugfs_drv_add(struct link_sta_info *link_sta) +{ + if (WARN_ON(!link_sta->debugfs_dir)) + return; + + drv_link_sta_add_debugfs(link_sta->sta->local, link_sta->sta->sdata, + link_sta->pub, link_sta->debugfs_dir); +} + +void ieee80211_link_sta_debugfs_drv_remove(struct link_sta_info *link_sta) +{ + if (!link_sta->debugfs_dir) + return; + + if (WARN_ON(link_sta->debugfs_dir == link_sta->sta->debugfs_dir)) + return; + + /* Recreate the directory excluding the driver data */ + debugfs_remove_recursive(link_sta->debugfs_dir); + link_sta->debugfs_dir = NULL; + + ieee80211_link_sta_debugfs_add(link_sta); +} diff --git a/net/mac80211/debugfs_sta.h b/net/mac80211/debugfs_sta.h index d2e7c27ad6d1..cde8148bdb18 100644 --- a/net/mac80211/debugfs_sta.h +++ b/net/mac80211/debugfs_sta.h @@ -7,9 +7,21 @@ #ifdef CONFIG_MAC80211_DEBUGFS void ieee80211_sta_debugfs_add(struct sta_info *sta); void ieee80211_sta_debugfs_remove(struct sta_info *sta); + +void ieee80211_link_sta_debugfs_add(struct link_sta_info *link_sta); +void ieee80211_link_sta_debugfs_remove(struct link_sta_info *link_sta); + +void ieee80211_link_sta_debugfs_drv_add(struct link_sta_info *link_sta); +void ieee80211_link_sta_debugfs_drv_remove(struct link_sta_info *link_sta); #else static inline void ieee80211_sta_debugfs_add(struct sta_info *sta) {} static inline void ieee80211_sta_debugfs_remove(struct sta_info *sta) {} + +static inline void ieee80211_link_sta_debugfs_add(struct link_sta_info *link_sta) {} +static inline void ieee80211_link_sta_debugfs_remove(struct link_sta_info *link_sta) {} + +static inline void ieee80211_link_sta_debugfs_drv_add(struct link_sta_info *link_sta) {} +static inline void ieee80211_link_sta_debugfs_drv_remove(struct link_sta_info *link_sta) {} #endif #endif /* __MAC80211_DEBUGFS_STA_H */ diff --git a/net/mac80211/driver-ops.c b/net/mac80211/driver-ops.c index 5392ffa18270..d737db4e07e2 100644 --- a/net/mac80211/driver-ops.c +++ b/net/mac80211/driver-ops.c @@ -7,6 +7,7 @@ #include "ieee80211_i.h" #include "trace.h" #include "driver-ops.h" +#include "debugfs_sta.h" int drv_start(struct ieee80211_local *local) { @@ -497,6 +498,11 @@ int drv_change_sta_links(struct ieee80211_local *local, struct ieee80211_sta *sta, u16 old_links, u16 new_links) { + struct sta_info *info = container_of(sta, struct sta_info, sta); + struct link_sta_info *link_sta; + unsigned long links_to_add; + unsigned long links_to_rem; + unsigned int link_id; int ret = -EOPNOTSUPP; might_sleep(); @@ -510,11 +516,30 @@ int drv_change_sta_links(struct ieee80211_local *local, if (old_links == new_links) return 0; + links_to_add = ~old_links & new_links; + links_to_rem = old_links & ~new_links; + + for_each_set_bit(link_id, &links_to_rem, IEEE80211_MLD_MAX_NUM_LINKS) { + link_sta = rcu_dereference_protected(info->link[link_id], + lockdep_is_held(&local->sta_mtx)); + + ieee80211_link_sta_debugfs_drv_remove(link_sta); + } + trace_drv_change_sta_links(local, sdata, sta, old_links, new_links); if (local->ops->change_sta_links) ret = local->ops->change_sta_links(&local->hw, &sdata->vif, sta, old_links, new_links); trace_drv_return_int(local, ret); - return ret; + if (ret) + return ret; + + for_each_set_bit(link_id, &links_to_add, IEEE80211_MLD_MAX_NUM_LINKS) { + link_sta = rcu_dereference_protected(info->link[link_id], + lockdep_is_held(&local->sta_mtx)); + ieee80211_link_sta_debugfs_drv_add(link_sta); + } + + return 0; } diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h index 81e40b0a3b16..809bad53e15b 100644 --- a/net/mac80211/driver-ops.h +++ b/net/mac80211/driver-ops.h @@ -480,6 +480,22 @@ static inline void drv_sta_add_debugfs(struct ieee80211_local *local, local->ops->sta_add_debugfs(&local->hw, &sdata->vif, sta, dir); } + +static inline void drv_link_sta_add_debugfs(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + struct ieee80211_link_sta *link_sta, + struct dentry *dir) +{ + might_sleep(); + + sdata = get_bss_sdata(sdata); + if (!check_sdata_in_driver(sdata)) + return; + + if (local->ops->link_sta_add_debugfs) + local->ops->link_sta_add_debugfs(&local->hw, &sdata->vif, + link_sta, dir); +} #endif static inline void drv_sta_pre_rcu_remove(struct ieee80211_local *local, diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h index 4e1d4c339f2d..63ff0d2524b6 100644 --- a/net/mac80211/ieee80211_i.h +++ b/net/mac80211/ieee80211_i.h @@ -390,6 +390,7 @@ struct ieee80211_mgd_auth_data { bool done, waiting; bool peer_confirmed; bool timeout_started; + int link_id; u8 ap_addr[ETH_ALEN] __aligned(2); @@ -412,6 +413,8 @@ struct ieee80211_mgd_assoc_data { u8 *elems; /* pointing to inside ie[] below */ ieee80211_conn_flags_t conn_flags; + + u16 status; } link[IEEE80211_MLD_MAX_NUM_LINKS]; u8 ap_addr[ETH_ALEN] __aligned(2); @@ -1707,8 +1710,27 @@ struct ieee802_11_elems { u8 tx_pwr_env_num; u8 eht_cap_len; + /* mult-link element can be de-fragmented and thus u8 is not sufficient */ + size_t multi_link_len; + + /* + * store the per station profile pointer and length in case that the + * parsing also handled Multi-Link element parsing for a specific link + * ID. + */ + struct ieee80211_mle_per_sta_profile *prof; + size_t sta_prof_len; + /* whether a parse error occurred while retrieving these elements */ bool parse_error; + + /* + * scratch buffer that can be used for various element parsing related + * tasks, e.g., element de-fragmentation etc. + */ + size_t scratch_len; + u8 *scratch_pos; + u8 scratch[]; }; static inline struct ieee80211_local *hw_to_local( @@ -2197,9 +2219,13 @@ static inline void ieee80211_tx_skb(struct ieee80211_sub_if_data *sdata, * represent a non-transmitting BSS in which case the data * for that non-transmitting BSS is returned * @link_id: the link ID to parse elements for, if a STA profile - * is present in the multi-link element, or -1 to ignore + * is present in the multi-link element, or -1 to ignore; + * note that the code currently assumes parsing an association + * (or re-association) response frame if this is given * @from_ap: frame is received from an AP (currently used only * for EHT capabilities parsing) + * @scratch_len: if non zero, specifies the requested length of the scratch + * buffer; otherwise, 'len' is used. */ struct ieee80211_elems_parse_params { const u8 *start; @@ -2210,6 +2236,7 @@ struct ieee80211_elems_parse_params { struct cfg80211_bss *bss; int link_id; bool from_ap; + size_t scratch_len; }; struct ieee802_11_elems * @@ -2280,7 +2307,6 @@ void ieee80211_wake_queue_by_reason(struct ieee80211_hw *hw, int queue, void ieee80211_stop_queue_by_reason(struct ieee80211_hw *hw, int queue, enum queue_stop_reason reason, bool refcounted); -void ieee80211_propagate_queue_wake(struct ieee80211_local *local, int queue); void ieee80211_add_pending_skb(struct ieee80211_local *local, struct sk_buff *skb); void ieee80211_add_pending_skbs(struct ieee80211_local *local, diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c index 572254366a0f..7c4ce716c939 100644 --- a/net/mac80211/iface.c +++ b/net/mac80211/iface.c @@ -243,7 +243,7 @@ static int ieee80211_can_powered_addr_change(struct ieee80211_sub_if_data *sdata */ break; default: - return -EOPNOTSUPP; + ret = -EOPNOTSUPP; } unlock: @@ -458,12 +458,6 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata, bool going_do if (cancel_scan) ieee80211_scan_cancel(local); - /* - * Stop TX on this interface first. - */ - if (sdata->dev) - netif_tx_stop_all_queues(sdata->dev); - ieee80211_roc_purge(local, sdata); switch (sdata->vif.type) { @@ -811,13 +805,6 @@ static void ieee80211_uninit(struct net_device *dev) ieee80211_teardown_sdata(IEEE80211_DEV_TO_SUB_IF(dev)); } -static u16 ieee80211_netdev_select_queue(struct net_device *dev, - struct sk_buff *skb, - struct net_device *sb_dev) -{ - return ieee80211_select_queue(IEEE80211_DEV_TO_SUB_IF(dev), skb); -} - static void ieee80211_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) { @@ -831,7 +818,6 @@ static const struct net_device_ops ieee80211_dataif_ops = { .ndo_start_xmit = ieee80211_subif_start_xmit, .ndo_set_rx_mode = ieee80211_set_multicast_list, .ndo_set_mac_address = ieee80211_change_mac, - .ndo_select_queue = ieee80211_netdev_select_queue, .ndo_get_stats64 = ieee80211_get_stats64, }; @@ -939,7 +925,6 @@ static const struct net_device_ops ieee80211_dataif_8023_ops = { .ndo_start_xmit = ieee80211_subif_start_xmit_8023, .ndo_set_rx_mode = ieee80211_set_multicast_list, .ndo_set_mac_address = ieee80211_change_mac, - .ndo_select_queue = ieee80211_netdev_select_queue, .ndo_get_stats64 = ieee80211_get_stats64, .ndo_fill_forward_path = ieee80211_netdev_fill_forward_path, }; @@ -1412,8 +1397,6 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up) sdata->vif.type != NL80211_IFTYPE_STATION); } - set_bit(SDATA_STATE_RUNNING, &sdata->state); - switch (sdata->vif.type) { case NL80211_IFTYPE_P2P_DEVICE: rcu_assign_pointer(local->p2p_sdata, sdata); @@ -1443,34 +1426,7 @@ int ieee80211_do_open(struct wireless_dev *wdev, bool coming_up) ieee80211_recalc_ps(local); - if (sdata->vif.type == NL80211_IFTYPE_MONITOR || - sdata->vif.type == NL80211_IFTYPE_AP_VLAN || - local->ops->wake_tx_queue) { - /* XXX: for AP_VLAN, actually track AP queues */ - if (dev) - netif_tx_start_all_queues(dev); - } else if (dev) { - unsigned long flags; - int n_acs = IEEE80211_NUM_ACS; - int ac; - - if (local->hw.queues < IEEE80211_NUM_ACS) - n_acs = 1; - - spin_lock_irqsave(&local->queue_stop_reason_lock, flags); - if (sdata->vif.cab_queue == IEEE80211_INVAL_HW_QUEUE || - (local->queue_stop_reasons[sdata->vif.cab_queue] == 0 && - skb_queue_empty(&local->pending[sdata->vif.cab_queue]))) { - for (ac = 0; ac < n_acs; ac++) { - int ac_queue = sdata->vif.hw_queue[ac]; - - if (local->queue_stop_reasons[ac_queue] == 0 && - skb_queue_empty(&local->pending[ac_queue])) - netif_start_subqueue(dev, ac); - } - } - spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); - } + set_bit(SDATA_STATE_RUNNING, &sdata->state); return 0; err_del_interface: @@ -1499,17 +1455,12 @@ static void ieee80211_if_setup(struct net_device *dev) { ether_setup(dev); dev->priv_flags &= ~IFF_TX_SKB_SHARING; + dev->priv_flags |= IFF_NO_QUEUE; dev->netdev_ops = &ieee80211_dataif_ops; dev->needs_free_netdev = true; dev->priv_destructor = ieee80211_if_free; } -static void ieee80211_if_setup_no_queue(struct net_device *dev) -{ - ieee80211_if_setup(dev); - dev->priv_flags |= IFF_NO_QUEUE; -} - static void ieee80211_iface_process_skb(struct ieee80211_local *local, struct ieee80211_sub_if_data *sdata, struct sk_buff *skb) @@ -2094,9 +2045,7 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name, struct net_device *ndev = NULL; struct ieee80211_sub_if_data *sdata = NULL; struct txq_info *txqi; - void (*if_setup)(struct net_device *dev); int ret, i; - int txqs = 1; ASSERT_RTNL(); @@ -2119,30 +2068,18 @@ int ieee80211_if_add(struct ieee80211_local *local, const char *name, sizeof(void *)); int txq_size = 0; - if (local->ops->wake_tx_queue && - type != NL80211_IFTYPE_AP_VLAN && + if (type != NL80211_IFTYPE_AP_VLAN && (type != NL80211_IFTYPE_MONITOR || (params->flags & MONITOR_FLAG_ACTIVE))) txq_size += sizeof(struct txq_info) + local->hw.txq_data_size; - if (local->ops->wake_tx_queue) { - if_setup = ieee80211_if_setup_no_queue; - } else { - if_setup = ieee80211_if_setup; - if (local->hw.queues >= IEEE80211_NUM_ACS) - txqs = IEEE80211_NUM_ACS; - } - ndev = alloc_netdev_mqs(size + txq_size, name, name_assign_type, - if_setup, txqs, 1); + ieee80211_if_setup, 1, 1); if (!ndev) return -ENOMEM; - if (!local->ops->wake_tx_queue && local->hw.wiphy->tx_queue_len) - ndev->tx_queue_len = local->hw.wiphy->tx_queue_len; - dev_net_set(ndev, wiphy_net(local->hw.wiphy)); ndev->tstats = netdev_alloc_pcpu_stats(struct pcpu_sw_netstats); diff --git a/net/mac80211/link.c b/net/mac80211/link.c index e309708abae8..d1f5a9f7c647 100644 --- a/net/mac80211/link.c +++ b/net/mac80211/link.c @@ -357,6 +357,11 @@ static int _ieee80211_set_active_links(struct ieee80211_sub_if_data *sdata, list_for_each_entry(sta, &local->sta_list, list) { if (sdata != sta->sdata) continue; + + /* this is very temporary, but do it anyway */ + __ieee80211_sta_recalc_aggregates(sta, + old_active | active_links); + ret = drv_change_sta_links(local, sdata, &sta->sta, old_active, old_active | active_links); @@ -369,10 +374,22 @@ static int _ieee80211_set_active_links(struct ieee80211_sub_if_data *sdata, list_for_each_entry(sta, &local->sta_list, list) { if (sdata != sta->sdata) continue; + + __ieee80211_sta_recalc_aggregates(sta, active_links); + ret = drv_change_sta_links(local, sdata, &sta->sta, old_active | active_links, active_links); WARN_ON_ONCE(ret); + + /* + * Do it again, just in case - the driver might very + * well have called ieee80211_sta_recalc_aggregates() + * from there when filling in the new links, which + * would set it wrong since the vif's active links are + * not switched yet... + */ + __ieee80211_sta_recalc_aggregates(sta, active_links); } for_each_set_bit(link_id, &add, IEEE80211_MLD_MAX_NUM_LINKS) { diff --git a/net/mac80211/main.c b/net/mac80211/main.c index 46f3eddc2388..b7279d88cddb 100644 --- a/net/mac80211/main.c +++ b/net/mac80211/main.c @@ -630,7 +630,7 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len, if (WARN_ON(!ops->tx || !ops->start || !ops->stop || !ops->config || !ops->add_interface || !ops->remove_interface || - !ops->configure_filter)) + !ops->configure_filter || !ops->wake_tx_queue)) return NULL; if (WARN_ON(ops->sta_state && (ops->sta_add || ops->sta_remove))) @@ -719,9 +719,7 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len, if (!ops->set_key) wiphy->flags |= WIPHY_FLAG_IBSS_RSN; - if (ops->wake_tx_queue) - wiphy_ext_feature_set(wiphy, NL80211_EXT_FEATURE_TXQS); - + wiphy_ext_feature_set(wiphy, NL80211_EXT_FEATURE_TXQS); wiphy_ext_feature_set(wiphy, NL80211_EXT_FEATURE_RRM); wiphy->bss_priv_size = sizeof(struct ieee80211_bss); @@ -834,10 +832,7 @@ struct ieee80211_hw *ieee80211_alloc_hw_nm(size_t priv_data_len, atomic_set(&local->agg_queue_stop[i], 0); } tasklet_setup(&local->tx_pending_tasklet, ieee80211_tx_pending); - - if (ops->wake_tx_queue) - tasklet_setup(&local->wake_txqs_tasklet, ieee80211_wake_txqs); - + tasklet_setup(&local->wake_txqs_tasklet, ieee80211_wake_txqs); tasklet_setup(&local->tasklet, ieee80211_tasklet_handler); skb_queue_head_init(&local->skb_queue); @@ -1087,6 +1082,16 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) channels += sband->n_channels; + /* + * Due to the way the aggregation code handles this and it + * being an HT capability, we can't really support delayed + * BA in MLO (yet). + */ + if (WARN_ON(sband->ht_cap.ht_supported && + (sband->ht_cap.cap & IEEE80211_HT_CAP_DELAY_BA) && + hw->wiphy->flags & WIPHY_FLAG_SUPPORTS_MLO)) + return -EINVAL; + if (max_bitrates < sband->n_bitrates) max_bitrates = sband->n_bitrates; supp_ht = supp_ht || sband->ht_cap.ht_supported; @@ -1155,6 +1160,8 @@ int ieee80211_register_hw(struct ieee80211_hw *hw) if (!local->int_scan_req) return -ENOMEM; + eth_broadcast_addr(local->int_scan_req->bssid); + for (band = 0; band < NUM_NL80211_BANDS; band++) { if (!local->hw.wiphy->bands[band]) continue; diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c index ff449e0c2e62..a804e0220ed7 100644 --- a/net/mac80211/mlme.c +++ b/net/mac80211/mlme.c @@ -2717,18 +2717,10 @@ static u32 ieee80211_link_set_associated(struct ieee80211_link_data *link, } if (link->u.mgd.have_beacon) { - /* - * If the AP is buggy we may get here with no DTIM period - * known, so assume it's 1 which is the only safe assumption - * in that case, although if the TIM IE is broken powersave - * probably just won't work at all. - */ - bss_conf->dtim_period = link->u.mgd.dtim_period ?: 1; bss_conf->beacon_rate = bss->beacon_rate; changed |= BSS_CHANGED_BEACON_INFO; } else { bss_conf->beacon_rate = NULL; - bss_conf->dtim_period = 0; } /* Tell the driver to monitor connection quality (if supported) */ @@ -2754,7 +2746,8 @@ static void ieee80211_set_associated(struct ieee80211_sub_if_data *sdata, struct cfg80211_bss *cbss = assoc_data->link[link_id].bss; struct ieee80211_link_data *link; - if (!cbss) + if (!cbss || + assoc_data->link[link_id].status != WLAN_STATUS_SUCCESS) continue; link = sdata_dereference(sdata->link[link_id], sdata); @@ -2782,7 +2775,8 @@ static void ieee80211_set_associated(struct ieee80211_sub_if_data *sdata, struct ieee80211_link_data *link; struct cfg80211_bss *cbss = assoc_data->link[link_id].bss; - if (!cbss) + if (!cbss || + assoc_data->link[link_id].status != WLAN_STATUS_SUCCESS) continue; link = sdata_dereference(sdata->link[link_id], sdata); @@ -3868,9 +3862,15 @@ static void ieee80211_get_rates(struct ieee80211_supported_band *sband, } } -static bool ieee80211_twt_req_supported(const struct link_sta_info *link_sta, +static bool ieee80211_twt_req_supported(struct ieee80211_sub_if_data *sdata, + struct ieee80211_supported_band *sband, + const struct link_sta_info *link_sta, const struct ieee802_11_elems *elems) { + const struct ieee80211_sta_he_cap *own_he_cap = + ieee80211_get_he_iftype_cap(sband, + ieee80211_vif_type_p2p(&sdata->vif)); + if (elems->ext_capab_len < 10) return false; @@ -3878,14 +3878,19 @@ static bool ieee80211_twt_req_supported(const struct link_sta_info *link_sta, return false; return link_sta->pub->he_cap.he_cap_elem.mac_cap_info[0] & - IEEE80211_HE_MAC_CAP0_TWT_RES; + IEEE80211_HE_MAC_CAP0_TWT_RES && + own_he_cap && + (own_he_cap->he_cap_elem.mac_cap_info[0] & + IEEE80211_HE_MAC_CAP0_TWT_REQ); } -static int ieee80211_recalc_twt_req(struct ieee80211_link_data *link, +static int ieee80211_recalc_twt_req(struct ieee80211_sub_if_data *sdata, + struct ieee80211_supported_band *sband, + struct ieee80211_link_data *link, struct link_sta_info *link_sta, struct ieee802_11_elems *elems) { - bool twt = ieee80211_twt_req_supported(link_sta, elems); + bool twt = ieee80211_twt_req_supported(sdata, sband, link_sta, elems); if (link->conf->twt_requester != twt) { link->conf->twt_requester = twt; @@ -3923,11 +3928,12 @@ static bool ieee80211_assoc_config_link(struct ieee80211_link_data *link, struct ieee80211_mgd_assoc_data *assoc_data = sdata->u.mgd.assoc_data; struct ieee80211_bss_conf *bss_conf = link->conf; struct ieee80211_local *local = sdata->local; + unsigned int link_id = link->link_id; struct ieee80211_elems_parse_params parse_params = { .start = elem_start, .len = elem_len, .bss = cbss, - .link_id = link == &sdata->deflink ? -1 : link->link_id, + .link_id = link_id == assoc_data->assoc_link_id ? -1 : link_id, .from_ap = true, }; bool is_6ghz = cbss->channel->band == NL80211_BAND_6GHZ; @@ -3942,8 +3948,35 @@ static bool ieee80211_assoc_config_link(struct ieee80211_link_data *link, if (!elems) return false; - /* FIXME: use from STA profile element after parsing that */ - capab_info = le16_to_cpu(mgmt->u.assoc_resp.capab_info); + if (link_id == assoc_data->assoc_link_id) { + capab_info = le16_to_cpu(mgmt->u.assoc_resp.capab_info); + + /* + * we should not get to this flow unless the association was + * successful, so set the status directly to success + */ + assoc_data->link[link_id].status = WLAN_STATUS_SUCCESS; + } else if (!elems->prof) { + ret = false; + goto out; + } else { + const u8 *ptr = elems->prof->variable + + elems->prof->sta_info_len - 1; + + /* + * During parsing, we validated that these fields exist, + * otherwise elems->prof would have been set to NULL. + */ + capab_info = get_unaligned_le16(ptr); + assoc_data->link[link_id].status = get_unaligned_le16(ptr + 2); + + if (assoc_data->link[link_id].status != WLAN_STATUS_SUCCESS) { + link_info(link, "association response status code=%u\n", + assoc_data->link[link_id].status); + ret = true; + goto out; + } + } if (!is_s1g && !elems->supp_rates) { sdata_info(sdata, "no SuppRates element in AssocResp\n"); @@ -4065,7 +4098,6 @@ static bool ieee80211_assoc_config_link(struct ieee80211_link_data *link, if (!(link->u.mgd.conn_flags & IEEE80211_CONN_DISABLE_HE) && (!elems->he_cap || !elems->he_operation)) { - mutex_unlock(&sdata->local->sta_mtx); sdata_info(sdata, "HE AP is missing HE capability/operation\n"); ret = false; @@ -4100,7 +4132,8 @@ static bool ieee80211_assoc_config_link(struct ieee80211_link_data *link, else bss_conf->twt_protected = false; - *changed |= ieee80211_recalc_twt_req(link, link_sta, elems); + *changed |= ieee80211_recalc_twt_req(sdata, sband, link, + link_sta, elems); if (elems->eht_operation && elems->eht_cap && !(link->u.mgd.conn_flags & IEEE80211_CONN_DISABLE_EHT)) { @@ -4410,8 +4443,11 @@ ieee80211_verify_peer_he_mcs_support(struct ieee80211_sub_if_data *sdata, he_cap_elem = cfg80211_find_ext_elem(WLAN_EID_EXT_HE_CAPABILITY, ies->data, ies->len); + if (!he_cap_elem) + return false; + /* invalid HE IE */ - if (!he_cap_elem || he_cap_elem->datalen < 1 + sizeof(*he_cap)) { + if (he_cap_elem->datalen < 1 + sizeof(*he_cap)) { sdata_info(sdata, "Invalid HE elem, Disable HE\n"); return false; @@ -4677,8 +4713,6 @@ static int ieee80211_prep_channel(struct ieee80211_sub_if_data *sdata, } if (!elems->vht_cap_elem) { - sdata_info(sdata, - "bad VHT capabilities, disabling VHT\n"); *conn_flags |= IEEE80211_CONN_DISABLE_VHT; vht_oper = NULL; } @@ -4864,6 +4898,7 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, unsigned int link_id; struct sta_info *sta; u64 changed[IEEE80211_MLD_MAX_NUM_LINKS] = {}; + u16 valid_links = 0; int err; mutex_lock(&sdata->local->sta_mtx); @@ -4876,8 +4911,6 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, goto out_err; if (sdata->vif.valid_links) { - u16 valid_links = 0; - for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { if (!assoc_data->link[link_id].bss) continue; @@ -4894,10 +4927,11 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, } for (link_id = 0; link_id < IEEE80211_MLD_MAX_NUM_LINKS; link_id++) { + struct cfg80211_bss *cbss = assoc_data->link[link_id].bss; struct ieee80211_link_data *link; struct link_sta_info *link_sta; - if (!assoc_data->link[link_id].bss) + if (!cbss) continue; link = sdata_dereference(sdata->link[link_id], sdata); @@ -4906,28 +4940,36 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, if (sdata->vif.valid_links) link_info(link, - "local address %pM, AP link address %pM\n", + "local address %pM, AP link address %pM%s\n", link->conf->addr, - assoc_data->link[link_id].bss->bssid); + assoc_data->link[link_id].bss->bssid, + link_id == assoc_data->assoc_link_id ? + " (assoc)" : ""); link_sta = rcu_dereference_protected(sta->link[link_id], lockdep_is_held(&local->sta_mtx)); if (WARN_ON(!link_sta)) goto out_err; - if (link_id != assoc_data->assoc_link_id) { - struct cfg80211_bss *cbss = assoc_data->link[link_id].bss; + if (!link->u.mgd.have_beacon) { const struct cfg80211_bss_ies *ies; rcu_read_lock(); - ies = rcu_dereference(cbss->ies); + ies = rcu_dereference(cbss->beacon_ies); + if (ies) + link->u.mgd.have_beacon = true; + else + ies = rcu_dereference(cbss->ies); ieee80211_get_dtim(ies, &link->conf->sync_dtim_count, &link->u.mgd.dtim_period); - link->conf->dtim_period = link->u.mgd.dtim_period ?: 1; link->conf->beacon_int = cbss->beacon_interval; rcu_read_unlock(); + } + link->conf->dtim_period = link->u.mgd.dtim_period ?: 1; + + if (link_id != assoc_data->assoc_link_id) { err = ieee80211_prep_channel(sdata, link, cbss, &link->u.mgd.conn_flags); if (err) { @@ -4947,6 +4989,12 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, &changed[link_id])) goto out_err; + if (assoc_data->link[link_id].status != WLAN_STATUS_SUCCESS) { + valid_links &= ~BIT(link_id); + ieee80211_sta_remove_link(sta, link_id); + continue; + } + if (link_id != assoc_data->assoc_link_id) { err = ieee80211_sta_activate_link(sta, link_id); if (err) @@ -4954,6 +5002,9 @@ static bool ieee80211_assoc_success(struct ieee80211_sub_if_data *sdata, } } + /* links might have changed due to rejected ones, set them again */ + ieee80211_vif_set_links(sdata, valid_links); + rate_control_rate_init(sta); if (ifmgd->flags & IEEE80211_STA_MFP_ENABLED) { @@ -5033,6 +5084,7 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, struct cfg80211_rx_assoc_resp resp = { .uapsd_queues = -1, }; + u8 ap_mld_addr[ETH_ALEN] __aligned(2); unsigned int link_id; sdata_assert_lock(sdata); @@ -5187,10 +5239,13 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, link = sdata_dereference(sdata->link[link_id], sdata); if (!link) continue; + if (!assoc_data->link[link_id].bss) continue; + resp.links[link_id].bss = assoc_data->link[link_id].bss; resp.links[link_id].addr = link->conf->addr; + resp.links[link_id].status = assoc_data->link[link_id].status; /* get uapsd queues configuration - same for all links */ resp.uapsd_queues = 0; @@ -5199,6 +5254,11 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, resp.uapsd_queues |= ieee80211_ac_to_qos_mask[ac]; } + if (sdata->vif.valid_links) { + ether_addr_copy(ap_mld_addr, sdata->vif.cfg.ap_addr); + resp.ap_mld_addr = ap_mld_addr; + } + ieee80211_destroy_assoc_data(sdata, status_code == WLAN_STATUS_SUCCESS ? ASSOC_SUCCESS : @@ -5208,8 +5268,6 @@ static void ieee80211_rx_mgmt_assoc_resp(struct ieee80211_sub_if_data *sdata, resp.len = len; resp.req_ies = ifmgd->assoc_req_ies; resp.req_ies_len = ifmgd->assoc_req_ies_len; - if (sdata->vif.valid_links) - resp.ap_mld_addr = sdata->vif.cfg.ap_addr; cfg80211_rx_assoc_resp(sdata->dev, &resp); notify_driver: drv_mgd_complete_tx(sdata->local, sdata, &info); @@ -5432,6 +5490,7 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_link_data *link, struct ieee802_11_elems *elems; struct ieee80211_local *local = sdata->local; struct ieee80211_chanctx_conf *chanctx_conf; + struct ieee80211_supported_band *sband; struct ieee80211_channel *chan; struct link_sta_info *link_sta; struct sta_info *sta; @@ -5683,14 +5742,23 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_link_data *link, mutex_lock(&local->sta_mtx); sta = sta_info_get(sdata, sdata->vif.cfg.ap_addr); - if (WARN_ON(!sta)) + if (WARN_ON(!sta)) { + mutex_unlock(&local->sta_mtx); goto free; + } link_sta = rcu_dereference_protected(sta->link[link->link_id], lockdep_is_held(&local->sta_mtx)); - if (WARN_ON(!link_sta)) + if (WARN_ON(!link_sta)) { + mutex_unlock(&local->sta_mtx); goto free; + } + + if (WARN_ON(!link->conf->chandef.chan)) + goto free; + + sband = local->hw.wiphy->bands[link->conf->chandef.chan->band]; - changed |= ieee80211_recalc_twt_req(link, link_sta, elems); + changed |= ieee80211_recalc_twt_req(sdata, sband, link, link_sta, elems); if (ieee80211_config_bw(link, elems->ht_cap_elem, elems->vht_cap_elem, elems->ht_operation, @@ -6636,6 +6704,7 @@ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata, req->ap_mld_addr ?: req->bss->bssid, ETH_ALEN); auth_data->bss = req->bss; + auth_data->link_id = req->link_id; if (req->auth_data_len >= 4) { if (req->auth_type == NL80211_AUTHTYPE_SAE) { @@ -6654,7 +6723,8 @@ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata, * removal and re-addition of the STA entry in * ieee80211_prep_connection(). */ - cont_auth = ifmgd->auth_data && req->bss == ifmgd->auth_data->bss; + cont_auth = ifmgd->auth_data && req->bss == ifmgd->auth_data->bss && + ifmgd->auth_data->link_id == req->link_id; if (req->ie && req->ie_len) { memcpy(&auth_data->data[auth_data->data_len], @@ -6978,7 +7048,8 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata, /* keep sta info, bssid if matching */ match = ether_addr_equal(ifmgd->auth_data->ap_addr, - assoc_data->ap_addr); + assoc_data->ap_addr) && + ifmgd->auth_data->link_id == req->link_id; ieee80211_destroy_auth_data(sdata, match); } diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c index 24c3c055db6d..762346598338 100644 --- a/net/mac80211/rc80211_minstrel_ht.c +++ b/net/mac80211/rc80211_minstrel_ht.c @@ -10,6 +10,7 @@ #include <linux/random.h> #include <linux/moduleparam.h> #include <linux/ieee80211.h> +#include <linux/minmax.h> #include <net/mac80211.h> #include "rate.h" #include "sta_info.h" @@ -1550,6 +1551,7 @@ minstrel_ht_update_rates(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) { struct ieee80211_sta_rates *rates; int i = 0; + int max_rates = min_t(int, mp->hw->max_rates, IEEE80211_TX_RATE_TABLE_SIZE); rates = kzalloc(sizeof(*rates), GFP_ATOMIC); if (!rates) @@ -1559,10 +1561,10 @@ minstrel_ht_update_rates(struct minstrel_priv *mp, struct minstrel_ht_sta *mi) minstrel_ht_set_rate(mp, mi, rates, i++, mi->max_tp_rate[0]); /* Fill up remaining, keep one entry for max_probe_rate */ - for (; i < (mp->hw->max_rates - 1); i++) + for (; i < (max_rates - 1); i++) minstrel_ht_set_rate(mp, mi, rates, i, mi->max_tp_rate[i]); - if (i < mp->hw->max_rates) + if (i < max_rates) minstrel_ht_set_rate(mp, mi, rates, i++, mi->max_prob_rate); if (i < IEEE80211_TX_RATE_TABLE_SIZE) @@ -1961,9 +1963,6 @@ minstrel_ht_alloc(struct ieee80211_hw *hw) /* safe default, does not necessarily have to match hw properties */ mp->max_retry = 7; - if (hw->max_rates >= 4) - mp->has_mrr = true; - mp->hw = hw; mp->update_interval = HZ / 20; @@ -2034,7 +2033,7 @@ static void __init init_sample_table(void) memset(sample_table, 0xff, sizeof(sample_table)); for (col = 0; col < SAMPLE_COLUMNS; col++) { - prandom_bytes(rnd, sizeof(rnd)); + get_random_bytes(rnd, sizeof(rnd)); for (i = 0; i < MCS_GROUP_RATES; i++) { new_idx = (i + rnd[i]) % MCS_GROUP_RATES; while (sample_table[col][new_idx] != 0xff) diff --git a/net/mac80211/rc80211_minstrel_ht.h b/net/mac80211/rc80211_minstrel_ht.h index 1766ff0c78d3..4be0401f7721 100644 --- a/net/mac80211/rc80211_minstrel_ht.h +++ b/net/mac80211/rc80211_minstrel_ht.h @@ -74,7 +74,6 @@ struct minstrel_priv { struct ieee80211_hw *hw; - bool has_mrr; unsigned int cw_min; unsigned int cw_max; unsigned int max_retry; diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c index a57811372027..c28c6fbf786e 100644 --- a/net/mac80211/rx.c +++ b/net/mac80211/rx.c @@ -49,7 +49,7 @@ static struct sk_buff *ieee80211_clean_skb(struct sk_buff *skb, if (present_fcs_len) __pskb_trim(skb, skb->len - present_fcs_len); - __pskb_pull(skb, rtap_space); + pskb_pull(skb, rtap_space); hdr = (void *)skb->data; fc = hdr->frame_control; @@ -74,7 +74,7 @@ static struct sk_buff *ieee80211_clean_skb(struct sk_buff *skb, memmove(skb->data + IEEE80211_HT_CTL_LEN, skb->data, hdrlen - IEEE80211_HT_CTL_LEN); - __pskb_pull(skb, IEEE80211_HT_CTL_LEN); + pskb_pull(skb, IEEE80211_HT_CTL_LEN); return skb; } @@ -1571,9 +1571,6 @@ static void sta_ps_start(struct sta_info *sta) ieee80211_clear_fast_xmit(sta); - if (!sta->sta.txq[0]) - return; - for (tid = 0; tid < IEEE80211_NUM_TIDS; tid++) { struct ieee80211_txq *txq = sta->sta.txq[tid]; struct txq_info *txqi = to_txq_info(txq); @@ -1978,10 +1975,11 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx) if (mmie_keyidx < NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS || mmie_keyidx >= NUM_DEFAULT_KEYS + NUM_DEFAULT_MGMT_KEYS + - NUM_DEFAULT_BEACON_KEYS) { - cfg80211_rx_unprot_mlme_mgmt(rx->sdata->dev, - skb->data, - skb->len); + NUM_DEFAULT_BEACON_KEYS) { + if (rx->sdata->dev) + cfg80211_rx_unprot_mlme_mgmt(rx->sdata->dev, + skb->data, + skb->len); return RX_DROP_MONITOR; /* unexpected BIP keyidx */ } @@ -2131,7 +2129,8 @@ ieee80211_rx_h_decrypt(struct ieee80211_rx_data *rx) /* either the frame has been decrypted or will be dropped */ status->flag |= RX_FLAG_DECRYPTED; - if (unlikely(ieee80211_is_beacon(fc) && result == RX_DROP_UNUSABLE)) + if (unlikely(ieee80211_is_beacon(fc) && result == RX_DROP_UNUSABLE && + rx->sdata->dev)) cfg80211_rx_unprot_mlme_mgmt(rx->sdata->dev, skb->data, skb->len); @@ -4352,6 +4351,7 @@ void ieee80211_check_fast_rx(struct sta_info *sta) .vif_type = sdata->vif.type, .control_port_protocol = sdata->control_port_protocol, }, *old, *new = NULL; + u32 offload_flags; bool set_offload = false; bool assign = false; bool offload; @@ -4467,10 +4467,10 @@ void ieee80211_check_fast_rx(struct sta_info *sta) if (assign) new = kmemdup(&fastrx, sizeof(fastrx), GFP_KERNEL); - offload = assign && - (sdata->vif.offload_flags & IEEE80211_OFFLOAD_DECAP_ENABLED); + offload_flags = get_bss_sdata(sdata)->vif.offload_flags; + offload = offload_flags & IEEE80211_OFFLOAD_DECAP_ENABLED; - if (offload) + if (assign && offload) set_offload = !test_and_set_sta_flag(sta, WLAN_STA_DECAP_OFFLOAD); else set_offload = test_and_clear_sta_flag(sta, WLAN_STA_DECAP_OFFLOAD); @@ -4708,7 +4708,7 @@ static bool ieee80211_invoke_fast_rx(struct ieee80211_rx_data *rx, if (!(status->rx_flags & IEEE80211_RX_AMSDU)) { if (!pskb_may_pull(skb, snap_offs + sizeof(*payload))) - goto drop; + return false; payload = (void *)(skb->data + snap_offs); diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c index 0e8c4f48c36d..dc3cdee51e66 100644 --- a/net/mac80211/scan.c +++ b/net/mac80211/scan.c @@ -641,7 +641,7 @@ static void ieee80211_send_scan_probe_req(struct ieee80211_sub_if_data *sdata, if (flags & IEEE80211_PROBE_FLAG_RANDOM_SN) { struct ieee80211_hdr *hdr = (void *)skb->data; struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); - u16 sn = get_random_u32(); + u16 sn = get_random_u16(); info->control.flags |= IEEE80211_TX_CTRL_NO_SEQNO; hdr->seq_ctrl = diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c index cebfd148bb40..04e0f132b1d9 100644 --- a/net/mac80211/sta_info.c +++ b/net/mac80211/sta_info.c @@ -140,17 +140,15 @@ static void __cleanup_single_sta(struct sta_info *sta) atomic_dec(&ps->num_sta_ps); } - if (sta->sta.txq[0]) { - for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) { - struct txq_info *txqi; + for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) { + struct txq_info *txqi; - if (!sta->sta.txq[i]) - continue; + if (!sta->sta.txq[i]) + continue; - txqi = to_txq_info(sta->sta.txq[i]); + txqi = to_txq_info(sta->sta.txq[i]); - ieee80211_txq_purge(local, txqi); - } + ieee80211_txq_purge(local, txqi); } for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) { @@ -366,6 +364,9 @@ static void sta_remove_link(struct sta_info *sta, unsigned int link_id, if (unhash) link_sta_info_hash_del(sta->local, link_sta); + if (test_sta_flag(sta, WLAN_STA_INSERTED)) + ieee80211_link_sta_debugfs_remove(link_sta); + if (link_sta != &sta->deflink) alloc = container_of(link_sta, typeof(*alloc), info); @@ -425,8 +426,7 @@ void sta_info_free(struct ieee80211_local *local, struct sta_info *sta) sta_dbg(sta->sdata, "Destroyed STA %pM\n", sta->sta.addr); - if (sta->sta.txq[0]) - kfree(to_txq_info(sta->sta.txq[0])); + kfree(to_txq_info(sta->sta.txq[0])); kfree(rcu_dereference_raw(sta->sta.rates)); #ifdef CONFIG_MAC80211_MESH kfree(sta->mesh); @@ -511,6 +511,7 @@ static void sta_info_add_link(struct sta_info *sta, link_info->sta = sta; link_info->link_id = link_id; link_info->pub = link_sta; + link_info->pub->sta = &sta->sta; link_sta->link_id = link_id; rcu_assign_pointer(sta->link[link_id], link_info); rcu_assign_pointer(sta->sta.link[link_id], link_sta); @@ -527,6 +528,8 @@ __sta_info_alloc(struct ieee80211_sub_if_data *sdata, struct ieee80211_local *local = sdata->local; struct ieee80211_hw *hw = &local->hw; struct sta_info *sta; + void *txq_data; + int size; int i; sta = kzalloc(sizeof(*sta) + hw->sta_data_size, gfp); @@ -596,21 +599,18 @@ __sta_info_alloc(struct ieee80211_sub_if_data *sdata, sta->last_connected = ktime_get_seconds(); - if (local->ops->wake_tx_queue) { - void *txq_data; - int size = sizeof(struct txq_info) + - ALIGN(hw->txq_data_size, sizeof(void *)); + size = sizeof(struct txq_info) + + ALIGN(hw->txq_data_size, sizeof(void *)); - txq_data = kcalloc(ARRAY_SIZE(sta->sta.txq), size, gfp); - if (!txq_data) - goto free; + txq_data = kcalloc(ARRAY_SIZE(sta->sta.txq), size, gfp); + if (!txq_data) + goto free; - for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) { - struct txq_info *txq = txq_data + i * size; + for (i = 0; i < ARRAY_SIZE(sta->sta.txq); i++) { + struct txq_info *txq = txq_data + i * size; - /* might not do anything for the bufferable MMPDU TXQ */ - ieee80211_txq_init(sdata, sta, txq, i); - } + /* might not do anything for the (bufferable) MMPDU TXQ */ + ieee80211_txq_init(sdata, sta, txq, i); } if (sta_prepare_rate_control(local, sta, gfp)) @@ -684,8 +684,7 @@ __sta_info_alloc(struct ieee80211_sub_if_data *sdata, return sta; free_txq: - if (sta->sta.txq[0]) - kfree(to_txq_info(sta->sta.txq[0])); + kfree(to_txq_info(sta->sta.txq[0])); free: sta_info_free_link(&sta->deflink); #ifdef CONFIG_MAC80211_MESH @@ -874,6 +873,26 @@ static int sta_info_insert_finish(struct sta_info *sta) __acquires(RCU) ieee80211_sta_debugfs_add(sta); rate_control_add_sta_debugfs(sta); + if (sta->sta.valid_links) { + int i; + + for (i = 0; i < ARRAY_SIZE(sta->link); i++) { + struct link_sta_info *link_sta; + + link_sta = rcu_dereference_protected(sta->link[i], + lockdep_is_held(&local->sta_mtx)); + + if (!link_sta) + continue; + + ieee80211_link_sta_debugfs_add(link_sta); + if (sdata->vif.active_links & BIT(i)) + ieee80211_link_sta_debugfs_drv_add(link_sta); + } + } else { + ieee80211_link_sta_debugfs_add(&sta->deflink); + ieee80211_link_sta_debugfs_drv_add(&sta->deflink); + } sinfo->generation = local->sta_generation; cfg80211_new_sta(sdata->dev, sta->sta.addr, sinfo, GFP_KERNEL); @@ -1958,9 +1977,6 @@ ieee80211_sta_ps_deliver_response(struct sta_info *sta, * TIM recalculation. */ - if (!sta->sta.txq[0]) - return; - for (tid = 0; tid < ARRAY_SIZE(sta->sta.txq); tid++) { if (!sta->sta.txq[tid] || !(driver_release_tids & BIT(tid)) || @@ -2127,22 +2143,30 @@ void ieee80211_sta_register_airtime(struct ieee80211_sta *pubsta, u8 tid, } EXPORT_SYMBOL(ieee80211_sta_register_airtime); -void ieee80211_sta_recalc_aggregates(struct ieee80211_sta *pubsta) +void __ieee80211_sta_recalc_aggregates(struct sta_info *sta, u16 active_links) { - struct sta_info *sta = container_of(pubsta, struct sta_info, sta); - struct ieee80211_link_sta *link_sta; - int link_id, i; bool first = true; + int link_id; - if (!pubsta->valid_links || !pubsta->mlo) { - pubsta->cur = &pubsta->deflink.agg; + if (!sta->sta.valid_links || !sta->sta.mlo) { + sta->sta.cur = &sta->sta.deflink.agg; return; } rcu_read_lock(); - for_each_sta_active_link(&sta->sdata->vif, pubsta, link_sta, link_id) { + for (link_id = 0; link_id < ARRAY_SIZE((sta)->link); link_id++) { + struct ieee80211_link_sta *link_sta; + int i; + + if (!(active_links & BIT(link_id))) + continue; + + link_sta = rcu_dereference(sta->sta.link[link_id]); + if (!link_sta) + continue; + if (first) { - sta->cur = pubsta->deflink.agg; + sta->cur = sta->sta.deflink.agg; first = false; continue; } @@ -2161,7 +2185,14 @@ void ieee80211_sta_recalc_aggregates(struct ieee80211_sta *pubsta) } rcu_read_unlock(); - pubsta->cur = &sta->cur; + sta->sta.cur = &sta->cur; +} + +void ieee80211_sta_recalc_aggregates(struct ieee80211_sta *pubsta) +{ + struct sta_info *sta = container_of(pubsta, struct sta_info, sta); + + __ieee80211_sta_recalc_aggregates(sta, sta->sdata->vif.active_links); } EXPORT_SYMBOL(ieee80211_sta_recalc_aggregates); @@ -2396,9 +2427,9 @@ static inline u64 sta_get_tidstats_msdu(struct ieee80211_sta_rx_stats *rxstats, u64 value; do { - start = u64_stats_fetch_begin_irq(&rxstats->syncp); + start = u64_stats_fetch_begin(&rxstats->syncp); value = rxstats->msdu[tid]; - } while (u64_stats_fetch_retry_irq(&rxstats->syncp, start)); + } while (u64_stats_fetch_retry(&rxstats->syncp, start)); return value; } @@ -2445,7 +2476,7 @@ static void sta_set_tidstats(struct sta_info *sta, tidstats->tx_msdu_failed = sta->deflink.status_stats.msdu_failed[tid]; } - if (local->ops->wake_tx_queue && tid < IEEE80211_NUM_TIDS) { + if (tid < IEEE80211_NUM_TIDS) { spin_lock_bh(&local->fq.lock); rcu_read_lock(); @@ -2464,9 +2495,9 @@ static inline u64 sta_get_stats_bytes(struct ieee80211_sta_rx_stats *rxstats) u64 value; do { - start = u64_stats_fetch_begin_irq(&rxstats->syncp); + start = u64_stats_fetch_begin(&rxstats->syncp); value = rxstats->bytes; - } while (u64_stats_fetch_retry_irq(&rxstats->syncp, start)); + } while (u64_stats_fetch_retry(&rxstats->syncp, start)); return value; } @@ -2773,9 +2804,6 @@ unsigned long ieee80211_sta_last_active(struct sta_info *sta) static void sta_update_codel_params(struct sta_info *sta, u32 thr) { - if (!sta->sdata->local->ops->wake_tx_queue) - return; - if (thr && thr < STA_SLOW_THRESHOLD * sta->local->num_sta) { sta->cparams.target = MS2TIME(50); sta->cparams.interval = MS2TIME(300); @@ -2823,6 +2851,8 @@ int ieee80211_sta_allocate_link(struct sta_info *sta, unsigned int link_id) sta_info_add_link(sta, link_id, &alloc->info, &alloc->sta); + ieee80211_link_sta_debugfs_add(&alloc->info); + return 0; } diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h index 2517ea714dc4..69820b551668 100644 --- a/net/mac80211/sta_info.h +++ b/net/mac80211/sta_info.h @@ -513,6 +513,7 @@ struct ieee80211_fragment_cache { * @status_stats.avg_ack_signal: average ACK signal * @cur_max_bandwidth: maximum bandwidth to use for TX to the station, * taken from HT/VHT capabilities or VHT operating mode notification + * @debugfs_dir: debug filesystem directory dentry * @pub: public (driver visible) link STA data * TODO Move other link params from sta_info as required for MLD operation */ @@ -560,6 +561,10 @@ struct link_sta_info { enum ieee80211_sta_rx_bandwidth cur_max_bandwidth; +#ifdef CONFIG_MAC80211_DEBUGFS + struct dentry *debugfs_dir; +#endif + struct ieee80211_link_sta *pub; }; @@ -922,6 +927,8 @@ void ieee80211_sta_set_max_amsdu_subframes(struct sta_info *sta, const u8 *ext_capab, unsigned int ext_capab_len); +void __ieee80211_sta_recalc_aggregates(struct sta_info *sta, u16 active_links); + enum sta_stats_type { STA_STATS_RATE_TYPE_INVALID = 0, STA_STATS_RATE_TYPE_LEGACY, diff --git a/net/mac80211/status.c b/net/mac80211/status.c index 8e77fd2e9fdf..3f9ddd7f04b6 100644 --- a/net/mac80211/status.c +++ b/net/mac80211/status.c @@ -729,7 +729,7 @@ static void ieee80211_report_used_skb(struct ieee80211_local *local, if (!sdata) { skb->dev = NULL; - } else { + } else if (!dropped) { unsigned int hdr_size = ieee80211_hdrlen(hdr->frame_control); diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c index f4b4d25eef95..b255f3b5bf01 100644 --- a/net/mac80211/tdls.c +++ b/net/mac80211/tdls.c @@ -1016,7 +1016,6 @@ ieee80211_tdls_prep_mgmt_packet(struct wiphy *wiphy, struct net_device *dev, skb->priority = 256 + 5; break; } - skb_set_queue_mapping(skb, ieee80211_select_queue(sdata, skb)); /* * Set the WLAN_TDLS_TEARDOWN flag to indicate a teardown in progress. diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c index 24c0a1706b92..bb2e54610101 100644 --- a/net/mac80211/tx.c +++ b/net/mac80211/tx.c @@ -1599,9 +1599,6 @@ int ieee80211_txq_setup_flows(struct ieee80211_local *local) bool supp_vht = false; enum nl80211_band band; - if (!local->ops->wake_tx_queue) - return 0; - ret = fq_init(fq, 4096); if (ret) return ret; @@ -1649,9 +1646,6 @@ void ieee80211_txq_teardown_flows(struct ieee80211_local *local) { struct fq *fq = &local->fq; - if (!local->ops->wake_tx_queue) - return; - kfree(local->cvars); local->cvars = NULL; @@ -1668,8 +1662,7 @@ static bool ieee80211_queue_skb(struct ieee80211_local *local, struct ieee80211_vif *vif; struct txq_info *txqi; - if (!local->ops->wake_tx_queue || - sdata->vif.type == NL80211_IFTYPE_MONITOR) + if (sdata->vif.type == NL80211_IFTYPE_MONITOR) return false; if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN) @@ -2319,6 +2312,10 @@ netdev_tx_t ieee80211_monitor_start_xmit(struct sk_buff *skb, u16 len_rthdr; int hdrlen; + sdata = IEEE80211_DEV_TO_SUB_IF(dev); + if (unlikely(!ieee80211_sdata_running(sdata))) + goto fail; + memset(info, 0, sizeof(*info)); info->flags = IEEE80211_TX_CTL_REQ_TX_STATUS | IEEE80211_TX_CTL_INJECTED; @@ -2378,8 +2375,6 @@ netdev_tx_t ieee80211_monitor_start_xmit(struct sk_buff *skb, * This is necessary, for example, for old hostapd versions that * don't use nl80211-based management TX/RX. */ - sdata = IEEE80211_DEV_TO_SUB_IF(dev); - list_for_each_entry_rcu(tmp_sdata, &local->interfaces, list) { if (!ieee80211_sdata_running(tmp_sdata)) continue; @@ -2971,7 +2966,7 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata, if (pre_conf_link_id != link_id && link_id != IEEE80211_LINK_UNSPECIFIED) { -#ifdef CPTCFG_MAC80211_VERBOSE_DEBUG +#ifdef CONFIG_MAC80211_VERBOSE_DEBUG net_info_ratelimited("%s: dropped frame to %pM with bad link ID request (%d vs. %d)\n", sdata->name, hdr.addr1, pre_conf_link_id, link_id); @@ -4169,7 +4164,7 @@ void __ieee80211_subif_start_xmit(struct sk_buff *skb, struct sk_buff *next; int len = skb->len; - if (unlikely(skb->len < ETH_HLEN)) { + if (unlikely(!ieee80211_sdata_running(sdata) || skb->len < ETH_HLEN)) { kfree_skb(skb); return; } @@ -4182,12 +4177,7 @@ void __ieee80211_subif_start_xmit(struct sk_buff *skb, if (IS_ERR(sta)) sta = NULL; - if (local->ops->wake_tx_queue) { - u16 queue = __ieee80211_select_queue(sdata, sta, skb); - skb_set_queue_mapping(skb, queue); - skb_get_hash(skb); - } - + skb_set_queue_mapping(skb, ieee80211_select_queue(sdata, sta, skb)); ieee80211_aggr_check(sdata, sta, skb); sk_pacing_shift_update(skb->sk, sdata->local->hw.tx_sk_pacing_shift); @@ -4493,11 +4483,7 @@ static void ieee80211_8023_xmit(struct ieee80211_sub_if_data *sdata, struct tid_ampdu_tx *tid_tx; u8 tid; - if (local->ops->wake_tx_queue) { - u16 queue = __ieee80211_select_queue(sdata, sta, skb); - skb_set_queue_mapping(skb, queue); - skb_get_hash(skb); - } + skb_set_queue_mapping(skb, ieee80211_select_queue(sdata, sta, skb)); if (unlikely(test_bit(SCAN_SW_SCANNING, &local->scanning)) && test_bit(SDATA_STATE_OFFCHANNEL, &sdata->state)) @@ -4566,7 +4552,7 @@ netdev_tx_t ieee80211_subif_start_xmit_8023(struct sk_buff *skb, struct ieee80211_key *key; struct sta_info *sta; - if (unlikely(skb->len < ETH_HLEN)) { + if (unlikely(!ieee80211_sdata_running(sdata) || skb->len < ETH_HLEN)) { kfree_skb(skb); return NETDEV_TX_OK; } @@ -4751,9 +4737,6 @@ void ieee80211_tx_pending(struct tasklet_struct *t) if (!txok) break; } - - if (skb_queue_empty(&local->pending[i])) - ieee80211_propagate_queue_wake(local, i); } spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); @@ -5930,6 +5913,9 @@ int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev, skb_reset_network_header(skb); skb_reset_mac_header(skb); + if (local->hw.queues < IEEE80211_NUM_ACS) + goto start_xmit; + /* update QoS header to prioritize control port frames if possible, * priorization also happens for control port frames send over * AF_PACKET @@ -5943,10 +5929,9 @@ int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev, } if (!IS_ERR(sta)) { - u16 queue = __ieee80211_select_queue(sdata, sta, skb); + u16 queue = ieee80211_select_queue(sdata, sta, skb); skb_set_queue_mapping(skb, queue); - skb_get_hash(skb); /* * for MLO STA, the SA should be the AP MLD address, but @@ -5957,6 +5942,7 @@ int ieee80211_tx_control_port(struct wiphy *wiphy, struct net_device *dev, } rcu_read_unlock(); +start_xmit: /* mutex lock is only needed for incrementing the cookie counter */ mutex_lock(&local->mtx); diff --git a/net/mac80211/util.c b/net/mac80211/util.c index 0ea5d50091dc..6f5407038459 100644 --- a/net/mac80211/util.c +++ b/net/mac80211/util.c @@ -288,6 +288,52 @@ __le16 ieee80211_ctstoself_duration(struct ieee80211_hw *hw, } EXPORT_SYMBOL(ieee80211_ctstoself_duration); +static void wake_tx_push_queue(struct ieee80211_local *local, + struct ieee80211_sub_if_data *sdata, + struct ieee80211_txq *queue) +{ + int q = sdata->vif.hw_queue[queue->ac]; + struct ieee80211_tx_control control = { + .sta = queue->sta, + }; + struct sk_buff *skb; + unsigned long flags; + bool q_stopped; + + while (1) { + spin_lock_irqsave(&local->queue_stop_reason_lock, flags); + q_stopped = local->queue_stop_reasons[q]; + spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); + + if (q_stopped) + break; + + skb = ieee80211_tx_dequeue(&local->hw, queue); + if (!skb) + break; + + drv_tx(local, &control, skb); + } +} + +/* wake_tx_queue handler for driver not implementing a custom one*/ +void ieee80211_handle_wake_tx_queue(struct ieee80211_hw *hw, + struct ieee80211_txq *txq) +{ + struct ieee80211_local *local = hw_to_local(hw); + struct ieee80211_sub_if_data *sdata = vif_to_sdata(txq->vif); + struct ieee80211_txq *queue; + + /* Use ieee80211_next_txq() for airtime fairness accounting */ + ieee80211_txq_schedule_start(hw, txq->ac); + while ((queue = ieee80211_next_txq(hw, txq->ac))) { + wake_tx_push_queue(local, sdata, queue); + ieee80211_return_txq(hw, queue, false); + } + ieee80211_txq_schedule_end(hw, txq->ac); +} +EXPORT_SYMBOL(ieee80211_handle_wake_tx_queue); + static void __ieee80211_wake_txqs(struct ieee80211_sub_if_data *sdata, int ac) { struct ieee80211_local *local = sdata->local; @@ -301,14 +347,14 @@ static void __ieee80211_wake_txqs(struct ieee80211_sub_if_data *sdata, int ac) local_bh_disable(); spin_lock(&fq->lock); + sdata->vif.txqs_stopped[ac] = false; + if (!test_bit(SDATA_STATE_RUNNING, &sdata->state)) goto out; if (sdata->vif.type == NL80211_IFTYPE_AP) ps = &sdata->bss->ps; - sdata->vif.txqs_stopped[ac] = false; - list_for_each_entry_rcu(sta, &local->sta_list, list) { if (sdata != sta->sdata) continue; @@ -400,39 +446,6 @@ void ieee80211_wake_txqs(struct tasklet_struct *t) spin_unlock_irqrestore(&local->queue_stop_reason_lock, flags); } -void ieee80211_propagate_queue_wake(struct ieee80211_local *local, int queue) -{ - struct ieee80211_sub_if_data *sdata; - int n_acs = IEEE80211_NUM_ACS; - - if (local->ops->wake_tx_queue) - return; - - if (local->hw.queues < IEEE80211_NUM_ACS) - n_acs = 1; - - list_for_each_entry_rcu(sdata, &local->interfaces, list) { - int ac; - - if (!sdata->dev) - continue; - - if (sdata->vif.cab_queue != IEEE80211_INVAL_HW_QUEUE && - local->queue_stop_reasons[sdata->vif.cab_queue] != 0) - continue; - - for (ac = 0; ac < n_acs; ac++) { - int ac_queue = sdata->vif.hw_queue[ac]; - - if (ac_queue == queue || - (sdata->vif.cab_queue == queue && - local->queue_stop_reasons[ac_queue] == 0 && - skb_queue_empty(&local->pending[ac_queue]))) - netif_wake_subqueue(sdata->dev, ac); - } - } -} - static void __ieee80211_wake_queue(struct ieee80211_hw *hw, int queue, enum queue_stop_reason reason, bool refcounted, @@ -463,11 +476,7 @@ static void __ieee80211_wake_queue(struct ieee80211_hw *hw, int queue, /* someone still has this queue stopped */ return; - if (skb_queue_empty(&local->pending[queue])) { - rcu_read_lock(); - ieee80211_propagate_queue_wake(local, queue); - rcu_read_unlock(); - } else + if (!skb_queue_empty(&local->pending[queue])) tasklet_schedule(&local->tx_pending_tasklet); /* @@ -477,12 +486,10 @@ static void __ieee80211_wake_queue(struct ieee80211_hw *hw, int queue, * release someone's lock, but it is fine because all the callers of * __ieee80211_wake_queue call it right before releasing the lock. */ - if (local->ops->wake_tx_queue) { - if (reason == IEEE80211_QUEUE_STOP_REASON_DRIVER) - tasklet_schedule(&local->wake_txqs_tasklet); - else - _ieee80211_wake_txqs(local, flags); - } + if (reason == IEEE80211_QUEUE_STOP_REASON_DRIVER) + tasklet_schedule(&local->wake_txqs_tasklet); + else + _ieee80211_wake_txqs(local, flags); } void ieee80211_wake_queue_by_reason(struct ieee80211_hw *hw, int queue, @@ -539,10 +546,6 @@ static void __ieee80211_stop_queue(struct ieee80211_hw *hw, int queue, for (ac = 0; ac < n_acs; ac++) { if (sdata->vif.hw_queue[ac] == queue || sdata->vif.cab_queue == queue) { - if (!local->ops->wake_tx_queue) { - netif_stop_subqueue(sdata->dev, ac); - continue; - } spin_lock(&local->fq.lock); sdata->vif.txqs_stopped[ac] = true; spin_unlock(&local->fq.lock); @@ -1026,8 +1029,10 @@ ieee80211_parse_extension_element(u32 *crc, elems->eht_operation = data; break; case WLAN_EID_EXT_EHT_MULTI_LINK: - if (ieee80211_mle_size_ok(data, len)) + if (ieee80211_mle_size_ok(data, len)) { elems->multi_link = (void *)data; + elems->multi_link_len = len; + } break; } } @@ -1445,6 +1450,8 @@ static size_t ieee802_11_find_bssid_profile(const u8 *start, size_t len, for_each_element_id(elem, WLAN_EID_MULTIPLE_BSSID, start, len) { if (elem->datalen < 2) continue; + if (elem->data[0] < 1 || elem->data[0] > 8) + continue; for_each_element(sub, elem->data + 1, elem->datalen - 1) { u8 new_bssid[ETH_ALEN]; @@ -1497,6 +1504,145 @@ static size_t ieee802_11_find_bssid_profile(const u8 *start, size_t len, return found ? profile_len : 0; } +static void ieee80211_defragment_element(struct ieee802_11_elems *elems, + void **elem_ptr, size_t *len, + size_t total_len, u8 frag_id) +{ + u8 *data = *elem_ptr, *pos, *start; + const struct element *elem; + + /* + * Since 'data' points to the data of the element, not the element + * itself, allow 254 in case it was an extended element where the + * extended ID isn't part of the data we see here and thus not part of + * 'len' either. + */ + if (!data || (*len != 254 && *len != 255)) + return; + + start = elems->scratch_pos; + + if (WARN_ON(*len > (elems->scratch + elems->scratch_len - + elems->scratch_pos))) + return; + + memcpy(elems->scratch_pos, data, *len); + elems->scratch_pos += *len; + + pos = data + *len; + total_len -= *len; + for_each_element(elem, pos, total_len) { + if (elem->id != frag_id) + break; + + if (WARN_ON(elem->datalen > + (elems->scratch + elems->scratch_len - + elems->scratch_pos))) + return; + + memcpy(elems->scratch_pos, elem->data, elem->datalen); + elems->scratch_pos += elem->datalen; + + *len += elem->datalen; + } + + *elem_ptr = start; +} + +static void ieee80211_mle_get_sta_prof(struct ieee802_11_elems *elems, + u8 link_id) +{ + const struct ieee80211_multi_link_elem *ml = elems->multi_link; + size_t ml_len = elems->multi_link_len; + const struct element *sub; + + if (!ml || !ml_len) + return; + + if (le16_get_bits(ml->control, IEEE80211_ML_CONTROL_TYPE) != + IEEE80211_ML_CONTROL_TYPE_BASIC) + return; + + for_each_mle_subelement(sub, (u8 *)ml, ml_len) { + struct ieee80211_mle_per_sta_profile *prof = (void *)sub->data; + u16 control; + + if (sub->id != IEEE80211_MLE_SUBELEM_PER_STA_PROFILE) + continue; + + if (!ieee80211_mle_sta_prof_size_ok(sub->data, sub->datalen)) + return; + + control = le16_to_cpu(prof->control); + + if (link_id != u16_get_bits(control, + IEEE80211_MLE_STA_CONTROL_LINK_ID)) + continue; + + if (!(control & IEEE80211_MLE_STA_CONTROL_COMPLETE_PROFILE)) + return; + + elems->prof = prof; + elems->sta_prof_len = sub->datalen; + + /* the sub element can be fragmented */ + ieee80211_defragment_element(elems, (void **)&elems->prof, + &elems->sta_prof_len, + ml_len - (sub->data - (u8 *)ml), + IEEE80211_MLE_SUBELEM_FRAGMENT); + return; + } +} + +static void ieee80211_mle_parse_link(struct ieee802_11_elems *elems, + struct ieee80211_elems_parse_params *params) +{ + struct ieee80211_mle_per_sta_profile *prof; + struct ieee80211_elems_parse_params sub = { + .action = params->action, + .from_ap = params->from_ap, + .link_id = -1, + }; + const struct element *non_inherit = NULL; + const u8 *end; + + if (params->link_id == -1) + return; + + ieee80211_defragment_element(elems, (void **)&elems->multi_link, + &elems->multi_link_len, + elems->total_len - ((u8 *)elems->multi_link - + elems->ie_start), + WLAN_EID_FRAGMENT); + + ieee80211_mle_get_sta_prof(elems, params->link_id); + prof = elems->prof; + + if (!prof) + return; + + /* check if we have the 4 bytes for the fixed part in assoc response */ + if (elems->sta_prof_len < sizeof(*prof) + prof->sta_info_len - 1 + 4) { + elems->prof = NULL; + elems->sta_prof_len = 0; + return; + } + + /* + * Skip the capability information and the status code that are expected + * as part of the station profile in association response frames. Note + * the -1 is because the 'sta_info_len' is accounted to as part of the + * per-STA profile, but not part of the 'u8 variable[]' portion. + */ + sub.start = prof->variable + prof->sta_info_len - 1 + 4; + end = (const u8 *)prof + elems->sta_prof_len; + sub.len = end - sub.start; + + non_inherit = cfg80211_find_ext_elem(WLAN_EID_EXT_NON_INHERITANCE, + sub.start, sub.len); + _ieee802_11_parse_elems_full(&sub, elems, non_inherit); +} + struct ieee802_11_elems * ieee802_11_parse_elems_full(struct ieee80211_elems_parse_params *params) { @@ -1504,24 +1650,26 @@ ieee802_11_parse_elems_full(struct ieee80211_elems_parse_params *params) const struct element *non_inherit = NULL; u8 *nontransmitted_profile; int nontransmitted_profile_len = 0; + size_t scratch_len = params->scratch_len ?: 3 * params->len; - elems = kzalloc(sizeof(*elems), GFP_ATOMIC); + elems = kzalloc(sizeof(*elems) + scratch_len, GFP_ATOMIC); if (!elems) return NULL; elems->ie_start = params->start; elems->total_len = params->len; - - nontransmitted_profile = kmalloc(params->len, GFP_ATOMIC); - if (nontransmitted_profile) { - nontransmitted_profile_len = - ieee802_11_find_bssid_profile(params->start, params->len, - elems, params->bss, - nontransmitted_profile); - non_inherit = - cfg80211_find_ext_elem(WLAN_EID_EXT_NON_INHERITANCE, - nontransmitted_profile, - nontransmitted_profile_len); - } + elems->scratch_len = scratch_len; + elems->scratch_pos = elems->scratch; + + nontransmitted_profile = elems->scratch_pos; + nontransmitted_profile_len = + ieee802_11_find_bssid_profile(params->start, params->len, + elems, params->bss, + nontransmitted_profile); + elems->scratch_pos += nontransmitted_profile_len; + elems->scratch_len -= nontransmitted_profile_len; + non_inherit = cfg80211_find_ext_elem(WLAN_EID_EXT_NON_INHERITANCE, + nontransmitted_profile, + nontransmitted_profile_len); elems->crc = _ieee802_11_parse_elems_full(params, elems, non_inherit); @@ -1537,6 +1685,8 @@ ieee802_11_parse_elems_full(struct ieee80211_elems_parse_params *params) _ieee802_11_parse_elems_full(&sub, elems, NULL); } + ieee80211_mle_parse_link(elems, params); + if (elems->tim && !elems->parse_error) { const struct ieee80211_tim_ie *tim_ie = elems->tim; @@ -1555,8 +1705,6 @@ ieee802_11_parse_elems_full(struct ieee80211_elems_parse_params *params) offsetofend(struct ieee80211_bssid_index, dtim_count)) elems->dtim_count = elems->bssid_index->dtim_count; - kfree(nontransmitted_profile); - return elems; } @@ -2046,7 +2194,7 @@ static int ieee80211_build_preq_ies_band(struct ieee80211_sub_if_data *sdata, if (he_cap) { enum nl80211_iftype iftype = ieee80211_vif_type_p2p(&sdata->vif); - __le16 cap = ieee80211_get_he_6ghz_capa(sband, iftype); + __le16 cap = ieee80211_get_he_6ghz_capa(sband6, iftype); pos = ieee80211_write_he_6ghz_cap(pos, cap, end); } diff --git a/net/mac80211/wme.c b/net/mac80211/wme.c index ecc1de2e68a5..a12c63638680 100644 --- a/net/mac80211/wme.c +++ b/net/mac80211/wme.c @@ -122,6 +122,9 @@ u16 ieee80211_select_queue_80211(struct ieee80211_sub_if_data *sdata, struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb); u8 *p; + /* Ensure hash is set prior to potential SW encryption */ + skb_get_hash(skb); + if ((info->control.flags & IEEE80211_TX_CTRL_DONT_REORDER) || local->hw.queues < IEEE80211_NUM_ACS) return 0; @@ -141,12 +144,15 @@ u16 ieee80211_select_queue_80211(struct ieee80211_sub_if_data *sdata, return ieee80211_downgrade_queue(sdata, NULL, skb); } -u16 __ieee80211_select_queue(struct ieee80211_sub_if_data *sdata, - struct sta_info *sta, struct sk_buff *skb) +u16 ieee80211_select_queue(struct ieee80211_sub_if_data *sdata, + struct sta_info *sta, struct sk_buff *skb) { struct mac80211_qos_map *qos_map; bool qos; + /* Ensure hash is set prior to potential SW encryption */ + skb_get_hash(skb); + /* all mesh/ocb stations are required to support WME */ if (sta && (sdata->vif.type == NL80211_IFTYPE_MESH_POINT || sdata->vif.type == NL80211_IFTYPE_OCB)) @@ -176,59 +182,6 @@ u16 __ieee80211_select_queue(struct ieee80211_sub_if_data *sdata, return ieee80211_downgrade_queue(sdata, sta, skb); } - -/* Indicate which queue to use. */ -u16 ieee80211_select_queue(struct ieee80211_sub_if_data *sdata, - struct sk_buff *skb) -{ - struct ieee80211_local *local = sdata->local; - struct sta_info *sta = NULL; - const u8 *ra = NULL; - u16 ret; - - /* when using iTXQ, we can do this later */ - if (local->ops->wake_tx_queue) - return 0; - - if (local->hw.queues < IEEE80211_NUM_ACS || skb->len < 6) { - skb->priority = 0; /* required for correct WPA/11i MIC */ - return 0; - } - - rcu_read_lock(); - switch (sdata->vif.type) { - case NL80211_IFTYPE_AP_VLAN: - sta = rcu_dereference(sdata->u.vlan.sta); - if (sta) - break; - fallthrough; - case NL80211_IFTYPE_AP: - ra = skb->data; - break; - case NL80211_IFTYPE_STATION: - /* might be a TDLS station */ - sta = sta_info_get(sdata, skb->data); - if (sta) - break; - - ra = sdata->deflink.u.mgd.bssid; - break; - case NL80211_IFTYPE_ADHOC: - ra = skb->data; - break; - default: - break; - } - - if (!sta && ra && !is_multicast_ether_addr(ra)) - sta = sta_info_get(sdata, ra); - - ret = __ieee80211_select_queue(sdata, sta, skb); - - rcu_read_unlock(); - return ret; -} - /** * ieee80211_set_qos_hdr - Fill in the QoS header if there is one. * diff --git a/net/mac80211/wme.h b/net/mac80211/wme.h index 2e3dec0b6087..81f0039527a9 100644 --- a/net/mac80211/wme.h +++ b/net/mac80211/wme.h @@ -13,10 +13,8 @@ u16 ieee80211_select_queue_80211(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb, struct ieee80211_hdr *hdr); -u16 __ieee80211_select_queue(struct ieee80211_sub_if_data *sdata, - struct sta_info *sta, struct sk_buff *skb); u16 ieee80211_select_queue(struct ieee80211_sub_if_data *sdata, - struct sk_buff *skb); + struct sta_info *sta, struct sk_buff *skb); void ieee80211_set_qos_hdr(struct ieee80211_sub_if_data *sdata, struct sk_buff *skb); diff --git a/net/mac802154/cfg.c b/net/mac802154/cfg.c index 1e4a9f74ed43..dc2d918fac68 100644 --- a/net/mac802154/cfg.c +++ b/net/mac802154/cfg.c @@ -46,7 +46,7 @@ static int ieee802154_suspend(struct wpan_phy *wpan_phy) if (!local->open_count) goto suspend; - ieee802154_stop_queue(&local->hw); + ieee802154_sync_and_hold_queue(local); synchronize_net(); /* stop hardware - this must stop RX */ @@ -67,12 +67,12 @@ static int ieee802154_resume(struct wpan_phy *wpan_phy) goto wake_up; /* restart hardware */ - ret = drv_start(local); + ret = drv_start(local, local->phy->filtering, &local->addr_filt); if (ret) return ret; wake_up: - ieee802154_wake_queue(&local->hw); + ieee802154_release_queue(local); local->suspended = false; return 0; } diff --git a/net/mac802154/driver-ops.h b/net/mac802154/driver-ops.h index d23f0db98015..a7af3f0ddb3e 100644 --- a/net/mac802154/driver-ops.h +++ b/net/mac802154/driver-ops.h @@ -24,203 +24,290 @@ drv_xmit_sync(struct ieee802154_local *local, struct sk_buff *skb) return local->ops->xmit_sync(&local->hw, skb); } -static inline int drv_start(struct ieee802154_local *local) +static inline int drv_set_pan_id(struct ieee802154_local *local, __le16 pan_id) { + struct ieee802154_hw_addr_filt filt; int ret; might_sleep(); - trace_802154_drv_start(local); - local->started = true; - smp_mb(); - ret = local->ops->start(&local->hw); + if (!local->ops->set_hw_addr_filt) { + WARN_ON(1); + return -EOPNOTSUPP; + } + + filt.pan_id = pan_id; + + trace_802154_drv_set_pan_id(local, pan_id); + ret = local->ops->set_hw_addr_filt(&local->hw, &filt, + IEEE802154_AFILT_PANID_CHANGED); trace_802154_drv_return_int(local, ret); return ret; } -static inline void drv_stop(struct ieee802154_local *local) +static inline int +drv_set_extended_addr(struct ieee802154_local *local, __le64 extended_addr) { - might_sleep(); + struct ieee802154_hw_addr_filt filt; + int ret; - trace_802154_drv_stop(local); - local->ops->stop(&local->hw); - trace_802154_drv_return_void(local); + might_sleep(); - /* sync away all work on the tasklet before clearing started */ - tasklet_disable(&local->tasklet); - tasklet_enable(&local->tasklet); + if (!local->ops->set_hw_addr_filt) { + WARN_ON(1); + return -EOPNOTSUPP; + } - barrier(); + filt.ieee_addr = extended_addr; - local->started = false; + trace_802154_drv_set_extended_addr(local, extended_addr); + ret = local->ops->set_hw_addr_filt(&local->hw, &filt, + IEEE802154_AFILT_IEEEADDR_CHANGED); + trace_802154_drv_return_int(local, ret); + return ret; } static inline int -drv_set_channel(struct ieee802154_local *local, u8 page, u8 channel) +drv_set_short_addr(struct ieee802154_local *local, __le16 short_addr) { + struct ieee802154_hw_addr_filt filt; int ret; might_sleep(); - trace_802154_drv_set_channel(local, page, channel); - ret = local->ops->set_channel(&local->hw, page, channel); + if (!local->ops->set_hw_addr_filt) { + WARN_ON(1); + return -EOPNOTSUPP; + } + + filt.short_addr = short_addr; + + trace_802154_drv_set_short_addr(local, short_addr); + ret = local->ops->set_hw_addr_filt(&local->hw, &filt, + IEEE802154_AFILT_SADDR_CHANGED); trace_802154_drv_return_int(local, ret); return ret; } -static inline int drv_set_tx_power(struct ieee802154_local *local, s32 mbm) +static inline int +drv_set_pan_coord(struct ieee802154_local *local, bool is_coord) { + struct ieee802154_hw_addr_filt filt; int ret; might_sleep(); - if (!local->ops->set_txpower) { + if (!local->ops->set_hw_addr_filt) { WARN_ON(1); return -EOPNOTSUPP; } - trace_802154_drv_set_tx_power(local, mbm); - ret = local->ops->set_txpower(&local->hw, mbm); + filt.pan_coord = is_coord; + + trace_802154_drv_set_pan_coord(local, is_coord); + ret = local->ops->set_hw_addr_filt(&local->hw, &filt, + IEEE802154_AFILT_PANC_CHANGED); trace_802154_drv_return_int(local, ret); return ret; } -static inline int drv_set_cca_mode(struct ieee802154_local *local, - const struct wpan_phy_cca *cca) +static inline int +drv_set_promiscuous_mode(struct ieee802154_local *local, bool on) { int ret; might_sleep(); - if (!local->ops->set_cca_mode) { + if (!local->ops->set_promiscuous_mode) { WARN_ON(1); return -EOPNOTSUPP; } - trace_802154_drv_set_cca_mode(local, cca); - ret = local->ops->set_cca_mode(&local->hw, cca); + trace_802154_drv_set_promiscuous_mode(local, on); + ret = local->ops->set_promiscuous_mode(&local->hw, on); trace_802154_drv_return_int(local, ret); return ret; } -static inline int drv_set_lbt_mode(struct ieee802154_local *local, bool mode) +static inline int drv_start(struct ieee802154_local *local, + enum ieee802154_filtering_level level, + const struct ieee802154_hw_addr_filt *addr_filt) { int ret; might_sleep(); - if (!local->ops->set_lbt) { + /* setup receive mode parameters e.g. address mode */ + if (local->hw.flags & IEEE802154_HW_AFILT) { + ret = drv_set_pan_id(local, addr_filt->pan_id); + if (ret < 0) + return ret; + + ret = drv_set_short_addr(local, addr_filt->short_addr); + if (ret < 0) + return ret; + + ret = drv_set_extended_addr(local, addr_filt->ieee_addr); + if (ret < 0) + return ret; + } + + switch (level) { + case IEEE802154_FILTERING_NONE: + fallthrough; + case IEEE802154_FILTERING_1_FCS: + fallthrough; + case IEEE802154_FILTERING_2_PROMISCUOUS: + /* TODO: Requires a different receive mode setup e.g. + * at86rf233 hardware. + */ + fallthrough; + case IEEE802154_FILTERING_3_SCAN: + if (local->hw.flags & IEEE802154_HW_PROMISCUOUS) { + ret = drv_set_promiscuous_mode(local, true); + if (ret < 0) + return ret; + } else { + return -EOPNOTSUPP; + } + + /* In practice other filtering levels can be requested, but as + * for now most hardware/drivers only support + * IEEE802154_FILTERING_NONE, we fallback to this actual + * filtering level in hardware and make our own additional + * filtering in mac802154 receive path. + * + * TODO: Move this logic to the device drivers as hardware may + * support more higher level filters. Hardware may also require + * a different order how register are set, which could currently + * be buggy, so all received parameters need to be moved to the + * start() callback and let the driver go into the mode before + * it will turn on receive handling. + */ + local->phy->filtering = IEEE802154_FILTERING_NONE; + break; + case IEEE802154_FILTERING_4_FRAME_FIELDS: + /* Do not error out if IEEE802154_HW_PROMISCUOUS because we + * expect the hardware to operate at the level + * IEEE802154_FILTERING_4_FRAME_FIELDS anyway. + */ + if (local->hw.flags & IEEE802154_HW_PROMISCUOUS) { + ret = drv_set_promiscuous_mode(local, false); + if (ret < 0) + return ret; + } + + local->phy->filtering = IEEE802154_FILTERING_4_FRAME_FIELDS; + break; + default: WARN_ON(1); - return -EOPNOTSUPP; + return -EINVAL; } - trace_802154_drv_set_lbt_mode(local, mode); - ret = local->ops->set_lbt(&local->hw, mode); + trace_802154_drv_start(local); + local->started = true; + smp_mb(); + ret = local->ops->start(&local->hw); trace_802154_drv_return_int(local, ret); return ret; } +static inline void drv_stop(struct ieee802154_local *local) +{ + might_sleep(); + + trace_802154_drv_stop(local); + local->ops->stop(&local->hw); + trace_802154_drv_return_void(local); + + /* sync away all work on the tasklet before clearing started */ + tasklet_disable(&local->tasklet); + tasklet_enable(&local->tasklet); + + barrier(); + + local->started = false; +} + static inline int -drv_set_cca_ed_level(struct ieee802154_local *local, s32 mbm) +drv_set_channel(struct ieee802154_local *local, u8 page, u8 channel) { int ret; might_sleep(); - if (!local->ops->set_cca_ed_level) { - WARN_ON(1); - return -EOPNOTSUPP; - } - - trace_802154_drv_set_cca_ed_level(local, mbm); - ret = local->ops->set_cca_ed_level(&local->hw, mbm); + trace_802154_drv_set_channel(local, page, channel); + ret = local->ops->set_channel(&local->hw, page, channel); trace_802154_drv_return_int(local, ret); return ret; } -static inline int drv_set_pan_id(struct ieee802154_local *local, __le16 pan_id) +static inline int drv_set_tx_power(struct ieee802154_local *local, s32 mbm) { - struct ieee802154_hw_addr_filt filt; int ret; might_sleep(); - if (!local->ops->set_hw_addr_filt) { + if (!local->ops->set_txpower) { WARN_ON(1); return -EOPNOTSUPP; } - filt.pan_id = pan_id; - - trace_802154_drv_set_pan_id(local, pan_id); - ret = local->ops->set_hw_addr_filt(&local->hw, &filt, - IEEE802154_AFILT_PANID_CHANGED); + trace_802154_drv_set_tx_power(local, mbm); + ret = local->ops->set_txpower(&local->hw, mbm); trace_802154_drv_return_int(local, ret); return ret; } -static inline int -drv_set_extended_addr(struct ieee802154_local *local, __le64 extended_addr) +static inline int drv_set_cca_mode(struct ieee802154_local *local, + const struct wpan_phy_cca *cca) { - struct ieee802154_hw_addr_filt filt; int ret; might_sleep(); - if (!local->ops->set_hw_addr_filt) { + if (!local->ops->set_cca_mode) { WARN_ON(1); return -EOPNOTSUPP; } - filt.ieee_addr = extended_addr; - - trace_802154_drv_set_extended_addr(local, extended_addr); - ret = local->ops->set_hw_addr_filt(&local->hw, &filt, - IEEE802154_AFILT_IEEEADDR_CHANGED); + trace_802154_drv_set_cca_mode(local, cca); + ret = local->ops->set_cca_mode(&local->hw, cca); trace_802154_drv_return_int(local, ret); return ret; } -static inline int -drv_set_short_addr(struct ieee802154_local *local, __le16 short_addr) +static inline int drv_set_lbt_mode(struct ieee802154_local *local, bool mode) { - struct ieee802154_hw_addr_filt filt; int ret; might_sleep(); - if (!local->ops->set_hw_addr_filt) { + if (!local->ops->set_lbt) { WARN_ON(1); return -EOPNOTSUPP; } - filt.short_addr = short_addr; - - trace_802154_drv_set_short_addr(local, short_addr); - ret = local->ops->set_hw_addr_filt(&local->hw, &filt, - IEEE802154_AFILT_SADDR_CHANGED); + trace_802154_drv_set_lbt_mode(local, mode); + ret = local->ops->set_lbt(&local->hw, mode); trace_802154_drv_return_int(local, ret); return ret; } static inline int -drv_set_pan_coord(struct ieee802154_local *local, bool is_coord) +drv_set_cca_ed_level(struct ieee802154_local *local, s32 mbm) { - struct ieee802154_hw_addr_filt filt; int ret; might_sleep(); - if (!local->ops->set_hw_addr_filt) { + if (!local->ops->set_cca_ed_level) { WARN_ON(1); return -EOPNOTSUPP; } - filt.pan_coord = is_coord; - - trace_802154_drv_set_pan_coord(local, is_coord); - ret = local->ops->set_hw_addr_filt(&local->hw, &filt, - IEEE802154_AFILT_PANC_CHANGED); + trace_802154_drv_set_cca_ed_level(local, mbm); + ret = local->ops->set_cca_ed_level(&local->hw, mbm); trace_802154_drv_return_int(local, ret); return ret; } @@ -264,22 +351,4 @@ drv_set_max_frame_retries(struct ieee802154_local *local, s8 max_frame_retries) return ret; } -static inline int -drv_set_promiscuous_mode(struct ieee802154_local *local, bool on) -{ - int ret; - - might_sleep(); - - if (!local->ops->set_promiscuous_mode) { - WARN_ON(1); - return -EOPNOTSUPP; - } - - trace_802154_drv_set_promiscuous_mode(local, on); - ret = local->ops->set_promiscuous_mode(&local->hw, on); - trace_802154_drv_return_int(local, ret); - return ret; -} - #endif /* __MAC802154_DRIVER_OPS */ diff --git a/net/mac802154/ieee802154_i.h b/net/mac802154/ieee802154_i.h index 1381e6a5e180..509e0172fe82 100644 --- a/net/mac802154/ieee802154_i.h +++ b/net/mac802154/ieee802154_i.h @@ -26,6 +26,8 @@ struct ieee802154_local { struct ieee802154_hw hw; const struct ieee802154_ops *ops; + /* hardware address filter */ + struct ieee802154_hw_addr_filt addr_filt; /* ieee802154 phy */ struct wpan_phy *phy; @@ -55,7 +57,7 @@ struct ieee802154_local { struct sk_buff_head skb_queue; struct sk_buff *tx_skb; - struct work_struct tx_work; + struct work_struct sync_tx_work; /* A negative Linux error code or a null/positive MLME error status */ int tx_result; }; @@ -82,6 +84,16 @@ struct ieee802154_sub_if_data { struct ieee802154_local *local; struct net_device *dev; + /* Each interface starts and works in nominal state at a given filtering + * level given by iface_default_filtering, which is set once for all at + * the interface creation and should not evolve over time. For some MAC + * operations however, the filtering level may change temporarily, as + * reflected in the required_filtering field. The actual filtering at + * the PHY level may be different and is shown in struct wpan_phy. + */ + enum ieee802154_filtering_level iface_default_filtering; + enum ieee802154_filtering_level required_filtering; + unsigned long state; char name[IFNAMSIZ]; @@ -123,13 +135,53 @@ ieee802154_sdata_running(struct ieee802154_sub_if_data *sdata) extern struct ieee802154_mlme_ops mac802154_mlme_wpan; void ieee802154_rx(struct ieee802154_local *local, struct sk_buff *skb); -void ieee802154_xmit_worker(struct work_struct *work); +void ieee802154_xmit_sync_worker(struct work_struct *work); +int ieee802154_sync_and_hold_queue(struct ieee802154_local *local); +int ieee802154_mlme_op_pre(struct ieee802154_local *local); +int ieee802154_mlme_tx(struct ieee802154_local *local, + struct ieee802154_sub_if_data *sdata, + struct sk_buff *skb); +void ieee802154_mlme_op_post(struct ieee802154_local *local); +int ieee802154_mlme_tx_one(struct ieee802154_local *local, + struct ieee802154_sub_if_data *sdata, + struct sk_buff *skb); netdev_tx_t ieee802154_monitor_start_xmit(struct sk_buff *skb, struct net_device *dev); netdev_tx_t ieee802154_subif_start_xmit(struct sk_buff *skb, struct net_device *dev); enum hrtimer_restart ieee802154_xmit_ifs_timer(struct hrtimer *timer); +/** + * ieee802154_hold_queue - hold ieee802154 queue + * @local: main mac object + * + * Hold a queue by incrementing an atomic counter and requesting the netif + * queues to be stopped. The queues cannot be woken up while the counter has not + * been reset with as any ieee802154_release_queue() calls as needed. + */ +void ieee802154_hold_queue(struct ieee802154_local *local); + +/** + * ieee802154_release_queue - release ieee802154 queue + * @local: main mac object + * + * Release a queue which is held by decrementing an atomic counter and wake it + * up only if the counter reaches 0. + */ +void ieee802154_release_queue(struct ieee802154_local *local); + +/** + * ieee802154_disable_queue - disable ieee802154 queue + * @local: main mac object + * + * When trying to sync the Tx queue, we cannot just stop the queue + * (which is basically a bit being set without proper lock handling) + * because it would be racy. We actually need to call netif_tx_disable() + * instead, which is done by this helper. Restarting the queue can + * however still be done with a regular wake call. + */ +void ieee802154_disable_queue(struct ieee802154_local *local); + /* MIB callbacks */ void mac802154_dev_set_page_channel(struct net_device *dev, u8 page, u8 chan); diff --git a/net/mac802154/iface.c b/net/mac802154/iface.c index 500ed1b81250..d9b50884d34e 100644 --- a/net/mac802154/iface.c +++ b/net/mac802154/iface.c @@ -147,25 +147,12 @@ static int ieee802154_setup_hw(struct ieee802154_sub_if_data *sdata) struct wpan_dev *wpan_dev = &sdata->wpan_dev; int ret; - if (local->hw.flags & IEEE802154_HW_PROMISCUOUS) { - ret = drv_set_promiscuous_mode(local, - wpan_dev->promiscuous_mode); - if (ret < 0) - return ret; - } + sdata->required_filtering = sdata->iface_default_filtering; if (local->hw.flags & IEEE802154_HW_AFILT) { - ret = drv_set_pan_id(local, wpan_dev->pan_id); - if (ret < 0) - return ret; - - ret = drv_set_extended_addr(local, wpan_dev->extended_addr); - if (ret < 0) - return ret; - - ret = drv_set_short_addr(local, wpan_dev->short_addr); - if (ret < 0) - return ret; + local->addr_filt.pan_id = wpan_dev->pan_id; + local->addr_filt.ieee_addr = wpan_dev->extended_addr; + local->addr_filt.short_addr = wpan_dev->short_addr; } if (local->hw.flags & IEEE802154_HW_LBT) { @@ -206,7 +193,8 @@ static int mac802154_slave_open(struct net_device *dev) if (res) goto err; - res = drv_start(local); + res = drv_start(local, sdata->required_filtering, + &local->addr_filt); if (res) goto err; } @@ -223,15 +211,16 @@ err: static int ieee802154_check_mac_settings(struct ieee802154_local *local, - struct wpan_dev *wpan_dev, - struct wpan_dev *nwpan_dev) + struct ieee802154_sub_if_data *sdata, + struct ieee802154_sub_if_data *nsdata) { + struct wpan_dev *nwpan_dev = &nsdata->wpan_dev; + struct wpan_dev *wpan_dev = &sdata->wpan_dev; + ASSERT_RTNL(); - if (local->hw.flags & IEEE802154_HW_PROMISCUOUS) { - if (wpan_dev->promiscuous_mode != nwpan_dev->promiscuous_mode) - return -EBUSY; - } + if (sdata->iface_default_filtering != nsdata->iface_default_filtering) + return -EBUSY; if (local->hw.flags & IEEE802154_HW_AFILT) { if (wpan_dev->pan_id != nwpan_dev->pan_id || @@ -285,8 +274,7 @@ ieee802154_check_concurrent_iface(struct ieee802154_sub_if_data *sdata, /* check all phy mac sublayer settings are the same. * We have only one phy, different values makes trouble. */ - ret = ieee802154_check_mac_settings(local, wpan_dev, - &nsdata->wpan_dev); + ret = ieee802154_check_mac_settings(local, sdata, nsdata); if (ret < 0) return ret; } @@ -586,7 +574,7 @@ ieee802154_setup_sdata(struct ieee802154_sub_if_data *sdata, sdata->dev->priv_destructor = mac802154_wpan_free; sdata->dev->netdev_ops = &mac802154_wpan_ops; sdata->dev->ml_priv = &mac802154_mlme_wpan; - wpan_dev->promiscuous_mode = false; + sdata->iface_default_filtering = IEEE802154_FILTERING_4_FRAME_FIELDS; wpan_dev->header_ops = &ieee802154_header_ops; mutex_init(&sdata->sec_mtx); @@ -600,7 +588,7 @@ ieee802154_setup_sdata(struct ieee802154_sub_if_data *sdata, case NL802154_IFTYPE_MONITOR: sdata->dev->needs_free_netdev = true; sdata->dev->netdev_ops = &mac802154_monitor_ops; - wpan_dev->promiscuous_mode = true; + sdata->iface_default_filtering = IEEE802154_FILTERING_NONE; break; default: BUG(); diff --git a/net/mac802154/main.c b/net/mac802154/main.c index bd7bdb1219dd..40fab08df24b 100644 --- a/net/mac802154/main.c +++ b/net/mac802154/main.c @@ -95,7 +95,7 @@ ieee802154_alloc_hw(size_t priv_data_len, const struct ieee802154_ops *ops) skb_queue_head_init(&local->skb_queue); - INIT_WORK(&local->tx_work, ieee802154_xmit_worker); + INIT_WORK(&local->sync_tx_work, ieee802154_xmit_sync_worker); /* init supported flags with 802.15.4 default ranges */ phy->supported.max_minbe = 8; diff --git a/net/mac802154/rx.c b/net/mac802154/rx.c index c439125ef2b9..0724aac8f48c 100644 --- a/net/mac802154/rx.c +++ b/net/mac802154/rx.c @@ -34,6 +34,7 @@ ieee802154_subif_frame(struct ieee802154_sub_if_data *sdata, struct sk_buff *skb, const struct ieee802154_hdr *hdr) { struct wpan_dev *wpan_dev = &sdata->wpan_dev; + struct wpan_phy *wpan_phy = sdata->local->hw.phy; __le16 span, sshort; int rc; @@ -42,6 +43,17 @@ ieee802154_subif_frame(struct ieee802154_sub_if_data *sdata, span = wpan_dev->pan_id; sshort = wpan_dev->short_addr; + /* Level 3 filtering: Only beacons are accepted during scans */ + if (sdata->required_filtering == IEEE802154_FILTERING_3_SCAN && + sdata->required_filtering > wpan_phy->filtering) { + if (mac_cb(skb)->type != IEEE802154_FC_TYPE_BEACON) { + dev_dbg(&sdata->dev->dev, + "drop non-beacon frame (0x%x) during scan\n", + mac_cb(skb)->type); + goto fail; + } + } + switch (mac_cb(skb)->dest.mode) { case IEEE802154_ADDR_NONE: if (hdr->source.mode != IEEE802154_ADDR_NONE) @@ -114,8 +126,10 @@ fail: static void ieee802154_print_addr(const char *name, const struct ieee802154_addr *addr) { - if (addr->mode == IEEE802154_ADDR_NONE) + if (addr->mode == IEEE802154_ADDR_NONE) { pr_debug("%s not present\n", name); + return; + } pr_debug("%s PAN ID: %04x\n", name, le16_to_cpu(addr->pan_id)); if (addr->mode == IEEE802154_ADDR_SHORT) { @@ -132,7 +146,7 @@ static int ieee802154_parse_frame_start(struct sk_buff *skb, struct ieee802154_hdr *hdr) { int hlen; - struct ieee802154_mac_cb *cb = mac_cb_init(skb); + struct ieee802154_mac_cb *cb = mac_cb(skb); skb_reset_mac_header(skb); @@ -209,6 +223,13 @@ __ieee802154_rx_handle_packet(struct ieee802154_local *local, if (!ieee802154_sdata_running(sdata)) continue; + /* Do not deliver packets received on interfaces expecting + * AACK=1 if the address filters where disabled. + */ + if (local->hw.phy->filtering < IEEE802154_FILTERING_4_FRAME_FIELDS && + sdata->required_filtering == IEEE802154_FILTERING_4_FRAME_FIELDS) + continue; + ieee802154_subif_frame(sdata, skb, &hdr); skb = NULL; break; @@ -268,10 +289,8 @@ void ieee802154_rx(struct ieee802154_local *local, struct sk_buff *skb) ieee802154_monitors_rx(local, skb); - /* Check if transceiver doesn't validate the checksum. - * If not we validate the checksum here. - */ - if (local->hw.flags & IEEE802154_HW_RX_DROP_BAD_CKSUM) { + /* Level 1 filtering: Check the FCS by software when relevant */ + if (local->hw.phy->filtering == IEEE802154_FILTERING_NONE) { crc = crc_ccitt(0, skb->data, skb->len); if (crc) { rcu_read_unlock(); @@ -294,8 +313,9 @@ void ieee802154_rx_irqsafe(struct ieee802154_hw *hw, struct sk_buff *skb, u8 lqi) { struct ieee802154_local *local = hw_to_local(hw); + struct ieee802154_mac_cb *cb = mac_cb_init(skb); - mac_cb(skb)->lqi = lqi; + cb->lqi = lqi; skb->pkt_type = IEEE802154_RX_MSG; skb_queue_tail(&local->skb_queue, skb); tasklet_schedule(&local->tasklet); diff --git a/net/mac802154/tx.c b/net/mac802154/tx.c index c829e4a75325..9d8d43cf1e64 100644 --- a/net/mac802154/tx.c +++ b/net/mac802154/tx.c @@ -22,10 +22,10 @@ #include "ieee802154_i.h" #include "driver-ops.h" -void ieee802154_xmit_worker(struct work_struct *work) +void ieee802154_xmit_sync_worker(struct work_struct *work) { struct ieee802154_local *local = - container_of(work, struct ieee802154_local, tx_work); + container_of(work, struct ieee802154_local, sync_tx_work); struct sk_buff *skb = local->tx_skb; struct net_device *dev = skb->dev; int res; @@ -43,7 +43,9 @@ void ieee802154_xmit_worker(struct work_struct *work) err_tx: /* Restart the netif queue on each sub_if_data object. */ - ieee802154_wake_queue(&local->hw); + ieee802154_release_queue(local); + if (atomic_dec_and_test(&local->phy->ongoing_txs)) + wake_up(&local->phy->sync_txq); kfree_skb(skb); netdev_dbg(dev, "transmission failed\n"); } @@ -65,7 +67,7 @@ ieee802154_tx(struct ieee802154_local *local, struct sk_buff *skb) consume_skb(skb); skb = nskb; } else { - goto err_tx; + goto err_free_skb; } } @@ -74,32 +76,134 @@ ieee802154_tx(struct ieee802154_local *local, struct sk_buff *skb) } /* Stop the netif queue on each sub_if_data object. */ - ieee802154_stop_queue(&local->hw); + ieee802154_hold_queue(local); + atomic_inc(&local->phy->ongoing_txs); - /* async is priority, otherwise sync is fallback */ + /* Drivers should preferably implement the async callback. In some rare + * cases they only provide a sync callback which we will use as a + * fallback. + */ if (local->ops->xmit_async) { unsigned int len = skb->len; ret = drv_xmit_async(local, skb); - if (ret) { - ieee802154_wake_queue(&local->hw); - goto err_tx; - } + if (ret) + goto err_wake_netif_queue; dev->stats.tx_packets++; dev->stats.tx_bytes += len; } else { local->tx_skb = skb; - queue_work(local->workqueue, &local->tx_work); + queue_work(local->workqueue, &local->sync_tx_work); } return NETDEV_TX_OK; -err_tx: +err_wake_netif_queue: + ieee802154_release_queue(local); + if (atomic_dec_and_test(&local->phy->ongoing_txs)) + wake_up(&local->phy->sync_txq); +err_free_skb: kfree_skb(skb); return NETDEV_TX_OK; } +static int ieee802154_sync_queue(struct ieee802154_local *local) +{ + int ret; + + ieee802154_hold_queue(local); + ieee802154_disable_queue(local); + wait_event(local->phy->sync_txq, !atomic_read(&local->phy->ongoing_txs)); + ret = local->tx_result; + ieee802154_release_queue(local); + + return ret; +} + +int ieee802154_sync_and_hold_queue(struct ieee802154_local *local) +{ + int ret; + + ieee802154_hold_queue(local); + ret = ieee802154_sync_queue(local); + set_bit(WPAN_PHY_FLAG_STATE_QUEUE_STOPPED, &local->phy->flags); + + return ret; +} + +int ieee802154_mlme_op_pre(struct ieee802154_local *local) +{ + return ieee802154_sync_and_hold_queue(local); +} + +int ieee802154_mlme_tx(struct ieee802154_local *local, + struct ieee802154_sub_if_data *sdata, + struct sk_buff *skb) +{ + int ret; + + /* Avoid possible calls to ->ndo_stop() when we asynchronously perform + * MLME transmissions. + */ + rtnl_lock(); + + /* Ensure the device was not stopped, otherwise error out */ + if (!local->open_count) { + rtnl_unlock(); + return -ENETDOWN; + } + + /* Warn if the ieee802154 core thinks MLME frames can be sent while the + * net interface expects this cannot happen. + */ + if (WARN_ON_ONCE(!netif_running(sdata->dev))) { + rtnl_unlock(); + return -ENETDOWN; + } + + ieee802154_tx(local, skb); + ret = ieee802154_sync_queue(local); + + rtnl_unlock(); + + return ret; +} + +void ieee802154_mlme_op_post(struct ieee802154_local *local) +{ + ieee802154_release_queue(local); +} + +int ieee802154_mlme_tx_one(struct ieee802154_local *local, + struct ieee802154_sub_if_data *sdata, + struct sk_buff *skb) +{ + int ret; + + ieee802154_mlme_op_pre(local); + ret = ieee802154_mlme_tx(local, sdata, skb); + ieee802154_mlme_op_post(local); + + return ret; +} + +static bool ieee802154_queue_is_stopped(struct ieee802154_local *local) +{ + return test_bit(WPAN_PHY_FLAG_STATE_QUEUE_STOPPED, &local->phy->flags); +} + +static netdev_tx_t +ieee802154_hot_tx(struct ieee802154_local *local, struct sk_buff *skb) +{ + /* Warn if the net interface tries to transmit frames while the + * ieee802154 core assumes the queue is stopped. + */ + WARN_ON_ONCE(ieee802154_queue_is_stopped(local)); + + return ieee802154_tx(local, skb); +} + netdev_tx_t ieee802154_monitor_start_xmit(struct sk_buff *skb, struct net_device *dev) { @@ -107,7 +211,7 @@ ieee802154_monitor_start_xmit(struct sk_buff *skb, struct net_device *dev) skb->skb_iif = dev->ifindex; - return ieee802154_tx(sdata->local, skb); + return ieee802154_hot_tx(sdata->local, skb); } netdev_tx_t @@ -129,5 +233,5 @@ ieee802154_subif_start_xmit(struct sk_buff *skb, struct net_device *dev) skb->skb_iif = dev->ifindex; - return ieee802154_tx(sdata->local, skb); + return ieee802154_hot_tx(sdata->local, skb); } diff --git a/net/mac802154/util.c b/net/mac802154/util.c index 9f024d85563b..ebc9a8521765 100644 --- a/net/mac802154/util.c +++ b/net/mac802154/util.c @@ -13,12 +13,23 @@ /* privid for wpan_phys to determine whether they belong to us or not */ const void *const mac802154_wpan_phy_privid = &mac802154_wpan_phy_privid; -void ieee802154_wake_queue(struct ieee802154_hw *hw) +/** + * ieee802154_wake_queue - wake ieee802154 queue + * @hw: main hardware object + * + * Tranceivers usually have either one transmit framebuffer or one framebuffer + * for both transmitting and receiving. Hence, the core currently only handles + * one frame at a time for each phy, which means we had to stop the queue to + * avoid new skb to come during the transmission. The queue then needs to be + * woken up after the operation. + */ +static void ieee802154_wake_queue(struct ieee802154_hw *hw) { struct ieee802154_local *local = hw_to_local(hw); struct ieee802154_sub_if_data *sdata; rcu_read_lock(); + clear_bit(WPAN_PHY_FLAG_STATE_QUEUE_STOPPED, &local->phy->flags); list_for_each_entry_rcu(sdata, &local->interfaces, list) { if (!sdata->dev) continue; @@ -27,9 +38,18 @@ void ieee802154_wake_queue(struct ieee802154_hw *hw) } rcu_read_unlock(); } -EXPORT_SYMBOL(ieee802154_wake_queue); -void ieee802154_stop_queue(struct ieee802154_hw *hw) +/** + * ieee802154_stop_queue - stop ieee802154 queue + * @hw: main hardware object + * + * Tranceivers usually have either one transmit framebuffer or one framebuffer + * for both transmitting and receiving. Hence, the core currently only handles + * one frame at a time for each phy, which means we need to tell upper layers to + * stop giving us new skbs while we are busy with the transmitted one. The queue + * must then be stopped before transmitting. + */ +static void ieee802154_stop_queue(struct ieee802154_hw *hw) { struct ieee802154_local *local = hw_to_local(hw); struct ieee802154_sub_if_data *sdata; @@ -43,14 +63,47 @@ void ieee802154_stop_queue(struct ieee802154_hw *hw) } rcu_read_unlock(); } -EXPORT_SYMBOL(ieee802154_stop_queue); + +void ieee802154_hold_queue(struct ieee802154_local *local) +{ + unsigned long flags; + + spin_lock_irqsave(&local->phy->queue_lock, flags); + if (!atomic_fetch_inc(&local->phy->hold_txs)) + ieee802154_stop_queue(&local->hw); + spin_unlock_irqrestore(&local->phy->queue_lock, flags); +} + +void ieee802154_release_queue(struct ieee802154_local *local) +{ + unsigned long flags; + + spin_lock_irqsave(&local->phy->queue_lock, flags); + if (atomic_dec_and_test(&local->phy->hold_txs)) + ieee802154_wake_queue(&local->hw); + spin_unlock_irqrestore(&local->phy->queue_lock, flags); +} + +void ieee802154_disable_queue(struct ieee802154_local *local) +{ + struct ieee802154_sub_if_data *sdata; + + rcu_read_lock(); + list_for_each_entry_rcu(sdata, &local->interfaces, list) { + if (!sdata->dev) + continue; + + netif_tx_disable(sdata->dev); + } + rcu_read_unlock(); +} enum hrtimer_restart ieee802154_xmit_ifs_timer(struct hrtimer *timer) { struct ieee802154_local *local = container_of(timer, struct ieee802154_local, ifs_timer); - ieee802154_wake_queue(&local->hw); + ieee802154_release_queue(local); return HRTIMER_NORESTART; } @@ -84,10 +137,12 @@ void ieee802154_xmit_complete(struct ieee802154_hw *hw, struct sk_buff *skb, hw->phy->sifs_period * NSEC_PER_USEC, HRTIMER_MODE_REL); } else { - ieee802154_wake_queue(hw); + ieee802154_release_queue(local); } dev_consume_skb_any(skb); + if (atomic_dec_and_test(&hw->phy->ongoing_txs)) + wake_up(&hw->phy->sync_txq); } EXPORT_SYMBOL(ieee802154_xmit_complete); @@ -97,8 +152,10 @@ void ieee802154_xmit_error(struct ieee802154_hw *hw, struct sk_buff *skb, struct ieee802154_local *local = hw_to_local(hw); local->tx_result = reason; - ieee802154_wake_queue(hw); + ieee802154_release_queue(local); dev_kfree_skb_any(skb); + if (atomic_dec_and_test(&hw->phy->ongoing_txs)) + wake_up(&hw->phy->sync_txq); } EXPORT_SYMBOL(ieee802154_xmit_error); diff --git a/net/mctp/af_mctp.c b/net/mctp/af_mctp.c index c2fc2a7b2528..b6b5e496fa40 100644 --- a/net/mctp/af_mctp.c +++ b/net/mctp/af_mctp.c @@ -295,11 +295,12 @@ __must_hold(&net->mctp.keys_lock) mctp_dev_release_key(key->dev, key); spin_unlock_irqrestore(&key->lock, flags); - hlist_del(&key->hlist); - hlist_del(&key->sklist); - - /* unref for the lists */ - mctp_key_unref(key); + if (!hlist_unhashed(&key->hlist)) { + hlist_del_init(&key->hlist); + hlist_del_init(&key->sklist); + /* unref for the lists */ + mctp_key_unref(key); + } kfree_skb(skb); } @@ -373,9 +374,17 @@ static int mctp_ioctl_alloctag(struct mctp_sock *msk, unsigned long arg) ctl.tag = tag | MCTP_TAG_OWNER | MCTP_TAG_PREALLOC; if (copy_to_user((void __user *)arg, &ctl, sizeof(ctl))) { - spin_lock_irqsave(&key->lock, flags); - __mctp_key_remove(key, net, flags, MCTP_TRACE_KEY_DROPPED); + unsigned long fl2; + /* Unwind our key allocation: the keys list lock needs to be + * taken before the individual key locks, and we need a valid + * flags value (fl2) to pass to __mctp_key_remove, hence the + * second spin_lock_irqsave() rather than a plain spin_lock(). + */ + spin_lock_irqsave(&net->mctp.keys_lock, flags); + spin_lock_irqsave(&key->lock, fl2); + __mctp_key_remove(key, net, fl2, MCTP_TRACE_KEY_DROPPED); mctp_key_unref(key); + spin_unlock_irqrestore(&net->mctp.keys_lock, flags); return -EFAULT; } diff --git a/net/mctp/route.c b/net/mctp/route.c index 3b24b8d18b5b..2155f15a074c 100644 --- a/net/mctp/route.c +++ b/net/mctp/route.c @@ -228,12 +228,12 @@ __releases(&key->lock) if (!key->manual_alloc) { spin_lock_irqsave(&net->mctp.keys_lock, flags); - hlist_del(&key->hlist); - hlist_del(&key->sklist); + if (!hlist_unhashed(&key->hlist)) { + hlist_del_init(&key->hlist); + hlist_del_init(&key->sklist); + mctp_key_unref(key); + } spin_unlock_irqrestore(&net->mctp.keys_lock, flags); - - /* unref for the lists */ - mctp_key_unref(key); } /* and one for the local reference */ diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c index b52afe316dc4..35b5f806fdda 100644 --- a/net/mpls/af_mpls.c +++ b/net/mpls/af_mpls.c @@ -1079,9 +1079,9 @@ static void mpls_get_stats(struct mpls_dev *mdev, p = per_cpu_ptr(mdev->stats, i); do { - start = u64_stats_fetch_begin_irq(&p->syncp); + start = u64_stats_fetch_begin(&p->syncp); local = p->stats; - } while (u64_stats_fetch_retry_irq(&p->syncp, start)); + } while (u64_stats_fetch_retry(&p->syncp, start)); stats->rx_packets += local.rx_packets; stats->rx_bytes += local.rx_bytes; diff --git a/net/mptcp/mptcp_diag.c b/net/mptcp/mptcp_diag.c index 7f9a71780437..8df1bdb647e2 100644 --- a/net/mptcp/mptcp_diag.c +++ b/net/mptcp/mptcp_diag.c @@ -81,15 +81,18 @@ static void mptcp_diag_dump_listeners(struct sk_buff *skb, struct netlink_callba struct mptcp_diag_ctx *diag_ctx = (void *)cb->ctx; struct nlattr *bc = cb_data->inet_diag_nla_bc; struct net *net = sock_net(skb->sk); + struct inet_hashinfo *hinfo; int i; - for (i = diag_ctx->l_slot; i <= tcp_hashinfo.lhash2_mask; i++) { + hinfo = net->ipv4.tcp_death_row.hashinfo; + + for (i = diag_ctx->l_slot; i <= hinfo->lhash2_mask; i++) { struct inet_listen_hashbucket *ilb; struct hlist_nulls_node *node; struct sock *sk; int num = 0; - ilb = &tcp_hashinfo.lhash2[i]; + ilb = &hinfo->lhash2[i]; rcu_read_lock(); spin_lock(&ilb->lock); diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c index a3e4ee7af0ee..9813ed0fde9b 100644 --- a/net/mptcp/pm_netlink.c +++ b/net/mptcp/pm_netlink.c @@ -796,7 +796,7 @@ static void mptcp_pm_nl_rm_addr_or_subflow(struct mptcp_sock *msk, u8 rm_id = rm_list->ids[i]; bool removed = false; - list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) { + mptcp_for_each_subflow_safe(msk, subflow, tmp) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); int how = RCV_SHUTDOWN | SEND_SHUTDOWN; u8 id = subflow->local_id; @@ -1327,7 +1327,7 @@ static int mptcp_nl_cmd_add_addr(struct sk_buff *skb, struct genl_info *info) return -EINVAL; } - entry = kmalloc(sizeof(*entry), GFP_KERNEL); + entry = kmalloc(sizeof(*entry), GFP_KERNEL_ACCOUNT); if (!entry) { GENL_SET_ERR_MSG(info, "can't allocate addr"); return -ENOMEM; @@ -2218,17 +2218,17 @@ static const struct genl_small_ops mptcp_pm_ops[] = { { .cmd = MPTCP_PM_CMD_ADD_ADDR, .doit = mptcp_nl_cmd_add_addr, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, }, { .cmd = MPTCP_PM_CMD_DEL_ADDR, .doit = mptcp_nl_cmd_del_addr, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, }, { .cmd = MPTCP_PM_CMD_FLUSH_ADDRS, .doit = mptcp_nl_cmd_flush_addrs, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, }, { .cmd = MPTCP_PM_CMD_GET_ADDR, @@ -2238,7 +2238,7 @@ static const struct genl_small_ops mptcp_pm_ops[] = { { .cmd = MPTCP_PM_CMD_SET_LIMITS, .doit = mptcp_nl_cmd_set_limits, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, }, { .cmd = MPTCP_PM_CMD_GET_LIMITS, @@ -2247,27 +2247,27 @@ static const struct genl_small_ops mptcp_pm_ops[] = { { .cmd = MPTCP_PM_CMD_SET_FLAGS, .doit = mptcp_nl_cmd_set_flags, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, }, { .cmd = MPTCP_PM_CMD_ANNOUNCE, .doit = mptcp_nl_cmd_announce, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, }, { .cmd = MPTCP_PM_CMD_REMOVE, .doit = mptcp_nl_cmd_remove, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, }, { .cmd = MPTCP_PM_CMD_SUBFLOW_CREATE, .doit = mptcp_nl_cmd_sf_create, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, }, { .cmd = MPTCP_PM_CMD_SUBFLOW_DESTROY, .doit = mptcp_nl_cmd_sf_destroy, - .flags = GENL_ADMIN_PERM, + .flags = GENL_UNS_ADMIN_PERM, }, }; diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c index d398f3810662..109eea2c65ff 100644 --- a/net/mptcp/protocol.c +++ b/net/mptcp/protocol.c @@ -150,9 +150,15 @@ static bool mptcp_try_coalesce(struct sock *sk, struct sk_buff *to, MPTCP_SKB_CB(from)->map_seq, MPTCP_SKB_CB(to)->map_seq, to->len, MPTCP_SKB_CB(from)->end_seq); MPTCP_SKB_CB(to)->end_seq = MPTCP_SKB_CB(from)->end_seq; - kfree_skb_partial(from, fragstolen); + + /* note the fwd memory can reach a negative value after accounting + * for the delta, but the later skb free will restore a non + * negative one + */ atomic_add(delta, &sk->sk_rmem_alloc); mptcp_rmem_charge(sk, delta); + kfree_skb_partial(from, fragstolen); + return true; } @@ -656,9 +662,9 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, skb = skb_peek(&ssk->sk_receive_queue); if (!skb) { - /* if no data is found, a racing workqueue/recvmsg - * already processed the new data, stop here or we - * can enter an infinite loop + /* With racing move_skbs_to_msk() and __mptcp_move_skbs(), + * a different CPU can have already processed the pending + * data, stop here or we can enter an infinite loop */ if (!moved) done = true; @@ -666,9 +672,9 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk, } if (__mptcp_check_fallback(msk)) { - /* if we are running under the workqueue, TCP could have - * collapsed skbs between dummy map creation and now - * be sure to adjust the size + /* Under fallback skbs have no MPTCP extension and TCP could + * collapse them between the dummy map creation and the + * current dequeue. Be sure to adjust the map size. */ map_remaining = skb->len; subflow->map_data_len = skb->len; @@ -1538,8 +1544,9 @@ void __mptcp_push_pending(struct sock *sk, unsigned int flags) struct mptcp_sendmsg_info info = { .flags = flags, }; + bool do_check_data_fin = false; struct mptcp_data_frag *dfrag; - int len, copied = 0; + int len; while ((dfrag = mptcp_send_head(sk))) { info.sent = dfrag->already_sent; @@ -1574,8 +1581,8 @@ void __mptcp_push_pending(struct sock *sk, unsigned int flags) goto out; } + do_check_data_fin = true; info.sent += ret; - copied += ret; len -= ret; mptcp_update_post_push(msk, dfrag, ret); @@ -1591,7 +1598,7 @@ out: /* ensure the rtx timer is running */ if (!mptcp_timer_pending(sk)) mptcp_reset_timer(sk); - if (copied) + if (do_check_data_fin) __mptcp_check_send_data_fin(sk); } @@ -1666,10 +1673,42 @@ static void mptcp_set_nospace(struct sock *sk) set_bit(MPTCP_NOSPACE, &mptcp_sk(sk)->flags); } +static int mptcp_sendmsg_fastopen(struct sock *sk, struct sock *ssk, struct msghdr *msg, + size_t len, int *copied_syn) +{ + unsigned int saved_flags = msg->msg_flags; + struct mptcp_sock *msk = mptcp_sk(sk); + int ret; + + lock_sock(ssk); + msg->msg_flags |= MSG_DONTWAIT; + msk->connect_flags = O_NONBLOCK; + msk->is_sendmsg = 1; + ret = tcp_sendmsg_fastopen(ssk, msg, copied_syn, len, NULL); + msk->is_sendmsg = 0; + msg->msg_flags = saved_flags; + release_sock(ssk); + + /* do the blocking bits of inet_stream_connect outside the ssk socket lock */ + if (ret == -EINPROGRESS && !(msg->msg_flags & MSG_DONTWAIT)) { + ret = __inet_stream_connect(sk->sk_socket, msg->msg_name, + msg->msg_namelen, msg->msg_flags, 1); + + /* Keep the same behaviour of plain TCP: zero the copied bytes in + * case of any error, except timeout or signal + */ + if (ret && ret != -EINPROGRESS && ret != -ERESTARTSYS && ret != -EINTR) + *copied_syn = 0; + } + + return ret; +} + static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) { struct mptcp_sock *msk = mptcp_sk(sk); struct page_frag *pfrag; + struct socket *ssock; size_t copied = 0; int ret = 0; long timeo; @@ -1683,14 +1722,30 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) lock_sock(sk); + ssock = __mptcp_nmpc_socket(msk); + if (unlikely(ssock && inet_sk(ssock->sk)->defer_connect)) { + int copied_syn = 0; + + ret = mptcp_sendmsg_fastopen(sk, ssock->sk, msg, len, &copied_syn); + copied += copied_syn; + if (ret == -EINPROGRESS && copied_syn > 0) + goto out; + else if (ret) + goto do_error; + } + timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT); if ((1 << sk->sk_state) & ~(TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) { ret = sk_stream_wait_connect(sk, &timeo); if (ret) - goto out; + goto do_error; } + ret = -EPIPE; + if (unlikely(sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))) + goto do_error; + pfrag = sk_page_frag(sk); while (msg_data_left(msg)) { @@ -1699,11 +1754,6 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) bool dfrag_collapsed; size_t psize, offset; - if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN)) { - ret = -EPIPE; - goto out; - } - /* reuse tail pfrag, if possible, or carve a new one from the * page allocator */ @@ -1735,7 +1785,7 @@ static int mptcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len) if (copy_page_from_iter(dfrag->page, offset, psize, &msg->msg_iter) != psize) { ret = -EFAULT; - goto out; + goto do_error; } /* data successfully copied into the write queue */ @@ -1767,7 +1817,7 @@ wait_for_memory: __mptcp_push_pending(sk, msg->msg_flags); ret = sk_stream_wait_memory(sk, &timeo); if (ret) - goto out; + goto do_error; } if (copied) @@ -1775,7 +1825,14 @@ wait_for_memory: out: release_sock(sk); - return copied ? : ret; + return copied; + +do_error: + if (copied) + goto out; + + copied = sk_stream_error(sk, msg->msg_flags, ret); + goto out; } static int __mptcp_recvmsg_mskq(struct mptcp_sock *msk, @@ -2278,8 +2335,14 @@ static void __mptcp_close_ssk(struct sock *sk, struct sock *ssk, lock_sock_nested(ssk, SINGLE_DEPTH_NESTING); - if (flags & MPTCP_CF_FASTCLOSE) + if (flags & MPTCP_CF_FASTCLOSE) { + /* be sure to force the tcp_disconnect() path, + * to generate the egress reset + */ + ssk->sk_lingertime = 0; + sock_set_flag(ssk, SOCK_LINGER); subflow->send_fastclose = 1; + } need_push = (flags & MPTCP_CF_PUSH) && __mptcp_retransmit_pending_data(sk); if (!dispose_it) { @@ -2357,7 +2420,7 @@ static void __mptcp_close_subflow(struct mptcp_sock *msk) might_sleep(); - list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) { + mptcp_for_each_subflow_safe(msk, subflow, tmp) { struct sock *ssk = mptcp_subflow_tcp_sock(subflow); if (inet_sk_state_load(ssk) != TCP_CLOSE) @@ -2400,7 +2463,7 @@ static void mptcp_check_fastclose(struct mptcp_sock *msk) mptcp_token_destroy(msk); - list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) { + mptcp_for_each_subflow_safe(msk, subflow, tmp) { struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow); bool slow; @@ -2412,12 +2475,31 @@ static void mptcp_check_fastclose(struct mptcp_sock *msk) unlock_sock_fast(tcp_sk, slow); } + /* Mirror the tcp_reset() error propagation */ + switch (sk->sk_state) { + case TCP_SYN_SENT: + sk->sk_err = ECONNREFUSED; + break; + case TCP_CLOSE_WAIT: + sk->sk_err = EPIPE; + break; + case TCP_CLOSE: + return; + default: + sk->sk_err = ECONNRESET; + } + inet_sk_state_store(sk, TCP_CLOSE); sk->sk_shutdown = SHUTDOWN_MASK; smp_mb__before_atomic(); /* SHUTDOWN must be visible first */ set_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags); - mptcp_close_wake_up(sk); + /* the calling mptcp_worker will properly destroy the socket */ + if (sock_flag(sk, SOCK_DEAD)) + return; + + sk->sk_state_change(sk); + sk_error_report(sk); } static void __mptcp_retrans(struct sock *sk) @@ -2523,6 +2605,16 @@ static void mptcp_mp_fail_no_response(struct mptcp_sock *msk) mptcp_reset_timeout(msk, 0); } +static void mptcp_do_fastclose(struct sock *sk) +{ + struct mptcp_subflow_context *subflow, *tmp; + struct mptcp_sock *msk = mptcp_sk(sk); + + mptcp_for_each_subflow_safe(msk, subflow, tmp) + __mptcp_close_ssk(sk, mptcp_subflow_tcp_sock(subflow), + subflow, MPTCP_CF_FASTCLOSE); +} + static void mptcp_worker(struct work_struct *work) { struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work); @@ -2551,11 +2643,15 @@ static void mptcp_worker(struct work_struct *work) * closed, but we need the msk around to reply to incoming DATA_FIN, * even if it is orphaned and in FIN_WAIT2 state */ - if (sock_flag(sk, SOCK_DEAD) && - (mptcp_check_close_timeout(sk) || sk->sk_state == TCP_CLOSE)) { - inet_sk_state_store(sk, TCP_CLOSE); - __mptcp_destroy_sock(sk); - goto unlock; + if (sock_flag(sk, SOCK_DEAD)) { + if (mptcp_check_close_timeout(sk)) { + inet_sk_state_store(sk, TCP_CLOSE); + mptcp_do_fastclose(sk); + } + if (sk->sk_state == TCP_CLOSE) { + __mptcp_destroy_sock(sk); + goto unlock; + } } if (test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags)) @@ -2634,6 +2730,8 @@ static int mptcp_init_sock(struct sock *sk) if (ret) return ret; + set_bit(SOCK_CUSTOM_SOCKOPT, &sk->sk_socket->flags); + /* fetch the ca name; do it outside __mptcp_init_sock(), so that clone will * propagate the correct value */ @@ -2656,7 +2754,7 @@ static void __mptcp_clear_xmit(struct sock *sk) dfrag_clear(sk, dfrag); } -static void mptcp_cancel_work(struct sock *sk) +void mptcp_cancel_work(struct sock *sk) { struct mptcp_sock *msk = mptcp_sk(sk); @@ -2796,13 +2894,24 @@ static void __mptcp_destroy_sock(struct sock *sk) sock_put(sk); } -static void mptcp_close(struct sock *sk, long timeout) +static __poll_t mptcp_check_readable(struct mptcp_sock *msk) +{ + /* Concurrent splices from sk_receive_queue into receive_queue will + * always show at least one non-empty queue when checked in this order. + */ + if (skb_queue_empty_lockless(&((struct sock *)msk)->sk_receive_queue) && + skb_queue_empty_lockless(&msk->receive_queue)) + return 0; + + return EPOLLIN | EPOLLRDNORM; +} + +bool __mptcp_close(struct sock *sk, long timeout) { struct mptcp_subflow_context *subflow; struct mptcp_sock *msk = mptcp_sk(sk); bool do_cancel_work = false; - lock_sock(sk); sk->sk_shutdown = SHUTDOWN_MASK; if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) { @@ -2810,8 +2919,13 @@ static void mptcp_close(struct sock *sk, long timeout) goto cleanup; } - if (mptcp_close_state(sk)) + if (mptcp_check_readable(msk)) { + /* the msk has read data, do the MPTCP equivalent of TCP reset */ + inet_sk_state_store(sk, TCP_CLOSE); + mptcp_do_fastclose(sk); + } else if (mptcp_close_state(sk)) { __mptcp_wr_shutdown(sk); + } sk_stream_wait_close(sk, timeout); @@ -2844,6 +2958,17 @@ cleanup: } else { mptcp_reset_timeout(msk, 0); } + + return do_cancel_work; +} + +static void mptcp_close(struct sock *sk, long timeout) +{ + bool do_cancel_work; + + lock_sock(sk); + + do_cancel_work = __mptcp_close(sk, timeout); release_sock(sk); if (do_cancel_work) mptcp_cancel_work(sk); @@ -2851,7 +2976,7 @@ cleanup: sock_put(sk); } -static void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk) +void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk) { #if IS_ENABLED(CONFIG_MPTCP_IPV6) const struct ipv6_pinfo *ssk6 = inet6_sk(ssk); @@ -3047,7 +3172,7 @@ void mptcp_destroy_common(struct mptcp_sock *msk, unsigned int flags) __mptcp_clear_xmit(sk); /* join list will be eventually flushed (with rst) at sock lock release time */ - list_for_each_entry_safe(subflow, tmp, &msk->conn_list, node) + mptcp_for_each_subflow_safe(msk, subflow, tmp) __mptcp_close_ssk(sk, mptcp_subflow_tcp_sock(subflow), subflow, flags); /* move to sk_receive_queue, sk_stream_kill_queues will purge it */ @@ -3406,10 +3531,73 @@ static int mptcp_ioctl(struct sock *sk, int cmd, unsigned long arg) return put_user(answ, (int __user *)arg); } +static void mptcp_subflow_early_fallback(struct mptcp_sock *msk, + struct mptcp_subflow_context *subflow) +{ + subflow->request_mptcp = 0; + __mptcp_do_fallback(msk); +} + +static int mptcp_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len) +{ + struct mptcp_subflow_context *subflow; + struct mptcp_sock *msk = mptcp_sk(sk); + struct socket *ssock; + int err = -EINVAL; + + ssock = __mptcp_nmpc_socket(msk); + if (!ssock) + return -EINVAL; + + mptcp_token_destroy(msk); + inet_sk_state_store(sk, TCP_SYN_SENT); + subflow = mptcp_subflow_ctx(ssock->sk); +#ifdef CONFIG_TCP_MD5SIG + /* no MPTCP if MD5SIG is enabled on this socket or we may run out of + * TCP option space. + */ + if (rcu_access_pointer(tcp_sk(ssock->sk)->md5sig_info)) + mptcp_subflow_early_fallback(msk, subflow); +#endif + if (subflow->request_mptcp && mptcp_token_new_connect(ssock->sk)) { + MPTCP_INC_STATS(sock_net(ssock->sk), MPTCP_MIB_TOKENFALLBACKINIT); + mptcp_subflow_early_fallback(msk, subflow); + } + if (likely(!__mptcp_check_fallback(msk))) + MPTCP_INC_STATS(sock_net(sk), MPTCP_MIB_MPCAPABLEACTIVE); + + /* if reaching here via the fastopen/sendmsg path, the caller already + * acquired the subflow socket lock, too. + */ + if (msk->is_sendmsg) + err = __inet_stream_connect(ssock, uaddr, addr_len, msk->connect_flags, 1); + else + err = inet_stream_connect(ssock, uaddr, addr_len, msk->connect_flags); + inet_sk(sk)->defer_connect = inet_sk(ssock->sk)->defer_connect; + + /* on successful connect, the msk state will be moved to established by + * subflow_finish_connect() + */ + if (unlikely(err && err != -EINPROGRESS)) { + inet_sk_state_store(sk, inet_sk_state_load(ssock->sk)); + return err; + } + + mptcp_copy_inaddrs(sk, ssock->sk); + + /* unblocking connect, mptcp-level inet_stream_connect will error out + * without changing the socket state, update it here. + */ + if (err == -EINPROGRESS) + sk->sk_socket->state = ssock->state; + return err; +} + static struct proto mptcp_prot = { .name = "MPTCP", .owner = THIS_MODULE, .init = mptcp_init_sock, + .connect = mptcp_connect, .disconnect = mptcp_disconnect, .close = mptcp_close, .accept = mptcp_accept, @@ -3461,77 +3649,16 @@ unlock: return err; } -static void mptcp_subflow_early_fallback(struct mptcp_sock *msk, - struct mptcp_subflow_context *subflow) -{ - subflow->request_mptcp = 0; - __mptcp_do_fallback(msk); -} - static int mptcp_stream_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags) { - struct mptcp_sock *msk = mptcp_sk(sock->sk); - struct mptcp_subflow_context *subflow; - struct socket *ssock; - int err = -EINVAL; + int ret; lock_sock(sock->sk); - if (uaddr) { - if (addr_len < sizeof(uaddr->sa_family)) - goto unlock; - - if (uaddr->sa_family == AF_UNSPEC) { - err = mptcp_disconnect(sock->sk, flags); - sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED; - goto unlock; - } - } - - if (sock->state != SS_UNCONNECTED && msk->subflow) { - /* pending connection or invalid state, let existing subflow - * cope with that - */ - ssock = msk->subflow; - goto do_connect; - } - - ssock = __mptcp_nmpc_socket(msk); - if (!ssock) - goto unlock; - - mptcp_token_destroy(msk); - inet_sk_state_store(sock->sk, TCP_SYN_SENT); - subflow = mptcp_subflow_ctx(ssock->sk); -#ifdef CONFIG_TCP_MD5SIG - /* no MPTCP if MD5SIG is enabled on this socket or we may run out of - * TCP option space. - */ - if (rcu_access_pointer(tcp_sk(ssock->sk)->md5sig_info)) - mptcp_subflow_early_fallback(msk, subflow); -#endif - if (subflow->request_mptcp && mptcp_token_new_connect(ssock->sk)) { - MPTCP_INC_STATS(sock_net(ssock->sk), MPTCP_MIB_TOKENFALLBACKINIT); - mptcp_subflow_early_fallback(msk, subflow); - } - if (likely(!__mptcp_check_fallback(msk))) - MPTCP_INC_STATS(sock_net(sock->sk), MPTCP_MIB_MPCAPABLEACTIVE); - -do_connect: - err = ssock->ops->connect(ssock, uaddr, addr_len, flags); - sock->state = ssock->state; - - /* on successful connect, the msk state will be moved to established by - * subflow_finish_connect() - */ - if (!err || err == -EINPROGRESS) - mptcp_copy_inaddrs(sock->sk, ssock->sk); - else - inet_sk_state_store(sock->sk, inet_sk_state_load(ssock->sk)); - -unlock: + mptcp_sk(sock->sk)->connect_flags = flags; + ret = __inet_stream_connect(sock, uaddr, addr_len, flags, 0); release_sock(sock->sk); - return err; + return ret; } static int mptcp_listen(struct socket *sock, int backlog) @@ -3582,6 +3709,8 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, struct mptcp_subflow_context *subflow; struct sock *newsk = newsock->sk; + set_bit(SOCK_CUSTOM_SOCKOPT, &newsock->flags); + lock_sock(newsk); /* PM/worker can now acquire the first subflow socket @@ -3597,7 +3726,6 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, if (mptcp_is_fully_established(newsk)) mptcp_pm_fully_established(msk, msk->first, GFP_KERNEL); - mptcp_copy_inaddrs(newsk, msk->first); mptcp_rcv_space_init(msk, msk->first); mptcp_propagate_sndbuf(newsk, msk->first); @@ -3616,18 +3744,6 @@ static int mptcp_stream_accept(struct socket *sock, struct socket *newsock, return err; } -static __poll_t mptcp_check_readable(struct mptcp_sock *msk) -{ - /* Concurrent splices from sk_receive_queue into receive_queue will - * always show at least one non-empty queue when checked in this order. - */ - if (skb_queue_empty_lockless(&((struct sock *)msk)->sk_receive_queue) && - skb_queue_empty_lockless(&msk->receive_queue)) - return 0; - - return EPOLLIN | EPOLLRDNORM; -} - static __poll_t mptcp_check_writeable(struct mptcp_sock *msk) { struct sock *sk = (struct sock *)msk; @@ -3669,13 +3785,16 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock, if (state != TCP_SYN_SENT && state != TCP_SYN_RECV) { mask |= mptcp_check_readable(msk); mask |= mptcp_check_writeable(msk); + } else if (state == TCP_SYN_SENT && inet_sk(sk)->defer_connect) { + /* cf tcp_poll() note about TFO */ + mask |= EPOLLOUT | EPOLLWRNORM; } if (sk->sk_shutdown == SHUTDOWN_MASK || state == TCP_CLOSE) mask |= EPOLLHUP; if (sk->sk_shutdown & RCV_SHUTDOWN) mask |= EPOLLIN | EPOLLRDNORM | EPOLLRDHUP; - /* This barrier is coupled with smp_wmb() in tcp_reset() */ + /* This barrier is coupled with smp_wmb() in __mptcp_error_report() */ smp_rmb(); if (sk->sk_err) mask |= EPOLLERR; @@ -3805,12 +3924,6 @@ static const struct proto_ops mptcp_v6_stream_ops = { static struct proto mptcp_v6_prot; -static void mptcp_v6_destroy(struct sock *sk) -{ - mptcp_destroy(sk); - inet6_destroy_sock(sk); -} - static struct inet_protosw mptcp_v6_protosw = { .type = SOCK_STREAM, .protocol = IPPROTO_MPTCP, @@ -3826,7 +3939,6 @@ int __init mptcp_proto_v6_init(void) mptcp_v6_prot = mptcp_prot; strcpy(mptcp_v6_prot.name, "MPTCPv6"); mptcp_v6_prot.slab = NULL; - mptcp_v6_prot.destroy = mptcp_v6_destroy; mptcp_v6_prot.obj_size = sizeof(struct mptcp6_sock); err = proto_register(&mptcp_v6_prot, 1); diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h index 132d50833df1..6a09ab99a12d 100644 --- a/net/mptcp/protocol.h +++ b/net/mptcp/protocol.h @@ -285,7 +285,9 @@ struct mptcp_sock { u8 mpc_endpoint_id; u8 recvmsg_inq:1, cork:1, - nodelay:1; + nodelay:1, + is_sendmsg:1; + int connect_flags; struct work_struct work; struct sk_buff *ooo_last_skb; struct rb_root out_of_order_queue; @@ -314,6 +316,8 @@ struct mptcp_sock { #define mptcp_for_each_subflow(__msk, __subflow) \ list_for_each_entry(__subflow, &((__msk)->conn_list), node) +#define mptcp_for_each_subflow_safe(__msk, __subflow, __tmp) \ + list_for_each_entry_safe(__subflow, __tmp, &((__msk)->conn_list), node) static inline void msk_owned_by_me(const struct mptcp_sock *msk) { @@ -597,6 +601,7 @@ int mptcp_is_checksum_enabled(const struct net *net); int mptcp_allow_join_id0(const struct net *net); unsigned int mptcp_stale_loss_cnt(const struct net *net); int mptcp_get_pm_type(const struct net *net); +void mptcp_copy_inaddrs(struct sock *msk, const struct sock *ssk); void mptcp_subflow_fully_established(struct mptcp_subflow_context *subflow, struct mptcp_options_received *mp_opt); bool __mptcp_retransmit_pending_data(struct sock *sk); @@ -612,6 +617,8 @@ void mptcp_subflow_reset(struct sock *ssk); void mptcp_subflow_queue_clean(struct sock *ssk); void mptcp_sock_graft(struct sock *sk, struct socket *parent); struct socket *__mptcp_nmpc_socket(const struct mptcp_sock *msk); +bool __mptcp_close(struct sock *sk, long timeout); +void mptcp_cancel_work(struct sock *sk); bool mptcp_addresses_equal(const struct mptcp_addr_info *a, const struct mptcp_addr_info *b, bool use_port); diff --git a/net/mptcp/sockopt.c b/net/mptcp/sockopt.c index 423d3826ca1e..f85e9bbfe86f 100644 --- a/net/mptcp/sockopt.c +++ b/net/mptcp/sockopt.c @@ -559,6 +559,8 @@ static bool mptcp_supported_sockopt(int level, int optname) case TCP_NOTSENT_LOWAT: case TCP_TX_DELAY: case TCP_INQ: + case TCP_FASTOPEN_CONNECT: + case TCP_FASTOPEN_NO_COOKIE: return true; } @@ -567,8 +569,8 @@ static bool mptcp_supported_sockopt(int level, int optname) /* TCP_REPAIR, TCP_REPAIR_QUEUE, TCP_QUEUE_SEQ, TCP_REPAIR_OPTIONS, * TCP_REPAIR_WINDOW are not supported, better avoid this mess */ - /* TCP_FASTOPEN_KEY, TCP_FASTOPEN TCP_FASTOPEN_CONNECT, TCP_FASTOPEN_NO_COOKIE, - * are not supported fastopen is currently unsupported + /* TCP_FASTOPEN_KEY, TCP_FASTOPEN are not supported because + * fastopen for the listener side is currently unsupported */ } return false; @@ -756,16 +758,17 @@ static int mptcp_setsockopt_v4(struct mptcp_sock *msk, int optname, return -EOPNOTSUPP; } -static int mptcp_setsockopt_sol_tcp_defer(struct mptcp_sock *msk, sockptr_t optval, - unsigned int optlen) +static int mptcp_setsockopt_first_sf_only(struct mptcp_sock *msk, int level, int optname, + sockptr_t optval, unsigned int optlen) { - struct socket *listener; + struct socket *sock; - listener = __mptcp_nmpc_socket(msk); - if (!listener) - return 0; /* TCP_DEFER_ACCEPT does not fail */ + /* Limit to first subflow, before the connection establishment */ + sock = __mptcp_nmpc_socket(msk); + if (!sock) + return -EINVAL; - return tcp_setsockopt(listener->sk, SOL_TCP, TCP_DEFER_ACCEPT, optval, optlen); + return tcp_setsockopt(sock->sk, level, optname, optval, optlen); } static int mptcp_setsockopt_sol_tcp(struct mptcp_sock *msk, int optname, @@ -795,7 +798,13 @@ static int mptcp_setsockopt_sol_tcp(struct mptcp_sock *msk, int optname, case TCP_NODELAY: return mptcp_setsockopt_sol_tcp_nodelay(msk, optval, optlen); case TCP_DEFER_ACCEPT: - return mptcp_setsockopt_sol_tcp_defer(msk, optval, optlen); + /* See tcp.c: TCP_DEFER_ACCEPT does not fail */ + mptcp_setsockopt_first_sf_only(msk, SOL_TCP, optname, optval, optlen); + return 0; + case TCP_FASTOPEN_CONNECT: + case TCP_FASTOPEN_NO_COOKIE: + return mptcp_setsockopt_first_sf_only(msk, SOL_TCP, optname, + optval, optlen); } return -EOPNOTSUPP; @@ -1157,6 +1166,8 @@ static int mptcp_getsockopt_sol_tcp(struct mptcp_sock *msk, int optname, case TCP_INFO: case TCP_CC_INFO: case TCP_DEFER_ACCEPT: + case TCP_FASTOPEN_CONNECT: + case TCP_FASTOPEN_NO_COOKIE: return mptcp_getsockopt_first_sf_only(msk, SOL_TCP, optname, optval, optlen); case TCP_INQ: diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c index c7d49fb6e7bd..437a283ba6ea 100644 --- a/net/mptcp/subflow.c +++ b/net/mptcp/subflow.c @@ -602,30 +602,6 @@ static bool subflow_hmac_valid(const struct request_sock *req, return !crypto_memneq(hmac, mp_opt->hmac, MPTCPOPT_HMAC_LEN); } -static void mptcp_sock_destruct(struct sock *sk) -{ - /* if new mptcp socket isn't accepted, it is free'd - * from the tcp listener sockets request queue, linked - * from req->sk. The tcp socket is released. - * This calls the ULP release function which will - * also remove the mptcp socket, via - * sock_put(ctx->conn). - * - * Problem is that the mptcp socket will be in - * ESTABLISHED state and will not have the SOCK_DEAD flag. - * Both result in warnings from inet_sock_destruct. - */ - if ((1 << sk->sk_state) & (TCPF_ESTABLISHED | TCPF_CLOSE_WAIT)) { - sk->sk_state = TCP_CLOSE; - WARN_ON_ONCE(sk->sk_socket); - sock_orphan(sk); - } - - /* We don't need to clear msk->subflow, as it's still NULL at this point */ - mptcp_destroy_common(mptcp_sk(sk), 0); - inet_sock_destruct(sk); -} - static void mptcp_force_close(struct sock *sk) { /* the msk is not yet exposed to user-space */ @@ -747,6 +723,8 @@ create_child: goto dispose_child; } + if (new_msk) + mptcp_copy_inaddrs(new_msk, child); subflow_drop_ctx(child); goto out; } @@ -768,13 +746,17 @@ create_child: /* new mpc subflow takes ownership of the newly * created mptcp socket */ - new_msk->sk_destruct = mptcp_sock_destruct; mptcp_sk(new_msk)->setsockopt_seq = ctx->setsockopt_seq; mptcp_pm_new_connection(mptcp_sk(new_msk), child, 1); mptcp_token_accept(subflow_req, mptcp_sk(new_msk)); ctx->conn = new_msk; new_msk = NULL; + /* set msk addresses early to ensure mptcp_pm_get_local_id() + * uses the correct data + */ + mptcp_copy_inaddrs(ctx->conn, child); + /* with OoO packets we can reach here without ingress * mpc option */ @@ -1620,7 +1602,9 @@ int mptcp_subflow_create_socket(struct sock *sk, struct socket **new_sock) /* kernel sockets do not by default acquire net ref, but TCP timer * needs it. + * Update ns_tracker to current stack trace and refcounted tracker. */ + __netns_tracker_free(net, &sf->sk->ns_tracker, false); sf->sk->sk_net_refcnt = 1; get_net_track(net, &sf->sk->ns_tracker, GFP_KERNEL); sock_inuse_add(net, 1); @@ -1763,13 +1747,19 @@ void mptcp_subflow_queue_clean(struct sock *listener_ssk) for (msk = head; msk; msk = next) { struct sock *sk = (struct sock *)msk; - bool slow; + bool slow, do_cancel_work; + sock_hold(sk); slow = lock_sock_fast_nested(sk); next = msk->dl_next; msk->first = NULL; msk->dl_next = NULL; + + do_cancel_work = __mptcp_close(sk, 0); unlock_sock_fast(sk, slow); + if (do_cancel_work) + mptcp_cancel_work(sk); + sock_put(sk); } /* we are still under the listener msk socket lock */ diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 4b8d04640ff3..0846bd75b1da 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -568,12 +568,6 @@ config NFT_TUNNEL This option adds the "tunnel" expression that you can use to set tunneling policies. -config NFT_OBJREF - tristate "Netfilter nf_tables stateful object reference module" - help - This option adds the "objref" expression that allows you to refer to - stateful objects, such as counters and quotas. - config NFT_QUEUE depends on NETFILTER_NETLINK_QUEUE tristate "Netfilter nf_tables queue module" diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile index 06df49ea6329..1d4db1943936 100644 --- a/net/netfilter/Makefile +++ b/net/netfilter/Makefile @@ -60,6 +60,12 @@ obj-$(CONFIG_NF_NAT) += nf_nat.o nf_nat-$(CONFIG_NF_NAT_REDIRECT) += nf_nat_redirect.o nf_nat-$(CONFIG_NF_NAT_MASQUERADE) += nf_nat_masquerade.o +ifeq ($(CONFIG_NF_NAT),m) +nf_nat-$(CONFIG_DEBUG_INFO_BTF_MODULES) += nf_nat_bpf.o +else ifeq ($(CONFIG_NF_NAT),y) +nf_nat-$(CONFIG_DEBUG_INFO_BTF) += nf_nat_bpf.o +endif + # NAT helpers obj-$(CONFIG_NF_NAT_AMANDA) += nf_nat_amanda.o obj-$(CONFIG_NF_NAT_FTP) += nf_nat_ftp.o @@ -80,7 +86,8 @@ nf_tables-objs := nf_tables_core.o nf_tables_api.o nft_chain_filter.o \ nf_tables_trace.o nft_immediate.o nft_cmp.o nft_range.o \ nft_bitwise.o nft_byteorder.o nft_payload.o nft_lookup.o \ nft_dynset.o nft_meta.o nft_rt.o nft_exthdr.o nft_last.o \ - nft_counter.o nft_chain_route.o nf_tables_offload.o \ + nft_counter.o nft_objref.o nft_inner.o \ + nft_chain_route.o nf_tables_offload.o \ nft_set_hash.o nft_set_bitmap.o nft_set_rbtree.o \ nft_set_pipapo.o @@ -98,7 +105,6 @@ obj-$(CONFIG_NFT_CT) += nft_ct.o obj-$(CONFIG_NFT_FLOW_OFFLOAD) += nft_flow_offload.o obj-$(CONFIG_NFT_LIMIT) += nft_limit.o obj-$(CONFIG_NFT_NAT) += nft_nat.o -obj-$(CONFIG_NFT_OBJREF) += nft_objref.o obj-$(CONFIG_NFT_QUEUE) += nft_queue.o obj-$(CONFIG_NFT_QUOTA) += nft_quota.o obj-$(CONFIG_NFT_REJECT) += nft_reject.o diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index 16ae92054baa..e7ba5b6dd2b7 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -353,7 +353,7 @@ ip_set_init_comment(struct ip_set *set, struct ip_set_comment *comment, c = kmalloc(sizeof(*c) + len + 1, GFP_ATOMIC); if (unlikely(!c)) return; - strlcpy(c->str, ext->comment, len + 1); + strscpy(c->str, ext->comment, len + 1); set->ext_size += sizeof(*c) + strlen(c->str) + 1; rcu_assign_pointer(comment->c, c); } @@ -1072,7 +1072,7 @@ static int ip_set_create(struct sk_buff *skb, const struct nfnl_info *info, if (!set) return -ENOMEM; spin_lock_init(&set->lock); - strlcpy(set->name, name, IPSET_MAXNAMELEN); + strscpy(set->name, name, IPSET_MAXNAMELEN); set->family = family; set->revision = revision; @@ -1719,11 +1719,13 @@ call_ad(struct net *net, struct sock *ctnl, struct sk_buff *skb, skb2 = nlmsg_new(payload, GFP_KERNEL); if (!skb2) return -ENOMEM; - rep = __nlmsg_put(skb2, NETLINK_CB(skb).portid, - nlh->nlmsg_seq, NLMSG_ERROR, payload, 0); + rep = nlmsg_put(skb2, NETLINK_CB(skb).portid, + nlh->nlmsg_seq, NLMSG_ERROR, payload, 0); errmsg = nlmsg_data(rep); errmsg->error = ret; - memcpy(&errmsg->msg, nlh, nlh->nlmsg_len); + unsafe_memcpy(&errmsg->msg, nlh, nlh->nlmsg_len, + /* Bounds checked by the skb layer. */); + cmdattr = (void *)&errmsg->msg + min_len; ret = nla_parse(cda, IPSET_ATTR_CMD_MAX, cmdattr, diff --git a/net/netfilter/ipvs/ip_vs_conn.c b/net/netfilter/ipvs/ip_vs_conn.c index fb67f1ca2495..8c04bb57dd6f 100644 --- a/net/netfilter/ipvs/ip_vs_conn.c +++ b/net/netfilter/ipvs/ip_vs_conn.c @@ -1308,7 +1308,7 @@ void ip_vs_random_dropentry(struct netns_ipvs *ipvs) * Randomly scan 1/32 of the whole table every second */ for (idx = 0; idx < (ip_vs_conn_tab_size>>5); idx++) { - unsigned int hash = prandom_u32() & ip_vs_conn_tab_mask; + unsigned int hash = get_random_u32() & ip_vs_conn_tab_mask; hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) { if (cp->ipvs != ipvs) diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c index 818b0b058b10..4d62059a6021 100644 --- a/net/netfilter/ipvs/ip_vs_ctl.c +++ b/net/netfilter/ipvs/ip_vs_ctl.c @@ -2296,13 +2296,13 @@ static int ip_vs_stats_percpu_show(struct seq_file *seq, void *v) u64 conns, inpkts, outpkts, inbytes, outbytes; do { - start = u64_stats_fetch_begin_irq(&u->syncp); + start = u64_stats_fetch_begin(&u->syncp); conns = u->cnt.conns; inpkts = u->cnt.inpkts; outpkts = u->cnt.outpkts; inbytes = u->cnt.inbytes; outbytes = u->cnt.outbytes; - } while (u64_stats_fetch_retry_irq(&u->syncp, start)); + } while (u64_stats_fetch_retry(&u->syncp, start)); seq_printf(seq, "%3X %8LX %8LX %8LX %16LX %16LX\n", i, (u64)conns, (u64)inpkts, @@ -2611,7 +2611,7 @@ ip_vs_copy_service(struct ip_vs_service_entry *dst, struct ip_vs_service *src) dst->addr = src->addr.ip; dst->port = src->port; dst->fwmark = src->fwmark; - strlcpy(dst->sched_name, sched_name, sizeof(dst->sched_name)); + strscpy(dst->sched_name, sched_name, sizeof(dst->sched_name)); dst->flags = src->flags; dst->timeout = src->timeout / HZ; dst->netmask = src->netmask; @@ -2805,13 +2805,13 @@ do_ip_vs_get_ctl(struct sock *sk, int cmd, void __user *user, int *len) mutex_lock(&ipvs->sync_mutex); if (ipvs->sync_state & IP_VS_STATE_MASTER) { d[0].state = IP_VS_STATE_MASTER; - strlcpy(d[0].mcast_ifn, ipvs->mcfg.mcast_ifn, + strscpy(d[0].mcast_ifn, ipvs->mcfg.mcast_ifn, sizeof(d[0].mcast_ifn)); d[0].syncid = ipvs->mcfg.syncid; } if (ipvs->sync_state & IP_VS_STATE_BACKUP) { d[1].state = IP_VS_STATE_BACKUP; - strlcpy(d[1].mcast_ifn, ipvs->bcfg.mcast_ifn, + strscpy(d[1].mcast_ifn, ipvs->bcfg.mcast_ifn, sizeof(d[1].mcast_ifn)); d[1].syncid = ipvs->bcfg.syncid; } @@ -3561,7 +3561,7 @@ static int ip_vs_genl_new_daemon(struct netns_ipvs *ipvs, struct nlattr **attrs) attrs[IPVS_DAEMON_ATTR_MCAST_IFN] && attrs[IPVS_DAEMON_ATTR_SYNC_ID])) return -EINVAL; - strlcpy(c.mcast_ifn, nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]), + strscpy(c.mcast_ifn, nla_data(attrs[IPVS_DAEMON_ATTR_MCAST_IFN]), sizeof(c.mcast_ifn)); c.syncid = nla_get_u32(attrs[IPVS_DAEMON_ATTR_SYNC_ID]); diff --git a/net/netfilter/ipvs/ip_vs_twos.c b/net/netfilter/ipvs/ip_vs_twos.c index acb55d8393ef..f2579fc9c75b 100644 --- a/net/netfilter/ipvs/ip_vs_twos.c +++ b/net/netfilter/ipvs/ip_vs_twos.c @@ -71,8 +71,8 @@ static struct ip_vs_dest *ip_vs_twos_schedule(struct ip_vs_service *svc, * from 0 to total_weight */ total_weight += 1; - rweight1 = prandom_u32() % total_weight; - rweight2 = prandom_u32() % total_weight; + rweight1 = prandom_u32_max(total_weight); + rweight2 = prandom_u32_max(total_weight); /* Pick two weighted servers */ list_for_each_entry_rcu(dest, &svc->destinations, n_list) { diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c index 1cd87b28c9b0..8639e7efd0e2 100644 --- a/net/netfilter/nf_conntrack_bpf.c +++ b/net/netfilter/nf_conntrack_bpf.c @@ -6,12 +6,14 @@ * are exposed through to BPF programs is explicitly unstable. */ +#include <linux/bpf_verifier.h> #include <linux/bpf.h> #include <linux/btf.h> +#include <linux/filter.h> +#include <linux/mutex.h> #include <linux/types.h> #include <linux/btf_ids.h> #include <linux/net_namespace.h> -#include <net/netfilter/nf_conntrack.h> #include <net/netfilter/nf_conntrack_bpf.h> #include <net/netfilter/nf_conntrack_core.h> @@ -134,7 +136,6 @@ __bpf_nf_ct_alloc_entry(struct net *net, struct bpf_sock_tuple *bpf_tuple, memset(&ct->proto, 0, sizeof(ct->proto)); __nf_ct_set_timeout(ct, timeout * HZ); - ct->status |= IPS_CONFIRMED; out: if (opts->netns_id >= 0) @@ -184,14 +185,58 @@ static struct nf_conn *__bpf_nf_ct_lookup(struct net *net, return ct; } +BTF_ID_LIST(btf_nf_conn_ids) +BTF_ID(struct, nf_conn) +BTF_ID(struct, nf_conn___init) + +/* Check writes into `struct nf_conn` */ +static int _nf_conntrack_btf_struct_access(struct bpf_verifier_log *log, + const struct btf *btf, + const struct btf_type *t, int off, + int size, enum bpf_access_type atype, + u32 *next_btf_id, + enum bpf_type_flag *flag) +{ + const struct btf_type *ncit; + const struct btf_type *nct; + size_t end; + + ncit = btf_type_by_id(btf, btf_nf_conn_ids[1]); + nct = btf_type_by_id(btf, btf_nf_conn_ids[0]); + + if (t != nct && t != ncit) { + bpf_log(log, "only read is supported\n"); + return -EACCES; + } + + /* `struct nf_conn` and `struct nf_conn___init` have the same layout + * so we are safe to simply merge offset checks here + */ + switch (off) { +#if defined(CONFIG_NF_CONNTRACK_MARK) + case offsetof(struct nf_conn, mark): + end = offsetofend(struct nf_conn, mark); + break; +#endif + default: + bpf_log(log, "no write support to nf_conn at off %d\n", off); + return -EACCES; + } + + if (off + size > end) { + bpf_log(log, + "write access at off %d with size %d beyond the member of nf_conn ended at %zu\n", + off, size, end); + return -EACCES; + } + + return 0; +} + __diag_push(); __diag_ignore_all("-Wmissing-prototypes", "Global functions as their definitions will be in nf_conntrack BTF"); -struct nf_conn___init { - struct nf_conn ct; -}; - /* bpf_xdp_ct_alloc - Allocate a new CT entry * * Parameters: @@ -339,6 +384,7 @@ struct nf_conn *bpf_ct_insert_entry(struct nf_conn___init *nfct_i) struct nf_conn *nfct = (struct nf_conn *)nfct_i; int err; + nfct->status |= IPS_CONFIRMED; err = nf_conntrack_hash_check_insert(nfct); if (err < 0) { nf_conntrack_free(nfct); @@ -449,5 +495,19 @@ int register_nf_conntrack_bpf(void) int ret; ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, &nf_conntrack_kfunc_set); - return ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &nf_conntrack_kfunc_set); + ret = ret ?: register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, &nf_conntrack_kfunc_set); + if (!ret) { + mutex_lock(&nf_conn_btf_access_lock); + nfct_btf_struct_access = _nf_conntrack_btf_struct_access; + mutex_unlock(&nf_conn_btf_access_lock); + } + + return ret; +} + +void cleanup_nf_conntrack_bpf(void) +{ + mutex_lock(&nf_conn_btf_access_lock); + nfct_btf_struct_access = NULL; + mutex_unlock(&nf_conn_btf_access_lock); } diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c index da65c6e8eeeb..f97bda06d2a9 100644 --- a/net/netfilter/nf_conntrack_core.c +++ b/net/netfilter/nf_conntrack_core.c @@ -67,6 +67,7 @@ struct conntrack_gc_work { struct delayed_work dwork; u32 next_bucket; u32 avg_timeout; + u32 count; u32 start_time; bool exiting; bool early_drop; @@ -85,10 +86,12 @@ static DEFINE_MUTEX(nf_conntrack_mutex); /* clamp timeouts to this value (TCP unacked) */ #define GC_SCAN_INTERVAL_CLAMP (300ul * HZ) -/* large initial bias so that we don't scan often just because we have - * three entries with a 1s timeout. +/* Initial bias pretending we have 100 entries at the upper bound so we don't + * wakeup often just because we have three entries with a 1s timeout while still + * allowing non-idle machines to wakeup more often when needed. */ -#define GC_SCAN_INTERVAL_INIT INT_MAX +#define GC_SCAN_INITIAL_COUNT 100 +#define GC_SCAN_INTERVAL_INIT GC_SCAN_INTERVAL_MAX #define GC_SCAN_MAX_DURATION msecs_to_jiffies(10) #define GC_SCAN_EXPIRED_MAX (64000u / HZ) @@ -1466,6 +1469,7 @@ static void gc_worker(struct work_struct *work) unsigned int expired_count = 0; unsigned long next_run; s32 delta_time; + long count; gc_work = container_of(work, struct conntrack_gc_work, dwork.work); @@ -1475,10 +1479,12 @@ static void gc_worker(struct work_struct *work) if (i == 0) { gc_work->avg_timeout = GC_SCAN_INTERVAL_INIT; + gc_work->count = GC_SCAN_INITIAL_COUNT; gc_work->start_time = start_time; } next_run = gc_work->avg_timeout; + count = gc_work->count; end_time = start_time + GC_SCAN_MAX_DURATION; @@ -1498,8 +1504,8 @@ static void gc_worker(struct work_struct *work) hlist_nulls_for_each_entry_rcu(h, n, &ct_hash[i], hnnode) { struct nf_conntrack_net *cnet; - unsigned long expires; struct net *net; + long expires; tmp = nf_ct_tuplehash_to_ctrack(h); @@ -1513,6 +1519,7 @@ static void gc_worker(struct work_struct *work) gc_work->next_bucket = i; gc_work->avg_timeout = next_run; + gc_work->count = count; delta_time = nfct_time_stamp - gc_work->start_time; @@ -1528,8 +1535,8 @@ static void gc_worker(struct work_struct *work) } expires = clamp(nf_ct_expires(tmp), GC_SCAN_INTERVAL_MIN, GC_SCAN_INTERVAL_CLAMP); + expires = (expires - (long)next_run) / ++count; next_run += expires; - next_run /= 2u; if (nf_conntrack_max95 == 0 || gc_worker_skip_ct(tmp)) continue; @@ -1570,6 +1577,7 @@ static void gc_worker(struct work_struct *work) delta_time = nfct_time_stamp - end_time; if (delta_time > 0 && i < hashsz) { gc_work->avg_timeout = next_run; + gc_work->count = count; gc_work->next_bucket = i; next_run = 0; goto early_exit; @@ -1782,7 +1790,7 @@ init_conntrack(struct net *net, struct nf_conn *tmpl, } spin_unlock_bh(&nf_conntrack_expect_lock); } - if (!exp) + if (!exp && tmpl) __nf_ct_try_assign_helper(ct, tmpl, GFP_ATOMIC); /* Other CPU might have obtained a pointer to this object before it was @@ -2068,10 +2076,6 @@ void nf_conntrack_alter_reply(struct nf_conn *ct, ct->tuplehash[IP_CT_DIR_REPLY].tuple = *newreply; if (ct->master || (help && !hlist_empty(&help->expectations))) return; - - rcu_read_lock(); - __nf_ct_try_assign_helper(ct, NULL, GFP_ATOMIC); - rcu_read_unlock(); } EXPORT_SYMBOL_GPL(nf_conntrack_alter_reply); @@ -2512,6 +2516,7 @@ static int kill_all(struct nf_conn *i, void *data) void nf_conntrack_cleanup_start(void) { + cleanup_nf_conntrack_bpf(); conntrack_gc_work.exiting = true; } @@ -2797,7 +2802,6 @@ int nf_conntrack_init_net(struct net *net) nf_conntrack_acct_pernet_init(net); nf_conntrack_tstamp_pernet_init(net); nf_conntrack_ecache_pernet_init(net); - nf_conntrack_helper_pernet_init(net); nf_conntrack_proto_pernet_init(net); return 0; diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c index 0d9332e9cf71..617f744a2e3a 100644 --- a/net/netfilter/nf_conntrack_ftp.c +++ b/net/netfilter/nf_conntrack_ftp.c @@ -33,6 +33,7 @@ MODULE_AUTHOR("Rusty Russell <rusty@rustcorp.com.au>"); MODULE_DESCRIPTION("ftp connection tracking helper"); MODULE_ALIAS("ip_conntrack_ftp"); MODULE_ALIAS_NFCT_HELPER(HELPER_NAME); +static DEFINE_SPINLOCK(nf_ftp_lock); #define MAX_PORTS 8 static u_int16_t ports[MAX_PORTS]; @@ -409,7 +410,8 @@ static int help(struct sk_buff *skb, } datalen = skb->len - dataoff; - spin_lock_bh(&ct->lock); + /* seqadj (nat) uses ct->lock internally, nf_nat_ftp would cause deadlock */ + spin_lock_bh(&nf_ftp_lock); fb_ptr = skb->data + dataoff; ends_in_nl = (fb_ptr[datalen - 1] == '\n'); @@ -538,7 +540,7 @@ out_update_nl: if (ends_in_nl) update_nl_seq(ct, seq, ct_ftp_info, dir, skb); out: - spin_unlock_bh(&ct->lock); + spin_unlock_bh(&nf_ftp_lock); return ret; } diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c index e96b32221444..ff737a76052e 100644 --- a/net/netfilter/nf_conntrack_helper.c +++ b/net/netfilter/nf_conntrack_helper.c @@ -35,11 +35,6 @@ unsigned int nf_ct_helper_hsize __read_mostly; EXPORT_SYMBOL_GPL(nf_ct_helper_hsize); static unsigned int nf_ct_helper_count __read_mostly; -static bool nf_ct_auto_assign_helper __read_mostly = false; -module_param_named(nf_conntrack_helper, nf_ct_auto_assign_helper, bool, 0644); -MODULE_PARM_DESC(nf_conntrack_helper, - "Enable automatic conntrack helper assignment (default 0)"); - static DEFINE_MUTEX(nf_ct_nat_helpers_mutex); static struct list_head nf_ct_nat_helpers __read_mostly; @@ -51,24 +46,6 @@ static unsigned int helper_hash(const struct nf_conntrack_tuple *tuple) (__force __u16)tuple->src.u.all) % nf_ct_helper_hsize; } -static struct nf_conntrack_helper * -__nf_ct_helper_find(const struct nf_conntrack_tuple *tuple) -{ - struct nf_conntrack_helper *helper; - struct nf_conntrack_tuple_mask mask = { .src.u.all = htons(0xFFFF) }; - unsigned int h; - - if (!nf_ct_helper_count) - return NULL; - - h = helper_hash(tuple); - hlist_for_each_entry_rcu(helper, &nf_ct_helper_hash[h], hnode) { - if (nf_ct_tuple_src_mask_cmp(tuple, &helper->tuple, &mask)) - return helper; - } - return NULL; -} - struct nf_conntrack_helper * __nf_conntrack_helper_find(const char *name, u16 l3num, u8 protonum) { @@ -209,33 +186,11 @@ nf_ct_helper_ext_add(struct nf_conn *ct, gfp_t gfp) } EXPORT_SYMBOL_GPL(nf_ct_helper_ext_add); -static struct nf_conntrack_helper * -nf_ct_lookup_helper(struct nf_conn *ct, struct net *net) -{ - struct nf_conntrack_net *cnet = nf_ct_pernet(net); - - if (!cnet->sysctl_auto_assign_helper) { - if (cnet->auto_assign_helper_warned) - return NULL; - if (!__nf_ct_helper_find(&ct->tuplehash[IP_CT_DIR_REPLY].tuple)) - return NULL; - pr_info("nf_conntrack: default automatic helper assignment " - "has been turned off for security reasons and CT-based " - "firewall rule not found. Use the iptables CT target " - "to attach helpers instead.\n"); - cnet->auto_assign_helper_warned = true; - return NULL; - } - - return __nf_ct_helper_find(&ct->tuplehash[IP_CT_DIR_REPLY].tuple); -} - int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl, gfp_t flags) { struct nf_conntrack_helper *helper = NULL; struct nf_conn_help *help; - struct net *net = nf_ct_net(ct); /* We already got a helper explicitly attached. The function * nf_conntrack_alter_reply - in case NAT is in use - asks for looking @@ -246,23 +201,21 @@ int __nf_ct_try_assign_helper(struct nf_conn *ct, struct nf_conn *tmpl, if (test_bit(IPS_HELPER_BIT, &ct->status)) return 0; - if (tmpl != NULL) { - help = nfct_help(tmpl); - if (help != NULL) { - helper = rcu_dereference(help->helper); - set_bit(IPS_HELPER_BIT, &ct->status); - } + if (WARN_ON_ONCE(!tmpl)) + return 0; + + help = nfct_help(tmpl); + if (help != NULL) { + helper = rcu_dereference(help->helper); + set_bit(IPS_HELPER_BIT, &ct->status); } help = nfct_help(ct); if (helper == NULL) { - helper = nf_ct_lookup_helper(ct, net); - if (helper == NULL) { - if (help) - RCU_INIT_POINTER(help->helper, NULL); - return 0; - } + if (help) + RCU_INIT_POINTER(help->helper, NULL); + return 0; } if (help == NULL) { @@ -545,19 +498,6 @@ void nf_nat_helper_unregister(struct nf_conntrack_nat_helper *nat) } EXPORT_SYMBOL_GPL(nf_nat_helper_unregister); -void nf_ct_set_auto_assign_helper_warned(struct net *net) -{ - nf_ct_pernet(net)->auto_assign_helper_warned = true; -} -EXPORT_SYMBOL_GPL(nf_ct_set_auto_assign_helper_warned); - -void nf_conntrack_helper_pernet_init(struct net *net) -{ - struct nf_conntrack_net *cnet = nf_ct_pernet(net); - - cnet->sysctl_auto_assign_helper = nf_ct_auto_assign_helper; -} - int nf_conntrack_helper_init(void) { nf_ct_helper_hsize = 1; /* gets rounded up to use one page */ diff --git a/net/netfilter/nf_conntrack_irc.c b/net/netfilter/nf_conntrack_irc.c index 1796c456ac98..5703846bea3b 100644 --- a/net/netfilter/nf_conntrack_irc.c +++ b/net/netfilter/nf_conntrack_irc.c @@ -157,15 +157,37 @@ static int help(struct sk_buff *skb, unsigned int protoff, data = ib_ptr; data_limit = ib_ptr + datalen; - /* strlen("\1DCC SENT t AAAAAAAA P\1\n")=24 - * 5+MINMATCHLEN+strlen("t AAAAAAAA P\1\n")=14 */ - while (data < data_limit - (19 + MINMATCHLEN)) { - if (memcmp(data, "\1DCC ", 5)) { + /* Skip any whitespace */ + while (data < data_limit - 10) { + if (*data == ' ' || *data == '\r' || *data == '\n') + data++; + else + break; + } + + /* strlen("PRIVMSG x ")=10 */ + if (data < data_limit - 10) { + if (strncasecmp("PRIVMSG ", data, 8)) + goto out; + data += 8; + } + + /* strlen(" :\1DCC SENT t AAAAAAAA P\1\n")=26 + * 7+MINMATCHLEN+strlen("t AAAAAAAA P\1\n")=26 + */ + while (data < data_limit - (21 + MINMATCHLEN)) { + /* Find first " :", the start of message */ + if (memcmp(data, " :", 2)) { data++; continue; } + data += 2; + + /* then check that place only for the DCC command */ + if (memcmp(data, "\1DCC ", 5)) + goto out; data += 5; - /* we have at least (19+MINMATCHLEN)-5 bytes valid data left */ + /* we have at least (21+MINMATCHLEN)-(2+5) bytes valid data left */ iph = ip_hdr(skb); pr_debug("DCC found in master %pI4:%u %pI4:%u\n", @@ -181,7 +203,7 @@ static int help(struct sk_buff *skb, unsigned int protoff, pr_debug("DCC %s detected\n", dccprotos[i]); /* we have at least - * (19+MINMATCHLEN)-5-dccprotos[i].matchlen bytes valid + * (21+MINMATCHLEN)-7-dccprotos[i].matchlen bytes valid * data left (== 14/13 bytes) */ if (parse_dcc(data, data_limit, &dcc_ip, &dcc_port, &addr_beg_p, &addr_end_p)) { @@ -194,8 +216,9 @@ static int help(struct sk_buff *skb, unsigned int protoff, /* dcc_ip can be the internal OR external (NAT'ed) IP */ tuple = &ct->tuplehash[dir].tuple; - if (tuple->src.u3.ip != dcc_ip && - tuple->dst.u3.ip != dcc_ip) { + if ((tuple->src.u3.ip != dcc_ip && + ct->tuplehash[!dir].tuple.dst.u3.ip != dcc_ip) || + dcc_port == 0) { net_warn_ratelimited("Forged DCC command from %pI4: %pI4:%u\n", &tuple->src.u3.ip, &dcc_ip, dcc_port); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index 04169b54f2a2..7562b215b932 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -2298,11 +2298,6 @@ ctnetlink_create_conntrack(struct net *net, ct->status |= IPS_HELPER; RCU_INIT_POINTER(help->helper, helper); } - } else { - /* try an implicit helper assignation */ - err = __nf_ct_try_assign_helper(ct, NULL, GFP_ATOMIC); - if (err < 0) - goto err2; } err = ctnetlink_setup_nat(ct, cda); diff --git a/net/netfilter/nf_conntrack_proto_tcp.c b/net/netfilter/nf_conntrack_proto_tcp.c index a634c72b1ffc..656631083177 100644 --- a/net/netfilter/nf_conntrack_proto_tcp.c +++ b/net/netfilter/nf_conntrack_proto_tcp.c @@ -47,6 +47,12 @@ static const char *const tcp_conntrack_names[] = { "SYN_SENT2", }; +enum nf_ct_tcp_action { + NFCT_TCP_IGNORE, + NFCT_TCP_INVALID, + NFCT_TCP_ACCEPT, +}; + #define SECS * HZ #define MINS * 60 SECS #define HOURS * 60 MINS @@ -472,23 +478,45 @@ static void tcp_init_sender(struct ip_ct_tcp_state *sender, } } -static bool tcp_in_window(struct nf_conn *ct, - enum ip_conntrack_dir dir, - unsigned int index, - const struct sk_buff *skb, - unsigned int dataoff, - const struct tcphdr *tcph, - const struct nf_hook_state *hook_state) +__printf(6, 7) +static enum nf_ct_tcp_action nf_tcp_log_invalid(const struct sk_buff *skb, + const struct nf_conn *ct, + const struct nf_hook_state *state, + const struct ip_ct_tcp_state *sender, + enum nf_ct_tcp_action ret, + const char *fmt, ...) +{ + const struct nf_tcp_net *tn = nf_tcp_pernet(nf_ct_net(ct)); + struct va_format vaf; + va_list args; + bool be_liberal; + + be_liberal = sender->flags & IP_CT_TCP_FLAG_BE_LIBERAL || tn->tcp_be_liberal; + if (be_liberal) + return NFCT_TCP_ACCEPT; + + va_start(args, fmt); + vaf.fmt = fmt; + vaf.va = &args; + nf_ct_l4proto_log_invalid(skb, ct, state, "%pV", &vaf); + va_end(args); + + return ret; +} + +static enum nf_ct_tcp_action +tcp_in_window(struct nf_conn *ct, enum ip_conntrack_dir dir, + unsigned int index, const struct sk_buff *skb, + unsigned int dataoff, const struct tcphdr *tcph, + const struct nf_hook_state *hook_state) { struct ip_ct_tcp *state = &ct->proto.tcp; - struct net *net = nf_ct_net(ct); - struct nf_tcp_net *tn = nf_tcp_pernet(net); struct ip_ct_tcp_state *sender = &state->seen[dir]; struct ip_ct_tcp_state *receiver = &state->seen[!dir]; __u32 seq, ack, sack, end, win, swin; - u16 win_raw; + bool in_recv_win, seq_ok; s32 receiver_offset; - bool res, in_recv_win; + u16 win_raw; /* * Get the required data from the packet. @@ -517,7 +545,7 @@ static bool tcp_in_window(struct nf_conn *ct, end, win); if (!tcph->ack) /* Simultaneous open */ - return true; + return NFCT_TCP_ACCEPT; } else { /* * We are in the middle of a connection, @@ -560,7 +588,7 @@ static bool tcp_in_window(struct nf_conn *ct, end, win); if (dir == IP_CT_DIR_REPLY && !tcph->ack) - return true; + return NFCT_TCP_ACCEPT; } if (!(tcph->ack)) { @@ -584,122 +612,166 @@ static bool tcp_in_window(struct nf_conn *ct, */ seq = end = sender->td_end; - /* Is the ending sequence in the receive window (if available)? */ - in_recv_win = !receiver->td_maxwin || - after(end, sender->td_end - receiver->td_maxwin - 1); - - if (before(seq, sender->td_maxend + 1) && - in_recv_win && - before(sack, receiver->td_end + 1) && - after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1)) { - /* - * Take into account window scaling (RFC 1323). - */ - if (!tcph->syn) - win <<= sender->td_scale; - - /* - * Update sender data. - */ - swin = win + (sack - ack); - if (sender->td_maxwin < swin) - sender->td_maxwin = swin; - if (after(end, sender->td_end)) { + seq_ok = before(seq, sender->td_maxend + 1); + if (!seq_ok) { + u32 overshot = end - sender->td_maxend + 1; + bool ack_ok; + + ack_ok = after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1); + in_recv_win = receiver->td_maxwin && + after(end, sender->td_end - receiver->td_maxwin - 1); + + if (in_recv_win && + ack_ok && + overshot <= receiver->td_maxwin && + before(sack, receiver->td_end + 1)) { + /* Work around TCPs that send more bytes than allowed by + * the receive window. + * + * If the (marked as invalid) packet is allowed to pass by + * the ruleset and the peer acks this data, then its possible + * all future packets will trigger 'ACK is over upper bound' check. + * + * Thus if only the sequence check fails then do update td_end so + * possible ACK for this data can update internal state. + */ sender->td_end = end; sender->flags |= IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED; - } - if (tcph->ack) { - if (!(sender->flags & IP_CT_TCP_FLAG_MAXACK_SET)) { - sender->td_maxack = ack; - sender->flags |= IP_CT_TCP_FLAG_MAXACK_SET; - } else if (after(ack, sender->td_maxack)) - sender->td_maxack = ack; + + return nf_tcp_log_invalid(skb, ct, hook_state, sender, NFCT_TCP_IGNORE, + "%u bytes more than expected", overshot); } - /* - * Update receiver data. - */ - if (receiver->td_maxwin != 0 && after(end, sender->td_maxend)) - receiver->td_maxwin += end - sender->td_maxend; - if (after(sack + win, receiver->td_maxend - 1)) { - receiver->td_maxend = sack + win; - if (win == 0) - receiver->td_maxend++; + return nf_tcp_log_invalid(skb, ct, hook_state, sender, NFCT_TCP_INVALID, + "SEQ is over upper bound %u (over the window of the receiver)", + sender->td_maxend + 1); + } + + if (!before(sack, receiver->td_end + 1)) + return nf_tcp_log_invalid(skb, ct, hook_state, sender, NFCT_TCP_INVALID, + "ACK is over upper bound %u (ACKed data not seen yet)", + receiver->td_end + 1); + + /* Is the ending sequence in the receive window (if available)? */ + in_recv_win = !receiver->td_maxwin || + after(end, sender->td_end - receiver->td_maxwin - 1); + if (!in_recv_win) + return nf_tcp_log_invalid(skb, ct, hook_state, sender, NFCT_TCP_IGNORE, + "SEQ is under lower bound %u (already ACKed data retransmitted)", + sender->td_end - receiver->td_maxwin - 1); + if (!after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1)) + return nf_tcp_log_invalid(skb, ct, hook_state, sender, NFCT_TCP_IGNORE, + "ignored ACK under lower bound %u (possible overly delayed)", + receiver->td_end - MAXACKWINDOW(sender) - 1); + + /* Take into account window scaling (RFC 1323). */ + if (!tcph->syn) + win <<= sender->td_scale; + + /* Update sender data. */ + swin = win + (sack - ack); + if (sender->td_maxwin < swin) + sender->td_maxwin = swin; + if (after(end, sender->td_end)) { + sender->td_end = end; + sender->flags |= IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED; + } + if (tcph->ack) { + if (!(sender->flags & IP_CT_TCP_FLAG_MAXACK_SET)) { + sender->td_maxack = ack; + sender->flags |= IP_CT_TCP_FLAG_MAXACK_SET; + } else if (after(ack, sender->td_maxack)) { + sender->td_maxack = ack; } - if (ack == receiver->td_end) - receiver->flags &= ~IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED; + } - /* - * Check retransmissions. - */ - if (index == TCP_ACK_SET) { - if (state->last_dir == dir - && state->last_seq == seq - && state->last_ack == ack - && state->last_end == end - && state->last_win == win_raw) - state->retrans++; - else { - state->last_dir = dir; - state->last_seq = seq; - state->last_ack = ack; - state->last_end = end; - state->last_win = win_raw; - state->retrans = 0; - } + /* Update receiver data. */ + if (receiver->td_maxwin != 0 && after(end, sender->td_maxend)) + receiver->td_maxwin += end - sender->td_maxend; + if (after(sack + win, receiver->td_maxend - 1)) { + receiver->td_maxend = sack + win; + if (win == 0) + receiver->td_maxend++; + } + if (ack == receiver->td_end) + receiver->flags &= ~IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED; + + /* Check retransmissions. */ + if (index == TCP_ACK_SET) { + if (state->last_dir == dir && + state->last_seq == seq && + state->last_ack == ack && + state->last_end == end && + state->last_win == win_raw) { + state->retrans++; + } else { + state->last_dir = dir; + state->last_seq = seq; + state->last_ack = ack; + state->last_end = end; + state->last_win = win_raw; + state->retrans = 0; } - res = true; - } else { - res = false; - if (sender->flags & IP_CT_TCP_FLAG_BE_LIBERAL || - tn->tcp_be_liberal) - res = true; - if (!res) { - bool seq_ok = before(seq, sender->td_maxend + 1); - - if (!seq_ok) { - u32 overshot = end - sender->td_maxend + 1; - bool ack_ok; - - ack_ok = after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1); - - if (in_recv_win && - ack_ok && - overshot <= receiver->td_maxwin && - before(sack, receiver->td_end + 1)) { - /* Work around TCPs that send more bytes than allowed by - * the receive window. - * - * If the (marked as invalid) packet is allowed to pass by - * the ruleset and the peer acks this data, then its possible - * all future packets will trigger 'ACK is over upper bound' check. - * - * Thus if only the sequence check fails then do update td_end so - * possible ACK for this data can update internal state. - */ - sender->td_end = end; - sender->flags |= IP_CT_TCP_FLAG_DATA_UNACKNOWLEDGED; - - nf_ct_l4proto_log_invalid(skb, ct, hook_state, - "%u bytes more than expected", overshot); - return res; - } - } + } + + return NFCT_TCP_ACCEPT; +} + +static void __cold nf_tcp_handle_invalid(struct nf_conn *ct, + enum ip_conntrack_dir dir, + int index, + const struct sk_buff *skb, + const struct nf_hook_state *hook_state) +{ + const unsigned int *timeouts; + const struct nf_tcp_net *tn; + unsigned int timeout; + u32 expires; + + if (!test_bit(IPS_ASSURED_BIT, &ct->status) || + test_bit(IPS_FIXED_TIMEOUT_BIT, &ct->status)) + return; + + /* We don't want to have connections hanging around in ESTABLISHED + * state for long time 'just because' conntrack deemed a FIN/RST + * out-of-window. + * + * Shrink the timeout just like when there is unacked data. + * This speeds up eviction of 'dead' connections where the + * connection and conntracks internal state are out of sync. + */ + switch (index) { + case TCP_RST_SET: + case TCP_FIN_SET: + break; + default: + return; + } + + if (ct->proto.tcp.last_dir != dir && + (ct->proto.tcp.last_index == TCP_FIN_SET || + ct->proto.tcp.last_index == TCP_RST_SET)) { + expires = nf_ct_expires(ct); + if (expires < 120 * HZ) + return; + + tn = nf_tcp_pernet(nf_ct_net(ct)); + timeouts = nf_ct_timeout_lookup(ct); + if (!timeouts) + timeouts = tn->timeouts; + timeout = READ_ONCE(timeouts[TCP_CONNTRACK_UNACK]); + if (expires > timeout) { nf_ct_l4proto_log_invalid(skb, ct, hook_state, - "%s", - before(seq, sender->td_maxend + 1) ? - in_recv_win ? - before(sack, receiver->td_end + 1) ? - after(sack, receiver->td_end - MAXACKWINDOW(sender) - 1) ? "BUG" - : "ACK is under the lower bound (possible overly delayed ACK)" - : "ACK is over the upper bound (ACKed data not seen yet)" - : "SEQ is under the lower bound (already ACKed data retransmitted)" - : "SEQ is over the upper bound (over the window of the receiver)"); + "packet (index %d, dir %d) response for index %d lower timeout to %u", + index, dir, ct->proto.tcp.last_index, timeout); + + WRITE_ONCE(ct->timeout, timeout + nfct_time_stamp); } + } else { + ct->proto.tcp.last_index = index; + ct->proto.tcp.last_dir = dir; } - - return res; } /* table of valid flag combinations - PUSH, ECE and CWR are always valid */ @@ -861,6 +933,7 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct, struct nf_conntrack_tuple *tuple; enum tcp_conntrack new_state, old_state; unsigned int index, *timeouts; + enum nf_ct_tcp_action res; enum ip_conntrack_dir dir; const struct tcphdr *th; struct tcphdr _tcph; @@ -1126,10 +1199,18 @@ int nf_conntrack_tcp_packet(struct nf_conn *ct, break; } - if (!tcp_in_window(ct, dir, index, - skb, dataoff, th, state)) { + res = tcp_in_window(ct, dir, index, + skb, dataoff, th, state); + switch (res) { + case NFCT_TCP_IGNORE: + spin_unlock_bh(&ct->lock); + return NF_ACCEPT; + case NFCT_TCP_INVALID: + nf_tcp_handle_invalid(ct, dir, index, skb, state); spin_unlock_bh(&ct->lock); return -NF_ACCEPT; + case NFCT_TCP_ACCEPT: + break; } in_window: /* From now on we have got in-window packets */ diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c index daf06f71d31c..77f5e82d8e3f 100644 --- a/net/netfilter/nf_conntrack_sip.c +++ b/net/netfilter/nf_conntrack_sip.c @@ -477,7 +477,7 @@ static int ct_sip_walk_headers(const struct nf_conn *ct, const char *dptr, return ret; if (ret == 0) break; - dataoff += *matchoff; + dataoff = *matchoff; } *in_header = 0; } @@ -489,7 +489,7 @@ static int ct_sip_walk_headers(const struct nf_conn *ct, const char *dptr, break; if (ret == 0) return ret; - dataoff += *matchoff; + dataoff = *matchoff; } if (in_header) diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c index 05895878610c..4ffe84c5a82c 100644 --- a/net/netfilter/nf_conntrack_standalone.c +++ b/net/netfilter/nf_conntrack_standalone.c @@ -561,7 +561,6 @@ enum nf_ct_sysctl_index { NF_SYSCTL_CT_LOG_INVALID, NF_SYSCTL_CT_EXPECT_MAX, NF_SYSCTL_CT_ACCT, - NF_SYSCTL_CT_HELPER, #ifdef CONFIG_NF_CONNTRACK_EVENTS NF_SYSCTL_CT_EVENTS, #endif @@ -680,14 +679,6 @@ static struct ctl_table nf_ct_sysctl_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_ONE, }, - [NF_SYSCTL_CT_HELPER] = { - .procname = "nf_conntrack_helper", - .maxlen = sizeof(u8), - .mode = 0644, - .proc_handler = proc_dou8vec_minmax, - .extra1 = SYSCTL_ZERO, - .extra2 = SYSCTL_ONE, - }, #ifdef CONFIG_NF_CONNTRACK_EVENTS [NF_SYSCTL_CT_EVENTS] = { .procname = "nf_conntrack_events", @@ -1100,7 +1091,6 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net) table[NF_SYSCTL_CT_CHECKSUM].data = &net->ct.sysctl_checksum; table[NF_SYSCTL_CT_LOG_INVALID].data = &net->ct.sysctl_log_invalid; table[NF_SYSCTL_CT_ACCT].data = &net->ct.sysctl_acct; - table[NF_SYSCTL_CT_HELPER].data = &cnet->sysctl_auto_assign_helper; #ifdef CONFIG_NF_CONNTRACK_EVENTS table[NF_SYSCTL_CT_EVENTS].data = &net->ct.sysctl_events; #endif diff --git a/net/netfilter/nf_log.c b/net/netfilter/nf_log.c index edee7fa944c1..8a29290149bd 100644 --- a/net/netfilter/nf_log.c +++ b/net/netfilter/nf_log.c @@ -443,9 +443,9 @@ static int nf_log_proc_dostring(struct ctl_table *table, int write, mutex_lock(&nf_log_mutex); logger = nft_log_dereference(net->nf.nf_loggers[tindex]); if (!logger) - strlcpy(buf, "NONE", sizeof(buf)); + strscpy(buf, "NONE", sizeof(buf)); else - strlcpy(buf, logger->name, sizeof(buf)); + strscpy(buf, logger->name, sizeof(buf)); mutex_unlock(&nf_log_mutex); r = proc_dostring(&tmp, write, buffer, lenp, ppos); } diff --git a/net/netfilter/nf_nat_amanda.c b/net/netfilter/nf_nat_amanda.c index 3bc7e0854efe..98deef6cde69 100644 --- a/net/netfilter/nf_nat_amanda.c +++ b/net/netfilter/nf_nat_amanda.c @@ -44,19 +44,7 @@ static unsigned int help(struct sk_buff *skb, exp->expectfn = nf_nat_follow_master; /* Try to get same port: if not, try to change it. */ - for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) { - int res; - - exp->tuple.dst.u.tcp.port = htons(port); - res = nf_ct_expect_related(exp, 0); - if (res == 0) - break; - else if (res != -EBUSY) { - port = 0; - break; - } - } - + port = nf_nat_exp_find_port(exp, ntohs(exp->saved_proto.tcp.port)); if (port == 0) { nf_ct_helper_log(skb, exp->master, "all ports in use"); return NF_DROP; diff --git a/net/netfilter/nf_nat_bpf.c b/net/netfilter/nf_nat_bpf.c new file mode 100644 index 000000000000..0fa5a0bbb0ff --- /dev/null +++ b/net/netfilter/nf_nat_bpf.c @@ -0,0 +1,79 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* Unstable NAT Helpers for XDP and TC-BPF hook + * + * These are called from the XDP and SCHED_CLS BPF programs. Note that it is + * allowed to break compatibility for these functions since the interface they + * are exposed through to BPF programs is explicitly unstable. + */ + +#include <linux/bpf.h> +#include <linux/btf_ids.h> +#include <net/netfilter/nf_conntrack_bpf.h> +#include <net/netfilter/nf_conntrack_core.h> +#include <net/netfilter/nf_nat.h> + +__diag_push(); +__diag_ignore_all("-Wmissing-prototypes", + "Global functions as their definitions will be in nf_nat BTF"); + +/* bpf_ct_set_nat_info - Set source or destination nat address + * + * Set source or destination nat address of the newly allocated + * nf_conn before insertion. This must be invoked for referenced + * PTR_TO_BTF_ID to nf_conn___init. + * + * Parameters: + * @nfct - Pointer to referenced nf_conn object, obtained using + * bpf_xdp_ct_alloc or bpf_skb_ct_alloc. + * @addr - Nat source/destination address + * @port - Nat source/destination port. Non-positive values are + * interpreted as select a random port. + * @manip - NF_NAT_MANIP_SRC or NF_NAT_MANIP_DST + */ +int bpf_ct_set_nat_info(struct nf_conn___init *nfct, + union nf_inet_addr *addr, int port, + enum nf_nat_manip_type manip) +{ + struct nf_conn *ct = (struct nf_conn *)nfct; + u16 proto = nf_ct_l3num(ct); + struct nf_nat_range2 range; + + if (proto != NFPROTO_IPV4 && proto != NFPROTO_IPV6) + return -EINVAL; + + memset(&range, 0, sizeof(struct nf_nat_range2)); + range.flags = NF_NAT_RANGE_MAP_IPS; + range.min_addr = *addr; + range.max_addr = range.min_addr; + if (port > 0) { + range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED; + range.min_proto.all = cpu_to_be16(port); + range.max_proto.all = range.min_proto.all; + } + + return nf_nat_setup_info(ct, &range, manip) == NF_DROP ? -ENOMEM : 0; +} + +__diag_pop() + +BTF_SET8_START(nf_nat_kfunc_set) +BTF_ID_FLAGS(func, bpf_ct_set_nat_info, KF_TRUSTED_ARGS) +BTF_SET8_END(nf_nat_kfunc_set) + +static const struct btf_kfunc_id_set nf_bpf_nat_kfunc_set = { + .owner = THIS_MODULE, + .set = &nf_nat_kfunc_set, +}; + +int register_nf_nat_bpf(void) +{ + int ret; + + ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_XDP, + &nf_bpf_nat_kfunc_set); + if (ret) + return ret; + + return register_btf_kfunc_id_set(BPF_PROG_TYPE_SCHED_CLS, + &nf_bpf_nat_kfunc_set); +} diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 7981be526f26..18319a6e6806 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -16,7 +16,7 @@ #include <linux/siphash.h> #include <linux/rtnetlink.h> -#include <net/netfilter/nf_conntrack.h> +#include <net/netfilter/nf_conntrack_bpf.h> #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_helper.h> #include <net/netfilter/nf_conntrack_seqadj.h> @@ -468,7 +468,7 @@ find_free_id: if (range->flags & NF_NAT_RANGE_PROTO_OFFSET) off = (ntohs(*keyptr) - ntohs(range->base_proto.all)); else - off = prandom_u32(); + off = get_random_u16(); attempts = range_size; if (attempts > max_attempts) @@ -490,7 +490,7 @@ another_round: if (attempts >= range_size || attempts < 16) return; attempts /= 2; - off = prandom_u32(); + off = get_random_u16(); goto another_round; } @@ -1152,7 +1152,7 @@ static int __init nf_nat_init(void) WARN_ON(nf_nat_hook != NULL); RCU_INIT_POINTER(nf_nat_hook, &nat_hook); - return 0; + return register_nf_nat_bpf(); } static void __exit nf_nat_cleanup(void) diff --git a/net/netfilter/nf_nat_ftp.c b/net/netfilter/nf_nat_ftp.c index aace6768a64e..c92a436d9c48 100644 --- a/net/netfilter/nf_nat_ftp.c +++ b/net/netfilter/nf_nat_ftp.c @@ -86,22 +86,9 @@ static unsigned int nf_nat_ftp(struct sk_buff *skb, * this one. */ exp->expectfn = nf_nat_follow_master; - /* Try to get same port: if not, try to change it. */ - for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) { - int ret; - - exp->tuple.dst.u.tcp.port = htons(port); - ret = nf_ct_expect_related(exp, 0); - if (ret == 0) - break; - else if (ret != -EBUSY) { - port = 0; - break; - } - } - + port = nf_nat_exp_find_port(exp, ntohs(exp->saved_proto.tcp.port)); if (port == 0) { - nf_ct_helper_log(skb, ct, "all ports in use"); + nf_ct_helper_log(skb, exp->master, "all ports in use"); return NF_DROP; } diff --git a/net/netfilter/nf_nat_helper.c b/net/netfilter/nf_nat_helper.c index a263505455fc..a95a25196943 100644 --- a/net/netfilter/nf_nat_helper.c +++ b/net/netfilter/nf_nat_helper.c @@ -198,3 +198,34 @@ void nf_nat_follow_master(struct nf_conn *ct, nf_nat_setup_info(ct, &range, NF_NAT_MANIP_DST); } EXPORT_SYMBOL(nf_nat_follow_master); + +u16 nf_nat_exp_find_port(struct nf_conntrack_expect *exp, u16 port) +{ + static const unsigned int max_attempts = 128; + int range, attempts_left; + u16 min = port; + + range = USHRT_MAX - port; + attempts_left = range; + + if (attempts_left > max_attempts) + attempts_left = max_attempts; + + /* Try to get same port: if not, try to change it. */ + for (;;) { + int res; + + exp->tuple.dst.u.tcp.port = htons(port); + res = nf_ct_expect_related(exp, 0); + if (res == 0) + return port; + + if (res != -EBUSY || (--attempts_left < 0)) + break; + + port = min + prandom_u32_max(range); + } + + return 0; +} +EXPORT_SYMBOL_GPL(nf_nat_exp_find_port); diff --git a/net/netfilter/nf_nat_irc.c b/net/netfilter/nf_nat_irc.c index c691ab8d234c..19c4fcc60c50 100644 --- a/net/netfilter/nf_nat_irc.c +++ b/net/netfilter/nf_nat_irc.c @@ -48,20 +48,8 @@ static unsigned int help(struct sk_buff *skb, exp->dir = IP_CT_DIR_REPLY; exp->expectfn = nf_nat_follow_master; - /* Try to get same port: if not, try to change it. */ - for (port = ntohs(exp->saved_proto.tcp.port); port != 0; port++) { - int ret; - - exp->tuple.dst.u.tcp.port = htons(port); - ret = nf_ct_expect_related(exp, 0); - if (ret == 0) - break; - else if (ret != -EBUSY) { - port = 0; - break; - } - } - + port = nf_nat_exp_find_port(exp, + ntohs(exp->saved_proto.tcp.port)); if (port == 0) { nf_ct_helper_log(skb, ct, "all ports in use"); return NF_DROP; diff --git a/net/netfilter/nf_nat_sip.c b/net/netfilter/nf_nat_sip.c index f0a735e86851..cf4aeb299bde 100644 --- a/net/netfilter/nf_nat_sip.c +++ b/net/netfilter/nf_nat_sip.c @@ -410,19 +410,7 @@ static unsigned int nf_nat_sip_expect(struct sk_buff *skb, unsigned int protoff, exp->dir = !dir; exp->expectfn = nf_nat_sip_expected; - for (; port != 0; port++) { - int ret; - - exp->tuple.dst.u.udp.port = htons(port); - ret = nf_ct_expect_related(exp, NF_CT_EXP_F_SKIP_MASTER); - if (ret == 0) - break; - else if (ret != -EBUSY) { - port = 0; - break; - } - } - + port = nf_nat_exp_find_port(exp, port); if (port == 0) { nf_ct_helper_log(skb, ct, "all ports in use for SIP"); return NF_DROP; diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c index 2ee50e23c9b7..62da204eed41 100644 --- a/net/netfilter/nf_tables_api.c +++ b/net/netfilter/nf_tables_api.c @@ -742,7 +742,7 @@ __printf(2, 3) int nft_request_module(struct net *net, const char *fmt, return -ENOMEM; req->done = false; - strlcpy(req->module, module_name, MODULE_NAME_LEN); + strscpy(req->module, module_name, MODULE_NAME_LEN); list_add_tail(&req->list, &nft_net->module_list); return -EAGAIN; @@ -1534,10 +1534,10 @@ static int nft_dump_stats(struct sk_buff *skb, struct nft_stats __percpu *stats) for_each_possible_cpu(cpu) { cpu_stats = per_cpu_ptr(stats, cpu); do { - seq = u64_stats_fetch_begin_irq(&cpu_stats->syncp); + seq = u64_stats_fetch_begin(&cpu_stats->syncp); pkts = cpu_stats->pkts; bytes = cpu_stats->bytes; - } while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, seq)); + } while (u64_stats_fetch_retry(&cpu_stats->syncp, seq)); total.pkts += pkts; total.bytes += bytes; } @@ -2166,8 +2166,10 @@ static int nft_basechain_init(struct nft_base_chain *basechain, u8 family, chain->flags |= NFT_CHAIN_BASE | flags; basechain->policy = NF_ACCEPT; if (chain->flags & NFT_CHAIN_HW_OFFLOAD && - !nft_chain_offload_support(basechain)) + !nft_chain_offload_support(basechain)) { + list_splice_init(&basechain->hook_list, &hook->list); return -EOPNOTSUPP; + } flow_block_init(&basechain->flow_block); @@ -2195,7 +2197,6 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, struct netlink_ext_ack *extack) { const struct nlattr * const *nla = ctx->nla; - struct nft_stats __percpu *stats = NULL; struct nft_table *table = ctx->table; struct nft_base_chain *basechain; struct net *net = ctx->net; @@ -2210,6 +2211,7 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, return -EOVERFLOW; if (nla[NFTA_CHAIN_HOOK]) { + struct nft_stats __percpu *stats = NULL; struct nft_chain_hook hook; if (flags & NFT_CHAIN_BINDING) @@ -2241,8 +2243,11 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, if (err < 0) { nft_chain_release_hook(&hook); kfree(basechain); + free_percpu(stats); return err; } + if (stats) + static_branch_inc(&nft_counters_enabled); } else { if (flags & NFT_CHAIN_BASE) return -EINVAL; @@ -2317,9 +2322,6 @@ static int nf_tables_addchain(struct nft_ctx *ctx, u8 family, u8 genmask, goto err_unregister_hook; } - if (stats) - static_branch_inc(&nft_counters_enabled); - table->use++; return 0; @@ -2855,6 +2857,43 @@ err1: return err; } +int nft_expr_inner_parse(const struct nft_ctx *ctx, const struct nlattr *nla, + struct nft_expr_info *info) +{ + struct nlattr *tb[NFTA_EXPR_MAX + 1]; + const struct nft_expr_type *type; + int err; + + err = nla_parse_nested_deprecated(tb, NFTA_EXPR_MAX, nla, + nft_expr_policy, NULL); + if (err < 0) + return err; + + if (!tb[NFTA_EXPR_DATA]) + return -EINVAL; + + type = __nft_expr_type_get(ctx->family, tb[NFTA_EXPR_NAME]); + if (IS_ERR(type)) + return PTR_ERR(type); + + if (!type->inner_ops) + return -EOPNOTSUPP; + + err = nla_parse_nested_deprecated(info->tb, type->maxattr, + tb[NFTA_EXPR_DATA], + type->policy, NULL); + if (err < 0) + goto err_nla_parse; + + info->attr = nla; + info->ops = type->inner_ops; + + return 0; + +err_nla_parse: + return err; +} + static int nf_tables_newexpr(const struct nft_ctx *ctx, const struct nft_expr_info *expr_info, struct nft_expr *expr) @@ -5863,8 +5902,9 @@ static bool nft_setelem_valid_key_end(const struct nft_set *set, (NFT_SET_CONCAT | NFT_SET_INTERVAL)) { if (flags & NFT_SET_ELEM_INTERVAL_END) return false; - if (!nla[NFTA_SET_ELEM_KEY_END] && - !(flags & NFT_SET_ELEM_CATCHALL)) + + if (nla[NFTA_SET_ELEM_KEY_END] && + flags & NFT_SET_ELEM_CATCHALL) return false; } else { if (nla[NFTA_SET_ELEM_KEY_END]) diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c index cee3e4e905ec..709a736c301c 100644 --- a/net/netfilter/nf_tables_core.c +++ b/net/netfilter/nf_tables_core.c @@ -340,6 +340,8 @@ static struct nft_expr_type *nft_basic_types[] = { &nft_exthdr_type, &nft_last_type, &nft_counter_type, + &nft_objref_type, + &nft_inner_type, }; static struct nft_object_type *nft_basic_objects[] = { diff --git a/net/netfilter/nfnetlink_osf.c b/net/netfilter/nfnetlink_osf.c index 0fa2e2030427..ee6840bd5933 100644 --- a/net/netfilter/nfnetlink_osf.c +++ b/net/netfilter/nfnetlink_osf.c @@ -269,6 +269,7 @@ bool nf_osf_find(const struct sk_buff *skb, struct nf_osf_hdr_ctx ctx; const struct tcphdr *tcp; struct tcphdr _tcph; + bool found = false; memset(&ctx, 0, sizeof(ctx)); @@ -283,10 +284,11 @@ bool nf_osf_find(const struct sk_buff *skb, data->genre = f->genre; data->version = f->version; + found = true; break; } - return true; + return found; } EXPORT_SYMBOL_GPL(nf_osf_find); diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c index b04995c3e17f..a3f01f209a53 100644 --- a/net/netfilter/nft_ct.c +++ b/net/netfilter/nft_ct.c @@ -1089,9 +1089,6 @@ static int nft_ct_helper_obj_init(const struct nft_ctx *ctx, if (err < 0) goto err_put_helper; - /* Avoid the bogus warning, helper will be assigned after CT init */ - nf_ct_set_auto_assign_helper_warned(ctx->net); - return 0; err_put_helper: diff --git a/net/netfilter/nft_inner.c b/net/netfilter/nft_inner.c new file mode 100644 index 000000000000..eae7caeff316 --- /dev/null +++ b/net/netfilter/nft_inner.c @@ -0,0 +1,384 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2022 Pablo Neira Ayuso <pablo@netfilter.org> + */ + +#include <linux/kernel.h> +#include <linux/if_vlan.h> +#include <linux/init.h> +#include <linux/module.h> +#include <linux/netlink.h> +#include <linux/netfilter.h> +#include <linux/netfilter/nf_tables.h> +#include <net/netfilter/nf_tables_core.h> +#include <net/netfilter/nf_tables.h> +#include <net/netfilter/nft_meta.h> +#include <net/netfilter/nf_tables_offload.h> +#include <linux/tcp.h> +#include <linux/udp.h> +#include <net/gre.h> +#include <net/geneve.h> +#include <net/ip.h> +#include <linux/icmpv6.h> +#include <linux/ip.h> +#include <linux/ipv6.h> + +static DEFINE_PER_CPU(struct nft_inner_tun_ctx, nft_pcpu_tun_ctx); + +/* Same layout as nft_expr but it embeds the private expression data area. */ +struct __nft_expr { + const struct nft_expr_ops *ops; + union { + struct nft_payload payload; + struct nft_meta meta; + } __attribute__((aligned(__alignof__(u64)))); +}; + +enum { + NFT_INNER_EXPR_PAYLOAD, + NFT_INNER_EXPR_META, +}; + +struct nft_inner { + u8 flags; + u8 hdrsize; + u8 type; + u8 expr_type; + + struct __nft_expr expr; +}; + +static int nft_inner_parse_l2l3(const struct nft_inner *priv, + const struct nft_pktinfo *pkt, + struct nft_inner_tun_ctx *ctx, u32 off) +{ + __be16 llproto, outer_llproto; + u32 nhoff, thoff; + + if (priv->flags & NFT_INNER_LL) { + struct vlan_ethhdr *veth, _veth; + struct ethhdr *eth, _eth; + u32 hdrsize; + + eth = skb_header_pointer(pkt->skb, off, sizeof(_eth), &_eth); + if (!eth) + return -1; + + switch (eth->h_proto) { + case htons(ETH_P_IP): + case htons(ETH_P_IPV6): + llproto = eth->h_proto; + hdrsize = sizeof(_eth); + break; + case htons(ETH_P_8021Q): + veth = skb_header_pointer(pkt->skb, off, sizeof(_veth), &_veth); + if (!eth) + return -1; + + outer_llproto = veth->h_vlan_encapsulated_proto; + llproto = veth->h_vlan_proto; + hdrsize = sizeof(_veth); + break; + default: + return -1; + } + + ctx->inner_lloff = off; + ctx->flags |= NFT_PAYLOAD_CTX_INNER_LL; + off += hdrsize; + } else { + struct iphdr *iph; + u32 _version; + + iph = skb_header_pointer(pkt->skb, off, sizeof(_version), &_version); + if (!iph) + return -1; + + switch (iph->version) { + case 4: + llproto = htons(ETH_P_IP); + break; + case 6: + llproto = htons(ETH_P_IPV6); + break; + default: + return -1; + } + } + + ctx->llproto = llproto; + if (llproto == htons(ETH_P_8021Q)) + llproto = outer_llproto; + + nhoff = off; + + switch (llproto) { + case htons(ETH_P_IP): { + struct iphdr *iph, _iph; + + iph = skb_header_pointer(pkt->skb, nhoff, sizeof(_iph), &_iph); + if (!iph) + return -1; + + if (iph->ihl < 5 || iph->version != 4) + return -1; + + ctx->inner_nhoff = nhoff; + ctx->flags |= NFT_PAYLOAD_CTX_INNER_NH; + + thoff = nhoff + (iph->ihl * 4); + if ((ntohs(iph->frag_off) & IP_OFFSET) == 0) { + ctx->flags |= NFT_PAYLOAD_CTX_INNER_TH; + ctx->inner_thoff = thoff; + ctx->l4proto = iph->protocol; + } + } + break; + case htons(ETH_P_IPV6): { + struct ipv6hdr *ip6h, _ip6h; + int fh_flags = IP6_FH_F_AUTH; + unsigned short fragoff; + int l4proto; + + ip6h = skb_header_pointer(pkt->skb, nhoff, sizeof(_ip6h), &_ip6h); + if (!ip6h) + return -1; + + if (ip6h->version != 6) + return -1; + + ctx->inner_nhoff = nhoff; + ctx->flags |= NFT_PAYLOAD_CTX_INNER_NH; + + thoff = nhoff; + l4proto = ipv6_find_hdr(pkt->skb, &thoff, -1, &fragoff, &fh_flags); + if (l4proto < 0 || thoff > U16_MAX) + return -1; + + if (fragoff == 0) { + thoff = nhoff + sizeof(_ip6h); + ctx->flags |= NFT_PAYLOAD_CTX_INNER_TH; + ctx->inner_thoff = thoff; + ctx->l4proto = l4proto; + } + } + break; + default: + return -1; + } + + return 0; +} + +static int nft_inner_parse_tunhdr(const struct nft_inner *priv, + const struct nft_pktinfo *pkt, + struct nft_inner_tun_ctx *ctx, u32 *off) +{ + if (pkt->tprot == IPPROTO_GRE) { + ctx->inner_tunoff = pkt->thoff; + ctx->flags |= NFT_PAYLOAD_CTX_INNER_TUN; + return 0; + } + + if (pkt->tprot != IPPROTO_UDP) + return -1; + + ctx->inner_tunoff = *off; + ctx->flags |= NFT_PAYLOAD_CTX_INNER_TUN; + *off += priv->hdrsize; + + switch (priv->type) { + case NFT_INNER_GENEVE: { + struct genevehdr *gnvh, _gnvh; + + gnvh = skb_header_pointer(pkt->skb, pkt->inneroff, + sizeof(_gnvh), &_gnvh); + if (!gnvh) + return -1; + + *off += gnvh->opt_len * 4; + } + break; + default: + break; + } + + return 0; +} + +static int nft_inner_parse(const struct nft_inner *priv, + struct nft_pktinfo *pkt, + struct nft_inner_tun_ctx *tun_ctx) +{ + struct nft_inner_tun_ctx ctx = {}; + u32 off = pkt->inneroff; + + if (priv->flags & NFT_INNER_HDRSIZE && + nft_inner_parse_tunhdr(priv, pkt, &ctx, &off) < 0) + return -1; + + if (priv->flags & (NFT_INNER_LL | NFT_INNER_NH)) { + if (nft_inner_parse_l2l3(priv, pkt, &ctx, off) < 0) + return -1; + } else if (priv->flags & NFT_INNER_TH) { + ctx.inner_thoff = off; + ctx.flags |= NFT_PAYLOAD_CTX_INNER_TH; + } + + *tun_ctx = ctx; + tun_ctx->type = priv->type; + pkt->flags |= NFT_PKTINFO_INNER_FULL; + + return 0; +} + +static bool nft_inner_parse_needed(const struct nft_inner *priv, + const struct nft_pktinfo *pkt, + const struct nft_inner_tun_ctx *tun_ctx) +{ + if (!(pkt->flags & NFT_PKTINFO_INNER_FULL)) + return true; + + if (priv->type != tun_ctx->type) + return true; + + return false; +} + +static void nft_inner_eval(const struct nft_expr *expr, struct nft_regs *regs, + const struct nft_pktinfo *pkt) +{ + struct nft_inner_tun_ctx *tun_ctx = this_cpu_ptr(&nft_pcpu_tun_ctx); + const struct nft_inner *priv = nft_expr_priv(expr); + + if (nft_payload_inner_offset(pkt) < 0) + goto err; + + if (nft_inner_parse_needed(priv, pkt, tun_ctx) && + nft_inner_parse(priv, (struct nft_pktinfo *)pkt, tun_ctx) < 0) + goto err; + + switch (priv->expr_type) { + case NFT_INNER_EXPR_PAYLOAD: + nft_payload_inner_eval((struct nft_expr *)&priv->expr, regs, pkt, tun_ctx); + break; + case NFT_INNER_EXPR_META: + nft_meta_inner_eval((struct nft_expr *)&priv->expr, regs, pkt, tun_ctx); + break; + default: + WARN_ON_ONCE(1); + goto err; + } + return; +err: + regs->verdict.code = NFT_BREAK; +} + +static const struct nla_policy nft_inner_policy[NFTA_INNER_MAX + 1] = { + [NFTA_INNER_NUM] = { .type = NLA_U32 }, + [NFTA_INNER_FLAGS] = { .type = NLA_U32 }, + [NFTA_INNER_HDRSIZE] = { .type = NLA_U32 }, + [NFTA_INNER_TYPE] = { .type = NLA_U32 }, + [NFTA_INNER_EXPR] = { .type = NLA_NESTED }, +}; + +struct nft_expr_info { + const struct nft_expr_ops *ops; + const struct nlattr *attr; + struct nlattr *tb[NFT_EXPR_MAXATTR + 1]; +}; + +static int nft_inner_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_inner *priv = nft_expr_priv(expr); + u32 flags, hdrsize, type, num; + struct nft_expr_info expr_info; + int err; + + if (!tb[NFTA_INNER_FLAGS] || + !tb[NFTA_INNER_HDRSIZE] || + !tb[NFTA_INNER_TYPE] || + !tb[NFTA_INNER_EXPR]) + return -EINVAL; + + flags = ntohl(nla_get_be32(tb[NFTA_INNER_FLAGS])); + if (flags & ~NFT_INNER_MASK) + return -EOPNOTSUPP; + + num = ntohl(nla_get_be32(tb[NFTA_INNER_NUM])); + if (num != 0) + return -EOPNOTSUPP; + + hdrsize = ntohl(nla_get_be32(tb[NFTA_INNER_HDRSIZE])); + type = ntohl(nla_get_be32(tb[NFTA_INNER_TYPE])); + + if (type > U8_MAX) + return -EINVAL; + + if (flags & NFT_INNER_HDRSIZE) { + if (hdrsize == 0 || hdrsize > 64) + return -EOPNOTSUPP; + } + + priv->flags = flags; + priv->hdrsize = hdrsize; + priv->type = type; + + err = nft_expr_inner_parse(ctx, tb[NFTA_INNER_EXPR], &expr_info); + if (err < 0) + return err; + + priv->expr.ops = expr_info.ops; + + if (!strcmp(expr_info.ops->type->name, "payload")) + priv->expr_type = NFT_INNER_EXPR_PAYLOAD; + else if (!strcmp(expr_info.ops->type->name, "meta")) + priv->expr_type = NFT_INNER_EXPR_META; + else + return -EINVAL; + + err = expr_info.ops->init(ctx, (struct nft_expr *)&priv->expr, + (const struct nlattr * const*)expr_info.tb); + if (err < 0) + return err; + + return 0; +} + +static int nft_inner_dump(struct sk_buff *skb, const struct nft_expr *expr) +{ + const struct nft_inner *priv = nft_expr_priv(expr); + + if (nla_put_be32(skb, NFTA_INNER_NUM, htonl(0)) || + nla_put_be32(skb, NFTA_INNER_TYPE, htonl(priv->type)) || + nla_put_be32(skb, NFTA_INNER_FLAGS, htonl(priv->flags)) || + nla_put_be32(skb, NFTA_INNER_HDRSIZE, htonl(priv->hdrsize))) + goto nla_put_failure; + + if (nft_expr_dump(skb, NFTA_INNER_EXPR, + (struct nft_expr *)&priv->expr) < 0) + goto nla_put_failure; + + return 0; + +nla_put_failure: + return -1; +} + +static const struct nft_expr_ops nft_inner_ops = { + .type = &nft_inner_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_inner)), + .eval = nft_inner_eval, + .init = nft_inner_init, + .dump = nft_inner_dump, +}; + +struct nft_expr_type nft_inner_type __read_mostly = { + .name = "inner", + .ops = &nft_inner_ops, + .policy = nft_inner_policy, + .maxattr = NFTA_INNER_MAX, + .owner = THIS_MODULE, +}; diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c index 55d2d49c3425..8c39adeebb5c 100644 --- a/net/netfilter/nft_meta.c +++ b/net/netfilter/nft_meta.c @@ -831,9 +831,71 @@ nft_meta_select_ops(const struct nft_ctx *ctx, return ERR_PTR(-EINVAL); } +static int nft_meta_inner_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_meta *priv = nft_expr_priv(expr); + unsigned int len; + + priv->key = ntohl(nla_get_be32(tb[NFTA_META_KEY])); + switch (priv->key) { + case NFT_META_PROTOCOL: + len = sizeof(u16); + break; + case NFT_META_L4PROTO: + len = sizeof(u32); + break; + default: + return -EOPNOTSUPP; + } + priv->len = len; + + return nft_parse_register_store(ctx, tb[NFTA_META_DREG], &priv->dreg, + NULL, NFT_DATA_VALUE, len); +} + +void nft_meta_inner_eval(const struct nft_expr *expr, + struct nft_regs *regs, + const struct nft_pktinfo *pkt, + struct nft_inner_tun_ctx *tun_ctx) +{ + const struct nft_meta *priv = nft_expr_priv(expr); + u32 *dest = ®s->data[priv->dreg]; + + switch (priv->key) { + case NFT_META_PROTOCOL: + nft_reg_store16(dest, (__force u16)tun_ctx->llproto); + break; + case NFT_META_L4PROTO: + if (!(tun_ctx->flags & NFT_PAYLOAD_CTX_INNER_TH)) + goto err; + + nft_reg_store8(dest, tun_ctx->l4proto); + break; + default: + WARN_ON_ONCE(1); + goto err; + } + return; + +err: + regs->verdict.code = NFT_BREAK; +} +EXPORT_SYMBOL_GPL(nft_meta_inner_eval); + +static const struct nft_expr_ops nft_meta_inner_ops = { + .type = &nft_meta_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_meta)), + .init = nft_meta_inner_init, + .dump = nft_meta_get_dump, + /* direct call to nft_meta_inner_eval(). */ +}; + struct nft_expr_type nft_meta_type __read_mostly = { .name = "meta", .select_ops = nft_meta_select_ops, + .inner_ops = &nft_meta_inner_ops, .policy = nft_meta_policy, .maxattr = NFTA_META_MAX, .owner = THIS_MODULE, diff --git a/net/netfilter/nft_objref.c b/net/netfilter/nft_objref.c index 5d8d91b3904d..74e0eea4abac 100644 --- a/net/netfilter/nft_objref.c +++ b/net/netfilter/nft_objref.c @@ -82,7 +82,6 @@ static void nft_objref_activate(const struct nft_ctx *ctx, obj->use++; } -static struct nft_expr_type nft_objref_type; static const struct nft_expr_ops nft_objref_ops = { .type = &nft_objref_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_object *)), @@ -195,7 +194,6 @@ static void nft_objref_map_destroy(const struct nft_ctx *ctx, nf_tables_destroy_set(ctx, priv->set); } -static struct nft_expr_type nft_objref_type; static const struct nft_expr_ops nft_objref_map_ops = { .type = &nft_objref_type, .size = NFT_EXPR_SIZE(sizeof(struct nft_objref_map)), @@ -233,28 +231,10 @@ static const struct nla_policy nft_objref_policy[NFTA_OBJREF_MAX + 1] = { [NFTA_OBJREF_SET_ID] = { .type = NLA_U32 }, }; -static struct nft_expr_type nft_objref_type __read_mostly = { +struct nft_expr_type nft_objref_type __read_mostly = { .name = "objref", .select_ops = nft_objref_select_ops, .policy = nft_objref_policy, .maxattr = NFTA_OBJREF_MAX, .owner = THIS_MODULE, }; - -static int __init nft_objref_module_init(void) -{ - return nft_register_expr(&nft_objref_type); -} - -static void __exit nft_objref_module_exit(void) -{ - nft_unregister_expr(&nft_objref_type); -} - -module_init(nft_objref_module_init); -module_exit(nft_objref_module_exit); - -MODULE_LICENSE("GPL"); -MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>"); -MODULE_ALIAS_NFT_EXPR("objref"); -MODULE_DESCRIPTION("nftables stateful object reference module"); diff --git a/net/netfilter/nft_osf.c b/net/netfilter/nft_osf.c index 89342ccccdcc..adacf95b6e2b 100644 --- a/net/netfilter/nft_osf.c +++ b/net/netfilter/nft_osf.c @@ -51,7 +51,7 @@ static void nft_osf_eval(const struct nft_expr *expr, struct nft_regs *regs, snprintf(os_match, NFT_OSF_MAXGENRELEN, "%s:%s", data.genre, data.version); else - strlcpy(os_match, data.genre, NFT_OSF_MAXGENRELEN); + strscpy(os_match, data.genre, NFT_OSF_MAXGENRELEN); strncpy((char *)dest, os_match, NFT_OSF_MAXGENRELEN); } diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c index eb0e40c29712..9d2ac764a14c 100644 --- a/net/netfilter/nft_payload.c +++ b/net/netfilter/nft_payload.c @@ -19,6 +19,7 @@ /* For layer 4 checksum field offset. */ #include <linux/tcp.h> #include <linux/udp.h> +#include <net/gre.h> #include <linux/icmpv6.h> #include <linux/ip.h> #include <linux/ipv6.h> @@ -100,6 +101,40 @@ static int __nft_payload_inner_offset(struct nft_pktinfo *pkt) pkt->inneroff = thoff + __tcp_hdrlen(th); } break; + case IPPROTO_GRE: { + u32 offset = sizeof(struct gre_base_hdr), version; + struct gre_base_hdr *gre, _gre; + + gre = skb_header_pointer(pkt->skb, thoff, sizeof(_gre), &_gre); + if (!gre) + return -1; + + version = gre->flags & GRE_VERSION; + switch (version) { + case GRE_VERSION_0: + if (gre->flags & GRE_ROUTING) + return -1; + + if (gre->flags & GRE_CSUM) { + offset += sizeof_field(struct gre_full_hdr, csum) + + sizeof_field(struct gre_full_hdr, reserved1); + } + if (gre->flags & GRE_KEY) + offset += sizeof_field(struct gre_full_hdr, key); + + if (gre->flags & GRE_SEQ) + offset += sizeof_field(struct gre_full_hdr, seq); + break; + default: + return -1; + } + + pkt->inneroff = thoff + offset; + } + break; + case IPPROTO_IPIP: + pkt->inneroff = thoff; + break; default: return -1; } @@ -109,7 +144,7 @@ static int __nft_payload_inner_offset(struct nft_pktinfo *pkt) return 0; } -static int nft_payload_inner_offset(const struct nft_pktinfo *pkt) +int nft_payload_inner_offset(const struct nft_pktinfo *pkt) { if (!(pkt->flags & NFT_PKTINFO_INNER) && __nft_payload_inner_offset((struct nft_pktinfo *)pkt) < 0) @@ -173,10 +208,10 @@ static const struct nla_policy nft_payload_policy[NFTA_PAYLOAD_MAX + 1] = { [NFTA_PAYLOAD_SREG] = { .type = NLA_U32 }, [NFTA_PAYLOAD_DREG] = { .type = NLA_U32 }, [NFTA_PAYLOAD_BASE] = { .type = NLA_U32 }, - [NFTA_PAYLOAD_OFFSET] = { .type = NLA_U32 }, - [NFTA_PAYLOAD_LEN] = { .type = NLA_U32 }, + [NFTA_PAYLOAD_OFFSET] = NLA_POLICY_MAX_BE(NLA_U32, 255), + [NFTA_PAYLOAD_LEN] = NLA_POLICY_MAX_BE(NLA_U32, 255), [NFTA_PAYLOAD_CSUM_TYPE] = { .type = NLA_U32 }, - [NFTA_PAYLOAD_CSUM_OFFSET] = { .type = NLA_U32 }, + [NFTA_PAYLOAD_CSUM_OFFSET] = NLA_POLICY_MAX_BE(NLA_U32, 255), [NFTA_PAYLOAD_CSUM_FLAGS] = { .type = NLA_U32 }, }; @@ -552,6 +587,92 @@ const struct nft_expr_ops nft_payload_fast_ops = { .offload = nft_payload_offload, }; +void nft_payload_inner_eval(const struct nft_expr *expr, struct nft_regs *regs, + const struct nft_pktinfo *pkt, + struct nft_inner_tun_ctx *tun_ctx) +{ + const struct nft_payload *priv = nft_expr_priv(expr); + const struct sk_buff *skb = pkt->skb; + u32 *dest = ®s->data[priv->dreg]; + int offset; + + if (priv->len % NFT_REG32_SIZE) + dest[priv->len / NFT_REG32_SIZE] = 0; + + switch (priv->base) { + case NFT_PAYLOAD_TUN_HEADER: + if (!(tun_ctx->flags & NFT_PAYLOAD_CTX_INNER_TUN)) + goto err; + + offset = tun_ctx->inner_tunoff; + break; + case NFT_PAYLOAD_LL_HEADER: + if (!(tun_ctx->flags & NFT_PAYLOAD_CTX_INNER_LL)) + goto err; + + offset = tun_ctx->inner_lloff; + break; + case NFT_PAYLOAD_NETWORK_HEADER: + if (!(tun_ctx->flags & NFT_PAYLOAD_CTX_INNER_NH)) + goto err; + + offset = tun_ctx->inner_nhoff; + break; + case NFT_PAYLOAD_TRANSPORT_HEADER: + if (!(tun_ctx->flags & NFT_PAYLOAD_CTX_INNER_TH)) + goto err; + + offset = tun_ctx->inner_thoff; + break; + default: + WARN_ON_ONCE(1); + goto err; + } + offset += priv->offset; + + if (skb_copy_bits(skb, offset, dest, priv->len) < 0) + goto err; + + return; +err: + regs->verdict.code = NFT_BREAK; +} + +static int nft_payload_inner_init(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nlattr * const tb[]) +{ + struct nft_payload *priv = nft_expr_priv(expr); + u32 base; + + base = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_BASE])); + switch (base) { + case NFT_PAYLOAD_TUN_HEADER: + case NFT_PAYLOAD_LL_HEADER: + case NFT_PAYLOAD_NETWORK_HEADER: + case NFT_PAYLOAD_TRANSPORT_HEADER: + break; + default: + return -EOPNOTSUPP; + } + + priv->base = base; + priv->offset = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_OFFSET])); + priv->len = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_LEN])); + + return nft_parse_register_store(ctx, tb[NFTA_PAYLOAD_DREG], + &priv->dreg, NULL, NFT_DATA_VALUE, + priv->len); +} + +static const struct nft_expr_ops nft_payload_inner_ops = { + .type = &nft_payload_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_payload)), + .init = nft_payload_inner_init, + .dump = nft_payload_dump, + /* direct call to nft_payload_inner_eval(). */ +}; + static inline void nft_csum_replace(__sum16 *sum, __wsum fsum, __wsum tsum) { *sum = csum_fold(csum_add(csum_sub(~csum_unfold(*sum), fsum), tsum)); @@ -665,6 +786,16 @@ static int nft_payload_csum_inet(struct sk_buff *skb, const u32 *src, return 0; } +struct nft_payload_set { + enum nft_payload_bases base:8; + u8 offset; + u8 len; + u8 sreg; + u8 csum_type; + u8 csum_offset; + u8 csum_flags; +}; + static void nft_payload_set_eval(const struct nft_expr *expr, struct nft_regs *regs, const struct nft_pktinfo *pkt) @@ -885,6 +1016,7 @@ nft_payload_select_ops(const struct nft_ctx *ctx, struct nft_expr_type nft_payload_type __read_mostly = { .name = "payload", .select_ops = nft_payload_select_ops, + .inner_ops = &nft_payload_inner_ops, .policy = nft_payload_policy, .maxattr = NFTA_PAYLOAD_MAX, .owner = THIS_MODULE, diff --git a/net/netfilter/nft_socket.c b/net/netfilter/nft_socket.c index a7de29137618..49a5348a6a14 100644 --- a/net/netfilter/nft_socket.c +++ b/net/netfilter/nft_socket.c @@ -40,16 +40,17 @@ static noinline bool nft_sock_get_eval_cgroupv2(u32 *dest, struct sock *sk, const struct nft_pktinfo *pkt, u32 level) { struct cgroup *cgrp; + u64 cgid; if (!sk_fullsock(sk)) return false; - cgrp = sock_cgroup_ptr(&sk->sk_cgrp_data); - if (level > cgrp->level) + cgrp = cgroup_ancestor(sock_cgroup_ptr(&sk->sk_cgrp_data), level); + if (!cgrp) return false; - memcpy(dest, &cgrp->ancestor_ids[level], sizeof(u64)); - + cgid = cgroup_id(cgrp); + memcpy(dest, &cgid, sizeof(u64)); return true; } #endif diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c index 54a489f16b17..470282cf3fae 100644 --- a/net/netfilter/x_tables.c +++ b/net/netfilter/x_tables.c @@ -766,7 +766,7 @@ void xt_compat_match_from_user(struct xt_entry_match *m, void **dstptr, msize += off; m->u.user.match_size = msize; - strlcpy(name, match->name, sizeof(name)); + strscpy(name, match->name, sizeof(name)); module_put(match->me); strncpy(m->u.user.name, name, sizeof(m->u.user.name)); @@ -1146,7 +1146,7 @@ void xt_compat_target_from_user(struct xt_entry_target *t, void **dstptr, tsize += off; t->u.user.target_size = tsize; - strlcpy(name, target->name, sizeof(name)); + strscpy(name, target->name, sizeof(name)); module_put(target->me); strncpy(t->u.user.name, name, sizeof(t->u.user.name)); @@ -1827,7 +1827,7 @@ int xt_proto_init(struct net *net, u_int8_t af) root_uid = make_kuid(net->user_ns, 0); root_gid = make_kgid(net->user_ns, 0); - strlcpy(buf, xt_prefix[af], sizeof(buf)); + strscpy(buf, xt_prefix[af], sizeof(buf)); strlcat(buf, FORMAT_TABLES, sizeof(buf)); proc = proc_create_net_data(buf, 0440, net->proc_net, &xt_table_seq_ops, sizeof(struct seq_net_private), @@ -1837,7 +1837,7 @@ int xt_proto_init(struct net *net, u_int8_t af) if (uid_valid(root_uid) && gid_valid(root_gid)) proc_set_user(proc, root_uid, root_gid); - strlcpy(buf, xt_prefix[af], sizeof(buf)); + strscpy(buf, xt_prefix[af], sizeof(buf)); strlcat(buf, FORMAT_MATCHES, sizeof(buf)); proc = proc_create_seq_private(buf, 0440, net->proc_net, &xt_match_seq_ops, sizeof(struct nf_mttg_trav), @@ -1847,7 +1847,7 @@ int xt_proto_init(struct net *net, u_int8_t af) if (uid_valid(root_uid) && gid_valid(root_gid)) proc_set_user(proc, root_uid, root_gid); - strlcpy(buf, xt_prefix[af], sizeof(buf)); + strscpy(buf, xt_prefix[af], sizeof(buf)); strlcat(buf, FORMAT_TARGETS, sizeof(buf)); proc = proc_create_seq_private(buf, 0440, net->proc_net, &xt_target_seq_ops, sizeof(struct nf_mttg_trav), @@ -1862,12 +1862,12 @@ int xt_proto_init(struct net *net, u_int8_t af) #ifdef CONFIG_PROC_FS out_remove_matches: - strlcpy(buf, xt_prefix[af], sizeof(buf)); + strscpy(buf, xt_prefix[af], sizeof(buf)); strlcat(buf, FORMAT_MATCHES, sizeof(buf)); remove_proc_entry(buf, net->proc_net); out_remove_tables: - strlcpy(buf, xt_prefix[af], sizeof(buf)); + strscpy(buf, xt_prefix[af], sizeof(buf)); strlcat(buf, FORMAT_TABLES, sizeof(buf)); remove_proc_entry(buf, net->proc_net); out: @@ -1881,15 +1881,15 @@ void xt_proto_fini(struct net *net, u_int8_t af) #ifdef CONFIG_PROC_FS char buf[XT_FUNCTION_MAXNAMELEN]; - strlcpy(buf, xt_prefix[af], sizeof(buf)); + strscpy(buf, xt_prefix[af], sizeof(buf)); strlcat(buf, FORMAT_TABLES, sizeof(buf)); remove_proc_entry(buf, net->proc_net); - strlcpy(buf, xt_prefix[af], sizeof(buf)); + strscpy(buf, xt_prefix[af], sizeof(buf)); strlcat(buf, FORMAT_TARGETS, sizeof(buf)); remove_proc_entry(buf, net->proc_net); - strlcpy(buf, xt_prefix[af], sizeof(buf)); + strscpy(buf, xt_prefix[af], sizeof(buf)); strlcat(buf, FORMAT_MATCHES, sizeof(buf)); remove_proc_entry(buf, net->proc_net); #endif /*CONFIG_PROC_FS*/ diff --git a/net/netfilter/xt_RATEEST.c b/net/netfilter/xt_RATEEST.c index 8aec1b529364..80f6624e2355 100644 --- a/net/netfilter/xt_RATEEST.c +++ b/net/netfilter/xt_RATEEST.c @@ -144,7 +144,7 @@ static int xt_rateest_tg_checkentry(const struct xt_tgchk_param *par) goto err1; gnet_stats_basic_sync_init(&est->bstats); - strlcpy(est->name, info->name, sizeof(est->name)); + strscpy(est->name, info->name, sizeof(est->name)); spin_lock_init(&est->lock); est->refcnt = 1; est->params.interval = info->interval; diff --git a/net/netfilter/xt_statistic.c b/net/netfilter/xt_statistic.c index 203e24ae472c..b26c1dcfc27b 100644 --- a/net/netfilter/xt_statistic.c +++ b/net/netfilter/xt_statistic.c @@ -34,7 +34,7 @@ statistic_mt(const struct sk_buff *skb, struct xt_action_param *par) switch (info->mode) { case XT_STATISTIC_MODE_RANDOM: - if ((prandom_u32() & 0x7FFFFFFF) < info->u.random.probability) + if ((get_random_u32() & 0x7FFFFFFF) < info->u.random.probability) ret = !ret; break; case XT_STATISTIC_MODE_NTH: diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c index f89ba302ac6e..f0c94d394ab1 100644 --- a/net/netlink/af_netlink.c +++ b/net/netlink/af_netlink.c @@ -812,6 +812,17 @@ static int netlink_release(struct socket *sock) } sock_prot_inuse_add(sock_net(sk), &netlink_proto, -1); + + /* Because struct net might disappear soon, do not keep a pointer. */ + if (!sk->sk_net_refcnt && sock_net(sk) != &init_net) { + __netns_tracker_free(sock_net(sk), &sk->ns_tracker, false); + /* Because of deferred_put_nlk_sk and use of work queue, + * it is possible netns will be freed before this socket. + */ + sock_net_set(sk, &init_net); + __netns_tracker_alloc(&init_net, &sk->ns_tracker, + false, GFP_KERNEL); + } call_rcu(&nlk->rcu, deferred_put_nlk_sk); return 0; } @@ -2494,11 +2505,13 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err, return; } - rep = __nlmsg_put(skb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, - NLMSG_ERROR, payload, flags); + rep = nlmsg_put(skb, NETLINK_CB(in_skb).portid, nlh->nlmsg_seq, + NLMSG_ERROR, payload, flags); errmsg = nlmsg_data(rep); errmsg->error = err; - memcpy(&errmsg->msg, nlh, payload > sizeof(*errmsg) ? nlh->nlmsg_len : sizeof(*nlh)); + unsafe_memcpy(&errmsg->msg, nlh, payload > sizeof(*errmsg) + ? nlh->nlmsg_len : sizeof(*nlh), + /* Bounds checked by the skb layer. */); if (tlvlen) netlink_ack_tlv_fill(in_skb, skb, nlh, err, extack); diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c index 7c136de117eb..3e16527beb91 100644 --- a/net/netlink/genetlink.c +++ b/net/netlink/genetlink.c @@ -78,10 +78,29 @@ static unsigned long mc_group_start = 0x3 | BIT(GENL_ID_CTRL) | static unsigned long *mc_groups = &mc_group_start; static unsigned long mc_groups_longs = 1; +/* We need the last attribute with non-zero ID therefore a 2-entry array */ +static struct nla_policy genl_policy_reject_all[] = { + { .type = NLA_REJECT }, + { .type = NLA_REJECT }, +}; + static int genl_ctrl_event(int event, const struct genl_family *family, const struct genl_multicast_group *grp, int grp_id); +static void +genl_op_fill_in_reject_policy(const struct genl_family *family, + struct genl_ops *op) +{ + BUILD_BUG_ON(ARRAY_SIZE(genl_policy_reject_all) - 1 != 1); + + if (op->policy || op->cmd < family->resv_start_op) + return; + + op->policy = genl_policy_reject_all; + op->maxattr = 1; +} + static const struct genl_family *genl_family_find_byid(unsigned int id) { return idr_find(&genl_fam_idr, id); @@ -113,6 +132,8 @@ static void genl_op_from_full(const struct genl_family *family, op->maxattr = family->maxattr; if (!op->policy) op->policy = family->policy; + + genl_op_fill_in_reject_policy(family, op); } static int genl_get_cmd_full(u32 cmd, const struct genl_family *family, @@ -142,6 +163,8 @@ static void genl_op_from_small(const struct genl_family *family, op->maxattr = family->maxattr; op->policy = family->policy; + + genl_op_fill_in_reject_policy(family, op); } static int genl_get_cmd_small(u32 cmd, const struct genl_family *family, @@ -357,6 +380,8 @@ static int genl_validate_ops(const struct genl_family *family) genl_get_cmd_by_index(i, family, &op); if (op.dumpit == NULL && op.doit == NULL) return -EINVAL; + if (WARN_ON(op.cmd >= family->resv_start_op && op.validate)) + return -EINVAL; for (j = i + 1; j < genl_get_cmd_cnt(family); j++) { struct genl_ops op2; @@ -739,6 +764,36 @@ out: return err; } +static int genl_header_check(const struct genl_family *family, + struct nlmsghdr *nlh, struct genlmsghdr *hdr, + struct netlink_ext_ack *extack) +{ + u16 flags; + + /* Only for commands added after we started validating */ + if (hdr->cmd < family->resv_start_op) + return 0; + + if (hdr->reserved) { + NL_SET_ERR_MSG(extack, "genlmsghdr.reserved field is not 0"); + return -EINVAL; + } + + /* Old netlink flags have pretty loose semantics, allow only the flags + * consumed by the core where we can enforce the meaning. + */ + flags = nlh->nlmsg_flags; + if ((flags & NLM_F_DUMP) == NLM_F_DUMP) /* DUMP is 2 bits */ + flags &= ~NLM_F_DUMP; + if (flags & ~(NLM_F_REQUEST | NLM_F_ACK | NLM_F_ECHO)) { + NL_SET_ERR_MSG(extack, + "ambiguous or reserved bits set in nlmsg_flags"); + return -EINVAL; + } + + return 0; +} + static int genl_family_rcv_msg(const struct genl_family *family, struct sk_buff *skb, struct nlmsghdr *nlh, @@ -757,7 +812,7 @@ static int genl_family_rcv_msg(const struct genl_family *family, if (nlh->nlmsg_len < nlmsg_msg_size(hdrlen)) return -EINVAL; - if (hdr->cmd >= family->resv_start_op && hdr->reserved) + if (genl_header_check(family, nlh, hdr, extack)) return -EINVAL; if (genl_get_cmd(hdr->cmd, family, &op)) diff --git a/net/nfc/hci/hcp.c b/net/nfc/hci/hcp.c index 05c60988f59a..4902f5064098 100644 --- a/net/nfc/hci/hcp.c +++ b/net/nfc/hci/hcp.c @@ -73,14 +73,12 @@ int nfc_hci_hcp_message_tx(struct nfc_hci_dev *hdev, u8 pipe, if (firstfrag) { firstfrag = false; packet->message.header = HCP_HEADER(type, instruction); - if (ptr) { - memcpy(packet->message.data, ptr, - data_link_len - 1); - ptr += data_link_len - 1; - } } else { - memcpy(&packet->message, ptr, data_link_len); - ptr += data_link_len; + packet->message.header = *ptr++; + } + if (ptr) { + memcpy(packet->message.data, ptr, data_link_len - 1); + ptr += data_link_len - 1; } /* This is the last fragment, set the cb bit */ diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c index 868db4669a29..ca3ebfdb3023 100644 --- a/net/openvswitch/actions.c +++ b/net/openvswitch/actions.c @@ -1033,7 +1033,7 @@ static int sample(struct datapath *dp, struct sk_buff *skb, actions = nla_next(sample_arg, &rem); if ((arg->probability != U32_MAX) && - (!arg->probability || prandom_u32() > arg->probability)) { + (!arg->probability || get_random_u32() > arg->probability)) { if (last) consume_skb(skb); return 0; diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c index 48e8f5c29b67..c7b10234cf7c 100644 --- a/net/openvswitch/conntrack.c +++ b/net/openvswitch/conntrack.c @@ -1015,7 +1015,8 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key, * connections which we will commit, we may need to attach * the helper here. */ - if (info->commit && info->helper && !nfct_help(ct)) { + if (!nf_ct_is_confirmed(ct) && info->commit && + info->helper && !nfct_help(ct)) { int err = __nf_ct_try_assign_helper(ct, info->ct, GFP_ATOMIC); if (err) @@ -1982,7 +1983,8 @@ static int ovs_ct_limit_set_zone_limit(struct nlattr *nla_zone_limit, } else { struct ovs_ct_limit *ct_limit; - ct_limit = kmalloc(sizeof(*ct_limit), GFP_KERNEL); + ct_limit = kmalloc(sizeof(*ct_limit), + GFP_KERNEL_ACCOUNT); if (!ct_limit) return -ENOMEM; @@ -2252,14 +2254,16 @@ exit_err: static const struct genl_small_ops ct_limit_genl_ops[] = { { .cmd = OVS_CT_LIMIT_CMD_SET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN - * privilege. */ + .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN + * privilege. + */ .doit = ovs_ct_limit_cmd_set, }, { .cmd = OVS_CT_LIMIT_CMD_DEL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN - * privilege. */ + .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN + * privilege. + */ .doit = ovs_ct_limit_cmd_del, }, { .cmd = OVS_CT_LIMIT_CMD_GET, diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c index c8a9075ddd0a..f95b716ea96d 100644 --- a/net/openvswitch/datapath.c +++ b/net/openvswitch/datapath.c @@ -716,9 +716,9 @@ static void get_dp_stats(const struct datapath *dp, struct ovs_dp_stats *stats, percpu_stats = per_cpu_ptr(dp->stats_percpu, i); do { - start = u64_stats_fetch_begin_irq(&percpu_stats->syncp); + start = u64_stats_fetch_begin(&percpu_stats->syncp); local_stats = *percpu_stats; - } while (u64_stats_fetch_retry_irq(&percpu_stats->syncp, start)); + } while (u64_stats_fetch_retry(&percpu_stats->syncp, start)); stats->n_hit += local_stats.n_hit; stats->n_missed += local_stats.n_missed; @@ -1616,7 +1616,8 @@ static void ovs_dp_reset_user_features(struct sk_buff *skb, if (IS_ERR(dp)) return; - WARN(dp->user_features, "Dropping previously announced user features\n"); + pr_warn("%s: Dropping previously announced user features\n", + ovs_dp_name(dp)); dp->user_features = 0; } diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c index 4c09cf8a0ab2..ead5418c126e 100644 --- a/net/openvswitch/flow_netlink.c +++ b/net/openvswitch/flow_netlink.c @@ -2309,7 +2309,7 @@ static struct sw_flow_actions *nla_alloc_flow_actions(int size) WARN_ON_ONCE(size > MAX_ACTIONS_BUFSIZE); - sfa = kmalloc(sizeof(*sfa) + size, GFP_KERNEL); + sfa = kmalloc(kmalloc_size_roundup(sizeof(*sfa) + size), GFP_KERNEL); if (!sfa) return ERR_PTR(-ENOMEM); @@ -3304,7 +3304,7 @@ static int __ovs_nla_copy_actions(struct net *net, const struct nlattr *attr, /* Disallow subsequent L2.5+ set actions and mpls_pop * actions once the last MPLS label in the packet is - * is popped as there is no check here to ensure that + * popped as there is no check here to ensure that * the new eth type is valid and thus set actions could * write off the end of the packet or otherwise corrupt * it. diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c index d4a2db0b2299..0a0e4c283f02 100644 --- a/net/openvswitch/flow_table.c +++ b/net/openvswitch/flow_table.c @@ -205,9 +205,9 @@ static void tbl_mask_array_reset_counters(struct mask_array *ma) stats = per_cpu_ptr(ma->masks_usage_stats, cpu); do { - start = u64_stats_fetch_begin_irq(&stats->syncp); + start = u64_stats_fetch_begin(&stats->syncp); counter = stats->usage_cntrs[i]; - } while (u64_stats_fetch_retry_irq(&stats->syncp, start)); + } while (u64_stats_fetch_retry(&stats->syncp, start)); ma->masks_usage_zero_cntr[i] += counter; } @@ -1136,10 +1136,9 @@ void ovs_flow_masks_rebalance(struct flow_table *table) stats = per_cpu_ptr(ma->masks_usage_stats, cpu); do { - start = u64_stats_fetch_begin_irq(&stats->syncp); + start = u64_stats_fetch_begin(&stats->syncp); counter = stats->usage_cntrs[i]; - } while (u64_stats_fetch_retry_irq(&stats->syncp, - start)); + } while (u64_stats_fetch_retry(&stats->syncp, start)); masks_and_count[i].counter += counter; } diff --git a/net/openvswitch/meter.c b/net/openvswitch/meter.c index 51111a9009bd..6e38f68f88c2 100644 --- a/net/openvswitch/meter.c +++ b/net/openvswitch/meter.c @@ -343,7 +343,7 @@ static struct dp_meter *dp_meter_create(struct nlattr **a) return ERR_PTR(-EINVAL); /* Allocate and set up the meter before locking anything. */ - meter = kzalloc(struct_size(meter, bands, n_bands), GFP_KERNEL); + meter = kzalloc(struct_size(meter, bands, n_bands), GFP_KERNEL_ACCOUNT); if (!meter) return ERR_PTR(-ENOMEM); @@ -687,9 +687,9 @@ static const struct genl_small_ops dp_meter_genl_ops[] = { }, { .cmd = OVS_METER_CMD_SET, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN - * privilege. - */ + .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN + * privilege. + */ .doit = ovs_meter_cmd_set, }, { .cmd = OVS_METER_CMD_GET, @@ -699,9 +699,9 @@ static const struct genl_small_ops dp_meter_genl_ops[] = { }, { .cmd = OVS_METER_CMD_DEL, .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, - .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN - * privilege. - */ + .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN + * privilege. + */ .doit = ovs_meter_cmd_del }, }; diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c index 35f42c9821c2..74c88a6baa43 100644 --- a/net/openvswitch/vport-internal_dev.c +++ b/net/openvswitch/vport-internal_dev.c @@ -190,7 +190,7 @@ static void internal_dev_destroy(struct vport *vport) rtnl_unlock(); } -static netdev_tx_t internal_dev_recv(struct sk_buff *skb) +static int internal_dev_recv(struct sk_buff *skb) { struct net_device *netdev = skb->dev; diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h index 7d276f60c000..6ff45e8a0868 100644 --- a/net/openvswitch/vport.h +++ b/net/openvswitch/vport.h @@ -132,7 +132,7 @@ struct vport_ops { int (*set_options)(struct vport *, struct nlattr *); int (*get_options)(const struct vport *, struct sk_buff *); - netdev_tx_t (*send) (struct sk_buff *skb); + int (*send)(struct sk_buff *skb); struct module *owner; struct list_head list; }; diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 7de46d831349..8c5b3da0c29f 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -1350,7 +1350,7 @@ static bool fanout_flow_is_huge(struct packet_sock *po, struct sk_buff *skb) if (READ_ONCE(history[i]) == rxhash) count++; - victim = prandom_u32() % ROLLOVER_HLEN; + victim = prandom_u32_max(ROLLOVER_HLEN); /* Avoid dirtying the cache line if possible */ if (READ_ONCE(history[victim]) != rxhash) @@ -3277,7 +3277,7 @@ static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr, int addr_len) { struct sock *sk = sock->sk; - char name[sizeof(uaddr->sa_data) + 1]; + char name[sizeof(uaddr->sa_data_min) + 1]; /* * Check legality @@ -3288,8 +3288,8 @@ static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr, /* uaddr->sa_data comes from the userspace, it's not guaranteed to be * zero-terminated. */ - memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data)); - name[sizeof(uaddr->sa_data)] = 0; + memcpy(name, uaddr->sa_data, sizeof(uaddr->sa_data_min)); + name[sizeof(uaddr->sa_data_min)] = 0; return packet_do_bind(sk, name, 0, pkt_sk(sk)->num); } @@ -3561,11 +3561,11 @@ static int packet_getname_spkt(struct socket *sock, struct sockaddr *uaddr, return -EOPNOTSUPP; uaddr->sa_family = AF_PACKET; - memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data)); + memset(uaddr->sa_data, 0, sizeof(uaddr->sa_data_min)); rcu_read_lock(); dev = dev_get_by_index_rcu(sock_net(sk), READ_ONCE(pkt_sk(sk)->ifindex)); if (dev) - strscpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data)); + strscpy(uaddr->sa_data, dev->name, sizeof(uaddr->sa_data_min)); rcu_read_unlock(); return sizeof(*uaddr); @@ -4725,37 +4725,37 @@ static struct pernet_operations packet_net_ops = { static void __exit packet_exit(void) { - unregister_netdevice_notifier(&packet_netdev_notifier); - unregister_pernet_subsys(&packet_net_ops); sock_unregister(PF_PACKET); proto_unregister(&packet_proto); + unregister_netdevice_notifier(&packet_netdev_notifier); + unregister_pernet_subsys(&packet_net_ops); } static int __init packet_init(void) { int rc; - rc = proto_register(&packet_proto, 0); - if (rc) - goto out; - rc = sock_register(&packet_family_ops); - if (rc) - goto out_proto; rc = register_pernet_subsys(&packet_net_ops); if (rc) - goto out_sock; + goto out; rc = register_netdevice_notifier(&packet_netdev_notifier); if (rc) goto out_pernet; + rc = proto_register(&packet_proto, 0); + if (rc) + goto out_notifier; + rc = sock_register(&packet_family_ops); + if (rc) + goto out_proto; return 0; -out_pernet: - unregister_pernet_subsys(&packet_net_ops); -out_sock: - sock_unregister(PF_PACKET); out_proto: proto_unregister(&packet_proto); +out_notifier: + unregister_netdevice_notifier(&packet_netdev_notifier); +out_pernet: + unregister_pernet_subsys(&packet_net_ops); out: return rc; } diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c index b239120dd9ca..3ff6995244e5 100644 --- a/net/rds/af_rds.c +++ b/net/rds/af_rds.c @@ -894,7 +894,7 @@ module_exit(rds_exit); u32 rds_gen_num; -static int rds_init(void) +static int __init rds_init(void) { int ret; diff --git a/net/rds/bind.c b/net/rds/bind.c index 5b5fb4ca8d3e..97a29172a8ee 100644 --- a/net/rds/bind.c +++ b/net/rds/bind.c @@ -104,7 +104,7 @@ static int rds_add_bound(struct rds_sock *rs, const struct in6_addr *addr, return -EINVAL; last = rover; } else { - rover = max_t(u16, prandom_u32(), 2); + rover = max_t(u16, get_random_u16(), 2); last = rover - 1; } diff --git a/net/rds/message.c b/net/rds/message.c index 44dbc612ef54..b47e4f0a1639 100644 --- a/net/rds/message.c +++ b/net/rds/message.c @@ -366,7 +366,6 @@ static int rds_message_zcopy_from_user(struct rds_message *rm, struct iov_iter * struct scatterlist *sg; int ret = 0; int length = iov_iter_count(from); - int total_copied = 0; struct rds_msg_zcopy_info *info; rm->m_inc.i_hdr.h_len = cpu_to_be32(iov_iter_count(from)); @@ -404,7 +403,6 @@ static int rds_message_zcopy_from_user(struct rds_message *rm, struct iov_iter * ret = -EFAULT; goto err; } - total_copied += copied; length -= copied; sg_set_page(sg, pages, copied, start); rm->data.op_nents++; diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c index a9e4ff948a7d..d36f3f6b4351 100644 --- a/net/rds/rdma_transport.c +++ b/net/rds/rdma_transport.c @@ -291,7 +291,7 @@ static void rds_rdma_listen_stop(void) #endif } -static int rds_rdma_init(void) +static int __init rds_rdma_init(void) { int ret; @@ -307,7 +307,7 @@ out: } module_init(rds_rdma_init); -static void rds_rdma_exit(void) +static void __exit rds_rdma_exit(void) { /* stop listening first to ensure no new connections are attempted */ rds_rdma_listen_stop(); diff --git a/net/rds/tcp.c b/net/rds/tcp.c index 73ee2771093d..c5b86066ff66 100644 --- a/net/rds/tcp.c +++ b/net/rds/tcp.c @@ -166,10 +166,10 @@ void rds_tcp_reset_callbacks(struct socket *sock, */ atomic_set(&cp->cp_state, RDS_CONN_RESETTING); wait_event(cp->cp_waitq, !test_bit(RDS_IN_XMIT, &cp->cp_flags)); - lock_sock(osock->sk); /* reset receive side state for rds_tcp_data_recv() for osock */ cancel_delayed_work_sync(&cp->cp_send_w); cancel_delayed_work_sync(&cp->cp_recv_w); + lock_sock(osock->sk); if (tc->t_tinc) { rds_inc_put(&tc->t_tinc->ti_inc); tc->t_tinc = NULL; @@ -503,6 +503,9 @@ bool rds_tcp_tune(struct socket *sock) release_sock(sk); return false; } + /* Update ns_tracker to current stack trace and refcounted tracker */ + __netns_tracker_free(net, &sk->ns_tracker, false); + sk->sk_net_refcnt = 1; netns_tracker_alloc(net, &sk->ns_tracker, GFP_KERNEL); sock_inuse_add(net, 1); @@ -712,7 +715,7 @@ static void rds_tcp_exit(void) } module_exit(rds_tcp_exit); -static int rds_tcp_init(void) +static int __init rds_tcp_init(void) { int ret; diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h index 571436064cd6..1ad0ec5afb50 100644 --- a/net/rxrpc/ar-internal.h +++ b/net/rxrpc/ar-internal.h @@ -782,7 +782,6 @@ void rxrpc_delete_call_timer(struct rxrpc_call *call); */ extern const char *const rxrpc_call_states[]; extern const char *const rxrpc_call_completions[]; -extern unsigned int rxrpc_max_call_lifetime; extern struct kmem_cache *rxrpc_call_jar; struct rxrpc_call *rxrpc_find_call_by_user_ID(struct rxrpc_sock *, unsigned long); @@ -982,6 +981,7 @@ void rxrpc_send_keepalive(struct rxrpc_peer *); /* * peer_event.c */ +void rxrpc_encap_err_rcv(struct sock *sk, struct sk_buff *skb, unsigned int udp_offset); void rxrpc_error_report(struct sock *); void rxrpc_peer_keepalive_worker(struct work_struct *); diff --git a/net/rxrpc/call_event.c b/net/rxrpc/call_event.c index f8ecad2b730e..2a93e7b5fbd0 100644 --- a/net/rxrpc/call_event.c +++ b/net/rxrpc/call_event.c @@ -166,7 +166,7 @@ static void rxrpc_resend(struct rxrpc_call *call, unsigned long now_j) _enter("{%d,%d}", call->tx_hard_ack, call->tx_top); now = ktime_get_real(); - max_age = ktime_sub(now, jiffies_to_usecs(call->peer->rto_j)); + max_age = ktime_sub_us(now, jiffies_to_usecs(call->peer->rto_j)); spin_lock_bh(&call->lock); diff --git a/net/rxrpc/local_object.c b/net/rxrpc/local_object.c index 96ecb7356c0f..38ea98ff426b 100644 --- a/net/rxrpc/local_object.c +++ b/net/rxrpc/local_object.c @@ -137,6 +137,7 @@ static int rxrpc_open_socket(struct rxrpc_local *local, struct net *net) tuncfg.encap_type = UDP_ENCAP_RXRPC; tuncfg.encap_rcv = rxrpc_input_packet; + tuncfg.encap_err_rcv = rxrpc_encap_err_rcv; tuncfg.sk_user_data = local; setup_udp_tunnel_sock(net, local->socket, &tuncfg); @@ -405,6 +406,9 @@ static void rxrpc_local_processor(struct work_struct *work) container_of(work, struct rxrpc_local, processor); bool again; + if (local->dead) + return; + trace_rxrpc_local(local->debug_id, rxrpc_local_processing, refcount_read(&local->ref), NULL); diff --git a/net/rxrpc/peer_event.c b/net/rxrpc/peer_event.c index be032850ae8c..32561e9567fe 100644 --- a/net/rxrpc/peer_event.c +++ b/net/rxrpc/peer_event.c @@ -16,22 +16,105 @@ #include <net/sock.h> #include <net/af_rxrpc.h> #include <net/ip.h> +#include <net/icmp.h> #include "ar-internal.h" +static void rxrpc_adjust_mtu(struct rxrpc_peer *, unsigned int); static void rxrpc_store_error(struct rxrpc_peer *, struct sock_exterr_skb *); static void rxrpc_distribute_error(struct rxrpc_peer *, int, enum rxrpc_call_completion); /* - * Find the peer associated with an ICMP packet. + * Find the peer associated with an ICMPv4 packet. */ static struct rxrpc_peer *rxrpc_lookup_peer_icmp_rcu(struct rxrpc_local *local, - const struct sk_buff *skb, + struct sk_buff *skb, + unsigned int udp_offset, + unsigned int *info, struct sockaddr_rxrpc *srx) { - struct sock_exterr_skb *serr = SKB_EXT_ERR(skb); + struct iphdr *ip, *ip0 = ip_hdr(skb); + struct icmphdr *icmp = icmp_hdr(skb); + struct udphdr *udp = (struct udphdr *)(skb->data + udp_offset); - _enter(""); + _enter("%u,%u,%u", ip0->protocol, icmp->type, icmp->code); + + switch (icmp->type) { + case ICMP_DEST_UNREACH: + *info = ntohs(icmp->un.frag.mtu); + fallthrough; + case ICMP_TIME_EXCEEDED: + case ICMP_PARAMETERPROB: + ip = (struct iphdr *)((void *)icmp + 8); + break; + default: + return NULL; + } + + memset(srx, 0, sizeof(*srx)); + srx->transport_type = local->srx.transport_type; + srx->transport_len = local->srx.transport_len; + srx->transport.family = local->srx.transport.family; + + /* Can we see an ICMP4 packet on an ICMP6 listening socket? and vice + * versa? + */ + switch (srx->transport.family) { + case AF_INET: + srx->transport_len = sizeof(srx->transport.sin); + srx->transport.family = AF_INET; + srx->transport.sin.sin_port = udp->dest; + memcpy(&srx->transport.sin.sin_addr, &ip->daddr, + sizeof(struct in_addr)); + break; + +#ifdef CONFIG_AF_RXRPC_IPV6 + case AF_INET6: + srx->transport_len = sizeof(srx->transport.sin); + srx->transport.family = AF_INET; + srx->transport.sin.sin_port = udp->dest; + memcpy(&srx->transport.sin.sin_addr, &ip->daddr, + sizeof(struct in_addr)); + break; +#endif + + default: + WARN_ON_ONCE(1); + return NULL; + } + + _net("ICMP {%pISp}", &srx->transport); + return rxrpc_lookup_peer_rcu(local, srx); +} + +#ifdef CONFIG_AF_RXRPC_IPV6 +/* + * Find the peer associated with an ICMPv6 packet. + */ +static struct rxrpc_peer *rxrpc_lookup_peer_icmp6_rcu(struct rxrpc_local *local, + struct sk_buff *skb, + unsigned int udp_offset, + unsigned int *info, + struct sockaddr_rxrpc *srx) +{ + struct icmp6hdr *icmp = icmp6_hdr(skb); + struct ipv6hdr *ip, *ip0 = ipv6_hdr(skb); + struct udphdr *udp = (struct udphdr *)(skb->data + udp_offset); + + _enter("%u,%u,%u", ip0->nexthdr, icmp->icmp6_type, icmp->icmp6_code); + + switch (icmp->icmp6_type) { + case ICMPV6_DEST_UNREACH: + *info = ntohl(icmp->icmp6_mtu); + fallthrough; + case ICMPV6_PKT_TOOBIG: + case ICMPV6_TIME_EXCEED: + case ICMPV6_PARAMPROB: + ip = (struct ipv6hdr *)((void *)icmp + 8); + break; + default: + return NULL; + } memset(srx, 0, sizeof(*srx)); srx->transport_type = local->srx.transport_type; @@ -43,6 +126,165 @@ static struct rxrpc_peer *rxrpc_lookup_peer_icmp_rcu(struct rxrpc_local *local, */ switch (srx->transport.family) { case AF_INET: + _net("Rx ICMP6 on v4 sock"); + srx->transport_len = sizeof(srx->transport.sin); + srx->transport.family = AF_INET; + srx->transport.sin.sin_port = udp->dest; + memcpy(&srx->transport.sin.sin_addr, + &ip->daddr.s6_addr32[3], sizeof(struct in_addr)); + break; + case AF_INET6: + _net("Rx ICMP6"); + srx->transport.sin.sin_port = udp->dest; + memcpy(&srx->transport.sin6.sin6_addr, &ip->daddr, + sizeof(struct in6_addr)); + break; + default: + WARN_ON_ONCE(1); + return NULL; + } + + _net("ICMP {%pISp}", &srx->transport); + return rxrpc_lookup_peer_rcu(local, srx); +} +#endif /* CONFIG_AF_RXRPC_IPV6 */ + +/* + * Handle an error received on the local endpoint as a tunnel. + */ +void rxrpc_encap_err_rcv(struct sock *sk, struct sk_buff *skb, + unsigned int udp_offset) +{ + struct sock_extended_err ee; + struct sockaddr_rxrpc srx; + struct rxrpc_local *local; + struct rxrpc_peer *peer; + unsigned int info = 0; + int err; + u8 version = ip_hdr(skb)->version; + u8 type = icmp_hdr(skb)->type; + u8 code = icmp_hdr(skb)->code; + + rcu_read_lock(); + local = rcu_dereference_sk_user_data(sk); + if (unlikely(!local)) { + rcu_read_unlock(); + return; + } + + rxrpc_new_skb(skb, rxrpc_skb_received); + + switch (ip_hdr(skb)->version) { + case IPVERSION: + peer = rxrpc_lookup_peer_icmp_rcu(local, skb, udp_offset, + &info, &srx); + break; +#ifdef CONFIG_AF_RXRPC_IPV6 + case 6: + peer = rxrpc_lookup_peer_icmp6_rcu(local, skb, udp_offset, + &info, &srx); + break; +#endif + default: + rcu_read_unlock(); + return; + } + + if (peer && !rxrpc_get_peer_maybe(peer)) + peer = NULL; + if (!peer) { + rcu_read_unlock(); + return; + } + + memset(&ee, 0, sizeof(ee)); + + switch (version) { + case IPVERSION: + switch (type) { + case ICMP_DEST_UNREACH: + switch (code) { + case ICMP_FRAG_NEEDED: + rxrpc_adjust_mtu(peer, info); + rcu_read_unlock(); + rxrpc_put_peer(peer); + return; + default: + break; + } + + err = EHOSTUNREACH; + if (code <= NR_ICMP_UNREACH) { + /* Might want to do something different with + * non-fatal errors + */ + //harderr = icmp_err_convert[code].fatal; + err = icmp_err_convert[code].errno; + } + break; + + case ICMP_TIME_EXCEEDED: + err = EHOSTUNREACH; + break; + default: + err = EPROTO; + break; + } + + ee.ee_origin = SO_EE_ORIGIN_ICMP; + ee.ee_type = type; + ee.ee_code = code; + ee.ee_errno = err; + break; + +#ifdef CONFIG_AF_RXRPC_IPV6 + case 6: + switch (type) { + case ICMPV6_PKT_TOOBIG: + rxrpc_adjust_mtu(peer, info); + rcu_read_unlock(); + rxrpc_put_peer(peer); + return; + } + + icmpv6_err_convert(type, code, &err); + + if (err == EACCES) + err = EHOSTUNREACH; + + ee.ee_origin = SO_EE_ORIGIN_ICMP6; + ee.ee_type = type; + ee.ee_code = code; + ee.ee_errno = err; + break; +#endif + } + + trace_rxrpc_rx_icmp(peer, &ee, &srx); + + rxrpc_distribute_error(peer, err, RXRPC_CALL_NETWORK_ERROR); + rcu_read_unlock(); + rxrpc_put_peer(peer); +} + +/* + * Find the peer associated with a local error. + */ +static struct rxrpc_peer *rxrpc_lookup_peer_local_rcu(struct rxrpc_local *local, + const struct sk_buff *skb, + struct sockaddr_rxrpc *srx) +{ + struct sock_exterr_skb *serr = SKB_EXT_ERR(skb); + + _enter(""); + + memset(srx, 0, sizeof(*srx)); + srx->transport_type = local->srx.transport_type; + srx->transport_len = local->srx.transport_len; + srx->transport.family = local->srx.transport.family; + + switch (srx->transport.family) { + case AF_INET: srx->transport_len = sizeof(srx->transport.sin); srx->transport.family = AF_INET; srx->transport.sin.sin_port = serr->port; @@ -104,10 +346,8 @@ static struct rxrpc_peer *rxrpc_lookup_peer_icmp_rcu(struct rxrpc_local *local, /* * Handle an MTU/fragmentation problem. */ -static void rxrpc_adjust_mtu(struct rxrpc_peer *peer, struct sock_exterr_skb *serr) +static void rxrpc_adjust_mtu(struct rxrpc_peer *peer, unsigned int mtu) { - u32 mtu = serr->ee.ee_info; - _net("Rx ICMP Fragmentation Needed (%d)", mtu); /* wind down the local interface MTU */ @@ -148,7 +388,7 @@ void rxrpc_error_report(struct sock *sk) struct sock_exterr_skb *serr; struct sockaddr_rxrpc srx; struct rxrpc_local *local; - struct rxrpc_peer *peer; + struct rxrpc_peer *peer = NULL; struct sk_buff *skb; rcu_read_lock(); @@ -172,41 +412,20 @@ void rxrpc_error_report(struct sock *sk) } rxrpc_new_skb(skb, rxrpc_skb_received); serr = SKB_EXT_ERR(skb); - if (!skb->len && serr->ee.ee_origin == SO_EE_ORIGIN_TIMESTAMPING) { - _leave("UDP empty message"); - rcu_read_unlock(); - rxrpc_free_skb(skb, rxrpc_skb_freed); - return; - } - peer = rxrpc_lookup_peer_icmp_rcu(local, skb, &srx); - if (peer && !rxrpc_get_peer_maybe(peer)) - peer = NULL; - if (!peer) { - rcu_read_unlock(); - rxrpc_free_skb(skb, rxrpc_skb_freed); - _leave(" [no peer]"); - return; - } - - trace_rxrpc_rx_icmp(peer, &serr->ee, &srx); - - if ((serr->ee.ee_origin == SO_EE_ORIGIN_ICMP && - serr->ee.ee_type == ICMP_DEST_UNREACH && - serr->ee.ee_code == ICMP_FRAG_NEEDED)) { - rxrpc_adjust_mtu(peer, serr); - rcu_read_unlock(); - rxrpc_free_skb(skb, rxrpc_skb_freed); - rxrpc_put_peer(peer); - _leave(" [MTU update]"); - return; + if (serr->ee.ee_origin == SO_EE_ORIGIN_LOCAL) { + peer = rxrpc_lookup_peer_local_rcu(local, skb, &srx); + if (peer && !rxrpc_get_peer_maybe(peer)) + peer = NULL; + if (peer) { + trace_rxrpc_rx_icmp(peer, &serr->ee, &srx); + rxrpc_store_error(peer, serr); + } } - rxrpc_store_error(peer, serr); rcu_read_unlock(); rxrpc_free_skb(skb, rxrpc_skb_freed); rxrpc_put_peer(peer); - _leave(""); } diff --git a/net/rxrpc/recvmsg.c b/net/rxrpc/recvmsg.c index 250f23bc1c07..7e39c262fd79 100644 --- a/net/rxrpc/recvmsg.c +++ b/net/rxrpc/recvmsg.c @@ -771,46 +771,3 @@ call_complete: goto out; } EXPORT_SYMBOL(rxrpc_kernel_recv_data); - -/** - * rxrpc_kernel_get_reply_time - Get timestamp on first reply packet - * @sock: The socket that the call exists on - * @call: The call to query - * @_ts: Where to put the timestamp - * - * Retrieve the timestamp from the first DATA packet of the reply if it is - * in the ring. Returns true if successful, false if not. - */ -bool rxrpc_kernel_get_reply_time(struct socket *sock, struct rxrpc_call *call, - ktime_t *_ts) -{ - struct sk_buff *skb; - rxrpc_seq_t hard_ack, top, seq; - bool success = false; - - mutex_lock(&call->user_mutex); - - if (READ_ONCE(call->state) != RXRPC_CALL_CLIENT_RECV_REPLY) - goto out; - - hard_ack = call->rx_hard_ack; - if (hard_ack != 0) - goto out; - - seq = hard_ack + 1; - top = smp_load_acquire(&call->rx_top); - if (after(seq, top)) - goto out; - - skb = call->rxtx_buffer[seq & RXRPC_RXTX_BUFF_MASK]; - if (!skb) - goto out; - - *_ts = skb_get_ktime(skb); - success = true; - -out: - mutex_unlock(&call->user_mutex); - return success; -} -EXPORT_SYMBOL(rxrpc_kernel_get_reply_time); diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c index 258917a714c8..78fa0524156f 100644 --- a/net/rxrpc/rxkad.c +++ b/net/rxrpc/rxkad.c @@ -540,7 +540,7 @@ static int rxkad_verify_packet_2(struct rxrpc_call *call, struct sk_buff *skb, * directly into the target buffer. */ sg = _sg; - nsg = skb_shinfo(skb)->nr_frags; + nsg = skb_shinfo(skb)->nr_frags + 1; if (nsg <= 4) { nsg = 4; } else { diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 817065aa2833..9b31a10cc639 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -676,6 +676,31 @@ int tcf_idr_search(struct tc_action_net *tn, struct tc_action **a, u32 index) } EXPORT_SYMBOL(tcf_idr_search); +static int __tcf_generic_walker(struct net *net, struct sk_buff *skb, + struct netlink_callback *cb, int type, + const struct tc_action_ops *ops, + struct netlink_ext_ack *extack) +{ + struct tc_action_net *tn = net_generic(net, ops->net_id); + + if (unlikely(ops->walk)) + return ops->walk(net, skb, cb, type, ops, extack); + + return tcf_generic_walker(tn, skb, cb, type, ops, extack); +} + +static int __tcf_idr_search(struct net *net, + const struct tc_action_ops *ops, + struct tc_action **a, u32 index) +{ + struct tc_action_net *tn = net_generic(net, ops->net_id); + + if (unlikely(ops->lookup)) + return ops->lookup(net, a, index); + + return tcf_idr_search(tn, a, index); +} + static int tcf_idr_delete_index(struct tcf_idrinfo *idrinfo, u32 index) { struct tc_action *p; @@ -926,7 +951,7 @@ int tcf_register_action(struct tc_action_ops *act, struct tc_action_ops *a; int ret; - if (!act->act || !act->dump || !act->init || !act->walk || !act->lookup) + if (!act->act || !act->dump || !act->init) return -EINVAL; /* We have to register pernet ops before making the action ops visible, @@ -1638,7 +1663,7 @@ static struct tc_action *tcf_action_get_1(struct net *net, struct nlattr *nla, goto err_out; } err = -ENOENT; - if (ops->lookup(net, &a, index) == 0) { + if (__tcf_idr_search(net, ops, &a, index) == 0) { NL_SET_ERR_MSG(extack, "TC action with specified index not found"); goto err_mod; } @@ -1703,7 +1728,7 @@ static int tca_action_flush(struct net *net, struct nlattr *nla, goto out_module_put; } - err = ops->walk(net, skb, &dcb, RTM_DELACTION, ops, extack); + err = __tcf_generic_walker(net, skb, &dcb, RTM_DELACTION, ops, extack); if (err <= 0) { nla_nest_cancel(skb, nest); goto out_module_put; @@ -2121,7 +2146,7 @@ static int tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb) if (nest == NULL) goto out_module_put; - ret = a_o->walk(net, skb, cb, RTM_GETACTION, a_o, NULL); + ret = __tcf_generic_walker(net, skb, cb, RTM_GETACTION, a_o, NULL); if (ret < 0) goto out_module_put; diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c index fea2d78b9ddc..b79eee44e24e 100644 --- a/net/sched/act_bpf.c +++ b/net/sched/act_bpf.c @@ -29,7 +29,6 @@ struct tcf_bpf_cfg { bool is_ebpf; }; -static unsigned int bpf_net_id; static struct tc_action_ops act_bpf_ops; static int tcf_bpf_act(struct sk_buff *skb, const struct tc_action *act, @@ -280,7 +279,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { - struct tc_action_net *tn = net_generic(net, bpf_net_id); + struct tc_action_net *tn = net_generic(net, act_bpf_ops.net_id); bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_ACT_BPF_MAX + 1]; struct tcf_chain *goto_ch = NULL; @@ -334,7 +333,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla, is_bpf = tb[TCA_ACT_BPF_OPS_LEN] && tb[TCA_ACT_BPF_OPS]; is_ebpf = tb[TCA_ACT_BPF_FD]; - if ((!is_bpf && !is_ebpf) || (is_bpf && is_ebpf)) { + if (is_bpf == is_ebpf) { ret = -EINVAL; goto put_chain; } @@ -390,23 +389,6 @@ static void tcf_bpf_cleanup(struct tc_action *act) tcf_bpf_cfg_cleanup(&tmp); } -static int tcf_bpf_walker(struct net *net, struct sk_buff *skb, - struct netlink_callback *cb, int type, - const struct tc_action_ops *ops, - struct netlink_ext_ack *extack) -{ - struct tc_action_net *tn = net_generic(net, bpf_net_id); - - return tcf_generic_walker(tn, skb, cb, type, ops, extack); -} - -static int tcf_bpf_search(struct net *net, struct tc_action **a, u32 index) -{ - struct tc_action_net *tn = net_generic(net, bpf_net_id); - - return tcf_idr_search(tn, a, index); -} - static struct tc_action_ops act_bpf_ops __read_mostly = { .kind = "bpf", .id = TCA_ID_BPF, @@ -415,27 +397,25 @@ static struct tc_action_ops act_bpf_ops __read_mostly = { .dump = tcf_bpf_dump, .cleanup = tcf_bpf_cleanup, .init = tcf_bpf_init, - .walk = tcf_bpf_walker, - .lookup = tcf_bpf_search, .size = sizeof(struct tcf_bpf), }; static __net_init int bpf_init_net(struct net *net) { - struct tc_action_net *tn = net_generic(net, bpf_net_id); + struct tc_action_net *tn = net_generic(net, act_bpf_ops.net_id); return tc_action_net_init(net, tn, &act_bpf_ops); } static void __net_exit bpf_exit_net(struct list_head *net_list) { - tc_action_net_exit(net_list, bpf_net_id); + tc_action_net_exit(net_list, act_bpf_ops.net_id); } static struct pernet_operations bpf_net_ops = { .init = bpf_init_net, .exit_batch = bpf_exit_net, - .id = &bpf_net_id, + .id = &act_bpf_ops.net_id, .size = sizeof(struct tc_action_net), }; diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c index 09e2aafc8943..66b143bb04ac 100644 --- a/net/sched/act_connmark.c +++ b/net/sched/act_connmark.c @@ -25,7 +25,6 @@ #include <net/netfilter/nf_conntrack_core.h> #include <net/netfilter/nf_conntrack_zones.h> -static unsigned int connmark_net_id; static struct tc_action_ops act_connmark_ops; static int tcf_connmark_act(struct sk_buff *skb, const struct tc_action *a, @@ -99,7 +98,7 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { - struct tc_action_net *tn = net_generic(net, connmark_net_id); + struct tc_action_net *tn = net_generic(net, act_connmark_ops.net_id); struct nlattr *tb[TCA_CONNMARK_MAX + 1]; bool bind = flags & TCA_ACT_FLAGS_BIND; struct tcf_chain *goto_ch = NULL; @@ -200,23 +199,6 @@ nla_put_failure: return -1; } -static int tcf_connmark_walker(struct net *net, struct sk_buff *skb, - struct netlink_callback *cb, int type, - const struct tc_action_ops *ops, - struct netlink_ext_ack *extack) -{ - struct tc_action_net *tn = net_generic(net, connmark_net_id); - - return tcf_generic_walker(tn, skb, cb, type, ops, extack); -} - -static int tcf_connmark_search(struct net *net, struct tc_action **a, u32 index) -{ - struct tc_action_net *tn = net_generic(net, connmark_net_id); - - return tcf_idr_search(tn, a, index); -} - static struct tc_action_ops act_connmark_ops = { .kind = "connmark", .id = TCA_ID_CONNMARK, @@ -224,27 +206,25 @@ static struct tc_action_ops act_connmark_ops = { .act = tcf_connmark_act, .dump = tcf_connmark_dump, .init = tcf_connmark_init, - .walk = tcf_connmark_walker, - .lookup = tcf_connmark_search, .size = sizeof(struct tcf_connmark_info), }; static __net_init int connmark_init_net(struct net *net) { - struct tc_action_net *tn = net_generic(net, connmark_net_id); + struct tc_action_net *tn = net_generic(net, act_connmark_ops.net_id); return tc_action_net_init(net, tn, &act_connmark_ops); } static void __net_exit connmark_exit_net(struct list_head *net_list) { - tc_action_net_exit(net_list, connmark_net_id); + tc_action_net_exit(net_list, act_connmark_ops.net_id); } static struct pernet_operations connmark_net_ops = { .init = connmark_init_net, .exit_batch = connmark_exit_net, - .id = &connmark_net_id, + .id = &act_connmark_ops.net_id, .size = sizeof(struct tc_action_net), }; diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c index 22847ee009ef..1366adf9b909 100644 --- a/net/sched/act_csum.c +++ b/net/sched/act_csum.c @@ -37,7 +37,6 @@ static const struct nla_policy csum_policy[TCA_CSUM_MAX + 1] = { [TCA_CSUM_PARMS] = { .len = sizeof(struct tc_csum), }, }; -static unsigned int csum_net_id; static struct tc_action_ops act_csum_ops; static int tcf_csum_init(struct net *net, struct nlattr *nla, @@ -45,7 +44,7 @@ static int tcf_csum_init(struct net *net, struct nlattr *nla, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { - struct tc_action_net *tn = net_generic(net, csum_net_id); + struct tc_action_net *tn = net_generic(net, act_csum_ops.net_id); bool bind = flags & TCA_ACT_FLAGS_BIND; struct tcf_csum_params *params_new; struct nlattr *tb[TCA_CSUM_MAX + 1]; @@ -673,23 +672,6 @@ static void tcf_csum_cleanup(struct tc_action *a) kfree_rcu(params, rcu); } -static int tcf_csum_walker(struct net *net, struct sk_buff *skb, - struct netlink_callback *cb, int type, - const struct tc_action_ops *ops, - struct netlink_ext_ack *extack) -{ - struct tc_action_net *tn = net_generic(net, csum_net_id); - - return tcf_generic_walker(tn, skb, cb, type, ops, extack); -} - -static int tcf_csum_search(struct net *net, struct tc_action **a, u32 index) -{ - struct tc_action_net *tn = net_generic(net, csum_net_id); - - return tcf_idr_search(tn, a, index); -} - static size_t tcf_csum_get_fill_size(const struct tc_action *act) { return nla_total_size(sizeof(struct tc_csum)); @@ -722,8 +704,6 @@ static struct tc_action_ops act_csum_ops = { .dump = tcf_csum_dump, .init = tcf_csum_init, .cleanup = tcf_csum_cleanup, - .walk = tcf_csum_walker, - .lookup = tcf_csum_search, .get_fill_size = tcf_csum_get_fill_size, .offload_act_setup = tcf_csum_offload_act_setup, .size = sizeof(struct tcf_csum), @@ -731,20 +711,20 @@ static struct tc_action_ops act_csum_ops = { static __net_init int csum_init_net(struct net *net) { - struct tc_action_net *tn = net_generic(net, csum_net_id); + struct tc_action_net *tn = net_generic(net, act_csum_ops.net_id); return tc_action_net_init(net, tn, &act_csum_ops); } static void __net_exit csum_exit_net(struct list_head *net_list) { - tc_action_net_exit(net_list, csum_net_id); + tc_action_net_exit(net_list, act_csum_ops.net_id); } static struct pernet_operations csum_net_ops = { .init = csum_init_net, .exit_batch = csum_exit_net, - .id = &csum_net_id, + .id = &act_csum_ops.net_id, .size = sizeof(struct tc_action_net), }; diff --git a/net/sched/act_ct.c b/net/sched/act_ct.c index d55afb8d14be..b38d91d6b249 100644 --- a/net/sched/act_ct.c +++ b/net/sched/act_ct.c @@ -649,7 +649,6 @@ static void tcf_ct_flow_tables_uninit(void) } static struct tc_action_ops act_ct_ops; -static unsigned int ct_net_id; struct tc_ct_action_net { struct tc_action_net tn; /* Must be first */ @@ -697,7 +696,6 @@ drop_ct: static int tcf_ct_skb_network_trim(struct sk_buff *skb, int family) { unsigned int len; - int err; switch (family) { case NFPROTO_IPV4: @@ -711,9 +709,7 @@ static int tcf_ct_skb_network_trim(struct sk_buff *skb, int family) len = skb->len; } - err = pskb_trim_rcsum(skb, len); - - return err; + return pskb_trim_rcsum(skb, len); } static u8 tcf_ct_skb_nf_family(struct sk_buff *skb) @@ -1255,7 +1251,7 @@ static int tcf_ct_fill_params(struct net *net, struct nlattr **tb, struct netlink_ext_ack *extack) { - struct tc_ct_action_net *tn = net_generic(net, ct_net_id); + struct tc_ct_action_net *tn = net_generic(net, act_ct_ops.net_id); struct nf_conntrack_zone zone; struct nf_conn *tmpl; int err; @@ -1330,7 +1326,7 @@ static int tcf_ct_init(struct net *net, struct nlattr *nla, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { - struct tc_action_net *tn = net_generic(net, ct_net_id); + struct tc_action_net *tn = net_generic(net, act_ct_ops.net_id); bool bind = flags & TCA_ACT_FLAGS_BIND; struct tcf_ct_params *params = NULL; struct nlattr *tb[TCA_CT_MAX + 1]; @@ -1394,7 +1390,7 @@ static int tcf_ct_init(struct net *net, struct nlattr *nla, err = tcf_ct_flow_table_get(net, params); if (err) - goto cleanup; + goto cleanup_params; spin_lock_bh(&c->tcf_lock); goto_ch = tcf_action_set_ctrlact(*a, parm->action, goto_ch); @@ -1409,6 +1405,9 @@ static int tcf_ct_init(struct net *net, struct nlattr *nla, return res; +cleanup_params: + if (params->tmpl) + nf_ct_put(params->tmpl); cleanup: if (goto_ch) tcf_chain_put_by_act(goto_ch); @@ -1558,23 +1557,6 @@ nla_put_failure: return -1; } -static int tcf_ct_walker(struct net *net, struct sk_buff *skb, - struct netlink_callback *cb, int type, - const struct tc_action_ops *ops, - struct netlink_ext_ack *extack) -{ - struct tc_action_net *tn = net_generic(net, ct_net_id); - - return tcf_generic_walker(tn, skb, cb, type, ops, extack); -} - -static int tcf_ct_search(struct net *net, struct tc_action **a, u32 index) -{ - struct tc_action_net *tn = net_generic(net, ct_net_id); - - return tcf_idr_search(tn, a, index); -} - static void tcf_stats_update(struct tc_action *a, u64 bytes, u64 packets, u64 drops, u64 lastuse, bool hw) { @@ -1613,8 +1595,6 @@ static struct tc_action_ops act_ct_ops = { .dump = tcf_ct_dump, .init = tcf_ct_init, .cleanup = tcf_ct_cleanup, - .walk = tcf_ct_walker, - .lookup = tcf_ct_search, .stats_update = tcf_stats_update, .offload_act_setup = tcf_ct_offload_act_setup, .size = sizeof(struct tcf_ct), @@ -1623,7 +1603,7 @@ static struct tc_action_ops act_ct_ops = { static __net_init int ct_init_net(struct net *net) { unsigned int n_bits = sizeof_field(struct tcf_ct_params, labels) * 8; - struct tc_ct_action_net *tn = net_generic(net, ct_net_id); + struct tc_ct_action_net *tn = net_generic(net, act_ct_ops.net_id); if (nf_connlabels_get(net, n_bits - 1)) { tn->labels = false; @@ -1641,20 +1621,20 @@ static void __net_exit ct_exit_net(struct list_head *net_list) rtnl_lock(); list_for_each_entry(net, net_list, exit_list) { - struct tc_ct_action_net *tn = net_generic(net, ct_net_id); + struct tc_ct_action_net *tn = net_generic(net, act_ct_ops.net_id); if (tn->labels) nf_connlabels_put(net); } rtnl_unlock(); - tc_action_net_exit(net_list, ct_net_id); + tc_action_net_exit(net_list, act_ct_ops.net_id); } static struct pernet_operations ct_net_ops = { .init = ct_init_net, .exit_batch = ct_exit_net, - .id = &ct_net_id, + .id = &act_ct_ops.net_id, .size = sizeof(struct tc_ct_action_net), }; diff --git a/net/sched/act_ctinfo.c b/net/sched/act_ctinfo.c index 0281e45987a4..d4102f0a9abd 100644 --- a/net/sched/act_ctinfo.c +++ b/net/sched/act_ctinfo.c @@ -25,7 +25,6 @@ #include <net/netfilter/nf_conntrack_zones.h> static struct tc_action_ops act_ctinfo_ops; -static unsigned int ctinfo_net_id; static void tcf_ctinfo_dscp_set(struct nf_conn *ct, struct tcf_ctinfo *ca, struct tcf_ctinfo_params *cp, @@ -157,7 +156,7 @@ static int tcf_ctinfo_init(struct net *net, struct nlattr *nla, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { - struct tc_action_net *tn = net_generic(net, ctinfo_net_id); + struct tc_action_net *tn = net_generic(net, act_ctinfo_ops.net_id); bool bind = flags & TCA_ACT_FLAGS_BIND; u32 dscpmask = 0, dscpstatemask, index; struct nlattr *tb[TCA_CTINFO_MAX + 1]; @@ -342,23 +341,6 @@ nla_put_failure: return -1; } -static int tcf_ctinfo_walker(struct net *net, struct sk_buff *skb, - struct netlink_callback *cb, int type, - const struct tc_action_ops *ops, - struct netlink_ext_ack *extack) -{ - struct tc_action_net *tn = net_generic(net, ctinfo_net_id); - - return tcf_generic_walker(tn, skb, cb, type, ops, extack); -} - -static int tcf_ctinfo_search(struct net *net, struct tc_action **a, u32 index) -{ - struct tc_action_net *tn = net_generic(net, ctinfo_net_id); - - return tcf_idr_search(tn, a, index); -} - static void tcf_ctinfo_cleanup(struct tc_action *a) { struct tcf_ctinfo *ci = to_ctinfo(a); @@ -377,27 +359,25 @@ static struct tc_action_ops act_ctinfo_ops = { .dump = tcf_ctinfo_dump, .init = tcf_ctinfo_init, .cleanup= tcf_ctinfo_cleanup, - .walk = tcf_ctinfo_walker, - .lookup = tcf_ctinfo_search, .size = sizeof(struct tcf_ctinfo), }; static __net_init int ctinfo_init_net(struct net *net) { - struct tc_action_net *tn = net_generic(net, ctinfo_net_id); + struct tc_action_net *tn = net_generic(net, act_ctinfo_ops.net_id); return tc_action_net_init(net, tn, &act_ctinfo_ops); } static void __net_exit ctinfo_exit_net(struct list_head *net_list) { - tc_action_net_exit(net_list, ctinfo_net_id); + tc_action_net_exit(net_list, act_ctinfo_ops.net_id); } static struct pernet_operations ctinfo_net_ops = { .init = ctinfo_init_net, .exit_batch = ctinfo_exit_net, - .id = &ctinfo_net_id, + .id = &act_ctinfo_ops.net_id, .size = sizeof(struct tc_action_net), }; diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c index ac29d1065232..62d682b96b88 100644 --- a/net/sched/act_gact.c +++ b/net/sched/act_gact.c @@ -19,14 +19,13 @@ #include <linux/tc_act/tc_gact.h> #include <net/tc_act/tc_gact.h> -static unsigned int gact_net_id; static struct tc_action_ops act_gact_ops; #ifdef CONFIG_GACT_PROB static int gact_net_rand(struct tcf_gact *gact) { smp_rmb(); /* coupled with smp_wmb() in tcf_gact_init() */ - if (prandom_u32() % gact->tcfg_pval) + if (prandom_u32_max(gact->tcfg_pval)) return gact->tcf_action; return gact->tcfg_paction; } @@ -55,7 +54,7 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { - struct tc_action_net *tn = net_generic(net, gact_net_id); + struct tc_action_net *tn = net_generic(net, act_gact_ops.net_id); bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_GACT_MAX + 1]; struct tcf_chain *goto_ch = NULL; @@ -222,23 +221,6 @@ nla_put_failure: return -1; } -static int tcf_gact_walker(struct net *net, struct sk_buff *skb, - struct netlink_callback *cb, int type, - const struct tc_action_ops *ops, - struct netlink_ext_ack *extack) -{ - struct tc_action_net *tn = net_generic(net, gact_net_id); - - return tcf_generic_walker(tn, skb, cb, type, ops, extack); -} - -static int tcf_gact_search(struct net *net, struct tc_action **a, u32 index) -{ - struct tc_action_net *tn = net_generic(net, gact_net_id); - - return tcf_idr_search(tn, a, index); -} - static size_t tcf_gact_get_fill_size(const struct tc_action *act) { size_t sz = nla_total_size(sizeof(struct tc_gact)); /* TCA_GACT_PARMS */ @@ -308,8 +290,6 @@ static struct tc_action_ops act_gact_ops = { .stats_update = tcf_gact_stats_update, .dump = tcf_gact_dump, .init = tcf_gact_init, - .walk = tcf_gact_walker, - .lookup = tcf_gact_search, .get_fill_size = tcf_gact_get_fill_size, .offload_act_setup = tcf_gact_offload_act_setup, .size = sizeof(struct tcf_gact), @@ -317,20 +297,20 @@ static struct tc_action_ops act_gact_ops = { static __net_init int gact_init_net(struct net *net) { - struct tc_action_net *tn = net_generic(net, gact_net_id); + struct tc_action_net *tn = net_generic(net, act_gact_ops.net_id); return tc_action_net_init(net, tn, &act_gact_ops); } static void __net_exit gact_exit_net(struct list_head *net_list) { - tc_action_net_exit(net_list, gact_net_id); + tc_action_net_exit(net_list, act_gact_ops.net_id); } static struct pernet_operations gact_net_ops = { .init = gact_init_net, .exit_batch = gact_exit_net, - .id = &gact_net_id, + .id = &act_gact_ops.net_id, .size = sizeof(struct tc_action_net), }; diff --git a/net/sched/act_gate.c b/net/sched/act_gate.c index fd5155274733..3049878e7315 100644 --- a/net/sched/act_gate.c +++ b/net/sched/act_gate.c @@ -15,7 +15,6 @@ #include <net/pkt_cls.h> #include <net/tc_act/tc_gate.h> -static unsigned int gate_net_id; static struct tc_action_ops act_gate_ops; static ktime_t gate_get_time(struct tcf_gate *gact) @@ -298,7 +297,7 @@ static int tcf_gate_init(struct net *net, struct nlattr *nla, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { - struct tc_action_net *tn = net_generic(net, gate_net_id); + struct tc_action_net *tn = net_generic(net, act_gate_ops.net_id); enum tk_offsets tk_offset = TK_OFFS_TAI; bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_GATE_MAX + 1]; @@ -565,16 +564,6 @@ nla_put_failure: return -1; } -static int tcf_gate_walker(struct net *net, struct sk_buff *skb, - struct netlink_callback *cb, int type, - const struct tc_action_ops *ops, - struct netlink_ext_ack *extack) -{ - struct tc_action_net *tn = net_generic(net, gate_net_id); - - return tcf_generic_walker(tn, skb, cb, type, ops, extack); -} - static void tcf_gate_stats_update(struct tc_action *a, u64 bytes, u64 packets, u64 drops, u64 lastuse, bool hw) { @@ -585,13 +574,6 @@ static void tcf_gate_stats_update(struct tc_action *a, u64 bytes, u64 packets, tm->lastuse = max_t(u64, tm->lastuse, lastuse); } -static int tcf_gate_search(struct net *net, struct tc_action **a, u32 index) -{ - struct tc_action_net *tn = net_generic(net, gate_net_id); - - return tcf_idr_search(tn, a, index); -} - static size_t tcf_gate_get_fill_size(const struct tc_action *act) { return nla_total_size(sizeof(struct tc_gate)); @@ -654,30 +636,28 @@ static struct tc_action_ops act_gate_ops = { .dump = tcf_gate_dump, .init = tcf_gate_init, .cleanup = tcf_gate_cleanup, - .walk = tcf_gate_walker, .stats_update = tcf_gate_stats_update, .get_fill_size = tcf_gate_get_fill_size, - .lookup = tcf_gate_search, .offload_act_setup = tcf_gate_offload_act_setup, .size = sizeof(struct tcf_gate), }; static __net_init int gate_init_net(struct net *net) { - struct tc_action_net *tn = net_generic(net, gate_net_id); + struct tc_action_net *tn = net_generic(net, act_gate_ops.net_id); return tc_action_net_init(net, tn, &act_gate_ops); } static void __net_exit gate_exit_net(struct list_head *net_list) { - tc_action_net_exit(net_list, gate_net_id); + tc_action_net_exit(net_list, act_gate_ops.net_id); } static struct pernet_operations gate_net_ops = { .init = gate_init_net, .exit_batch = gate_exit_net, - .id = &gate_net_id, + .id = &act_gate_ops.net_id, .size = sizeof(struct tc_action_net), }; diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c index 41ba55e60b1b..41d63b33461d 100644 --- a/net/sched/act_ife.c +++ b/net/sched/act_ife.c @@ -30,7 +30,6 @@ #include <linux/etherdevice.h> #include <net/ife.h> -static unsigned int ife_net_id; static int max_metacnt = IFE_META_MAX + 1; static struct tc_action_ops act_ife_ops; @@ -482,7 +481,7 @@ static int tcf_ife_init(struct net *net, struct nlattr *nla, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { - struct tc_action_net *tn = net_generic(net, ife_net_id); + struct tc_action_net *tn = net_generic(net, act_ife_ops.net_id); bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_IFE_MAX + 1]; struct nlattr *tb2[IFE_META_MAX + 1]; @@ -878,23 +877,6 @@ static int tcf_ife_act(struct sk_buff *skb, const struct tc_action *a, return tcf_ife_decode(skb, a, res); } -static int tcf_ife_walker(struct net *net, struct sk_buff *skb, - struct netlink_callback *cb, int type, - const struct tc_action_ops *ops, - struct netlink_ext_ack *extack) -{ - struct tc_action_net *tn = net_generic(net, ife_net_id); - - return tcf_generic_walker(tn, skb, cb, type, ops, extack); -} - -static int tcf_ife_search(struct net *net, struct tc_action **a, u32 index) -{ - struct tc_action_net *tn = net_generic(net, ife_net_id); - - return tcf_idr_search(tn, a, index); -} - static struct tc_action_ops act_ife_ops = { .kind = "ife", .id = TCA_ID_IFE, @@ -903,27 +885,25 @@ static struct tc_action_ops act_ife_ops = { .dump = tcf_ife_dump, .cleanup = tcf_ife_cleanup, .init = tcf_ife_init, - .walk = tcf_ife_walker, - .lookup = tcf_ife_search, .size = sizeof(struct tcf_ife_info), }; static __net_init int ife_init_net(struct net *net) { - struct tc_action_net *tn = net_generic(net, ife_net_id); + struct tc_action_net *tn = net_generic(net, act_ife_ops.net_id); return tc_action_net_init(net, tn, &act_ife_ops); } static void __net_exit ife_exit_net(struct list_head *net_list) { - tc_action_net_exit(net_list, ife_net_id); + tc_action_net_exit(net_list, act_ife_ops.net_id); } static struct pernet_operations ife_net_ops = { .init = ife_init_net, .exit_batch = ife_exit_net, - .id = &ife_net_id, + .id = &act_ife_ops.net_id, .size = sizeof(struct tc_action_net), }; diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index 2f3d507c24a1..1625e1037416 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -24,10 +24,7 @@ #include <linux/netfilter_ipv4/ip_tables.h> -static unsigned int ipt_net_id; static struct tc_action_ops act_ipt_ops; - -static unsigned int xt_net_id; static struct tc_action_ops act_xt_ops; static int ipt_init_target(struct net *net, struct xt_entry_target *t, @@ -206,8 +203,8 @@ static int tcf_ipt_init(struct net *net, struct nlattr *nla, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { - return __tcf_ipt_init(net, ipt_net_id, nla, est, a, &act_ipt_ops, - tp, flags); + return __tcf_ipt_init(net, act_ipt_ops.net_id, nla, est, + a, &act_ipt_ops, tp, flags); } static int tcf_xt_init(struct net *net, struct nlattr *nla, @@ -215,8 +212,8 @@ static int tcf_xt_init(struct net *net, struct nlattr *nla, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { - return __tcf_ipt_init(net, xt_net_id, nla, est, a, &act_xt_ops, - tp, flags); + return __tcf_ipt_init(net, act_xt_ops.net_id, nla, est, + a, &act_xt_ops, tp, flags); } static int tcf_ipt_act(struct sk_buff *skb, const struct tc_action *a, @@ -316,23 +313,6 @@ nla_put_failure: return -1; } -static int tcf_ipt_walker(struct net *net, struct sk_buff *skb, - struct netlink_callback *cb, int type, - const struct tc_action_ops *ops, - struct netlink_ext_ack *extack) -{ - struct tc_action_net *tn = net_generic(net, ipt_net_id); - - return tcf_generic_walker(tn, skb, cb, type, ops, extack); -} - -static int tcf_ipt_search(struct net *net, struct tc_action **a, u32 index) -{ - struct tc_action_net *tn = net_generic(net, ipt_net_id); - - return tcf_idr_search(tn, a, index); -} - static struct tc_action_ops act_ipt_ops = { .kind = "ipt", .id = TCA_ID_IPT, @@ -341,47 +321,28 @@ static struct tc_action_ops act_ipt_ops = { .dump = tcf_ipt_dump, .cleanup = tcf_ipt_release, .init = tcf_ipt_init, - .walk = tcf_ipt_walker, - .lookup = tcf_ipt_search, .size = sizeof(struct tcf_ipt), }; static __net_init int ipt_init_net(struct net *net) { - struct tc_action_net *tn = net_generic(net, ipt_net_id); + struct tc_action_net *tn = net_generic(net, act_ipt_ops.net_id); return tc_action_net_init(net, tn, &act_ipt_ops); } static void __net_exit ipt_exit_net(struct list_head *net_list) { - tc_action_net_exit(net_list, ipt_net_id); + tc_action_net_exit(net_list, act_ipt_ops.net_id); } static struct pernet_operations ipt_net_ops = { .init = ipt_init_net, .exit_batch = ipt_exit_net, - .id = &ipt_net_id, + .id = &act_ipt_ops.net_id, .size = sizeof(struct tc_action_net), }; -static int tcf_xt_walker(struct net *net, struct sk_buff *skb, - struct netlink_callback *cb, int type, - const struct tc_action_ops *ops, - struct netlink_ext_ack *extack) -{ - struct tc_action_net *tn = net_generic(net, xt_net_id); - - return tcf_generic_walker(tn, skb, cb, type, ops, extack); -} - -static int tcf_xt_search(struct net *net, struct tc_action **a, u32 index) -{ - struct tc_action_net *tn = net_generic(net, xt_net_id); - - return tcf_idr_search(tn, a, index); -} - static struct tc_action_ops act_xt_ops = { .kind = "xt", .id = TCA_ID_XT, @@ -390,27 +351,25 @@ static struct tc_action_ops act_xt_ops = { .dump = tcf_ipt_dump, .cleanup = tcf_ipt_release, .init = tcf_xt_init, - .walk = tcf_xt_walker, - .lookup = tcf_xt_search, .size = sizeof(struct tcf_ipt), }; static __net_init int xt_init_net(struct net *net) { - struct tc_action_net *tn = net_generic(net, xt_net_id); + struct tc_action_net *tn = net_generic(net, act_xt_ops.net_id); return tc_action_net_init(net, tn, &act_xt_ops); } static void __net_exit xt_exit_net(struct list_head *net_list) { - tc_action_net_exit(net_list, xt_net_id); + tc_action_net_exit(net_list, act_xt_ops.net_id); } static struct pernet_operations xt_net_ops = { .init = xt_init_net, .exit_batch = xt_exit_net, - .id = &xt_net_id, + .id = &act_xt_ops.net_id, .size = sizeof(struct tc_action_net), }; diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c index a1d70cf86843..b8ad6ae282c0 100644 --- a/net/sched/act_mirred.c +++ b/net/sched/act_mirred.c @@ -86,7 +86,6 @@ static const struct nla_policy mirred_policy[TCA_MIRRED_MAX + 1] = { [TCA_MIRRED_PARMS] = { .len = sizeof(struct tc_mirred) }, }; -static unsigned int mirred_net_id; static struct tc_action_ops act_mirred_ops; static int tcf_mirred_init(struct net *net, struct nlattr *nla, @@ -94,7 +93,7 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { - struct tc_action_net *tn = net_generic(net, mirred_net_id); + struct tc_action_net *tn = net_generic(net, act_mirred_ops.net_id); bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_MIRRED_MAX + 1]; struct tcf_chain *goto_ch = NULL; @@ -306,8 +305,7 @@ static int tcf_mirred_act(struct sk_buff *skb, const struct tc_action *a, /* let's the caller reinsert the packet, if possible */ if (use_reinsert) { - res->ingress = want_ingress; - err = tcf_mirred_forward(res->ingress, skb); + err = tcf_mirred_forward(want_ingress, skb); if (err) tcf_action_inc_overlimit_qstats(&m->common); __this_cpu_dec(mirred_rec_level); @@ -373,23 +371,6 @@ nla_put_failure: return -1; } -static int tcf_mirred_walker(struct net *net, struct sk_buff *skb, - struct netlink_callback *cb, int type, - const struct tc_action_ops *ops, - struct netlink_ext_ack *extack) -{ - struct tc_action_net *tn = net_generic(net, mirred_net_id); - - return tcf_generic_walker(tn, skb, cb, type, ops, extack); -} - -static int tcf_mirred_search(struct net *net, struct tc_action **a, u32 index) -{ - struct tc_action_net *tn = net_generic(net, mirred_net_id); - - return tcf_idr_search(tn, a, index); -} - static int mirred_device_event(struct notifier_block *unused, unsigned long event, void *ptr) { @@ -510,8 +491,6 @@ static struct tc_action_ops act_mirred_ops = { .dump = tcf_mirred_dump, .cleanup = tcf_mirred_release, .init = tcf_mirred_init, - .walk = tcf_mirred_walker, - .lookup = tcf_mirred_search, .get_fill_size = tcf_mirred_get_fill_size, .offload_act_setup = tcf_mirred_offload_act_setup, .size = sizeof(struct tcf_mirred), @@ -520,20 +499,20 @@ static struct tc_action_ops act_mirred_ops = { static __net_init int mirred_init_net(struct net *net) { - struct tc_action_net *tn = net_generic(net, mirred_net_id); + struct tc_action_net *tn = net_generic(net, act_mirred_ops.net_id); return tc_action_net_init(net, tn, &act_mirred_ops); } static void __net_exit mirred_exit_net(struct list_head *net_list) { - tc_action_net_exit(net_list, mirred_net_id); + tc_action_net_exit(net_list, act_mirred_ops.net_id); } static struct pernet_operations mirred_net_ops = { .init = mirred_init_net, .exit_batch = mirred_exit_net, - .id = &mirred_net_id, + .id = &act_mirred_ops.net_id, .size = sizeof(struct tc_action_net), }; diff --git a/net/sched/act_mpls.c b/net/sched/act_mpls.c index adabeccb63e1..8ad25cc8ccd5 100644 --- a/net/sched/act_mpls.c +++ b/net/sched/act_mpls.c @@ -15,7 +15,6 @@ #include <net/pkt_cls.h> #include <net/tc_act/tc_mpls.h> -static unsigned int mpls_net_id; static struct tc_action_ops act_mpls_ops; #define ACT_MPLS_TTL_DEFAULT 255 @@ -155,7 +154,7 @@ static int tcf_mpls_init(struct net *net, struct nlattr *nla, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { - struct tc_action_net *tn = net_generic(net, mpls_net_id); + struct tc_action_net *tn = net_generic(net, act_mpls_ops.net_id); bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_MPLS_MAX + 1]; struct tcf_chain *goto_ch = NULL; @@ -367,23 +366,6 @@ nla_put_failure: return -EMSGSIZE; } -static int tcf_mpls_walker(struct net *net, struct sk_buff *skb, - struct netlink_callback *cb, int type, - const struct tc_action_ops *ops, - struct netlink_ext_ack *extack) -{ - struct tc_action_net *tn = net_generic(net, mpls_net_id); - - return tcf_generic_walker(tn, skb, cb, type, ops, extack); -} - -static int tcf_mpls_search(struct net *net, struct tc_action **a, u32 index) -{ - struct tc_action_net *tn = net_generic(net, mpls_net_id); - - return tcf_idr_search(tn, a, index); -} - static int tcf_mpls_offload_act_setup(struct tc_action *act, void *entry_data, u32 *index_inc, bool bind, struct netlink_ext_ack *extack) @@ -451,28 +433,26 @@ static struct tc_action_ops act_mpls_ops = { .dump = tcf_mpls_dump, .init = tcf_mpls_init, .cleanup = tcf_mpls_cleanup, - .walk = tcf_mpls_walker, - .lookup = tcf_mpls_search, .offload_act_setup = tcf_mpls_offload_act_setup, .size = sizeof(struct tcf_mpls), }; static __net_init int mpls_init_net(struct net *net) { - struct tc_action_net *tn = net_generic(net, mpls_net_id); + struct tc_action_net *tn = net_generic(net, act_mpls_ops.net_id); return tc_action_net_init(net, tn, &act_mpls_ops); } static void __net_exit mpls_exit_net(struct list_head *net_list) { - tc_action_net_exit(net_list, mpls_net_id); + tc_action_net_exit(net_list, act_mpls_ops.net_id); } static struct pernet_operations mpls_net_ops = { .init = mpls_init_net, .exit_batch = mpls_exit_net, - .id = &mpls_net_id, + .id = &act_mpls_ops.net_id, .size = sizeof(struct tc_action_net), }; diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c index 2a39b3729e84..9265145f1040 100644 --- a/net/sched/act_nat.c +++ b/net/sched/act_nat.c @@ -26,7 +26,6 @@ #include <net/udp.h> -static unsigned int nat_net_id; static struct tc_action_ops act_nat_ops; static const struct nla_policy nat_policy[TCA_NAT_MAX + 1] = { @@ -37,7 +36,7 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est, struct tc_action **a, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { - struct tc_action_net *tn = net_generic(net, nat_net_id); + struct tc_action_net *tn = net_generic(net, act_nat_ops.net_id); bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_NAT_MAX + 1]; struct tcf_chain *goto_ch = NULL; @@ -289,23 +288,6 @@ nla_put_failure: return -1; } -static int tcf_nat_walker(struct net *net, struct sk_buff *skb, - struct netlink_callback *cb, int type, - const struct tc_action_ops *ops, - struct netlink_ext_ack *extack) -{ - struct tc_action_net *tn = net_generic(net, nat_net_id); - - return tcf_generic_walker(tn, skb, cb, type, ops, extack); -} - -static int tcf_nat_search(struct net *net, struct tc_action **a, u32 index) -{ - struct tc_action_net *tn = net_generic(net, nat_net_id); - - return tcf_idr_search(tn, a, index); -} - static struct tc_action_ops act_nat_ops = { .kind = "nat", .id = TCA_ID_NAT, @@ -313,27 +295,25 @@ static struct tc_action_ops act_nat_ops = { .act = tcf_nat_act, .dump = tcf_nat_dump, .init = tcf_nat_init, - .walk = tcf_nat_walker, - .lookup = tcf_nat_search, .size = sizeof(struct tcf_nat), }; static __net_init int nat_init_net(struct net *net) { - struct tc_action_net *tn = net_generic(net, nat_net_id); + struct tc_action_net *tn = net_generic(net, act_nat_ops.net_id); return tc_action_net_init(net, tn, &act_nat_ops); } static void __net_exit nat_exit_net(struct list_head *net_list) { - tc_action_net_exit(net_list, nat_net_id); + tc_action_net_exit(net_list, act_nat_ops.net_id); } static struct pernet_operations nat_net_ops = { .init = nat_init_net, .exit_batch = nat_exit_net, - .id = &nat_net_id, + .id = &act_nat_ops.net_id, .size = sizeof(struct tc_action_net), }; diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c index 823ee643371c..94ed5857ce67 100644 --- a/net/sched/act_pedit.c +++ b/net/sched/act_pedit.c @@ -21,7 +21,6 @@ #include <uapi/linux/tc_act/tc_pedit.h> #include <net/pkt_cls.h> -static unsigned int pedit_net_id; static struct tc_action_ops act_pedit_ops; static const struct nla_policy pedit_policy[TCA_PEDIT_MAX + 1] = { @@ -139,7 +138,7 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { - struct tc_action_net *tn = net_generic(net, pedit_net_id); + struct tc_action_net *tn = net_generic(net, act_pedit_ops.net_id); bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_PEDIT_MAX + 1]; struct tcf_chain *goto_ch = NULL; @@ -492,23 +491,6 @@ nla_put_failure: return -1; } -static int tcf_pedit_walker(struct net *net, struct sk_buff *skb, - struct netlink_callback *cb, int type, - const struct tc_action_ops *ops, - struct netlink_ext_ack *extack) -{ - struct tc_action_net *tn = net_generic(net, pedit_net_id); - - return tcf_generic_walker(tn, skb, cb, type, ops, extack); -} - -static int tcf_pedit_search(struct net *net, struct tc_action **a, u32 index) -{ - struct tc_action_net *tn = net_generic(net, pedit_net_id); - - return tcf_idr_search(tn, a, index); -} - static int tcf_pedit_offload_act_setup(struct tc_action *act, void *entry_data, u32 *index_inc, bool bind, struct netlink_ext_ack *extack) @@ -553,28 +535,26 @@ static struct tc_action_ops act_pedit_ops = { .dump = tcf_pedit_dump, .cleanup = tcf_pedit_cleanup, .init = tcf_pedit_init, - .walk = tcf_pedit_walker, - .lookup = tcf_pedit_search, .offload_act_setup = tcf_pedit_offload_act_setup, .size = sizeof(struct tcf_pedit), }; static __net_init int pedit_init_net(struct net *net) { - struct tc_action_net *tn = net_generic(net, pedit_net_id); + struct tc_action_net *tn = net_generic(net, act_pedit_ops.net_id); return tc_action_net_init(net, tn, &act_pedit_ops); } static void __net_exit pedit_exit_net(struct list_head *net_list) { - tc_action_net_exit(net_list, pedit_net_id); + tc_action_net_exit(net_list, act_pedit_ops.net_id); } static struct pernet_operations pedit_net_ops = { .init = pedit_init_net, .exit_batch = pedit_exit_net, - .id = &pedit_net_id, + .id = &act_pedit_ops.net_id, .size = sizeof(struct tc_action_net), }; diff --git a/net/sched/act_police.c b/net/sched/act_police.c index b759628a47c2..0adb26e366a7 100644 --- a/net/sched/act_police.c +++ b/net/sched/act_police.c @@ -22,19 +22,8 @@ /* Each policer is serialized by its individual spinlock */ -static unsigned int police_net_id; static struct tc_action_ops act_police_ops; -static int tcf_police_walker(struct net *net, struct sk_buff *skb, - struct netlink_callback *cb, int type, - const struct tc_action_ops *ops, - struct netlink_ext_ack *extack) -{ - struct tc_action_net *tn = net_generic(net, police_net_id); - - return tcf_generic_walker(tn, skb, cb, type, ops, extack); -} - static const struct nla_policy police_policy[TCA_POLICE_MAX + 1] = { [TCA_POLICE_RATE] = { .len = TC_RTAB_SIZE }, [TCA_POLICE_PEAKRATE] = { .len = TC_RTAB_SIZE }, @@ -58,7 +47,7 @@ static int tcf_police_init(struct net *net, struct nlattr *nla, struct tc_police *parm; struct tcf_police *police; struct qdisc_rate_table *R_tab = NULL, *P_tab = NULL; - struct tc_action_net *tn = net_generic(net, police_net_id); + struct tc_action_net *tn = net_generic(net, act_police_ops.net_id); struct tcf_police_params *new; bool exists = false; u32 index; @@ -412,13 +401,6 @@ nla_put_failure: return -1; } -static int tcf_police_search(struct net *net, struct tc_action **a, u32 index) -{ - struct tc_action_net *tn = net_generic(net, police_net_id); - - return tcf_idr_search(tn, a, index); -} - static int tcf_police_act_to_flow_act(int tc_act, u32 *extval, struct netlink_ext_ack *extack) { @@ -513,8 +495,6 @@ static struct tc_action_ops act_police_ops = { .act = tcf_police_act, .dump = tcf_police_dump, .init = tcf_police_init, - .walk = tcf_police_walker, - .lookup = tcf_police_search, .cleanup = tcf_police_cleanup, .offload_act_setup = tcf_police_offload_act_setup, .size = sizeof(struct tcf_police), @@ -522,20 +502,20 @@ static struct tc_action_ops act_police_ops = { static __net_init int police_init_net(struct net *net) { - struct tc_action_net *tn = net_generic(net, police_net_id); + struct tc_action_net *tn = net_generic(net, act_police_ops.net_id); return tc_action_net_init(net, tn, &act_police_ops); } static void __net_exit police_exit_net(struct list_head *net_list) { - tc_action_net_exit(net_list, police_net_id); + tc_action_net_exit(net_list, act_police_ops.net_id); } static struct pernet_operations police_net_ops = { .init = police_init_net, .exit_batch = police_exit_net, - .id = &police_net_id, + .id = &act_police_ops.net_id, .size = sizeof(struct tc_action_net), }; diff --git a/net/sched/act_sample.c b/net/sched/act_sample.c index 2f7f5e44d28c..7a25477f5d99 100644 --- a/net/sched/act_sample.c +++ b/net/sched/act_sample.c @@ -23,7 +23,6 @@ #include <linux/if_arp.h> -static unsigned int sample_net_id; static struct tc_action_ops act_sample_ops; static const struct nla_policy sample_policy[TCA_SAMPLE_MAX + 1] = { @@ -38,7 +37,7 @@ static int tcf_sample_init(struct net *net, struct nlattr *nla, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { - struct tc_action_net *tn = net_generic(net, sample_net_id); + struct tc_action_net *tn = net_generic(net, act_sample_ops.net_id); bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_SAMPLE_MAX + 1]; struct psample_group *psample_group; @@ -169,7 +168,7 @@ static int tcf_sample_act(struct sk_buff *skb, const struct tc_action *a, psample_group = rcu_dereference_bh(s->psample_group); /* randomly sample packets according to rate */ - if (psample_group && (prandom_u32() % s->rate == 0)) { + if (psample_group && (prandom_u32_max(s->rate) == 0)) { if (!skb_at_tc_ingress(skb)) { md.in_ifindex = skb->skb_iif; md.out_ifindex = skb->dev->ifindex; @@ -241,23 +240,6 @@ nla_put_failure: return -1; } -static int tcf_sample_walker(struct net *net, struct sk_buff *skb, - struct netlink_callback *cb, int type, - const struct tc_action_ops *ops, - struct netlink_ext_ack *extack) -{ - struct tc_action_net *tn = net_generic(net, sample_net_id); - - return tcf_generic_walker(tn, skb, cb, type, ops, extack); -} - -static int tcf_sample_search(struct net *net, struct tc_action **a, u32 index) -{ - struct tc_action_net *tn = net_generic(net, sample_net_id); - - return tcf_idr_search(tn, a, index); -} - static void tcf_psample_group_put(void *priv) { struct psample_group *group = priv; @@ -321,8 +303,6 @@ static struct tc_action_ops act_sample_ops = { .dump = tcf_sample_dump, .init = tcf_sample_init, .cleanup = tcf_sample_cleanup, - .walk = tcf_sample_walker, - .lookup = tcf_sample_search, .get_psample_group = tcf_sample_get_group, .offload_act_setup = tcf_sample_offload_act_setup, .size = sizeof(struct tcf_sample), @@ -330,20 +310,20 @@ static struct tc_action_ops act_sample_ops = { static __net_init int sample_init_net(struct net *net) { - struct tc_action_net *tn = net_generic(net, sample_net_id); + struct tc_action_net *tn = net_generic(net, act_sample_ops.net_id); return tc_action_net_init(net, tn, &act_sample_ops); } static void __net_exit sample_exit_net(struct list_head *net_list) { - tc_action_net_exit(net_list, sample_net_id); + tc_action_net_exit(net_list, act_sample_ops.net_id); } static struct pernet_operations sample_net_ops = { .init = sample_init_net, .exit_batch = sample_exit_net, - .id = &sample_net_id, + .id = &act_sample_ops.net_id, .size = sizeof(struct tc_action_net), }; diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c index 8c1d60bde93e..18d376135461 100644 --- a/net/sched/act_simple.c +++ b/net/sched/act_simple.c @@ -18,7 +18,6 @@ #include <linux/tc_act/tc_defact.h> #include <net/tc_act/tc_defact.h> -static unsigned int simp_net_id; static struct tc_action_ops act_simp_ops; #define SIMP_MAX_DATA 32 @@ -89,7 +88,7 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { - struct tc_action_net *tn = net_generic(net, simp_net_id); + struct tc_action_net *tn = net_generic(net, act_simp_ops.net_id); bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_DEF_MAX + 1]; struct tcf_chain *goto_ch = NULL; @@ -198,23 +197,6 @@ nla_put_failure: return -1; } -static int tcf_simp_walker(struct net *net, struct sk_buff *skb, - struct netlink_callback *cb, int type, - const struct tc_action_ops *ops, - struct netlink_ext_ack *extack) -{ - struct tc_action_net *tn = net_generic(net, simp_net_id); - - return tcf_generic_walker(tn, skb, cb, type, ops, extack); -} - -static int tcf_simp_search(struct net *net, struct tc_action **a, u32 index) -{ - struct tc_action_net *tn = net_generic(net, simp_net_id); - - return tcf_idr_search(tn, a, index); -} - static struct tc_action_ops act_simp_ops = { .kind = "simple", .id = TCA_ID_SIMP, @@ -223,27 +205,25 @@ static struct tc_action_ops act_simp_ops = { .dump = tcf_simp_dump, .cleanup = tcf_simp_release, .init = tcf_simp_init, - .walk = tcf_simp_walker, - .lookup = tcf_simp_search, .size = sizeof(struct tcf_defact), }; static __net_init int simp_init_net(struct net *net) { - struct tc_action_net *tn = net_generic(net, simp_net_id); + struct tc_action_net *tn = net_generic(net, act_simp_ops.net_id); return tc_action_net_init(net, tn, &act_simp_ops); } static void __net_exit simp_exit_net(struct list_head *net_list) { - tc_action_net_exit(net_list, simp_net_id); + tc_action_net_exit(net_list, act_simp_ops.net_id); } static struct pernet_operations simp_net_ops = { .init = simp_init_net, .exit_batch = simp_exit_net, - .id = &simp_net_id, + .id = &act_simp_ops.net_id, .size = sizeof(struct tc_action_net), }; diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c index e3bd11dfe1ca..1710780c908a 100644 --- a/net/sched/act_skbedit.c +++ b/net/sched/act_skbedit.c @@ -20,7 +20,6 @@ #include <linux/tc_act/tc_skbedit.h> #include <net/tc_act/tc_skbedit.h> -static unsigned int skbedit_net_id; static struct tc_action_ops act_skbedit_ops; static u16 tcf_skbedit_hash(struct tcf_skbedit_params *params, @@ -118,7 +117,7 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, struct tcf_proto *tp, u32 act_flags, struct netlink_ext_ack *extack) { - struct tc_action_net *tn = net_generic(net, skbedit_net_id); + struct tc_action_net *tn = net_generic(net, act_skbedit_ops.net_id); bool bind = act_flags & TCA_ACT_FLAGS_BIND; struct tcf_skbedit_params *params_new; struct nlattr *tb[TCA_SKBEDIT_MAX + 1]; @@ -149,6 +148,11 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla, } if (tb[TCA_SKBEDIT_QUEUE_MAPPING] != NULL) { + if (is_tcf_skbedit_ingress(act_flags) && + !(act_flags & TCA_ACT_FLAGS_SKIP_SW)) { + NL_SET_ERR_MSG_MOD(extack, "\"queue_mapping\" option on receive side is hardware only, use skip_sw"); + return -EOPNOTSUPP; + } flags |= SKBEDIT_F_QUEUE_MAPPING; queue_mapping = nla_data(tb[TCA_SKBEDIT_QUEUE_MAPPING]); } @@ -347,23 +351,6 @@ static void tcf_skbedit_cleanup(struct tc_action *a) kfree_rcu(params, rcu); } -static int tcf_skbedit_walker(struct net *net, struct sk_buff *skb, - struct netlink_callback *cb, int type, - const struct tc_action_ops *ops, - struct netlink_ext_ack *extack) -{ - struct tc_action_net *tn = net_generic(net, skbedit_net_id); - - return tcf_generic_walker(tn, skb, cb, type, ops, extack); -} - -static int tcf_skbedit_search(struct net *net, struct tc_action **a, u32 index) -{ - struct tc_action_net *tn = net_generic(net, skbedit_net_id); - - return tcf_idr_search(tn, a, index); -} - static size_t tcf_skbedit_get_fill_size(const struct tc_action *act) { return nla_total_size(sizeof(struct tc_skbedit)) @@ -392,9 +379,12 @@ static int tcf_skbedit_offload_act_setup(struct tc_action *act, void *entry_data } else if (is_tcf_skbedit_priority(act)) { entry->id = FLOW_ACTION_PRIORITY; entry->priority = tcf_skbedit_priority(act); - } else if (is_tcf_skbedit_queue_mapping(act)) { - NL_SET_ERR_MSG_MOD(extack, "Offload not supported when \"queue_mapping\" option is used"); + } else if (is_tcf_skbedit_tx_queue_mapping(act)) { + NL_SET_ERR_MSG_MOD(extack, "Offload not supported when \"queue_mapping\" option is used on transmit side"); return -EOPNOTSUPP; + } else if (is_tcf_skbedit_rx_queue_mapping(act)) { + entry->id = FLOW_ACTION_RX_QUEUE_MAPPING; + entry->rx_queue = tcf_skbedit_rx_queue_mapping(act); } else if (is_tcf_skbedit_inheritdsfield(act)) { NL_SET_ERR_MSG_MOD(extack, "Offload not supported when \"inheritdsfield\" option is used"); return -EOPNOTSUPP; @@ -412,6 +402,8 @@ static int tcf_skbedit_offload_act_setup(struct tc_action *act, void *entry_data fl_action->id = FLOW_ACTION_PTYPE; else if (is_tcf_skbedit_priority(act)) fl_action->id = FLOW_ACTION_PRIORITY; + else if (is_tcf_skbedit_rx_queue_mapping(act)) + fl_action->id = FLOW_ACTION_RX_QUEUE_MAPPING; else return -EOPNOTSUPP; } @@ -428,29 +420,27 @@ static struct tc_action_ops act_skbedit_ops = { .dump = tcf_skbedit_dump, .init = tcf_skbedit_init, .cleanup = tcf_skbedit_cleanup, - .walk = tcf_skbedit_walker, .get_fill_size = tcf_skbedit_get_fill_size, - .lookup = tcf_skbedit_search, .offload_act_setup = tcf_skbedit_offload_act_setup, .size = sizeof(struct tcf_skbedit), }; static __net_init int skbedit_init_net(struct net *net) { - struct tc_action_net *tn = net_generic(net, skbedit_net_id); + struct tc_action_net *tn = net_generic(net, act_skbedit_ops.net_id); return tc_action_net_init(net, tn, &act_skbedit_ops); } static void __net_exit skbedit_exit_net(struct list_head *net_list) { - tc_action_net_exit(net_list, skbedit_net_id); + tc_action_net_exit(net_list, act_skbedit_ops.net_id); } static struct pernet_operations skbedit_net_ops = { .init = skbedit_init_net, .exit_batch = skbedit_exit_net, - .id = &skbedit_net_id, + .id = &act_skbedit_ops.net_id, .size = sizeof(struct tc_action_net), }; diff --git a/net/sched/act_skbmod.c b/net/sched/act_skbmod.c index 2083612d8780..d98758a63934 100644 --- a/net/sched/act_skbmod.c +++ b/net/sched/act_skbmod.c @@ -19,7 +19,6 @@ #include <linux/tc_act/tc_skbmod.h> #include <net/tc_act/tc_skbmod.h> -static unsigned int skbmod_net_id; static struct tc_action_ops act_skbmod_ops; static int tcf_skbmod_act(struct sk_buff *skb, const struct tc_action *a, @@ -103,7 +102,7 @@ static int tcf_skbmod_init(struct net *net, struct nlattr *nla, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { - struct tc_action_net *tn = net_generic(net, skbmod_net_id); + struct tc_action_net *tn = net_generic(net, act_skbmod_ops.net_id); bool ovr = flags & TCA_ACT_FLAGS_REPLACE; bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_SKBMOD_MAX + 1]; @@ -276,23 +275,6 @@ nla_put_failure: return -1; } -static int tcf_skbmod_walker(struct net *net, struct sk_buff *skb, - struct netlink_callback *cb, int type, - const struct tc_action_ops *ops, - struct netlink_ext_ack *extack) -{ - struct tc_action_net *tn = net_generic(net, skbmod_net_id); - - return tcf_generic_walker(tn, skb, cb, type, ops, extack); -} - -static int tcf_skbmod_search(struct net *net, struct tc_action **a, u32 index) -{ - struct tc_action_net *tn = net_generic(net, skbmod_net_id); - - return tcf_idr_search(tn, a, index); -} - static struct tc_action_ops act_skbmod_ops = { .kind = "skbmod", .id = TCA_ACT_SKBMOD, @@ -301,27 +283,25 @@ static struct tc_action_ops act_skbmod_ops = { .dump = tcf_skbmod_dump, .init = tcf_skbmod_init, .cleanup = tcf_skbmod_cleanup, - .walk = tcf_skbmod_walker, - .lookup = tcf_skbmod_search, .size = sizeof(struct tcf_skbmod), }; static __net_init int skbmod_init_net(struct net *net) { - struct tc_action_net *tn = net_generic(net, skbmod_net_id); + struct tc_action_net *tn = net_generic(net, act_skbmod_ops.net_id); return tc_action_net_init(net, tn, &act_skbmod_ops); } static void __net_exit skbmod_exit_net(struct list_head *net_list) { - tc_action_net_exit(net_list, skbmod_net_id); + tc_action_net_exit(net_list, act_skbmod_ops.net_id); } static struct pernet_operations skbmod_net_ops = { .init = skbmod_init_net, .exit_batch = skbmod_exit_net, - .id = &skbmod_net_id, + .id = &act_skbmod_ops.net_id, .size = sizeof(struct tc_action_net), }; diff --git a/net/sched/act_tunnel_key.c b/net/sched/act_tunnel_key.c index 856dc23cef8c..2691a3d8e451 100644 --- a/net/sched/act_tunnel_key.c +++ b/net/sched/act_tunnel_key.c @@ -20,7 +20,6 @@ #include <linux/tc_act/tc_tunnel_key.h> #include <net/tc_act/tc_tunnel_key.h> -static unsigned int tunnel_key_net_id; static struct tc_action_ops act_tunnel_key_ops; static int tunnel_key_act(struct sk_buff *skb, const struct tc_action *a, @@ -358,7 +357,7 @@ static int tunnel_key_init(struct net *net, struct nlattr *nla, struct tcf_proto *tp, u32 act_flags, struct netlink_ext_ack *extack) { - struct tc_action_net *tn = net_generic(net, tunnel_key_net_id); + struct tc_action_net *tn = net_generic(net, act_tunnel_key_ops.net_id); bool bind = act_flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_TUNNEL_KEY_MAX + 1]; struct tcf_tunnel_key_params *params_new; @@ -770,23 +769,6 @@ nla_put_failure: return -1; } -static int tunnel_key_walker(struct net *net, struct sk_buff *skb, - struct netlink_callback *cb, int type, - const struct tc_action_ops *ops, - struct netlink_ext_ack *extack) -{ - struct tc_action_net *tn = net_generic(net, tunnel_key_net_id); - - return tcf_generic_walker(tn, skb, cb, type, ops, extack); -} - -static int tunnel_key_search(struct net *net, struct tc_action **a, u32 index) -{ - struct tc_action_net *tn = net_generic(net, tunnel_key_net_id); - - return tcf_idr_search(tn, a, index); -} - static void tcf_tunnel_encap_put_tunnel(void *priv) { struct ip_tunnel_info *tunnel = priv; @@ -850,28 +832,26 @@ static struct tc_action_ops act_tunnel_key_ops = { .dump = tunnel_key_dump, .init = tunnel_key_init, .cleanup = tunnel_key_release, - .walk = tunnel_key_walker, - .lookup = tunnel_key_search, .offload_act_setup = tcf_tunnel_key_offload_act_setup, .size = sizeof(struct tcf_tunnel_key), }; static __net_init int tunnel_key_init_net(struct net *net) { - struct tc_action_net *tn = net_generic(net, tunnel_key_net_id); + struct tc_action_net *tn = net_generic(net, act_tunnel_key_ops.net_id); return tc_action_net_init(net, tn, &act_tunnel_key_ops); } static void __net_exit tunnel_key_exit_net(struct list_head *net_list) { - tc_action_net_exit(net_list, tunnel_key_net_id); + tc_action_net_exit(net_list, act_tunnel_key_ops.net_id); } static struct pernet_operations tunnel_key_net_ops = { .init = tunnel_key_init_net, .exit_batch = tunnel_key_exit_net, - .id = &tunnel_key_net_id, + .id = &act_tunnel_key_ops.net_id, .size = sizeof(struct tc_action_net), }; diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c index 68b5e772386a..7b24e898a3e6 100644 --- a/net/sched/act_vlan.c +++ b/net/sched/act_vlan.c @@ -16,7 +16,6 @@ #include <linux/tc_act/tc_vlan.h> #include <net/tc_act/tc_vlan.h> -static unsigned int vlan_net_id; static struct tc_action_ops act_vlan_ops; static int tcf_vlan_act(struct sk_buff *skb, const struct tc_action *a, @@ -117,7 +116,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla, struct tcf_proto *tp, u32 flags, struct netlink_ext_ack *extack) { - struct tc_action_net *tn = net_generic(net, vlan_net_id); + struct tc_action_net *tn = net_generic(net, act_vlan_ops.net_id); bool bind = flags & TCA_ACT_FLAGS_BIND; struct nlattr *tb[TCA_VLAN_MAX + 1]; struct tcf_chain *goto_ch = NULL; @@ -333,16 +332,6 @@ nla_put_failure: return -1; } -static int tcf_vlan_walker(struct net *net, struct sk_buff *skb, - struct netlink_callback *cb, int type, - const struct tc_action_ops *ops, - struct netlink_ext_ack *extack) -{ - struct tc_action_net *tn = net_generic(net, vlan_net_id); - - return tcf_generic_walker(tn, skb, cb, type, ops, extack); -} - static void tcf_vlan_stats_update(struct tc_action *a, u64 bytes, u64 packets, u64 drops, u64 lastuse, bool hw) { @@ -353,13 +342,6 @@ static void tcf_vlan_stats_update(struct tc_action *a, u64 bytes, u64 packets, tm->lastuse = max_t(u64, tm->lastuse, lastuse); } -static int tcf_vlan_search(struct net *net, struct tc_action **a, u32 index) -{ - struct tc_action_net *tn = net_generic(net, vlan_net_id); - - return tcf_idr_search(tn, a, index); -} - static size_t tcf_vlan_get_fill_size(const struct tc_action *act) { return nla_total_size(sizeof(struct tc_vlan)) @@ -438,30 +420,28 @@ static struct tc_action_ops act_vlan_ops = { .dump = tcf_vlan_dump, .init = tcf_vlan_init, .cleanup = tcf_vlan_cleanup, - .walk = tcf_vlan_walker, .stats_update = tcf_vlan_stats_update, .get_fill_size = tcf_vlan_get_fill_size, - .lookup = tcf_vlan_search, .offload_act_setup = tcf_vlan_offload_act_setup, .size = sizeof(struct tcf_vlan), }; static __net_init int vlan_init_net(struct net *net) { - struct tc_action_net *tn = net_generic(net, vlan_net_id); + struct tc_action_net *tn = net_generic(net, act_vlan_ops.net_id); return tc_action_net_init(net, tn, &act_vlan_ops); } static void __net_exit vlan_exit_net(struct list_head *net_list) { - tc_action_net_exit(net_list, vlan_net_id); + tc_action_net_exit(net_list, act_vlan_ops.net_id); } static struct pernet_operations vlan_net_ops = { .init = vlan_init_net, .exit_batch = vlan_exit_net, - .id = &vlan_net_id, + .id = &act_vlan_ops.net_id, .size = sizeof(struct tc_action_net), }; diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c index 5d0d57dc5cb7..23d1cfa4f58c 100644 --- a/net/sched/cls_api.c +++ b/net/sched/cls_api.c @@ -1953,6 +1953,11 @@ static void tfilter_put(struct tcf_proto *tp, void *fh) tp->ops->put(tp, fh); } +static bool is_qdisc_ingress(__u32 classid) +{ + return (TC_H_MIN(classid) == TC_H_MIN(TC_H_MIN_INGRESS)); +} + static int tc_new_tfilter(struct sk_buff *skb, struct nlmsghdr *n, struct netlink_ext_ack *extack) { @@ -2134,6 +2139,7 @@ replay: } if (chain->tmplt_ops && chain->tmplt_ops != tp->ops) { + tfilter_put(tp, fh); NL_SET_ERR_MSG(extack, "Chain template is set to a different filter kind"); err = -EINVAL; goto errout; @@ -2143,6 +2149,8 @@ replay: flags |= TCA_ACT_FLAGS_REPLACE; if (!rtnl_held) flags |= TCA_ACT_FLAGS_NO_RTNL; + if (is_qdisc_ingress(parent)) + flags |= TCA_ACT_FLAGS_AT_INGRESS; err = tp->ops->change(net, skb, tp, cl, t->tcm_handle, tca, &fh, flags, extack); if (err == 0) { diff --git a/net/sched/cls_basic.c b/net/sched/cls_basic.c index 8158fc9ee1ab..d229ce99e554 100644 --- a/net/sched/cls_basic.c +++ b/net/sched/cls_basic.c @@ -251,15 +251,8 @@ static void basic_walk(struct tcf_proto *tp, struct tcf_walker *arg, struct basic_filter *f; list_for_each_entry(f, &head->flist, link) { - if (arg->count < arg->skip) - goto skip; - - if (arg->fn(tp, f, arg) < 0) { - arg->stop = 1; + if (!tc_cls_stats_dump(tp, arg, f)) break; - } -skip: - arg->count++; } } @@ -268,12 +261,7 @@ static void basic_bind_class(void *fh, u32 classid, unsigned long cl, void *q, { struct basic_filter *f = fh; - if (f && f->res.classid == classid) { - if (cl) - __tcf_bind_filter(q, &f->res, base); - else - __tcf_unbind_filter(q, &f->res); - } + tc_cls_bind_class(classid, cl, q, &f->res, base); } static int basic_dump(struct net *net, struct tcf_proto *tp, void *fh, diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c index c85b85a192bf..bc317b3eac12 100644 --- a/net/sched/cls_bpf.c +++ b/net/sched/cls_bpf.c @@ -635,12 +635,7 @@ static void cls_bpf_bind_class(void *fh, u32 classid, unsigned long cl, { struct cls_bpf_prog *prog = fh; - if (prog && prog->res.classid == classid) { - if (cl) - __tcf_bind_filter(q, &prog->res, base); - else - __tcf_unbind_filter(q, &prog->res); - } + tc_cls_bind_class(classid, cl, q, &prog->res, base); } static void cls_bpf_walk(struct tcf_proto *tp, struct tcf_walker *arg, @@ -650,14 +645,8 @@ static void cls_bpf_walk(struct tcf_proto *tp, struct tcf_walker *arg, struct cls_bpf_prog *prog; list_for_each_entry(prog, &head->plist, link) { - if (arg->count < arg->skip) - goto skip; - if (arg->fn(tp, prog, arg) < 0) { - arg->stop = 1; + if (!tc_cls_stats_dump(tp, arg, prog)) break; - } -skip: - arg->count++; } } diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c index 972303aa8edd..014cd3de7b5d 100644 --- a/net/sched/cls_flow.c +++ b/net/sched/cls_flow.c @@ -683,14 +683,8 @@ static void flow_walk(struct tcf_proto *tp, struct tcf_walker *arg, struct flow_filter *f; list_for_each_entry(f, &head->filters, list) { - if (arg->count < arg->skip) - goto skip; - if (arg->fn(tp, f, arg) < 0) { - arg->stop = 1; + if (!tc_cls_stats_dump(tp, arg, f)) break; - } -skip: - arg->count++; } } diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 041d63ff809a..25bc57ee6ea1 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -69,6 +69,7 @@ struct fl_flow_key { struct flow_dissector_key_hash hash; struct flow_dissector_key_num_of_vlans num_of_vlans; struct flow_dissector_key_pppoe pppoe; + struct flow_dissector_key_l2tpv3 l2tpv3; } __aligned(BITS_PER_LONG / 8); /* Ensure that we can do comparisons as longs. */ struct fl_flow_mask_range { @@ -712,6 +713,7 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 1] = { [TCA_FLOWER_KEY_NUM_OF_VLANS] = { .type = NLA_U8 }, [TCA_FLOWER_KEY_PPPOE_SID] = { .type = NLA_U16 }, [TCA_FLOWER_KEY_PPP_PROTO] = { .type = NLA_U16 }, + [TCA_FLOWER_KEY_L2TPV3_SID] = { .type = NLA_U32 }, }; @@ -1790,6 +1792,11 @@ static int fl_set_key(struct net *net, struct nlattr **tb, fl_set_key_val(tb, key->arp.tha, TCA_FLOWER_KEY_ARP_THA, mask->arp.tha, TCA_FLOWER_KEY_ARP_THA_MASK, sizeof(key->arp.tha)); + } else if (key->basic.ip_proto == IPPROTO_L2TP) { + fl_set_key_val(tb, &key->l2tpv3.session_id, + TCA_FLOWER_KEY_L2TPV3_SID, + &mask->l2tpv3.session_id, TCA_FLOWER_UNSPEC, + sizeof(key->l2tpv3.session_id)); } if (key->basic.ip_proto == IPPROTO_TCP || @@ -1970,6 +1977,8 @@ static void fl_init_dissector(struct flow_dissector *dissector, FLOW_DISSECTOR_KEY_NUM_OF_VLANS, num_of_vlans); FL_KEY_SET_IF_MASKED(mask, keys, cnt, FLOW_DISSECTOR_KEY_PPPOE, pppoe); + FL_KEY_SET_IF_MASKED(mask, keys, cnt, + FLOW_DISSECTOR_KEY_L2TPV3, l2tpv3); skb_flow_dissector_init(dissector, keys, cnt); } @@ -3196,6 +3205,13 @@ static int fl_dump_key(struct sk_buff *skb, struct net *net, mask->arp.tha, TCA_FLOWER_KEY_ARP_THA_MASK, sizeof(key->arp.tha)))) goto nla_put_failure; + else if (key->basic.ip_proto == IPPROTO_L2TP && + fl_dump_key_val(skb, &key->l2tpv3.session_id, + TCA_FLOWER_KEY_L2TPV3_SID, + &mask->l2tpv3.session_id, + TCA_FLOWER_UNSPEC, + sizeof(key->l2tpv3.session_id))) + goto nla_put_failure; if ((key->basic.ip_proto == IPPROTO_TCP || key->basic.ip_proto == IPPROTO_UDP || @@ -3389,12 +3405,7 @@ static void fl_bind_class(void *fh, u32 classid, unsigned long cl, void *q, { struct cls_fl_filter *f = fh; - if (f && f->res.classid == classid) { - if (cl) - __tcf_bind_filter(q, &f->res, base); - else - __tcf_unbind_filter(q, &f->res); - } + tc_cls_bind_class(classid, cl, q, &f->res, base); } static bool fl_delete_empty(struct tcf_proto *tp) diff --git a/net/sched/cls_fw.c b/net/sched/cls_fw.c index 8654b0ce997c..a32351da968c 100644 --- a/net/sched/cls_fw.c +++ b/net/sched/cls_fw.c @@ -358,15 +358,8 @@ static void fw_walk(struct tcf_proto *tp, struct tcf_walker *arg, for (f = rtnl_dereference(head->ht[h]); f; f = rtnl_dereference(f->next)) { - if (arg->count < arg->skip) { - arg->count++; - continue; - } - if (arg->fn(tp, f, arg) < 0) { - arg->stop = 1; + if (!tc_cls_stats_dump(tp, arg, f)) return; - } - arg->count++; } } } @@ -423,12 +416,7 @@ static void fw_bind_class(void *fh, u32 classid, unsigned long cl, void *q, { struct fw_filter *f = fh; - if (f && f->res.classid == classid) { - if (cl) - __tcf_bind_filter(q, &f->res, base); - else - __tcf_unbind_filter(q, &f->res); - } + tc_cls_bind_class(classid, cl, q, &f->res, base); } static struct tcf_proto_ops cls_fw_ops __read_mostly = { diff --git a/net/sched/cls_matchall.c b/net/sched/cls_matchall.c index 06cf22adbab7..39a5d9c170de 100644 --- a/net/sched/cls_matchall.c +++ b/net/sched/cls_matchall.c @@ -313,10 +313,7 @@ static int mall_reoffload(struct tcf_proto *tp, bool add, flow_setup_cb_t *cb, tc_cleanup_offload_action(&cls_mall.rule->action); kfree(cls_mall.rule); - if (err) - return err; - - return 0; + return err; } static void mall_stats_hw_filter(struct tcf_proto *tp, @@ -397,12 +394,7 @@ static void mall_bind_class(void *fh, u32 classid, unsigned long cl, void *q, { struct cls_mall_head *head = fh; - if (head && head->res.classid == classid) { - if (cl) - __tcf_bind_filter(q, &head->res, base); - else - __tcf_unbind_filter(q, &head->res); - } + tc_cls_bind_class(classid, cl, q, &head->res, base); } static struct tcf_proto_ops cls_mall_ops __read_mostly = { diff --git a/net/sched/cls_route.c b/net/sched/cls_route.c index 29adac7812fe..9e43b929d4ca 100644 --- a/net/sched/cls_route.c +++ b/net/sched/cls_route.c @@ -587,15 +587,8 @@ static void route4_walk(struct tcf_proto *tp, struct tcf_walker *arg, for (f = rtnl_dereference(b->ht[h1]); f; f = rtnl_dereference(f->next)) { - if (arg->count < arg->skip) { - arg->count++; - continue; - } - if (arg->fn(tp, f, arg) < 0) { - arg->stop = 1; + if (!tc_cls_stats_dump(tp, arg, f)) return; - } - arg->count++; } } } @@ -656,12 +649,7 @@ static void route4_bind_class(void *fh, u32 classid, unsigned long cl, void *q, { struct route4_filter *f = fh; - if (f && f->res.classid == classid) { - if (cl) - __tcf_bind_filter(q, &f->res, base); - else - __tcf_unbind_filter(q, &f->res); - } + tc_cls_bind_class(classid, cl, q, &f->res, base); } static struct tcf_proto_ops cls_route4_ops __read_mostly = { diff --git a/net/sched/cls_rsvp.h b/net/sched/cls_rsvp.h index 5cd9d6b143c4..b00a7dbd0587 100644 --- a/net/sched/cls_rsvp.h +++ b/net/sched/cls_rsvp.h @@ -671,15 +671,8 @@ static void rsvp_walk(struct tcf_proto *tp, struct tcf_walker *arg, for (f = rtnl_dereference(s->ht[h1]); f; f = rtnl_dereference(f->next)) { - if (arg->count < arg->skip) { - arg->count++; - continue; - } - if (arg->fn(tp, f, arg) < 0) { - arg->stop = 1; + if (!tc_cls_stats_dump(tp, arg, f)) return; - } - arg->count++; } } } @@ -740,12 +733,7 @@ static void rsvp_bind_class(void *fh, u32 classid, unsigned long cl, void *q, { struct rsvp_filter *f = fh; - if (f && f->res.classid == classid) { - if (cl) - __tcf_bind_filter(q, &f->res, base); - else - __tcf_unbind_filter(q, &f->res); - } + tc_cls_bind_class(classid, cl, q, &f->res, base); } static struct tcf_proto_ops RSVP_OPS __read_mostly = { diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c index 742c7d49a958..1c9eeb98d826 100644 --- a/net/sched/cls_tcindex.c +++ b/net/sched/cls_tcindex.c @@ -566,13 +566,8 @@ static void tcindex_walk(struct tcf_proto *tp, struct tcf_walker *walker, for (i = 0; i < p->hash; i++) { if (!p->perfect[i].res.class) continue; - if (walker->count >= walker->skip) { - if (walker->fn(tp, p->perfect + i, walker) < 0) { - walker->stop = 1; - return; - } - } - walker->count++; + if (!tc_cls_stats_dump(tp, walker, p->perfect + i)) + return; } } if (!p->h) @@ -580,13 +575,8 @@ static void tcindex_walk(struct tcf_proto *tp, struct tcf_walker *walker, for (i = 0; i < p->hash; i++) { for (f = rtnl_dereference(p->h[i]); f; f = next) { next = rtnl_dereference(f->next); - if (walker->count >= walker->skip) { - if (walker->fn(tp, &f->result, walker) < 0) { - walker->stop = 1; - return; - } - } - walker->count++; + if (!tc_cls_stats_dump(tp, walker, &f->result)) + return; } } } @@ -701,12 +691,7 @@ static void tcindex_bind_class(void *fh, u32 classid, unsigned long cl, { struct tcindex_filter_result *r = fh; - if (r && r->res.classid == classid) { - if (cl) - __tcf_bind_filter(q, &r->res, base); - else - __tcf_unbind_filter(q, &r->res); - } + tc_cls_bind_class(classid, cl, q, &r->res, base); } static struct tcf_proto_ops cls_tcindex_ops __read_mostly = { diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c index 4d27300c287c..34d25f7a0687 100644 --- a/net/sched/cls_u32.c +++ b/net/sched/cls_u32.c @@ -1040,7 +1040,11 @@ static int u32_change(struct net *net, struct sk_buff *in_skb, } #endif - memcpy(&n->sel, s, sel_size); + unsafe_memcpy(&n->sel, s, sel_size, + /* A composite flex-array structure destination, + * which was correctly sized with struct_size(), + * bounds-checked against nla_len(), and allocated + * above. */); RCU_INIT_POINTER(n->ht_up, ht); n->handle = handle; n->fshift = s->hmask ? ffs(ntohl(s->hmask)) - 1 : 0; @@ -1125,26 +1129,16 @@ static void u32_walk(struct tcf_proto *tp, struct tcf_walker *arg, ht = rtnl_dereference(ht->next)) { if (ht->prio != tp->prio) continue; - if (arg->count >= arg->skip) { - if (arg->fn(tp, ht, arg) < 0) { - arg->stop = 1; - return; - } - } - arg->count++; + + if (!tc_cls_stats_dump(tp, arg, ht)) + return; + for (h = 0; h <= ht->divisor; h++) { for (n = rtnl_dereference(ht->ht[h]); n; n = rtnl_dereference(n->next)) { - if (arg->count < arg->skip) { - arg->count++; - continue; - } - if (arg->fn(tp, n, arg) < 0) { - arg->stop = 1; + if (!tc_cls_stats_dump(tp, arg, n)) return; - } - arg->count++; } } } @@ -1256,12 +1250,7 @@ static void u32_bind_class(void *fh, u32 classid, unsigned long cl, void *q, { struct tc_u_knode *n = fh; - if (n && n->res.classid == classid) { - if (cl) - __tcf_bind_filter(q, &n->res, base); - else - __tcf_unbind_filter(q, &n->res); - } + tc_cls_bind_class(classid, cl, q, &n->res, base); } static int u32_dump(struct net *net, struct tcf_proto *tp, void *fh, diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index db1569fac57c..4a27dfb1ba0f 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -868,6 +868,23 @@ void qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch, } EXPORT_SYMBOL(qdisc_offload_graft_helper); +void qdisc_offload_query_caps(struct net_device *dev, + enum tc_setup_type type, + void *caps, size_t caps_len) +{ + const struct net_device_ops *ops = dev->netdev_ops; + struct tc_query_caps_base base = { + .type = type, + .caps = caps, + }; + + memset(caps, 0, caps_len); + + if (ops->ndo_setup_tc) + ops->ndo_setup_tc(dev, TC_QUERY_CAPS, &base); +} +EXPORT_SYMBOL(qdisc_offload_query_caps); + static void qdisc_offload_graft_root(struct net_device *dev, struct Qdisc *new, struct Qdisc *old, struct netlink_ext_ack *extack) @@ -1082,12 +1099,13 @@ static int qdisc_graft(struct net_device *dev, struct Qdisc *parent, skip: if (!ingress) { - notify_and_destroy(net, skb, n, classid, - rtnl_dereference(dev->qdisc), new); + old = rtnl_dereference(dev->qdisc); if (new && !new->ops->attach) qdisc_refcount_inc(new); rcu_assign_pointer(dev->qdisc, new ? : &noop_qdisc); + notify_and_destroy(net, skb, n, classid, old, new); + if (new && new->ops->attach) new->ops->attach(new); } else { @@ -1898,7 +1916,7 @@ static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg) { struct tcf_bind_args *a = (void *)arg; - if (tp->ops->bind_class) { + if (n && tp->ops->bind_class) { struct Qdisc *q = tcf_block_q(tp->chain->block); sch_tree_lock(q); diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c index 816fd0d7ba38..f52255fea652 100644 --- a/net/sched/sch_atm.c +++ b/net/sched/sch_atm.c @@ -354,12 +354,8 @@ static void atm_tc_walk(struct Qdisc *sch, struct qdisc_walker *walker) if (walker->stop) return; list_for_each_entry(flow, &p->flows, list) { - if (walker->count >= walker->skip && - walker->fn(sch, (unsigned long)flow, walker) < 0) { - walker->stop = 1; + if (!tc_qdisc_stats_dump(sch, (unsigned long)flow, walker)) break; - } - walker->count++; } } diff --git a/net/sched/sch_cake.c b/net/sched/sch_cake.c index 36acc95d611e..3ed0c3342189 100644 --- a/net/sched/sch_cake.c +++ b/net/sched/sch_cake.c @@ -573,7 +573,7 @@ static bool cobalt_should_drop(struct cobalt_vars *vars, /* Simple BLUE implementation. Lack of ECN is deliberate. */ if (vars->p_drop) - drop |= (prandom_u32() < vars->p_drop); + drop |= (get_random_u32() < vars->p_drop); /* Overload the drop_next field as an activity timeout */ if (!vars->count) @@ -2092,11 +2092,11 @@ retry: WARN_ON(host_load > CAKE_QUEUES); - /* The shifted prandom_u32() is a way to apply dithering to - * avoid accumulating roundoff errors + /* The get_random_u16() is a way to apply dithering to avoid + * accumulating roundoff errors */ flow->deficit += (b->flow_quantum * quantum_div[host_load] + - (prandom_u32() >> 16)) >> 16; + get_random_u16()) >> 16; list_move_tail(&flow->flowchain, &b->old_flows); goto retry; @@ -2224,8 +2224,12 @@ retry: static void cake_reset(struct Qdisc *sch) { + struct cake_sched_data *q = qdisc_priv(sch); u32 c; + if (!q->tins) + return; + for (c = 0; c < CAKE_MAX_TINS; c++) cake_clear_tin(sch, c); } @@ -3061,16 +3065,13 @@ static void cake_walk(struct Qdisc *sch, struct qdisc_walker *arg) struct cake_tin_data *b = &q->tins[q->tin_order[i]]; for (j = 0; j < CAKE_QUEUES; j++) { - if (list_empty(&b->flows[j].flowchain) || - arg->count < arg->skip) { + if (list_empty(&b->flows[j].flowchain)) { arg->count++; continue; } - if (arg->fn(sch, i * CAKE_QUEUES + j + 1, arg) < 0) { - arg->stop = 1; + if (!tc_qdisc_stats_dump(sch, i * CAKE_QUEUES + j + 1, + arg)) break; - } - arg->count++; } } } diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c index ba99ce05cd52..6568e17c4c63 100644 --- a/net/sched/sch_cbq.c +++ b/net/sched/sch_cbq.c @@ -1676,15 +1676,8 @@ static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg) for (h = 0; h < q->clhash.hashsize; h++) { hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) { - if (arg->count < arg->skip) { - arg->count++; - continue; - } - if (arg->fn(sch, (unsigned long)cl, arg) < 0) { - arg->stop = 1; + if (!tc_qdisc_stats_dump(sch, (unsigned long)cl, arg)) return; - } - arg->count++; } } } diff --git a/net/sched/sch_cbs.c b/net/sched/sch_cbs.c index 459cc240eda9..cac870eb7897 100644 --- a/net/sched/sch_cbs.c +++ b/net/sched/sch_cbs.c @@ -520,13 +520,7 @@ static unsigned long cbs_find(struct Qdisc *sch, u32 classid) static void cbs_walk(struct Qdisc *sch, struct qdisc_walker *walker) { if (!walker->stop) { - if (walker->count >= walker->skip) { - if (walker->fn(sch, 1, walker) < 0) { - walker->stop = 1; - return; - } - } - walker->count++; + tc_qdisc_stats_dump(sch, 1, walker); } } diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c index 4e5b1cf11b85..e35a4e90f4e6 100644 --- a/net/sched/sch_drr.c +++ b/net/sched/sch_drr.c @@ -284,15 +284,8 @@ static void drr_walk(struct Qdisc *sch, struct qdisc_walker *arg) for (i = 0; i < q->clhash.hashsize; i++) { hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) { - if (arg->count < arg->skip) { - arg->count++; - continue; - } - if (arg->fn(sch, (unsigned long)cl, arg) < 0) { - arg->stop = 1; + if (!tc_qdisc_stats_dump(sch, (unsigned long)cl, arg)) return; - } - arg->count++; } } } diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c index 7da6dc38a382..401ffaf87d62 100644 --- a/net/sched/sch_dsmark.c +++ b/net/sched/sch_dsmark.c @@ -176,16 +176,12 @@ static void dsmark_walk(struct Qdisc *sch, struct qdisc_walker *walker) return; for (i = 0; i < p->indices; i++) { - if (p->mv[i].mask == 0xff && !p->mv[i].value) - goto ignore; - if (walker->count >= walker->skip) { - if (walker->fn(sch, i + 1, walker) < 0) { - walker->stop = 1; - break; - } + if (p->mv[i].mask == 0xff && !p->mv[i].value) { + walker->count++; + continue; } -ignore: - walker->count++; + if (!tc_qdisc_stats_dump(sch, i + 1, walker)) + break; } } diff --git a/net/sched/sch_ets.c b/net/sched/sch_ets.c index a3aea22ef09d..b10efeaf0629 100644 --- a/net/sched/sch_ets.c +++ b/net/sched/sch_ets.c @@ -341,15 +341,8 @@ static void ets_qdisc_walk(struct Qdisc *sch, struct qdisc_walker *arg) return; for (i = 0; i < q->nbands; i++) { - if (arg->count < arg->skip) { - arg->count++; - continue; - } - if (arg->fn(sch, i + 1, arg) < 0) { - arg->stop = 1; + if (!tc_qdisc_stats_dump(sch, i + 1, arg)) break; - } - arg->count++; } } diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c index eeea8c6d54e2..8c4fee063436 100644 --- a/net/sched/sch_fq_codel.c +++ b/net/sched/sch_fq_codel.c @@ -478,24 +478,26 @@ static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt, if (opt) { err = fq_codel_change(sch, opt, extack); if (err) - return err; + goto init_failure; } err = tcf_block_get(&q->block, &q->filter_list, sch, extack); if (err) - return err; + goto init_failure; if (!q->flows) { q->flows = kvcalloc(q->flows_cnt, sizeof(struct fq_codel_flow), GFP_KERNEL); - if (!q->flows) - return -ENOMEM; - + if (!q->flows) { + err = -ENOMEM; + goto init_failure; + } q->backlogs = kvcalloc(q->flows_cnt, sizeof(u32), GFP_KERNEL); - if (!q->backlogs) - return -ENOMEM; - + if (!q->backlogs) { + err = -ENOMEM; + goto alloc_failure; + } for (i = 0; i < q->flows_cnt; i++) { struct fq_codel_flow *flow = q->flows + i; @@ -508,6 +510,13 @@ static int fq_codel_init(struct Qdisc *sch, struct nlattr *opt, else sch->flags &= ~TCQ_F_CAN_BYPASS; return 0; + +alloc_failure: + kvfree(q->flows); + q->flows = NULL; +init_failure: + q->flows_cnt = 0; + return err; } static int fq_codel_dump(struct Qdisc *sch, struct sk_buff *skb) @@ -673,16 +682,12 @@ static void fq_codel_walk(struct Qdisc *sch, struct qdisc_walker *arg) return; for (i = 0; i < q->flows_cnt; i++) { - if (list_empty(&q->flows[i].flowchain) || - arg->count < arg->skip) { + if (list_empty(&q->flows[i].flowchain)) { arg->count++; continue; } - if (arg->fn(sch, i + 1, arg) < 0) { - arg->stop = 1; + if (!tc_qdisc_stats_dump(sch, i + 1, arg)) break; - } - arg->count++; } } diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c index c8bef923c79c..70b0c5873d32 100644 --- a/net/sched/sch_hfsc.c +++ b/net/sched/sch_hfsc.c @@ -1349,15 +1349,8 @@ hfsc_walk(struct Qdisc *sch, struct qdisc_walker *arg) for (i = 0; i < q->clhash.hashsize; i++) { hlist_for_each_entry(cl, &q->clhash.hash[i], cl_common.hnode) { - if (arg->count < arg->skip) { - arg->count++; - continue; - } - if (arg->fn(sch, (unsigned long)cl, arg) < 0) { - arg->stop = 1; + if (!tc_qdisc_stats_dump(sch, (unsigned long)cl, arg)) return; - } - arg->count++; } } } diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 78d0c7549c74..e5b4bbf3ce3d 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -2119,15 +2119,8 @@ static void htb_walk(struct Qdisc *sch, struct qdisc_walker *arg) for (i = 0; i < q->clhash.hashsize; i++) { hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) { - if (arg->count < arg->skip) { - arg->count++; - continue; - } - if (arg->fn(sch, (unsigned long)cl, arg) < 0) { - arg->stop = 1; + if (!tc_qdisc_stats_dump(sch, (unsigned long)cl, arg)) return; - } - arg->count++; } } } diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c index 83d2e54bf303..d0bc660d7401 100644 --- a/net/sched/sch_mq.c +++ b/net/sched/sch_mq.c @@ -247,11 +247,8 @@ static void mq_walk(struct Qdisc *sch, struct qdisc_walker *arg) arg->count = arg->skip; for (ntx = arg->skip; ntx < dev->num_tx_queues; ntx++) { - if (arg->fn(sch, ntx + 1, arg) < 0) { - arg->stop = 1; + if (!tc_qdisc_stats_dump(sch, ntx + 1, arg)) break; - } - arg->count++; } } diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c index b29f3453c6ea..4c68abaa289b 100644 --- a/net/sched/sch_mqprio.c +++ b/net/sched/sch_mqprio.c @@ -558,11 +558,8 @@ static void mqprio_walk(struct Qdisc *sch, struct qdisc_walker *arg) /* Walk hierarchy with a virtual class per tc */ arg->count = arg->skip; for (ntx = arg->skip; ntx < netdev_get_num_tc(dev); ntx++) { - if (arg->fn(sch, ntx + TC_H_MIN_PRIORITY, arg) < 0) { - arg->stop = 1; + if (!tc_qdisc_stats_dump(sch, ntx + TC_H_MIN_PRIORITY, arg)) return; - } - arg->count++; } /* Pad the values and skip over unused traffic classes */ diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c index f28050c7f12d..75c9c860182b 100644 --- a/net/sched/sch_multiq.c +++ b/net/sched/sch_multiq.c @@ -353,15 +353,8 @@ static void multiq_walk(struct Qdisc *sch, struct qdisc_walker *arg) return; for (band = 0; band < q->bands; band++) { - if (arg->count < arg->skip) { - arg->count++; - continue; - } - if (arg->fn(sch, band + 1, arg) < 0) { - arg->stop = 1; + if (!tc_qdisc_stats_dump(sch, band + 1, arg)) break; - } - arg->count++; } } diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c index b70ac04110dd..fb00ac40ecb7 100644 --- a/net/sched/sch_netem.c +++ b/net/sched/sch_netem.c @@ -171,7 +171,7 @@ static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb) static void init_crandom(struct crndstate *state, unsigned long rho) { state->rho = rho; - state->last = prandom_u32(); + state->last = get_random_u32(); } /* get_crandom - correlated random number generator @@ -184,9 +184,9 @@ static u32 get_crandom(struct crndstate *state) unsigned long answer; if (!state || state->rho == 0) /* no correlation */ - return prandom_u32(); + return get_random_u32(); - value = prandom_u32(); + value = get_random_u32(); rho = (u64)state->rho + 1; answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32; state->last = answer; @@ -200,7 +200,7 @@ static u32 get_crandom(struct crndstate *state) static bool loss_4state(struct netem_sched_data *q) { struct clgstate *clg = &q->clg; - u32 rnd = prandom_u32(); + u32 rnd = get_random_u32(); /* * Makes a comparison between rnd and the transition @@ -268,15 +268,15 @@ static bool loss_gilb_ell(struct netem_sched_data *q) switch (clg->state) { case GOOD_STATE: - if (prandom_u32() < clg->a1) + if (get_random_u32() < clg->a1) clg->state = BAD_STATE; - if (prandom_u32() < clg->a4) + if (get_random_u32() < clg->a4) return true; break; case BAD_STATE: - if (prandom_u32() < clg->a2) + if (get_random_u32() < clg->a2) clg->state = GOOD_STATE; - if (prandom_u32() > clg->a3) + if (get_random_u32() > clg->a3) return true; } @@ -513,8 +513,8 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch, goto finish_segs; } - skb->data[prandom_u32() % skb_headlen(skb)] ^= - 1<<(prandom_u32() % 8); + skb->data[prandom_u32_max(skb_headlen(skb))] ^= + 1<<prandom_u32_max(8); } if (unlikely(sch->q.qlen >= sch->limit)) { @@ -632,7 +632,7 @@ static void get_slot_next(struct netem_sched_data *q, u64 now) if (!q->slot_dist) next_delay = q->slot_config.min_delay + - (prandom_u32() * + (get_random_u32() * (q->slot_config.max_delay - q->slot_config.min_delay) >> 32); else @@ -1251,12 +1251,8 @@ static unsigned long netem_find(struct Qdisc *sch, u32 classid) static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker) { if (!walker->stop) { - if (walker->count >= walker->skip) - if (walker->fn(sch, 1, walker) < 0) { - walker->stop = 1; - return; - } - walker->count++; + if (!tc_qdisc_stats_dump(sch, 1, walker)) + return; } } diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c index 974038ba6c7b..265c238047a4 100644 --- a/net/sched/sch_pie.c +++ b/net/sched/sch_pie.c @@ -72,7 +72,7 @@ bool pie_drop_early(struct Qdisc *sch, struct pie_params *params, if (vars->accu_prob >= (MAX_PROB / 2) * 17) return true; - prandom_bytes(&rnd, 8); + get_random_bytes(&rnd, 8); if ((rnd >> BITS_PER_BYTE) < local_prob) { vars->accu_prob = 0; return true; diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c index 298794c04836..fdc5ef52c3ee 100644 --- a/net/sched/sch_prio.c +++ b/net/sched/sch_prio.c @@ -376,15 +376,8 @@ static void prio_walk(struct Qdisc *sch, struct qdisc_walker *arg) return; for (prio = 0; prio < q->bands; prio++) { - if (arg->count < arg->skip) { - arg->count++; - continue; - } - if (arg->fn(sch, prio + 1, arg) < 0) { - arg->stop = 1; + if (!tc_qdisc_stats_dump(sch, prio + 1, arg)) break; - } - arg->count++; } } diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c index 13246a9dc5c1..cf5ebe43b3b4 100644 --- a/net/sched/sch_qfq.c +++ b/net/sched/sch_qfq.c @@ -659,15 +659,8 @@ static void qfq_walk(struct Qdisc *sch, struct qdisc_walker *arg) for (i = 0; i < q->clhash.hashsize; i++) { hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) { - if (arg->count < arg->skip) { - arg->count++; - continue; - } - if (arg->fn(sch, (unsigned long)cl, arg) < 0) { - arg->stop = 1; + if (!tc_qdisc_stats_dump(sch, (unsigned long)cl, arg)) return; - } - arg->count++; } } } diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c index 4952406f70b9..a5a401f93c1a 100644 --- a/net/sched/sch_red.c +++ b/net/sched/sch_red.c @@ -516,12 +516,7 @@ static unsigned long red_find(struct Qdisc *sch, u32 classid) static void red_walk(struct Qdisc *sch, struct qdisc_walker *walker) { if (!walker->stop) { - if (walker->count >= walker->skip) - if (walker->fn(sch, 1, walker) < 0) { - walker->stop = 1; - return; - } - walker->count++; + tc_qdisc_stats_dump(sch, 1, walker); } } diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c index 31717fa45a4f..1871a1c0224d 100644 --- a/net/sched/sch_sfb.c +++ b/net/sched/sch_sfb.c @@ -135,15 +135,15 @@ static void increment_one_qlen(u32 sfbhash, u32 slot, struct sfb_sched_data *q) } } -static void increment_qlen(const struct sk_buff *skb, struct sfb_sched_data *q) +static void increment_qlen(const struct sfb_skb_cb *cb, struct sfb_sched_data *q) { u32 sfbhash; - sfbhash = sfb_hash(skb, 0); + sfbhash = cb->hashes[0]; if (sfbhash) increment_one_qlen(sfbhash, 0, q); - sfbhash = sfb_hash(skb, 1); + sfbhash = cb->hashes[1]; if (sfbhash) increment_one_qlen(sfbhash, 1, q); } @@ -281,8 +281,10 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch, { struct sfb_sched_data *q = qdisc_priv(sch); + unsigned int len = qdisc_pkt_len(skb); struct Qdisc *child = q->qdisc; struct tcf_proto *fl; + struct sfb_skb_cb cb; int i; u32 p_min = ~0; u32 minqlen = ~0; @@ -377,7 +379,7 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch, goto enqueue; } - r = prandom_u32() & SFB_MAX_PROB; + r = get_random_u16() & SFB_MAX_PROB; if (unlikely(r < p_min)) { if (unlikely(p_min > SFB_MAX_PROB / 2)) { @@ -399,11 +401,12 @@ static int sfb_enqueue(struct sk_buff *skb, struct Qdisc *sch, } enqueue: + memcpy(&cb, sfb_skb_cb(skb), sizeof(cb)); ret = qdisc_enqueue(skb, child, to_free); if (likely(ret == NET_XMIT_SUCCESS)) { - qdisc_qstats_backlog_inc(sch, skb); + sch->qstats.backlog += len; sch->q.qlen++; - increment_qlen(skb, q); + increment_qlen(&cb, q); } else if (net_xmit_drop_count(ret)) { q->stats.childdrop++; qdisc_qstats_drop(sch); @@ -452,7 +455,8 @@ static void sfb_reset(struct Qdisc *sch) { struct sfb_sched_data *q = qdisc_priv(sch); - qdisc_reset(q->qdisc); + if (likely(q->qdisc)) + qdisc_reset(q->qdisc); q->slot = 0; q->double_buffering = false; sfb_zero_all_buckets(q); @@ -656,12 +660,7 @@ static int sfb_delete(struct Qdisc *sch, unsigned long cl, static void sfb_walk(struct Qdisc *sch, struct qdisc_walker *walker) { if (!walker->stop) { - if (walker->count >= walker->skip) - if (walker->fn(sch, 1, walker) < 0) { - walker->stop = 1; - return; - } - walker->count++; + tc_qdisc_stats_dump(sch, 1, walker); } } diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index f8e569f79f13..abd436307d6a 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -888,16 +888,12 @@ static void sfq_walk(struct Qdisc *sch, struct qdisc_walker *arg) return; for (i = 0; i < q->divisor; i++) { - if (q->ht[i] == SFQ_EMPTY_SLOT || - arg->count < arg->skip) { + if (q->ht[i] == SFQ_EMPTY_SLOT) { arg->count++; continue; } - if (arg->fn(sch, i + 1, arg) < 0) { - arg->stop = 1; + if (!tc_qdisc_stats_dump(sch, i + 1, arg)) break; - } - arg->count++; } } diff --git a/net/sched/sch_skbprio.c b/net/sched/sch_skbprio.c index df72fb83d9c7..5df2dacb7b1a 100644 --- a/net/sched/sch_skbprio.c +++ b/net/sched/sch_skbprio.c @@ -265,15 +265,8 @@ static void skbprio_walk(struct Qdisc *sch, struct qdisc_walker *arg) return; for (i = 0; i < SKBPRIO_MAX_PRIORITY; i++) { - if (arg->count < arg->skip) { - arg->count++; - continue; - } - if (arg->fn(sch, i + 1, arg) < 0) { - arg->stop = 1; + if (!tc_qdisc_stats_dump(sch, i + 1, arg)) break; - } - arg->count++; } } diff --git a/net/sched/sch_taprio.c b/net/sched/sch_taprio.c index db88a692ef81..570389f6cdd7 100644 --- a/net/sched/sch_taprio.c +++ b/net/sched/sch_taprio.c @@ -27,7 +27,6 @@ #include <net/tcp.h> static LIST_HEAD(taprio_list); -static DEFINE_SPINLOCK(taprio_list_lock); #define TAPRIO_ALL_GATES_OPEN -1 @@ -67,6 +66,7 @@ struct taprio_sched { u32 flags; enum tk_offsets tk_offset; int clockid; + bool offloaded; atomic64_t picos_per_byte; /* Using picoseconds because for 10Gbps+ * speeds it's sub-nanoseconds per byte */ @@ -78,8 +78,8 @@ struct taprio_sched { struct sched_gate_list __rcu *admin_sched; struct hrtimer advance_timer; struct list_head taprio_list; - struct sk_buff *(*dequeue)(struct Qdisc *sch); - struct sk_buff *(*peek)(struct Qdisc *sch); + u32 max_frm_len[TC_MAX_QUEUE]; /* for the fast path */ + u32 max_sdu[TC_MAX_QUEUE]; /* for dump and offloading */ u32 txtime_delay; }; @@ -417,6 +417,9 @@ static int taprio_enqueue_one(struct sk_buff *skb, struct Qdisc *sch, struct Qdisc *child, struct sk_buff **to_free) { struct taprio_sched *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + int prio = skb->priority; + u8 tc; /* sk_flags are only safe to use on full sockets. */ if (skb->sk && sk_fullsock(skb->sk) && sock_flag(skb->sk, SOCK_TXTIME)) { @@ -428,12 +431,20 @@ static int taprio_enqueue_one(struct sk_buff *skb, struct Qdisc *sch, return qdisc_drop(skb, sch, to_free); } + /* Devices with full offload are expected to honor this in hardware */ + tc = netdev_get_prio_tc_map(dev, prio); + if (skb->len > q->max_frm_len[tc]) + return qdisc_drop(skb, sch, to_free); + qdisc_qstats_backlog_inc(sch, skb); sch->q.qlen++; return qdisc_enqueue(skb, child, to_free); } +/* Will not be called in the full offload case, since the TX queues are + * attached to the Qdisc created using qdisc_create_dflt() + */ static int taprio_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free) { @@ -441,11 +452,6 @@ static int taprio_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct Qdisc *child; int queue; - if (unlikely(FULL_OFFLOAD_IS_ENABLED(q->flags))) { - WARN_ONCE(1, "Trying to enqueue skb into the root of a taprio qdisc configured with full offload\n"); - return qdisc_drop(skb, sch, to_free); - } - queue = skb_get_queue_mapping(skb); child = q->qdiscs[queue]; @@ -454,10 +460,10 @@ static int taprio_enqueue(struct sk_buff *skb, struct Qdisc *sch, /* Large packets might not be transmitted when the transmission duration * exceeds any configured interval. Therefore, segment the skb into - * smaller chunks. Skip it for the full offload case, as the driver - * and/or the hardware is expected to handle this. + * smaller chunks. Drivers with full offload are expected to handle + * this in hardware. */ - if (skb_is_gso(skb) && !FULL_OFFLOAD_IS_ENABLED(q->flags)) { + if (skb_is_gso(skb)) { unsigned int slen = 0, numsegs = 0, len = qdisc_pkt_len(skb); netdev_features_t features = netif_skb_features(skb); struct sk_buff *segs, *nskb; @@ -491,7 +497,10 @@ static int taprio_enqueue(struct sk_buff *skb, struct Qdisc *sch, return taprio_enqueue_one(skb, sch, child, to_free); } -static struct sk_buff *taprio_peek_soft(struct Qdisc *sch) +/* Will not be called in the full offload case, since the TX queues are + * attached to the Qdisc created using qdisc_create_dflt() + */ +static struct sk_buff *taprio_peek(struct Qdisc *sch) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); @@ -535,20 +544,6 @@ static struct sk_buff *taprio_peek_soft(struct Qdisc *sch) return NULL; } -static struct sk_buff *taprio_peek_offload(struct Qdisc *sch) -{ - WARN_ONCE(1, "Trying to peek into the root of a taprio qdisc configured with full offload\n"); - - return NULL; -} - -static struct sk_buff *taprio_peek(struct Qdisc *sch) -{ - struct taprio_sched *q = qdisc_priv(sch); - - return q->peek(sch); -} - static void taprio_set_budget(struct taprio_sched *q, struct sched_entry *entry) { atomic_set(&entry->budget, @@ -556,7 +551,10 @@ static void taprio_set_budget(struct taprio_sched *q, struct sched_entry *entry) atomic64_read(&q->picos_per_byte))); } -static struct sk_buff *taprio_dequeue_soft(struct Qdisc *sch) +/* Will not be called in the full offload case, since the TX queues are + * attached to the Qdisc created using qdisc_create_dflt() + */ +static struct sk_buff *taprio_dequeue(struct Qdisc *sch) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); @@ -644,20 +642,6 @@ done: return skb; } -static struct sk_buff *taprio_dequeue_offload(struct Qdisc *sch) -{ - WARN_ONCE(1, "Trying to dequeue from the root of a taprio qdisc configured with full offload\n"); - - return NULL; -} - -static struct sk_buff *taprio_dequeue(struct Qdisc *sch) -{ - struct taprio_sched *q = qdisc_priv(sch); - - return q->dequeue(sch); -} - static bool should_restart_cycle(const struct sched_gate_list *oper, const struct sched_entry *entry) { @@ -780,6 +764,11 @@ static const struct nla_policy entry_policy[TCA_TAPRIO_SCHED_ENTRY_MAX + 1] = { [TCA_TAPRIO_SCHED_ENTRY_INTERVAL] = { .type = NLA_U32 }, }; +static const struct nla_policy taprio_tc_policy[TCA_TAPRIO_TC_ENTRY_MAX + 1] = { + [TCA_TAPRIO_TC_ENTRY_INDEX] = { .type = NLA_U32 }, + [TCA_TAPRIO_TC_ENTRY_MAX_SDU] = { .type = NLA_U32 }, +}; + static const struct nla_policy taprio_policy[TCA_TAPRIO_ATTR_MAX + 1] = { [TCA_TAPRIO_ATTR_PRIOMAP] = { .len = sizeof(struct tc_mqprio_qopt) @@ -792,6 +781,7 @@ static const struct nla_policy taprio_policy[TCA_TAPRIO_ATTR_MAX + 1] = { [TCA_TAPRIO_ATTR_SCHED_CYCLE_TIME_EXTENSION] = { .type = NLA_S64 }, [TCA_TAPRIO_ATTR_FLAGS] = { .type = NLA_U32 }, [TCA_TAPRIO_ATTR_TXTIME_DELAY] = { .type = NLA_U32 }, + [TCA_TAPRIO_ATTR_TC_ENTRY] = { .type = NLA_NESTED }, }; static int fill_sched_entry(struct taprio_sched *q, struct nlattr **tb, @@ -1098,27 +1088,20 @@ static int taprio_dev_notifier(struct notifier_block *nb, unsigned long event, void *ptr) { struct net_device *dev = netdev_notifier_info_to_dev(ptr); - struct net_device *qdev; struct taprio_sched *q; - bool found = false; ASSERT_RTNL(); if (event != NETDEV_UP && event != NETDEV_CHANGE) return NOTIFY_DONE; - spin_lock(&taprio_list_lock); list_for_each_entry(q, &taprio_list, taprio_list) { - qdev = qdisc_dev(q->root); - if (qdev == dev) { - found = true; - break; - } - } - spin_unlock(&taprio_list_lock); + if (dev != qdisc_dev(q->root)) + continue; - if (found) taprio_set_picos_per_byte(dev, q); + break; + } return NOTIFY_DONE; } @@ -1193,16 +1176,10 @@ static void taprio_offload_config_changed(struct taprio_sched *q) { struct sched_gate_list *oper, *admin; - spin_lock(&q->current_entry_lock); - - oper = rcu_dereference_protected(q->oper_sched, - lockdep_is_held(&q->current_entry_lock)); - admin = rcu_dereference_protected(q->admin_sched, - lockdep_is_held(&q->current_entry_lock)); + oper = rtnl_dereference(q->oper_sched); + admin = rtnl_dereference(q->admin_sched); switch_schedules(q, &admin, &oper); - - spin_unlock(&q->current_entry_lock); } static u32 tc_map_to_queue_mask(struct net_device *dev, u32 tc_mask) @@ -1255,7 +1232,8 @@ static int taprio_enable_offload(struct net_device *dev, { const struct net_device_ops *ops = dev->netdev_ops; struct tc_taprio_qopt_offload *offload; - int err = 0; + struct tc_taprio_caps caps; + int tc, err = 0; if (!ops->ndo_setup_tc) { NL_SET_ERR_MSG(extack, @@ -1263,6 +1241,19 @@ static int taprio_enable_offload(struct net_device *dev, return -EOPNOTSUPP; } + qdisc_offload_query_caps(dev, TC_SETUP_QDISC_TAPRIO, + &caps, sizeof(caps)); + + if (!caps.supports_queue_max_sdu) { + for (tc = 0; tc < TC_MAX_QUEUE; tc++) { + if (q->max_sdu[tc]) { + NL_SET_ERR_MSG_MOD(extack, + "Device does not handle queueMaxSDU"); + return -EOPNOTSUPP; + } + } + } + offload = taprio_offload_alloc(sched->num_entries); if (!offload) { NL_SET_ERR_MSG(extack, @@ -1272,6 +1263,9 @@ static int taprio_enable_offload(struct net_device *dev, offload->enable = 1; taprio_sched_to_offload(dev, sched, offload); + for (tc = 0; tc < TC_MAX_QUEUE; tc++) + offload->max_sdu[tc] = q->max_sdu[tc]; + err = ops->ndo_setup_tc(dev, TC_SETUP_QDISC_TAPRIO, offload); if (err < 0) { NL_SET_ERR_MSG(extack, @@ -1279,6 +1273,8 @@ static int taprio_enable_offload(struct net_device *dev, goto done; } + q->offloaded = true; + done: taprio_offload_free(offload); @@ -1293,12 +1289,9 @@ static int taprio_disable_offload(struct net_device *dev, struct tc_taprio_qopt_offload *offload; int err; - if (!FULL_OFFLOAD_IS_ENABLED(q->flags)) + if (!q->offloaded) return 0; - if (!ops->ndo_setup_tc) - return -EOPNOTSUPP; - offload = taprio_offload_alloc(0); if (!offload) { NL_SET_ERR_MSG(extack, @@ -1314,6 +1307,8 @@ static int taprio_disable_offload(struct net_device *dev, goto out; } + q->offloaded = false; + out: taprio_offload_free(offload); @@ -1405,6 +1400,89 @@ out: return err; } +static int taprio_parse_tc_entry(struct Qdisc *sch, + struct nlattr *opt, + u32 max_sdu[TC_QOPT_MAX_QUEUE], + unsigned long *seen_tcs, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[TCA_TAPRIO_TC_ENTRY_MAX + 1] = { }; + struct net_device *dev = qdisc_dev(sch); + u32 val = 0; + int err, tc; + + err = nla_parse_nested(tb, TCA_TAPRIO_TC_ENTRY_MAX, opt, + taprio_tc_policy, extack); + if (err < 0) + return err; + + if (!tb[TCA_TAPRIO_TC_ENTRY_INDEX]) { + NL_SET_ERR_MSG_MOD(extack, "TC entry index missing"); + return -EINVAL; + } + + tc = nla_get_u32(tb[TCA_TAPRIO_TC_ENTRY_INDEX]); + if (tc >= TC_QOPT_MAX_QUEUE) { + NL_SET_ERR_MSG_MOD(extack, "TC entry index out of range"); + return -ERANGE; + } + + if (*seen_tcs & BIT(tc)) { + NL_SET_ERR_MSG_MOD(extack, "Duplicate TC entry"); + return -EINVAL; + } + + *seen_tcs |= BIT(tc); + + if (tb[TCA_TAPRIO_TC_ENTRY_MAX_SDU]) + val = nla_get_u32(tb[TCA_TAPRIO_TC_ENTRY_MAX_SDU]); + + if (val > dev->max_mtu) { + NL_SET_ERR_MSG_MOD(extack, "TC max SDU exceeds device max MTU"); + return -ERANGE; + } + + max_sdu[tc] = val; + + return 0; +} + +static int taprio_parse_tc_entries(struct Qdisc *sch, + struct nlattr *opt, + struct netlink_ext_ack *extack) +{ + struct taprio_sched *q = qdisc_priv(sch); + struct net_device *dev = qdisc_dev(sch); + u32 max_sdu[TC_QOPT_MAX_QUEUE]; + unsigned long seen_tcs = 0; + struct nlattr *n; + int tc, rem; + int err = 0; + + for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++) + max_sdu[tc] = q->max_sdu[tc]; + + nla_for_each_nested(n, opt, rem) { + if (nla_type(n) != TCA_TAPRIO_ATTR_TC_ENTRY) + continue; + + err = taprio_parse_tc_entry(sch, n, max_sdu, &seen_tcs, extack); + if (err) + goto out; + } + + for (tc = 0; tc < TC_QOPT_MAX_QUEUE; tc++) { + q->max_sdu[tc] = max_sdu[tc]; + if (max_sdu[tc]) + q->max_frm_len[tc] = max_sdu[tc] + dev->hard_header_len; + else + q->max_frm_len[tc] = U32_MAX; /* never oversized */ + } + +out: + return err; +} + static int taprio_mqprio_cmp(const struct net_device *dev, const struct tc_mqprio_qopt *mqprio) { @@ -1483,6 +1561,10 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, if (err < 0) return err; + err = taprio_parse_tc_entries(sch, opt, extack); + if (err) + return err; + new_admin = kzalloc(sizeof(*new_admin), GFP_KERNEL); if (!new_admin) { NL_SET_ERR_MSG(extack, "Not enough memory for a new schedule"); @@ -1490,10 +1572,8 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, } INIT_LIST_HEAD(&new_admin->entries); - rcu_read_lock(); - oper = rcu_dereference(q->oper_sched); - admin = rcu_dereference(q->admin_sched); - rcu_read_unlock(); + oper = rtnl_dereference(q->oper_sched); + admin = rtnl_dereference(q->admin_sched); /* no changes - no new mqprio settings */ if (!taprio_mqprio_cmp(dev, mqprio)) @@ -1563,17 +1643,6 @@ static int taprio_change(struct Qdisc *sch, struct nlattr *opt, q->advance_timer.function = advance_sched; } - if (FULL_OFFLOAD_IS_ENABLED(q->flags)) { - q->dequeue = taprio_dequeue_offload; - q->peek = taprio_peek_offload; - } else { - /* Be sure to always keep the function pointers - * in a consistent state. - */ - q->dequeue = taprio_dequeue_soft; - q->peek = taprio_peek_soft; - } - err = taprio_get_start_time(sch, new_admin, &start); if (err < 0) { NL_SET_ERR_MSG(extack, "Internal error: failed get start time"); @@ -1642,11 +1711,10 @@ static void taprio_destroy(struct Qdisc *sch) { struct taprio_sched *q = qdisc_priv(sch); struct net_device *dev = qdisc_dev(sch); + struct sched_gate_list *oper, *admin; unsigned int i; - spin_lock(&taprio_list_lock); list_del(&q->taprio_list); - spin_unlock(&taprio_list_lock); /* Note that taprio_reset() might not be called if an error * happens in qdisc_create(), after taprio_init() has been called. @@ -1665,11 +1733,14 @@ static void taprio_destroy(struct Qdisc *sch) netdev_reset_tc(dev); - if (q->oper_sched) - call_rcu(&q->oper_sched->rcu, taprio_free_sched_cb); + oper = rtnl_dereference(q->oper_sched); + admin = rtnl_dereference(q->admin_sched); - if (q->admin_sched) - call_rcu(&q->admin_sched->rcu, taprio_free_sched_cb); + if (oper) + call_rcu(&oper->rcu, taprio_free_sched_cb); + + if (admin) + call_rcu(&admin->rcu, taprio_free_sched_cb); } static int taprio_init(struct Qdisc *sch, struct nlattr *opt, @@ -1684,9 +1755,6 @@ static int taprio_init(struct Qdisc *sch, struct nlattr *opt, hrtimer_init(&q->advance_timer, CLOCK_TAI, HRTIMER_MODE_ABS); q->advance_timer.function = advance_sched; - q->dequeue = taprio_dequeue_soft; - q->peek = taprio_peek_soft; - q->root = sch; /* We only support static clockids. Use an invalid value as default @@ -1695,15 +1763,17 @@ static int taprio_init(struct Qdisc *sch, struct nlattr *opt, q->clockid = -1; q->flags = TAPRIO_FLAGS_INVALID; - spin_lock(&taprio_list_lock); list_add(&q->taprio_list, &taprio_list); - spin_unlock(&taprio_list_lock); - if (sch->parent != TC_H_ROOT) + if (sch->parent != TC_H_ROOT) { + NL_SET_ERR_MSG_MOD(extack, "Can only be attached as root qdisc"); return -EOPNOTSUPP; + } - if (!netif_is_multiqueue(dev)) + if (!netif_is_multiqueue(dev)) { + NL_SET_ERR_MSG_MOD(extack, "Multi-queue device is required"); return -EOPNOTSUPP; + } /* pre-allocate qdisc, attachment can't fail */ q->qdiscs = kcalloc(dev->num_tx_queues, @@ -1875,6 +1945,33 @@ error_nest: return -1; } +static int taprio_dump_tc_entries(struct taprio_sched *q, struct sk_buff *skb) +{ + struct nlattr *n; + int tc; + + for (tc = 0; tc < TC_MAX_QUEUE; tc++) { + n = nla_nest_start(skb, TCA_TAPRIO_ATTR_TC_ENTRY); + if (!n) + return -EMSGSIZE; + + if (nla_put_u32(skb, TCA_TAPRIO_TC_ENTRY_INDEX, tc)) + goto nla_put_failure; + + if (nla_put_u32(skb, TCA_TAPRIO_TC_ENTRY_MAX_SDU, + q->max_sdu[tc])) + goto nla_put_failure; + + nla_nest_end(skb, n); + } + + return 0; + +nla_put_failure: + nla_nest_cancel(skb, n); + return -EMSGSIZE; +} + static int taprio_dump(struct Qdisc *sch, struct sk_buff *skb) { struct taprio_sched *q = qdisc_priv(sch); @@ -1884,9 +1981,8 @@ static int taprio_dump(struct Qdisc *sch, struct sk_buff *skb) struct nlattr *nest, *sched_nest; unsigned int i; - rcu_read_lock(); - oper = rcu_dereference(q->oper_sched); - admin = rcu_dereference(q->admin_sched); + oper = rtnl_dereference(q->oper_sched); + admin = rtnl_dereference(q->admin_sched); opt.num_tc = netdev_get_num_tc(dev); memcpy(opt.prio_tc_map, dev->prio_tc_map, sizeof(opt.prio_tc_map)); @@ -1914,6 +2010,9 @@ static int taprio_dump(struct Qdisc *sch, struct sk_buff *skb) nla_put_u32(skb, TCA_TAPRIO_ATTR_TXTIME_DELAY, q->txtime_delay)) goto options_error; + if (taprio_dump_tc_entries(q, skb)) + goto options_error; + if (oper && dump_schedule(skb, oper)) goto options_error; @@ -1930,8 +2029,6 @@ static int taprio_dump(struct Qdisc *sch, struct sk_buff *skb) nla_nest_end(skb, sched_nest); done: - rcu_read_unlock(); - return nla_nest_end(skb, nest); admin_error: @@ -1941,7 +2038,6 @@ options_error: nla_nest_cancel(skb, nest); start_error: - rcu_read_unlock(); return -ENOSPC; } @@ -2000,11 +2096,8 @@ static void taprio_walk(struct Qdisc *sch, struct qdisc_walker *arg) arg->count = arg->skip; for (ntx = arg->skip; ntx < dev->num_tx_queues; ntx++) { - if (arg->fn(sch, ntx + 1, arg) < 0) { - arg->stop = 1; + if (!tc_qdisc_stats_dump(sch, ntx + 1, arg)) break; - } - arg->count++; } } diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c index e031c1a41ea6..277ad11f4d61 100644 --- a/net/sched/sch_tbf.c +++ b/net/sched/sch_tbf.c @@ -580,12 +580,7 @@ static unsigned long tbf_find(struct Qdisc *sch, u32 classid) static void tbf_walk(struct Qdisc *sch, struct qdisc_walker *walker) { if (!walker->stop) { - if (walker->count >= walker->skip) - if (walker->fn(sch, 1, walker) < 0) { - walker->stop = 1; - return; - } - walker->count++; + tc_qdisc_stats_dump(sch, 1, walker); } } diff --git a/net/sctp/associola.c b/net/sctp/associola.c index 3460abceba44..63ba5551c13f 100644 --- a/net/sctp/associola.c +++ b/net/sctp/associola.c @@ -226,8 +226,7 @@ static struct sctp_association *sctp_association_init( /* Create an output queue. */ sctp_outq_init(asoc, &asoc->outqueue); - if (!sctp_ulpq_init(&asoc->ulpq, asoc)) - goto fail_init; + sctp_ulpq_init(&asoc->ulpq, asoc); if (sctp_stream_init(&asoc->stream, asoc->c.sinit_num_ostreams, 0, gfp)) goto stream_free; @@ -277,7 +276,6 @@ static struct sctp_association *sctp_association_init( stream_free: sctp_stream_free(&asoc->stream); -fail_init: sock_put(asoc->base.sk); sctp_endpoint_put(asoc->ep); return NULL; diff --git a/net/sctp/auth.c b/net/sctp/auth.c index db6b7373d16c..34964145514e 100644 --- a/net/sctp/auth.c +++ b/net/sctp/auth.c @@ -863,12 +863,17 @@ int sctp_auth_set_key(struct sctp_endpoint *ep, } list_del_init(&shkey->key_list); - sctp_auth_shkey_release(shkey); list_add(&cur_key->key_list, sh_keys); - if (asoc && asoc->active_key_id == auth_key->sca_keynumber) - sctp_auth_asoc_init_active_key(asoc, GFP_KERNEL); + if (asoc && asoc->active_key_id == auth_key->sca_keynumber && + sctp_auth_asoc_init_active_key(asoc, GFP_KERNEL)) { + list_del_init(&cur_key->key_list); + sctp_auth_shkey_release(cur_key); + list_add(&shkey->key_list, sh_keys); + return -ENOMEM; + } + sctp_auth_shkey_release(shkey); return 0; } @@ -902,8 +907,13 @@ int sctp_auth_set_active_key(struct sctp_endpoint *ep, return -EINVAL; if (asoc) { + __u16 active_key_id = asoc->active_key_id; + asoc->active_key_id = key_id; - sctp_auth_asoc_init_active_key(asoc, GFP_KERNEL); + if (sctp_auth_asoc_init_active_key(asoc, GFP_KERNEL)) { + asoc->active_key_id = active_key_id; + return -ENOMEM; + } } else ep->active_key_id = key_id; diff --git a/net/sctp/socket.c b/net/sctp/socket.c index 171f1a35d205..3e83963d1b8a 100644 --- a/net/sctp/socket.c +++ b/net/sctp/socket.c @@ -5098,13 +5098,17 @@ static void sctp_destroy_sock(struct sock *sk) } /* Triggered when there are no references on the socket anymore */ -static void sctp_destruct_sock(struct sock *sk) +static void sctp_destruct_common(struct sock *sk) { struct sctp_sock *sp = sctp_sk(sk); /* Free up the HMAC transform. */ crypto_free_shash(sp->hmac); +} +static void sctp_destruct_sock(struct sock *sk) +{ + sctp_destruct_common(sk); inet_sock_destruct(sk); } @@ -8319,7 +8323,7 @@ static int sctp_get_port_local(struct sock *sk, union sctp_addr *addr) inet_get_local_port_range(net, &low, &high); remaining = (high - low) + 1; - rover = prandom_u32() % remaining + low; + rover = prandom_u32_max(remaining) + low; do { rover++; @@ -9427,7 +9431,7 @@ void sctp_copy_sock(struct sock *newsk, struct sock *sk, sctp_sk(newsk)->reuse = sp->reuse; newsk->sk_shutdown = sk->sk_shutdown; - newsk->sk_destruct = sctp_destruct_sock; + newsk->sk_destruct = sk->sk_destruct; newsk->sk_family = sk->sk_family; newsk->sk_protocol = IPPROTO_SCTP; newsk->sk_backlog_rcv = sk->sk_prot->backlog_rcv; @@ -9448,7 +9452,7 @@ void sctp_copy_sock(struct sock *newsk, struct sock *sk, newinet->inet_rcv_saddr = inet->inet_rcv_saddr; newinet->inet_dport = htons(asoc->peer.port); newinet->pmtudisc = inet->pmtudisc; - newinet->inet_id = prandom_u32(); + newinet->inet_id = get_random_u16(); newinet->uc_ttl = inet->uc_ttl; newinet->mc_loop = 1; @@ -9662,11 +9666,20 @@ struct proto sctp_prot = { #if IS_ENABLED(CONFIG_IPV6) -#include <net/transp_v6.h> -static void sctp_v6_destroy_sock(struct sock *sk) +static void sctp_v6_destruct_sock(struct sock *sk) +{ + sctp_destruct_common(sk); + inet6_sock_destruct(sk); +} + +static int sctp_v6_init_sock(struct sock *sk) { - sctp_destroy_sock(sk); - inet6_destroy_sock(sk); + int ret = sctp_init_sock(sk); + + if (!ret) + sk->sk_destruct = sctp_v6_destruct_sock; + + return ret; } struct proto sctpv6_prot = { @@ -9676,8 +9689,8 @@ struct proto sctpv6_prot = { .disconnect = sctp_disconnect, .accept = sctp_accept, .ioctl = sctp_ioctl, - .init = sctp_init_sock, - .destroy = sctp_v6_destroy_sock, + .init = sctp_v6_init_sock, + .destroy = sctp_destroy_sock, .shutdown = sctp_shutdown, .setsockopt = sctp_setsockopt, .getsockopt = sctp_getsockopt, diff --git a/net/sctp/stream_interleave.c b/net/sctp/stream_interleave.c index bb22b71df7a3..94727feb07b3 100644 --- a/net/sctp/stream_interleave.c +++ b/net/sctp/stream_interleave.c @@ -490,11 +490,8 @@ static int sctp_enqueue_event(struct sctp_ulpq *ulpq, if (!sctp_ulpevent_is_enabled(event, ulpq->asoc->subscribe)) goto out_free; - if (skb_list) - skb_queue_splice_tail_init(skb_list, - &sk->sk_receive_queue); - else - __skb_queue_tail(&sk->sk_receive_queue, skb); + skb_queue_splice_tail_init(skb_list, + &sk->sk_receive_queue); if (!sp->data_ready_signalled) { sp->data_ready_signalled = 1; @@ -504,10 +501,7 @@ static int sctp_enqueue_event(struct sctp_ulpq *ulpq, return 1; out_free: - if (skb_list) - sctp_queue_purge_ulpevents(skb_list); - else - sctp_ulpevent_free(event); + sctp_queue_purge_ulpevents(skb_list); return 0; } diff --git a/net/sctp/ulpqueue.c b/net/sctp/ulpqueue.c index 0a8510a0c5e6..b05daafd369a 100644 --- a/net/sctp/ulpqueue.c +++ b/net/sctp/ulpqueue.c @@ -38,8 +38,7 @@ static void sctp_ulpq_reasm_drain(struct sctp_ulpq *ulpq); /* 1st Level Abstractions */ /* Initialize a ULP queue from a block of memory. */ -struct sctp_ulpq *sctp_ulpq_init(struct sctp_ulpq *ulpq, - struct sctp_association *asoc) +void sctp_ulpq_init(struct sctp_ulpq *ulpq, struct sctp_association *asoc) { memset(ulpq, 0, sizeof(struct sctp_ulpq)); @@ -48,8 +47,6 @@ struct sctp_ulpq *sctp_ulpq_init(struct sctp_ulpq *ulpq, skb_queue_head_init(&ulpq->reasm_uo); skb_queue_head_init(&ulpq->lobby); ulpq->pd_mode = 0; - - return ulpq; } @@ -259,10 +256,7 @@ int sctp_ulpq_tail_event(struct sctp_ulpq *ulpq, struct sk_buff_head *skb_list) return 1; out_free: - if (skb_list) - sctp_queue_purge_ulpevents(skb_list); - else - sctp_ulpevent_free(event); + sctp_queue_purge_ulpevents(skb_list); return 0; } diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c index 0939cc3b915a..3ccbf3c201cd 100644 --- a/net/smc/af_smc.c +++ b/net/smc/af_smc.c @@ -379,6 +379,8 @@ static struct sock *smc_sock_alloc(struct net *net, struct socket *sock, sk->sk_state = SMC_INIT; sk->sk_destruct = smc_destruct; sk->sk_protocol = protocol; + WRITE_ONCE(sk->sk_sndbuf, READ_ONCE(net->smc.sysctl_wmem)); + WRITE_ONCE(sk->sk_rcvbuf, READ_ONCE(net->smc.sysctl_rmem)); smc = smc_sk(sk); INIT_WORK(&smc->tcp_listen_work, smc_tcp_listen_work); INIT_WORK(&smc->connect_work, smc_connect_work); @@ -427,6 +429,7 @@ static int smc_bind(struct socket *sock, struct sockaddr *uaddr, goto out_rel; smc->clcsock->sk->sk_reuse = sk->sk_reuse; + smc->clcsock->sk->sk_reuseport = sk->sk_reuseport; rc = kernel_bind(smc->clcsock, uaddr, addr_len); out_rel: @@ -3253,9 +3256,6 @@ static int __smc_create(struct net *net, struct socket *sock, int protocol, smc->clcsock = clcsock; } - smc->sk.sk_sndbuf = max(smc->clcsock->sk->sk_sndbuf, SMC_BUF_MIN_SIZE); - smc->sk.sk_rcvbuf = max(smc->clcsock->sk->sk_rcvbuf, SMC_BUF_MIN_SIZE); - out: return rc; } diff --git a/net/smc/smc_core.c b/net/smc/smc_core.c index ff49a11f57b8..c305d8dd23f8 100644 --- a/net/smc/smc_core.c +++ b/net/smc/smc_core.c @@ -757,6 +757,7 @@ int smcr_link_init(struct smc_link_group *lgr, struct smc_link *lnk, lnk->lgr = lgr; smc_lgr_hold(lgr); /* lgr_put in smcr_link_clear() */ lnk->link_idx = link_idx; + lnk->wr_rx_id_compl = 0; smc_ibdev_cnt_inc(lnk); smcr_copy_dev_info_to_link(lnk); atomic_set(&lnk->conn_cnt, 0); @@ -895,7 +896,8 @@ static int smc_lgr_create(struct smc_sock *smc, struct smc_init_info *ini) } memcpy(lgr->pnet_id, ibdev->pnetid[ibport - 1], SMC_MAX_PNETID_LEN); - if (smc_wr_alloc_lgr_mem(lgr)) + rc = smc_wr_alloc_lgr_mem(lgr); + if (rc) goto free_wq; smc_llc_lgr_init(lgr, smc); @@ -2238,7 +2240,7 @@ out: static int smcr_buf_map_usable_links(struct smc_link_group *lgr, struct smc_buf_desc *buf_desc, bool is_rmb) { - int i, rc = 0; + int i, rc = 0, cnt = 0; /* protect against parallel link reconfiguration */ mutex_lock(&lgr->llc_conf_mutex); @@ -2251,9 +2253,12 @@ static int smcr_buf_map_usable_links(struct smc_link_group *lgr, rc = -ENOMEM; goto out; } + cnt++; } out: mutex_unlock(&lgr->llc_conf_mutex); + if (!rc && !cnt) + rc = -EINVAL; return rc; } @@ -2306,10 +2311,10 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) if (is_rmb) /* use socket recv buffer size (w/o overhead) as start value */ - sk_buf_size = smc->sk.sk_rcvbuf / 2; + sk_buf_size = smc->sk.sk_rcvbuf; else /* use socket send buffer size (w/o overhead) as start value */ - sk_buf_size = smc->sk.sk_sndbuf / 2; + sk_buf_size = smc->sk.sk_sndbuf; for (bufsize_short = smc_compress_bufsize(sk_buf_size, is_smcd, is_rmb); bufsize_short >= 0; bufsize_short--) { @@ -2368,7 +2373,7 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) if (is_rmb) { conn->rmb_desc = buf_desc; conn->rmbe_size_short = bufsize_short; - smc->sk.sk_rcvbuf = bufsize * 2; + smc->sk.sk_rcvbuf = bufsize; atomic_set(&conn->bytes_to_rcv, 0); conn->rmbe_update_limit = smc_rmb_wnd_update_limit(buf_desc->len); @@ -2376,7 +2381,7 @@ static int __smc_buf_create(struct smc_sock *smc, bool is_smcd, bool is_rmb) smc_ism_set_conn(conn); /* map RMB/smcd_dev to conn */ } else { conn->sndbuf_desc = buf_desc; - smc->sk.sk_sndbuf = bufsize * 2; + smc->sk.sk_sndbuf = bufsize; atomic_set(&conn->sndbuf_space, bufsize); } return 0; diff --git a/net/smc/smc_core.h b/net/smc/smc_core.h index fe8b524ad846..285f9bd8e232 100644 --- a/net/smc/smc_core.h +++ b/net/smc/smc_core.h @@ -115,8 +115,10 @@ struct smc_link { dma_addr_t wr_rx_dma_addr; /* DMA address of wr_rx_bufs */ dma_addr_t wr_rx_v2_dma_addr; /* DMA address of v2 rx buf*/ u64 wr_rx_id; /* seq # of last recv WR */ + u64 wr_rx_id_compl; /* seq # of last completed WR */ u32 wr_rx_cnt; /* number of WR recv buffers */ unsigned long wr_rx_tstamp; /* jiffies when last buf rx */ + wait_queue_head_t wr_rx_empty_wait; /* wait for RQ empty */ struct ib_reg_wr wr_reg; /* WR register memory region */ wait_queue_head_t wr_reg_wait; /* wait for wr_reg result */ diff --git a/net/smc/smc_llc.c b/net/smc/smc_llc.c index 175026ae33ae..524649d0ab65 100644 --- a/net/smc/smc_llc.c +++ b/net/smc/smc_llc.c @@ -2127,7 +2127,7 @@ void smc_llc_lgr_init(struct smc_link_group *lgr, struct smc_sock *smc) init_waitqueue_head(&lgr->llc_flow_waiter); init_waitqueue_head(&lgr->llc_msg_waiter); mutex_init(&lgr->llc_conf_mutex); - lgr->llc_testlink_time = READ_ONCE(net->ipv4.sysctl_tcp_keepalive_time); + lgr->llc_testlink_time = READ_ONCE(net->smc.sysctl_smcr_testlink_time); } /* called after lgr was removed from lgr_list */ diff --git a/net/smc/smc_llc.h b/net/smc/smc_llc.h index 4404e52b3346..7e7a3162c68b 100644 --- a/net/smc/smc_llc.h +++ b/net/smc/smc_llc.h @@ -19,6 +19,7 @@ #define SMC_LLC_WAIT_FIRST_TIME (5 * HZ) #define SMC_LLC_WAIT_TIME (2 * HZ) +#define SMC_LLC_TESTLINK_DEFAULT_TIME (30 * HZ) enum smc_llc_reqresp { SMC_LLC_REQ, diff --git a/net/smc/smc_sysctl.c b/net/smc/smc_sysctl.c index 0613868fdb97..b6f79fabb9d3 100644 --- a/net/smc/smc_sysctl.c +++ b/net/smc/smc_sysctl.c @@ -16,8 +16,12 @@ #include "smc.h" #include "smc_core.h" +#include "smc_llc.h" #include "smc_sysctl.h" +static int min_sndbuf = SMC_BUF_MIN_SIZE; +static int min_rcvbuf = SMC_BUF_MIN_SIZE; + static struct ctl_table smc_table[] = { { .procname = "autocorking_size", @@ -35,6 +39,29 @@ static struct ctl_table smc_table[] = { .extra1 = SYSCTL_ZERO, .extra2 = SYSCTL_TWO, }, + { + .procname = "smcr_testlink_time", + .data = &init_net.smc.sysctl_smcr_testlink_time, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_jiffies, + }, + { + .procname = "wmem", + .data = &init_net.smc.sysctl_wmem, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_sndbuf, + }, + { + .procname = "rmem", + .data = &init_net.smc.sysctl_rmem, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &min_rcvbuf, + }, { } }; @@ -60,6 +87,9 @@ int __net_init smc_sysctl_net_init(struct net *net) net->smc.sysctl_autocorking_size = SMC_AUTOCORKING_DEFAULT_SIZE; net->smc.sysctl_smcr_buf_type = SMCR_PHYS_CONT_BUFS; + net->smc.sysctl_smcr_testlink_time = SMC_LLC_TESTLINK_DEFAULT_TIME; + WRITE_ONCE(net->smc.sysctl_wmem, READ_ONCE(net->ipv4.sysctl_tcp_wmem[1])); + WRITE_ONCE(net->smc.sysctl_rmem, READ_ONCE(net->ipv4.sysctl_tcp_rmem[1])); return 0; diff --git a/net/smc/smc_wr.c b/net/smc/smc_wr.c index 26f8f240d9e8..b0678a417e09 100644 --- a/net/smc/smc_wr.c +++ b/net/smc/smc_wr.c @@ -454,6 +454,7 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num) for (i = 0; i < num; i++) { link = wc[i].qp->qp_context; + link->wr_rx_id_compl = wc[i].wr_id; if (wc[i].status == IB_WC_SUCCESS) { link->wr_rx_tstamp = jiffies; smc_wr_rx_demultiplex(&wc[i]); @@ -465,6 +466,8 @@ static inline void smc_wr_rx_process_cqes(struct ib_wc wc[], int num) case IB_WC_RNR_RETRY_EXC_ERR: case IB_WC_WR_FLUSH_ERR: smcr_link_down_cond_sched(link); + if (link->wr_rx_id_compl == link->wr_rx_id) + wake_up(&link->wr_rx_empty_wait); break; default: smc_wr_rx_post(link); /* refill WR RX */ @@ -639,6 +642,7 @@ void smc_wr_free_link(struct smc_link *lnk) return; ibdev = lnk->smcibdev->ibdev; + smc_wr_drain_cq(lnk); smc_wr_wakeup_reg_wait(lnk); smc_wr_wakeup_tx_wait(lnk); @@ -889,6 +893,7 @@ int smc_wr_create_link(struct smc_link *lnk) atomic_set(&lnk->wr_tx_refcnt, 0); init_waitqueue_head(&lnk->wr_reg_wait); atomic_set(&lnk->wr_reg_refcnt, 0); + init_waitqueue_head(&lnk->wr_rx_empty_wait); return rc; dma_unmap: diff --git a/net/smc/smc_wr.h b/net/smc/smc_wr.h index a54e90a1110f..45e9b894d3f8 100644 --- a/net/smc/smc_wr.h +++ b/net/smc/smc_wr.h @@ -73,6 +73,11 @@ static inline void smc_wr_tx_link_put(struct smc_link *link) wake_up_all(&link->wr_tx_wait); } +static inline void smc_wr_drain_cq(struct smc_link *lnk) +{ + wait_event(lnk->wr_rx_empty_wait, lnk->wr_rx_id_compl == lnk->wr_rx_id); +} + static inline void smc_wr_wakeup_tx_wait(struct smc_link *lnk) { wake_up_all(&lnk->wr_tx_wait); diff --git a/net/socket.c b/net/socket.c index 7378375d3a5b..55c5d536e5f6 100644 --- a/net/socket.c +++ b/net/socket.c @@ -355,7 +355,7 @@ static const struct super_operations sockfs_ops = { */ static char *sockfs_dname(struct dentry *dentry, char *buffer, int buflen) { - return dynamic_dname(dentry, buffer, buflen, "socket:[%lu]", + return dynamic_dname(buffer, buflen, "socket:[%lu]", d_inode(dentry)->i_ino); } @@ -2199,13 +2199,7 @@ SYSCALL_DEFINE4(recv, int, fd, void __user *, ubuf, size_t, size, static bool sock_use_custom_sol_socket(const struct socket *sock) { - const struct sock *sk = sock->sk; - - /* Use sock->ops->setsockopt() for MPTCP */ - return IS_ENABLED(CONFIG_MPTCP) && - sk->sk_protocol == IPPROTO_MPTCP && - sk->sk_type == SOCK_STREAM && - (sk->sk_family == AF_INET || sk->sk_family == AF_INET6); + return test_bit(SOCK_CUSTOM_SOCKOPT, &sock->flags); } /* diff --git a/net/sunrpc/auth_gss/gss_krb5_wrap.c b/net/sunrpc/auth_gss/gss_krb5_wrap.c index 5f96e75f9eec..48337687848c 100644 --- a/net/sunrpc/auth_gss/gss_krb5_wrap.c +++ b/net/sunrpc/auth_gss/gss_krb5_wrap.c @@ -130,8 +130,8 @@ gss_krb5_make_confounder(char *p, u32 conflen) /* initialize to random value */ if (i == 0) { - i = prandom_u32(); - i = (i << 32) | prandom_u32(); + i = get_random_u32(); + i = (i << 32) | get_random_u32(); } switch (conflen) { diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index c3c693b51c94..f075a9fb5ccc 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -677,7 +677,7 @@ static void cache_limit_defers(void) /* Consider removing either the first or the last */ if (cache_defer_cnt > DFR_MAX) { - if (prandom_u32() & 1) + if (prandom_u32_max(2)) discard = list_entry(cache_defer_list.next, struct cache_deferred_req, recent); else diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c index 7d268a291486..993acf38af87 100644 --- a/net/sunrpc/clnt.c +++ b/net/sunrpc/clnt.c @@ -345,7 +345,7 @@ static int rpc_alloc_clid(struct rpc_clnt *clnt) { int clid; - clid = ida_simple_get(&rpc_clids, 0, 0, GFP_KERNEL); + clid = ida_alloc(&rpc_clids, GFP_KERNEL); if (clid < 0) return clid; clnt->cl_clid = clid; @@ -354,7 +354,7 @@ static int rpc_alloc_clid(struct rpc_clnt *clnt) static void rpc_free_clid(struct rpc_clnt *clnt) { - ida_simple_remove(&rpc_clids, clnt->cl_clid); + ida_free(&rpc_clids, clnt->cl_clid); } static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args, @@ -873,6 +873,57 @@ void rpc_killall_tasks(struct rpc_clnt *clnt) } EXPORT_SYMBOL_GPL(rpc_killall_tasks); +/** + * rpc_cancel_tasks - try to cancel a set of RPC tasks + * @clnt: Pointer to RPC client + * @error: RPC task error value to set + * @fnmatch: Pointer to selector function + * @data: User data + * + * Uses @fnmatch to define a set of RPC tasks that are to be cancelled. + * The argument @error must be a negative error value. + */ +unsigned long rpc_cancel_tasks(struct rpc_clnt *clnt, int error, + bool (*fnmatch)(const struct rpc_task *, + const void *), + const void *data) +{ + struct rpc_task *task; + unsigned long count = 0; + + if (list_empty(&clnt->cl_tasks)) + return 0; + /* + * Spin lock all_tasks to prevent changes... + */ + spin_lock(&clnt->cl_lock); + list_for_each_entry(task, &clnt->cl_tasks, tk_task) { + if (!RPC_IS_ACTIVATED(task)) + continue; + if (!fnmatch(task, data)) + continue; + rpc_task_try_cancel(task, error); + count++; + } + spin_unlock(&clnt->cl_lock); + return count; +} +EXPORT_SYMBOL_GPL(rpc_cancel_tasks); + +static int rpc_clnt_disconnect_xprt(struct rpc_clnt *clnt, + struct rpc_xprt *xprt, void *dummy) +{ + if (xprt_connected(xprt)) + xprt_force_disconnect(xprt); + return 0; +} + +void rpc_clnt_disconnect(struct rpc_clnt *clnt) +{ + rpc_clnt_iterate_for_each_xprt(clnt, rpc_clnt_disconnect_xprt, NULL); +} +EXPORT_SYMBOL_GPL(rpc_clnt_disconnect); + /* * Properly shut down an RPC client, terminating all outstanding * requests. @@ -1642,7 +1693,7 @@ static void __rpc_call_rpcerror(struct rpc_task *task, int tk_status, int rpc_status) { trace_rpc_call_rpcerror(task, tk_status, rpc_status); - task->tk_rpc_status = rpc_status; + rpc_task_set_rpc_status(task, rpc_status); rpc_exit(task, tk_status); } @@ -2435,10 +2486,8 @@ rpc_check_timeout(struct rpc_task *task) { struct rpc_clnt *clnt = task->tk_client; - if (RPC_SIGNALLED(task)) { - rpc_call_rpcerror(task, -ERESTARTSYS); + if (RPC_SIGNALLED(task)) return; - } if (xprt_adjust_timeout(task->tk_rqstp) == 0) return; @@ -2873,6 +2922,9 @@ int rpc_clnt_test_and_add_xprt(struct rpc_clnt *clnt, task = rpc_call_null_helper(clnt, xprt, NULL, RPC_TASK_ASYNC, &rpc_cb_add_xprt_call_ops, data); + if (IS_ERR(task)) + return PTR_ERR(task); + data->xps->xps_nunique_destaddr_xprts++; rpc_put_task(task); success: diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c index 25b9221950ff..be587a308e05 100644 --- a/net/sunrpc/sched.c +++ b/net/sunrpc/sched.c @@ -65,6 +65,13 @@ gfp_t rpc_task_gfp_mask(void) } EXPORT_SYMBOL_GPL(rpc_task_gfp_mask); +bool rpc_task_set_rpc_status(struct rpc_task *task, int rpc_status) +{ + if (cmpxchg(&task->tk_rpc_status, 0, rpc_status) == 0) + return true; + return false; +} + unsigned long rpc_task_timeout(const struct rpc_task *task) { @@ -269,7 +276,7 @@ EXPORT_SYMBOL_GPL(rpc_destroy_wait_queue); static int rpc_wait_bit_killable(struct wait_bit_key *key, int mode) { - freezable_schedule_unsafe(); + schedule(); if (signal_pending_state(mode, current)) return -ERESTARTSYS; return 0; @@ -333,14 +340,12 @@ static int rpc_complete_task(struct rpc_task *task) * to enforce taking of the wq->lock and hence avoid races with * rpc_complete_task(). */ -int __rpc_wait_for_completion_task(struct rpc_task *task, wait_bit_action_f *action) +int rpc_wait_for_completion_task(struct rpc_task *task) { - if (action == NULL) - action = rpc_wait_bit_killable; return out_of_line_wait_on_bit(&task->tk_runstate, RPC_TASK_ACTIVE, - action, TASK_KILLABLE); + rpc_wait_bit_killable, TASK_KILLABLE|TASK_FREEZABLE_UNSAFE); } -EXPORT_SYMBOL_GPL(__rpc_wait_for_completion_task); +EXPORT_SYMBOL_GPL(rpc_wait_for_completion_task); /* * Make an RPC task runnable. @@ -855,12 +860,25 @@ void rpc_signal_task(struct rpc_task *task) if (!RPC_IS_ACTIVATED(task)) return; + if (!rpc_task_set_rpc_status(task, -ERESTARTSYS)) + return; trace_rpc_task_signalled(task, task->tk_action); set_bit(RPC_TASK_SIGNALLED, &task->tk_runstate); smp_mb__after_atomic(); queue = READ_ONCE(task->tk_waitqueue); if (queue) - rpc_wake_up_queued_task_set_status(queue, task, -ERESTARTSYS); + rpc_wake_up_queued_task(queue, task); +} + +void rpc_task_try_cancel(struct rpc_task *task, int error) +{ + struct rpc_wait_queue *queue; + + if (!rpc_task_set_rpc_status(task, error)) + return; + queue = READ_ONCE(task->tk_waitqueue); + if (queue) + rpc_wake_up_queued_task(queue, task); } void rpc_exit(struct rpc_task *task, int status) @@ -907,10 +925,16 @@ static void __rpc_execute(struct rpc_task *task) * Perform the next FSM step or a pending callback. * * tk_action may be NULL if the task has been killed. - * In particular, note that rpc_killall_tasks may - * do this at any time, so beware when dereferencing. */ do_action = task->tk_action; + /* Tasks with an RPC error status should exit */ + if (do_action != rpc_exit_task && + (status = READ_ONCE(task->tk_rpc_status)) != 0) { + task->tk_status = status; + if (do_action != NULL) + do_action = rpc_exit_task; + } + /* Callbacks override all actions */ if (task->tk_callback) { do_action = task->tk_callback; task->tk_callback = NULL; @@ -933,14 +957,6 @@ static void __rpc_execute(struct rpc_task *task) } /* - * Signalled tasks should exit rather than sleep. - */ - if (RPC_SIGNALLED(task)) { - task->tk_rpc_status = -ERESTARTSYS; - rpc_exit(task, -ERESTARTSYS); - } - - /* * The queue->lock protects against races with * rpc_make_runnable(). * @@ -955,6 +971,12 @@ static void __rpc_execute(struct rpc_task *task) spin_unlock(&queue->lock); continue; } + /* Wake up any task that has an exit status */ + if (READ_ONCE(task->tk_rpc_status) != 0) { + rpc_wake_up_task_queue_locked(queue, task); + spin_unlock(&queue->lock); + continue; + } rpc_clear_running(task); spin_unlock(&queue->lock); if (task_is_async) @@ -964,7 +986,7 @@ static void __rpc_execute(struct rpc_task *task) trace_rpc_task_sync_sleep(task, task->tk_action); status = out_of_line_wait_on_bit(&task->tk_runstate, RPC_TASK_QUEUED, rpc_wait_bit_killable, - TASK_KILLABLE); + TASK_KILLABLE|TASK_FREEZABLE); if (status < 0) { /* * When a sync task receives a signal, it exits with @@ -972,10 +994,7 @@ static void __rpc_execute(struct rpc_task *task) * clean up after sleeping on some queue, we don't * break the loop here, but go around once more. */ - trace_rpc_task_signalled(task, task->tk_action); - set_bit(RPC_TASK_SIGNALLED, &task->tk_runstate); - task->tk_rpc_status = -ERESTARTSYS; - rpc_exit(task, -ERESTARTSYS); + rpc_signal_task(task); } trace_rpc_task_sync_wake(task, task->tk_action); } diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c index 7c9a0d0b1230..149171774bc6 100644 --- a/net/sunrpc/svc.c +++ b/net/sunrpc/svc.c @@ -1205,7 +1205,7 @@ svc_generic_init_request(struct svc_rqst *rqstp, goto err_bad_proc; /* Initialize storage for argp and resp */ - memset(rqstp->rq_argp, 0, procp->pc_argsize); + memset(rqstp->rq_argp, 0, procp->pc_argzero); memset(rqstp->rq_resp, 0, procp->pc_ressize); /* Bump per-procedure stats counter */ @@ -1434,8 +1434,7 @@ svc_process(struct svc_rqst *rqstp) { struct kvec *argv = &rqstp->rq_arg.head[0]; struct kvec *resv = &rqstp->rq_res.head[0]; - struct svc_serv *serv = rqstp->rq_server; - u32 dir; + __be32 dir; #if IS_ENABLED(CONFIG_FAIL_SUNRPC) if (!fail_sunrpc.ignore_server_disconnect && @@ -1450,7 +1449,7 @@ svc_process(struct svc_rqst *rqstp) rqstp->rq_next_page = &rqstp->rq_respages[1]; resv->iov_base = page_address(rqstp->rq_respages[0]); resv->iov_len = 0; - rqstp->rq_res.pages = rqstp->rq_respages + 1; + rqstp->rq_res.pages = rqstp->rq_next_page; rqstp->rq_res.len = 0; rqstp->rq_res.page_base = 0; rqstp->rq_res.page_len = 0; @@ -1458,18 +1457,17 @@ svc_process(struct svc_rqst *rqstp) rqstp->rq_res.tail[0].iov_base = NULL; rqstp->rq_res.tail[0].iov_len = 0; - dir = svc_getnl(argv); - if (dir != 0) { - /* direction != CALL */ - svc_printk(rqstp, "bad direction %d, dropping request\n", dir); - serv->sv_stats->rpcbadfmt++; + dir = svc_getu32(argv); + if (dir != rpc_call) + goto out_baddir; + if (!svc_process_common(rqstp, argv, resv)) goto out_drop; - } - - /* Returns 1 for send, 0 for drop */ - if (likely(svc_process_common(rqstp, argv, resv))) - return svc_send(rqstp); + return svc_send(rqstp); +out_baddir: + svc_printk(rqstp, "bad direction 0x%08x, dropping request\n", + be32_to_cpu(dir)); + rqstp->rq_server->sv_stats->rpcbadfmt++; out_drop: svc_drop(rqstp); return 0; @@ -1556,8 +1554,12 @@ out: EXPORT_SYMBOL_GPL(bc_svc_process); #endif /* CONFIG_SUNRPC_BACKCHANNEL */ -/* - * Return (transport-specific) limit on the rpc payload. +/** + * svc_max_payload - Return transport-specific limit on the RPC payload + * @rqstp: RPC transaction context + * + * Returns the maximum number of payload bytes the current transport + * allows. */ u32 svc_max_payload(const struct svc_rqst *rqstp) { diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c index 482586c23fdd..336a7c7833e4 100644 --- a/net/sunrpc/xdr.c +++ b/net/sunrpc/xdr.c @@ -947,6 +947,28 @@ void xdr_init_encode(struct xdr_stream *xdr, struct xdr_buf *buf, __be32 *p, EXPORT_SYMBOL_GPL(xdr_init_encode); /** + * xdr_init_encode_pages - Initialize an xdr_stream for encoding into pages + * @xdr: pointer to xdr_stream struct + * @buf: pointer to XDR buffer into which to encode data + * @pages: list of pages to decode into + * @rqst: pointer to controlling rpc_rqst, for debugging + * + */ +void xdr_init_encode_pages(struct xdr_stream *xdr, struct xdr_buf *buf, + struct page **pages, struct rpc_rqst *rqst) +{ + xdr_reset_scratch_buffer(xdr); + + xdr->buf = buf; + xdr->page_ptr = pages; + xdr->iov = NULL; + xdr->p = page_address(*pages); + xdr->end = (void *)xdr->p + min_t(u32, buf->buflen, PAGE_SIZE); + xdr->rqst = rqst; +} +EXPORT_SYMBOL_GPL(xdr_init_encode_pages); + +/** * __xdr_commit_encode - Ensure all data is written to buffer * @xdr: pointer to xdr_stream * @@ -1575,7 +1597,7 @@ EXPORT_SYMBOL_GPL(xdr_buf_from_iov); * * @buf and @subbuf may be pointers to the same struct xdr_buf. * - * Returns -1 if base of length are out of bounds. + * Returns -1 if base or length are out of bounds. */ int xdr_buf_subsegment(const struct xdr_buf *buf, struct xdr_buf *subbuf, unsigned int base, unsigned int len) diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index d71eec494826..656cec208371 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -1179,11 +1179,8 @@ xprt_request_dequeue_receive_locked(struct rpc_task *task) { struct rpc_rqst *req = task->tk_rqstp; - if (test_and_clear_bit(RPC_TASK_NEED_RECV, &task->tk_runstate)) { + if (test_and_clear_bit(RPC_TASK_NEED_RECV, &task->tk_runstate)) xprt_request_rb_remove(req->rq_xprt, req); - xdr_free_bvec(&req->rq_rcv_buf); - req->rq_private_buf.bvec = NULL; - } } /** @@ -1221,6 +1218,8 @@ void xprt_complete_rqst(struct rpc_task *task, int copied) xprt->stat.recvs++; + xdr_free_bvec(&req->rq_rcv_buf); + req->rq_private_buf.bvec = NULL; req->rq_private_buf.len = copied; /* Ensure all writes are done before we update */ /* req->rq_reply_bytes_recvd */ @@ -1453,6 +1452,7 @@ xprt_request_dequeue_xprt(struct rpc_task *task) xprt_request_dequeue_transmit_locked(task); xprt_request_dequeue_receive_locked(task); spin_unlock(&xprt->queue_lock); + xdr_free_bvec(&req->rq_rcv_buf); } } @@ -1788,7 +1788,7 @@ static int xprt_alloc_id(struct rpc_xprt *xprt) { int id; - id = ida_simple_get(&rpc_xprt_ids, 0, 0, GFP_KERNEL); + id = ida_alloc(&rpc_xprt_ids, GFP_KERNEL); if (id < 0) return id; @@ -1798,7 +1798,7 @@ static int xprt_alloc_id(struct rpc_xprt *xprt) static void xprt_free_id(struct rpc_xprt *xprt) { - ida_simple_remove(&rpc_xprt_ids, xprt->id); + ida_free(&rpc_xprt_ids, xprt->id); } struct rpc_xprt *xprt_alloc(struct net *net, size_t size, @@ -1822,10 +1822,7 @@ struct rpc_xprt *xprt_alloc(struct net *net, size_t size, goto out_free; list_add(&req->rq_list, &xprt->free); } - if (max_alloc > num_prealloc) - xprt->max_reqs = max_alloc; - else - xprt->max_reqs = num_prealloc; + xprt->max_reqs = max_t(unsigned int, max_alloc, num_prealloc); xprt->min_reqs = num_prealloc; xprt->num_reqs = num_prealloc; @@ -1868,7 +1865,7 @@ xprt_alloc_xid(struct rpc_xprt *xprt) static void xprt_init_xid(struct rpc_xprt *xprt) { - xprt->xid = prandom_u32(); + xprt->xid = get_random_u32(); } static void diff --git a/net/sunrpc/xprtmultipath.c b/net/sunrpc/xprtmultipath.c index 685db598acbe..701250b305db 100644 --- a/net/sunrpc/xprtmultipath.c +++ b/net/sunrpc/xprtmultipath.c @@ -103,7 +103,7 @@ static int xprt_switch_alloc_id(struct rpc_xprt_switch *xps, gfp_t gfp_flags) { int id; - id = ida_simple_get(&rpc_xprtswitch_ids, 0, 0, gfp_flags); + id = ida_alloc(&rpc_xprtswitch_ids, gfp_flags); if (id < 0) return id; @@ -113,7 +113,7 @@ static int xprt_switch_alloc_id(struct rpc_xprt_switch *xps, gfp_t gfp_flags) static void xprt_switch_free_id(struct rpc_xprt_switch *xps) { - ida_simple_remove(&rpc_xprtswitch_ids, xps->xps_id); + ida_free(&rpc_xprtswitch_ids, xps->xps_id); } /** diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c index faba7136dd9a..e4d84a13c566 100644 --- a/net/sunrpc/xprtrdma/backchannel.c +++ b/net/sunrpc/xprtrdma/backchannel.c @@ -189,7 +189,7 @@ create_req: return NULL; size = min_t(size_t, r_xprt->rx_ep->re_inline_recv, PAGE_SIZE); - req = rpcrdma_req_create(r_xprt, size, GFP_KERNEL); + req = rpcrdma_req_create(r_xprt, size); if (!req) return NULL; if (rpcrdma_req_setup(r_xprt, req)) { diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index de0bdb6b729f..ffbf99894970 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -124,16 +124,16 @@ int frwr_mr_init(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr) unsigned int depth = ep->re_max_fr_depth; struct scatterlist *sg; struct ib_mr *frmr; - int rc; + + sg = kcalloc_node(depth, sizeof(*sg), XPRTRDMA_GFP_FLAGS, + ibdev_to_node(ep->re_id->device)); + if (!sg) + return -ENOMEM; frmr = ib_alloc_mr(ep->re_pd, ep->re_mrtype, depth); if (IS_ERR(frmr)) goto out_mr_err; - sg = kmalloc_array(depth, sizeof(*sg), GFP_KERNEL); - if (!sg) - goto out_list_err; - mr->mr_xprt = r_xprt; mr->mr_ibmr = frmr; mr->mr_device = NULL; @@ -146,13 +146,9 @@ int frwr_mr_init(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr) return 0; out_mr_err: - rc = PTR_ERR(frmr); - trace_xprtrdma_frwr_alloc(mr, rc); - return rc; - -out_list_err: - ib_dereg_mr(frmr); - return -ENOMEM; + kfree(sg); + trace_xprtrdma_frwr_alloc(mr, PTR_ERR(frmr)); + return PTR_ERR(frmr); } /** diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c index 85c8cdda98b1..aa2227a7e552 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -119,12 +119,12 @@ xprt_rdma_bc_allocate(struct rpc_task *task) return -EINVAL; } - page = alloc_page(RPCRDMA_DEF_GFP); + page = alloc_page(GFP_NOIO | __GFP_NOWARN); if (!page) return -ENOMEM; rqst->rq_buffer = page_address(page); - rqst->rq_rbuffer = kmalloc(rqst->rq_rcvsize, RPCRDMA_DEF_GFP); + rqst->rq_rbuffer = kmalloc(rqst->rq_rcvsize, GFP_NOIO | __GFP_NOWARN); if (!rqst->rq_rbuffer) { put_page(page); return -ENOMEM; diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index bcb37b51adf6..10bb2b929c6d 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -494,8 +494,7 @@ xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task) xprt_reconnect_backoff(xprt, RPCRDMA_INIT_REEST_TO); } trace_xprtrdma_op_connect(r_xprt, delay); - queue_delayed_work(xprtiod_workqueue, &r_xprt->rx_connect_worker, - delay); + queue_delayed_work(system_long_wq, &r_xprt->rx_connect_worker, delay); } /** diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 2fbe9aaeec34..44b87e4274b4 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -76,8 +76,7 @@ static void rpcrdma_mrs_destroy(struct rpcrdma_xprt *r_xprt); static void rpcrdma_ep_get(struct rpcrdma_ep *ep); static int rpcrdma_ep_put(struct rpcrdma_ep *ep); static struct rpcrdma_regbuf * -rpcrdma_regbuf_alloc(size_t size, enum dma_data_direction direction, - gfp_t flags); +rpcrdma_regbuf_alloc(size_t size, enum dma_data_direction direction); static void rpcrdma_regbuf_dma_unmap(struct rpcrdma_regbuf *rb); static void rpcrdma_regbuf_free(struct rpcrdma_regbuf *rb); @@ -373,7 +372,7 @@ static int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt) struct rpcrdma_ep *ep; int rc; - ep = kzalloc(sizeof(*ep), GFP_KERNEL); + ep = kzalloc(sizeof(*ep), XPRTRDMA_GFP_FLAGS); if (!ep) return -ENOTCONN; ep->re_xprt = &r_xprt->rx_xprt; @@ -606,7 +605,7 @@ static struct rpcrdma_sendctx *rpcrdma_sendctx_create(struct rpcrdma_ep *ep) struct rpcrdma_sendctx *sc; sc = kzalloc(struct_size(sc, sc_sges, ep->re_attr.cap.max_send_sge), - GFP_KERNEL); + XPRTRDMA_GFP_FLAGS); if (!sc) return NULL; @@ -629,7 +628,7 @@ static int rpcrdma_sendctxs_create(struct rpcrdma_xprt *r_xprt) * Sends are posted. */ i = r_xprt->rx_ep->re_max_requests + RPCRDMA_MAX_BC_REQUESTS; - buf->rb_sc_ctxs = kcalloc(i, sizeof(sc), GFP_KERNEL); + buf->rb_sc_ctxs = kcalloc(i, sizeof(sc), XPRTRDMA_GFP_FLAGS); if (!buf->rb_sc_ctxs) return -ENOMEM; @@ -740,13 +739,16 @@ rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt) { struct rpcrdma_buffer *buf = &r_xprt->rx_buf; struct rpcrdma_ep *ep = r_xprt->rx_ep; + struct ib_device *device = ep->re_id->device; unsigned int count; + /* Try to allocate enough to perform one full-sized I/O */ for (count = 0; count < ep->re_max_rdma_segs; count++) { struct rpcrdma_mr *mr; int rc; - mr = kzalloc(sizeof(*mr), GFP_KERNEL); + mr = kzalloc_node(sizeof(*mr), XPRTRDMA_GFP_FLAGS, + ibdev_to_node(device)); if (!mr) break; @@ -791,38 +793,33 @@ void rpcrdma_mrs_refresh(struct rpcrdma_xprt *r_xprt) /* If there is no underlying connection, it's no use * to wake the refresh worker. */ - if (ep->re_connect_status == 1) { - /* The work is scheduled on a WQ_MEM_RECLAIM - * workqueue in order to prevent MR allocation - * from recursing into NFS during direct reclaim. - */ - queue_work(xprtiod_workqueue, &buf->rb_refresh_worker); - } + if (ep->re_connect_status != 1) + return; + queue_work(system_highpri_wq, &buf->rb_refresh_worker); } /** * rpcrdma_req_create - Allocate an rpcrdma_req object * @r_xprt: controlling r_xprt * @size: initial size, in bytes, of send and receive buffers - * @flags: GFP flags passed to memory allocators * * Returns an allocated and fully initialized rpcrdma_req or NULL. */ -struct rpcrdma_req *rpcrdma_req_create(struct rpcrdma_xprt *r_xprt, size_t size, - gfp_t flags) +struct rpcrdma_req *rpcrdma_req_create(struct rpcrdma_xprt *r_xprt, + size_t size) { struct rpcrdma_buffer *buffer = &r_xprt->rx_buf; struct rpcrdma_req *req; - req = kzalloc(sizeof(*req), flags); + req = kzalloc(sizeof(*req), XPRTRDMA_GFP_FLAGS); if (req == NULL) goto out1; - req->rl_sendbuf = rpcrdma_regbuf_alloc(size, DMA_TO_DEVICE, flags); + req->rl_sendbuf = rpcrdma_regbuf_alloc(size, DMA_TO_DEVICE); if (!req->rl_sendbuf) goto out2; - req->rl_recvbuf = rpcrdma_regbuf_alloc(size, DMA_NONE, flags); + req->rl_recvbuf = rpcrdma_regbuf_alloc(size, DMA_NONE); if (!req->rl_recvbuf) goto out3; @@ -858,7 +855,7 @@ int rpcrdma_req_setup(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req) r_xprt->rx_ep->re_max_rdma_segs * rpcrdma_readchunk_maxsz; maxhdrsize *= sizeof(__be32); rb = rpcrdma_regbuf_alloc(__roundup_pow_of_two(maxhdrsize), - DMA_TO_DEVICE, GFP_KERNEL); + DMA_TO_DEVICE); if (!rb) goto out; @@ -929,12 +926,12 @@ struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt, struct rpcrdma_buffer *buf = &r_xprt->rx_buf; struct rpcrdma_rep *rep; - rep = kzalloc(sizeof(*rep), GFP_KERNEL); + rep = kzalloc(sizeof(*rep), XPRTRDMA_GFP_FLAGS); if (rep == NULL) goto out; rep->rr_rdmabuf = rpcrdma_regbuf_alloc(r_xprt->rx_ep->re_inline_recv, - DMA_FROM_DEVICE, GFP_KERNEL); + DMA_FROM_DEVICE); if (!rep->rr_rdmabuf) goto out_free; @@ -1064,8 +1061,8 @@ int rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt) for (i = 0; i < r_xprt->rx_xprt.max_reqs; i++) { struct rpcrdma_req *req; - req = rpcrdma_req_create(r_xprt, RPCRDMA_V1_DEF_INLINE_SIZE * 2, - GFP_KERNEL); + req = rpcrdma_req_create(r_xprt, + RPCRDMA_V1_DEF_INLINE_SIZE * 2); if (!req) goto out; list_add(&req->rl_list, &buf->rb_send_bufs); @@ -1235,15 +1232,14 @@ void rpcrdma_buffer_put(struct rpcrdma_buffer *buffers, struct rpcrdma_req *req) * or Replies they may be registered externally via frwr_map. */ static struct rpcrdma_regbuf * -rpcrdma_regbuf_alloc(size_t size, enum dma_data_direction direction, - gfp_t flags) +rpcrdma_regbuf_alloc(size_t size, enum dma_data_direction direction) { struct rpcrdma_regbuf *rb; - rb = kmalloc(sizeof(*rb), flags); + rb = kmalloc(sizeof(*rb), XPRTRDMA_GFP_FLAGS); if (!rb) return NULL; - rb->rg_data = kmalloc(size, flags); + rb->rg_data = kmalloc(size, XPRTRDMA_GFP_FLAGS); if (!rb->rg_data) { kfree(rb); return NULL; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index c79f92eeda76..5e5ff6784ef5 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -149,7 +149,11 @@ static inline void *rdmab_data(const struct rpcrdma_regbuf *rb) return rb->rg_data; } -#define RPCRDMA_DEF_GFP (GFP_NOIO | __GFP_NOWARN) +/* Do not use emergency memory reserves, and fail quickly if memory + * cannot be allocated easily. These flags may be used wherever there + * is robust logic to handle a failure to allocate. + */ +#define XPRTRDMA_GFP_FLAGS (__GFP_NOMEMALLOC | __GFP_NORETRY | __GFP_NOWARN) /* To ensure a transport can always make forward progress, * the number of RDMA segments allowed in header chunk lists @@ -467,8 +471,8 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, int needed, bool temp); /* * Buffer calls - xprtrdma/verbs.c */ -struct rpcrdma_req *rpcrdma_req_create(struct rpcrdma_xprt *r_xprt, size_t size, - gfp_t flags); +struct rpcrdma_req *rpcrdma_req_create(struct rpcrdma_xprt *r_xprt, + size_t size); int rpcrdma_req_setup(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req); void rpcrdma_req_destroy(struct rpcrdma_req *req); int rpcrdma_buffer_create(struct rpcrdma_xprt *); diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index e976007f4fd0..915b9902f673 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -261,7 +261,7 @@ static void xs_format_common_peer_addresses(struct rpc_xprt *xprt) switch (sap->sa_family) { case AF_LOCAL: sun = xs_addr_un(xprt); - strlcpy(buf, sun->sun_path, sizeof(buf)); + strscpy(buf, sun->sun_path, sizeof(buf)); xprt->address_strings[RPC_DISPLAY_ADDR] = kstrdup(buf, GFP_KERNEL); break; @@ -1619,7 +1619,7 @@ static int xs_get_random_port(void) if (max < min) return -EADDRINUSE; range = max - min + 1; - rand = (unsigned short) prandom_u32() % range; + rand = prandom_u32_max(range); return rand + min; } @@ -1978,8 +1978,7 @@ static void xs_local_connect(struct rpc_xprt *xprt, struct rpc_task *task) * we'll need to figure out how to pass a namespace to * connect. */ - task->tk_rpc_status = -ENOTCONN; - rpc_exit(task, -ENOTCONN); + rpc_task_set_rpc_status(task, -ENOTCONN); goto out_wake; } ret = xs_local_setup_socket(transport); diff --git a/net/tipc/discover.c b/net/tipc/discover.c index da69e1abf68f..e8630707901e 100644 --- a/net/tipc/discover.c +++ b/net/tipc/discover.c @@ -148,8 +148,8 @@ static bool tipc_disc_addr_trial_msg(struct tipc_discoverer *d, { struct net *net = d->net; struct tipc_net *tn = tipc_net(net); - bool trial = time_before(jiffies, tn->addr_trial_end); u32 self = tipc_own_addr(net); + bool trial = time_before(jiffies, tn->addr_trial_end) && !self; if (mtyp == DSC_TRIAL_FAIL_MSG) { if (!trial) diff --git a/net/tipc/monitor.c b/net/tipc/monitor.c index 2f4d23238a7e..9618e4429f0f 100644 --- a/net/tipc/monitor.c +++ b/net/tipc/monitor.c @@ -160,7 +160,7 @@ static void map_set(u64 *up_map, int i, unsigned int v) static int map_get(u64 up_map, int i) { - return (up_map & (1 << i)) >> i; + return (up_map & (1ULL << i)) >> i; } static struct tipc_peer *peer_prev(struct tipc_peer *peer) diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c index 8267b751a526..190b49c5cbc3 100644 --- a/net/tipc/name_distr.c +++ b/net/tipc/name_distr.c @@ -41,14 +41,6 @@ int sysctl_tipc_named_timeout __read_mostly = 2000; -struct distr_queue_item { - struct distr_item i; - u32 dtype; - u32 node; - unsigned long expires; - struct list_head next; -}; - /** * publ_to_item - add publication info to a publication message * @p: publication info diff --git a/net/tipc/socket.c b/net/tipc/socket.c index f1c3b8eb4b3d..e902b01ea3cb 100644 --- a/net/tipc/socket.c +++ b/net/tipc/socket.c @@ -3010,7 +3010,7 @@ static int tipc_sk_insert(struct tipc_sock *tsk) struct net *net = sock_net(sk); struct tipc_net *tn = net_generic(net, tipc_net_id); u32 remaining = (TIPC_MAX_PORT - TIPC_MIN_PORT) + 1; - u32 portid = prandom_u32() % remaining + TIPC_MIN_PORT; + u32 portid = prandom_u32_max(remaining) + TIPC_MIN_PORT; while (remaining--) { portid++; diff --git a/net/tipc/topsrv.c b/net/tipc/topsrv.c index 5522865deae9..d92ec92f0b71 100644 --- a/net/tipc/topsrv.c +++ b/net/tipc/topsrv.c @@ -450,12 +450,19 @@ static void tipc_conn_data_ready(struct sock *sk) static void tipc_topsrv_accept(struct work_struct *work) { struct tipc_topsrv *srv = container_of(work, struct tipc_topsrv, awork); - struct socket *lsock = srv->listener; - struct socket *newsock; + struct socket *newsock, *lsock; struct tipc_conn *con; struct sock *newsk; int ret; + spin_lock_bh(&srv->idr_lock); + if (!srv->listener) { + spin_unlock_bh(&srv->idr_lock); + return; + } + lsock = srv->listener; + spin_unlock_bh(&srv->idr_lock); + while (1) { ret = kernel_accept(lsock, &newsock, O_NONBLOCK); if (ret < 0) @@ -489,7 +496,7 @@ static void tipc_topsrv_listener_data_ready(struct sock *sk) read_lock_bh(&sk->sk_callback_lock); srv = sk->sk_user_data; - if (srv->listener) + if (srv) queue_work(srv->rcv_wq, &srv->awork); read_unlock_bh(&sk->sk_callback_lock); } @@ -568,7 +575,7 @@ bool tipc_topsrv_kern_subscr(struct net *net, u32 port, u32 type, u32 lower, sub.seq.upper = upper; sub.timeout = TIPC_WAIT_FOREVER; sub.filter = filter; - *(u32 *)&sub.usr_handle = port; + *(u64 *)&sub.usr_handle = (u64)port; con = tipc_conn_alloc(tipc_topsrv(net)); if (IS_ERR(con)) @@ -699,8 +706,9 @@ static void tipc_topsrv_stop(struct net *net) __module_get(lsock->sk->sk_prot_creator->owner); srv->listener = NULL; spin_unlock_bh(&srv->idr_lock); - sock_release(lsock); + tipc_topsrv_work_stop(srv); + sock_release(lsock); idr_destroy(&srv->conn_idr); kfree(srv); } diff --git a/net/tls/tls_device.c b/net/tls/tls_device.c index 0f983e5f7dde..a03d66046ca3 100644 --- a/net/tls/tls_device.c +++ b/net/tls/tls_device.c @@ -902,17 +902,28 @@ static void tls_device_core_ctrl_rx_resync(struct tls_context *tls_ctx, } static int -tls_device_reencrypt(struct sock *sk, struct tls_sw_context_rx *sw_ctx) +tls_device_reencrypt(struct sock *sk, struct tls_context *tls_ctx) { + struct tls_sw_context_rx *sw_ctx = tls_sw_ctx_rx(tls_ctx); + const struct tls_cipher_size_desc *cipher_sz; int err, offset, copy, data_len, pos; struct sk_buff *skb, *skb_iter; struct scatterlist sg[1]; struct strp_msg *rxm; char *orig_buf, *buf; + switch (tls_ctx->crypto_recv.info.cipher_type) { + case TLS_CIPHER_AES_GCM_128: + case TLS_CIPHER_AES_GCM_256: + break; + default: + return -EINVAL; + } + cipher_sz = &tls_cipher_size_desc[tls_ctx->crypto_recv.info.cipher_type]; + rxm = strp_msg(tls_strp_msg(sw_ctx)); - orig_buf = kmalloc(rxm->full_len + TLS_HEADER_SIZE + - TLS_CIPHER_AES_GCM_128_IV_SIZE, sk->sk_allocation); + orig_buf = kmalloc(rxm->full_len + TLS_HEADER_SIZE + cipher_sz->iv, + sk->sk_allocation); if (!orig_buf) return -ENOMEM; buf = orig_buf; @@ -927,10 +938,8 @@ tls_device_reencrypt(struct sock *sk, struct tls_sw_context_rx *sw_ctx) sg_init_table(sg, 1); sg_set_buf(&sg[0], buf, - rxm->full_len + TLS_HEADER_SIZE + - TLS_CIPHER_AES_GCM_128_IV_SIZE); - err = skb_copy_bits(skb, offset, buf, - TLS_HEADER_SIZE + TLS_CIPHER_AES_GCM_128_IV_SIZE); + rxm->full_len + TLS_HEADER_SIZE + cipher_sz->iv); + err = skb_copy_bits(skb, offset, buf, TLS_HEADER_SIZE + cipher_sz->iv); if (err) goto free_buf; @@ -941,7 +950,7 @@ tls_device_reencrypt(struct sock *sk, struct tls_sw_context_rx *sw_ctx) else err = 0; - data_len = rxm->full_len - TLS_CIPHER_AES_GCM_128_TAG_SIZE; + data_len = rxm->full_len - cipher_sz->tag; if (skb_pagelen(skb) > offset) { copy = min_t(int, skb_pagelen(skb) - offset, data_len); @@ -1024,7 +1033,7 @@ int tls_device_decrypted(struct sock *sk, struct tls_context *tls_ctx) * likely have initial fragments decrypted, and final ones not * decrypted. We need to reencrypt that single SKB. */ - return tls_device_reencrypt(sk, sw_ctx); + return tls_device_reencrypt(sk, tls_ctx); } /* Return immediately if the record is either entirely plaintext or @@ -1041,7 +1050,7 @@ int tls_device_decrypted(struct sock *sk, struct tls_context *tls_ctx) } ctx->resync_nh_reset = 1; - return tls_device_reencrypt(sk, sw_ctx); + return tls_device_reencrypt(sk, tls_ctx); } static void tls_device_attach(struct tls_context *ctx, struct sock *sk, @@ -1062,9 +1071,9 @@ static void tls_device_attach(struct tls_context *ctx, struct sock *sk, int tls_set_device_offload(struct sock *sk, struct tls_context *ctx) { - u16 nonce_size, tag_size, iv_size, rec_seq_size, salt_size; struct tls_context *tls_ctx = tls_get_ctx(sk); struct tls_prot_info *prot = &tls_ctx->prot_info; + const struct tls_cipher_size_desc *cipher_sz; struct tls_record_info *start_marker_record; struct tls_offload_context_tx *offload_ctx; struct tls_crypto_info *crypto_info; @@ -1099,44 +1108,44 @@ int tls_set_device_offload(struct sock *sk, struct tls_context *ctx) switch (crypto_info->cipher_type) { case TLS_CIPHER_AES_GCM_128: - nonce_size = TLS_CIPHER_AES_GCM_128_IV_SIZE; - tag_size = TLS_CIPHER_AES_GCM_128_TAG_SIZE; - iv_size = TLS_CIPHER_AES_GCM_128_IV_SIZE; iv = ((struct tls12_crypto_info_aes_gcm_128 *)crypto_info)->iv; - rec_seq_size = TLS_CIPHER_AES_GCM_128_REC_SEQ_SIZE; - salt_size = TLS_CIPHER_AES_GCM_128_SALT_SIZE; rec_seq = ((struct tls12_crypto_info_aes_gcm_128 *)crypto_info)->rec_seq; break; + case TLS_CIPHER_AES_GCM_256: + iv = ((struct tls12_crypto_info_aes_gcm_256 *)crypto_info)->iv; + rec_seq = + ((struct tls12_crypto_info_aes_gcm_256 *)crypto_info)->rec_seq; + break; default: rc = -EINVAL; goto release_netdev; } + cipher_sz = &tls_cipher_size_desc[crypto_info->cipher_type]; /* Sanity-check the rec_seq_size for stack allocations */ - if (rec_seq_size > TLS_MAX_REC_SEQ_SIZE) { + if (cipher_sz->rec_seq > TLS_MAX_REC_SEQ_SIZE) { rc = -EINVAL; goto release_netdev; } prot->version = crypto_info->version; prot->cipher_type = crypto_info->cipher_type; - prot->prepend_size = TLS_HEADER_SIZE + nonce_size; - prot->tag_size = tag_size; + prot->prepend_size = TLS_HEADER_SIZE + cipher_sz->iv; + prot->tag_size = cipher_sz->tag; prot->overhead_size = prot->prepend_size + prot->tag_size; - prot->iv_size = iv_size; - prot->salt_size = salt_size; - ctx->tx.iv = kmalloc(iv_size + TLS_CIPHER_AES_GCM_128_SALT_SIZE, - GFP_KERNEL); + prot->iv_size = cipher_sz->iv; + prot->salt_size = cipher_sz->salt; + ctx->tx.iv = kmalloc(cipher_sz->iv + cipher_sz->salt, GFP_KERNEL); if (!ctx->tx.iv) { rc = -ENOMEM; goto release_netdev; } - memcpy(ctx->tx.iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, iv, iv_size); + memcpy(ctx->tx.iv + cipher_sz->salt, iv, cipher_sz->iv); - prot->rec_seq_size = rec_seq_size; - ctx->tx.rec_seq = kmemdup(rec_seq, rec_seq_size, GFP_KERNEL); + prot->rec_seq_size = cipher_sz->rec_seq; + ctx->tx.rec_seq = kmemdup(rec_seq, cipher_sz->rec_seq, GFP_KERNEL); if (!ctx->tx.rec_seq) { rc = -ENOMEM; goto free_iv; diff --git a/net/tls/tls_device_fallback.c b/net/tls/tls_device_fallback.c index 7dfc8023e0f1..cdb391a8754b 100644 --- a/net/tls/tls_device_fallback.c +++ b/net/tls/tls_device_fallback.c @@ -54,13 +54,25 @@ static int tls_enc_record(struct aead_request *aead_req, struct scatter_walk *out, int *in_len, struct tls_prot_info *prot) { - unsigned char buf[TLS_HEADER_SIZE + TLS_CIPHER_AES_GCM_128_IV_SIZE]; + unsigned char buf[TLS_HEADER_SIZE + MAX_IV_SIZE]; + const struct tls_cipher_size_desc *cipher_sz; struct scatterlist sg_in[3]; struct scatterlist sg_out[3]; + unsigned int buf_size; u16 len; int rc; - len = min_t(int, *in_len, ARRAY_SIZE(buf)); + switch (prot->cipher_type) { + case TLS_CIPHER_AES_GCM_128: + case TLS_CIPHER_AES_GCM_256: + break; + default: + return -EINVAL; + } + cipher_sz = &tls_cipher_size_desc[prot->cipher_type]; + + buf_size = TLS_HEADER_SIZE + cipher_sz->iv; + len = min_t(int, *in_len, buf_size); scatterwalk_copychunks(buf, in, len, 0); scatterwalk_copychunks(buf, out, len, 1); @@ -73,13 +85,11 @@ static int tls_enc_record(struct aead_request *aead_req, scatterwalk_pagedone(out, 1, 1); len = buf[4] | (buf[3] << 8); - len -= TLS_CIPHER_AES_GCM_128_IV_SIZE; + len -= cipher_sz->iv; - tls_make_aad(aad, len - TLS_CIPHER_AES_GCM_128_TAG_SIZE, - (char *)&rcd_sn, buf[0], prot); + tls_make_aad(aad, len - cipher_sz->tag, (char *)&rcd_sn, buf[0], prot); - memcpy(iv + TLS_CIPHER_AES_GCM_128_SALT_SIZE, buf + TLS_HEADER_SIZE, - TLS_CIPHER_AES_GCM_128_IV_SIZE); + memcpy(iv + cipher_sz->salt, buf + TLS_HEADER_SIZE, cipher_sz->iv); sg_init_table(sg_in, ARRAY_SIZE(sg_in)); sg_init_table(sg_out, ARRAY_SIZE(sg_out)); @@ -90,7 +100,7 @@ static int tls_enc_record(struct aead_request *aead_req, *in_len -= len; if (*in_len < 0) { - *in_len += TLS_CIPHER_AES_GCM_128_TAG_SIZE; + *in_len += cipher_sz->tag; /* the input buffer doesn't contain the entire record. * trim len accordingly. The resulting authentication tag * will contain garbage, but we don't care, so we won't @@ -111,7 +121,7 @@ static int tls_enc_record(struct aead_request *aead_req, scatterwalk_pagedone(out, 1, 1); } - len -= TLS_CIPHER_AES_GCM_128_TAG_SIZE; + len -= cipher_sz->tag; aead_request_set_crypt(aead_req, sg_in, sg_out, len, iv); rc = crypto_aead_encrypt(aead_req); @@ -299,11 +309,14 @@ static void fill_sg_out(struct scatterlist sg_out[3], void *buf, int sync_size, void *dummy_buf) { + const struct tls_cipher_size_desc *cipher_sz = + &tls_cipher_size_desc[tls_ctx->crypto_send.info.cipher_type]; + sg_set_buf(&sg_out[0], dummy_buf, sync_size); sg_set_buf(&sg_out[1], nskb->data + tcp_payload_offset, payload_len); /* Add room for authentication tag produced by crypto */ dummy_buf += sync_size; - sg_set_buf(&sg_out[2], dummy_buf, TLS_CIPHER_AES_GCM_128_TAG_SIZE); + sg_set_buf(&sg_out[2], dummy_buf, cipher_sz->tag); } static struct sk_buff *tls_enc_skb(struct tls_context *tls_ctx, @@ -315,7 +328,8 @@ static struct sk_buff *tls_enc_skb(struct tls_context *tls_ctx, struct tls_offload_context_tx *ctx = tls_offload_ctx_tx(tls_ctx); int tcp_payload_offset = skb_tcp_all_headers(skb); int payload_len = skb->len - tcp_payload_offset; - void *buf, *iv, *aad, *dummy_buf; + const struct tls_cipher_size_desc *cipher_sz; + void *buf, *iv, *aad, *dummy_buf, *salt; struct aead_request *aead_req; struct sk_buff *nskb = NULL; int buf_len; @@ -324,20 +338,26 @@ static struct sk_buff *tls_enc_skb(struct tls_context *tls_ctx, if (!aead_req) return NULL; - buf_len = TLS_CIPHER_AES_GCM_128_SALT_SIZE + - TLS_CIPHER_AES_GCM_128_IV_SIZE + - TLS_AAD_SPACE_SIZE + - sync_size + - TLS_CIPHER_AES_GCM_128_TAG_SIZE; + switch (tls_ctx->crypto_send.info.cipher_type) { + case TLS_CIPHER_AES_GCM_128: + salt = tls_ctx->crypto_send.aes_gcm_128.salt; + break; + case TLS_CIPHER_AES_GCM_256: + salt = tls_ctx->crypto_send.aes_gcm_256.salt; + break; + default: + return NULL; + } + cipher_sz = &tls_cipher_size_desc[tls_ctx->crypto_send.info.cipher_type]; + buf_len = cipher_sz->salt + cipher_sz->iv + TLS_AAD_SPACE_SIZE + + sync_size + cipher_sz->tag; buf = kmalloc(buf_len, GFP_ATOMIC); if (!buf) goto free_req; iv = buf; - memcpy(iv, tls_ctx->crypto_send.aes_gcm_128.salt, - TLS_CIPHER_AES_GCM_128_SALT_SIZE); - aad = buf + TLS_CIPHER_AES_GCM_128_SALT_SIZE + - TLS_CIPHER_AES_GCM_128_IV_SIZE; + memcpy(iv, salt, cipher_sz->salt); + aad = buf + cipher_sz->salt + cipher_sz->iv; dummy_buf = aad + TLS_AAD_SPACE_SIZE; nskb = alloc_skb(skb_headroom(skb) + skb->len, GFP_ATOMIC); @@ -451,6 +471,7 @@ int tls_sw_fallback_init(struct sock *sk, struct tls_offload_context_tx *offload_ctx, struct tls_crypto_info *crypto_info) { + const struct tls_cipher_size_desc *cipher_sz; const u8 *key; int rc; @@ -463,15 +484,23 @@ int tls_sw_fallback_init(struct sock *sk, goto err_out; } - key = ((struct tls12_crypto_info_aes_gcm_128 *)crypto_info)->key; + switch (crypto_info->cipher_type) { + case TLS_CIPHER_AES_GCM_128: + key = ((struct tls12_crypto_info_aes_gcm_128 *)crypto_info)->key; + break; + case TLS_CIPHER_AES_GCM_256: + key = ((struct tls12_crypto_info_aes_gcm_256 *)crypto_info)->key; + break; + default: + return -EINVAL; + } + cipher_sz = &tls_cipher_size_desc[crypto_info->cipher_type]; - rc = crypto_aead_setkey(offload_ctx->aead_send, key, - TLS_CIPHER_AES_GCM_128_KEY_SIZE); + rc = crypto_aead_setkey(offload_ctx->aead_send, key, cipher_sz->key); if (rc) goto free_aead; - rc = crypto_aead_setauthsize(offload_ctx->aead_send, - TLS_CIPHER_AES_GCM_128_TAG_SIZE); + rc = crypto_aead_setauthsize(offload_ctx->aead_send, cipher_sz->tag); if (rc) goto free_aead; diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c index 08ddf9d837ae..3735cb00905d 100644 --- a/net/tls/tls_main.c +++ b/net/tls/tls_main.c @@ -58,6 +58,23 @@ enum { TLS_NUM_PROTS, }; +#define CIPHER_SIZE_DESC(cipher) [cipher] = { \ + .iv = cipher ## _IV_SIZE, \ + .key = cipher ## _KEY_SIZE, \ + .salt = cipher ## _SALT_SIZE, \ + .tag = cipher ## _TAG_SIZE, \ + .rec_seq = cipher ## _REC_SEQ_SIZE, \ +} + +const struct tls_cipher_size_desc tls_cipher_size_desc[] = { + CIPHER_SIZE_DESC(TLS_CIPHER_AES_GCM_128), + CIPHER_SIZE_DESC(TLS_CIPHER_AES_GCM_256), + CIPHER_SIZE_DESC(TLS_CIPHER_AES_CCM_128), + CIPHER_SIZE_DESC(TLS_CIPHER_CHACHA20_POLY1305), + CIPHER_SIZE_DESC(TLS_CIPHER_SM4_GCM), + CIPHER_SIZE_DESC(TLS_CIPHER_SM4_CCM), +}; + static const struct proto *saved_tcpv6_prot; static DEFINE_MUTEX(tcpv6_prot_mutex); static const struct proto *saved_tcpv4_prot; @@ -507,6 +524,54 @@ static int do_tls_getsockopt_conf(struct sock *sk, char __user *optval, rc = -EFAULT; break; } + case TLS_CIPHER_ARIA_GCM_128: { + struct tls12_crypto_info_aria_gcm_128 * + crypto_info_aria_gcm_128 = + container_of(crypto_info, + struct tls12_crypto_info_aria_gcm_128, + info); + + if (len != sizeof(*crypto_info_aria_gcm_128)) { + rc = -EINVAL; + goto out; + } + lock_sock(sk); + memcpy(crypto_info_aria_gcm_128->iv, + cctx->iv + TLS_CIPHER_ARIA_GCM_128_SALT_SIZE, + TLS_CIPHER_ARIA_GCM_128_IV_SIZE); + memcpy(crypto_info_aria_gcm_128->rec_seq, cctx->rec_seq, + TLS_CIPHER_ARIA_GCM_128_REC_SEQ_SIZE); + release_sock(sk); + if (copy_to_user(optval, + crypto_info_aria_gcm_128, + sizeof(*crypto_info_aria_gcm_128))) + rc = -EFAULT; + break; + } + case TLS_CIPHER_ARIA_GCM_256: { + struct tls12_crypto_info_aria_gcm_256 * + crypto_info_aria_gcm_256 = + container_of(crypto_info, + struct tls12_crypto_info_aria_gcm_256, + info); + + if (len != sizeof(*crypto_info_aria_gcm_256)) { + rc = -EINVAL; + goto out; + } + lock_sock(sk); + memcpy(crypto_info_aria_gcm_256->iv, + cctx->iv + TLS_CIPHER_ARIA_GCM_256_SALT_SIZE, + TLS_CIPHER_ARIA_GCM_256_IV_SIZE); + memcpy(crypto_info_aria_gcm_256->rec_seq, cctx->rec_seq, + TLS_CIPHER_ARIA_GCM_256_REC_SEQ_SIZE); + release_sock(sk); + if (copy_to_user(optval, + crypto_info_aria_gcm_256, + sizeof(*crypto_info_aria_gcm_256))) + rc = -EFAULT; + break; + } default: rc = -EINVAL; } @@ -668,6 +733,20 @@ static int do_tls_setsockopt_conf(struct sock *sk, sockptr_t optval, case TLS_CIPHER_SM4_CCM: optsize = sizeof(struct tls12_crypto_info_sm4_ccm); break; + case TLS_CIPHER_ARIA_GCM_128: + if (crypto_info->version != TLS_1_2_VERSION) { + rc = -EINVAL; + goto err_crypto_info; + } + optsize = sizeof(struct tls12_crypto_info_aria_gcm_128); + break; + case TLS_CIPHER_ARIA_GCM_256: + if (crypto_info->version != TLS_1_2_VERSION) { + rc = -EINVAL; + goto err_crypto_info; + } + optsize = sizeof(struct tls12_crypto_info_aria_gcm_256); + break; default: rc = -EINVAL; goto err_crypto_info; diff --git a/net/tls/tls_strp.c b/net/tls/tls_strp.c index 9b79e334dbd9..955ac3e0bf4d 100644 --- a/net/tls/tls_strp.c +++ b/net/tls/tls_strp.c @@ -273,7 +273,7 @@ static int tls_strp_read_copyin(struct tls_strparser *strp) return desc.error; } -static int tls_strp_read_short(struct tls_strparser *strp) +static int tls_strp_read_copy(struct tls_strparser *strp, bool qshort) { struct skb_shared_info *shinfo; struct page *page; @@ -283,7 +283,7 @@ static int tls_strp_read_short(struct tls_strparser *strp) * to read the data out. Otherwise the connection will stall. * Without pressure threshold of INT_MAX will never be ready. */ - if (likely(!tcp_epollin_ready(strp->sk, INT_MAX))) + if (likely(qshort && !tcp_epollin_ready(strp->sk, INT_MAX))) return 0; shinfo = skb_shinfo(strp->anchor); @@ -315,6 +315,27 @@ static int tls_strp_read_short(struct tls_strparser *strp) return 0; } +static bool tls_strp_check_no_dup(struct tls_strparser *strp) +{ + unsigned int len = strp->stm.offset + strp->stm.full_len; + struct sk_buff *skb; + u32 seq; + + skb = skb_shinfo(strp->anchor)->frag_list; + seq = TCP_SKB_CB(skb)->seq; + + while (skb->len < len) { + seq += skb->len; + len -= skb->len; + skb = skb->next; + + if (TCP_SKB_CB(skb)->seq != seq) + return false; + } + + return true; +} + static void tls_strp_load_anchor_with_queue(struct tls_strparser *strp, int len) { struct tcp_sock *tp = tcp_sk(strp->sk); @@ -373,7 +394,7 @@ static int tls_strp_read_sock(struct tls_strparser *strp) return tls_strp_read_copyin(strp); if (inq < strp->stm.full_len) - return tls_strp_read_short(strp); + return tls_strp_read_copy(strp, true); if (!strp->stm.full_len) { tls_strp_load_anchor_with_queue(strp, inq); @@ -387,9 +408,12 @@ static int tls_strp_read_sock(struct tls_strparser *strp) strp->stm.full_len = sz; if (!strp->stm.full_len || inq < strp->stm.full_len) - return tls_strp_read_short(strp); + return tls_strp_read_copy(strp, true); } + if (!tls_strp_check_no_dup(strp)) + return tls_strp_read_copy(strp, false); + strp->msg_ready = 1; tls_rx_msg_ready(strp); diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c index fe27241cd13f..264cf367e265 100644 --- a/net/tls/tls_sw.c +++ b/net/tls/tls_sw.c @@ -2629,6 +2629,40 @@ int tls_set_sw_offload(struct sock *sk, struct tls_context *ctx, int tx) cipher_name = "ccm(sm4)"; break; } + case TLS_CIPHER_ARIA_GCM_128: { + struct tls12_crypto_info_aria_gcm_128 *aria_gcm_128_info; + + aria_gcm_128_info = (void *)crypto_info; + nonce_size = TLS_CIPHER_ARIA_GCM_128_IV_SIZE; + tag_size = TLS_CIPHER_ARIA_GCM_128_TAG_SIZE; + iv_size = TLS_CIPHER_ARIA_GCM_128_IV_SIZE; + iv = aria_gcm_128_info->iv; + rec_seq_size = TLS_CIPHER_ARIA_GCM_128_REC_SEQ_SIZE; + rec_seq = aria_gcm_128_info->rec_seq; + keysize = TLS_CIPHER_ARIA_GCM_128_KEY_SIZE; + key = aria_gcm_128_info->key; + salt = aria_gcm_128_info->salt; + salt_size = TLS_CIPHER_ARIA_GCM_128_SALT_SIZE; + cipher_name = "gcm(aria)"; + break; + } + case TLS_CIPHER_ARIA_GCM_256: { + struct tls12_crypto_info_aria_gcm_256 *gcm_256_info; + + gcm_256_info = (void *)crypto_info; + nonce_size = TLS_CIPHER_ARIA_GCM_256_IV_SIZE; + tag_size = TLS_CIPHER_ARIA_GCM_256_TAG_SIZE; + iv_size = TLS_CIPHER_ARIA_GCM_256_IV_SIZE; + iv = gcm_256_info->iv; + rec_seq_size = TLS_CIPHER_ARIA_GCM_256_REC_SEQ_SIZE; + rec_seq = gcm_256_info->rec_seq; + keysize = TLS_CIPHER_ARIA_GCM_256_KEY_SIZE; + key = gcm_256_info->key; + salt = gcm_256_info->salt; + salt_size = TLS_CIPHER_ARIA_GCM_256_SALT_SIZE; + cipher_name = "gcm(aria)"; + break; + } default: rc = -EINVAL; goto free_priv; diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index dea2972c8178..b3545fc68097 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -569,12 +569,6 @@ static void unix_sock_destructor(struct sock *sk) skb_queue_purge(&sk->sk_receive_queue); -#if IS_ENABLED(CONFIG_AF_UNIX_OOB) - if (u->oob_skb) { - kfree_skb(u->oob_skb); - u->oob_skb = NULL; - } -#endif DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc)); DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk)); DEBUG_NET_WARN_ON_ONCE(sk->sk_socket); @@ -620,6 +614,13 @@ static void unix_release_sock(struct sock *sk, int embrion) unix_state_unlock(sk); +#if IS_ENABLED(CONFIG_AF_UNIX_OOB) + if (u->oob_skb) { + kfree_skb(u->oob_skb); + u->oob_skb = NULL; + } +#endif + wake_up_interruptible_all(&u->peer_wait); if (skpair != NULL) { @@ -1146,7 +1147,7 @@ static int unix_autobind(struct sock *sk) addr->name->sun_family = AF_UNIX; refcount_set(&addr->refcnt, 1); - ordernum = prandom_u32(); + ordernum = get_random_u32(); lastnum = ordernum & 0xFFFFF; retry: ordernum = (ordernum + 1) & 0xFFFFF; @@ -2536,32 +2537,18 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t si static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor) { - int copied = 0; - - while (1) { - struct unix_sock *u = unix_sk(sk); - struct sk_buff *skb; - int used, err; - - mutex_lock(&u->iolock); - skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err); - mutex_unlock(&u->iolock); - if (!skb) - return err; + struct unix_sock *u = unix_sk(sk); + struct sk_buff *skb; + int err, copied; - used = recv_actor(sk, skb); - if (used <= 0) { - if (!copied) - copied = used; - kfree_skb(skb); - break; - } else if (used <= skb->len) { - copied += used; - } + mutex_lock(&u->iolock); + skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err); + mutex_unlock(&u->iolock); + if (!skb) + return err; - kfree_skb(skb); - break; - } + copied = recv_actor(sk, skb); + kfree_skb(skb); return copied; } @@ -2573,13 +2560,14 @@ static long unix_stream_data_wait(struct sock *sk, long timeo, struct sk_buff *last, unsigned int last_len, bool freezable) { + unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE; struct sk_buff *tail; DEFINE_WAIT(wait); unix_state_lock(sk); for (;;) { - prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + prepare_to_wait(sk_sleep(sk), &wait, state); tail = skb_peek_tail(&sk->sk_receive_queue); if (tail != last || @@ -2592,10 +2580,7 @@ static long unix_stream_data_wait(struct sock *sk, long timeo, sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); unix_state_unlock(sk); - if (freezable) - timeo = freezable_schedule_timeout(timeo); - else - timeo = schedule_timeout(timeo); + timeo = schedule_timeout(timeo); unix_state_lock(sk); if (sock_flag(sk, SOCK_DEAD)) diff --git a/net/unix/garbage.c b/net/unix/garbage.c index d45d5366115a..dc2763540393 100644 --- a/net/unix/garbage.c +++ b/net/unix/garbage.c @@ -204,6 +204,7 @@ void wait_for_unix_gc(void) /* The external entry point: unix_gc() */ void unix_gc(void) { + struct sk_buff *next_skb, *skb; struct unix_sock *u; struct unix_sock *next; struct sk_buff_head hitlist; @@ -297,11 +298,30 @@ void unix_gc(void) spin_unlock(&unix_gc_lock); + /* We need io_uring to clean its registered files, ignore all io_uring + * originated skbs. It's fine as io_uring doesn't keep references to + * other io_uring instances and so killing all other files in the cycle + * will put all io_uring references forcing it to go through normal + * release.path eventually putting registered files. + */ + skb_queue_walk_safe(&hitlist, skb, next_skb) { + if (skb->scm_io_uring) { + __skb_unlink(skb, &hitlist); + skb_queue_tail(&skb->sk->sk_receive_queue, skb); + } + } + /* Here we are. Hitlist is filled. Die. */ __skb_queue_purge(&hitlist); spin_lock(&unix_gc_lock); + /* There could be io_uring registered files, just push them back to + * the inflight list + */ + list_for_each_entry_safe(u, next, &gc_candidates, link) + list_move_tail(&u->link, &gc_inflight_list); + /* All candidates should have been detached by now. */ BUG_ON(!list_empty(&gc_candidates)); diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c index 35863132f4f1..a9980e9b9304 100644 --- a/net/vmw_vsock/virtio_transport_common.c +++ b/net/vmw_vsock/virtio_transport_common.c @@ -1339,7 +1339,7 @@ EXPORT_SYMBOL_GPL(virtio_transport_recv_pkt); void virtio_transport_free_pkt(struct virtio_vsock_pkt *pkt) { - kfree(pkt->buf); + kvfree(pkt->buf); kfree(pkt); } EXPORT_SYMBOL_GPL(virtio_transport_free_pkt); diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c index b14f0ed7427b..842c94286d31 100644 --- a/net/vmw_vsock/vmci_transport.c +++ b/net/vmw_vsock/vmci_transport.c @@ -951,7 +951,7 @@ static int vmci_transport_recv_listen(struct sock *sk, * for ourself or any previous connection requests that we received. * If it's the latter, we try to find a socket in our list of pending * connections and, if we do, call the appropriate handler for the - * state that that socket is in. Otherwise we try to service the + * state that socket is in. Otherwise we try to service the * connection request. */ pending = vmci_transport_get_pending(sk, pkt); diff --git a/net/wireless/core.h b/net/wireless/core.h index 775e16cb99ed..af85d8909935 100644 --- a/net/wireless/core.h +++ b/net/wireless/core.h @@ -271,6 +271,8 @@ struct cfg80211_event { } ij; struct { u8 bssid[ETH_ALEN]; + const u8 *td_bitmap; + u8 td_bitmap_len; } pa; }; }; @@ -409,7 +411,8 @@ int cfg80211_disconnect(struct cfg80211_registered_device *rdev, bool wextev); void __cfg80211_roamed(struct wireless_dev *wdev, struct cfg80211_roam_info *info); -void __cfg80211_port_authorized(struct wireless_dev *wdev, const u8 *bssid); +void __cfg80211_port_authorized(struct wireless_dev *wdev, const u8 *bssid, + const u8 *td_bitmap, u8 td_bitmap_len); int cfg80211_mgd_wext_connect(struct cfg80211_registered_device *rdev, struct wireless_dev *wdev); void cfg80211_autodisconnect_wk(struct work_struct *work); diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c index 581df7f4c524..58e1fb18f85a 100644 --- a/net/wireless/mlme.c +++ b/net/wireless/mlme.c @@ -42,6 +42,10 @@ void cfg80211_rx_assoc_resp(struct net_device *dev, unsigned int link_id; for (link_id = 0; link_id < ARRAY_SIZE(data->links); link_id++) { + cr.links[link_id].status = data->links[link_id].status; + WARN_ON_ONCE(cr.links[link_id].status != WLAN_STATUS_SUCCESS && + (!cr.ap_mld_addr || !cr.links[link_id].bss)); + cr.links[link_id].bss = data->links[link_id].bss; if (!cr.links[link_id].bss) continue; diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 8ff8b1c040f0..148f66edb015 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -7780,6 +7780,7 @@ static int nl80211_set_bss(struct sk_buff *skb, struct genl_info *info) int err; memset(¶ms, 0, sizeof(params)); + params.link_id = nl80211_link_id_or_invalid(info->attrs); /* default to not changing parameters */ params.use_cts_prot = -1; params.use_short_preamble = -1; @@ -13265,7 +13266,9 @@ static int nl80211_parse_wowlan_tcp(struct cfg80211_registered_device *rdev, wake_mask_size); if (tok) { cfg->tokens_size = tokens_size; - memcpy(&cfg->payload_tok, tok, sizeof(*tok) + tokens_size); + cfg->payload_tok = *tok; + memcpy(cfg->payload_tok.token_stream, tok->token_stream, + tokens_size); } trig->tcp = cfg; @@ -16564,7 +16567,8 @@ static const struct genl_small_ops nl80211_small_ops[] = { .validate = GENL_DONT_VALIDATE_STRICT | GENL_DONT_VALIDATE_DUMP, .doit = nl80211_set_bss, .flags = GENL_UNS_ADMIN_PERM, - .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP), + .internal_flags = IFLAGS(NL80211_FLAG_NEED_NETDEV_UP | + NL80211_FLAG_MLO_VALID_LINK_ID), }, { .cmd = NL80211_CMD_GET_REG, @@ -17745,6 +17749,7 @@ void nl80211_send_connect_result(struct cfg80211_registered_device *rdev, link_info_size += (cr->links[link].bssid || cr->links[link].bss) ? nla_total_size(ETH_ALEN) : 0; + link_info_size += nla_total_size(sizeof(u16)); } } @@ -17813,7 +17818,9 @@ void nl80211_send_connect_result(struct cfg80211_registered_device *rdev, nla_put(msg, NL80211_ATTR_BSSID, ETH_ALEN, bssid)) || (cr->links[link].addr && nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, - cr->links[link].addr))) + cr->links[link].addr)) || + nla_put_u16(msg, NL80211_ATTR_STATUS_CODE, + cr->links[link].status)) goto nla_put_failure; nla_nest_end(msg, nested_mlo_links); @@ -17937,7 +17944,8 @@ void nl80211_send_roamed(struct cfg80211_registered_device *rdev, } void nl80211_send_port_authorized(struct cfg80211_registered_device *rdev, - struct net_device *netdev, const u8 *bssid) + struct net_device *netdev, const u8 *bssid, + const u8 *td_bitmap, u8 td_bitmap_len) { struct sk_buff *msg; void *hdr; @@ -17957,6 +17965,11 @@ void nl80211_send_port_authorized(struct cfg80211_registered_device *rdev, nla_put(msg, NL80211_ATTR_MAC, ETH_ALEN, bssid)) goto nla_put_failure; + if ((td_bitmap_len > 0) && td_bitmap) + if (nla_put(msg, NL80211_ATTR_TD_BITMAP, + td_bitmap_len, td_bitmap)) + goto nla_put_failure; + genlmsg_end(msg, hdr); genlmsg_multicast_netns(&nl80211_fam, wiphy_net(&rdev->wiphy), msg, 0, diff --git a/net/wireless/nl80211.h b/net/wireless/nl80211.h index 855d540ddfb9..ba9457e94c43 100644 --- a/net/wireless/nl80211.h +++ b/net/wireless/nl80211.h @@ -83,7 +83,8 @@ void nl80211_send_roamed(struct cfg80211_registered_device *rdev, struct net_device *netdev, struct cfg80211_roam_info *info, gfp_t gfp); void nl80211_send_port_authorized(struct cfg80211_registered_device *rdev, - struct net_device *netdev, const u8 *bssid); + struct net_device *netdev, const u8 *bssid, + const u8 *td_bitmap, u8 td_bitmap_len); void nl80211_send_disconnected(struct cfg80211_registered_device *rdev, struct net_device *netdev, u16 reason, const u8 *ie, size_t ie_len, bool from_ap); diff --git a/net/wireless/scan.c b/net/wireless/scan.c index 5382fc2003db..806a5f1330ff 100644 --- a/net/wireless/scan.c +++ b/net/wireless/scan.c @@ -143,18 +143,12 @@ static inline void bss_ref_get(struct cfg80211_registered_device *rdev, lockdep_assert_held(&rdev->bss_lock); bss->refcount++; - if (bss->pub.hidden_beacon_bss) { - bss = container_of(bss->pub.hidden_beacon_bss, - struct cfg80211_internal_bss, - pub); - bss->refcount++; - } - if (bss->pub.transmitted_bss) { - bss = container_of(bss->pub.transmitted_bss, - struct cfg80211_internal_bss, - pub); - bss->refcount++; - } + + if (bss->pub.hidden_beacon_bss) + bss_from_pub(bss->pub.hidden_beacon_bss)->refcount++; + + if (bss->pub.transmitted_bss) + bss_from_pub(bss->pub.transmitted_bss)->refcount++; } static inline void bss_ref_put(struct cfg80211_registered_device *rdev, @@ -304,7 +298,8 @@ static size_t cfg80211_gen_new_ie(const u8 *ie, size_t ielen, tmp_old = cfg80211_find_ie(WLAN_EID_SSID, ie, ielen); tmp_old = (tmp_old) ? tmp_old + tmp_old[1] + 2 : ie; - while (tmp_old + tmp_old[1] + 2 - ie <= ielen) { + while (tmp_old + 2 - ie <= ielen && + tmp_old + tmp_old[1] + 2 - ie <= ielen) { if (tmp_old[0] == 0) { tmp_old++; continue; @@ -364,7 +359,8 @@ static size_t cfg80211_gen_new_ie(const u8 *ie, size_t ielen, * copied to new ie, skip ssid, capability, bssid-index ie */ tmp_new = sub_copy; - while (tmp_new + tmp_new[1] + 2 - sub_copy <= subie_len) { + while (tmp_new + 2 - sub_copy <= subie_len && + tmp_new + tmp_new[1] + 2 - sub_copy <= subie_len) { if (!(tmp_new[0] == WLAN_EID_NON_TX_BSSID_CAP || tmp_new[0] == WLAN_EID_SSID)) { memcpy(pos, tmp_new, tmp_new[1] + 2); @@ -427,6 +423,15 @@ cfg80211_add_nontrans_list(struct cfg80211_bss *trans_bss, rcu_read_unlock(); + /* + * This is a bit weird - it's not on the list, but already on another + * one! The only way that could happen is if there's some BSSID/SSID + * shared by multiple APs in their multi-BSSID profiles, potentially + * with hidden SSID mixed in ... ignore it. + */ + if (!list_empty(&nontrans_bss->nontrans_list)) + return -EINVAL; + /* add to the list */ list_add_tail(&nontrans_bss->nontrans_list, &trans_bss->nontrans_list); return 0; @@ -1602,6 +1607,23 @@ struct cfg80211_non_tx_bss { u8 bssid_index; }; +static void cfg80211_update_hidden_bsses(struct cfg80211_internal_bss *known, + const struct cfg80211_bss_ies *new_ies, + const struct cfg80211_bss_ies *old_ies) +{ + struct cfg80211_internal_bss *bss; + + /* Assign beacon IEs to all sub entries */ + list_for_each_entry(bss, &known->hidden_list, hidden_list) { + const struct cfg80211_bss_ies *ies; + + ies = rcu_access_pointer(bss->pub.beacon_ies); + WARN_ON(ies != old_ies); + + rcu_assign_pointer(bss->pub.beacon_ies, new_ies); + } +} + static bool cfg80211_update_known_bss(struct cfg80211_registered_device *rdev, struct cfg80211_internal_bss *known, @@ -1625,7 +1647,6 @@ cfg80211_update_known_bss(struct cfg80211_registered_device *rdev, kfree_rcu((struct cfg80211_bss_ies *)old, rcu_head); } else if (rcu_access_pointer(new->pub.beacon_ies)) { const struct cfg80211_bss_ies *old; - struct cfg80211_internal_bss *bss; if (known->pub.hidden_beacon_bss && !list_empty(&known->hidden_list)) { @@ -1653,16 +1674,7 @@ cfg80211_update_known_bss(struct cfg80211_registered_device *rdev, if (old == rcu_access_pointer(known->pub.ies)) rcu_assign_pointer(known->pub.ies, new->pub.beacon_ies); - /* Assign beacon IEs to all sub entries */ - list_for_each_entry(bss, &known->hidden_list, hidden_list) { - const struct cfg80211_bss_ies *ies; - - ies = rcu_access_pointer(bss->pub.beacon_ies); - WARN_ON(ies != old); - - rcu_assign_pointer(bss->pub.beacon_ies, - new->pub.beacon_ies); - } + cfg80211_update_hidden_bsses(known, new->pub.beacon_ies, old); if (old) kfree_rcu((struct cfg80211_bss_ies *)old, rcu_head); @@ -1739,6 +1751,8 @@ cfg80211_bss_update(struct cfg80211_registered_device *rdev, new->refcount = 1; INIT_LIST_HEAD(&new->hidden_list); INIT_LIST_HEAD(&new->pub.nontrans_list); + /* we'll set this later if it was non-NULL */ + new->pub.transmitted_bss = NULL; if (rcu_access_pointer(tmp->pub.proberesp_ies)) { hidden = rb_find_bss(rdev, tmp, BSS_CMP_HIDE_ZLEN); @@ -2021,10 +2035,15 @@ cfg80211_inform_single_bss_data(struct wiphy *wiphy, spin_lock_bh(&rdev->bss_lock); if (cfg80211_add_nontrans_list(non_tx_data->tx_bss, &res->pub)) { - if (__cfg80211_unlink_bss(rdev, res)) + if (__cfg80211_unlink_bss(rdev, res)) { rdev->bss_generation++; + res = NULL; + } } spin_unlock_bh(&rdev->bss_lock); + + if (!res) + return NULL; } trace_cfg80211_return_bss(&res->pub); @@ -2143,6 +2162,8 @@ static void cfg80211_parse_mbssid_data(struct wiphy *wiphy, for_each_element_id(elem, WLAN_EID_MULTIPLE_BSSID, ie, ielen) { if (elem->datalen < 4) continue; + if (elem->data[0] < 1 || (int)elem->data[0] > 8) + continue; for_each_element(sub, elem->data + 1, elem->datalen - 1) { u8 profile_len; @@ -2279,7 +2300,7 @@ cfg80211_update_notlisted_nontrans(struct wiphy *wiphy, size_t new_ie_len; struct cfg80211_bss_ies *new_ies; const struct cfg80211_bss_ies *old; - u8 cpy_len; + size_t cpy_len; lockdep_assert_held(&wiphy_to_rdev(wiphy)->bss_lock); @@ -2346,6 +2367,8 @@ cfg80211_update_notlisted_nontrans(struct wiphy *wiphy, } else { old = rcu_access_pointer(nontrans_bss->beacon_ies); rcu_assign_pointer(nontrans_bss->beacon_ies, new_ies); + cfg80211_update_hidden_bsses(bss_from_pub(nontrans_bss), + new_ies, old); rcu_assign_pointer(nontrans_bss->ies, new_ies); if (old) kfree_rcu((struct cfg80211_bss_ies *)old, rcu_head); diff --git a/net/wireless/sme.c b/net/wireless/sme.c index d513536617bd..4b5b6ee0fe01 100644 --- a/net/wireless/sme.c +++ b/net/wireless/sme.c @@ -793,6 +793,10 @@ void __cfg80211_connect_result(struct net_device *dev, } for_each_valid_link(cr, link) { + /* don't do extra lookups for failures */ + if (cr->links[link].status != WLAN_STATUS_SUCCESS) + continue; + if (cr->links[link].bss) continue; @@ -829,6 +833,16 @@ void __cfg80211_connect_result(struct net_device *dev, } memset(wdev->links, 0, sizeof(wdev->links)); + for_each_valid_link(cr, link) { + if (cr->links[link].status == WLAN_STATUS_SUCCESS) + continue; + cr->valid_links &= ~BIT(link); + /* don't require bss pointer for failed links */ + if (!cr->links[link].bss) + continue; + cfg80211_unhold_bss(bss_from_pub(cr->links[link].bss)); + cfg80211_put_bss(wdev->wiphy, cr->links[link].bss); + } wdev->valid_links = cr->valid_links; for_each_valid_link(cr, link) wdev->links[link].client.current_bss = @@ -1237,7 +1251,8 @@ out: } EXPORT_SYMBOL(cfg80211_roamed); -void __cfg80211_port_authorized(struct wireless_dev *wdev, const u8 *bssid) +void __cfg80211_port_authorized(struct wireless_dev *wdev, const u8 *bssid, + const u8 *td_bitmap, u8 td_bitmap_len) { ASSERT_WDEV_LOCK(wdev); @@ -1250,11 +1265,11 @@ void __cfg80211_port_authorized(struct wireless_dev *wdev, const u8 *bssid) return; nl80211_send_port_authorized(wiphy_to_rdev(wdev->wiphy), wdev->netdev, - bssid); + bssid, td_bitmap, td_bitmap_len); } void cfg80211_port_authorized(struct net_device *dev, const u8 *bssid, - gfp_t gfp) + const u8 *td_bitmap, u8 td_bitmap_len, gfp_t gfp) { struct wireless_dev *wdev = dev->ieee80211_ptr; struct cfg80211_registered_device *rdev = wiphy_to_rdev(wdev->wiphy); @@ -1264,12 +1279,15 @@ void cfg80211_port_authorized(struct net_device *dev, const u8 *bssid, if (WARN_ON(!bssid)) return; - ev = kzalloc(sizeof(*ev), gfp); + ev = kzalloc(sizeof(*ev) + td_bitmap_len, gfp); if (!ev) return; ev->type = EVENT_PORT_AUTHORIZED; memcpy(ev->pa.bssid, bssid, ETH_ALEN); + ev->pa.td_bitmap = ((u8 *)ev) + sizeof(*ev); + ev->pa.td_bitmap_len = td_bitmap_len; + memcpy((void *)ev->pa.td_bitmap, td_bitmap, td_bitmap_len); /* * Use the wdev event list so that if there are pending diff --git a/net/wireless/util.c b/net/wireless/util.c index 0b28d00ba8f5..eb277105a878 100644 --- a/net/wireless/util.c +++ b/net/wireless/util.c @@ -559,7 +559,7 @@ int ieee80211_data_to_8023_exthdr(struct sk_buff *skb, struct ethhdr *ehdr, return -1; hdrlen = ieee80211_hdrlen(hdr->frame_control) + data_offset; - if (skb->len < hdrlen + 8) + if (skb->len < hdrlen) return -1; /* convert IEEE 802.11 header + possible LLC headers into Ethernet @@ -574,8 +574,9 @@ int ieee80211_data_to_8023_exthdr(struct sk_buff *skb, struct ethhdr *ehdr, memcpy(tmp.h_dest, ieee80211_get_DA(hdr), ETH_ALEN); memcpy(tmp.h_source, ieee80211_get_SA(hdr), ETH_ALEN); - if (iftype == NL80211_IFTYPE_MESH_POINT) - skb_copy_bits(skb, hdrlen, &mesh_flags, 1); + if (iftype == NL80211_IFTYPE_MESH_POINT && + skb_copy_bits(skb, hdrlen, &mesh_flags, 1) < 0) + return -1; mesh_flags &= MESH_FLAGS_AE; @@ -595,11 +596,12 @@ int ieee80211_data_to_8023_exthdr(struct sk_buff *skb, struct ethhdr *ehdr, if (iftype == NL80211_IFTYPE_MESH_POINT) { if (mesh_flags == MESH_FLAGS_AE_A4) return -1; - if (mesh_flags == MESH_FLAGS_AE_A5_A6) { - skb_copy_bits(skb, hdrlen + - offsetof(struct ieee80211s_hdr, eaddr1), - tmp.h_dest, 2 * ETH_ALEN); - } + if (mesh_flags == MESH_FLAGS_AE_A5_A6 && + skb_copy_bits(skb, hdrlen + + offsetof(struct ieee80211s_hdr, eaddr1), + tmp.h_dest, 2 * ETH_ALEN) < 0) + return -1; + hdrlen += __ieee80211_get_mesh_hdrlen(mesh_flags); } break; @@ -613,10 +615,11 @@ int ieee80211_data_to_8023_exthdr(struct sk_buff *skb, struct ethhdr *ehdr, if (iftype == NL80211_IFTYPE_MESH_POINT) { if (mesh_flags == MESH_FLAGS_AE_A5_A6) return -1; - if (mesh_flags == MESH_FLAGS_AE_A4) - skb_copy_bits(skb, hdrlen + - offsetof(struct ieee80211s_hdr, eaddr1), - tmp.h_source, ETH_ALEN); + if (mesh_flags == MESH_FLAGS_AE_A4 && + skb_copy_bits(skb, hdrlen + + offsetof(struct ieee80211s_hdr, eaddr1), + tmp.h_source, ETH_ALEN) < 0) + return -1; hdrlen += __ieee80211_get_mesh_hdrlen(mesh_flags); } break; @@ -628,16 +631,15 @@ int ieee80211_data_to_8023_exthdr(struct sk_buff *skb, struct ethhdr *ehdr, break; } - skb_copy_bits(skb, hdrlen, &payload, sizeof(payload)); - tmp.h_proto = payload.proto; - - if (likely((!is_amsdu && ether_addr_equal(payload.hdr, rfc1042_header) && - tmp.h_proto != htons(ETH_P_AARP) && - tmp.h_proto != htons(ETH_P_IPX)) || - ether_addr_equal(payload.hdr, bridge_tunnel_header))) { + if (likely(skb_copy_bits(skb, hdrlen, &payload, sizeof(payload)) == 0 && + ((!is_amsdu && ether_addr_equal(payload.hdr, rfc1042_header) && + payload.proto != htons(ETH_P_AARP) && + payload.proto != htons(ETH_P_IPX)) || + ether_addr_equal(payload.hdr, bridge_tunnel_header)))) { /* remove RFC1042 or Bridge-Tunnel encapsulation and * replace EtherType */ hdrlen += ETH_ALEN + 2; + tmp.h_proto = payload.proto; skb_postpull_rcsum(skb, &payload, ETH_ALEN + 2); } else { tmp.h_proto = htons(skb->len - hdrlen); @@ -988,7 +990,9 @@ void cfg80211_process_wdev_events(struct wireless_dev *wdev) __cfg80211_leave(wiphy_to_rdev(wdev->wiphy), wdev); break; case EVENT_PORT_AUTHORIZED: - __cfg80211_port_authorized(wdev, ev->pa.bssid); + __cfg80211_port_authorized(wdev, ev->pa.bssid, + ev->pa.td_bitmap, + ev->pa.td_bitmap_len); break; } wdev_unlock(wdev); @@ -1361,7 +1365,7 @@ static u32 cfg80211_calculate_bitrate_he(struct rate_info *rate) 25599, /* 4.166666... */ 17067, /* 2.777777... */ 12801, /* 2.083333... */ - 11769, /* 1.851851... */ + 11377, /* 1.851725... */ 10239, /* 1.666666... */ 8532, /* 1.388888... */ 7680, /* 1.250000... */ @@ -1444,7 +1448,7 @@ static u32 cfg80211_calculate_bitrate_eht(struct rate_info *rate) 25599, /* 4.166666... */ 17067, /* 2.777777... */ 12801, /* 2.083333... */ - 11769, /* 1.851851... */ + 11377, /* 1.851725... */ 10239, /* 1.666666... */ 8532, /* 1.388888... */ 7680, /* 1.250000... */ diff --git a/net/wireless/wext-core.c b/net/wireless/wext-core.c index 76a80a41615b..fe8765c4075d 100644 --- a/net/wireless/wext-core.c +++ b/net/wireless/wext-core.c @@ -468,6 +468,7 @@ void wireless_send_event(struct net_device * dev, struct __compat_iw_event *compat_event; struct compat_iw_point compat_wrqu; struct sk_buff *compskb; + int ptr_len; #endif /* @@ -582,6 +583,9 @@ void wireless_send_event(struct net_device * dev, nlmsg_end(skb, nlh); #ifdef CONFIG_COMPAT hdr_len = compat_event_type_size[descr->header_type]; + + /* ptr_len is remaining size in event header apart from LCP */ + ptr_len = hdr_len - IW_EV_COMPAT_LCP_LEN; event_len = hdr_len + extra_len; compskb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_ATOMIC); @@ -612,16 +616,15 @@ void wireless_send_event(struct net_device * dev, if (descr->header_type == IW_HEADER_TYPE_POINT) { compat_wrqu.length = wrqu->data.length; compat_wrqu.flags = wrqu->data.flags; - memcpy(&compat_event->pointer, - ((char *) &compat_wrqu) + IW_EV_COMPAT_POINT_OFF, - hdr_len - IW_EV_COMPAT_LCP_LEN); + memcpy(compat_event->ptr_bytes, + ((char *)&compat_wrqu) + IW_EV_COMPAT_POINT_OFF, + ptr_len); if (extra_len) - memcpy(((char *) compat_event) + hdr_len, - extra, extra_len); + memcpy(&compat_event->ptr_bytes[ptr_len], + extra, extra_len); } else { /* extra_len must be zero, so no if (extra) needed */ - memcpy(&compat_event->pointer, wrqu, - hdr_len - IW_EV_COMPAT_LCP_LEN); + memcpy(compat_event->ptr_bytes, wrqu, ptr_len); } nlmsg_end(compskb, nlh); diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index 869b9b9b9fad..4681e8e8ad94 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -19,8 +19,6 @@ #include "xdp_umem.h" #include "xsk_queue.h" -#define XDP_UMEM_MIN_CHUNK_SIZE 2048 - static DEFINE_IDA(umem_ida); static void xdp_umem_unpin_pages(struct xdp_umem *umem) diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c index 5b4ce6ba1bc7..9f0561b67c12 100644 --- a/net/xdp/xsk.c +++ b/net/xdp/xsk.c @@ -355,16 +355,15 @@ static u32 xsk_tx_peek_release_fallback(struct xsk_buff_pool *pool, u32 max_entr return nb_pkts; } -u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, u32 max_entries) +u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, u32 nb_pkts) { struct xdp_sock *xs; - u32 nb_pkts; rcu_read_lock(); if (!list_is_singular(&pool->xsk_tx_list)) { /* Fallback to the non-batched version */ rcu_read_unlock(); - return xsk_tx_peek_release_fallback(pool, max_entries); + return xsk_tx_peek_release_fallback(pool, nb_pkts); } xs = list_first_or_null_rcu(&pool->xsk_tx_list, struct xdp_sock, tx_list); @@ -373,12 +372,7 @@ u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, u32 max_entries) goto out; } - max_entries = xskq_cons_nb_entries(xs->tx, max_entries); - nb_pkts = xskq_cons_read_desc_batch(xs->tx, pool, max_entries); - if (!nb_pkts) { - xs->tx->queue_empty_descs++; - goto out; - } + nb_pkts = xskq_cons_nb_entries(xs->tx, nb_pkts); /* This is the backpressure mechanism for the Tx path. Try to * reserve space in the completion queue for all packets, but @@ -386,12 +380,18 @@ u32 xsk_tx_peek_release_desc_batch(struct xsk_buff_pool *pool, u32 max_entries) * packets. This avoids having to implement any buffering in * the Tx path. */ - nb_pkts = xskq_prod_reserve_addr_batch(pool->cq, pool->tx_descs, nb_pkts); + nb_pkts = xskq_prod_nb_free(pool->cq, nb_pkts); if (!nb_pkts) goto out; - xskq_cons_release_n(xs->tx, max_entries); + nb_pkts = xskq_cons_read_desc_batch(xs->tx, pool, nb_pkts); + if (!nb_pkts) { + xs->tx->queue_empty_descs++; + goto out; + } + __xskq_cons_release(xs->tx); + xskq_prod_write_addr_batch(pool->cq, pool->tx_descs, nb_pkts); xs->sk.sk_write_space(&xs->sk); out: @@ -954,8 +954,8 @@ static int xsk_bind(struct socket *sock, struct sockaddr *addr, int addr_len) goto out_unlock; } - err = xp_assign_dev_shared(xs->pool, umem_xs->umem, - dev, qid); + err = xp_assign_dev_shared(xs->pool, umem_xs, dev, + qid); if (err) { xp_destroy(xs->pool); xs->pool = NULL; diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c index a71a8c6edf55..ed6c71826d31 100644 --- a/net/xdp/xsk_buff_pool.c +++ b/net/xdp/xsk_buff_pool.c @@ -212,17 +212,18 @@ err_unreg_pool: return err; } -int xp_assign_dev_shared(struct xsk_buff_pool *pool, struct xdp_umem *umem, +int xp_assign_dev_shared(struct xsk_buff_pool *pool, struct xdp_sock *umem_xs, struct net_device *dev, u16 queue_id) { u16 flags; + struct xdp_umem *umem = umem_xs->umem; /* One fill and completion ring required for each queue id. */ if (!pool->fq || !pool->cq) return -EINVAL; flags = umem->zc ? XDP_ZEROCOPY : XDP_COPY; - if (pool->uses_need_wakeup) + if (umem_xs->pool->uses_need_wakeup) flags |= XDP_USE_NEED_WAKEUP; return xp_assign_dev(pool, dev, queue_id, flags); diff --git a/net/xdp/xsk_queue.h b/net/xdp/xsk_queue.h index fb20bf7207cf..c6fb6b763658 100644 --- a/net/xdp/xsk_queue.h +++ b/net/xdp/xsk_queue.h @@ -205,6 +205,11 @@ static inline bool xskq_cons_read_desc(struct xsk_queue *q, return false; } +static inline void xskq_cons_release_n(struct xsk_queue *q, u32 cnt) +{ + q->cached_cons += cnt; +} + static inline u32 xskq_cons_read_desc_batch(struct xsk_queue *q, struct xsk_buff_pool *pool, u32 max) { @@ -226,6 +231,8 @@ static inline u32 xskq_cons_read_desc_batch(struct xsk_queue *q, struct xsk_buff cached_cons++; } + /* Release valid plus any invalid entries */ + xskq_cons_release_n(q, cached_cons - q->cached_cons); return nb_entries; } @@ -291,11 +298,6 @@ static inline void xskq_cons_release(struct xsk_queue *q) q->cached_cons++; } -static inline void xskq_cons_release_n(struct xsk_queue *q, u32 cnt) -{ - q->cached_cons += cnt; -} - static inline u32 xskq_cons_present_entries(struct xsk_queue *q) { /* No barriers needed since data is not accessed */ @@ -350,21 +352,17 @@ static inline int xskq_prod_reserve_addr(struct xsk_queue *q, u64 addr) return 0; } -static inline u32 xskq_prod_reserve_addr_batch(struct xsk_queue *q, struct xdp_desc *descs, - u32 max) +static inline void xskq_prod_write_addr_batch(struct xsk_queue *q, struct xdp_desc *descs, + u32 nb_entries) { struct xdp_umem_ring *ring = (struct xdp_umem_ring *)q->ring; - u32 nb_entries, i, cached_prod; - - nb_entries = xskq_prod_nb_free(q, max); + u32 i, cached_prod; /* A, matches D */ cached_prod = q->cached_prod; for (i = 0; i < nb_entries; i++) ring->desc[cached_prod++ & q->ring_mask] = descs[i].addr; q->cached_prod = cached_prod; - - return nb_entries; } static inline int xskq_prod_reserve_desc(struct xsk_queue *q, diff --git a/net/xfrm/espintcp.c b/net/xfrm/espintcp.c index 974eb97b77d2..29a540dcb5a7 100644 --- a/net/xfrm/espintcp.c +++ b/net/xfrm/espintcp.c @@ -91,7 +91,7 @@ static void espintcp_rcv(struct strparser *strp, struct sk_buff *skb) } /* remove header, leave non-ESP marker/SPI */ - if (!__pskb_pull(skb, rxm->offset + 2)) { + if (!pskb_pull(skb, rxm->offset + 2)) { XFRM_INC_STATS(sock_net(strp->sk), LINUX_MIB_XFRMINERROR); kfree_skb(skb); return; diff --git a/net/xfrm/xfrm_device.c b/net/xfrm/xfrm_device.c index 637ca8838436..5f5aafd418af 100644 --- a/net/xfrm/xfrm_device.c +++ b/net/xfrm/xfrm_device.c @@ -207,7 +207,8 @@ struct sk_buff *validate_xmit_xfrm(struct sk_buff *skb, netdev_features_t featur EXPORT_SYMBOL_GPL(validate_xmit_xfrm); int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, - struct xfrm_user_offload *xuo) + struct xfrm_user_offload *xuo, + struct netlink_ext_ack *extack) { int err; struct dst_entry *dst; @@ -216,15 +217,21 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, xfrm_address_t *saddr; xfrm_address_t *daddr; - if (!x->type_offload) + if (!x->type_offload) { + NL_SET_ERR_MSG(extack, "Type doesn't support offload"); return -EINVAL; + } /* We don't yet support UDP encapsulation and TFC padding. */ - if (x->encap || x->tfcpad) + if (x->encap || x->tfcpad) { + NL_SET_ERR_MSG(extack, "Encapsulation and TFC padding can't be offloaded"); return -EINVAL; + } - if (xuo->flags & ~(XFRM_OFFLOAD_IPV6 | XFRM_OFFLOAD_INBOUND)) + if (xuo->flags & ~(XFRM_OFFLOAD_IPV6 | XFRM_OFFLOAD_INBOUND)) { + NL_SET_ERR_MSG(extack, "Unrecognized flags in offload request"); return -EINVAL; + } dev = dev_get_by_index(net, xuo->ifindex); if (!dev) { @@ -256,6 +263,7 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, if (x->props.flags & XFRM_STATE_ESN && !dev->xfrmdev_ops->xdo_dev_state_advance_esn) { + NL_SET_ERR_MSG(extack, "Device doesn't support offload with ESN"); xso->dev = NULL; dev_put(dev); return -EINVAL; @@ -277,8 +285,10 @@ int xfrm_dev_state_add(struct net *net, struct xfrm_state *x, xso->real_dev = NULL; netdev_put(dev, &xso->dev_tracker); - if (err != -EOPNOTSUPP) + if (err != -EOPNOTSUPP) { + NL_SET_ERR_MSG(extack, "Device failed to offload this state"); return err; + } } return 0; diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c index b2f4ec9c537f..97074f6f2bde 100644 --- a/net/xfrm/xfrm_input.c +++ b/net/xfrm/xfrm_input.c @@ -20,11 +20,13 @@ #include <net/xfrm.h> #include <net/ip_tunnels.h> #include <net/ip6_tunnel.h> +#include <net/dst_metadata.h> #include "xfrm_inout.h" struct xfrm_trans_tasklet { - struct tasklet_struct tasklet; + struct work_struct work; + spinlock_t queue_lock; struct sk_buff_head queue; }; @@ -719,7 +721,8 @@ resume: sp = skb_sec_path(skb); if (sp) sp->olen = 0; - skb_dst_drop(skb); + if (skb_valid_dst(skb)) + skb_dst_drop(skb); gro_cells_receive(&gro_cells, skb); return 0; } else { @@ -737,7 +740,8 @@ resume: sp = skb_sec_path(skb); if (sp) sp->olen = 0; - skb_dst_drop(skb); + if (skb_valid_dst(skb)) + skb_dst_drop(skb); gro_cells_receive(&gro_cells, skb); return err; } @@ -760,18 +764,22 @@ int xfrm_input_resume(struct sk_buff *skb, int nexthdr) } EXPORT_SYMBOL(xfrm_input_resume); -static void xfrm_trans_reinject(struct tasklet_struct *t) +static void xfrm_trans_reinject(struct work_struct *work) { - struct xfrm_trans_tasklet *trans = from_tasklet(trans, t, tasklet); + struct xfrm_trans_tasklet *trans = container_of(work, struct xfrm_trans_tasklet, work); struct sk_buff_head queue; struct sk_buff *skb; __skb_queue_head_init(&queue); + spin_lock_bh(&trans->queue_lock); skb_queue_splice_init(&trans->queue, &queue); + spin_unlock_bh(&trans->queue_lock); + local_bh_disable(); while ((skb = __skb_dequeue(&queue))) XFRM_TRANS_SKB_CB(skb)->finish(XFRM_TRANS_SKB_CB(skb)->net, NULL, skb); + local_bh_enable(); } int xfrm_trans_queue_net(struct net *net, struct sk_buff *skb, @@ -789,8 +797,10 @@ int xfrm_trans_queue_net(struct net *net, struct sk_buff *skb, XFRM_TRANS_SKB_CB(skb)->finish = finish; XFRM_TRANS_SKB_CB(skb)->net = net; + spin_lock_bh(&trans->queue_lock); __skb_queue_tail(&trans->queue, skb); - tasklet_schedule(&trans->tasklet); + spin_unlock_bh(&trans->queue_lock); + schedule_work(&trans->work); return 0; } EXPORT_SYMBOL(xfrm_trans_queue_net); @@ -817,7 +827,8 @@ void __init xfrm_input_init(void) struct xfrm_trans_tasklet *trans; trans = &per_cpu(xfrm_trans_tasklet, i); + spin_lock_init(&trans->queue_lock); __skb_queue_head_init(&trans->queue); - tasklet_setup(&trans->tasklet, xfrm_trans_reinject); + INIT_WORK(&trans->work, xfrm_trans_reinject); } } diff --git a/net/xfrm/xfrm_interface.c b/net/xfrm/xfrm_interface.c index 5113fa0fbcee..5a67b120c4db 100644 --- a/net/xfrm/xfrm_interface.c +++ b/net/xfrm/xfrm_interface.c @@ -41,6 +41,7 @@ #include <net/addrconf.h> #include <net/xfrm.h> #include <net/net_namespace.h> +#include <net/dst_metadata.h> #include <net/netns/generic.h> #include <linux/etherdevice.h> @@ -56,6 +57,89 @@ static const struct net_device_ops xfrmi_netdev_ops; struct xfrmi_net { /* lists for storing interfaces in use */ struct xfrm_if __rcu *xfrmi[XFRMI_HASH_SIZE]; + struct xfrm_if __rcu *collect_md_xfrmi; +}; + +static const struct nla_policy xfrm_lwt_policy[LWT_XFRM_MAX + 1] = { + [LWT_XFRM_IF_ID] = NLA_POLICY_MIN(NLA_U32, 1), + [LWT_XFRM_LINK] = NLA_POLICY_MIN(NLA_U32, 1), +}; + +static void xfrmi_destroy_state(struct lwtunnel_state *lwt) +{ +} + +static int xfrmi_build_state(struct net *net, struct nlattr *nla, + unsigned int family, const void *cfg, + struct lwtunnel_state **ts, + struct netlink_ext_ack *extack) +{ + struct nlattr *tb[LWT_XFRM_MAX + 1]; + struct lwtunnel_state *new_state; + struct xfrm_md_info *info; + int ret; + + ret = nla_parse_nested(tb, LWT_XFRM_MAX, nla, xfrm_lwt_policy, extack); + if (ret < 0) + return ret; + + if (!tb[LWT_XFRM_IF_ID]) { + NL_SET_ERR_MSG(extack, "if_id must be set"); + return -EINVAL; + } + + new_state = lwtunnel_state_alloc(sizeof(*info)); + if (!new_state) { + NL_SET_ERR_MSG(extack, "failed to create encap info"); + return -ENOMEM; + } + + new_state->type = LWTUNNEL_ENCAP_XFRM; + + info = lwt_xfrm_info(new_state); + + info->if_id = nla_get_u32(tb[LWT_XFRM_IF_ID]); + + if (tb[LWT_XFRM_LINK]) + info->link = nla_get_u32(tb[LWT_XFRM_LINK]); + + *ts = new_state; + return 0; +} + +static int xfrmi_fill_encap_info(struct sk_buff *skb, + struct lwtunnel_state *lwt) +{ + struct xfrm_md_info *info = lwt_xfrm_info(lwt); + + if (nla_put_u32(skb, LWT_XFRM_IF_ID, info->if_id) || + (info->link && nla_put_u32(skb, LWT_XFRM_LINK, info->link))) + return -EMSGSIZE; + + return 0; +} + +static int xfrmi_encap_nlsize(struct lwtunnel_state *lwtstate) +{ + return nla_total_size(sizeof(u32)) + /* LWT_XFRM_IF_ID */ + nla_total_size(sizeof(u32)); /* LWT_XFRM_LINK */ +} + +static int xfrmi_encap_cmp(struct lwtunnel_state *a, struct lwtunnel_state *b) +{ + struct xfrm_md_info *a_info = lwt_xfrm_info(a); + struct xfrm_md_info *b_info = lwt_xfrm_info(b); + + return memcmp(a_info, b_info, sizeof(*a_info)); +} + +static const struct lwtunnel_encap_ops xfrmi_encap_ops = { + .build_state = xfrmi_build_state, + .destroy_state = xfrmi_destroy_state, + .fill_encap = xfrmi_fill_encap_info, + .get_encap_size = xfrmi_encap_nlsize, + .cmp_encap = xfrmi_encap_cmp, + .owner = THIS_MODULE, }; #define for_each_xfrmi_rcu(start, xi) \ @@ -77,17 +161,23 @@ static struct xfrm_if *xfrmi_lookup(struct net *net, struct xfrm_state *x) return xi; } + xi = rcu_dereference(xfrmn->collect_md_xfrmi); + if (xi && (xi->dev->flags & IFF_UP)) + return xi; + return NULL; } -static struct xfrm_if *xfrmi_decode_session(struct sk_buff *skb, - unsigned short family) +static bool xfrmi_decode_session(struct sk_buff *skb, + unsigned short family, + struct xfrm_if_decode_session_result *res) { struct net_device *dev; + struct xfrm_if *xi; int ifindex = 0; if (!secpath_exists(skb) || !skb->dev) - return NULL; + return false; switch (family) { case AF_INET6: @@ -107,11 +197,18 @@ static struct xfrm_if *xfrmi_decode_session(struct sk_buff *skb, } if (!dev || !(dev->flags & IFF_UP)) - return NULL; + return false; if (dev->netdev_ops != &xfrmi_netdev_ops) - return NULL; + return false; - return netdev_priv(dev); + xi = netdev_priv(dev); + res->net = xi->net; + + if (xi->p.collect_md) + res->if_id = xfrm_input_state(skb)->if_id; + else + res->if_id = xi->p.if_id; + return true; } static void xfrmi_link(struct xfrmi_net *xfrmn, struct xfrm_if *xi) @@ -157,7 +254,10 @@ static int xfrmi_create(struct net_device *dev) if (err < 0) goto out; - xfrmi_link(xfrmn, xi); + if (xi->p.collect_md) + rcu_assign_pointer(xfrmn->collect_md_xfrmi, xi); + else + xfrmi_link(xfrmn, xi); return 0; @@ -185,7 +285,10 @@ static void xfrmi_dev_uninit(struct net_device *dev) struct xfrm_if *xi = netdev_priv(dev); struct xfrmi_net *xfrmn = net_generic(xi->net, xfrmi_net_id); - xfrmi_unlink(xfrmn, xi); + if (xi->p.collect_md) + RCU_INIT_POINTER(xfrmn->collect_md_xfrmi, NULL); + else + xfrmi_unlink(xfrmn, xi); } static void xfrmi_scrub_packet(struct sk_buff *skb, bool xnet) @@ -214,6 +317,7 @@ static int xfrmi_rcv_cb(struct sk_buff *skb, int err) struct xfrm_state *x; struct xfrm_if *xi; bool xnet; + int link; if (err && !secpath_exists(skb)) return 0; @@ -224,6 +328,7 @@ static int xfrmi_rcv_cb(struct sk_buff *skb, int err) if (!xi) return 1; + link = skb->dev->ifindex; dev = xi->dev; skb->dev = dev; @@ -254,6 +359,17 @@ static int xfrmi_rcv_cb(struct sk_buff *skb, int err) } xfrmi_scrub_packet(skb, xnet); + if (xi->p.collect_md) { + struct metadata_dst *md_dst; + + md_dst = metadata_dst_alloc(0, METADATA_XFRM, GFP_ATOMIC); + if (!md_dst) + return -ENOMEM; + + md_dst->u.xfrm_info.if_id = x->if_id; + md_dst->u.xfrm_info.link = link; + skb_dst_set(skb, (struct dst_entry *)md_dst); + } dev_sw_netstats_rx_add(dev, skb->len); return 0; @@ -269,10 +385,23 @@ xfrmi_xmit2(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) struct net_device *tdev; struct xfrm_state *x; int err = -1; + u32 if_id; int mtu; + if (xi->p.collect_md) { + struct xfrm_md_info *md_info = skb_xfrm_md_info(skb); + + if (unlikely(!md_info)) + return -EINVAL; + + if_id = md_info->if_id; + fl->flowi_oif = md_info->link; + } else { + if_id = xi->p.if_id; + } + dst_hold(dst); - dst = xfrm_lookup_with_ifid(xi->net, dst, fl, NULL, 0, xi->p.if_id); + dst = xfrm_lookup_with_ifid(xi->net, dst, fl, NULL, 0, if_id); if (IS_ERR(dst)) { err = PTR_ERR(dst); dst = NULL; @@ -283,7 +412,7 @@ xfrmi_xmit2(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) if (!x) goto tx_err_link_failure; - if (x->if_id != xi->p.if_id) + if (x->if_id != if_id) goto tx_err_link_failure; tdev = dst->dev; @@ -633,6 +762,9 @@ static void xfrmi_netlink_parms(struct nlattr *data[], if (data[IFLA_XFRM_IF_ID]) parms->if_id = nla_get_u32(data[IFLA_XFRM_IF_ID]); + + if (data[IFLA_XFRM_COLLECT_METADATA]) + parms->collect_md = true; } static int xfrmi_newlink(struct net *src_net, struct net_device *dev, @@ -645,14 +777,27 @@ static int xfrmi_newlink(struct net *src_net, struct net_device *dev, int err; xfrmi_netlink_parms(data, &p); - if (!p.if_id) { - NL_SET_ERR_MSG(extack, "if_id must be non zero"); - return -EINVAL; - } + if (p.collect_md) { + struct xfrmi_net *xfrmn = net_generic(net, xfrmi_net_id); - xi = xfrmi_locate(net, &p); - if (xi) - return -EEXIST; + if (p.link || p.if_id) { + NL_SET_ERR_MSG(extack, "link and if_id must be zero"); + return -EINVAL; + } + + if (rtnl_dereference(xfrmn->collect_md_xfrmi)) + return -EEXIST; + + } else { + if (!p.if_id) { + NL_SET_ERR_MSG(extack, "if_id must be non zero"); + return -EINVAL; + } + + xi = xfrmi_locate(net, &p); + if (xi) + return -EEXIST; + } xi = netdev_priv(dev); xi->p = p; @@ -682,12 +827,22 @@ static int xfrmi_changelink(struct net_device *dev, struct nlattr *tb[], return -EINVAL; } + if (p.collect_md) { + NL_SET_ERR_MSG(extack, "collect_md can't be changed"); + return -EINVAL; + } + xi = xfrmi_locate(net, &p); if (!xi) { xi = netdev_priv(dev); } else { if (xi->dev != dev) return -EEXIST; + if (xi->p.collect_md) { + NL_SET_ERR_MSG(extack, + "device can't be changed to collect_md"); + return -EINVAL; + } } return xfrmi_update(xi, &p); @@ -700,6 +855,8 @@ static size_t xfrmi_get_size(const struct net_device *dev) nla_total_size(4) + /* IFLA_XFRM_IF_ID */ nla_total_size(4) + + /* IFLA_XFRM_COLLECT_METADATA */ + nla_total_size(0) + 0; } @@ -709,7 +866,8 @@ static int xfrmi_fill_info(struct sk_buff *skb, const struct net_device *dev) struct xfrm_if_parms *parm = &xi->p; if (nla_put_u32(skb, IFLA_XFRM_LINK, parm->link) || - nla_put_u32(skb, IFLA_XFRM_IF_ID, parm->if_id)) + nla_put_u32(skb, IFLA_XFRM_IF_ID, parm->if_id) || + (xi->p.collect_md && nla_put_flag(skb, IFLA_XFRM_COLLECT_METADATA))) goto nla_put_failure; return 0; @@ -725,8 +883,10 @@ static struct net *xfrmi_get_link_net(const struct net_device *dev) } static const struct nla_policy xfrmi_policy[IFLA_XFRM_MAX + 1] = { - [IFLA_XFRM_LINK] = { .type = NLA_U32 }, - [IFLA_XFRM_IF_ID] = { .type = NLA_U32 }, + [IFLA_XFRM_UNSPEC] = { .strict_start_type = IFLA_XFRM_COLLECT_METADATA }, + [IFLA_XFRM_LINK] = { .type = NLA_U32 }, + [IFLA_XFRM_IF_ID] = { .type = NLA_U32 }, + [IFLA_XFRM_COLLECT_METADATA] = { .type = NLA_FLAG }, }; static struct rtnl_link_ops xfrmi_link_ops __read_mostly = { @@ -762,6 +922,9 @@ static void __net_exit xfrmi_exit_batch_net(struct list_head *net_exit_list) xip = &xi->next) unregister_netdevice_queue(xi->dev, &list); } + xi = rtnl_dereference(xfrmn->collect_md_xfrmi); + if (xi) + unregister_netdevice_queue(xi->dev, &list); } unregister_netdevice_many(&list); rtnl_unlock(); @@ -999,6 +1162,8 @@ static int __init xfrmi_init(void) if (err < 0) goto rtnl_link_failed; + lwtunnel_encap_add_ops(&xfrmi_encap_ops, LWTUNNEL_ENCAP_XFRM); + xfrm_if_register_cb(&xfrm_if_cb); return err; @@ -1017,6 +1182,7 @@ pernet_dev_failed: static void __exit xfrmi_fini(void) { xfrm_if_unregister_cb(); + lwtunnel_encap_del_ops(&xfrmi_encap_ops, LWTUNNEL_ENCAP_XFRM); rtnl_link_unregister(&xfrmi_link_ops); xfrmi4_fini(); xfrmi6_fini(); diff --git a/net/xfrm/xfrm_ipcomp.c b/net/xfrm/xfrm_ipcomp.c index cb40ff0ff28d..80143360bf09 100644 --- a/net/xfrm/xfrm_ipcomp.c +++ b/net/xfrm/xfrm_ipcomp.c @@ -203,6 +203,7 @@ static void ipcomp_free_scratches(void) vfree(*per_cpu_ptr(scratches, i)); free_percpu(scratches); + ipcomp_scratches = NULL; } static void * __percpu *ipcomp_alloc_scratches(void) @@ -325,18 +326,22 @@ void ipcomp_destroy(struct xfrm_state *x) } EXPORT_SYMBOL_GPL(ipcomp_destroy); -int ipcomp_init_state(struct xfrm_state *x) +int ipcomp_init_state(struct xfrm_state *x, struct netlink_ext_ack *extack) { int err; struct ipcomp_data *ipcd; struct xfrm_algo_desc *calg_desc; err = -EINVAL; - if (!x->calg) + if (!x->calg) { + NL_SET_ERR_MSG(extack, "Missing required compression algorithm"); goto out; + } - if (x->encap) + if (x->encap) { + NL_SET_ERR_MSG(extack, "IPComp is not compatible with encapsulation"); goto out; + } err = -ENOMEM; ipcd = kzalloc(sizeof(*ipcd), GFP_KERNEL); diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index cc6ab79609e2..e392d8d05e0c 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -1889,7 +1889,7 @@ EXPORT_SYMBOL(xfrm_policy_walk_done); */ static int xfrm_policy_match(const struct xfrm_policy *pol, const struct flowi *fl, - u8 type, u16 family, int dir, u32 if_id) + u8 type, u16 family, u32 if_id) { const struct xfrm_selector *sel = &pol->selector; int ret = -ESRCH; @@ -2014,7 +2014,7 @@ static struct xfrm_policy * __xfrm_policy_eval_candidates(struct hlist_head *chain, struct xfrm_policy *prefer, const struct flowi *fl, - u8 type, u16 family, int dir, u32 if_id) + u8 type, u16 family, u32 if_id) { u32 priority = prefer ? prefer->priority : ~0u; struct xfrm_policy *pol; @@ -2028,7 +2028,7 @@ __xfrm_policy_eval_candidates(struct hlist_head *chain, if (pol->priority > priority) break; - err = xfrm_policy_match(pol, fl, type, family, dir, if_id); + err = xfrm_policy_match(pol, fl, type, family, if_id); if (err) { if (err != -ESRCH) return ERR_PTR(err); @@ -2053,7 +2053,7 @@ static struct xfrm_policy * xfrm_policy_eval_candidates(struct xfrm_pol_inexact_candidates *cand, struct xfrm_policy *prefer, const struct flowi *fl, - u8 type, u16 family, int dir, u32 if_id) + u8 type, u16 family, u32 if_id) { struct xfrm_policy *tmp; int i; @@ -2061,8 +2061,7 @@ xfrm_policy_eval_candidates(struct xfrm_pol_inexact_candidates *cand, for (i = 0; i < ARRAY_SIZE(cand->res); i++) { tmp = __xfrm_policy_eval_candidates(cand->res[i], prefer, - fl, type, family, dir, - if_id); + fl, type, family, if_id); if (!tmp) continue; @@ -2101,7 +2100,7 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type, ret = NULL; hlist_for_each_entry_rcu(pol, chain, bydst) { - err = xfrm_policy_match(pol, fl, type, family, dir, if_id); + err = xfrm_policy_match(pol, fl, type, family, if_id); if (err) { if (err == -ESRCH) continue; @@ -2120,7 +2119,7 @@ static struct xfrm_policy *xfrm_policy_lookup_bytype(struct net *net, u8 type, goto skip_inexact; pol = xfrm_policy_eval_candidates(&cand, ret, fl, type, - family, dir, if_id); + family, if_id); if (pol) { ret = pol; if (IS_ERR(pol)) @@ -3516,17 +3515,17 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb, int xerr_idx = -1; const struct xfrm_if_cb *ifcb; struct sec_path *sp; - struct xfrm_if *xi; u32 if_id = 0; rcu_read_lock(); ifcb = xfrm_if_get_cb(); if (ifcb) { - xi = ifcb->decode_session(skb, family); - if (xi) { - if_id = xi->p.if_id; - net = xi->net; + struct xfrm_if_decode_session_result r; + + if (ifcb->decode_session(skb, family, &r)) { + if_id = r.if_id; + net = r.net; } } rcu_read_unlock(); diff --git a/net/xfrm/xfrm_replay.c b/net/xfrm/xfrm_replay.c index 9277d81b344c..9f4d42eb090f 100644 --- a/net/xfrm/xfrm_replay.c +++ b/net/xfrm/xfrm_replay.c @@ -766,18 +766,22 @@ int xfrm_replay_overflow(struct xfrm_state *x, struct sk_buff *skb) } #endif -int xfrm_init_replay(struct xfrm_state *x) +int xfrm_init_replay(struct xfrm_state *x, struct netlink_ext_ack *extack) { struct xfrm_replay_state_esn *replay_esn = x->replay_esn; if (replay_esn) { if (replay_esn->replay_window > - replay_esn->bmp_len * sizeof(__u32) * 8) + replay_esn->bmp_len * sizeof(__u32) * 8) { + NL_SET_ERR_MSG(extack, "ESN replay window is too large for the chosen bitmap size"); return -EINVAL; + } if (x->props.flags & XFRM_STATE_ESN) { - if (replay_esn->replay_window == 0) + if (replay_esn->replay_window == 0) { + NL_SET_ERR_MSG(extack, "ESN replay window must be > 0"); return -EINVAL; + } x->repl_mode = XFRM_REPLAY_MODE_ESN; } else { x->repl_mode = XFRM_REPLAY_MODE_BMP; diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c index 91c32a3b6924..3d2fe7712ac5 100644 --- a/net/xfrm/xfrm_state.c +++ b/net/xfrm/xfrm_state.c @@ -2072,7 +2072,7 @@ int xfrm_alloc_spi(struct xfrm_state *x, u32 low, u32 high) } else { u32 spi = 0; for (h = 0; h < high-low+1; h++) { - spi = low + prandom_u32()%(high-low+1); + spi = low + prandom_u32_max(high - low + 1); x0 = xfrm_state_lookup(net, mark, &x->id.daddr, htonl(spi), x->id.proto, x->props.family); if (x0 == NULL) { newspi = htonl(spi); @@ -2611,7 +2611,8 @@ u32 xfrm_state_mtu(struct xfrm_state *x, int mtu) } EXPORT_SYMBOL_GPL(xfrm_state_mtu); -int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload) +int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload, + struct netlink_ext_ack *extack) { const struct xfrm_mode *inner_mode; const struct xfrm_mode *outer_mode; @@ -2626,12 +2627,16 @@ int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload) if (x->sel.family != AF_UNSPEC) { inner_mode = xfrm_get_mode(x->props.mode, x->sel.family); - if (inner_mode == NULL) + if (inner_mode == NULL) { + NL_SET_ERR_MSG(extack, "Requested mode not found"); goto error; + } if (!(inner_mode->flags & XFRM_MODE_FLAG_TUNNEL) && - family != x->sel.family) + family != x->sel.family) { + NL_SET_ERR_MSG(extack, "Only tunnel modes can accommodate a change of family"); goto error; + } x->inner_mode = *inner_mode; } else { @@ -2639,11 +2644,15 @@ int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload) int iafamily = AF_INET; inner_mode = xfrm_get_mode(x->props.mode, x->props.family); - if (inner_mode == NULL) + if (inner_mode == NULL) { + NL_SET_ERR_MSG(extack, "Requested mode not found"); goto error; + } - if (!(inner_mode->flags & XFRM_MODE_FLAG_TUNNEL)) + if (!(inner_mode->flags & XFRM_MODE_FLAG_TUNNEL)) { + NL_SET_ERR_MSG(extack, "Only tunnel modes can accommodate an AF_UNSPEC selector"); goto error; + } x->inner_mode = *inner_mode; @@ -2658,24 +2667,27 @@ int __xfrm_init_state(struct xfrm_state *x, bool init_replay, bool offload) } x->type = xfrm_get_type(x->id.proto, family); - if (x->type == NULL) + if (x->type == NULL) { + NL_SET_ERR_MSG(extack, "Requested type not found"); goto error; + } x->type_offload = xfrm_get_type_offload(x->id.proto, family, offload); - err = x->type->init_state(x); + err = x->type->init_state(x, extack); if (err) goto error; outer_mode = xfrm_get_mode(x->props.mode, family); if (!outer_mode) { + NL_SET_ERR_MSG(extack, "Requested mode not found"); err = -EPROTONOSUPPORT; goto error; } x->outer_mode = *outer_mode; if (init_replay) { - err = xfrm_init_replay(x); + err = xfrm_init_replay(x, extack); if (err) goto error; } @@ -2690,7 +2702,7 @@ int xfrm_init_state(struct xfrm_state *x) { int err; - err = __xfrm_init_state(x, true, false); + err = __xfrm_init_state(x, true, false, NULL); if (!err) x->km.state = XFRM_STATE_VALID; diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 2ff017117730..e73f9efc54c1 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -35,7 +35,8 @@ #endif #include <asm/unaligned.h> -static int verify_one_alg(struct nlattr **attrs, enum xfrm_attr_type_t type) +static int verify_one_alg(struct nlattr **attrs, enum xfrm_attr_type_t type, + struct netlink_ext_ack *extack) { struct nlattr *rt = attrs[type]; struct xfrm_algo *algp; @@ -44,8 +45,10 @@ static int verify_one_alg(struct nlattr **attrs, enum xfrm_attr_type_t type) return 0; algp = nla_data(rt); - if (nla_len(rt) < (int)xfrm_alg_len(algp)) + if (nla_len(rt) < (int)xfrm_alg_len(algp)) { + NL_SET_ERR_MSG(extack, "Invalid AUTH/CRYPT/COMP attribute length"); return -EINVAL; + } switch (type) { case XFRMA_ALG_AUTH: @@ -54,6 +57,7 @@ static int verify_one_alg(struct nlattr **attrs, enum xfrm_attr_type_t type) break; default: + NL_SET_ERR_MSG(extack, "Invalid algorithm attribute type"); return -EINVAL; } @@ -61,7 +65,8 @@ static int verify_one_alg(struct nlattr **attrs, enum xfrm_attr_type_t type) return 0; } -static int verify_auth_trunc(struct nlattr **attrs) +static int verify_auth_trunc(struct nlattr **attrs, + struct netlink_ext_ack *extack) { struct nlattr *rt = attrs[XFRMA_ALG_AUTH_TRUNC]; struct xfrm_algo_auth *algp; @@ -70,14 +75,16 @@ static int verify_auth_trunc(struct nlattr **attrs) return 0; algp = nla_data(rt); - if (nla_len(rt) < (int)xfrm_alg_auth_len(algp)) + if (nla_len(rt) < (int)xfrm_alg_auth_len(algp)) { + NL_SET_ERR_MSG(extack, "Invalid AUTH_TRUNC attribute length"); return -EINVAL; + } algp->alg_name[sizeof(algp->alg_name) - 1] = '\0'; return 0; } -static int verify_aead(struct nlattr **attrs) +static int verify_aead(struct nlattr **attrs, struct netlink_ext_ack *extack) { struct nlattr *rt = attrs[XFRMA_ALG_AEAD]; struct xfrm_algo_aead *algp; @@ -86,8 +93,10 @@ static int verify_aead(struct nlattr **attrs) return 0; algp = nla_data(rt); - if (nla_len(rt) < (int)aead_len(algp)) + if (nla_len(rt) < (int)aead_len(algp)) { + NL_SET_ERR_MSG(extack, "Invalid AEAD attribute length"); return -EINVAL; + } algp->alg_name[sizeof(algp->alg_name) - 1] = '\0'; return 0; @@ -102,7 +111,7 @@ static void verify_one_addr(struct nlattr **attrs, enum xfrm_attr_type_t type, *addrp = nla_data(rt); } -static inline int verify_sec_ctx_len(struct nlattr **attrs) +static inline int verify_sec_ctx_len(struct nlattr **attrs, struct netlink_ext_ack *extack) { struct nlattr *rt = attrs[XFRMA_SEC_CTX]; struct xfrm_user_sec_ctx *uctx; @@ -112,42 +121,59 @@ static inline int verify_sec_ctx_len(struct nlattr **attrs) uctx = nla_data(rt); if (uctx->len > nla_len(rt) || - uctx->len != (sizeof(struct xfrm_user_sec_ctx) + uctx->ctx_len)) + uctx->len != (sizeof(struct xfrm_user_sec_ctx) + uctx->ctx_len)) { + NL_SET_ERR_MSG(extack, "Invalid security context length"); return -EINVAL; + } return 0; } static inline int verify_replay(struct xfrm_usersa_info *p, - struct nlattr **attrs) + struct nlattr **attrs, + struct netlink_ext_ack *extack) { struct nlattr *rt = attrs[XFRMA_REPLAY_ESN_VAL]; struct xfrm_replay_state_esn *rs; - if (!rt) - return (p->flags & XFRM_STATE_ESN) ? -EINVAL : 0; + if (!rt) { + if (p->flags & XFRM_STATE_ESN) { + NL_SET_ERR_MSG(extack, "Missing required attribute for ESN"); + return -EINVAL; + } + return 0; + } rs = nla_data(rt); - if (rs->bmp_len > XFRMA_REPLAY_ESN_MAX / sizeof(rs->bmp[0]) / 8) + if (rs->bmp_len > XFRMA_REPLAY_ESN_MAX / sizeof(rs->bmp[0]) / 8) { + NL_SET_ERR_MSG(extack, "ESN bitmap length must be <= 128"); return -EINVAL; + } if (nla_len(rt) < (int)xfrm_replay_state_esn_len(rs) && - nla_len(rt) != sizeof(*rs)) + nla_len(rt) != sizeof(*rs)) { + NL_SET_ERR_MSG(extack, "ESN attribute is too short to fit the full bitmap length"); return -EINVAL; + } /* As only ESP and AH support ESN feature. */ - if ((p->id.proto != IPPROTO_ESP) && (p->id.proto != IPPROTO_AH)) + if ((p->id.proto != IPPROTO_ESP) && (p->id.proto != IPPROTO_AH)) { + NL_SET_ERR_MSG(extack, "ESN only supported for ESP and AH"); return -EINVAL; + } - if (p->replay_window != 0) + if (p->replay_window != 0) { + NL_SET_ERR_MSG(extack, "ESN not compatible with legacy replay_window"); return -EINVAL; + } return 0; } static int verify_newsa_info(struct xfrm_usersa_info *p, - struct nlattr **attrs) + struct nlattr **attrs, + struct netlink_ext_ack *extack) { int err; @@ -161,10 +187,12 @@ static int verify_newsa_info(struct xfrm_usersa_info *p, break; #else err = -EAFNOSUPPORT; + NL_SET_ERR_MSG(extack, "IPv6 support disabled"); goto out; #endif default: + NL_SET_ERR_MSG(extack, "Invalid address family"); goto out; } @@ -173,65 +201,98 @@ static int verify_newsa_info(struct xfrm_usersa_info *p, break; case AF_INET: - if (p->sel.prefixlen_d > 32 || p->sel.prefixlen_s > 32) + if (p->sel.prefixlen_d > 32 || p->sel.prefixlen_s > 32) { + NL_SET_ERR_MSG(extack, "Invalid prefix length in selector (must be <= 32 for IPv4)"); goto out; + } break; case AF_INET6: #if IS_ENABLED(CONFIG_IPV6) - if (p->sel.prefixlen_d > 128 || p->sel.prefixlen_s > 128) + if (p->sel.prefixlen_d > 128 || p->sel.prefixlen_s > 128) { + NL_SET_ERR_MSG(extack, "Invalid prefix length in selector (must be <= 128 for IPv6)"); goto out; + } break; #else + NL_SET_ERR_MSG(extack, "IPv6 support disabled"); err = -EAFNOSUPPORT; goto out; #endif default: + NL_SET_ERR_MSG(extack, "Invalid address family in selector"); goto out; } err = -EINVAL; switch (p->id.proto) { case IPPROTO_AH: - if ((!attrs[XFRMA_ALG_AUTH] && - !attrs[XFRMA_ALG_AUTH_TRUNC]) || - attrs[XFRMA_ALG_AEAD] || + if (!attrs[XFRMA_ALG_AUTH] && + !attrs[XFRMA_ALG_AUTH_TRUNC]) { + NL_SET_ERR_MSG(extack, "Missing required attribute for AH: AUTH_TRUNC or AUTH"); + goto out; + } + + if (attrs[XFRMA_ALG_AEAD] || attrs[XFRMA_ALG_CRYPT] || attrs[XFRMA_ALG_COMP] || - attrs[XFRMA_TFCPAD]) + attrs[XFRMA_TFCPAD]) { + NL_SET_ERR_MSG(extack, "Invalid attributes for AH: AEAD, CRYPT, COMP, TFCPAD"); goto out; + } break; case IPPROTO_ESP: - if (attrs[XFRMA_ALG_COMP]) + if (attrs[XFRMA_ALG_COMP]) { + NL_SET_ERR_MSG(extack, "Invalid attribute for ESP: COMP"); goto out; + } + if (!attrs[XFRMA_ALG_AUTH] && !attrs[XFRMA_ALG_AUTH_TRUNC] && !attrs[XFRMA_ALG_CRYPT] && - !attrs[XFRMA_ALG_AEAD]) + !attrs[XFRMA_ALG_AEAD]) { + NL_SET_ERR_MSG(extack, "Missing required attribute for ESP: at least one of AUTH, AUTH_TRUNC, CRYPT, AEAD"); goto out; + } + if ((attrs[XFRMA_ALG_AUTH] || attrs[XFRMA_ALG_AUTH_TRUNC] || attrs[XFRMA_ALG_CRYPT]) && - attrs[XFRMA_ALG_AEAD]) + attrs[XFRMA_ALG_AEAD]) { + NL_SET_ERR_MSG(extack, "Invalid attribute combination for ESP: AEAD can't be used with AUTH, AUTH_TRUNC, CRYPT"); goto out; + } + if (attrs[XFRMA_TFCPAD] && - p->mode != XFRM_MODE_TUNNEL) + p->mode != XFRM_MODE_TUNNEL) { + NL_SET_ERR_MSG(extack, "TFC padding can only be used in tunnel mode"); goto out; + } break; case IPPROTO_COMP: - if (!attrs[XFRMA_ALG_COMP] || - attrs[XFRMA_ALG_AEAD] || + if (!attrs[XFRMA_ALG_COMP]) { + NL_SET_ERR_MSG(extack, "Missing required attribute for COMP: COMP"); + goto out; + } + + if (attrs[XFRMA_ALG_AEAD] || attrs[XFRMA_ALG_AUTH] || attrs[XFRMA_ALG_AUTH_TRUNC] || attrs[XFRMA_ALG_CRYPT] || - attrs[XFRMA_TFCPAD] || - (ntohl(p->id.spi) >= 0x10000)) + attrs[XFRMA_TFCPAD]) { + NL_SET_ERR_MSG(extack, "Invalid attributes for COMP: AEAD, AUTH, AUTH_TRUNC, CRYPT, TFCPAD"); goto out; + } + + if (ntohl(p->id.spi) >= 0x10000) { + NL_SET_ERR_MSG(extack, "SPI is too large for COMP (must be < 0x10000)"); + goto out; + } break; #if IS_ENABLED(CONFIG_IPV6) @@ -244,29 +305,36 @@ static int verify_newsa_info(struct xfrm_usersa_info *p, attrs[XFRMA_ALG_CRYPT] || attrs[XFRMA_ENCAP] || attrs[XFRMA_SEC_CTX] || - attrs[XFRMA_TFCPAD] || - !attrs[XFRMA_COADDR]) + attrs[XFRMA_TFCPAD]) { + NL_SET_ERR_MSG(extack, "Invalid attributes for DSTOPTS/ROUTING"); goto out; + } + + if (!attrs[XFRMA_COADDR]) { + NL_SET_ERR_MSG(extack, "Missing required COADDR attribute for DSTOPTS/ROUTING"); + goto out; + } break; #endif default: + NL_SET_ERR_MSG(extack, "Unsupported protocol"); goto out; } - if ((err = verify_aead(attrs))) + if ((err = verify_aead(attrs, extack))) goto out; - if ((err = verify_auth_trunc(attrs))) + if ((err = verify_auth_trunc(attrs, extack))) goto out; - if ((err = verify_one_alg(attrs, XFRMA_ALG_AUTH))) + if ((err = verify_one_alg(attrs, XFRMA_ALG_AUTH, extack))) goto out; - if ((err = verify_one_alg(attrs, XFRMA_ALG_CRYPT))) + if ((err = verify_one_alg(attrs, XFRMA_ALG_CRYPT, extack))) goto out; - if ((err = verify_one_alg(attrs, XFRMA_ALG_COMP))) + if ((err = verify_one_alg(attrs, XFRMA_ALG_COMP, extack))) goto out; - if ((err = verify_sec_ctx_len(attrs))) + if ((err = verify_sec_ctx_len(attrs, extack))) goto out; - if ((err = verify_replay(p, attrs))) + if ((err = verify_replay(p, attrs, extack))) goto out; err = -EINVAL; @@ -278,14 +346,19 @@ static int verify_newsa_info(struct xfrm_usersa_info *p, break; default: + NL_SET_ERR_MSG(extack, "Unsupported mode"); goto out; } err = 0; - if (attrs[XFRMA_MTIMER_THRESH]) - if (!attrs[XFRMA_ENCAP]) + if (attrs[XFRMA_MTIMER_THRESH]) { + if (!attrs[XFRMA_ENCAP]) { + NL_SET_ERR_MSG(extack, "MTIMER_THRESH attribute can only be set on ENCAP states"); err = -EINVAL; + goto out; + } + } out: return err; @@ -293,7 +366,7 @@ out: static int attach_one_algo(struct xfrm_algo **algpp, u8 *props, struct xfrm_algo_desc *(*get_byname)(const char *, int), - struct nlattr *rta) + struct nlattr *rta, struct netlink_ext_ack *extack) { struct xfrm_algo *p, *ualg; struct xfrm_algo_desc *algo; @@ -304,8 +377,10 @@ static int attach_one_algo(struct xfrm_algo **algpp, u8 *props, ualg = nla_data(rta); algo = get_byname(ualg->alg_name, 1); - if (!algo) + if (!algo) { + NL_SET_ERR_MSG(extack, "Requested COMP algorithm not found"); return -ENOSYS; + } *props = algo->desc.sadb_alg_id; p = kmemdup(ualg, xfrm_alg_len(ualg), GFP_KERNEL); @@ -317,7 +392,8 @@ static int attach_one_algo(struct xfrm_algo **algpp, u8 *props, return 0; } -static int attach_crypt(struct xfrm_state *x, struct nlattr *rta) +static int attach_crypt(struct xfrm_state *x, struct nlattr *rta, + struct netlink_ext_ack *extack) { struct xfrm_algo *p, *ualg; struct xfrm_algo_desc *algo; @@ -328,8 +404,10 @@ static int attach_crypt(struct xfrm_state *x, struct nlattr *rta) ualg = nla_data(rta); algo = xfrm_ealg_get_byname(ualg->alg_name, 1); - if (!algo) + if (!algo) { + NL_SET_ERR_MSG(extack, "Requested CRYPT algorithm not found"); return -ENOSYS; + } x->props.ealgo = algo->desc.sadb_alg_id; p = kmemdup(ualg, xfrm_alg_len(ualg), GFP_KERNEL); @@ -343,7 +421,7 @@ static int attach_crypt(struct xfrm_state *x, struct nlattr *rta) } static int attach_auth(struct xfrm_algo_auth **algpp, u8 *props, - struct nlattr *rta) + struct nlattr *rta, struct netlink_ext_ack *extack) { struct xfrm_algo *ualg; struct xfrm_algo_auth *p; @@ -355,8 +433,10 @@ static int attach_auth(struct xfrm_algo_auth **algpp, u8 *props, ualg = nla_data(rta); algo = xfrm_aalg_get_byname(ualg->alg_name, 1); - if (!algo) + if (!algo) { + NL_SET_ERR_MSG(extack, "Requested AUTH algorithm not found"); return -ENOSYS; + } *props = algo->desc.sadb_alg_id; p = kmalloc(sizeof(*p) + (ualg->alg_key_len + 7) / 8, GFP_KERNEL); @@ -373,7 +453,7 @@ static int attach_auth(struct xfrm_algo_auth **algpp, u8 *props, } static int attach_auth_trunc(struct xfrm_algo_auth **algpp, u8 *props, - struct nlattr *rta) + struct nlattr *rta, struct netlink_ext_ack *extack) { struct xfrm_algo_auth *p, *ualg; struct xfrm_algo_desc *algo; @@ -384,10 +464,14 @@ static int attach_auth_trunc(struct xfrm_algo_auth **algpp, u8 *props, ualg = nla_data(rta); algo = xfrm_aalg_get_byname(ualg->alg_name, 1); - if (!algo) + if (!algo) { + NL_SET_ERR_MSG(extack, "Requested AUTH_TRUNC algorithm not found"); return -ENOSYS; - if (ualg->alg_trunc_len > algo->uinfo.auth.icv_fullbits) + } + if (ualg->alg_trunc_len > algo->uinfo.auth.icv_fullbits) { + NL_SET_ERR_MSG(extack, "Invalid length requested for truncated ICV"); return -EINVAL; + } *props = algo->desc.sadb_alg_id; p = kmemdup(ualg, xfrm_alg_auth_len(ualg), GFP_KERNEL); @@ -402,7 +486,8 @@ static int attach_auth_trunc(struct xfrm_algo_auth **algpp, u8 *props, return 0; } -static int attach_aead(struct xfrm_state *x, struct nlattr *rta) +static int attach_aead(struct xfrm_state *x, struct nlattr *rta, + struct netlink_ext_ack *extack) { struct xfrm_algo_aead *p, *ualg; struct xfrm_algo_desc *algo; @@ -413,8 +498,10 @@ static int attach_aead(struct xfrm_state *x, struct nlattr *rta) ualg = nla_data(rta); algo = xfrm_aead_get_byname(ualg->alg_name, ualg->alg_icv_len, 1); - if (!algo) + if (!algo) { + NL_SET_ERR_MSG(extack, "Requested AEAD algorithm not found"); return -ENOSYS; + } x->props.ealgo = algo->desc.sadb_alg_id; p = kmemdup(ualg, aead_len(ualg), GFP_KERNEL); @@ -579,7 +666,8 @@ static void xfrm_smark_init(struct nlattr **attrs, struct xfrm_mark *m) static struct xfrm_state *xfrm_state_construct(struct net *net, struct xfrm_usersa_info *p, struct nlattr **attrs, - int *errp) + int *errp, + struct netlink_ext_ack *extack) { struct xfrm_state *x = xfrm_state_alloc(net); int err = -ENOMEM; @@ -606,21 +694,21 @@ static struct xfrm_state *xfrm_state_construct(struct net *net, if (attrs[XFRMA_SA_EXTRA_FLAGS]) x->props.extra_flags = nla_get_u32(attrs[XFRMA_SA_EXTRA_FLAGS]); - if ((err = attach_aead(x, attrs[XFRMA_ALG_AEAD]))) + if ((err = attach_aead(x, attrs[XFRMA_ALG_AEAD], extack))) goto error; if ((err = attach_auth_trunc(&x->aalg, &x->props.aalgo, - attrs[XFRMA_ALG_AUTH_TRUNC]))) + attrs[XFRMA_ALG_AUTH_TRUNC], extack))) goto error; if (!x->props.aalgo) { if ((err = attach_auth(&x->aalg, &x->props.aalgo, - attrs[XFRMA_ALG_AUTH]))) + attrs[XFRMA_ALG_AUTH], extack))) goto error; } - if ((err = attach_crypt(x, attrs[XFRMA_ALG_CRYPT]))) + if ((err = attach_crypt(x, attrs[XFRMA_ALG_CRYPT], extack))) goto error; if ((err = attach_one_algo(&x->calg, &x->props.calgo, xfrm_calg_get_byname, - attrs[XFRMA_ALG_COMP]))) + attrs[XFRMA_ALG_COMP], extack))) goto error; if (attrs[XFRMA_TFCPAD]) @@ -633,7 +721,7 @@ static struct xfrm_state *xfrm_state_construct(struct net *net, if (attrs[XFRMA_IF_ID]) x->if_id = nla_get_u32(attrs[XFRMA_IF_ID]); - err = __xfrm_init_state(x, false, attrs[XFRMA_OFFLOAD_DEV]); + err = __xfrm_init_state(x, false, attrs[XFRMA_OFFLOAD_DEV], extack); if (err) goto error; @@ -653,7 +741,7 @@ static struct xfrm_state *xfrm_state_construct(struct net *net, /* sysctl_xfrm_aevent_etime is in 100ms units */ x->replay_maxage = (net->xfrm.sysctl_aevent_etime*HZ)/XFRM_AE_ETH_M; - if ((err = xfrm_init_replay(x))) + if ((err = xfrm_init_replay(x, extack))) goto error; /* override default values from above */ @@ -662,7 +750,8 @@ static struct xfrm_state *xfrm_state_construct(struct net *net, /* configure the hardware if offload is requested */ if (attrs[XFRMA_OFFLOAD_DEV]) { err = xfrm_dev_state_add(net, x, - nla_data(attrs[XFRMA_OFFLOAD_DEV])); + nla_data(attrs[XFRMA_OFFLOAD_DEV]), + extack); if (err) goto error; } @@ -678,7 +767,7 @@ error_no_put: } static int xfrm_add_sa(struct sk_buff *skb, struct nlmsghdr *nlh, - struct nlattr **attrs) + struct nlattr **attrs, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct xfrm_usersa_info *p = nlmsg_data(nlh); @@ -686,11 +775,11 @@ static int xfrm_add_sa(struct sk_buff *skb, struct nlmsghdr *nlh, int err; struct km_event c; - err = verify_newsa_info(p, attrs); + err = verify_newsa_info(p, attrs, extack); if (err) return err; - x = xfrm_state_construct(net, p, attrs, &err); + x = xfrm_state_construct(net, p, attrs, &err, extack); if (!x) return err; @@ -757,7 +846,7 @@ static struct xfrm_state *xfrm_user_state_lookup(struct net *net, } static int xfrm_del_sa(struct sk_buff *skb, struct nlmsghdr *nlh, - struct nlattr **attrs) + struct nlattr **attrs, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct xfrm_state *x; @@ -1254,7 +1343,8 @@ static int build_spdinfo(struct sk_buff *skb, struct net *net, } static int xfrm_set_spdinfo(struct sk_buff *skb, struct nlmsghdr *nlh, - struct nlattr **attrs) + struct nlattr **attrs, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct xfrmu_spdhthresh *thresh4 = NULL; @@ -1299,7 +1389,8 @@ static int xfrm_set_spdinfo(struct sk_buff *skb, struct nlmsghdr *nlh, } static int xfrm_get_spdinfo(struct sk_buff *skb, struct nlmsghdr *nlh, - struct nlattr **attrs) + struct nlattr **attrs, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct sk_buff *r_skb; @@ -1358,7 +1449,8 @@ static int build_sadinfo(struct sk_buff *skb, struct net *net, } static int xfrm_get_sadinfo(struct sk_buff *skb, struct nlmsghdr *nlh, - struct nlattr **attrs) + struct nlattr **attrs, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct sk_buff *r_skb; @@ -1378,7 +1470,7 @@ static int xfrm_get_sadinfo(struct sk_buff *skb, struct nlmsghdr *nlh, } static int xfrm_get_sa(struct sk_buff *skb, struct nlmsghdr *nlh, - struct nlattr **attrs) + struct nlattr **attrs, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct xfrm_usersa_id *p = nlmsg_data(nlh); @@ -1402,7 +1494,8 @@ out_noput: } static int xfrm_alloc_userspi(struct sk_buff *skb, struct nlmsghdr *nlh, - struct nlattr **attrs) + struct nlattr **attrs, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct xfrm_state *x; @@ -1477,7 +1570,7 @@ out_noput: return err; } -static int verify_policy_dir(u8 dir) +static int verify_policy_dir(u8 dir, struct netlink_ext_ack *extack) { switch (dir) { case XFRM_POLICY_IN: @@ -1486,13 +1579,14 @@ static int verify_policy_dir(u8 dir) break; default: + NL_SET_ERR_MSG(extack, "Invalid policy direction"); return -EINVAL; } return 0; } -static int verify_policy_type(u8 type) +static int verify_policy_type(u8 type, struct netlink_ext_ack *extack) { switch (type) { case XFRM_POLICY_TYPE_MAIN: @@ -1502,13 +1596,15 @@ static int verify_policy_type(u8 type) break; default: + NL_SET_ERR_MSG(extack, "Invalid policy type"); return -EINVAL; } return 0; } -static int verify_newpolicy_info(struct xfrm_userpolicy_info *p) +static int verify_newpolicy_info(struct xfrm_userpolicy_info *p, + struct netlink_ext_ack *extack) { int ret; @@ -1520,6 +1616,7 @@ static int verify_newpolicy_info(struct xfrm_userpolicy_info *p) break; default: + NL_SET_ERR_MSG(extack, "Invalid policy share"); return -EINVAL; } @@ -1529,35 +1626,44 @@ static int verify_newpolicy_info(struct xfrm_userpolicy_info *p) break; default: + NL_SET_ERR_MSG(extack, "Invalid policy action"); return -EINVAL; } switch (p->sel.family) { case AF_INET: - if (p->sel.prefixlen_d > 32 || p->sel.prefixlen_s > 32) + if (p->sel.prefixlen_d > 32 || p->sel.prefixlen_s > 32) { + NL_SET_ERR_MSG(extack, "Invalid prefix length in selector (must be <= 32 for IPv4)"); return -EINVAL; + } break; case AF_INET6: #if IS_ENABLED(CONFIG_IPV6) - if (p->sel.prefixlen_d > 128 || p->sel.prefixlen_s > 128) + if (p->sel.prefixlen_d > 128 || p->sel.prefixlen_s > 128) { + NL_SET_ERR_MSG(extack, "Invalid prefix length in selector (must be <= 128 for IPv6)"); return -EINVAL; + } break; #else + NL_SET_ERR_MSG(extack, "IPv6 support disabled"); return -EAFNOSUPPORT; #endif default: + NL_SET_ERR_MSG(extack, "Invalid selector family"); return -EINVAL; } - ret = verify_policy_dir(p->dir); + ret = verify_policy_dir(p->dir, extack); if (ret) return ret; - if (p->index && (xfrm_policy_id2dir(p->index) != p->dir)) + if (p->index && (xfrm_policy_id2dir(p->index) != p->dir)) { + NL_SET_ERR_MSG(extack, "Policy index doesn't match direction"); return -EINVAL; + } return 0; } @@ -1599,13 +1705,16 @@ static void copy_templates(struct xfrm_policy *xp, struct xfrm_user_tmpl *ut, } } -static int validate_tmpl(int nr, struct xfrm_user_tmpl *ut, u16 family) +static int validate_tmpl(int nr, struct xfrm_user_tmpl *ut, u16 family, + struct netlink_ext_ack *extack) { u16 prev_family; int i; - if (nr > XFRM_MAX_DEPTH) + if (nr > XFRM_MAX_DEPTH) { + NL_SET_ERR_MSG(extack, "Template count must be <= XFRM_MAX_DEPTH (" __stringify(XFRM_MAX_DEPTH) ")"); return -EINVAL; + } prev_family = family; @@ -1625,12 +1734,16 @@ static int validate_tmpl(int nr, struct xfrm_user_tmpl *ut, u16 family) case XFRM_MODE_BEET: break; default: - if (ut[i].family != prev_family) + if (ut[i].family != prev_family) { + NL_SET_ERR_MSG(extack, "Mode in template doesn't support a family change"); return -EINVAL; + } break; } - if (ut[i].mode >= XFRM_MODE_MAX) + if (ut[i].mode >= XFRM_MODE_MAX) { + NL_SET_ERR_MSG(extack, "Mode in template must be < XFRM_MODE_MAX (" __stringify(XFRM_MODE_MAX) ")"); return -EINVAL; + } prev_family = ut[i].family; @@ -1642,17 +1755,21 @@ static int validate_tmpl(int nr, struct xfrm_user_tmpl *ut, u16 family) break; #endif default: + NL_SET_ERR_MSG(extack, "Invalid family in template"); return -EINVAL; } - if (!xfrm_id_proto_valid(ut[i].id.proto)) + if (!xfrm_id_proto_valid(ut[i].id.proto)) { + NL_SET_ERR_MSG(extack, "Invalid XFRM protocol in template"); return -EINVAL; + } } return 0; } -static int copy_from_user_tmpl(struct xfrm_policy *pol, struct nlattr **attrs) +static int copy_from_user_tmpl(struct xfrm_policy *pol, struct nlattr **attrs, + struct netlink_ext_ack *extack) { struct nlattr *rt = attrs[XFRMA_TMPL]; @@ -1663,7 +1780,7 @@ static int copy_from_user_tmpl(struct xfrm_policy *pol, struct nlattr **attrs) int nr = nla_len(rt) / sizeof(*utmpl); int err; - err = validate_tmpl(nr, utmpl, pol->family); + err = validate_tmpl(nr, utmpl, pol->family, extack); if (err) return err; @@ -1672,7 +1789,8 @@ static int copy_from_user_tmpl(struct xfrm_policy *pol, struct nlattr **attrs) return 0; } -static int copy_from_user_policy_type(u8 *tp, struct nlattr **attrs) +static int copy_from_user_policy_type(u8 *tp, struct nlattr **attrs, + struct netlink_ext_ack *extack) { struct nlattr *rt = attrs[XFRMA_POLICY_TYPE]; struct xfrm_userpolicy_type *upt; @@ -1684,7 +1802,7 @@ static int copy_from_user_policy_type(u8 *tp, struct nlattr **attrs) type = upt->type; } - err = verify_policy_type(type); + err = verify_policy_type(type, extack); if (err) return err; @@ -1719,7 +1837,11 @@ static void copy_to_user_policy(struct xfrm_policy *xp, struct xfrm_userpolicy_i p->share = XFRM_SHARE_ANY; /* XXX xp->share */ } -static struct xfrm_policy *xfrm_policy_construct(struct net *net, struct xfrm_userpolicy_info *p, struct nlattr **attrs, int *errp) +static struct xfrm_policy *xfrm_policy_construct(struct net *net, + struct xfrm_userpolicy_info *p, + struct nlattr **attrs, + int *errp, + struct netlink_ext_ack *extack) { struct xfrm_policy *xp = xfrm_policy_alloc(net, GFP_KERNEL); int err; @@ -1731,11 +1853,11 @@ static struct xfrm_policy *xfrm_policy_construct(struct net *net, struct xfrm_us copy_from_user_policy(xp, p); - err = copy_from_user_policy_type(&xp->type, attrs); + err = copy_from_user_policy_type(&xp->type, attrs, extack); if (err) goto error; - if (!(err = copy_from_user_tmpl(xp, attrs))) + if (!(err = copy_from_user_tmpl(xp, attrs, extack))) err = copy_from_user_sec_ctx(xp, attrs); if (err) goto error; @@ -1754,7 +1876,8 @@ static struct xfrm_policy *xfrm_policy_construct(struct net *net, struct xfrm_us } static int xfrm_add_policy(struct sk_buff *skb, struct nlmsghdr *nlh, - struct nlattr **attrs) + struct nlattr **attrs, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct xfrm_userpolicy_info *p = nlmsg_data(nlh); @@ -1763,14 +1886,14 @@ static int xfrm_add_policy(struct sk_buff *skb, struct nlmsghdr *nlh, int err; int excl; - err = verify_newpolicy_info(p); + err = verify_newpolicy_info(p, extack); if (err) return err; - err = verify_sec_ctx_len(attrs); + err = verify_sec_ctx_len(attrs, extack); if (err) return err; - xp = xfrm_policy_construct(net, p, attrs, &err); + xp = xfrm_policy_construct(net, p, attrs, &err, extack); if (!xp) return err; @@ -2015,7 +2138,7 @@ static bool xfrm_userpolicy_is_valid(__u8 policy) } static int xfrm_set_default(struct sk_buff *skb, struct nlmsghdr *nlh, - struct nlattr **attrs) + struct nlattr **attrs, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct xfrm_userpolicy_default *up = nlmsg_data(nlh); @@ -2036,7 +2159,7 @@ static int xfrm_set_default(struct sk_buff *skb, struct nlmsghdr *nlh, } static int xfrm_get_default(struct sk_buff *skb, struct nlmsghdr *nlh, - struct nlattr **attrs) + struct nlattr **attrs, struct netlink_ext_ack *extack) { struct sk_buff *r_skb; struct nlmsghdr *r_nlh; @@ -2066,7 +2189,8 @@ static int xfrm_get_default(struct sk_buff *skb, struct nlmsghdr *nlh, } static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh, - struct nlattr **attrs) + struct nlattr **attrs, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct xfrm_policy *xp; @@ -2081,11 +2205,11 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh, p = nlmsg_data(nlh); delete = nlh->nlmsg_type == XFRM_MSG_DELPOLICY; - err = copy_from_user_policy_type(&type, attrs); + err = copy_from_user_policy_type(&type, attrs, extack); if (err) return err; - err = verify_policy_dir(p->dir); + err = verify_policy_dir(p->dir, extack); if (err) return err; @@ -2101,7 +2225,7 @@ static int xfrm_get_policy(struct sk_buff *skb, struct nlmsghdr *nlh, struct nlattr *rt = attrs[XFRMA_SEC_CTX]; struct xfrm_sec_ctx *ctx; - err = verify_sec_ctx_len(attrs); + err = verify_sec_ctx_len(attrs, extack); if (err) return err; @@ -2149,7 +2273,8 @@ out: } static int xfrm_flush_sa(struct sk_buff *skb, struct nlmsghdr *nlh, - struct nlattr **attrs) + struct nlattr **attrs, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct km_event c; @@ -2249,7 +2374,7 @@ out_cancel: } static int xfrm_get_ae(struct sk_buff *skb, struct nlmsghdr *nlh, - struct nlattr **attrs) + struct nlattr **attrs, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct xfrm_state *x; @@ -2293,7 +2418,7 @@ static int xfrm_get_ae(struct sk_buff *skb, struct nlmsghdr *nlh, } static int xfrm_new_ae(struct sk_buff *skb, struct nlmsghdr *nlh, - struct nlattr **attrs) + struct nlattr **attrs, struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct xfrm_state *x; @@ -2344,14 +2469,15 @@ out: } static int xfrm_flush_policy(struct sk_buff *skb, struct nlmsghdr *nlh, - struct nlattr **attrs) + struct nlattr **attrs, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct km_event c; u8 type = XFRM_POLICY_TYPE_MAIN; int err; - err = copy_from_user_policy_type(&type, attrs); + err = copy_from_user_policy_type(&type, attrs, extack); if (err) return err; @@ -2372,7 +2498,8 @@ static int xfrm_flush_policy(struct sk_buff *skb, struct nlmsghdr *nlh, } static int xfrm_add_pol_expire(struct sk_buff *skb, struct nlmsghdr *nlh, - struct nlattr **attrs) + struct nlattr **attrs, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct xfrm_policy *xp; @@ -2383,11 +2510,11 @@ static int xfrm_add_pol_expire(struct sk_buff *skb, struct nlmsghdr *nlh, struct xfrm_mark m; u32 if_id = 0; - err = copy_from_user_policy_type(&type, attrs); + err = copy_from_user_policy_type(&type, attrs, extack); if (err) return err; - err = verify_policy_dir(p->dir); + err = verify_policy_dir(p->dir, extack); if (err) return err; @@ -2403,7 +2530,7 @@ static int xfrm_add_pol_expire(struct sk_buff *skb, struct nlmsghdr *nlh, struct nlattr *rt = attrs[XFRMA_SEC_CTX]; struct xfrm_sec_ctx *ctx; - err = verify_sec_ctx_len(attrs); + err = verify_sec_ctx_len(attrs, extack); if (err) return err; @@ -2438,7 +2565,8 @@ out: } static int xfrm_add_sa_expire(struct sk_buff *skb, struct nlmsghdr *nlh, - struct nlattr **attrs) + struct nlattr **attrs, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct xfrm_state *x; @@ -2472,7 +2600,8 @@ out: } static int xfrm_add_acquire(struct sk_buff *skb, struct nlmsghdr *nlh, - struct nlattr **attrs) + struct nlattr **attrs, + struct netlink_ext_ack *extack) { struct net *net = sock_net(skb->sk); struct xfrm_policy *xp; @@ -2490,15 +2619,15 @@ static int xfrm_add_acquire(struct sk_buff *skb, struct nlmsghdr *nlh, xfrm_mark_get(attrs, &mark); - err = verify_newpolicy_info(&ua->policy); + err = verify_newpolicy_info(&ua->policy, extack); if (err) goto free_state; - err = verify_sec_ctx_len(attrs); + err = verify_sec_ctx_len(attrs, extack); if (err) goto free_state; /* build an XP */ - xp = xfrm_policy_construct(net, &ua->policy, attrs, &err); + xp = xfrm_policy_construct(net, &ua->policy, attrs, &err, extack); if (!xp) goto free_state; @@ -2577,7 +2706,7 @@ static int copy_from_user_migrate(struct xfrm_migrate *ma, } static int xfrm_do_migrate(struct sk_buff *skb, struct nlmsghdr *nlh, - struct nlattr **attrs) + struct nlattr **attrs, struct netlink_ext_ack *extack) { struct xfrm_userpolicy_id *pi = nlmsg_data(nlh); struct xfrm_migrate m[XFRM_MAX_DEPTH]; @@ -2594,7 +2723,7 @@ static int xfrm_do_migrate(struct sk_buff *skb, struct nlmsghdr *nlh, kmp = attrs[XFRMA_KMADDRESS] ? &km : NULL; - err = copy_from_user_policy_type(&type, attrs); + err = copy_from_user_policy_type(&type, attrs, extack); if (err) return err; @@ -2623,7 +2752,7 @@ static int xfrm_do_migrate(struct sk_buff *skb, struct nlmsghdr *nlh, } #else static int xfrm_do_migrate(struct sk_buff *skb, struct nlmsghdr *nlh, - struct nlattr **attrs) + struct nlattr **attrs, struct netlink_ext_ack *extack) { return -ENOPROTOOPT; } @@ -2819,7 +2948,8 @@ static const struct nla_policy xfrma_spd_policy[XFRMA_SPD_MAX+1] = { }; static const struct xfrm_link { - int (*doit)(struct sk_buff *, struct nlmsghdr *, struct nlattr **); + int (*doit)(struct sk_buff *, struct nlmsghdr *, struct nlattr **, + struct netlink_ext_ack *); int (*start)(struct netlink_callback *); int (*dump)(struct sk_buff *, struct netlink_callback *); int (*done)(struct netlink_callback *); @@ -2921,7 +3051,7 @@ static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh, goto err; } - err = link->doit(skb, nlh, attrs); + err = link->doit(skb, nlh, attrs, extack); /* We need to free skb allocated in xfrm_alloc_compat() before * returning from this function, because consume_skb() won't take @@ -3272,11 +3402,11 @@ static struct xfrm_policy *xfrm_compile_policy(struct sock *sk, int opt, *dir = -EINVAL; if (len < sizeof(*p) || - verify_newpolicy_info(p)) + verify_newpolicy_info(p, NULL)) return NULL; nr = ((len - sizeof(*p)) / sizeof(*ut)); - if (validate_tmpl(nr, ut, p->sel.family)) + if (validate_tmpl(nr, ut, p->sel.family, NULL)) return NULL; if (p->dir > XFRM_POLICY_OUT) |
