From 4408d55a64677febdcb50d1b44d0dc714ce4187e Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 13 Jan 2022 09:28:45 +0900 Subject: af_unix: Refactor unix_next_socket(). Currently, unix_next_socket() is overloaded depending on the 2nd argument. If it is NULL, unix_next_socket() returns the first socket in the hash. If not NULL, it returns the next socket in the same hash list or the first socket in the next non-empty hash list. This patch refactors unix_next_socket() into two functions unix_get_first() and unix_get_next(). unix_get_first() newly acquires a lock and returns the first socket in the list. unix_get_next() returns the next socket in a list or releases a lock and falls back to unix_get_first(). In the following patch, bpf iter holds entire sockets in a list and always releases the lock before .show(). It always calls unix_get_first() to acquire a lock in each iteration. So, this patch makes the change easier to follow. Signed-off-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20220113002849.4384-2-kuniyu@amazon.co.jp Signed-off-by: Alexei Starovoitov --- net/unix/af_unix.c | 51 ++++++++++++++++++++++++++++++--------------------- 1 file changed, 30 insertions(+), 21 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index c19569819866..e1c4082accdb 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -3240,49 +3240,58 @@ static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos) return sk; } -static struct sock *unix_next_socket(struct seq_file *seq, - struct sock *sk, - loff_t *pos) +static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos) { unsigned long bucket = get_bucket(*pos); + struct sock *sk; - while (sk > (struct sock *)SEQ_START_TOKEN) { - sk = sk_next(sk); - if (!sk) - goto next_bucket; - if (sock_net(sk) == seq_file_net(seq)) - return sk; - } - - do { + while (bucket < ARRAY_SIZE(unix_socket_table)) { spin_lock(&unix_table_locks[bucket]); + sk = unix_from_bucket(seq, pos); if (sk) return sk; -next_bucket: - spin_unlock(&unix_table_locks[bucket++]); - *pos = set_bucket_offset(bucket, 1); - } while (bucket < ARRAY_SIZE(unix_socket_table)); + spin_unlock(&unix_table_locks[bucket]); + + *pos = set_bucket_offset(++bucket, 1); + } return NULL; } +static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk, + loff_t *pos) +{ + unsigned long bucket = get_bucket(*pos); + + for (sk = sk_next(sk); sk; sk = sk_next(sk)) + if (sock_net(sk) == seq_file_net(seq)) + return sk; + + spin_unlock(&unix_table_locks[bucket]); + + *pos = set_bucket_offset(++bucket, 1); + + return unix_get_first(seq, pos); +} + static void *unix_seq_start(struct seq_file *seq, loff_t *pos) { if (!*pos) return SEQ_START_TOKEN; - if (get_bucket(*pos) >= ARRAY_SIZE(unix_socket_table)) - return NULL; - - return unix_next_socket(seq, NULL, pos); + return unix_get_first(seq, pos); } static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos) { ++*pos; - return unix_next_socket(seq, v, pos); + + if (v == SEQ_START_TOKEN) + return unix_get_first(seq, pos); + + return unix_get_next(seq, v, pos); } static void unix_seq_stop(struct seq_file *seq, void *v) -- cgit From 855d8e77ffb05be6e54c34dababccb20318aec00 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 13 Jan 2022 09:28:46 +0900 Subject: bpf: af_unix: Use batching algorithm in bpf unix iter. 
The commit 04c7820b776f ("bpf: tcp: Bpf iter batching and lock_sock") introduces the batching algorithm to iterate TCP sockets with more consistency. This patch uses the same algorithm to iterate AF_UNIX sockets. Signed-off-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20220113002849.4384-3-kuniyu@amazon.co.jp Signed-off-by: Alexei Starovoitov --- net/unix/af_unix.c | 184 +++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 177 insertions(+), 7 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index e1c4082accdb..d383d5f63b6b 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -3356,6 +3356,15 @@ static const struct seq_operations unix_seq_ops = { }; #if IS_BUILTIN(CONFIG_UNIX) && defined(CONFIG_BPF_SYSCALL) +struct bpf_unix_iter_state { + struct seq_net_private p; + unsigned int cur_sk; + unsigned int end_sk; + unsigned int max_sk; + struct sock **batch; + bool st_bucket_done; +}; + struct bpf_iter__unix { __bpf_md_ptr(struct bpf_iter_meta *, meta); __bpf_md_ptr(struct unix_sock *, unix_sk); @@ -3374,24 +3383,156 @@ static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta, return bpf_iter_run_prog(prog, &ctx); } +static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk) + +{ + struct bpf_unix_iter_state *iter = seq->private; + unsigned int expected = 1; + struct sock *sk; + + sock_hold(start_sk); + iter->batch[iter->end_sk++] = start_sk; + + for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) { + if (sock_net(sk) != seq_file_net(seq)) + continue; + + if (iter->end_sk < iter->max_sk) { + sock_hold(sk); + iter->batch[iter->end_sk++] = sk; + } + + expected++; + } + + spin_unlock(&unix_table_locks[start_sk->sk_hash]); + + return expected; +} + +static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter) +{ + while (iter->cur_sk < iter->end_sk) + sock_put(iter->batch[iter->cur_sk++]); +} + +static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter, + unsigned int new_batch_sz) +{ + struct sock **new_batch; + + new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz, + GFP_USER | __GFP_NOWARN); + if (!new_batch) + return -ENOMEM; + + bpf_iter_unix_put_batch(iter); + kvfree(iter->batch); + iter->batch = new_batch; + iter->max_sk = new_batch_sz; + + return 0; +} + +static struct sock *bpf_iter_unix_batch(struct seq_file *seq, + loff_t *pos) +{ + struct bpf_unix_iter_state *iter = seq->private; + unsigned int expected; + bool resized = false; + struct sock *sk; + + if (iter->st_bucket_done) + *pos = set_bucket_offset(get_bucket(*pos) + 1, 1); + +again: + /* Get a new batch */ + iter->cur_sk = 0; + iter->end_sk = 0; + + sk = unix_get_first(seq, pos); + if (!sk) + return NULL; /* Done */ + + expected = bpf_iter_unix_hold_batch(seq, sk); + + if (iter->end_sk == expected) { + iter->st_bucket_done = true; + return sk; + } + + if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) { + resized = true; + goto again; + } + + return sk; +} + +static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos) +{ + if (!*pos) + return SEQ_START_TOKEN; + + /* bpf iter does not support lseek, so it always + * continue from where it was stop()-ped. 
+ */ + return bpf_iter_unix_batch(seq, pos); +} + +static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct bpf_unix_iter_state *iter = seq->private; + struct sock *sk; + + /* Whenever seq_next() is called, the iter->cur_sk is + * done with seq_show(), so advance to the next sk in + * the batch. + */ + if (iter->cur_sk < iter->end_sk) + sock_put(iter->batch[iter->cur_sk++]); + + ++*pos; + + if (iter->cur_sk < iter->end_sk) + sk = iter->batch[iter->cur_sk]; + else + sk = bpf_iter_unix_batch(seq, pos); + + return sk; +} + static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v) { struct bpf_iter_meta meta; struct bpf_prog *prog; struct sock *sk = v; uid_t uid; + bool slow; + int ret; if (v == SEQ_START_TOKEN) return 0; + slow = lock_sock_fast(sk); + + if (unlikely(sk_unhashed(sk))) { + ret = SEQ_SKIP; + goto unlock; + } + uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)); meta.seq = seq; prog = bpf_iter_get_info(&meta, false); - return unix_prog_seq_show(prog, &meta, v, uid); + ret = unix_prog_seq_show(prog, &meta, v, uid); +unlock: + unlock_sock_fast(sk, slow); + return ret; } static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v) { + struct bpf_unix_iter_state *iter = seq->private; struct bpf_iter_meta meta; struct bpf_prog *prog; @@ -3402,12 +3543,13 @@ static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v) (void)unix_prog_seq_show(prog, &meta, v, 0); } - unix_seq_stop(seq, v); + if (iter->cur_sk < iter->end_sk) + bpf_iter_unix_put_batch(iter); } static const struct seq_operations bpf_iter_unix_seq_ops = { - .start = unix_seq_start, - .next = unix_seq_next, + .start = bpf_iter_unix_seq_start, + .next = bpf_iter_unix_seq_next, .stop = bpf_iter_unix_seq_stop, .show = bpf_iter_unix_seq_show, }; @@ -3456,11 +3598,39 @@ static struct pernet_operations unix_net_ops = { DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta, struct unix_sock *unix_sk, uid_t uid) +#define INIT_BATCH_SZ 16 + +static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux) +{ + struct bpf_unix_iter_state *iter = priv_data; + int err; + + err = bpf_iter_init_seq_net(priv_data, aux); + if (err) + return err; + + err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ); + if (err) { + bpf_iter_fini_seq_net(priv_data); + return err; + } + + return 0; +} + +static void bpf_iter_fini_unix(void *priv_data) +{ + struct bpf_unix_iter_state *iter = priv_data; + + bpf_iter_fini_seq_net(priv_data); + kvfree(iter->batch); +} + static const struct bpf_iter_seq_info unix_seq_info = { .seq_ops = &bpf_iter_unix_seq_ops, - .init_seq_private = bpf_iter_init_seq_net, - .fini_seq_private = bpf_iter_fini_seq_net, - .seq_priv_size = sizeof(struct seq_net_private), + .init_seq_private = bpf_iter_init_unix, + .fini_seq_private = bpf_iter_fini_unix, + .seq_priv_size = sizeof(struct bpf_unix_iter_state), }; static struct bpf_iter_reg unix_reg_info = { -- cgit From eb7d8f1d9ebc7379f09a51bf4faa35e0bfa7437d Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 13 Jan 2022 09:28:47 +0900 Subject: bpf: Support bpf_(get|set)sockopt() in bpf unix iter. This patch makes bpf_(get|set)sockopt() available when iterating AF_UNIX sockets. 
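As an illustration (this is only a sketch, not the selftest added alongside this series; the program name, the SO_SNDBUF policy, and the locally defined UAPI constants are assumptions made for the example), a BPF iterator program attached to the "unix" target could now adjust socket options while walking the hash table:

  /* Minimal sketch of an iter/unix program using the newly allowed
   * bpf_getsockopt()/bpf_setsockopt() helpers.
   */
  #include "vmlinux.h"
  #include <bpf/bpf_helpers.h>

  /* vmlinux.h carries no UAPI macros, so spell out the generic values. */
  #define SOL_SOCKET  1
  #define SO_SNDBUF   7

  char _license[] SEC("license") = "GPL";

  SEC("iter/unix")
  int shrink_sndbuf(struct bpf_iter__unix *ctx)
  {
          struct unix_sock *unix_sk = ctx->unix_sk;
          int sndbuf;

          /* unix_sk is NULL for the SEQ_START_TOKEN pass. */
          if (!unix_sk)
                  return 0;

          /* Read the current send buffer size, then halve it. */
          if (bpf_getsockopt(unix_sk, SOL_SOCKET, SO_SNDBUF,
                             &sndbuf, sizeof(sndbuf)))
                  return 0;

          sndbuf /= 2;
          bpf_setsockopt(unix_sk, SOL_SOCKET, SO_SNDBUF,
                         &sndbuf, sizeof(sndbuf));
          return 0;
  }

Such a program is loaded and attached like any other BPF iterator (e.g. pinned with "bpftool iter pin"); only the helper availability changes here.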
Signed-off-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20220113002849.4384-4-kuniyu@amazon.co.jp Signed-off-by: Alexei Starovoitov --- net/unix/af_unix.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index d383d5f63b6b..3e0d6281fd1e 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -3633,6 +3633,20 @@ static const struct bpf_iter_seq_info unix_seq_info = { .seq_priv_size = sizeof(struct bpf_unix_iter_state), }; +static const struct bpf_func_proto * +bpf_iter_unix_get_func_proto(enum bpf_func_id func_id, + const struct bpf_prog *prog) +{ + switch (func_id) { + case BPF_FUNC_setsockopt: + return &bpf_sk_setsockopt_proto; + case BPF_FUNC_getsockopt: + return &bpf_sk_getsockopt_proto; + default: + return NULL; + } +} + static struct bpf_iter_reg unix_reg_info = { .target = "unix", .ctx_arg_info_size = 1, @@ -3640,6 +3654,7 @@ static struct bpf_iter_reg unix_reg_info = { { offsetof(struct bpf_iter__unix, unix_sk), PTR_TO_BTF_ID_OR_NULL }, }, + .get_func_proto = bpf_iter_unix_get_func_proto, .seq_info = &unix_seq_info, }; -- cgit From e82025c623e2bf04d162bafceb66a59115814479 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 17 Mar 2022 12:08:08 +0900 Subject: af_unix: Fix some data-races around unix_sk(sk)->oob_skb. Out-of-band data automatically places a "mark" showing wherein the sequence the out-of-band data would have been. If the out-of-band data implies cancelling everything sent so far, the "mark" is helpful to flush them. When the socket's read pointer reaches the "mark", the ioctl() below sets a non zero value to the arg `atmark`: The out-of-band data is queued in sk->sk_receive_queue as well as ordinary data and also saved in unix_sk(sk)->oob_skb. It can be used to test if the head of the receive queue is the out-of-band data meaning the socket is at the "mark". While testing that, unix_ioctl() reads unix_sk(sk)->oob_skb locklessly. Thus, all accesses to oob_skb need some basic protection to avoid load/store tearing which KCSAN detects when these are called concurrently: - ioctl(fd_a, SIOCATMARK, &atmark, sizeof(atmark)) - send(fd_b_connected_to_a, buf, sizeof(buf), MSG_OOB) BUG: KCSAN: data-race in unix_ioctl / unix_stream_sendmsg write to 0xffff888003d9cff0 of 8 bytes by task 175 on cpu 1: unix_stream_sendmsg (net/unix/af_unix.c:2087 net/unix/af_unix.c:2191) sock_sendmsg (net/socket.c:705 net/socket.c:725) __sys_sendto (net/socket.c:2040) __x64_sys_sendto (net/socket.c:2048) do_syscall_64 (arch/x86/entry/common.c:50 arch/x86/entry/common.c:80) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:113) read to 0xffff888003d9cff0 of 8 bytes by task 176 on cpu 0: unix_ioctl (net/unix/af_unix.c:3101 (discriminator 1)) sock_do_ioctl (net/socket.c:1128) sock_ioctl (net/socket.c:1242) __x64_sys_ioctl (fs/ioctl.c:52 fs/ioctl.c:874 fs/ioctl.c:860 fs/ioctl.c:860) do_syscall_64 (arch/x86/entry/common.c:50 arch/x86/entry/common.c:80) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:113) value changed: 0xffff888003da0c00 -> 0xffff888003da0d00 Reported by Kernel Concurrency Sanitizer on: CPU: 0 PID: 176 Comm: unix_race_oob_i Not tainted 5.17.0-rc5-59529-g83dc4c2af682 #12 Hardware name: Red Hat KVM, BIOS 1.11.0-2.amzn2 04/01/2014 Fixes: 314001f0bf92 ("af_unix: Add OOB support") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. 
Miller --- net/unix/af_unix.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index c19569819866..0c37e5595aae 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2084,7 +2084,7 @@ static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other if (ousk->oob_skb) consume_skb(ousk->oob_skb); - ousk->oob_skb = skb; + WRITE_ONCE(ousk->oob_skb, skb); scm_stat_add(other, skb); skb_queue_tail(&other->sk_receive_queue, skb); @@ -2602,9 +2602,8 @@ static int unix_stream_recv_urg(struct unix_stream_read_state *state) oob_skb = u->oob_skb; - if (!(state->flags & MSG_PEEK)) { - u->oob_skb = NULL; - } + if (!(state->flags & MSG_PEEK)) + WRITE_ONCE(u->oob_skb, NULL); unix_state_unlock(sk); @@ -2639,7 +2638,7 @@ static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk, skb = NULL; } else if (sock_flag(sk, SOCK_URGINLINE)) { if (!(flags & MSG_PEEK)) { - u->oob_skb = NULL; + WRITE_ONCE(u->oob_skb, NULL); consume_skb(skb); } } else if (!(flags & MSG_PEEK)) { @@ -3094,11 +3093,10 @@ static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg) case SIOCATMARK: { struct sk_buff *skb; - struct unix_sock *u = unix_sk(sk); int answ = 0; skb = skb_peek(&sk->sk_receive_queue); - if (skb && skb == u->oob_skb) + if (skb && skb == READ_ONCE(unix_sk(sk)->oob_skb)) answ = 1; err = put_user(answ, (int __user *)arg); } -- cgit From d9a232d435dcc966738b0f414a86f7edf4f4c8c4 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 17 Mar 2022 12:08:09 +0900 Subject: af_unix: Support POLLPRI for OOB. The commit 314001f0bf92 ("af_unix: Add OOB support") introduced OOB for AF_UNIX, but it lacks some changes for POLLPRI. Let's add the missing piece. In the selftest, normal datagrams are sent followed by OOB data, so this commit replaces `POLLIN | POLLPRI` with just `POLLPRI` in the first test case. Fixes: 314001f0bf92 ("af_unix: Add OOB support") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- net/unix/af_unix.c | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 0c37e5595aae..1e7ed5829ed5 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -3137,6 +3137,10 @@ static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wa mask |= EPOLLIN | EPOLLRDNORM; if (sk_is_readable(sk)) mask |= EPOLLIN | EPOLLRDNORM; +#if IS_ENABLED(CONFIG_AF_UNIX_OOB) + if (READ_ONCE(unix_sk(sk)->oob_skb)) + mask |= EPOLLPRI; +#endif /* Connection-based need to check for termination and startup */ if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) && -- cgit From 4edf21aa94ee33c75f819f2b6eb6dd52ef8a1628 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 17 Mar 2022 12:23:08 +0900 Subject: af_unix: Remove unnecessary brackets around CONFIG_AF_UNIX_OOB. Let's remove unnecessary brackets around CONFIG_AF_UNIX_OOB. 
Signed-off-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20220317032308.65372-1-kuniyu@amazon.co.jp Signed-off-by: Jakub Kicinski --- net/unix/af_unix.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 3e0d6281fd1e..4247c4134f31 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2049,7 +2049,7 @@ out: */ #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768)) -#if (IS_ENABLED(CONFIG_AF_UNIX_OOB)) +#if IS_ENABLED(CONFIG_AF_UNIX_OOB) static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other) { struct unix_sock *ousk = unix_sk(other); @@ -2115,7 +2115,7 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, err = -EOPNOTSUPP; if (msg->msg_flags & MSG_OOB) { -#if (IS_ENABLED(CONFIG_AF_UNIX_OOB)) +#if IS_ENABLED(CONFIG_AF_UNIX_OOB) if (len) len--; else @@ -2186,7 +2186,7 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, sent += size; } -#if (IS_ENABLED(CONFIG_AF_UNIX_OOB)) +#if IS_ENABLED(CONFIG_AF_UNIX_OOB) if (msg->msg_flags & MSG_OOB) { err = queue_oob(sock, msg, other); if (err) -- cgit From f4b41f062c424209e3939a81e6da022e049a45f2 Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Mon, 4 Apr 2022 18:30:22 +0200 Subject: net: remove noblock parameter from skb_recv_datagram() skb_recv_datagram() has two parameters 'flags' and 'noblock' that are merged inside skb_recv_datagram() by 'flags | (noblock ? MSG_DONTWAIT : 0)' As 'flags' may contain MSG_DONTWAIT as value most callers split the 'flags' into 'flags' and 'noblock' with finally obsolete bit operations like this: skb_recv_datagram(sk, flags & ~MSG_DONTWAIT, flags & MSG_DONTWAIT, &rc); And this is not even done consistently with the 'flags' parameter. This patch removes the obsolete and costly splitting into two parameters and only performs bit operations when really needed on the caller side. One missing conversion thankfully reported by kernel test robot. I missed to enable kunit tests to build the mctp code. Reported-by: kernel test robot Signed-off-by: Oliver Hartkopp Signed-off-by: David S. Miller --- net/unix/af_unix.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index e71a312faa1e..fecbd95da918 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1643,7 +1643,8 @@ static int unix_accept(struct socket *sock, struct socket *newsock, int flags, * so that no locks are necessary. */ - skb = skb_recv_datagram(sk, 0, flags&O_NONBLOCK, &err); + skb = skb_recv_datagram(sk, (flags & O_NONBLOCK) ? MSG_DONTWAIT : 0, + &err); if (!skb) { /* This means receive shutdown. */ if (err == 0) @@ -2500,7 +2501,7 @@ static int unix_read_sock(struct sock *sk, read_descriptor_t *desc, int used, err; mutex_lock(&u->iolock); - skb = skb_recv_datagram(sk, 0, 1, &err); + skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err); mutex_unlock(&u->iolock); if (!skb) return err; -- cgit From ec095263a965720e1ca39db1d9c5cd47846c789b Mon Sep 17 00:00:00 2001 From: Oliver Hartkopp Date: Mon, 11 Apr 2022 14:49:55 +0200 Subject: net: remove noblock parameter from recvmsg() entities The internal recvmsg() functions have two parameters 'flags' and 'noblock' that were merged inside skb_recv_datagram(). As a follow up patch to commit f4b41f062c42 ("net: remove noblock parameter from skb_recv_datagram()") this patch removes the separate 'noblock' parameter for recvmsg(). 
Analogue to the referenced patch for skb_recv_datagram() the 'flags' and 'noblock' parameters are unnecessarily split up with e.g. err = sk->sk_prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT, flags & ~MSG_DONTWAIT, &addr_len); or in err = INDIRECT_CALL_2(sk->sk_prot->recvmsg, tcp_recvmsg, udp_recvmsg, sk, msg, size, flags & MSG_DONTWAIT, flags & ~MSG_DONTWAIT, &addr_len); instead of simply using only flags all the time and check for MSG_DONTWAIT where needed (to preserve for the formerly separated no(n)block condition). Signed-off-by: Oliver Hartkopp Link: https://lore.kernel.org/r/20220411124955.154876-1-socketcan@hartkopp.net Signed-off-by: Paolo Abeni --- net/unix/af_unix.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index fecbd95da918..e1dd9e9c8452 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2484,8 +2484,7 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t si const struct proto *prot = READ_ONCE(sk->sk_prot); if (prot != &unix_dgram_proto) - return prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT, - flags & ~MSG_DONTWAIT, NULL); + return prot->recvmsg(sk, msg, size, flags, NULL); #endif return __unix_dgram_recvmsg(sk, msg, size, flags); } @@ -2917,8 +2916,7 @@ static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg, const struct proto *prot = READ_ONCE(sk->sk_prot); if (prot != &unix_stream_proto) - return prot->recvmsg(sk, msg, size, flags & MSG_DONTWAIT, - flags & ~MSG_DONTWAIT, NULL); + return prot->recvmsg(sk, msg, size, flags, NULL); #endif return unix_stream_read_generic(&state, true); } -- cgit From b146cbf2e32f01f56244d670aef2f43d44fcf120 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 10 May 2022 15:46:26 -0700 Subject: af_unix: Silence randstruct GCC plugin warning While preparing for Clang randstruct support (which duplicated many of the warnings the randstruct GCC plugin warned about), one strange one remained only for the randstruct GCC plugin. Eliminating this rids the plugin of the last exception. It seems the plugin is happy to dereference individual members of a cross-struct cast, but it is upset about casting to a whole object pointer. This only manifests in one place in the kernel, so just replace the variable with individual member accesses. There is no change in executable instruction output. Drop the last exception from the randstruct GCC plugin. Cc: "David S. 
Miller" Cc: Christoph Hellwig Cc: Paolo Abeni Cc: Alexei Starovoitov Cc: Cong Wang Cc: Al Viro Cc: netdev@vger.kernel.org Cc: linux-hardening@vger.kernel.org Acked-by: Kuniyuki Iwashima Link: https://lore.kernel.org/lkml/20220511022217.58586-1-kuniyu@amazon.co.jp Acked-by: Jakub Kicinski Link: https://lore.kernel.org/lkml/20220511151542.4cb3ff17@kernel.org Signed-off-by: Kees Cook --- net/unix/af_unix.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index e71a312faa1e..36367e7e3e0a 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1808,11 +1808,9 @@ static int maybe_init_creds(struct scm_cookie *scm, static bool unix_skb_scm_eq(struct sk_buff *skb, struct scm_cookie *scm) { - const struct unix_skb_parms *u = &UNIXCB(skb); - - return u->pid == scm->pid && - uid_eq(u->uid, scm->creds.uid) && - gid_eq(u->gid, scm->creds.gid) && + return UNIXCB(skb).pid == scm->pid && + uid_eq(UNIXCB(skb).uid, scm->creds.uid) && + gid_eq(UNIXCB(skb).gid, scm->creds.gid) && unix_secdata_eq(scm, skb); } -- cgit From 662a80946ce13633ae90a55379f1346c10f0c432 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Sun, 5 Jun 2022 16:23:25 -0700 Subject: af_unix: Fix a data-race in unix_dgram_peer_wake_me(). unix_dgram_poll() calls unix_dgram_peer_wake_me() without `other`'s lock held and check if its receive queue is full. Here we need to use unix_recvq_full_lockless() instead of unix_recvq_full(), otherwise KCSAN will report a data-race. Fixes: 7d267278a9ec ("unix: avoid use-after-free in ep_remove_wait_queue") Signed-off-by: Kuniyuki Iwashima Link: https://lore.kernel.org/r/20220605232325.11804-1-kuniyu@amazon.com Signed-off-by: Paolo Abeni --- net/unix/af_unix.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 654dcef7cfb3..2206e6f8902d 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -490,7 +490,7 @@ static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other) * -ECONNREFUSED. Otherwise, if we haven't queued any skbs * to other and its full, we will hang waiting for POLLOUT. */ - if (unix_recvq_full(other) && !sock_flag(other, SOCK_DEAD)) + if (unix_recvq_full_lockless(other) && !sock_flag(other, SOCK_DEAD)) return 1; if (connected) -- cgit From dd29c67dbbbf97c8aa786a502977856d8eb9a73d Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 8 Jun 2022 09:04:35 -0700 Subject: af_unix: use DEBUG_NET_WARN_ON_ONCE() Replace four WARN_ON() that have not triggered recently with DEBUG_NET_WARN_ON_ONCE(). 
Signed-off-by: Eric Dumazet Signed-off-by: Jakub Kicinski --- net/unix/af_unix.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 2206e6f8902d..3453e0053f76 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -302,7 +302,7 @@ static void __unix_remove_socket(struct sock *sk) static void __unix_insert_socket(struct sock *sk) { - WARN_ON(!sk_unhashed(sk)); + DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk)); sk_add_node(sk, &unix_socket_table[sk->sk_hash]); } @@ -554,9 +554,9 @@ static void unix_sock_destructor(struct sock *sk) u->oob_skb = NULL; } #endif - WARN_ON(refcount_read(&sk->sk_wmem_alloc)); - WARN_ON(!sk_unhashed(sk)); - WARN_ON(sk->sk_socket); + DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc)); + DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk)); + DEBUG_NET_WARN_ON_ONCE(sk->sk_socket); if (!sock_flag(sk, SOCK_DEAD)) { pr_info("Attempt to release alive unix socket: %p\n", sk); return; -- cgit From 965b57b469a589d64d81b1688b38dcb537011bb0 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Wed, 15 Jun 2022 09:20:12 -0700 Subject: net: Introduce a new proto_ops ->read_skb() Currently both splice() and sockmap use ->read_sock() to read skb from receive queue, but for sockmap we only read one entire skb at a time, so ->read_sock() is too conservative to use. Introduce a new proto_ops ->read_skb() which supports this sematic, with this we can finally pass the ownership of skb to recv actors. For non-TCP protocols, all ->read_sock() can be simply converted to ->read_skb(). Signed-off-by: Cong Wang Signed-off-by: Daniel Borkmann Reviewed-by: John Fastabend Link: https://lore.kernel.org/bpf/20220615162014.89193-3-xiyou.wangcong@gmail.com --- net/unix/af_unix.c | 23 +++++++++-------------- 1 file changed, 9 insertions(+), 14 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 3453e0053f76..1bed3739768c 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -741,10 +741,8 @@ static ssize_t unix_stream_splice_read(struct socket *, loff_t *ppos, unsigned int flags); static int unix_dgram_sendmsg(struct socket *, struct msghdr *, size_t); static int unix_dgram_recvmsg(struct socket *, struct msghdr *, size_t, int); -static int unix_read_sock(struct sock *sk, read_descriptor_t *desc, - sk_read_actor_t recv_actor); -static int unix_stream_read_sock(struct sock *sk, read_descriptor_t *desc, - sk_read_actor_t recv_actor); +static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor); +static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor); static int unix_dgram_connect(struct socket *, struct sockaddr *, int, int); static int unix_seqpacket_sendmsg(struct socket *, struct msghdr *, size_t); @@ -798,7 +796,7 @@ static const struct proto_ops unix_stream_ops = { .shutdown = unix_shutdown, .sendmsg = unix_stream_sendmsg, .recvmsg = unix_stream_recvmsg, - .read_sock = unix_stream_read_sock, + .read_skb = unix_stream_read_skb, .mmap = sock_no_mmap, .sendpage = unix_stream_sendpage, .splice_read = unix_stream_splice_read, @@ -823,7 +821,7 @@ static const struct proto_ops unix_dgram_ops = { .listen = sock_no_listen, .shutdown = unix_shutdown, .sendmsg = unix_dgram_sendmsg, - .read_sock = unix_read_sock, + .read_skb = unix_read_skb, .recvmsg = unix_dgram_recvmsg, .mmap = sock_no_mmap, .sendpage = sock_no_sendpage, @@ -2487,8 +2485,7 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t si 
return __unix_dgram_recvmsg(sk, msg, size, flags); } -static int unix_read_sock(struct sock *sk, read_descriptor_t *desc, - sk_read_actor_t recv_actor) +static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor) { int copied = 0; @@ -2503,7 +2500,7 @@ static int unix_read_sock(struct sock *sk, read_descriptor_t *desc, if (!skb) return err; - used = recv_actor(desc, skb, 0, skb->len); + used = recv_actor(sk, skb); if (used <= 0) { if (!copied) copied = used; @@ -2514,8 +2511,7 @@ static int unix_read_sock(struct sock *sk, read_descriptor_t *desc, } kfree_skb(skb); - if (!desc->count) - break; + break; } return copied; @@ -2650,13 +2646,12 @@ static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk, } #endif -static int unix_stream_read_sock(struct sock *sk, read_descriptor_t *desc, - sk_read_actor_t recv_actor) +static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor) { if (unlikely(sk->sk_state != TCP_ESTABLISHED)) return -ENOTCONN; - return unix_read_sock(sk, desc, recv_actor); + return unix_read_skb(sk, recv_actor); } static int unix_stream_read_generic(struct unix_stream_read_state *state, -- cgit From 340c3d337119ea177a98338be2e3bc62ee87ac80 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 21 Jun 2022 10:19:08 -0700 Subject: af_unix: Clean up some sock_net() uses. Some functions define a net pointer only for one-shot use. Others call sock_net() redundantly even when a net pointer is available. Let's fix these and make the code simpler. Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- net/unix/af_unix.c | 33 ++++++++++++++------------------- 1 file changed, 14 insertions(+), 19 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 3453e0053f76..990257f02e7c 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -932,7 +932,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, memset(&u->scm_stat, 0, sizeof(struct scm_stat)); unix_insert_unbound_socket(sk); - sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); + sock_prot_inuse_add(net, sk->sk_prot, 1); return sk; @@ -1293,9 +1293,8 @@ static void unix_state_double_unlock(struct sock *sk1, struct sock *sk2) static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr, int alen, int flags) { - struct sock *sk = sock->sk; - struct net *net = sock_net(sk); struct sockaddr_un *sunaddr = (struct sockaddr_un *)addr; + struct sock *sk = sock->sk; struct sock *other; int err; @@ -1316,7 +1315,7 @@ static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr, } restart: - other = unix_find_other(net, sunaddr, alen, sock->type); + other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type); if (IS_ERR(other)) { err = PTR_ERR(other); goto out; @@ -1404,15 +1403,13 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, int addr_len, int flags) { struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr; - struct sock *sk = sock->sk; - struct net *net = sock_net(sk); + struct sock *sk = sock->sk, *newsk = NULL, *other = NULL; struct unix_sock *u = unix_sk(sk), *newu, *otheru; - struct sock *newsk = NULL; - struct sock *other = NULL; + struct net *net = sock_net(sk); struct sk_buff *skb = NULL; - int st; - int err; long timeo; + int err; + int st; err = unix_validate_addr(sunaddr, addr_len); if (err) @@ -1432,7 +1429,7 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, */ /* create new sock for complete 
connection */ - newsk = unix_create1(sock_net(sk), NULL, 0, sock->type); + newsk = unix_create1(net, NULL, 0, sock->type); if (IS_ERR(newsk)) { err = PTR_ERR(newsk); newsk = NULL; @@ -1840,17 +1837,15 @@ static void scm_stat_del(struct sock *sk, struct sk_buff *skb) static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) { - struct sock *sk = sock->sk; - struct net *net = sock_net(sk); - struct unix_sock *u = unix_sk(sk); DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name); - struct sock *other = NULL; - int err; - struct sk_buff *skb; - long timeo; + struct sock *sk = sock->sk, *other = NULL; + struct unix_sock *u = unix_sk(sk); struct scm_cookie scm; + struct sk_buff *skb; int data_len = 0; int sk_locked; + long timeo; + int err; wait_for_unix_gc(); err = scm_send(sock, msg, &scm, false); @@ -1917,7 +1912,7 @@ restart: if (sunaddr == NULL) goto out_free; - other = unix_find_other(net, sunaddr, msg->msg_namelen, + other = unix_find_other(sock_net(sk), sunaddr, msg->msg_namelen, sk->sk_type); if (IS_ERR(other)) { err = PTR_ERR(other); -- cgit From f302d180c6d430ea99643b9b2b3407aedaa36703 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 21 Jun 2022 10:19:09 -0700 Subject: af_unix: Include the whole hash table size in UNIX_HASH_SIZE. Currently, the size of AF_UNIX hash table is UNIX_HASH_SIZE * 2, the first half for bind()ed sockets and the second half for unbound ones. UNIX_HASH_SIZE * 2 is used to define the table and iterate over it. In some places, we use ARRAY_SIZE(unix_socket_table) instead of UNIX_HASH_SIZE * 2. However, we cannot use it anymore because we will allocate the hash table dynamically. Then, we would have to add UNIX_HASH_SIZE * 2 in many places, which would be troublesome. This patch adapts the UNIX_HASH_SIZE definition to include bound and unbound sockets and defines a new UNIX_HASH_MOD macro to ease calculations. Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- net/unix/af_unix.c | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 990257f02e7c..c0804ae9c96a 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -118,9 +118,9 @@ #include "scm.h" -spinlock_t unix_table_locks[2 * UNIX_HASH_SIZE]; +spinlock_t unix_table_locks[UNIX_HASH_SIZE]; EXPORT_SYMBOL_GPL(unix_table_locks); -struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE]; +struct hlist_head unix_socket_table[UNIX_HASH_SIZE]; EXPORT_SYMBOL_GPL(unix_socket_table); static atomic_long_t unix_nr_socks; @@ -137,12 +137,12 @@ static unsigned int unix_unbound_hash(struct sock *sk) hash ^= hash >> 8; hash ^= sk->sk_type; - return UNIX_HASH_SIZE + (hash & (UNIX_HASH_SIZE - 1)); + return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD); } static unsigned int unix_bsd_hash(struct inode *i) { - return i->i_ino & (UNIX_HASH_SIZE - 1); + return i->i_ino & UNIX_HASH_MOD; } static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr, @@ -155,14 +155,14 @@ static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr, hash ^= hash >> 8; hash ^= type; - return hash & (UNIX_HASH_SIZE - 1); + return hash & UNIX_HASH_MOD; } static void unix_table_double_lock(unsigned int hash1, unsigned int hash2) { /* hash1 and hash2 is never the same because - * one is between 0 and UNIX_HASH_SIZE - 1, and - * another is between UNIX_HASH_SIZE and UNIX_HASH_SIZE * 2. 
+ * one is between 0 and UNIX_HASH_MOD, and + * another is between UNIX_HASH_MOD + 1 and UNIX_HASH_SIZE - 1. */ if (hash1 > hash2) swap(hash1, hash2); @@ -3239,7 +3239,7 @@ static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos) unsigned long bucket = get_bucket(*pos); struct sock *sk; - while (bucket < ARRAY_SIZE(unix_socket_table)) { + while (bucket < UNIX_HASH_SIZE) { spin_lock(&unix_table_locks[bucket]); sk = unix_from_bucket(seq, pos); @@ -3666,7 +3666,7 @@ static int __init af_unix_init(void) BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb)); - for (i = 0; i < 2 * UNIX_HASH_SIZE; i++) + for (i = 0; i < UNIX_HASH_SIZE; i++) spin_lock_init(&unix_table_locks[i]); rc = proto_register(&unix_dgram_proto, 1); -- cgit From b6e811383062f88212082714db849127fa95142c Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 21 Jun 2022 10:19:10 -0700 Subject: af_unix: Define a per-netns hash table. This commit adds a per netns hash table for AF_UNIX, which size is fixed as UNIX_HASH_SIZE for now. The first implementation defines a per-netns hash table as a single array of lock and list: struct unix_hashbucket { spinlock_t lock; struct hlist_head head; }; struct netns_unix { struct unix_hashbucket *hash; ... }; But, Eric pointed out memory cost that the structure has holes because of sizeof(spinlock_t), which is 4 (or more if LOCKDEP is enabled). [0] It could be expensive on a host with thousands of netns and few AF_UNIX sockets. For this reason, a per-netns hash table uses two dense arrays. struct unix_table { spinlock_t *locks; struct hlist_head *buckets; }; struct netns_unix { struct unix_table table; ... }; Note the length of the list has a significant impact rather than lock contention, so having shared locks can be an option. But, per-netns locks and lists still perform better than the global locks and per-netns lists. [1] Also, this patch adds a change so that struct netns_unix disappears from struct net if CONFIG_UNIX is disabled. [0]: https://lore.kernel.org/netdev/CANn89iLVxO5aqx16azNU7p7Z-nz5NrnM5QTqOzueVxEnkVTxyg@mail.gmail.com/ [1]: https://lore.kernel.org/netdev/20220617175215.1769-1-kuniyu@amazon.com/ Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. 
Miller --- net/unix/af_unix.c | 38 ++++++++++++++++++++++++++++++++------ 1 file changed, 32 insertions(+), 6 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index c0804ae9c96a..cdd12881a39d 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -3559,7 +3559,7 @@ static const struct net_proto_family unix_family_ops = { static int __net_init unix_net_init(struct net *net) { - int error = -ENOMEM; + int i; net->unx.sysctl_max_dgram_qlen = 10; if (unix_sysctl_register(net)) @@ -3567,18 +3567,44 @@ static int __net_init unix_net_init(struct net *net) #ifdef CONFIG_PROC_FS if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops, - sizeof(struct seq_net_private))) { - unix_sysctl_unregister(net); - goto out; + sizeof(struct seq_net_private))) + goto err_sysctl; +#endif + + net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE, + sizeof(spinlock_t), GFP_KERNEL); + if (!net->unx.table.locks) + goto err_proc; + + net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE, + sizeof(struct hlist_head), + GFP_KERNEL); + if (!net->unx.table.buckets) + goto free_locks; + + for (i = 0; i < UNIX_HASH_SIZE; i++) { + spin_lock_init(&net->unx.table.locks[i]); + INIT_HLIST_HEAD(&net->unx.table.buckets[i]); } + + return 0; + +free_locks: + kvfree(net->unx.table.locks); +err_proc: +#ifdef CONFIG_PROC_FS + remove_proc_entry("unix", net->proc_net); +err_sysctl: #endif - error = 0; + unix_sysctl_unregister(net); out: - return error; + return -ENOMEM; } static void __net_exit unix_net_exit(struct net *net) { + kvfree(net->unx.table.buckets); + kvfree(net->unx.table.locks); unix_sysctl_unregister(net); remove_proc_entry("unix", net->proc_net); } -- cgit From 79b05beaa5c340e1649dc7462e056ae33b916131 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 21 Jun 2022 10:19:11 -0700 Subject: af_unix: Acquire/Release per-netns hash table's locks. This commit adds extra spin_lock/spin_unlock() for a per-netns hash table inside the existing ones for unix_table_locks. As of this commit, sockets are still linked in the global hash table. After putting sockets in a per-netns hash table and removing the old one in the next patch, we remove the global locks in the last patch. Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. 
Miller --- net/unix/af_unix.c | 75 +++++++++++++++++++++++++++++++++++++----------------- 1 file changed, 51 insertions(+), 24 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index cdd12881a39d..79f8fc5cdce8 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -158,7 +158,8 @@ static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr, return hash & UNIX_HASH_MOD; } -static void unix_table_double_lock(unsigned int hash1, unsigned int hash2) +static void unix_table_double_lock(struct net *net, + unsigned int hash1, unsigned int hash2) { /* hash1 and hash2 is never the same because * one is between 0 and UNIX_HASH_MOD, and @@ -169,10 +170,17 @@ static void unix_table_double_lock(unsigned int hash1, unsigned int hash2) spin_lock(&unix_table_locks[hash1]); spin_lock_nested(&unix_table_locks[hash2], SINGLE_DEPTH_NESTING); + + spin_lock(&net->unx.table.locks[hash1]); + spin_lock_nested(&net->unx.table.locks[hash2], SINGLE_DEPTH_NESTING); } -static void unix_table_double_unlock(unsigned int hash1, unsigned int hash2) +static void unix_table_double_unlock(struct net *net, + unsigned int hash1, unsigned int hash2) { + spin_unlock(&net->unx.table.locks[hash1]); + spin_unlock(&net->unx.table.locks[hash2]); + spin_unlock(&unix_table_locks[hash1]); spin_unlock(&unix_table_locks[hash2]); } @@ -316,17 +324,21 @@ static void __unix_set_addr_hash(struct sock *sk, struct unix_address *addr, __unix_insert_socket(sk); } -static void unix_remove_socket(struct sock *sk) +static void unix_remove_socket(struct net *net, struct sock *sk) { spin_lock(&unix_table_locks[sk->sk_hash]); + spin_lock(&net->unx.table.locks[sk->sk_hash]); __unix_remove_socket(sk); + spin_unlock(&net->unx.table.locks[sk->sk_hash]); spin_unlock(&unix_table_locks[sk->sk_hash]); } -static void unix_insert_unbound_socket(struct sock *sk) +static void unix_insert_unbound_socket(struct net *net, struct sock *sk) { spin_lock(&unix_table_locks[sk->sk_hash]); + spin_lock(&net->unx.table.locks[sk->sk_hash]); __unix_insert_socket(sk); + spin_unlock(&net->unx.table.locks[sk->sk_hash]); spin_unlock(&unix_table_locks[sk->sk_hash]); } @@ -356,28 +368,33 @@ static inline struct sock *unix_find_socket_byname(struct net *net, struct sock *s; spin_lock(&unix_table_locks[hash]); + spin_lock(&net->unx.table.locks[hash]); s = __unix_find_socket_byname(net, sunname, len, hash); if (s) sock_hold(s); + spin_unlock(&net->unx.table.locks[hash]); spin_unlock(&unix_table_locks[hash]); return s; } -static struct sock *unix_find_socket_byinode(struct inode *i) +static struct sock *unix_find_socket_byinode(struct net *net, struct inode *i) { unsigned int hash = unix_bsd_hash(i); struct sock *s; spin_lock(&unix_table_locks[hash]); + spin_lock(&net->unx.table.locks[hash]); sk_for_each(s, &unix_socket_table[hash]) { struct dentry *dentry = unix_sk(s)->path.dentry; if (dentry && d_backing_inode(dentry) == i) { sock_hold(s); + spin_unlock(&net->unx.table.locks[hash]); spin_unlock(&unix_table_locks[hash]); return s; } } + spin_unlock(&net->unx.table.locks[hash]); spin_unlock(&unix_table_locks[hash]); return NULL; } @@ -576,12 +593,12 @@ static void unix_sock_destructor(struct sock *sk) static void unix_release_sock(struct sock *sk, int embrion) { struct unix_sock *u = unix_sk(sk); - struct path path; struct sock *skpair; struct sk_buff *skb; + struct path path; int state; - unix_remove_socket(sk); + unix_remove_socket(sock_net(sk), sk); /* Clear state */ unix_state_lock(sk); @@ -930,7 +947,7 @@ static struct 
sock *unix_create1(struct net *net, struct socket *sock, int kern, init_waitqueue_head(&u->peer_wait); init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay); memset(&u->scm_stat, 0, sizeof(struct scm_stat)); - unix_insert_unbound_socket(sk); + unix_insert_unbound_socket(net, sk); sock_prot_inuse_add(net, sk->sk_prot, 1); @@ -1015,7 +1032,7 @@ static struct sock *unix_find_bsd(struct net *net, struct sockaddr_un *sunaddr, if (!S_ISSOCK(inode->i_mode)) goto path_put; - sk = unix_find_socket_byinode(inode); + sk = unix_find_socket_byinode(net, inode); if (!sk) goto path_put; @@ -1074,6 +1091,7 @@ static int unix_autobind(struct sock *sk) { unsigned int new_hash, old_hash = sk->sk_hash; struct unix_sock *u = unix_sk(sk); + struct net *net = sock_net(sk); struct unix_address *addr; u32 lastnum, ordernum; int err; @@ -1102,11 +1120,10 @@ retry: sprintf(addr->name->sun_path + 1, "%05x", ordernum); new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type); - unix_table_double_lock(old_hash, new_hash); + unix_table_double_lock(net, old_hash, new_hash); - if (__unix_find_socket_byname(sock_net(sk), addr->name, addr->len, - new_hash)) { - unix_table_double_unlock(old_hash, new_hash); + if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) { + unix_table_double_unlock(net, old_hash, new_hash); /* __unix_find_socket_byname() may take long time if many names * are already in use. @@ -1124,7 +1141,7 @@ retry: } __unix_set_addr_hash(sk, addr, new_hash); - unix_table_double_unlock(old_hash, new_hash); + unix_table_double_unlock(net, old_hash, new_hash); err = 0; out: mutex_unlock(&u->bindlock); @@ -1138,6 +1155,7 @@ static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr, (SOCK_INODE(sk->sk_socket)->i_mode & ~current_umask()); unsigned int new_hash, old_hash = sk->sk_hash; struct unix_sock *u = unix_sk(sk); + struct net *net = sock_net(sk); struct user_namespace *ns; // barf... 
struct unix_address *addr; struct dentry *dentry; @@ -1178,11 +1196,11 @@ static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr, goto out_unlock; new_hash = unix_bsd_hash(d_backing_inode(dentry)); - unix_table_double_lock(old_hash, new_hash); + unix_table_double_lock(net, old_hash, new_hash); u->path.mnt = mntget(parent.mnt); u->path.dentry = dget(dentry); __unix_set_addr_hash(sk, addr, new_hash); - unix_table_double_unlock(old_hash, new_hash); + unix_table_double_unlock(net, old_hash, new_hash); mutex_unlock(&u->bindlock); done_path_create(&parent, dentry); return 0; @@ -1205,6 +1223,7 @@ static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr, { unsigned int new_hash, old_hash = sk->sk_hash; struct unix_sock *u = unix_sk(sk); + struct net *net = sock_net(sk); struct unix_address *addr; int err; @@ -1222,19 +1241,18 @@ static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr, } new_hash = unix_abstract_hash(addr->name, addr->len, sk->sk_type); - unix_table_double_lock(old_hash, new_hash); + unix_table_double_lock(net, old_hash, new_hash); - if (__unix_find_socket_byname(sock_net(sk), addr->name, addr->len, - new_hash)) + if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) goto out_spin; __unix_set_addr_hash(sk, addr, new_hash); - unix_table_double_unlock(old_hash, new_hash); + unix_table_double_unlock(net, old_hash, new_hash); mutex_unlock(&u->bindlock); return 0; out_spin: - unix_table_double_unlock(old_hash, new_hash); + unix_table_double_unlock(net, old_hash, new_hash); err = -EADDRINUSE; out_mutex: mutex_unlock(&u->bindlock); @@ -3237,15 +3255,18 @@ static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos) static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos) { unsigned long bucket = get_bucket(*pos); + struct net *net = seq_file_net(seq); struct sock *sk; while (bucket < UNIX_HASH_SIZE) { spin_lock(&unix_table_locks[bucket]); + spin_lock(&net->unx.table.locks[bucket]); sk = unix_from_bucket(seq, pos); if (sk) return sk; + spin_unlock(&net->unx.table.locks[bucket]); spin_unlock(&unix_table_locks[bucket]); *pos = set_bucket_offset(++bucket, 1); @@ -3258,11 +3279,13 @@ static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk, loff_t *pos) { unsigned long bucket = get_bucket(*pos); + struct net *net = seq_file_net(seq); for (sk = sk_next(sk); sk; sk = sk_next(sk)) - if (sock_net(sk) == seq_file_net(seq)) + if (sock_net(sk) == net) return sk; + spin_unlock(&net->unx.table.locks[bucket]); spin_unlock(&unix_table_locks[bucket]); *pos = set_bucket_offset(++bucket, 1); @@ -3292,8 +3315,10 @@ static void unix_seq_stop(struct seq_file *seq, void *v) { struct sock *sk = v; - if (sk) + if (sk) { + spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]); spin_unlock(&unix_table_locks[sk->sk_hash]); + } } static int unix_seq_show(struct seq_file *seq, void *v) @@ -3381,6 +3406,7 @@ static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk) { struct bpf_unix_iter_state *iter = seq->private; + struct net *net = seq_file_net(seq); unsigned int expected = 1; struct sock *sk; @@ -3388,7 +3414,7 @@ static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk) iter->batch[iter->end_sk++] = start_sk; for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) { - if (sock_net(sk) != seq_file_net(seq)) + if (sock_net(sk) != net) continue; if (iter->end_sk < iter->max_sk) { @@ -3399,6 +3425,7 @@ static int bpf_iter_unix_hold_batch(struct seq_file *seq, 
struct sock *start_sk) expected++; } + spin_unlock(&net->unx.table.locks[start_sk->sk_hash]); spin_unlock(&unix_table_locks[start_sk->sk_hash]); return expected; -- cgit From cf2f225e2653734e66e91c09e1cbe004bfd3d4a7 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 21 Jun 2022 10:19:12 -0700 Subject: af_unix: Put a socket into a per-netns hash table. This commit replaces the global hash table with a per-netns one and removes the global one. We now link a socket in each netns's hash table so we can save some netns comparisons when iterating through a hash bucket. Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- net/unix/af_unix.c | 50 ++++++++++++++++++++------------------------------ 1 file changed, 20 insertions(+), 30 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 79f8fc5cdce8..9d0b07235dbc 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -120,8 +120,6 @@ spinlock_t unix_table_locks[UNIX_HASH_SIZE]; EXPORT_SYMBOL_GPL(unix_table_locks); -struct hlist_head unix_socket_table[UNIX_HASH_SIZE]; -EXPORT_SYMBOL_GPL(unix_socket_table); static atomic_long_t unix_nr_socks; /* SMP locking strategy: @@ -308,20 +306,20 @@ static void __unix_remove_socket(struct sock *sk) sk_del_node_init(sk); } -static void __unix_insert_socket(struct sock *sk) +static void __unix_insert_socket(struct net *net, struct sock *sk) { DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk)); - sk_add_node(sk, &unix_socket_table[sk->sk_hash]); + sk_add_node(sk, &net->unx.table.buckets[sk->sk_hash]); } -static void __unix_set_addr_hash(struct sock *sk, struct unix_address *addr, - unsigned int hash) +static void __unix_set_addr_hash(struct net *net, struct sock *sk, + struct unix_address *addr, unsigned int hash) { __unix_remove_socket(sk); smp_store_release(&unix_sk(sk)->addr, addr); sk->sk_hash = hash; - __unix_insert_socket(sk); + __unix_insert_socket(net, sk); } static void unix_remove_socket(struct net *net, struct sock *sk) @@ -337,7 +335,7 @@ static void unix_insert_unbound_socket(struct net *net, struct sock *sk) { spin_lock(&unix_table_locks[sk->sk_hash]); spin_lock(&net->unx.table.locks[sk->sk_hash]); - __unix_insert_socket(sk); + __unix_insert_socket(net, sk); spin_unlock(&net->unx.table.locks[sk->sk_hash]); spin_unlock(&unix_table_locks[sk->sk_hash]); } @@ -348,12 +346,9 @@ static struct sock *__unix_find_socket_byname(struct net *net, { struct sock *s; - sk_for_each(s, &unix_socket_table[hash]) { + sk_for_each(s, &net->unx.table.buckets[hash]) { struct unix_sock *u = unix_sk(s); - if (!net_eq(sock_net(s), net)) - continue; - if (u->addr->len == len && !memcmp(u->addr->name, sunname, len)) return s; @@ -384,7 +379,7 @@ static struct sock *unix_find_socket_byinode(struct net *net, struct inode *i) spin_lock(&unix_table_locks[hash]); spin_lock(&net->unx.table.locks[hash]); - sk_for_each(s, &unix_socket_table[hash]) { + sk_for_each(s, &net->unx.table.buckets[hash]) { struct dentry *dentry = unix_sk(s)->path.dentry; if (dentry && d_backing_inode(dentry) == i) { @@ -1140,7 +1135,7 @@ retry: goto retry; } - __unix_set_addr_hash(sk, addr, new_hash); + __unix_set_addr_hash(net, sk, addr, new_hash); unix_table_double_unlock(net, old_hash, new_hash); err = 0; @@ -1199,7 +1194,7 @@ static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr, unix_table_double_lock(net, old_hash, new_hash); u->path.mnt = mntget(parent.mnt); u->path.dentry = dget(dentry); - __unix_set_addr_hash(sk, addr, new_hash); + __unix_set_addr_hash(net, sk, 
addr, new_hash); unix_table_double_unlock(net, old_hash, new_hash); mutex_unlock(&u->bindlock); done_path_create(&parent, dentry); @@ -1246,7 +1241,7 @@ static int unix_bind_abstract(struct sock *sk, struct sockaddr_un *sunaddr, if (__unix_find_socket_byname(net, addr->name, addr->len, new_hash)) goto out_spin; - __unix_set_addr_hash(sk, addr, new_hash); + __unix_set_addr_hash(net, sk, addr, new_hash); unix_table_double_unlock(net, old_hash, new_hash); mutex_unlock(&u->bindlock); return 0; @@ -3239,12 +3234,11 @@ static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos) { unsigned long offset = get_offset(*pos); unsigned long bucket = get_bucket(*pos); - struct sock *sk; unsigned long count = 0; + struct sock *sk; - for (sk = sk_head(&unix_socket_table[bucket]); sk; sk = sk_next(sk)) { - if (sock_net(sk) != seq_file_net(seq)) - continue; + for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]); + sk; sk = sk_next(sk)) { if (++count == offset) break; } @@ -3279,13 +3273,13 @@ static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk, loff_t *pos) { unsigned long bucket = get_bucket(*pos); - struct net *net = seq_file_net(seq); - for (sk = sk_next(sk); sk; sk = sk_next(sk)) - if (sock_net(sk) == net) - return sk; + sk = sk_next(sk); + if (sk) + return sk; + - spin_unlock(&net->unx.table.locks[bucket]); + spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]); spin_unlock(&unix_table_locks[bucket]); *pos = set_bucket_offset(++bucket, 1); @@ -3406,7 +3400,6 @@ static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk) { struct bpf_unix_iter_state *iter = seq->private; - struct net *net = seq_file_net(seq); unsigned int expected = 1; struct sock *sk; @@ -3414,9 +3407,6 @@ static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk) iter->batch[iter->end_sk++] = start_sk; for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) { - if (sock_net(sk) != net) - continue; - if (iter->end_sk < iter->max_sk) { sock_hold(sk); iter->batch[iter->end_sk++] = sk; @@ -3425,7 +3415,7 @@ static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk) expected++; } - spin_unlock(&net->unx.table.locks[start_sk->sk_hash]); + spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]); spin_unlock(&unix_table_locks[start_sk->sk_hash]); return expected; -- cgit From 2f7ca90a0188b57a54d3b1159eb7874427a7e07a Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 21 Jun 2022 10:19:13 -0700 Subject: af_unix: Remove unix_table_locks. unix_table_locks are to protect the global hash table, unix_socket_table. The previous commit removed it, so let's clean up the unnecessary locks. Here is a test result on EC2 c5.9xlarge where 10 processes run concurrently in different netns and bind 100,000 sockets for each. without this series : 1m 38s with this series : 11s It is ~10x faster because the global hash table is split into 10 netns in this case. Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. 
Miller --- net/unix/af_unix.c | 42 ++++++++---------------------------------- 1 file changed, 8 insertions(+), 34 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 9d0b07235dbc..49f6626330c3 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -118,13 +118,11 @@ #include "scm.h" -spinlock_t unix_table_locks[UNIX_HASH_SIZE]; -EXPORT_SYMBOL_GPL(unix_table_locks); static atomic_long_t unix_nr_socks; /* SMP locking strategy: - * hash table is protected with spinlock unix_table_locks - * each socket state is protected by separate spin lock. + * hash table is protected with spinlock. + * each socket state is protected by separate spinlock. */ static unsigned int unix_unbound_hash(struct sock *sk) @@ -166,9 +164,6 @@ static void unix_table_double_lock(struct net *net, if (hash1 > hash2) swap(hash1, hash2); - spin_lock(&unix_table_locks[hash1]); - spin_lock_nested(&unix_table_locks[hash2], SINGLE_DEPTH_NESTING); - spin_lock(&net->unx.table.locks[hash1]); spin_lock_nested(&net->unx.table.locks[hash2], SINGLE_DEPTH_NESTING); } @@ -178,9 +173,6 @@ static void unix_table_double_unlock(struct net *net, { spin_unlock(&net->unx.table.locks[hash1]); spin_unlock(&net->unx.table.locks[hash2]); - - spin_unlock(&unix_table_locks[hash1]); - spin_unlock(&unix_table_locks[hash2]); } #ifdef CONFIG_SECURITY_NETWORK @@ -324,20 +316,16 @@ static void __unix_set_addr_hash(struct net *net, struct sock *sk, static void unix_remove_socket(struct net *net, struct sock *sk) { - spin_lock(&unix_table_locks[sk->sk_hash]); spin_lock(&net->unx.table.locks[sk->sk_hash]); __unix_remove_socket(sk); spin_unlock(&net->unx.table.locks[sk->sk_hash]); - spin_unlock(&unix_table_locks[sk->sk_hash]); } static void unix_insert_unbound_socket(struct net *net, struct sock *sk) { - spin_lock(&unix_table_locks[sk->sk_hash]); spin_lock(&net->unx.table.locks[sk->sk_hash]); __unix_insert_socket(net, sk); spin_unlock(&net->unx.table.locks[sk->sk_hash]); - spin_unlock(&unix_table_locks[sk->sk_hash]); } static struct sock *__unix_find_socket_byname(struct net *net, @@ -362,13 +350,11 @@ static inline struct sock *unix_find_socket_byname(struct net *net, { struct sock *s; - spin_lock(&unix_table_locks[hash]); spin_lock(&net->unx.table.locks[hash]); s = __unix_find_socket_byname(net, sunname, len, hash); if (s) sock_hold(s); spin_unlock(&net->unx.table.locks[hash]); - spin_unlock(&unix_table_locks[hash]); return s; } @@ -377,7 +363,6 @@ static struct sock *unix_find_socket_byinode(struct net *net, struct inode *i) unsigned int hash = unix_bsd_hash(i); struct sock *s; - spin_lock(&unix_table_locks[hash]); spin_lock(&net->unx.table.locks[hash]); sk_for_each(s, &net->unx.table.buckets[hash]) { struct dentry *dentry = unix_sk(s)->path.dentry; @@ -385,12 +370,10 @@ static struct sock *unix_find_socket_byinode(struct net *net, struct inode *i) if (dentry && d_backing_inode(dentry) == i) { sock_hold(s); spin_unlock(&net->unx.table.locks[hash]); - spin_unlock(&unix_table_locks[hash]); return s; } } spin_unlock(&net->unx.table.locks[hash]); - spin_unlock(&unix_table_locks[hash]); return NULL; } @@ -1551,9 +1534,9 @@ restart: * * The contents of *(otheru->addr) and otheru->path * are seen fully set up here, since we have found - * otheru in hash under unix_table_locks. Insertion - * into the hash chain we'd found it in had been done - * in an earlier critical area protected by unix_table_locks, + * otheru in hash under its lock. 
Insertion into the
+	   hash chain we'd found it in had been done in an
+	   earlier critical area protected by the chain's lock,
 	 * the same one where we'd set *(otheru->addr) contents,
 	 * as well as otheru->path and otheru->addr itself.
 	 *
@@ -3253,7 +3236,6 @@ static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
 	struct sock *sk;
 	while (bucket < UNIX_HASH_SIZE) {
-		spin_lock(&unix_table_locks[bucket]);
 		spin_lock(&net->unx.table.locks[bucket]);
 		sk = unix_from_bucket(seq, pos);
@@ -3261,7 +3243,6 @@ static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
 			return sk;
 		spin_unlock(&net->unx.table.locks[bucket]);
-		spin_unlock(&unix_table_locks[bucket]);
 		*pos = set_bucket_offset(++bucket, 1);
 	}
@@ -3280,7 +3261,6 @@ static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
 	spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]);
-	spin_unlock(&unix_table_locks[bucket]);
 	*pos = set_bucket_offset(++bucket, 1);
@@ -3309,10 +3289,8 @@ static void unix_seq_stop(struct seq_file *seq, void *v)
 {
 	struct sock *sk = v;
-	if (sk) {
+	if (sk)
 		spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]);
-		spin_unlock(&unix_table_locks[sk->sk_hash]);
-	}
 }
 static int unix_seq_show(struct seq_file *seq, void *v)
@@ -3337,7 +3315,7 @@ static int unix_seq_show(struct seq_file *seq, void *v)
 			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
 			sock_i_ino(s));
-		if (u->addr) {	// under unix_table_locks here
+		if (u->addr) {	// under a hash table lock here
 			int i, len;
 			seq_putc(seq, ' ');
@@ -3416,7 +3394,6 @@ static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
 	}
 	spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]);
-	spin_unlock(&unix_table_locks[start_sk->sk_hash]);
 	return expected;
 }
@@ -3705,13 +3682,10 @@ static void __init bpf_iter_register(void)
 static int __init af_unix_init(void)
 {
-	int i, rc = -1;
+	int rc = -1;
 	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));
-	for (i = 0; i < UNIX_HASH_SIZE; i++)
-		spin_lock_init(&unix_table_locks[i]);
-
 	rc = proto_register(&unix_dgram_proto, 1);
 	if (rc != 0) {
 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
-- cgit


From 51bae889fe111e418321ff0e6bb5f67e64cb9042 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima
Date: Sat, 2 Jul 2022 08:48:17 -0700
Subject: af_unix: Put pathname sockets in the global hash table.

Commit cf2f225e2653 ("af_unix: Put a socket into a per-netns hash table.")
accidentally broke the user API for pathname sockets.  A socket was able
to connect() to a pathname socket whose file was visible even if they
were in different network namespaces.

The commit puts all sockets into a per-netns hash table.  As a result,
connect() to a pathname socket in a different netns fails to find it in
the caller's per-netns hash table and returns -ECONNREFUSED even when the
task can view the peer socket file.

We can reproduce this issue by:

  Console A:
  # python3
  >>> from socket import *
  >>> s = socket(AF_UNIX, SOCK_STREAM, 0)
  >>> s.bind('test')
  >>> s.listen(32)

  Console B:
  # ip netns add test
  # ip netns exec test sh
  # python3
  >>> from socket import *
  >>> s = socket(AF_UNIX, SOCK_STREAM, 0)
  >>> s.connect('test')

Note that when dumping sockets by sock_diag, procfs, and bpf_iter, they
are filtered only by netns.  In other words, even if they are visible and
connect()able, all sockets in different netns are skipped while iterating
sockets.  Thus, we need a fix only for finding a peer pathname socket.
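For reference, the interactive session above can also be reproduced by a
single C program.  The sketch below is illustrative only (the socket path,
the use of unshare(CLONE_NEWNET) instead of ip-netns, and the missing
error handling are editorial assumptions, not taken from the original
report), and it needs root:

  #define _GNU_SOURCE
  #include <sched.h>
  #include <stdio.h>
  #include <string.h>
  #include <sys/socket.h>
  #include <sys/un.h>
  #include <sys/wait.h>
  #include <unistd.h>

  int main(void)
  {
          struct sockaddr_un addr = { .sun_family = AF_UNIX };
          int server, client;

          strcpy(addr.sun_path, "/tmp/unix_netns_test");  /* illustrative path */
          unlink(addr.sun_path);

          server = socket(AF_UNIX, SOCK_STREAM, 0);
          bind(server, (struct sockaddr *)&addr, sizeof(addr));
          listen(server, 32);

          if (fork() == 0) {
                  /* child: enter a fresh netns; the socket file is still
                   * visible on the shared filesystem
                   */
                  if (unshare(CLONE_NEWNET))
                          perror("unshare");

                  client = socket(AF_UNIX, SOCK_STREAM, 0);
                  if (connect(client, (struct sockaddr *)&addr, sizeof(addr)))
                          perror("connect");  /* ECONNREFUSED before this fix */
                  else
                          puts("connect() succeeded");
                  _exit(0);
          }

          wait(NULL);
          unlink(addr.sun_path);
          return 0;
  }

Before this fix the child's connect() is expected to fail with
-ECONNREFUSED; with the fix it should succeed.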
This patch adds a global hash table for pathname sockets, links them with sk_bind_node, and uses it in unix_find_socket_byinode(). By doing so, we can keep sockets in per-netns hash tables and dump them easily. Thanks to Sachin Sant and Leonard Crestez for reports, logs and a reproducer. Fixes: cf2f225e2653 ("af_unix: Put a socket into a per-netns hash table.") Reported-by: Sachin Sant Reported-by: Leonard Crestez Tested-by: Sachin Sant Tested-by: Nathan Chancellor Signed-off-by: Kuniyuki Iwashima Tested-by: Leonard Crestez Signed-off-by: Paolo Abeni --- net/unix/af_unix.c | 47 +++++++++++++++++++++++++++++++++++++---------- 1 file changed, 37 insertions(+), 10 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 49f6626330c3..526b872cc710 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -119,6 +119,8 @@ #include "scm.h" static atomic_long_t unix_nr_socks; +static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2]; +static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2]; /* SMP locking strategy: * hash table is protected with spinlock. @@ -328,6 +330,24 @@ static void unix_insert_unbound_socket(struct net *net, struct sock *sk) spin_unlock(&net->unx.table.locks[sk->sk_hash]); } +static void unix_insert_bsd_socket(struct sock *sk) +{ + spin_lock(&bsd_socket_locks[sk->sk_hash]); + sk_add_bind_node(sk, &bsd_socket_buckets[sk->sk_hash]); + spin_unlock(&bsd_socket_locks[sk->sk_hash]); +} + +static void unix_remove_bsd_socket(struct sock *sk) +{ + if (!hlist_unhashed(&sk->sk_bind_node)) { + spin_lock(&bsd_socket_locks[sk->sk_hash]); + __sk_del_bind_node(sk); + spin_unlock(&bsd_socket_locks[sk->sk_hash]); + + sk_node_init(&sk->sk_bind_node); + } +} + static struct sock *__unix_find_socket_byname(struct net *net, struct sockaddr_un *sunname, int len, unsigned int hash) @@ -358,22 +378,22 @@ static inline struct sock *unix_find_socket_byname(struct net *net, return s; } -static struct sock *unix_find_socket_byinode(struct net *net, struct inode *i) +static struct sock *unix_find_socket_byinode(struct inode *i) { unsigned int hash = unix_bsd_hash(i); struct sock *s; - spin_lock(&net->unx.table.locks[hash]); - sk_for_each(s, &net->unx.table.buckets[hash]) { + spin_lock(&bsd_socket_locks[hash]); + sk_for_each_bound(s, &bsd_socket_buckets[hash]) { struct dentry *dentry = unix_sk(s)->path.dentry; if (dentry && d_backing_inode(dentry) == i) { sock_hold(s); - spin_unlock(&net->unx.table.locks[hash]); + spin_unlock(&bsd_socket_locks[hash]); return s; } } - spin_unlock(&net->unx.table.locks[hash]); + spin_unlock(&bsd_socket_locks[hash]); return NULL; } @@ -577,6 +597,7 @@ static void unix_release_sock(struct sock *sk, int embrion) int state; unix_remove_socket(sock_net(sk), sk); + unix_remove_bsd_socket(sk); /* Clear state */ unix_state_lock(sk); @@ -988,8 +1009,8 @@ static int unix_release(struct socket *sock) return 0; } -static struct sock *unix_find_bsd(struct net *net, struct sockaddr_un *sunaddr, - int addr_len, int type) +static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len, + int type) { struct inode *inode; struct path path; @@ -1010,7 +1031,7 @@ static struct sock *unix_find_bsd(struct net *net, struct sockaddr_un *sunaddr, if (!S_ISSOCK(inode->i_mode)) goto path_put; - sk = unix_find_socket_byinode(net, inode); + sk = unix_find_socket_byinode(inode); if (!sk) goto path_put; @@ -1058,7 +1079,7 @@ static struct sock *unix_find_other(struct net *net, struct sock *sk; if (sunaddr->sun_path[0]) - sk = 
unix_find_bsd(net, sunaddr, addr_len, type);
+		sk = unix_find_bsd(sunaddr, addr_len, type);
 	else
 		sk = unix_find_abstract(net, sunaddr, addr_len, type);
@@ -1179,6 +1200,7 @@ static int unix_bind_bsd(struct sock *sk, struct sockaddr_un *sunaddr,
 	u->path.dentry = dget(dentry);
 	__unix_set_addr_hash(net, sk, addr, new_hash);
 	unix_table_double_unlock(net, old_hash, new_hash);
+	unix_insert_bsd_socket(sk);
 	mutex_unlock(&u->bindlock);
 	done_path_create(&parent, dentry);
 	return 0;
@@ -3682,10 +3704,15 @@ static void __init bpf_iter_register(void)
 static int __init af_unix_init(void)
 {
-	int rc = -1;
+	int i, rc = -1;
 	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));
+	for (i = 0; i < UNIX_HASH_SIZE / 2; i++) {
+		spin_lock_init(&bsd_socket_locks[i]);
+		INIT_HLIST_HEAD(&bsd_socket_buckets[i]);
+	}
+
 	rc = proto_register(&unix_dgram_proto, 1);
 	if (rc != 0) {
 		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
-- cgit


From cf21b355ccb39b0de0b6a7362532bb5584c84a80 Mon Sep 17 00:00:00 2001
From: Kuniyuki Iwashima
Date: Tue, 5 Jul 2022 16:37:15 -0700
Subject: af_unix: Optimise hash table layout.

Commit 6dd4142fb5a9 ("Merge branch 'af_unix-per-netns-socket-hash'") and
commit 51bae889fe11 ("af_unix: Put pathname sockets in the global hash
table.") changed the hash table layout.

  Before:

    unix_socket_table  [0   - 255] : abstract & pathname sockets
                       [256 - 511] : unnamed sockets

  After:

    per-netns table    [0   - 255] : abstract & pathname sockets
                       [256 - 511] : unnamed sockets
    bsd_socket_table   [0   - 255] : pathname sockets (sk_bind_node)

Now, while looking up sockets, we traverse the global table for the
pathname sockets and the first half of each per-netns hash table for
abstract sockets, where pathname sockets are also linked.  Thus, the more
pathname sockets we have, the longer we take to look up abstract sockets.
This characteristic has been there before the layout change, but we can
improve it now.

This patch changes the per-netns hash table's layout so that sockets not
requiring lookup reside in the first half and do not impact the lookup of
abstract sockets.

    per-netns table    [0   - 255] : pathname & unnamed sockets
                       [256 - 511] : abstract sockets
    bsd_socket_table   [0   - 255] : pathname sockets (sk_bind_node)

We ran a test that bind()s 100,000 abstract and 100,000 pathname sockets
each, then bind()s an abstract socket 100,000 times and measures the time
spent in __unix_find_socket_byname().  The result shows that the patch
makes each lookup faster.
Without this patch:

  $ sudo ./funclatency -p 2278 --microseconds __unix_find_socket_byname.isra.44
       usec       : count    distribution
         0 -> 1   : 0        |                                        |
         2 -> 3   : 0        |                                        |
         4 -> 7   : 0        |                                        |
         8 -> 15  : 126      |                                        |
        16 -> 31  : 1438     |*                                       |
        32 -> 63  : 4150     |***                                     |
        64 -> 127 : 9049     |*******                                 |
       128 -> 255 : 37704    |*******************************         |
       256 -> 511 : 47533    |****************************************|

With this patch:

  $ sudo ./funclatency -p 3648 --microseconds __unix_find_socket_byname.isra.46
       usec       : count    distribution
         0 -> 1   : 109      |                                        |
         2 -> 3   : 318      |                                        |
         4 -> 7   : 725      |                                        |
         8 -> 15  : 2501     |*                                       |
        16 -> 31  : 3061     |**                                      |
        32 -> 63  : 4028     |***                                     |
        64 -> 127 : 9312     |*******                                 |
       128 -> 255 : 51372    |****************************************|
       256 -> 511 : 28574    |**********************                  |

Signed-off-by: Kuniyuki Iwashima
Link: https://lore.kernel.org/r/20220705233715.759-1-kuniyu@amazon.com
Signed-off-by: Paolo Abeni
---
 net/unix/af_unix.c | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

(limited to 'net/unix/af_unix.c')

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 526b872cc710..784b4b30ce9a 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -135,7 +135,7 @@ static unsigned int unix_unbound_hash(struct sock *sk)
 	hash ^= hash >> 8;
 	hash ^= sk->sk_type;
-	return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
+	return hash & UNIX_HASH_MOD;
 }
 static unsigned int unix_bsd_hash(struct inode *i)
@@ -153,16 +153,17 @@ static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
 	hash ^= hash >> 8;
 	hash ^= type;
-	return hash & UNIX_HASH_MOD;
+	return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
 }
 static void unix_table_double_lock(struct net *net,
 				   unsigned int hash1, unsigned int hash2)
 {
-	/* hash1 and hash2 is never the same because
-	 * one is between 0 and UNIX_HASH_MOD, and
-	 * another is between UNIX_HASH_MOD + 1 and UNIX_HASH_SIZE - 1.
-	 */
+	if (hash1 == hash2) {
+		spin_lock(&net->unx.table.locks[hash1]);
+		return;
+	}
+
 	if (hash1 > hash2)
 		swap(hash1, hash2);
@@ -173,6 +174,11 @@ static void unix_table_double_lock(struct net *net,
 static void unix_table_double_unlock(struct net *net,
 				     unsigned int hash1, unsigned int hash2)
 {
+	if (hash1 == hash2) {
+		spin_unlock(&net->unx.table.locks[hash1]);
+		return;
+	}
+
 	spin_unlock(&net->unx.table.locks[hash1]);
 	spin_unlock(&net->unx.table.locks[hash2]);
 }
-- cgit


From de43708924438fac2b0c04b099d50e3b523a5817 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai
Date: Wed, 17 Aug 2022 00:51:54 +0300
Subject: af_unix: Show number of inflight fds for sockets in TCP_LISTEN state too

TCP_LISTEN sockets are a special case.  They keep an skb with a newly
connected sock until accept() makes it a fully functional socket.  The
receive queue of such a socket may grow after the connected peer sends
messages to it.  Since these messages may contain scm_fds, we should
expose the correct fdinfo::scm_fds for listening sockets too.

Signed-off-by: Kirill Tkhai
Signed-off-by: David S.
Miller --- net/unix/af_unix.c | 36 +++++++++++++++++++++++++++++++++--- 1 file changed, 33 insertions(+), 3 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index bf338b782fc4..dea2972c8178 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -785,15 +785,45 @@ static int unix_set_peek_off(struct sock *sk, int val) } #ifdef CONFIG_PROC_FS +static int unix_count_nr_fds(struct sock *sk) +{ + struct sk_buff *skb; + struct unix_sock *u; + int nr_fds = 0; + + spin_lock(&sk->sk_receive_queue.lock); + skb = skb_peek(&sk->sk_receive_queue); + while (skb) { + u = unix_sk(skb->sk); + nr_fds += atomic_read(&u->scm_stat.nr_fds); + skb = skb_peek_next(skb, &sk->sk_receive_queue); + } + spin_unlock(&sk->sk_receive_queue.lock); + + return nr_fds; +} + static void unix_show_fdinfo(struct seq_file *m, struct socket *sock) { struct sock *sk = sock->sk; struct unix_sock *u; + int nr_fds; if (sk) { - u = unix_sk(sock->sk); - seq_printf(m, "scm_fds: %u\n", - atomic_read(&u->scm_stat.nr_fds)); + u = unix_sk(sk); + if (sock->type == SOCK_DGRAM) { + nr_fds = atomic_read(&u->scm_stat.nr_fds); + goto out_print; + } + + unix_state_lock(sk); + if (sk->sk_state != TCP_LISTEN) + nr_fds = atomic_read(&u->scm_stat.nr_fds); + else + nr_fds = unix_count_nr_fds(sk); + unix_state_unlock(sk); +out_print: + seq_printf(m, "scm_fds: %u\n", nr_fds); } } #else -- cgit From f5d39b020809146cc28e6e73369bf8065e0310aa Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 22 Aug 2022 13:18:22 +0200 Subject: freezer,sched: Rewrite core freezer logic Rewrite the core freezer to behave better wrt thawing and be simpler in general. By replacing PF_FROZEN with TASK_FROZEN, a special block state, it is ensured frozen tasks stay frozen until thawed and don't randomly wake up early, as is currently possible. As such, it does away with PF_FROZEN and PF_FREEZER_SKIP, freeing up two PF_flags (yay!). Specifically; the current scheme works a little like: freezer_do_not_count(); schedule(); freezer_count(); And either the task is blocked, or it lands in try_to_freezer() through freezer_count(). Now, when it is blocked, the freezer considers it frozen and continues. However, on thawing, once pm_freezing is cleared, freezer_count() stops working, and any random/spurious wakeup will let a task run before its time. That is, thawing tries to thaw things in explicit order; kernel threads and workqueues before doing bringing SMP back before userspace etc.. However due to the above mentioned races it is entirely possible for userspace tasks to thaw (by accident) before SMP is back. This can be a fatal problem in asymmetric ISA architectures (eg ARMv9) where the userspace task requires a special CPU to run. As said; replace this with a special task state TASK_FROZEN and add the following state transitions: TASK_FREEZABLE -> TASK_FROZEN __TASK_STOPPED -> TASK_FROZEN __TASK_TRACED -> TASK_FROZEN The new TASK_FREEZABLE can be set on any state part of TASK_NORMAL (IOW. TASK_INTERRUPTIBLE and TASK_UNINTERRUPTIBLE) -- any such state is already required to deal with spurious wakeups and the freezer causes one such when thawing the task (since the original state is lost). The special __TASK_{STOPPED,TRACED} states *can* be restored since their canonical state is in ->jobctl. With this, frozen tasks need an explicit TASK_FROZEN wakeup and are free of undue (early / spurious) wakeups. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Ingo Molnar Acked-by: Rafael J. 
Wysocki Link: https://lore.kernel.org/r/20220822114649.055452969@infradead.org --- net/unix/af_unix.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index bf338b782fc4..dda9eb1ab41f 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2543,13 +2543,14 @@ static long unix_stream_data_wait(struct sock *sk, long timeo, struct sk_buff *last, unsigned int last_len, bool freezable) { + unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE; struct sk_buff *tail; DEFINE_WAIT(wait); unix_state_lock(sk); for (;;) { - prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE); + prepare_to_wait(sk_sleep(sk), &wait, state); tail = skb_peek_tail(&sk->sk_receive_queue); if (tail != last || @@ -2562,10 +2563,7 @@ static long unix_stream_data_wait(struct sock *sk, long timeo, sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk); unix_state_unlock(sk); - if (freezable) - timeo = freezable_schedule_timeout(timeo); - else - timeo = schedule_timeout(timeo); + timeo = schedule_timeout(timeo); unix_state_lock(sk); if (sock_flag(sk, SOCK_DEAD)) -- cgit From d6e3b27cbd2df555ff0736796ad2f9a17e74be8b Mon Sep 17 00:00:00 2001 From: Peilin Ye Date: Thu, 22 Sep 2022 21:59:26 -0700 Subject: af_unix: Refactor unix_read_skb() Similar to udp_read_skb(), delete the unnecessary while loop in unix_read_skb() for readability. Since recv_actor() cannot return a value greater than skb->len (see sk_psock_verdict_recv()), remove the redundant check. Suggested-by: Cong Wang Signed-off-by: Peilin Ye Link: https://lore.kernel.org/r/7009141683ad6cd3785daced3e4a80ba0eb773b5.1663909008.git.peilin.ye@bytedance.com Signed-off-by: Jakub Kicinski --- net/unix/af_unix.c | 34 ++++++++++------------------------ 1 file changed, 10 insertions(+), 24 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index dea2972c8178..c955c7253d4b 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2536,32 +2536,18 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t si static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor) { - int copied = 0; - - while (1) { - struct unix_sock *u = unix_sk(sk); - struct sk_buff *skb; - int used, err; - - mutex_lock(&u->iolock); - skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err); - mutex_unlock(&u->iolock); - if (!skb) - return err; + struct unix_sock *u = unix_sk(sk); + struct sk_buff *skb; + int err, copied; - used = recv_actor(sk, skb); - if (used <= 0) { - if (!copied) - copied = used; - kfree_skb(skb); - break; - } else if (used <= skb->len) { - copied += used; - } + mutex_lock(&u->iolock); + skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err); + mutex_unlock(&u->iolock); + if (!skb) + return err; - kfree_skb(skb); - break; - } + copied = recv_actor(sk, skb); + kfree_skb(skb); return copied; } -- cgit From 7a62ed61367b8fd01bae1e18e30602c25060d824 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Thu, 29 Sep 2022 08:52:04 -0700 Subject: af_unix: Fix memory leaks of the whole sk due to OOB skb. syzbot reported a sequence of memory leaks, and one of them indicated we failed to free a whole sk: unreferenced object 0xffff8880126e0000 (size 1088): comm "syz-executor419", pid 326, jiffies 4294773607 (age 12.609s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 7d 00 00 00 00 00 00 00 ........}....... 01 00 07 40 00 00 00 00 00 00 00 00 00 00 00 00 ...@............ 
backtrace:
  [<000000006fefe750>] sk_prot_alloc+0x64/0x2a0 net/core/sock.c:1970
  [<0000000074006db5>] sk_alloc+0x3b/0x800 net/core/sock.c:2029
  [<00000000728cd434>] unix_create1+0xaf/0x920 net/unix/af_unix.c:928
  [<00000000a279a139>] unix_create+0x113/0x1d0 net/unix/af_unix.c:997
  [<0000000068259812>] __sock_create+0x2ab/0x550 net/socket.c:1516
  [<00000000da1521e1>] sock_create net/socket.c:1566 [inline]
  [<00000000da1521e1>] __sys_socketpair+0x1a8/0x550 net/socket.c:1698
  [<000000007ab259e1>] __do_sys_socketpair net/socket.c:1751 [inline]
  [<000000007ab259e1>] __se_sys_socketpair net/socket.c:1748 [inline]
  [<000000007ab259e1>] __x64_sys_socketpair+0x97/0x100 net/socket.c:1748
  [<000000007dedddc1>] do_syscall_x64 arch/x86/entry/common.c:50 [inline]
  [<000000007dedddc1>] do_syscall_64+0x38/0x90 arch/x86/entry/common.c:80
  [<000000009456679f>] entry_SYSCALL_64_after_hwframe+0x63/0xcd

We can reproduce this issue by creating two AF_UNIX SOCK_STREAM sockets,
send()ing an OOB skb to each other, and close()ing them without consuming
the OOB skbs.

  int skpair[2];

  socketpair(AF_UNIX, SOCK_STREAM, 0, skpair);

  send(skpair[0], "x", 1, MSG_OOB);
  send(skpair[1], "x", 1, MSG_OOB);

  close(skpair[0]);
  close(skpair[1]);

Currently, we free an OOB skb in unix_sock_destructor(), which is called
via __sk_free(), but that is too late because the receiver's
unix_sk(sk)->oob_skb is accounted against the sender's sk->sk_wmem_alloc,
and __sk_free() is called only when sk->sk_wmem_alloc is 0.

In the repro sequence, we do not consume the OOB skbs, so neither sk's
sock_put() ever reaches __sk_free() due to the positive sk->sk_wmem_alloc.
Then, no one can consume the OOB skbs or call __sk_free(), and we finally
leak both whole sk.

Thus, we must free the unconsumed OOB skb earlier, when close()ing the
socket.

Fixes: 314001f0bf92 ("af_unix: Add OOB support")
Reported-by: syzbot
Signed-off-by: Kuniyuki Iwashima
Signed-off-by: David S. Miller
---
 net/unix/af_unix.c | 13 +++++++------
 1 file changed, 7 insertions(+), 6 deletions(-)

(limited to 'net/unix/af_unix.c')

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index bf338b782fc4..d686804119c9 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -569,12 +569,6 @@ static void unix_sock_destructor(struct sock *sk)
 	skb_queue_purge(&sk->sk_receive_queue);
-#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
-	if (u->oob_skb) {
-		kfree_skb(u->oob_skb);
-		u->oob_skb = NULL;
-	}
-#endif
 	DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk->sk_wmem_alloc));
 	DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk));
 	DEBUG_NET_WARN_ON_ONCE(sk->sk_socket);
@@ -620,6 +614,13 @@ static void unix_release_sock(struct sock *sk, int embrion)
 	unix_state_unlock(sk);
+#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
+	if (u->oob_skb) {
+		kfree_skb(u->oob_skb);
+		u->oob_skb = NULL;
+	}
+#endif
+
 	wake_up_interruptible_all(&u->peer_wait);
 	if (skpair != NULL) {
-- cgit


From a251c17aa558d8e3128a528af5cf8b9d7caae4fd Mon Sep 17 00:00:00 2001
From: "Jason A. Donenfeld"
Date: Wed, 5 Oct 2022 17:43:22 +0200
Subject: treewide: use get_random_u32() when possible
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The prandom_u32() function has been a deprecated inline wrapper around
get_random_u32() for several releases now, and compiles down to the exact
same code.  Replace the deprecated wrapper with a direct call to the real
function.  The same also applies to get_random_int(), which is just a
wrapper around get_random_u32().  This was done as a basic find and
replace.
Reviewed-by: Greg Kroah-Hartman
Reviewed-by: Kees Cook
Reviewed-by: Yury Norov
Reviewed-by: Jan Kara # for ext4
Acked-by: Toke Høiland-Jørgensen # for sch_cake
Acked-by: Chuck Lever # for nfsd
Acked-by: Jakub Kicinski
Acked-by: Mika Westerberg # for thunderbolt
Acked-by: Darrick J. Wong # for xfs
Acked-by: Helge Deller # for parisc
Acked-by: Heiko Carstens # for s390
Signed-off-by: Jason A. Donenfeld
---
 net/unix/af_unix.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'net/unix/af_unix.c')

diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index 15dbb392c875..b3545fc68097 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -1147,7 +1147,7 @@ static int unix_autobind(struct sock *sk)
 	addr->name->sun_family = AF_UNIX;
 	refcount_set(&addr->refcnt, 1);
-	ordernum = prandom_u32();
+	ordernum = get_random_u32();
 	lastnum = ordernum & 0xFFFFF;
retry:
 	ordernum = (ordernum + 1) & 0xFFFFF;
-- cgit
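As an aside on the unix_autobind() hunk above: the random ordernum ends
up, formatted as five hex digits, as the socket's abstract autobind name,
so the touched code path is easy to observe from userspace.  A rough
sketch (not part of the patch; error handling omitted):

  #include <stdio.h>
  #include <sys/socket.h>
  #include <sys/un.h>

  int main(void)
  {
          struct sockaddr_un addr = { .sun_family = AF_UNIX };
          socklen_t len = sizeof(sa_family_t);
          int fd = socket(AF_UNIX, SOCK_DGRAM, 0);

          /* binding with only sun_family set triggers unix_autobind() */
          bind(fd, (struct sockaddr *)&addr, len);

          len = sizeof(addr);
          getsockname(fd, (struct sockaddr *)&addr, &len);

          /* abstract names start with a NUL byte; print it like ss(8) does */
          printf("autobound to @%.*s\n",
                 (int)(len - sizeof(sa_family_t) - 1), addr.sun_path + 1);
          return 0;
  }

On a typical system this should print something like "autobound to @1a2b3".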