From 04f08eb44b5011493d77b602fdec29ff0f5c6cd5 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 8 Sep 2021 17:00:29 -0700 Subject: net/af_unix: fix a data-race in unix_dgram_poll syzbot reported another data-race in af_unix [1] Lets change __skb_insert() to use WRITE_ONCE() when changing skb head qlen. Also, change unix_dgram_poll() to use lockless version of unix_recvq_full() It is verry possible we can switch all/most unix_recvq_full() to the lockless version, this will be done in a future kernel version. [1] HEAD commit: 8596e589b787732c8346f0482919e83cc9362db1 BUG: KCSAN: data-race in skb_queue_tail / unix_dgram_poll write to 0xffff88814eeb24e0 of 4 bytes by task 25815 on cpu 0: __skb_insert include/linux/skbuff.h:1938 [inline] __skb_queue_before include/linux/skbuff.h:2043 [inline] __skb_queue_tail include/linux/skbuff.h:2076 [inline] skb_queue_tail+0x80/0xa0 net/core/skbuff.c:3264 unix_dgram_sendmsg+0xff2/0x1600 net/unix/af_unix.c:1850 sock_sendmsg_nosec net/socket.c:703 [inline] sock_sendmsg net/socket.c:723 [inline] ____sys_sendmsg+0x360/0x4d0 net/socket.c:2392 ___sys_sendmsg net/socket.c:2446 [inline] __sys_sendmmsg+0x315/0x4b0 net/socket.c:2532 __do_sys_sendmmsg net/socket.c:2561 [inline] __se_sys_sendmmsg net/socket.c:2558 [inline] __x64_sys_sendmmsg+0x53/0x60 net/socket.c:2558 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x3d/0x90 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae read to 0xffff88814eeb24e0 of 4 bytes by task 25834 on cpu 1: skb_queue_len include/linux/skbuff.h:1869 [inline] unix_recvq_full net/unix/af_unix.c:194 [inline] unix_dgram_poll+0x2bc/0x3e0 net/unix/af_unix.c:2777 sock_poll+0x23e/0x260 net/socket.c:1288 vfs_poll include/linux/poll.h:90 [inline] ep_item_poll fs/eventpoll.c:846 [inline] ep_send_events fs/eventpoll.c:1683 [inline] ep_poll fs/eventpoll.c:1798 [inline] do_epoll_wait+0x6ad/0xf00 fs/eventpoll.c:2226 __do_sys_epoll_wait fs/eventpoll.c:2238 [inline] __se_sys_epoll_wait fs/eventpoll.c:2233 [inline] __x64_sys_epoll_wait+0xf6/0x120 fs/eventpoll.c:2233 do_syscall_x64 arch/x86/entry/common.c:50 [inline] do_syscall_64+0x3d/0x90 arch/x86/entry/common.c:80 entry_SYSCALL_64_after_hwframe+0x44/0xae value changed: 0x0000001b -> 0x00000001 Reported by Kernel Concurrency Sanitizer on: CPU: 1 PID: 25834 Comm: syz-executor.1 Tainted: G W 5.14.0-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Fixes: 86b18aaa2b5b ("skbuff: fix a data race in skb_queue_len()") Cc: Qian Cai Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller --- net/unix/af_unix.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index eb47b9de2380..92345c9bb60c 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -3073,7 +3073,7 @@ static __poll_t unix_dgram_poll(struct file *file, struct socket *sock, other = unix_peer(sk); if (other && unix_peer(other) != sk && - unix_recvq_full(other) && + unix_recvq_full_lockless(other) && unix_dgram_peer_wake_me(sk, other)) writable = 0; -- cgit From f4bd73b5a950866f6c6fc98a7b684d307c5d586a Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Tue, 28 Sep 2021 09:42:27 +0900 Subject: af_unix: Return errno instead of NULL in unix_create1(). unix_create1() returns NULL on error, and the callers assume that it never fails for reasons other than out of memory. So, the callers always return -ENOMEM when unix_create1() fails. However, it also returns NULL when the number of af_unix sockets exceeds twice the limit controlled by sysctl: fs.file-max. In this case, the callers should return -ENFILE like alloc_empty_file(). This patch changes unix_create1() to return the correct error value instead of NULL on error. Out of curiosity, the assumption has been wrong since 1999 due to this change introduced in 2.2.4 [0]. diff -u --recursive --new-file v2.2.3/linux/net/unix/af_unix.c linux/net/unix/af_unix.c --- v2.2.3/linux/net/unix/af_unix.c Tue Jan 19 11:32:53 1999 +++ linux/net/unix/af_unix.c Sun Mar 21 07:22:00 1999 @@ -388,6 +413,9 @@ { struct sock *sk; + if (atomic_read(&unix_nr_socks) >= 2*max_files) + return NULL; + MOD_INC_USE_COUNT; sk = sk_alloc(PF_UNIX, GFP_KERNEL, 1); if (!sk) { [0]: https://cdn.kernel.org/pub/linux/kernel/v2.2/patch-2.2.4.gz Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Kuniyuki Iwashima Signed-off-by: David S. Miller --- net/unix/af_unix.c | 49 ++++++++++++++++++++++++++++++++----------------- 1 file changed, 32 insertions(+), 17 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 92345c9bb60c..f505b89bda6a 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -828,20 +828,25 @@ struct proto unix_stream_proto = { static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, int type) { - struct sock *sk = NULL; struct unix_sock *u; + struct sock *sk; + int err; atomic_long_inc(&unix_nr_socks); - if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) - goto out; + if (atomic_long_read(&unix_nr_socks) > 2 * get_max_files()) { + err = -ENFILE; + goto err; + } if (type == SOCK_STREAM) sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_stream_proto, kern); else /*dgram and seqpacket */ sk = sk_alloc(net, PF_UNIX, GFP_KERNEL, &unix_dgram_proto, kern); - if (!sk) - goto out; + if (!sk) { + err = -ENOMEM; + goto err; + } sock_init_data(sock, sk); @@ -861,20 +866,23 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay); memset(&u->scm_stat, 0, sizeof(struct scm_stat)); unix_insert_socket(unix_sockets_unbound(sk), sk); -out: - if (sk == NULL) - atomic_long_dec(&unix_nr_socks); - else { - local_bh_disable(); - sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); - local_bh_enable(); - } + + local_bh_disable(); + sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1); + local_bh_enable(); + return sk; + +err: + atomic_long_dec(&unix_nr_socks); + return ERR_PTR(err); } static int unix_create(struct net *net, struct socket *sock, int protocol, int kern) { + struct sock *sk; + if (protocol && protocol != PF_UNIX) return -EPROTONOSUPPORT; @@ -901,7 +909,11 @@ static int unix_create(struct net *net, struct socket *sock, int protocol, return -ESOCKTNOSUPPORT; } - return unix_create1(net, sock, kern, sock->type) ? 0 : -ENOMEM; + sk = unix_create1(net, sock, kern, sock->type); + if (IS_ERR(sk)) + return PTR_ERR(sk); + + return 0; } static int unix_release(struct socket *sock) @@ -1314,12 +1326,15 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, we will have to recheck all again in any case. */ - err = -ENOMEM; - /* create new sock for complete connection */ newsk = unix_create1(sock_net(sk), NULL, 0, sock->type); - if (newsk == NULL) + if (IS_ERR(newsk)) { + err = PTR_ERR(newsk); + newsk = NULL; goto out; + } + + err = -ENOMEM; /* Allocate skb for sending to listening sock */ skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL); -- cgit From 35306eb23814444bd4021f8a1c3047d3cb0c8b2b Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 29 Sep 2021 15:57:50 -0700 Subject: af_unix: fix races in sk_peer_pid and sk_peer_cred accesses Jann Horn reported that SO_PEERCRED and SO_PEERGROUPS implementations are racy, as af_unix can concurrently change sk_peer_pid and sk_peer_cred. In order to fix this issue, this patch adds a new spinlock that needs to be used whenever these fields are read or written. Jann also pointed out that l2cap_sock_get_peer_pid_cb() is currently reading sk->sk_peer_pid which makes no sense, as this field is only possibly set by AF_UNIX sockets. We will have to clean this in a separate patch. This could be done by reverting b48596d1dc25 "Bluetooth: L2CAP: Add get_peer_pid callback" or implementing what was truly expected. Fixes: 109f6e39fa07 ("af_unix: Allow SO_PEERCRED to work across namespaces.") Signed-off-by: Eric Dumazet Reported-by: Jann Horn Cc: Eric W. Biederman Cc: Luiz Augusto von Dentz Cc: Marcel Holtmann Signed-off-by: David S. Miller --- net/unix/af_unix.c | 34 ++++++++++++++++++++++++++++------ 1 file changed, 28 insertions(+), 6 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index f505b89bda6a..efac5989edb5 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -608,20 +608,42 @@ static void unix_release_sock(struct sock *sk, int embrion) static void init_peercred(struct sock *sk) { - put_pid(sk->sk_peer_pid); - if (sk->sk_peer_cred) - put_cred(sk->sk_peer_cred); + const struct cred *old_cred; + struct pid *old_pid; + + spin_lock(&sk->sk_peer_lock); + old_pid = sk->sk_peer_pid; + old_cred = sk->sk_peer_cred; sk->sk_peer_pid = get_pid(task_tgid(current)); sk->sk_peer_cred = get_current_cred(); + spin_unlock(&sk->sk_peer_lock); + + put_pid(old_pid); + put_cred(old_cred); } static void copy_peercred(struct sock *sk, struct sock *peersk) { - put_pid(sk->sk_peer_pid); - if (sk->sk_peer_cred) - put_cred(sk->sk_peer_cred); + const struct cred *old_cred; + struct pid *old_pid; + + if (sk < peersk) { + spin_lock(&sk->sk_peer_lock); + spin_lock_nested(&peersk->sk_peer_lock, SINGLE_DEPTH_NESTING); + } else { + spin_lock(&peersk->sk_peer_lock); + spin_lock_nested(&sk->sk_peer_lock, SINGLE_DEPTH_NESTING); + } + old_pid = sk->sk_peer_pid; + old_cred = sk->sk_peer_cred; sk->sk_peer_pid = get_pid(peersk->sk_peer_pid); sk->sk_peer_cred = get_cred(peersk->sk_peer_cred); + + spin_unlock(&sk->sk_peer_lock); + spin_unlock(&peersk->sk_peer_lock); + + put_pid(old_pid); + put_cred(old_cred); } static int unix_listen(struct socket *sock, int backlog) -- cgit