From 709894c52c1cafa36fe2047ba5a0b83bdf398133 Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Wed, 9 Apr 2025 14:50:58 +0200 Subject: af_unix: Remove unix_unhash() Dummy unix_unhash() was introduced for sockmap in commit 94531cfcbe79 ("af_unix: Add unix_stream_proto for sockmap"), but there's no need to implement it anymore. ->unhash() is only called conditionally: in unix_shutdown() since commit d359902d5c35 ("af_unix: Fix NULL pointer bug in unix_shutdown"), and in BPF proto's sock_map_unhash() since commit 5b4a79ba65a1 ("bpf, sockmap: Don't let sock_map_{close,destroy,unhash} call itself"). Remove it. Signed-off-by: Michal Luczaj Reviewed-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250409-cleanup-drop-unix-unhash-v1-1-1659e5b8ee84@rbox.co Signed-off-by: Jakub Kicinski --- net/unix/af_unix.c | 8 -------- 1 file changed, 8 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index f78a2492826f..2ab20821d6bb 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -950,13 +950,6 @@ static void unix_close(struct sock *sk, long timeout) */ } -static void unix_unhash(struct sock *sk) -{ - /* Nothing to do here, unix socket does not need a ->unhash(). - * This is merely for sockmap. - */ -} - static bool unix_bpf_bypass_getsockopt(int level, int optname) { if (level == SOL_SOCKET) { @@ -987,7 +980,6 @@ struct proto unix_stream_proto = { .owner = THIS_MODULE, .obj_size = sizeof(struct unix_sock), .close = unix_close, - .unhash = unix_unhash, .bpf_bypass_getsockopt = unix_bpf_bypass_getsockopt, #ifdef CONFIG_BPF_SYSCALL .psock_update_sk_prot = unix_stream_bpf_update_proto, -- cgit From fd0a109a0f6b7524543d17520da92a44a9f5343c Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 25 Apr 2025 10:11:31 +0200 Subject: net, pidfs: prepare for handing out pidfds for reaped sk->sk_peer_pid SO_PEERPIDFD currently doesn't support handing out pidfds if the sk->sk_peer_pid thread-group leader has already been reaped. In this case it currently returns EINVAL. Userspace still wants to get a pidfd for a reaped process to have a stable handle it can pass on. This is especially useful now that it is possible to retrieve exit information through a pidfd via the PIDFD_GET_INFO ioctl()'s PIDFD_INFO_EXIT flag. Another summary has been provided by David in [1]: > A pidfd can outlive the task it refers to, and thus user-space must > already be prepared that the task underlying a pidfd is gone at the time > they get their hands on the pidfd. For instance, resolving the pidfd to > a PID via the fdinfo must be prepared to read `-1`. > > Despite user-space knowing that a pidfd might be stale, several kernel > APIs currently add another layer that checks for this. In particular, > SO_PEERPIDFD returns `EINVAL` if the peer-task was already reaped, > but returns a stale pidfd if the task is reaped immediately after the > respective alive-check. > > This has the unfortunate effect that user-space now has two ways to > check for the exact same scenario: A syscall might return > EINVAL/ESRCH/... *or* the pidfd might be stale, even though there is no > particular reason to distinguish both cases. This also propagates > through user-space APIs, which pass on pidfds. They must be prepared to > pass on `-1` *or* the pidfd, because there is no guaranteed way to get a > stale pidfd from the kernel. > Userspace must already deal with a pidfd referring to a reaped task as > the task may exit and get reaped at any time will there are still many > pidfds referring to it. In order to allow handing out reaped pidfd SO_PEERPIDFD needs to ensure that PIDFD_INFO_EXIT information is available whenever a pidfd for a reaped task is created by PIDFD_INFO_EXIT. The uapi promises that reaped pidfds are only handed out if it is guaranteed that the caller sees the exit information: TEST_F(pidfd_info, success_reaped) { struct pidfd_info info = { .mask = PIDFD_INFO_CGROUPID | PIDFD_INFO_EXIT, }; /* * Process has already been reaped and PIDFD_INFO_EXIT been set. * Verify that we can retrieve the exit status of the process. */ ASSERT_EQ(ioctl(self->child_pidfd4, PIDFD_GET_INFO, &info), 0); ASSERT_FALSE(!!(info.mask & PIDFD_INFO_CREDS)); ASSERT_TRUE(!!(info.mask & PIDFD_INFO_EXIT)); ASSERT_TRUE(WIFEXITED(info.exit_code)); ASSERT_EQ(WEXITSTATUS(info.exit_code), 0); } To hand out pidfds for reaped processes we thus allocate a pidfs entry for the relevant sk->sk_peer_pid at the time the sk->sk_peer_pid is stashed and drop it when the socket is destroyed. This guarantees that exit information will always be recorded for the sk->sk_peer_pid task and we can hand out pidfds for reaped processes. Link: https://lore.kernel.org/lkml/20230807085203.819772-1-david@readahead.eu [1] Link: https://lore.kernel.org/20250425-work-pidfs-net-v2-2-450a19461e75@kernel.org Reviewed-by: David Rheinsberg Reviewed-by: Kuniyuki Iwashima Signed-off-by: Christian Brauner --- net/unix/af_unix.c | 85 +++++++++++++++++++++++++++++++++++++++++++++++------- 1 file changed, 74 insertions(+), 11 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index f78a2492826f..472f8aa9ea15 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -100,6 +100,7 @@ #include #include #include +#include #include #include #include @@ -643,6 +644,9 @@ static void unix_sock_destructor(struct sock *sk) return; } + if (sk->sk_peer_pid) + pidfs_put_pid(sk->sk_peer_pid); + if (u->addr) unix_release_addr(u->addr); @@ -734,13 +738,48 @@ static void unix_release_sock(struct sock *sk, int embrion) unix_gc(); /* Garbage collect fds */ } -static void init_peercred(struct sock *sk) +struct unix_peercred { + struct pid *peer_pid; + const struct cred *peer_cred; +}; + +static inline int prepare_peercred(struct unix_peercred *peercred) { - sk->sk_peer_pid = get_pid(task_tgid(current)); - sk->sk_peer_cred = get_current_cred(); + struct pid *pid; + int err; + + pid = task_tgid(current); + err = pidfs_register_pid(pid); + if (likely(!err)) { + peercred->peer_pid = get_pid(pid); + peercred->peer_cred = get_current_cred(); + } + return err; } -static void update_peercred(struct sock *sk) +static void drop_peercred(struct unix_peercred *peercred) +{ + const struct cred *cred = NULL; + struct pid *pid = NULL; + + might_sleep(); + + swap(peercred->peer_pid, pid); + swap(peercred->peer_cred, cred); + + pidfs_put_pid(pid); + put_pid(pid); + put_cred(cred); +} + +static inline void init_peercred(struct sock *sk, + const struct unix_peercred *peercred) +{ + sk->sk_peer_pid = peercred->peer_pid; + sk->sk_peer_cred = peercred->peer_cred; +} + +static void update_peercred(struct sock *sk, struct unix_peercred *peercred) { const struct cred *old_cred; struct pid *old_pid; @@ -748,11 +787,11 @@ static void update_peercred(struct sock *sk) spin_lock(&sk->sk_peer_lock); old_pid = sk->sk_peer_pid; old_cred = sk->sk_peer_cred; - init_peercred(sk); + init_peercred(sk, peercred); spin_unlock(&sk->sk_peer_lock); - put_pid(old_pid); - put_cred(old_cred); + peercred->peer_pid = old_pid; + peercred->peer_cred = old_cred; } static void copy_peercred(struct sock *sk, struct sock *peersk) @@ -761,6 +800,7 @@ static void copy_peercred(struct sock *sk, struct sock *peersk) spin_lock(&sk->sk_peer_lock); sk->sk_peer_pid = get_pid(peersk->sk_peer_pid); + pidfs_get_pid(sk->sk_peer_pid); sk->sk_peer_cred = get_cred(peersk->sk_peer_cred); spin_unlock(&sk->sk_peer_lock); } @@ -770,6 +810,7 @@ static int unix_listen(struct socket *sock, int backlog) int err; struct sock *sk = sock->sk; struct unix_sock *u = unix_sk(sk); + struct unix_peercred peercred = {}; err = -EOPNOTSUPP; if (sock->type != SOCK_STREAM && sock->type != SOCK_SEQPACKET) @@ -777,6 +818,9 @@ static int unix_listen(struct socket *sock, int backlog) err = -EINVAL; if (!READ_ONCE(u->addr)) goto out; /* No listens on an unbound socket */ + err = prepare_peercred(&peercred); + if (err) + goto out; unix_state_lock(sk); if (sk->sk_state != TCP_CLOSE && sk->sk_state != TCP_LISTEN) goto out_unlock; @@ -786,11 +830,12 @@ static int unix_listen(struct socket *sock, int backlog) WRITE_ONCE(sk->sk_state, TCP_LISTEN); /* set credentials so connect can copy them */ - update_peercred(sk); + update_peercred(sk, &peercred); err = 0; out_unlock: unix_state_unlock(sk); + drop_peercred(&peercred); out: return err; } @@ -1525,6 +1570,7 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr; struct sock *sk = sock->sk, *newsk = NULL, *other = NULL; struct unix_sock *u = unix_sk(sk), *newu, *otheru; + struct unix_peercred peercred = {}; struct net *net = sock_net(sk); struct sk_buff *skb = NULL; unsigned char state; @@ -1561,6 +1607,10 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, goto out; } + err = prepare_peercred(&peercred); + if (err) + goto out; + /* Allocate skb for sending to listening sock */ skb = sock_wmalloc(newsk, 1, 0, GFP_KERNEL); if (!skb) { @@ -1636,7 +1686,7 @@ restart: unix_peer(newsk) = sk; newsk->sk_state = TCP_ESTABLISHED; newsk->sk_type = sk->sk_type; - init_peercred(newsk); + init_peercred(newsk, &peercred); newu = unix_sk(newsk); newu->listener = other; RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq); @@ -1695,20 +1745,33 @@ out_free_skb: out_free_sk: unix_release_sock(newsk, 0); out: + drop_peercred(&peercred); return err; } static int unix_socketpair(struct socket *socka, struct socket *sockb) { + struct unix_peercred ska_peercred = {}, skb_peercred = {}; struct sock *ska = socka->sk, *skb = sockb->sk; + int err; + + err = prepare_peercred(&ska_peercred); + if (err) + return err; + + err = prepare_peercred(&skb_peercred); + if (err) { + drop_peercred(&ska_peercred); + return err; + } /* Join our sockets back to back */ sock_hold(ska); sock_hold(skb); unix_peer(ska) = skb; unix_peer(skb) = ska; - init_peercred(ska); - init_peercred(skb); + init_peercred(ska, &ska_peercred); + init_peercred(skb, &skb_peercred); ska->sk_state = TCP_ESTABLISHED; skb->sk_state = TCP_ESTABLISHED; -- cgit From a9194f88782afa1386641451a6c76beaa60485a0 Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Fri, 16 May 2025 13:25:31 +0200 Subject: coredump: add coredump socket Coredumping currently supports two modes: (1) Dumping directly into a file somewhere on the filesystem. (2) Dumping into a pipe connected to a usermode helper process spawned as a child of the system_unbound_wq or kthreadd. For simplicity I'm mostly ignoring (1). There's probably still some users of (1) out there but processing coredumps in this way can be considered adventurous especially in the face of set*id binaries. The most common option should be (2) by now. It works by allowing userspace to put a string into /proc/sys/kernel/core_pattern like: |/usr/lib/systemd/systemd-coredump %P %u %g %s %t %c %h The "|" at the beginning indicates to the kernel that a pipe must be used. The path following the pipe indicator is a path to a binary that will be spawned as a usermode helper process. Any additional parameters pass information about the task that is generating the coredump to the binary that processes the coredump. In the example core_pattern shown above systemd-coredump is spawned as a usermode helper. There's various conceptual consequences of this (non-exhaustive list): - systemd-coredump is spawned with file descriptor number 0 (stdin) connected to the read-end of the pipe. All other file descriptors are closed. That specifically includes 1 (stdout) and 2 (stderr). This has already caused bugs because userspace assumed that this cannot happen (Whether or not this is a sane assumption is irrelevant.). - systemd-coredump will be spawned as a child of system_unbound_wq. So it is not a child of any userspace process and specifically not a child of PID 1. It cannot be waited upon and is in a weird hybrid upcall which are difficult for userspace to control correctly. - systemd-coredump is spawned with full kernel privileges. This necessitates all kinds of weird privilege dropping excercises in userspace to make this safe. - A new usermode helper has to be spawned for each crashing process. This series adds a new mode: (3) Dumping into an AF_UNIX socket. Userspace can set /proc/sys/kernel/core_pattern to: @/path/to/coredump.socket The "@" at the beginning indicates to the kernel that an AF_UNIX coredump socket will be used to process coredumps. The coredump socket must be located in the initial mount namespace. When a task coredumps it opens a client socket in the initial network namespace and connects to the coredump socket. - The coredump server uses SO_PEERPIDFD to get a stable handle on the connected crashing task. The retrieved pidfd will provide a stable reference even if the crashing task gets SIGKILLed while generating the coredump. - By setting core_pipe_limit non-zero userspace can guarantee that the crashing task cannot be reaped behind it's back and thus process all necessary information in /proc/. The SO_PEERPIDFD can be used to detect whether /proc/ still refers to the same process. The core_pipe_limit isn't used to rate-limit connections to the socket. This can simply be done via AF_UNIX sockets directly. - The pidfd for the crashing task will grow new information how the task coredumps. - The coredump server should mark itself as non-dumpable. - A container coredump server in a separate network namespace can simply bind to another well-know address and systemd-coredump fowards coredumps to the container. - Coredumps could in the future also be handled via per-user/session coredump servers that run only with that users privileges. The coredump server listens on the coredump socket and accepts a new coredump connection. It then retrieves SO_PEERPIDFD for the client, inspects uid/gid and hands the accepted client to the users own coredump handler which runs with the users privileges only (It must of coure pay close attention to not forward crashing suid binaries.). The new coredump socket will allow userspace to not have to rely on usermode helpers for processing coredumps and provides a safer way to handle them instead of relying on super privileged coredumping helpers that have and continue to cause significant CVEs. This will also be significantly more lightweight since no fork()+exec() for the usermodehelper is required for each crashing process. The coredump server in userspace can e.g., just keep a worker pool. Link: https://lore.kernel.org/20250516-work-coredump-socket-v8-4-664f3caf2516@kernel.org Acked-by: Luca Boccassi Reviewed-by: Kuniyuki Iwashima Reviewed-by: Alexander Mikhalitsyn Reviewed-by: Jann Horn Signed-off-by: Christian Brauner --- net/unix/af_unix.c | 54 +++++++++++++++++++++++++++++++++++++++++------------- 1 file changed, 41 insertions(+), 13 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 472f8aa9ea15..59a64b2ced6e 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -85,10 +85,13 @@ #include #include #include +#include #include #include #include #include +#include +#include #include #include #include @@ -100,7 +103,6 @@ #include #include #include -#include #include #include #include @@ -1146,7 +1148,7 @@ static int unix_release(struct socket *sock) } static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len, - int type) + int type, int flags) { struct inode *inode; struct path path; @@ -1154,13 +1156,39 @@ static struct sock *unix_find_bsd(struct sockaddr_un *sunaddr, int addr_len, int err; unix_mkname_bsd(sunaddr, addr_len); - err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path); - if (err) - goto fail; - err = path_permission(&path, MAY_WRITE); - if (err) - goto path_put; + if (flags & SOCK_COREDUMP) { + const struct cred *cred; + struct cred *kcred; + struct path root; + + kcred = prepare_kernel_cred(&init_task); + if (!kcred) { + err = -ENOMEM; + goto fail; + } + + task_lock(&init_task); + get_fs_root(init_task.fs, &root); + task_unlock(&init_task); + + cred = override_creds(kcred); + err = vfs_path_lookup(root.dentry, root.mnt, sunaddr->sun_path, + LOOKUP_BENEATH | LOOKUP_NO_SYMLINKS | + LOOKUP_NO_MAGICLINKS, &path); + put_cred(revert_creds(cred)); + path_put(&root); + if (err) + goto fail; + } else { + err = kern_path(sunaddr->sun_path, LOOKUP_FOLLOW, &path); + if (err) + goto fail; + + err = path_permission(&path, MAY_WRITE); + if (err) + goto path_put; + } err = -ECONNREFUSED; inode = d_backing_inode(path.dentry); @@ -1210,12 +1238,12 @@ static struct sock *unix_find_abstract(struct net *net, static struct sock *unix_find_other(struct net *net, struct sockaddr_un *sunaddr, - int addr_len, int type) + int addr_len, int type, int flags) { struct sock *sk; if (sunaddr->sun_path[0]) - sk = unix_find_bsd(sunaddr, addr_len, type); + sk = unix_find_bsd(sunaddr, addr_len, type, flags); else sk = unix_find_abstract(net, sunaddr, addr_len, type); @@ -1473,7 +1501,7 @@ static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr, } restart: - other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type); + other = unix_find_other(sock_net(sk), sunaddr, alen, sock->type, 0); if (IS_ERR(other)) { err = PTR_ERR(other); goto out; @@ -1620,7 +1648,7 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, restart: /* Find listening sock. */ - other = unix_find_other(net, sunaddr, addr_len, sk->sk_type); + other = unix_find_other(net, sunaddr, addr_len, sk->sk_type, flags); if (IS_ERR(other)) { err = PTR_ERR(other); goto out_free_skb; @@ -2089,7 +2117,7 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, if (msg->msg_namelen) { lookup: other = unix_find_other(sock_net(sk), msg->msg_name, - msg->msg_namelen, sk->sk_type); + msg->msg_namelen, sk->sk_type, 0); if (IS_ERR(other)) { err = PTR_ERR(other); goto out_free; -- cgit From 350d4546295949c8a29e345b340e57772813037a Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 19 May 2025 13:57:52 -0700 Subject: af_unix: Factorise test_bit() for SOCK_PASSCRED and SOCK_PASSPIDFD. Currently, the same checks for SOCK_PASSCRED and SOCK_PASSPIDFD are scattered across many places. Let's centralise the bit tests to make the following changes cleaner. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/unix/af_unix.c | 37 +++++++++++++++---------------------- 1 file changed, 15 insertions(+), 22 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 2ab20821d6bb..464e183ffdb8 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -765,6 +765,14 @@ static void copy_peercred(struct sock *sk, struct sock *peersk) spin_unlock(&sk->sk_peer_lock); } +static bool unix_may_passcred(const struct sock *sk) +{ + struct socket *sock = sk->sk_socket; + + return test_bit(SOCK_PASSCRED, &sock->flags) || + test_bit(SOCK_PASSPIDFD, &sock->flags); +} + static int unix_listen(struct socket *sock, int backlog) { int err; @@ -1411,9 +1419,7 @@ static int unix_dgram_connect(struct socket *sock, struct sockaddr *addr, if (err) goto out; - if ((test_bit(SOCK_PASSCRED, &sock->flags) || - test_bit(SOCK_PASSPIDFD, &sock->flags)) && - !READ_ONCE(unix_sk(sk)->addr)) { + if (unix_may_passcred(sk) && !READ_ONCE(unix_sk(sk)->addr)) { err = unix_autobind(sk); if (err) goto out; @@ -1531,9 +1537,7 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr, if (err) goto out; - if ((test_bit(SOCK_PASSCRED, &sock->flags) || - test_bit(SOCK_PASSPIDFD, &sock->flags)) && - !READ_ONCE(u->addr)) { + if (unix_may_passcred(sk) && !READ_ONCE(u->addr)) { err = unix_autobind(sk); if (err) goto out; @@ -1877,16 +1881,6 @@ static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool sen return err; } -static bool unix_passcred_enabled(const struct socket *sock, - const struct sock *other) -{ - return test_bit(SOCK_PASSCRED, &sock->flags) || - test_bit(SOCK_PASSPIDFD, &sock->flags) || - !other->sk_socket || - test_bit(SOCK_PASSCRED, &other->sk_socket->flags) || - test_bit(SOCK_PASSPIDFD, &other->sk_socket->flags); -} - /* * Some apps rely on write() giving SCM_CREDENTIALS * We include credentials if source or destination socket @@ -1897,7 +1891,9 @@ static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock, { if (UNIXCB(skb).pid) return; - if (unix_passcred_enabled(sock, other)) { + + if (unix_may_passcred(sock->sk) || + !other->sk_socket || unix_may_passcred(other)) { UNIXCB(skb).pid = get_pid(task_tgid(current)); current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid); } @@ -1974,9 +1970,7 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg, goto out; } - if ((test_bit(SOCK_PASSCRED, &sock->flags) || - test_bit(SOCK_PASSPIDFD, &sock->flags)) && - !READ_ONCE(u->addr)) { + if (unix_may_passcred(sk) && !READ_ONCE(u->addr)) { err = unix_autobind(sk); if (err) goto out; @@ -2846,8 +2840,7 @@ unlock: /* Never glue messages from different writers */ if (!unix_skb_scm_eq(skb, &scm)) break; - } else if (test_bit(SOCK_PASSCRED, &sock->flags) || - test_bit(SOCK_PASSPIDFD, &sock->flags)) { + } else if (unix_may_passcred(sk)) { /* Copy credentials */ scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid); unix_set_secdata(&scm, skb); -- cgit From 3041bbbeb41b807d2e24d7d78d9cc1387d95898a Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 19 May 2025 13:57:53 -0700 Subject: af_unix: Don't pass struct socket to maybe_add_creds(). We will move SOCK_PASS{CRED,PIDFD,SEC} from struct socket.flags to struct sock for better handling with SOCK_PASSRIGHTS. Then, we don't need to access struct socket in maybe_add_creds(). Let's pass struct sock to maybe_add_creds() and its caller queue_oob(). While at it, we append the unix_ prefix and fix double spaces around the pid assignment. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/unix/af_unix.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 464e183ffdb8..a39497fd6e98 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1869,7 +1869,7 @@ static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool sen { int err = 0; - UNIXCB(skb).pid = get_pid(scm->pid); + UNIXCB(skb).pid = get_pid(scm->pid); UNIXCB(skb).uid = scm->creds.uid; UNIXCB(skb).gid = scm->creds.gid; UNIXCB(skb).fp = NULL; @@ -1886,15 +1886,15 @@ static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool sen * We include credentials if source or destination socket * asserted SOCK_PASSCRED. */ -static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock, - const struct sock *other) +static void unix_maybe_add_creds(struct sk_buff *skb, const struct sock *sk, + const struct sock *other) { if (UNIXCB(skb).pid) return; - if (unix_may_passcred(sock->sk) || + if (unix_may_passcred(sk) || !other->sk_socket || unix_may_passcred(other)) { - UNIXCB(skb).pid = get_pid(task_tgid(current)); + UNIXCB(skb).pid = get_pid(task_tgid(current)); current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid); } } @@ -2133,7 +2133,8 @@ restart_locked: if (sock_flag(other, SOCK_RCVTSTAMP)) __net_timestamp(skb); - maybe_add_creds(skb, sock, other); + + unix_maybe_add_creds(skb, sk, other); scm_stat_add(other, skb); skb_queue_tail(&other->sk_receive_queue, skb); unix_state_unlock(other); @@ -2161,14 +2162,14 @@ out: #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768)) #if IS_ENABLED(CONFIG_AF_UNIX_OOB) -static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other, +static int queue_oob(struct sock *sk, struct msghdr *msg, struct sock *other, struct scm_cookie *scm, bool fds_sent) { struct unix_sock *ousk = unix_sk(other); struct sk_buff *skb; int err; - skb = sock_alloc_send_skb(sock->sk, 1, msg->msg_flags & MSG_DONTWAIT, &err); + skb = sock_alloc_send_skb(sk, 1, msg->msg_flags & MSG_DONTWAIT, &err); if (!skb) return err; @@ -2192,7 +2193,7 @@ static int queue_oob(struct socket *sock, struct msghdr *msg, struct sock *other goto out; } - maybe_add_creds(skb, sock, other); + unix_maybe_add_creds(skb, sk, other); scm_stat_add(other, skb); spin_lock(&other->sk_receive_queue.lock); @@ -2308,7 +2309,7 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, (other->sk_shutdown & RCV_SHUTDOWN)) goto out_pipe_unlock; - maybe_add_creds(skb, sock, other); + unix_maybe_add_creds(skb, sk, other); scm_stat_add(other, skb); skb_queue_tail(&other->sk_receive_queue, skb); unix_state_unlock(other); @@ -2318,7 +2319,7 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, #if IS_ENABLED(CONFIG_AF_UNIX_OOB) if (msg->msg_flags & MSG_OOB) { - err = queue_oob(sock, msg, other, &scm, fds_sent); + err = queue_oob(sk, msg, other, &scm, fds_sent); if (err) goto out_err; sent++; -- cgit From 0e81cfd971dc4833c699dcd8924e54a5021bc4e8 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 19 May 2025 13:57:57 -0700 Subject: af_unix: Move SOCK_PASS{CRED,PIDFD,SEC} to struct sock. As explained in the next patch, SO_PASSRIGHTS would have a problem if we assigned a corresponding bit to socket->flags, so it must be managed in struct sock. Mixing socket->flags and sk->sk_flags for similar options will look confusing, and sk->sk_flags does not have enough space on 32bit system. Also, as mentioned in commit 16e572626961 ("af_unix: dont send SCM_CREDENTIALS by default"), SOCK_PASSCRED and SOCK_PASSPID handling is known to be slow, and managing the flags in struct socket cannot avoid that for embryo sockets. Let's move SOCK_PASS{CRED,PIDFD,SEC} to struct sock. While at it, other SOCK_XXX flags in net.h are grouped as enum. Note that assign_bit() was atomic, so the writer side is moved down after lock_sock() in setsockopt(), but the bit is only read once in sendmsg() and recvmsg(), so lock_sock() is not needed there. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/unix/af_unix.c | 18 ++---------------- 1 file changed, 2 insertions(+), 16 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index a39497fd6e98..27ebda4cd9b9 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -767,10 +767,7 @@ static void copy_peercred(struct sock *sk, struct sock *peersk) static bool unix_may_passcred(const struct sock *sk) { - struct socket *sock = sk->sk_socket; - - return test_bit(SOCK_PASSCRED, &sock->flags) || - test_bit(SOCK_PASSPIDFD, &sock->flags); + return sk->sk_scm_credentials || sk->sk_scm_pidfd; } static int unix_listen(struct socket *sock, int backlog) @@ -1713,17 +1710,6 @@ static int unix_socketpair(struct socket *socka, struct socket *sockb) return 0; } -static void unix_sock_inherit_flags(const struct socket *old, - struct socket *new) -{ - if (test_bit(SOCK_PASSCRED, &old->flags)) - set_bit(SOCK_PASSCRED, &new->flags); - if (test_bit(SOCK_PASSPIDFD, &old->flags)) - set_bit(SOCK_PASSPIDFD, &new->flags); - if (test_bit(SOCK_PASSSEC, &old->flags)) - set_bit(SOCK_PASSSEC, &new->flags); -} - static int unix_accept(struct socket *sock, struct socket *newsock, struct proto_accept_arg *arg) { @@ -1760,7 +1746,7 @@ static int unix_accept(struct socket *sock, struct socket *newsock, unix_state_lock(tsk); unix_update_edges(unix_sk(tsk)); newsock->state = SS_CONNECTED; - unix_sock_inherit_flags(sock, newsock); + tsk->sk_scm_recv_flags = READ_ONCE(sk->sk_scm_recv_flags); sock_graft(tsk, newsock); unix_state_unlock(tsk); return 0; -- cgit From 3f84d577b79d2fce8221244f2509734940609ca6 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 19 May 2025 13:57:58 -0700 Subject: af_unix: Inherit sk_flags at connect(). MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For SOCK_STREAM embryo sockets, the SO_PASS{CRED,PIDFD,SEC} options are inherited from the parent listen()ing socket. Currently, this inheritance happens at accept(), because these attributes were stored in sk->sk_socket->flags and the struct socket is not allocated until accept(). This leads to unintentional behaviour. When a peer sends data to an embryo socket in the accept() queue, unix_maybe_add_creds() embeds credentials into the skb, even if neither the peer nor the listener has enabled these options. If the option is enabled, the embryo socket receives the ancillary data after accept(). If not, the data is silently discarded. This conservative approach works for SO_PASS{CRED,PIDFD,SEC}, but would not for SO_PASSRIGHTS; once an SCM_RIGHTS with a hung file descriptor was sent, it'd be game over. To avoid this, we will need to preserve SOCK_PASSRIGHTS even on embryo sockets. Commit aed6ecef55d7 ("af_unix: Save listener for embryo socket.") made it possible to access the parent's flags in sendmsg() via unix_sk(other)->listener->sk->sk_socket->flags, but this introduces an unnecessary condition that is irrelevant for most sockets, accept()ed sockets and clients. Therefore, we moved SOCK_PASSXXX into struct sock. Let’s inherit sk->sk_scm_recv_flags at connect() to avoid receiving SCM_RIGHTS on embryo sockets created from a parent with SO_PASSRIGHTS=0. Note that the parent socket is locked in connect() so we don't need READ_ONCE() for sk_scm_recv_flags. Now, we can remove !other->sk_socket check in unix_maybe_add_creds() to avoid slow SOCK_PASS{CRED,PIDFD} handling for embryo sockets created from a parent with SO_PASS{CRED,PIDFD}=0. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/unix/af_unix.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 27ebda4cd9b9..900bad88fbd2 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1626,10 +1626,12 @@ restart: /* The way is open! Fastly set all the necessary fields... */ sock_hold(sk); - unix_peer(newsk) = sk; - newsk->sk_state = TCP_ESTABLISHED; - newsk->sk_type = sk->sk_type; + unix_peer(newsk) = sk; + newsk->sk_state = TCP_ESTABLISHED; + newsk->sk_type = sk->sk_type; + newsk->sk_scm_recv_flags = other->sk_scm_recv_flags; init_peercred(newsk); + newu = unix_sk(newsk); newu->listener = other; RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq); @@ -1746,7 +1748,6 @@ static int unix_accept(struct socket *sock, struct socket *newsock, unix_state_lock(tsk); unix_update_edges(unix_sk(tsk)); newsock->state = SS_CONNECTED; - tsk->sk_scm_recv_flags = READ_ONCE(sk->sk_scm_recv_flags); sock_graft(tsk, newsock); unix_state_unlock(tsk); return 0; @@ -1878,8 +1879,7 @@ static void unix_maybe_add_creds(struct sk_buff *skb, const struct sock *sk, if (UNIXCB(skb).pid) return; - if (unix_may_passcred(sk) || - !other->sk_socket || unix_may_passcred(other)) { + if (unix_may_passcred(sk) || unix_may_passcred(other)) { UNIXCB(skb).pid = get_pid(task_tgid(current)); current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid); } -- cgit From 77cbe1a6d8730a07f99f9263c2d5f2304cf5e830 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Mon, 19 May 2025 13:57:59 -0700 Subject: af_unix: Introduce SO_PASSRIGHTS. As long as recvmsg() or recvmmsg() is used with cmsg, it is not possible to avoid receiving file descriptors via SCM_RIGHTS. This behaviour has occasionally been flagged as problematic, as it can be (ab)used to trigger DoS during close(), for example, by passing a FUSE-controlled fd or a hung NFS fd. For instance, as noted on the uAPI Group page [0], an untrusted peer could send a file descriptor pointing to a hung NFS mount and then close it. Once the receiver calls recvmsg() with msg_control, the descriptor is automatically installed, and then the responsibility for the final close() now falls on the receiver, which may result in blocking the process for a long time. Regarding this, systemd calls cmsg_close_all() [1] after each recvmsg() to close() unwanted file descriptors sent via SCM_RIGHTS. However, this cannot work around the issue at all, because the final fput() may still occur on the receiver's side once sendmsg() with SCM_RIGHTS succeeds. Also, even filtering by LSM at recvmsg() does not work for the same reason. Thus, we need a better way to refuse SCM_RIGHTS at sendmsg(). Let's introduce SO_PASSRIGHTS to disable SCM_RIGHTS. Note that this option is enabled by default for backward compatibility. Link: https://uapi-group.org/kernel-features/#disabling-reception-of-scm_rights-for-af_unix-sockets #[0] Link: https://github.com/systemd/systemd/blob/v257.5/src/basic/fd-util.c#L612-L628 #[1] Signed-off-by: Kuniyuki Iwashima Reviewed-by: Willem de Bruijn Signed-off-by: David S. Miller --- net/unix/af_unix.c | 22 ++++++++++++++++++++-- 1 file changed, 20 insertions(+), 2 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 900bad88fbd2..bd507f74e35e 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1015,6 +1015,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern, sock_init_data(sock, sk); + sk->sk_scm_rights = 1; sk->sk_hash = unix_unbound_hash(sk); sk->sk_allocation = GFP_KERNEL_ACCOUNT; sk->sk_write_space = unix_write_space; @@ -2073,6 +2074,11 @@ restart_locked: goto out_unlock; } + if (UNIXCB(skb).fp && !other->sk_scm_rights) { + err = -EPERM; + goto out_unlock; + } + if (sk->sk_type != SOCK_SEQPACKET) { err = security_unix_may_send(sk->sk_socket, other->sk_socket); if (err) @@ -2174,9 +2180,13 @@ static int queue_oob(struct sock *sk, struct msghdr *msg, struct sock *other, if (sock_flag(other, SOCK_DEAD) || (other->sk_shutdown & RCV_SHUTDOWN)) { - unix_state_unlock(other); err = -EPIPE; - goto out; + goto out_unlock; + } + + if (UNIXCB(skb).fp && !other->sk_scm_rights) { + err = -EPERM; + goto out_unlock; } unix_maybe_add_creds(skb, sk, other); @@ -2192,6 +2202,8 @@ static int queue_oob(struct sock *sk, struct msghdr *msg, struct sock *other, other->sk_data_ready(other); return 0; +out_unlock: + unix_state_unlock(other); out: consume_skb(skb); return err; @@ -2295,6 +2307,12 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, (other->sk_shutdown & RCV_SHUTDOWN)) goto out_pipe_unlock; + if (UNIXCB(skb).fp && !other->sk_scm_rights) { + unix_state_unlock(other); + err = -EPERM; + goto out_free; + } + unix_maybe_add_creds(skb, sk, other); scm_stat_add(other, skb); skb_queue_tail(&other->sk_receive_queue, skb); -- cgit From 43fb2b30eea7cfc40214484935b026ec29838a91 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 11 Jun 2025 13:27:35 -0700 Subject: af_unix: Allow passing cred for embryo without SO_PASSCRED/SO_PASSPIDFD. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Before the cited commit, the kernel unconditionally embedded SCM credentials to skb for embryo sockets even when both the sender and listener disabled SO_PASSCRED and SO_PASSPIDFD. Now, the credentials are added to skb only when configured by the sender or the listener. However, as reported in the link below, it caused a regression for some programs that assume credentials are included in every skb, but sometimes not now. The only problematic scenario would be that a socket starts listening before setting the option. Then, there will be 2 types of non-small race window, where a client can send skb without credentials, which the peer receives as an "invalid" message (and aborts the connection it seems ?): Client Server ------ ------ s1.listen() <-- No SO_PASS{CRED,PIDFD} s2.connect() s2.send() <-- w/o cred s1.setsockopt(SO_PASS{CRED,PIDFD}) s2.send() <-- w/ cred or Client Server ------ ------ s1.listen() <-- No SO_PASS{CRED,PIDFD} s2.connect() s2.send() <-- w/o cred s3, _ = s1.accept() <-- Inherit cred options s2.send() <-- w/o cred but not set yet s3.setsockopt(SO_PASS{CRED,PIDFD}) s2.send() <-- w/ cred It's unfortunate that buggy programs depend on the behaviour, but let's restore the previous behaviour. Fixes: 3f84d577b79d ("af_unix: Inherit sk_flags at connect().") Reported-by: Jacek Łuczak Closes: https://lore.kernel.org/all/68d38b0b-1666-4974-85d4-15575789c8d4@gmail.com/ Signed-off-by: Kuniyuki Iwashima Tested-by: Christian Heusel Tested-by: André Almeida Tested-by: Jacek Łuczak Link: https://patch.msgid.link/20250611202758.3075858-1-kuni1840@gmail.com Signed-off-by: Jakub Kicinski --- net/unix/af_unix.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 2e2e9997a68e..22e170fb5dda 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1971,7 +1971,8 @@ static void unix_maybe_add_creds(struct sk_buff *skb, const struct sock *sk, if (UNIXCB(skb).pid) return; - if (unix_may_passcred(sk) || unix_may_passcred(other)) { + if (unix_may_passcred(sk) || unix_may_passcred(other) || + !other->sk_socket) { UNIXCB(skb).pid = get_pid(task_tgid(current)); current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid); } -- cgit From 804d6794497e6f3992d156e07d01e22b037ce09e Mon Sep 17 00:00:00 2001 From: Christian Brauner Date: Wed, 18 Jun 2025 22:53:42 +0200 Subject: pidfs: remove pidfs_{get,put}_pid() Now that we stash persistent information in struct pid there's no need to play volatile games with pinning struct pid via dentries in pidfs. Link: https://lore.kernel.org/20250618-work-pidfs-persistent-v2-8-98f3456fd552@kernel.org Reviewed-by: Alexander Mikhalitsyn Signed-off-by: Christian Brauner --- net/unix/af_unix.c | 5 ----- 1 file changed, 5 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 2e2e9997a68e..129388c309b0 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -646,9 +646,6 @@ static void unix_sock_destructor(struct sock *sk) return; } - if (sk->sk_peer_pid) - pidfs_put_pid(sk->sk_peer_pid); - if (u->addr) unix_release_addr(u->addr); @@ -769,7 +766,6 @@ static void drop_peercred(struct unix_peercred *peercred) swap(peercred->peer_pid, pid); swap(peercred->peer_cred, cred); - pidfs_put_pid(pid); put_pid(pid); put_cred(cred); } @@ -802,7 +798,6 @@ static void copy_peercred(struct sock *sk, struct sock *peersk) spin_lock(&sk->sk_peer_lock); sk->sk_peer_pid = get_pid(peersk->sk_peer_pid); - pidfs_get_pid(sk->sk_peer_pid); sk->sk_peer_cred = get_cred(peersk->sk_peer_cred); spin_unlock(&sk->sk_peer_lock); } -- cgit From c51da3f7a161c6822232be832abdffe47eb55b4c Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 20 Jun 2025 13:30:01 +0000 Subject: net: remove sock_i_uid() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Difference between sock_i_uid() and sk_uid() is that after sock_orphan(), sock_i_uid() returns GLOBAL_ROOT_UID while sk_uid() returns the last cached sk->sk_uid value. None of sock_i_uid() callers care about this. Use sk_uid() which is much faster and inlined. Note that diag/dump users are calling sock_i_ino() and can not see the full benefit yet. Signed-off-by: Eric Dumazet Cc: Lorenzo Colitti Reviewed-by: Maciej Żenczykowski Link: https://patch.msgid.link/20250620133001.4090592-3-edumazet@google.com Signed-off-by: Jakub Kicinski --- net/unix/af_unix.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 22e170fb5dda..1e320f89168d 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -3682,7 +3682,7 @@ static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v) goto unlock; } - uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)); + uid = from_kuid_munged(seq_user_ns(seq), sk_uid(sk)); meta.seq = seq; prog = bpf_iter_get_info(&meta, false); ret = unix_prog_seq_show(prog, &meta, v, uid); -- cgit From 32ca245464e1479bfea8592b9db227fdc1641705 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 18 Jun 2025 21:13:55 -0700 Subject: af_unix: Don't leave consecutive consumed OOB skbs. Jann Horn reported a use-after-free in unix_stream_read_generic(). The following sequences reproduce the issue: $ python3 from socket import * s1, s2 = socketpair(AF_UNIX, SOCK_STREAM) s1.send(b'x', MSG_OOB) s2.recv(1, MSG_OOB) # leave a consumed OOB skb s1.send(b'y', MSG_OOB) s2.recv(1, MSG_OOB) # leave a consumed OOB skb s1.send(b'z', MSG_OOB) s2.recv(1) # recv 'z' illegally s2.recv(1, MSG_OOB) # access 'z' skb (use-after-free) Even though a user reads OOB data, the skb holding the data stays on the recv queue to mark the OOB boundary and break the next recv(). After the last send() in the scenario above, the sk2's recv queue has 2 leading consumed OOB skbs and 1 real OOB skb. Then, the following happens during the next recv() without MSG_OOB 1. unix_stream_read_generic() peeks the first consumed OOB skb 2. manage_oob() returns the next consumed OOB skb 3. unix_stream_read_generic() fetches the next not-yet-consumed OOB skb 4. unix_stream_read_generic() reads and frees the OOB skb , and the last recv(MSG_OOB) triggers KASAN splat. The 3. above occurs because of the SO_PEEK_OFF code, which does not expect unix_skb_len(skb) to be 0, but this is true for such consumed OOB skbs. while (skip >= unix_skb_len(skb)) { skip -= unix_skb_len(skb); skb = skb_peek_next(skb, &sk->sk_receive_queue); ... } In addition to this use-after-free, there is another issue that ioctl(SIOCATMARK) does not function properly with consecutive consumed OOB skbs. So, nothing good comes out of such a situation. Instead of complicating manage_oob(), ioctl() handling, and the next ECONNRESET fix by introducing a loop for consecutive consumed OOB skbs, let's not leave such consecutive OOB unnecessarily. Now, while receiving an OOB skb in unix_stream_recv_urg(), if its previous skb is a consumed OOB skb, it is freed. [0]: BUG: KASAN: slab-use-after-free in unix_stream_read_actor (net/unix/af_unix.c:3027) Read of size 4 at addr ffff888106ef2904 by task python3/315 CPU: 2 UID: 0 PID: 315 Comm: python3 Not tainted 6.16.0-rc1-00407-gec315832f6f9 #8 PREEMPT(voluntary) Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.3-4.fc42 04/01/2014 Call Trace: dump_stack_lvl (lib/dump_stack.c:122) print_report (mm/kasan/report.c:409 mm/kasan/report.c:521) kasan_report (mm/kasan/report.c:636) unix_stream_read_actor (net/unix/af_unix.c:3027) unix_stream_read_generic (net/unix/af_unix.c:2708 net/unix/af_unix.c:2847) unix_stream_recvmsg (net/unix/af_unix.c:3048) sock_recvmsg (net/socket.c:1063 (discriminator 20) net/socket.c:1085 (discriminator 20)) __sys_recvfrom (net/socket.c:2278) __x64_sys_recvfrom (net/socket.c:2291 (discriminator 1) net/socket.c:2287 (discriminator 1) net/socket.c:2287 (discriminator 1)) do_syscall_64 (arch/x86/entry/syscall_64.c:63 (discriminator 1) arch/x86/entry/syscall_64.c:94 (discriminator 1)) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130) RIP: 0033:0x7f8911fcea06 Code: 5d e8 41 8b 93 08 03 00 00 59 5e 48 83 f8 fc 75 19 83 e2 39 83 fa 08 75 11 e8 26 ff ff ff 66 0f 1f 44 00 00 48 8b 45 10 0f 05 <48> 8b 5d f8 c9 c3 0f 1f 40 00 f3 0f 1e fa 55 48 89 e5 48 83 ec 08 RSP: 002b:00007fffdb0dccb0 EFLAGS: 00000202 ORIG_RAX: 000000000000002d RAX: ffffffffffffffda RBX: 00007fffdb0dcdc8 RCX: 00007f8911fcea06 RDX: 0000000000000001 RSI: 00007f8911a5e060 RDI: 0000000000000006 RBP: 00007fffdb0dccd0 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000001 R11: 0000000000000202 R12: 00007f89119a7d20 R13: ffffffffc4653600 R14: 0000000000000000 R15: 0000000000000000 Allocated by task 315: kasan_save_stack (mm/kasan/common.c:48) kasan_save_track (mm/kasan/common.c:60 (discriminator 1) mm/kasan/common.c:69 (discriminator 1)) __kasan_slab_alloc (mm/kasan/common.c:348) kmem_cache_alloc_node_noprof (./include/linux/kasan.h:250 mm/slub.c:4148 mm/slub.c:4197 mm/slub.c:4249) __alloc_skb (net/core/skbuff.c:660 (discriminator 4)) alloc_skb_with_frags (./include/linux/skbuff.h:1336 net/core/skbuff.c:6668) sock_alloc_send_pskb (net/core/sock.c:2993) unix_stream_sendmsg (./include/net/sock.h:1847 net/unix/af_unix.c:2256 net/unix/af_unix.c:2418) __sys_sendto (net/socket.c:712 (discriminator 20) net/socket.c:727 (discriminator 20) net/socket.c:2226 (discriminator 20)) __x64_sys_sendto (net/socket.c:2233 (discriminator 1) net/socket.c:2229 (discriminator 1) net/socket.c:2229 (discriminator 1)) do_syscall_64 (arch/x86/entry/syscall_64.c:63 (discriminator 1) arch/x86/entry/syscall_64.c:94 (discriminator 1)) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130) Freed by task 315: kasan_save_stack (mm/kasan/common.c:48) kasan_save_track (mm/kasan/common.c:60 (discriminator 1) mm/kasan/common.c:69 (discriminator 1)) kasan_save_free_info (mm/kasan/generic.c:579 (discriminator 1)) __kasan_slab_free (mm/kasan/common.c:271) kmem_cache_free (mm/slub.c:4643 (discriminator 3) mm/slub.c:4745 (discriminator 3)) unix_stream_read_generic (net/unix/af_unix.c:3010) unix_stream_recvmsg (net/unix/af_unix.c:3048) sock_recvmsg (net/socket.c:1063 (discriminator 20) net/socket.c:1085 (discriminator 20)) __sys_recvfrom (net/socket.c:2278) __x64_sys_recvfrom (net/socket.c:2291 (discriminator 1) net/socket.c:2287 (discriminator 1) net/socket.c:2287 (discriminator 1)) do_syscall_64 (arch/x86/entry/syscall_64.c:63 (discriminator 1) arch/x86/entry/syscall_64.c:94 (discriminator 1)) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:130) The buggy address belongs to the object at ffff888106ef28c0 which belongs to the cache skbuff_head_cache of size 224 The buggy address is located 68 bytes inside of freed 224-byte region [ffff888106ef28c0, ffff888106ef29a0) The buggy address belongs to the physical page: page: refcount:0 mapcount:0 mapping:0000000000000000 index:0xffff888106ef3cc0 pfn:0x106ef2 head: order:1 mapcount:0 entire_mapcount:0 nr_pages_mapped:0 pincount:0 flags: 0x200000000000040(head|node=0|zone=2) page_type: f5(slab) raw: 0200000000000040 ffff8881001d28c0 ffffea000422fe00 0000000000000004 raw: ffff888106ef3cc0 0000000080190010 00000000f5000000 0000000000000000 head: 0200000000000040 ffff8881001d28c0 ffffea000422fe00 0000000000000004 head: ffff888106ef3cc0 0000000080190010 00000000f5000000 0000000000000000 head: 0200000000000001 ffffea00041bbc81 00000000ffffffff 00000000ffffffff head: 0000000000000000 0000000000000000 00000000ffffffff 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff888106ef2800: 00 00 00 00 00 00 00 00 00 00 00 00 fc fc fc fc ffff888106ef2880: fc fc fc fc fc fc fc fc fa fb fb fb fb fb fb fb >ffff888106ef2900: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff888106ef2980: fb fb fb fb fc fc fc fc fc fc fc fc fc fc fc fc ffff888106ef2a00: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb Fixes: 314001f0bf92 ("af_unix: Add OOB support") Reported-by: Jann Horn Signed-off-by: Kuniyuki Iwashima Reviewed-by: Jann Horn Link: https://patch.msgid.link/20250619041457.1132791-2-kuni1840@gmail.com Signed-off-by: Paolo Abeni --- net/unix/af_unix.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 22e170fb5dda..5392aa53cbc8 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2680,11 +2680,11 @@ struct unix_stream_read_state { #if IS_ENABLED(CONFIG_AF_UNIX_OOB) static int unix_stream_recv_urg(struct unix_stream_read_state *state) { + struct sk_buff *oob_skb, *read_skb = NULL; struct socket *sock = state->socket; struct sock *sk = sock->sk; struct unix_sock *u = unix_sk(sk); int chunk = 1; - struct sk_buff *oob_skb; mutex_lock(&u->iolock); unix_state_lock(sk); @@ -2699,9 +2699,16 @@ static int unix_stream_recv_urg(struct unix_stream_read_state *state) oob_skb = u->oob_skb; - if (!(state->flags & MSG_PEEK)) + if (!(state->flags & MSG_PEEK)) { WRITE_ONCE(u->oob_skb, NULL); + if (oob_skb->prev != (struct sk_buff *)&sk->sk_receive_queue && + !unix_skb_len(oob_skb->prev)) { + read_skb = oob_skb->prev; + __skb_unlink(read_skb, &sk->sk_receive_queue); + } + } + spin_unlock(&sk->sk_receive_queue.lock); unix_state_unlock(sk); @@ -2712,6 +2719,8 @@ static int unix_stream_recv_urg(struct unix_stream_read_state *state) mutex_unlock(&u->iolock); + consume_skb(read_skb); + if (chunk < 0) return -EFAULT; -- cgit From 2a5a4841846b079b5fca5752fe94e59346fbda40 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 18 Jun 2025 21:13:57 -0700 Subject: af_unix: Don't set -ECONNRESET for consumed OOB skb. Christian Brauner reported that even after MSG_OOB data is consumed, calling close() on the receiver socket causes the peer's recv() to return -ECONNRESET: 1. send() and recv() an OOB data. >>> from socket import * >>> s1, s2 = socketpair(AF_UNIX, SOCK_STREAM) >>> s1.send(b'x', MSG_OOB) 1 >>> s2.recv(1, MSG_OOB) b'x' 2. close() for s2 sets ECONNRESET to s1->sk_err even though s2 consumed the OOB data >>> s2.close() >>> s1.recv(10, MSG_DONTWAIT) ... ConnectionResetError: [Errno 104] Connection reset by peer Even after being consumed, the skb holding the OOB 1-byte data stays in the recv queue to mark the OOB boundary and break recv() at that point. This must be considered while close()ing a socket. Let's skip the leading consumed OOB skb while checking the -ECONNRESET condition in unix_release_sock(). Fixes: 314001f0bf92 ("af_unix: Add OOB support") Reported-by: Christian Brauner Closes: https://lore.kernel.org/netdev/20250529-sinkt-abfeuern-e7b08200c6b0@brauner/ Signed-off-by: Kuniyuki Iwashima Acked-by: Christian Brauner Link: https://patch.msgid.link/20250619041457.1132791-4-kuni1840@gmail.com Signed-off-by: Paolo Abeni --- net/unix/af_unix.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 5392aa53cbc8..52b155123985 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -660,6 +660,11 @@ static void unix_sock_destructor(struct sock *sk) #endif } +static unsigned int unix_skb_len(const struct sk_buff *skb) +{ + return skb->len - UNIXCB(skb).consumed; +} + static void unix_release_sock(struct sock *sk, int embrion) { struct unix_sock *u = unix_sk(sk); @@ -694,10 +699,16 @@ static void unix_release_sock(struct sock *sk, int embrion) if (skpair != NULL) { if (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) { + struct sk_buff *skb = skb_peek(&sk->sk_receive_queue); + +#if IS_ENABLED(CONFIG_AF_UNIX_OOB) + if (skb && !unix_skb_len(skb)) + skb = skb_peek_next(skb, &sk->sk_receive_queue); +#endif unix_state_lock(skpair); /* No more writes */ WRITE_ONCE(skpair->sk_shutdown, SHUTDOWN_MASK); - if (!skb_queue_empty_lockless(&sk->sk_receive_queue) || embrion) + if (skb || embrion) WRITE_ONCE(skpair->sk_err, ECONNRESET); unix_state_unlock(skpair); skpair->sk_state_change(skpair); @@ -2661,11 +2672,6 @@ static long unix_stream_data_wait(struct sock *sk, long timeo, return timeo; } -static unsigned int unix_skb_len(const struct sk_buff *skb) -{ - return skb->len - UNIXCB(skb).consumed; -} - struct unix_stream_read_state { int (*recv_actor)(struct sk_buff *, int, int, struct unix_stream_read_state *); -- cgit From 9bedee7cdf4cb7f9a4928f10567b326eaab8125d Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Fri, 4 Jul 2025 00:23:05 +0200 Subject: af_unix: rework unix_maybe_add_creds() to allow sleep As a preparation for the next patches we need to allow sleeping in unix_maybe_add_creds() and also return err. Currently, we can't do that as unix_maybe_add_creds() is being called under unix_state_lock(). There is no need for this, really. So let's move call sites of this helper a bit and do necessary function signature changes. Cc: linux-kernel@vger.kernel.org Cc: netdev@vger.kernel.org Cc: David S. Miller Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Paolo Abeni Cc: Simon Horman Cc: Leon Romanovsky Cc: Arnd Bergmann Cc: Christian Brauner Cc: Kuniyuki Iwashima Cc: Lennart Poettering Cc: Luca Boccassi Cc: David Rheinsberg Signed-off-by: Alexander Mikhalitsyn Link: https://lore.kernel.org/20250703222314.309967-2-aleksandr.mikhalitsyn@canonical.com Reviewed-by: Christian Brauner Reviewed-by: Kuniyuki Iwashima Signed-off-by: Christian Brauner --- net/unix/af_unix.c | 32 +++++++++++++++++++++++++------- 1 file changed, 25 insertions(+), 7 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 129388c309b0..fba50ceab42b 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1955,21 +1955,30 @@ static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool sen return err; } -/* +/** + * unix_maybe_add_creds() - Adds current task uid/gid and struct pid to skb if needed. + * @skb: skb to attach creds to. + * @sk: Sender sock. + * @other: Receiver sock. + * * Some apps rely on write() giving SCM_CREDENTIALS * We include credentials if source or destination socket * asserted SOCK_PASSCRED. + * + * Return: On success zero, on error a negative error code is returned. */ -static void unix_maybe_add_creds(struct sk_buff *skb, const struct sock *sk, - const struct sock *other) +static int unix_maybe_add_creds(struct sk_buff *skb, const struct sock *sk, + const struct sock *other) { if (UNIXCB(skb).pid) - return; + return 0; if (unix_may_passcred(sk) || unix_may_passcred(other)) { UNIXCB(skb).pid = get_pid(task_tgid(current)); current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid); } + + return 0; } static bool unix_skb_scm_eq(struct sk_buff *skb, @@ -2104,6 +2113,10 @@ lookup: goto out_sock_put; } + err = unix_maybe_add_creds(skb, sk, other); + if (err) + goto out_sock_put; + restart: sk_locked = 0; unix_state_lock(other); @@ -2212,7 +2225,6 @@ restart_locked: if (sock_flag(other, SOCK_RCVTSTAMP)) __net_timestamp(skb); - unix_maybe_add_creds(skb, sk, other); scm_stat_add(other, skb); skb_queue_tail(&other->sk_receive_queue, skb); unix_state_unlock(other); @@ -2256,6 +2268,10 @@ static int queue_oob(struct sock *sk, struct msghdr *msg, struct sock *other, if (err < 0) goto out; + err = unix_maybe_add_creds(skb, sk, other); + if (err) + goto out; + skb_put(skb, 1); err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, 1); @@ -2275,7 +2291,6 @@ static int queue_oob(struct sock *sk, struct msghdr *msg, struct sock *other, goto out_unlock; } - unix_maybe_add_creds(skb, sk, other); scm_stat_add(other, skb); spin_lock(&other->sk_receive_queue.lock); @@ -2369,6 +2384,10 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, fds_sent = true; + err = unix_maybe_add_creds(skb, sk, other); + if (err) + goto out_free; + if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) { skb->ip_summed = CHECKSUM_UNNECESSARY; err = skb_splice_from_iter(skb, &msg->msg_iter, size, @@ -2399,7 +2418,6 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, goto out_free; } - unix_maybe_add_creds(skb, sk, other); scm_stat_add(other, skb); skb_queue_tail(&other->sk_receive_queue, skb); unix_state_unlock(other); -- cgit From ee47976264cd499426c89328827970ffb6acd406 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Fri, 4 Jul 2025 00:23:06 +0200 Subject: af_unix: introduce unix_skb_to_scm helper Instead of open-coding let's consolidate this logic in a separate helper. This will simplify further changes. Cc: linux-kernel@vger.kernel.org Cc: netdev@vger.kernel.org Cc: David S. Miller Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Paolo Abeni Cc: Simon Horman Cc: Leon Romanovsky Cc: Arnd Bergmann Cc: Christian Brauner Cc: Kuniyuki Iwashima Cc: Lennart Poettering Cc: Luca Boccassi Cc: David Rheinsberg Signed-off-by: Alexander Mikhalitsyn Link: https://lore.kernel.org/20250703222314.309967-3-aleksandr.mikhalitsyn@canonical.com Reviewed-by: Kuniyuki Iwashima Reviewed-by: Christian Brauner Signed-off-by: Christian Brauner --- net/unix/af_unix.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index fba50ceab42b..df2174d9904d 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1955,6 +1955,12 @@ static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool sen return err; } +static void unix_skb_to_scm(struct sk_buff *skb, struct scm_cookie *scm) +{ + scm_set_cred(scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid); + unix_set_secdata(scm, skb); +} + /** * unix_maybe_add_creds() - Adds current task uid/gid and struct pid to skb if needed. * @skb: skb to attach creds to. @@ -2565,8 +2571,7 @@ int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size, memset(&scm, 0, sizeof(scm)); - scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid); - unix_set_secdata(&scm, skb); + unix_skb_to_scm(skb, &scm); if (!(flags & MSG_PEEK)) { if (UNIXCB(skb).fp) @@ -2951,8 +2956,7 @@ unlock: break; } else if (unix_may_passcred(sk)) { /* Copy credentials */ - scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid); - unix_set_secdata(&scm, skb); + unix_skb_to_scm(skb, &scm); check_creds = true; } -- cgit From 2b9996417e4ec231c91818f9ea8107ae62ef75ad Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Fri, 4 Jul 2025 00:23:08 +0200 Subject: af_unix/scm: fix whitespace errors Fix whitespace/formatting errors. Cc: linux-kernel@vger.kernel.org Cc: netdev@vger.kernel.org Cc: David S. Miller Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Paolo Abeni Cc: Simon Horman Cc: Leon Romanovsky Cc: Arnd Bergmann Cc: Christian Brauner Cc: Kuniyuki Iwashima Cc: Lennart Poettering Cc: Luca Boccassi Cc: David Rheinsberg Signed-off-by: Alexander Mikhalitsyn Link: https://lore.kernel.org/20250703222314.309967-5-aleksandr.mikhalitsyn@canonical.com Reviewed-by: Kuniyuki Iwashima Signed-off-by: Christian Brauner --- net/unix/af_unix.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index df2174d9904d..323e4fc85d4b 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1929,7 +1929,7 @@ static void unix_destruct_scm(struct sk_buff *skb) struct scm_cookie scm; memset(&scm, 0, sizeof(scm)); - scm.pid = UNIXCB(skb).pid; + scm.pid = UNIXCB(skb).pid; if (UNIXCB(skb).fp) unix_detach_fds(&scm, skb); -- cgit From 2775832f71e53a294c93fa4b343a71787a87e5d3 Mon Sep 17 00:00:00 2001 From: Alexander Mikhalitsyn Date: Fri, 4 Jul 2025 00:23:09 +0200 Subject: af_unix: stash pidfs dentry when needed We need to ensure that pidfs dentry is allocated when we meet any struct pid for the first time. This will allows us to open pidfd even after the task it corresponds to is reaped. Basically, we need to identify all places where we fill skb/scm_cookie with struct pid reference for the first time and call pidfs_register_pid(). Tricky thing here is that we have a few places where this happends depending on what userspace is doing: - [__scm_replace_pid()] explicitly sending an SCM_CREDENTIALS message and specified pid in a numeric format - [unix_maybe_add_creds()] enabled SO_PASSCRED/SO_PASSPIDFD but didn't send SCM_CREDENTIALS explicitly - [scm_send()] force_creds is true. Netlink case, we don't need to touch it. Cc: linux-kernel@vger.kernel.org Cc: netdev@vger.kernel.org Cc: David S. Miller Cc: Eric Dumazet Cc: Jakub Kicinski Cc: Paolo Abeni Cc: Simon Horman Cc: Leon Romanovsky Cc: Arnd Bergmann Cc: Christian Brauner Cc: Kuniyuki Iwashima Cc: Lennart Poettering Cc: Luca Boccassi Cc: David Rheinsberg Signed-off-by: Alexander Mikhalitsyn Link: https://lore.kernel.org/20250703222314.309967-6-aleksandr.mikhalitsyn@canonical.com Reviewed-by: Kuniyuki Iwashima Signed-off-by: Christian Brauner --- net/unix/af_unix.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 323e4fc85d4b..d52811321fce 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -1971,6 +1971,7 @@ static void unix_skb_to_scm(struct sk_buff *skb, struct scm_cookie *scm) * We include credentials if source or destination socket * asserted SOCK_PASSCRED. * + * Context: May sleep. * Return: On success zero, on error a negative error code is returned. */ static int unix_maybe_add_creds(struct sk_buff *skb, const struct sock *sk, @@ -1980,7 +1981,15 @@ static int unix_maybe_add_creds(struct sk_buff *skb, const struct sock *sk, return 0; if (unix_may_passcred(sk) || unix_may_passcred(other)) { - UNIXCB(skb).pid = get_pid(task_tgid(current)); + struct pid *pid; + int err; + + pid = task_tgid(current); + err = pidfs_register_pid(pid); + if (unlikely(err)) + return err; + + UNIXCB(skb).pid = get_pid(pid); current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid); } -- cgit From 25489a4f556414445d342951615178368ee45cde Mon Sep 17 00:00:00 2001 From: Michal Luczaj Date: Wed, 2 Jul 2025 15:38:08 +0200 Subject: net: splice: Drop unused @gfp Since its introduction in commit 2e910b95329c ("net: Add a function to splice pages into an skbuff for MSG_SPLICE_PAGES"), skb_splice_from_iter() never used the @gfp argument. Remove it and adapt callers. No functional change intended. Reviewed-by: Simon Horman Signed-off-by: Michal Luczaj Link: https://patch.msgid.link/20250702-splice-drop-unused-v3-2-55f68b60d2b7@rbox.co Signed-off-by: Jakub Kicinski --- net/unix/af_unix.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 564c970d97ff..cd0d582bc7d4 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2388,8 +2388,7 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, if (unlikely(msg->msg_flags & MSG_SPLICE_PAGES)) { skb->ip_summed = CHECKSUM_UNNECESSARY; - err = skb_splice_from_iter(skb, &msg->msg_iter, size, - sk->sk_allocation); + err = skb_splice_from_iter(skb, &msg->msg_iter, size); if (err < 0) goto out_free; -- cgit From b429a5ad19cb4efe63d18388a2a4deebcba742c6 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 2 Jul 2025 22:35:13 +0000 Subject: af_unix: Don't hold unix_state_lock() in __unix_dgram_recvmsg(). When __skb_try_recv_datagram() returns NULL in __unix_dgram_recvmsg(), we hold unix_state_lock() unconditionally. This is because SOCK_SEQPACKET sk needs to return EOF in case its peer has been close()d concurrently. This behaviour totally depends on the timing of the peer's close() and reading sk->sk_shutdown, and taking the lock does not play a role. Let's drop the lock from __unix_dgram_recvmsg() and use READ_ONCE(). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20250702223606.1054680-2-kuniyu@google.com Signed-off-by: Jakub Kicinski --- net/unix/af_unix.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index cd0d582bc7d4..becd84737635 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2527,12 +2527,10 @@ int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size, &err, &timeo, last)); if (!skb) { /* implies iolock unlocked */ - unix_state_lock(sk); /* Signal EOF on disconnected non-blocking SEQPACKET socket. */ if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN && - (sk->sk_shutdown & RCV_SHUTDOWN)) + (READ_ONCE(sk->sk_shutdown) & RCV_SHUTDOWN)) err = 0; - unix_state_unlock(sk); goto out; } -- cgit From 772f01049c4b722b28b3f7025b4996379f127ebf Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 2 Jul 2025 22:35:14 +0000 Subject: af_unix: Don't check SOCK_DEAD in unix_stream_read_skb(). unix_stream_read_skb() checks SOCK_DEAD only when the dequeued skb is OOB skb. unix_stream_read_skb() is called for a SOCK_STREAM socket in SOCKMAP when data is sent to it. The function is invoked via sk_psock_verdict_data_ready(), which is set to sk->sk_data_ready(). During sendmsg(), we check if the receiver has SOCK_DEAD, so there is no point in checking it again later in ->read_skb(). Also, unix_read_skb() for SOCK_DGRAM does not have the test either. Let's remove the SOCK_DEAD test in unix_stream_read_skb(). Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250702223606.1054680-3-kuniyu@google.com Signed-off-by: Jakub Kicinski --- net/unix/af_unix.c | 10 ---------- 1 file changed, 10 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index becd84737635..34ddea34e87e 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2803,14 +2803,6 @@ static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor) if (unlikely(skb == READ_ONCE(u->oob_skb))) { bool drop = false; - unix_state_lock(sk); - - if (sock_flag(sk, SOCK_DEAD)) { - unix_state_unlock(sk); - kfree_skb_reason(skb, SKB_DROP_REASON_SOCKET_CLOSE); - return -ECONNRESET; - } - spin_lock(&sk->sk_receive_queue.lock); if (likely(skb == u->oob_skb)) { WRITE_ONCE(u->oob_skb, NULL); @@ -2818,8 +2810,6 @@ static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor) } spin_unlock(&sk->sk_receive_queue.lock); - unix_state_unlock(sk); - if (drop) { kfree_skb_reason(skb, SKB_DROP_REASON_UNIX_SKIP_OOB); return -EAGAIN; -- cgit From d0aac85449dec992bb8dc2503f2cb9e94ef436db Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 2 Jul 2025 22:35:15 +0000 Subject: af_unix: Don't use skb_recv_datagram() in unix_stream_read_skb(). unix_stream_read_skb() calls skb_recv_datagram() with MSG_DONTWAIT, which is mostly equivalent to sock_error(sk) + skb_dequeue(). In the following patch, we will add a new field to cache the number of bytes in the receive queue. Then, we want to avoid introducing atomic ops in the fast path, so we will reuse the receive queue lock. As a preparation for the change, let's not use skb_recv_datagram() in unix_stream_read_skb(). Note that sock_error() is now moved out of the u->iolock mutex as the mutex does not synchronise the peer's close() at all. Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250702223606.1054680-4-kuniyu@google.com Signed-off-by: Jakub Kicinski --- net/unix/af_unix.c | 39 ++++++++++++++++++++++----------------- 1 file changed, 22 insertions(+), 17 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 34ddea34e87e..94596d6c37e9 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2786,6 +2786,7 @@ unlock: static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor) { + struct sk_buff_head *queue = &sk->sk_receive_queue; struct unix_sock *u = unix_sk(sk); struct sk_buff *skb; int err; @@ -2793,30 +2794,34 @@ static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor) if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)) return -ENOTCONN; - mutex_lock(&u->iolock); - skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err); - mutex_unlock(&u->iolock); - if (!skb) + err = sock_error(sk); + if (err) return err; -#if IS_ENABLED(CONFIG_AF_UNIX_OOB) - if (unlikely(skb == READ_ONCE(u->oob_skb))) { - bool drop = false; + mutex_lock(&u->iolock); + spin_lock(&queue->lock); - spin_lock(&sk->sk_receive_queue.lock); - if (likely(skb == u->oob_skb)) { - WRITE_ONCE(u->oob_skb, NULL); - drop = true; - } - spin_unlock(&sk->sk_receive_queue.lock); + skb = __skb_dequeue(queue); + if (!skb) { + spin_unlock(&queue->lock); + mutex_unlock(&u->iolock); + return -EAGAIN; + } - if (drop) { - kfree_skb_reason(skb, SKB_DROP_REASON_UNIX_SKIP_OOB); - return -EAGAIN; - } +#if IS_ENABLED(CONFIG_AF_UNIX_OOB) + if (skb == u->oob_skb) { + WRITE_ONCE(u->oob_skb, NULL); + spin_unlock(&queue->lock); + mutex_unlock(&u->iolock); + + kfree_skb_reason(skb, SKB_DROP_REASON_UNIX_SKIP_OOB); + return -EAGAIN; } #endif + spin_unlock(&queue->lock); + mutex_unlock(&u->iolock); + return recv_actor(sk, skb); } -- cgit From f4e1fb04c12384fb1b69a95c33527b515a652a74 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 2 Jul 2025 22:35:16 +0000 Subject: af_unix: Use cached value for SOCK_STREAM in unix_inq_len(). Compared to TCP, ioctl(SIOCINQ) for AF_UNIX SOCK_STREAM socket is more expensive, as unix_inq_len() requires iterating through the receive queue and accumulating skb->len. Let's cache the value for SOCK_STREAM to a new field during sendmsg() and recvmsg(). The field is protected by the receive queue lock. Note that ioctl(SIOCINQ) for SOCK_DGRAM returns the length of the first skb in the queue. SOCK_SEQPACKET still requires iterating through the queue because we do not touch functions shared with unix_dgram_ops. But, if really needed, we can support it by switching __skb_try_recv_datagram() to a custom version. Signed-off-by: Kuniyuki Iwashima Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20250702223606.1054680-5-kuniyu@google.com Signed-off-by: Jakub Kicinski --- net/unix/af_unix.c | 38 ++++++++++++++++++++++++++++---------- 1 file changed, 28 insertions(+), 10 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index 94596d6c37e9..d9e604295a71 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2297,6 +2297,7 @@ static int queue_oob(struct sock *sk, struct msghdr *msg, struct sock *other, spin_lock(&other->sk_receive_queue.lock); WRITE_ONCE(ousk->oob_skb, skb); + WRITE_ONCE(ousk->inq_len, ousk->inq_len + 1); __skb_queue_tail(&other->sk_receive_queue, skb); spin_unlock(&other->sk_receive_queue.lock); @@ -2319,6 +2320,7 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, struct sock *sk = sock->sk; struct sk_buff *skb = NULL; struct sock *other = NULL; + struct unix_sock *otheru; struct scm_cookie scm; bool fds_sent = false; int err, sent = 0; @@ -2342,14 +2344,16 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, if (msg->msg_namelen) { err = READ_ONCE(sk->sk_state) == TCP_ESTABLISHED ? -EISCONN : -EOPNOTSUPP; goto out_err; - } else { - other = unix_peer(sk); - if (!other) { - err = -ENOTCONN; - goto out_err; - } } + other = unix_peer(sk); + if (!other) { + err = -ENOTCONN; + goto out_err; + } + + otheru = unix_sk(other); + if (READ_ONCE(sk->sk_shutdown) & SEND_SHUTDOWN) goto out_pipe; @@ -2417,7 +2421,12 @@ static int unix_stream_sendmsg(struct socket *sock, struct msghdr *msg, unix_maybe_add_creds(skb, sk, other); scm_stat_add(other, skb); - skb_queue_tail(&other->sk_receive_queue, skb); + + spin_lock(&other->sk_receive_queue.lock); + WRITE_ONCE(otheru->inq_len, otheru->inq_len + skb->len); + __skb_queue_tail(&other->sk_receive_queue, skb); + spin_unlock(&other->sk_receive_queue.lock); + unix_state_unlock(other); other->sk_data_ready(other); sent += size; @@ -2704,6 +2713,7 @@ static int unix_stream_recv_urg(struct unix_stream_read_state *state) if (!(state->flags & MSG_PEEK)) { WRITE_ONCE(u->oob_skb, NULL); + WRITE_ONCE(u->inq_len, u->inq_len - 1); if (oob_skb->prev != (struct sk_buff *)&sk->sk_receive_queue && !unix_skb_len(oob_skb->prev)) { @@ -2808,6 +2818,8 @@ static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor) return -EAGAIN; } + WRITE_ONCE(u->inq_len, u->inq_len - skb->len); + #if IS_ENABLED(CONFIG_AF_UNIX_OOB) if (skb == u->oob_skb) { WRITE_ONCE(u->oob_skb, NULL); @@ -2988,7 +3000,11 @@ unlock: if (unix_skb_len(skb)) break; - skb_unlink(skb, &sk->sk_receive_queue); + spin_lock(&sk->sk_receive_queue.lock); + WRITE_ONCE(u->inq_len, u->inq_len - skb->len); + __skb_unlink(skb, &sk->sk_receive_queue); + spin_unlock(&sk->sk_receive_queue.lock); + consume_skb(skb); if (scm.fp) @@ -3159,9 +3175,11 @@ long unix_inq_len(struct sock *sk) if (READ_ONCE(sk->sk_state) == TCP_LISTEN) return -EINVAL; + if (sk->sk_type == SOCK_STREAM) + return READ_ONCE(unix_sk(sk)->inq_len); + spin_lock(&sk->sk_receive_queue.lock); - if (sk->sk_type == SOCK_STREAM || - sk->sk_type == SOCK_SEQPACKET) { + if (sk->sk_type == SOCK_SEQPACKET) { skb_queue_walk(&sk->sk_receive_queue, skb) amount += unix_skb_len(skb); } else { -- cgit From 8b77338eb2af74bb93986e4a8cfd86724168fe39 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 2 Jul 2025 22:35:17 +0000 Subject: af_unix: Cache state->msg in unix_stream_read_generic(). In unix_stream_read_generic(), state->msg is fetched multiple times. Let's cache it in a local variable. Signed-off-by: Kuniyuki Iwashima Link: https://patch.msgid.link/20250702223606.1054680-6-kuniyu@google.com Signed-off-by: Jakub Kicinski --- net/unix/af_unix.c | 33 +++++++++++++++++---------------- 1 file changed, 17 insertions(+), 16 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index d9e604295a71..c3dd41596d89 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -2840,20 +2840,21 @@ static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor) static int unix_stream_read_generic(struct unix_stream_read_state *state, bool freezable) { - struct scm_cookie scm; + int noblock = state->flags & MSG_DONTWAIT; struct socket *sock = state->socket; + struct msghdr *msg = state->msg; struct sock *sk = sock->sk; - struct unix_sock *u = unix_sk(sk); - int copied = 0; + size_t size = state->size; int flags = state->flags; - int noblock = flags & MSG_DONTWAIT; bool check_creds = false; - int target; + struct scm_cookie scm; + unsigned int last_len; + struct unix_sock *u; + int copied = 0; int err = 0; long timeo; + int target; int skip; - size_t size = state->size; - unsigned int last_len; if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)) { err = -EINVAL; @@ -2873,6 +2874,8 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state, memset(&scm, 0, sizeof(scm)); + u = unix_sk(sk); + /* Lock the socket to prevent queue disordering * while sleeps in memcpy_tomsg */ @@ -2964,14 +2967,12 @@ unlock: } /* Copy address just once */ - if (state->msg && state->msg->msg_name) { - DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, - state->msg->msg_name); - unix_copy_addr(state->msg, skb->sk); + if (msg && msg->msg_name) { + DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr, msg->msg_name); - BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk, - state->msg->msg_name, - &state->msg->msg_namelen); + unix_copy_addr(msg, skb->sk); + BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk, msg->msg_name, + &msg->msg_namelen); sunaddr = NULL; } @@ -3033,8 +3034,8 @@ unlock: } while (size); mutex_unlock(&u->iolock); - if (state->msg) - scm_recv_unix(sock, state->msg, &scm, flags); + if (msg) + scm_recv_unix(sock, msg, &scm, flags); else scm_destroy(&scm); out: -- cgit From df30285b3670bf52e1e5512e4d4482bec5e93c16 Mon Sep 17 00:00:00 2001 From: Kuniyuki Iwashima Date: Wed, 2 Jul 2025 22:35:18 +0000 Subject: af_unix: Introduce SO_INQ. We have an application that uses almost the same code for TCP and AF_UNIX (SOCK_STREAM). TCP can use TCP_INQ, but AF_UNIX doesn't have it and requires an extra syscall, ioctl(SIOCINQ) or getsockopt(SO_MEMINFO) as an alternative. Let's introduce the generic version of TCP_INQ. If SO_INQ is enabled, recvmsg() will put a cmsg of SCM_INQ that contains the exact value of ioctl(SIOCINQ). The cmsg is also included when msg->msg_get_inq is non-zero to make sockets io_uring-friendly. Note that SOCK_CUSTOM_SOCKOPT is flagged only for SOCK_STREAM to override setsockopt() for SOL_SOCKET. By having the flag in struct unix_sock, instead of struct sock, we can later add SO_INQ support for TCP and reuse tcp_sk(sk)->recvmsg_inq. Note also that supporting custom getsockopt() for SOL_SOCKET will need preparation for other SOCK_CUSTOM_SOCKOPT users (UDP, vsock, MPTCP). Signed-off-by: Kuniyuki Iwashima Reviewed-by: Willem de Bruijn Link: https://patch.msgid.link/20250702223606.1054680-7-kuniyu@google.com Signed-off-by: Jakub Kicinski --- net/unix/af_unix.c | 62 ++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 60 insertions(+), 2 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index c3dd41596d89..7a92733706fe 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -934,6 +934,52 @@ static void unix_show_fdinfo(struct seq_file *m, struct socket *sock) #define unix_show_fdinfo NULL #endif +static bool unix_custom_sockopt(int optname) +{ + switch (optname) { + case SO_INQ: + return true; + default: + return false; + } +} + +static int unix_setsockopt(struct socket *sock, int level, int optname, + sockptr_t optval, unsigned int optlen) +{ + struct unix_sock *u = unix_sk(sock->sk); + struct sock *sk = sock->sk; + int val; + + if (level != SOL_SOCKET) + return -EOPNOTSUPP; + + if (!unix_custom_sockopt(optname)) + return sock_setsockopt(sock, level, optname, optval, optlen); + + if (optlen != sizeof(int)) + return -EINVAL; + + if (copy_from_sockptr(&val, optval, sizeof(val))) + return -EFAULT; + + switch (optname) { + case SO_INQ: + if (sk->sk_type != SOCK_STREAM) + return -EINVAL; + + if (val > 1 || val < 0) + return -EINVAL; + + WRITE_ONCE(u->recvmsg_inq, val); + break; + default: + return -ENOPROTOOPT; + } + + return 0; +} + static const struct proto_ops unix_stream_ops = { .family = PF_UNIX, .owner = THIS_MODULE, @@ -950,6 +996,7 @@ static const struct proto_ops unix_stream_ops = { #endif .listen = unix_listen, .shutdown = unix_shutdown, + .setsockopt = unix_setsockopt, .sendmsg = unix_stream_sendmsg, .recvmsg = unix_stream_recvmsg, .read_skb = unix_stream_read_skb, @@ -1116,6 +1163,7 @@ static int unix_create(struct net *net, struct socket *sock, int protocol, switch (sock->type) { case SOCK_STREAM: + set_bit(SOCK_CUSTOM_SOCKOPT, &sock->flags); sock->ops = &unix_stream_ops; break; /* @@ -1847,6 +1895,9 @@ static int unix_accept(struct socket *sock, struct socket *newsock, skb_free_datagram(sk, skb); wake_up_interruptible(&unix_sk(sk)->peer_wait); + if (tsk->sk_type == SOCK_STREAM) + set_bit(SOCK_CUSTOM_SOCKOPT, &newsock->flags); + /* attach accepted sock to socket */ unix_state_lock(tsk); unix_update_edges(unix_sk(tsk)); @@ -3034,10 +3085,17 @@ unlock: } while (size); mutex_unlock(&u->iolock); - if (msg) + if (msg) { scm_recv_unix(sock, msg, &scm, flags); - else + + if (READ_ONCE(u->recvmsg_inq) || msg->msg_get_inq) { + msg->msg_inq = READ_ONCE(u->inq_len); + put_cmsg(msg, SOL_SOCKET, SCM_INQ, + sizeof(msg->msg_inq), &msg->msg_inq); + } + } else { scm_destroy(&scm); + } out: return copied ? : err; } -- cgit From 1f531e35c146cca22dc6f4a1bc657098f146f358 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 12 Jul 2025 06:41:57 +0100 Subject: don't bother with path_get()/path_put() in unix_open_file() Once unix_sock ->path is set, we are guaranteed that its ->path will remain unchanged (and pinned) until the socket is closed. OTOH, dentry_open() does not modify the path passed to it. IOW, there's no need to copy unix_sk(sk)->path in unix_open_file() - we can just pass it to dentry_open() and be done with that. Signed-off-by: Al Viro Link: https://lore.kernel.org/20250712054157.GZ1880847@ZenIV Reviewed-by: Kuniyuki Iwashima Signed-off-by: Christian Brauner --- net/unix/af_unix.c | 16 ++++------------ 1 file changed, 4 insertions(+), 12 deletions(-) (limited to 'net/unix/af_unix.c') diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c index d52811321fce..c247fb9ac761 100644 --- a/net/unix/af_unix.c +++ b/net/unix/af_unix.c @@ -3201,7 +3201,6 @@ EXPORT_SYMBOL_GPL(unix_outq_len); static int unix_open_file(struct sock *sk) { - struct path path; struct file *f; int fd; @@ -3211,27 +3210,20 @@ static int unix_open_file(struct sock *sk) if (!smp_load_acquire(&unix_sk(sk)->addr)) return -ENOENT; - path = unix_sk(sk)->path; - if (!path.dentry) + if (!unix_sk(sk)->path.dentry) return -ENOENT; - path_get(&path); - fd = get_unused_fd_flags(O_CLOEXEC); if (fd < 0) - goto out; + return fd; - f = dentry_open(&path, O_PATH, current_cred()); + f = dentry_open(&unix_sk(sk)->path, O_PATH, current_cred()); if (IS_ERR(f)) { put_unused_fd(fd); - fd = PTR_ERR(f); - goto out; + return PTR_ERR(f); } fd_install(fd, f); -out: - path_put(&path); - return fd; } -- cgit