summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/admin-guide/kernel-parameters.txt3
-rw-r--r--Documentation/filesystems/porting.rst4
-rw-r--r--Documentation/filesystems/proc.rst8
-rw-r--r--Documentation/filesystems/vfs.rst4
-rw-r--r--block/bdev.c2
-rw-r--r--drivers/dax/super.c2
-rw-r--r--drivers/misc/ibmasm/ibmasmfs.c2
-rw-r--r--drivers/usb/gadget/function/f_fs.c2
-rw-r--r--drivers/usb/gadget/legacy/inode.c2
-rw-r--r--fs/9p/vfs_super.c2
-rw-r--r--fs/afs/inode.c4
-rw-r--r--fs/btrfs/inode.c2
-rw-r--r--fs/ceph/super.c2
-rw-r--r--fs/configfs/mount.c2
-rw-r--r--fs/cramfs/inode.c11
-rw-r--r--fs/dcache.c4
-rw-r--r--fs/efivarfs/super.c2
-rw-r--r--fs/eventpoll.c139
-rw-r--r--fs/ext4/super.c2
-rw-r--r--fs/f2fs/super.c2
-rw-r--r--fs/fcntl.c10
-rw-r--r--fs/file.c5
-rw-r--r--fs/fs-writeback.c2
-rw-r--r--fs/fuse/inode.c2
-rw-r--r--fs/gfs2/super.c2
-rw-r--r--fs/hostfs/hostfs_kern.c2
-rw-r--r--fs/inode.c30
-rw-r--r--fs/ioctl.c5
-rw-r--r--fs/kernfs/mount.c2
-rw-r--r--fs/locks.c4
-rw-r--r--fs/minix/inode.c8
-rw-r--r--fs/namei.c22
-rw-r--r--fs/namespace.c106
-rw-r--r--fs/nfs/inode.c2
-rw-r--r--fs/ocfs2/dlmfs/dlmfs.c2
-rw-r--r--fs/orangefs/super.c2
-rw-r--r--fs/overlayfs/super.c2
-rw-r--r--fs/pidfs.c2
-rw-r--r--fs/pipe.c6
-rw-r--r--fs/proc/inode.c2
-rw-r--r--fs/proc/namespaces.c6
-rw-r--r--fs/proc/root.c98
-rw-r--r--fs/pstore/inode.c2
-rw-r--r--fs/ramfs/inode.c2
-rw-r--r--fs/read_write.c14
-rw-r--r--fs/smb/client/cifsfs.c2
-rw-r--r--fs/super.c8
-rw-r--r--fs/ubifs/super.c2
-rw-r--r--fs/xfs/xfs_super.c2
-rw-r--r--include/linux/fs.h10
-rw-r--r--include/linux/pid_namespace.h9
-rw-r--r--include/trace/events/filelock.h3
-rw-r--r--include/uapi/linux/fs.h5
-rw-r--r--init/Kconfig1
-rw-r--r--init/do_mounts_rd.c14
-rw-r--r--init/initramfs.c5
-rw-r--r--kernel/bpf/inode.c2
-rw-r--r--kernel/pid.c2
-rw-r--r--kernel/pid_namespace.c22
-rw-r--r--mm/shmem.c2
-rw-r--r--net/socket.c3
-rw-r--r--tools/testing/selftests/proc/.gitignore1
-rw-r--r--tools/testing/selftests/proc/Makefile1
-rw-r--r--tools/testing/selftests/proc/proc-pidns.c211
64 files changed, 582 insertions, 264 deletions
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index 5a7a83c411e9..e92c0056e4e0 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -6429,6 +6429,9 @@
rootflags= [KNL] Set root filesystem mount option string
+ initramfs_options= [KNL]
+ Specify mount options for for the initramfs mount.
+
rootfstype= [KNL] Set root filesystem type
rootwait [KNL] Wait (indefinitely) for root device to show up.
diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst
index 85f590254f07..b5db45c0094c 100644
--- a/Documentation/filesystems/porting.rst
+++ b/Documentation/filesystems/porting.rst
@@ -340,8 +340,8 @@ of those. Caller makes sure async writeback cannot be running for the inode whil
->drop_inode() returns int now; it's called on final iput() with
inode->i_lock held and it returns true if filesystems wants the inode to be
-dropped. As before, generic_drop_inode() is still the default and it's been
-updated appropriately. generic_delete_inode() is also alive and it consists
+dropped. As before, inode_generic_drop() is still the default and it's been
+updated appropriately. inode_just_drop() is also alive and it consists
simply of return 1. Note that all actual eviction work is done by caller after
->drop_inode() returns.
diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst
index 2971551b7235..b7e3147ba3d4 100644
--- a/Documentation/filesystems/proc.rst
+++ b/Documentation/filesystems/proc.rst
@@ -2362,6 +2362,7 @@ The following mount options are supported:
hidepid= Set /proc/<pid>/ access mode.
gid= Set the group authorized to learn processes information.
subset= Show only the specified subset of procfs.
+ pidns= Specify a the namespace used by this procfs.
========= ========================================================
hidepid=off or hidepid=0 means classic mode - everybody may access all
@@ -2394,6 +2395,13 @@ information about processes information, just add identd to this group.
subset=pid hides all top level files and directories in the procfs that
are not related to tasks.
+pidns= specifies a pid namespace (either as a string path to something like
+`/proc/$pid/ns/pid`, or a file descriptor when using `FSCONFIG_SET_FD`) that
+will be used by the procfs instance when translating pids. By default, procfs
+will use the calling process's active pid namespace. Note that the pid
+namespace of an existing procfs instance cannot be modified (attempting to do
+so will give an `-EBUSY` error).
+
Chapter 5: Filesystem behavior
==============================
diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst
index 486a91633474..7a314eee6305 100644
--- a/Documentation/filesystems/vfs.rst
+++ b/Documentation/filesystems/vfs.rst
@@ -327,11 +327,11 @@ or bottom half).
inode->i_lock spinlock held.
This method should be either NULL (normal UNIX filesystem
- semantics) or "generic_delete_inode" (for filesystems that do
+ semantics) or "inode_just_drop" (for filesystems that do
not want to cache inodes - causing "delete_inode" to always be
called regardless of the value of i_nlink)
- The "generic_delete_inode()" behavior is equivalent to the old
+ The "inode_just_drop()" behavior is equivalent to the old
practice of using "force_delete" in the put_inode() case, but
does not have the races that the "force_delete()" approach had.
diff --git a/block/bdev.c b/block/bdev.c
index b77ddd12dc06..810707cca970 100644
--- a/block/bdev.c
+++ b/block/bdev.c
@@ -412,7 +412,7 @@ static const struct super_operations bdev_sops = {
.statfs = simple_statfs,
.alloc_inode = bdev_alloc_inode,
.free_inode = bdev_free_inode,
- .drop_inode = generic_delete_inode,
+ .drop_inode = inode_just_drop,
.evict_inode = bdev_evict_inode,
};
diff --git a/drivers/dax/super.c b/drivers/dax/super.c
index 54c480e874cb..d7714d8afb0f 100644
--- a/drivers/dax/super.c
+++ b/drivers/dax/super.c
@@ -388,7 +388,7 @@ static const struct super_operations dax_sops = {
.alloc_inode = dax_alloc_inode,
.destroy_inode = dax_destroy_inode,
.free_inode = dax_free_inode,
- .drop_inode = generic_delete_inode,
+ .drop_inode = inode_just_drop,
};
static int dax_init_fs_context(struct fs_context *fc)
diff --git a/drivers/misc/ibmasm/ibmasmfs.c b/drivers/misc/ibmasm/ibmasmfs.c
index c44de892a61e..5372ed2a363e 100644
--- a/drivers/misc/ibmasm/ibmasmfs.c
+++ b/drivers/misc/ibmasm/ibmasmfs.c
@@ -94,7 +94,7 @@ static int ibmasmfs_init_fs_context(struct fs_context *fc)
static const struct super_operations ibmasmfs_s_ops = {
.statfs = simple_statfs,
- .drop_inode = generic_delete_inode,
+ .drop_inode = inode_just_drop,
};
static const struct file_operations *ibmasmfs_dir_ops = &simple_dir_operations;
diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c
index 08a251df20c4..5246fa6af3d6 100644
--- a/drivers/usb/gadget/function/f_fs.c
+++ b/drivers/usb/gadget/function/f_fs.c
@@ -1891,7 +1891,7 @@ static struct dentry *ffs_sb_create_file(struct super_block *sb,
/* Super block */
static const struct super_operations ffs_sb_operations = {
.statfs = simple_statfs,
- .drop_inode = generic_delete_inode,
+ .drop_inode = inode_just_drop,
};
struct ffs_sb_fill_data {
diff --git a/drivers/usb/gadget/legacy/inode.c b/drivers/usb/gadget/legacy/inode.c
index b51e132b0cd2..13c3da49348c 100644
--- a/drivers/usb/gadget/legacy/inode.c
+++ b/drivers/usb/gadget/legacy/inode.c
@@ -2011,7 +2011,7 @@ gadgetfs_create_file (struct super_block *sb, char const *name,
static const struct super_operations gadget_fs_operations = {
.statfs = simple_statfs,
- .drop_inode = generic_delete_inode,
+ .drop_inode = inode_just_drop,
};
static int
diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c
index 795c6388744c..1581ebac5bb4 100644
--- a/fs/9p/vfs_super.c
+++ b/fs/9p/vfs_super.c
@@ -252,7 +252,7 @@ static int v9fs_drop_inode(struct inode *inode)
v9ses = v9fs_inode2v9ses(inode);
if (v9ses->cache & (CACHE_META|CACHE_LOOSE))
- return generic_drop_inode(inode);
+ return inode_generic_drop(inode);
/*
* in case of non cached mode always drop the
* inode because we want the inode attribute
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index e9538e91f848..e1cb17b85791 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -723,9 +723,9 @@ int afs_drop_inode(struct inode *inode)
_enter("");
if (test_bit(AFS_VNODE_PSEUDODIR, &AFS_FS_I(inode)->flags))
- return generic_delete_inode(inode);
+ return inode_just_drop(inode);
else
- return generic_drop_inode(inode);
+ return inode_generic_drop(inode);
}
/*
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 18db1053cdf0..3923be975e47 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -7973,7 +7973,7 @@ int btrfs_drop_inode(struct inode *inode)
if (btrfs_root_refs(&root->root_item) == 0)
return 1;
else
- return generic_drop_inode(inode);
+ return inode_generic_drop(inode);
}
static void init_once(void *foo)
diff --git a/fs/ceph/super.c b/fs/ceph/super.c
index c3eb651862c5..70dc9467f6a0 100644
--- a/fs/ceph/super.c
+++ b/fs/ceph/super.c
@@ -1042,7 +1042,7 @@ static const struct super_operations ceph_super_ops = {
.alloc_inode = ceph_alloc_inode,
.free_inode = ceph_free_inode,
.write_inode = ceph_write_inode,
- .drop_inode = generic_delete_inode,
+ .drop_inode = inode_just_drop,
.evict_inode = ceph_evict_inode,
.sync_fs = ceph_sync_fs,
.put_super = ceph_put_super,
diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c
index 740f18b60c9d..456c4a2efb53 100644
--- a/fs/configfs/mount.c
+++ b/fs/configfs/mount.c
@@ -36,7 +36,7 @@ static void configfs_free_inode(struct inode *inode)
static const struct super_operations configfs_ops = {
.statfs = simple_statfs,
- .drop_inode = generic_delete_inode,
+ .drop_inode = inode_just_drop,
.free_inode = configfs_free_inode,
};
diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c
index b002e9b734f9..12daa85ed941 100644
--- a/fs/cramfs/inode.c
+++ b/fs/cramfs/inode.c
@@ -116,9 +116,18 @@ static struct inode *get_cramfs_inode(struct super_block *sb,
inode_nohighmem(inode);
inode->i_data.a_ops = &cramfs_aops;
break;
- default:
+ case S_IFCHR:
+ case S_IFBLK:
+ case S_IFIFO:
+ case S_IFSOCK:
init_special_inode(inode, cramfs_inode->mode,
old_decode_dev(cramfs_inode->size));
+ break;
+ default:
+ printk(KERN_DEBUG "CRAMFS: Invalid file type 0%04o for inode %lu.\n",
+ inode->i_mode, inode->i_ino);
+ iget_failed(inode);
+ return ERR_PTR(-EIO);
}
inode->i_mode = cramfs_inode->mode;
diff --git a/fs/dcache.c b/fs/dcache.c
index 60046ae23d51..336bdb4c4b1f 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2509,8 +2509,8 @@ static inline unsigned start_dir_add(struct inode *dir)
{
preempt_disable_nested();
for (;;) {
- unsigned n = dir->i_dir_seq;
- if (!(n & 1) && cmpxchg(&dir->i_dir_seq, n, n + 1) == n)
+ unsigned n = READ_ONCE(dir->i_dir_seq);
+ if (!(n & 1) && try_cmpxchg(&dir->i_dir_seq, &n, n + 1))
return n;
cpu_relax();
}
diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c
index 4bb4002e3cdf..1f4d8ce56667 100644
--- a/fs/efivarfs/super.c
+++ b/fs/efivarfs/super.c
@@ -127,7 +127,7 @@ static int efivarfs_unfreeze_fs(struct super_block *sb);
static const struct super_operations efivarfs_ops = {
.statfs = efivarfs_statfs,
- .drop_inode = generic_delete_inode,
+ .drop_inode = inode_just_drop,
.alloc_inode = efivarfs_alloc_inode,
.free_inode = efivarfs_free_inode,
.show_options = efivarfs_show_options,
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index b22d6f819f78..ee7c4b683ec3 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -46,10 +46,10 @@
*
* 1) epnested_mutex (mutex)
* 2) ep->mtx (mutex)
- * 3) ep->lock (rwlock)
+ * 3) ep->lock (spinlock)
*
* The acquire order is the one listed above, from 1 to 3.
- * We need a rwlock (ep->lock) because we manipulate objects
+ * We need a spinlock (ep->lock) because we manipulate objects
* from inside the poll callback, that might be triggered from
* a wake_up() that in turn might be called from IRQ context.
* So we can't sleep inside the poll callback and hence we need
@@ -195,7 +195,7 @@ struct eventpoll {
struct list_head rdllist;
/* Lock which protects rdllist and ovflist */
- rwlock_t lock;
+ spinlock_t lock;
/* RB tree root used to store monitored fd structs */
struct rb_root_cached rbr;
@@ -741,10 +741,10 @@ static void ep_start_scan(struct eventpoll *ep, struct list_head *txlist)
* in a lockless way.
*/
lockdep_assert_irqs_enabled();
- write_lock_irq(&ep->lock);
+ spin_lock_irq(&ep->lock);
list_splice_init(&ep->rdllist, txlist);
WRITE_ONCE(ep->ovflist, NULL);
- write_unlock_irq(&ep->lock);
+ spin_unlock_irq(&ep->lock);
}
static void ep_done_scan(struct eventpoll *ep,
@@ -752,7 +752,7 @@ static void ep_done_scan(struct eventpoll *ep,
{
struct epitem *epi, *nepi;
- write_lock_irq(&ep->lock);
+ spin_lock_irq(&ep->lock);
/*
* During the time we spent inside the "sproc" callback, some
* other events might have been queued by the poll callback.
@@ -793,7 +793,7 @@ static void ep_done_scan(struct eventpoll *ep,
wake_up(&ep->wq);
}
- write_unlock_irq(&ep->lock);
+ spin_unlock_irq(&ep->lock);
}
static void ep_get(struct eventpoll *ep)
@@ -868,10 +868,10 @@ static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force)
rb_erase_cached(&epi->rbn, &ep->rbr);
- write_lock_irq(&ep->lock);
+ spin_lock_irq(&ep->lock);
if (ep_is_linked(epi))
list_del_init(&epi->rdllink);
- write_unlock_irq(&ep->lock);
+ spin_unlock_irq(&ep->lock);
wakeup_source_unregister(ep_wakeup_source(epi));
/*
@@ -1152,7 +1152,7 @@ static int ep_alloc(struct eventpoll **pep)
return -ENOMEM;
mutex_init(&ep->mtx);
- rwlock_init(&ep->lock);
+ spin_lock_init(&ep->lock);
init_waitqueue_head(&ep->wq);
init_waitqueue_head(&ep->poll_wait);
INIT_LIST_HEAD(&ep->rdllist);
@@ -1240,99 +1240,9 @@ struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd,
#endif /* CONFIG_KCMP */
/*
- * Adds a new entry to the tail of the list in a lockless way, i.e.
- * multiple CPUs are allowed to call this function concurrently.
- *
- * Beware: it is necessary to prevent any other modifications of the
- * existing list until all changes are completed, in other words
- * concurrent list_add_tail_lockless() calls should be protected
- * with a read lock, where write lock acts as a barrier which
- * makes sure all list_add_tail_lockless() calls are fully
- * completed.
- *
- * Also an element can be locklessly added to the list only in one
- * direction i.e. either to the tail or to the head, otherwise
- * concurrent access will corrupt the list.
- *
- * Return: %false if element has been already added to the list, %true
- * otherwise.
- */
-static inline bool list_add_tail_lockless(struct list_head *new,
- struct list_head *head)
-{
- struct list_head *prev;
-
- /*
- * This is simple 'new->next = head' operation, but cmpxchg()
- * is used in order to detect that same element has been just
- * added to the list from another CPU: the winner observes
- * new->next == new.
- */
- if (!try_cmpxchg(&new->next, &new, head))
- return false;
-
- /*
- * Initially ->next of a new element must be updated with the head
- * (we are inserting to the tail) and only then pointers are atomically
- * exchanged. XCHG guarantees memory ordering, thus ->next should be
- * updated before pointers are actually swapped and pointers are
- * swapped before prev->next is updated.
- */
-
- prev = xchg(&head->prev, new);
-
- /*
- * It is safe to modify prev->next and new->prev, because a new element
- * is added only to the tail and new->next is updated before XCHG.
- */
-
- prev->next = new;
- new->prev = prev;
-
- return true;
-}
-
-/*
- * Chains a new epi entry to the tail of the ep->ovflist in a lockless way,
- * i.e. multiple CPUs are allowed to call this function concurrently.
- *
- * Return: %false if epi element has been already chained, %true otherwise.
- */
-static inline bool chain_epi_lockless(struct epitem *epi)
-{
- struct eventpoll *ep = epi->ep;
-
- /* Fast preliminary check */
- if (epi->next != EP_UNACTIVE_PTR)
- return false;
-
- /* Check that the same epi has not been just chained from another CPU */
- if (cmpxchg(&epi->next, EP_UNACTIVE_PTR, NULL) != EP_UNACTIVE_PTR)
- return false;
-
- /* Atomically exchange tail */
- epi->next = xchg(&ep->ovflist, epi);
-
- return true;
-}
-
-/*
* This is the callback that is passed to the wait queue wakeup
* mechanism. It is called by the stored file descriptors when they
* have events to report.
- *
- * This callback takes a read lock in order not to contend with concurrent
- * events from another file descriptor, thus all modifications to ->rdllist
- * or ->ovflist are lockless. Read lock is paired with the write lock from
- * ep_start/done_scan(), which stops all list modifications and guarantees
- * that lists state is seen correctly.
- *
- * Another thing worth to mention is that ep_poll_callback() can be called
- * concurrently for the same @epi from different CPUs if poll table was inited
- * with several wait queues entries. Plural wakeup from different CPUs of a
- * single wait queue is serialized by wq.lock, but the case when multiple wait
- * queues are used should be detected accordingly. This is detected using
- * cmpxchg() operation.
*/
static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key)
{
@@ -1343,7 +1253,7 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
unsigned long flags;
int ewake = 0;
- read_lock_irqsave(&ep->lock, flags);
+ spin_lock_irqsave(&ep->lock, flags);
ep_set_busy_poll_napi_id(epi);
@@ -1372,12 +1282,15 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
* chained in ep->ovflist and requeued later on.
*/
if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) {
- if (chain_epi_lockless(epi))
+ if (epi->next == EP_UNACTIVE_PTR) {
+ epi->next = READ_ONCE(ep->ovflist);
+ WRITE_ONCE(ep->ovflist, epi);
ep_pm_stay_awake_rcu(epi);
+ }
} else if (!ep_is_linked(epi)) {
/* In the usual case, add event to ready list. */
- if (list_add_tail_lockless(&epi->rdllink, &ep->rdllist))
- ep_pm_stay_awake_rcu(epi);
+ list_add_tail(&epi->rdllink, &ep->rdllist);
+ ep_pm_stay_awake_rcu(epi);
}
/*
@@ -1410,7 +1323,7 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v
pwake++;
out_unlock:
- read_unlock_irqrestore(&ep->lock, flags);
+ spin_unlock_irqrestore(&ep->lock, flags);
/* We have to call this outside the lock */
if (pwake)
@@ -1745,7 +1658,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
}
/* We have to drop the new item inside our item list to keep track of it */
- write_lock_irq(&ep->lock);
+ spin_lock_irq(&ep->lock);
/* record NAPI ID of new item if present */
ep_set_busy_poll_napi_id(epi);
@@ -1762,7 +1675,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event,
pwake++;
}
- write_unlock_irq(&ep->lock);
+ spin_unlock_irq(&ep->lock);
/* We have to call this outside the lock */
if (pwake)
@@ -1826,7 +1739,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi,
* list, push it inside.
*/
if (ep_item_poll(epi, &pt, 1)) {
- write_lock_irq(&ep->lock);
+ spin_lock_irq(&ep->lock);
if (!ep_is_linked(epi)) {
list_add_tail(&epi->rdllink, &ep->rdllist);
ep_pm_stay_awake(epi);
@@ -1837,7 +1750,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi,
if (waitqueue_active(&ep->poll_wait))
pwake++;
}
- write_unlock_irq(&ep->lock);
+ spin_unlock_irq(&ep->lock);
}
/* We have to call this outside the lock */
@@ -2089,7 +2002,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
init_wait(&wait);
wait.func = ep_autoremove_wake_function;
- write_lock_irq(&ep->lock);
+ spin_lock_irq(&ep->lock);
/*
* Barrierless variant, waitqueue_active() is called under
* the same lock on wakeup ep_poll_callback() side, so it
@@ -2108,7 +2021,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
if (!eavail)
__add_wait_queue_exclusive(&ep->wq, &wait);
- write_unlock_irq(&ep->lock);
+ spin_unlock_irq(&ep->lock);
if (!eavail)
timed_out = !ep_schedule_timeout(to) ||
@@ -2124,7 +2037,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
eavail = 1;
if (!list_empty_careful(&wait.entry)) {
- write_lock_irq(&ep->lock);
+ spin_lock_irq(&ep->lock);
/*
* If the thread timed out and is not on the wait queue,
* it means that the thread was woken up after its
@@ -2135,7 +2048,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
if (timed_out)
eavail = list_empty(&wait.entry);
__remove_wait_queue(&ep->wq, &wait);
- write_unlock_irq(&ep->lock);
+ spin_unlock_irq(&ep->lock);
}
}
}
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 699c15db28a8..caac4067f777 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -1417,7 +1417,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
static int ext4_drop_inode(struct inode *inode)
{
- int drop = generic_drop_inode(inode);
+ int drop = inode_generic_drop(inode);
if (!drop)
drop = fscrypt_drop_inode(inode);
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index e16c4e2830c2..63cf73409da6 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -1768,7 +1768,7 @@ static int f2fs_drop_inode(struct inode *inode)
trace_f2fs_drop_inode(inode, 0);
return 0;
}
- ret = generic_drop_inode(inode);
+ ret = inode_generic_drop(inode);
if (!ret)
ret = fscrypt_drop_inode(inode);
trace_f2fs_drop_inode(inode, ret);
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 5598e4d57422..72f8433d9109 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -355,8 +355,7 @@ static bool rw_hint_valid(u64 hint)
}
}
-static long fcntl_get_rw_hint(struct file *file, unsigned int cmd,
- unsigned long arg)
+static long fcntl_get_rw_hint(struct file *file, unsigned long arg)
{
struct inode *inode = file_inode(file);
u64 __user *argp = (u64 __user *)arg;
@@ -367,8 +366,7 @@ static long fcntl_get_rw_hint(struct file *file, unsigned int cmd,
return 0;
}
-static long fcntl_set_rw_hint(struct file *file, unsigned int cmd,
- unsigned long arg)
+static long fcntl_set_rw_hint(struct file *file, unsigned long arg)
{
struct inode *inode = file_inode(file);
u64 __user *argp = (u64 __user *)arg;
@@ -547,10 +545,10 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg,
err = memfd_fcntl(filp, cmd, argi);
break;
case F_GET_RW_HINT:
- err = fcntl_get_rw_hint(filp, cmd, arg);
+ err = fcntl_get_rw_hint(filp, arg);
break;
case F_SET_RW_HINT:
- err = fcntl_set_rw_hint(filp, cmd, arg);
+ err = fcntl_set_rw_hint(filp, arg);
break;
default:
break;
diff --git a/fs/file.c b/fs/file.c
index 6d2275c3be9c..28743b742e3c 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -1330,7 +1330,10 @@ int replace_fd(unsigned fd, struct file *file, unsigned flags)
err = expand_files(files, fd);
if (unlikely(err < 0))
goto out_unlock;
- return do_dup2(files, file, fd, flags);
+ err = do_dup2(files, file, fd, flags);
+ if (err < 0)
+ return err;
+ return 0;
out_unlock:
spin_unlock(&files->file_lock);
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index a07b8cf73ae2..a6cc3d305b84 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1123,7 +1123,7 @@ int cgroup_writeback_by_id(u64 bdi_id, int memcg_id,
dirty = dirty * 10 / 8;
/* issue the writeback work */
- work = kzalloc(sizeof(*work), GFP_NOWAIT | __GFP_NOWARN);
+ work = kzalloc(sizeof(*work), GFP_NOWAIT);
if (work) {
work->nr_pages = dirty;
work->sync_mode = WB_SYNC_NONE;
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 7ddfd2b3cc9c..fdecd5a90dee 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -1209,7 +1209,7 @@ static const struct super_operations fuse_super_operations = {
.free_inode = fuse_free_inode,
.evict_inode = fuse_evict_inode,
.write_inode = fuse_write_inode,
- .drop_inode = generic_delete_inode,
+ .drop_inode = inode_just_drop,
.umount_begin = fuse_umount_begin,
.statfs = fuse_statfs,
.sync_fs = fuse_sync_fs,
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index b42e2110084b..644b2d1e7276 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1050,7 +1050,7 @@ static int gfs2_drop_inode(struct inode *inode)
if (test_bit(SDF_EVICTING, &sdp->sd_flags))
return 1;
- return generic_drop_inode(inode);
+ return inode_generic_drop(inode);
}
/**
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index 01e516175bcd..1e1acf5775ab 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -261,7 +261,7 @@ static int hostfs_show_options(struct seq_file *seq, struct dentry *root)
static const struct super_operations hostfs_sbops = {
.alloc_inode = hostfs_alloc_inode,
.free_inode = hostfs_free_inode,
- .drop_inode = generic_delete_inode,
+ .drop_inode = inode_just_drop,
.evict_inode = hostfs_evict_inode,
.statfs = hostfs_statfs,
.show_options = hostfs_show_options,
diff --git a/fs/inode.c b/fs/inode.c
index 01ebdc40021e..a66c02123bca 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -1838,11 +1838,11 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval,
EXPORT_SYMBOL(insert_inode_locked4);
-int generic_delete_inode(struct inode *inode)
+int inode_just_drop(struct inode *inode)
{
return 1;
}
-EXPORT_SYMBOL(generic_delete_inode);
+EXPORT_SYMBOL(inode_just_drop);
/*
* Called when we're dropping the last reference
@@ -1866,7 +1866,7 @@ static void iput_final(struct inode *inode)
if (op->drop_inode)
drop = op->drop_inode(inode);
else
- drop = generic_drop_inode(inode);
+ drop = inode_generic_drop(inode);
if (!drop &&
!(inode->i_state & I_DONTCACHE) &&
@@ -2189,7 +2189,7 @@ static int __remove_privs(struct mnt_idmap *idmap,
return notify_change(idmap, dentry, &newattrs, NULL);
}
-int file_remove_privs_flags(struct file *file, unsigned int flags)
+static int file_remove_privs_flags(struct file *file, unsigned int flags)
{
struct dentry *dentry = file_dentry(file);
struct inode *inode = file_inode(file);
@@ -2214,7 +2214,6 @@ int file_remove_privs_flags(struct file *file, unsigned int flags)
inode_has_no_xattr(inode);
return error;
}
-EXPORT_SYMBOL_GPL(file_remove_privs_flags);
/**
* file_remove_privs - remove special file privileges (suid, capabilities)
@@ -2519,21 +2518,28 @@ void __init inode_init(void)
void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev)
{
inode->i_mode = mode;
- if (S_ISCHR(mode)) {
+ switch (inode->i_mode & S_IFMT) {
+ case S_IFCHR:
inode->i_fop = &def_chr_fops;
inode->i_rdev = rdev;
- } else if (S_ISBLK(mode)) {
+ break;
+ case S_IFBLK:
if (IS_ENABLED(CONFIG_BLOCK))
inode->i_fop = &def_blk_fops;
inode->i_rdev = rdev;
- } else if (S_ISFIFO(mode))
+ break;
+ case S_IFIFO:
inode->i_fop = &pipefifo_fops;
- else if (S_ISSOCK(mode))
- ; /* leave it no_open_fops */
- else
+ break;
+ case S_IFSOCK:
+ /* leave it no_open_fops */
+ break;
+ default:
printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o) for"
" inode %s:%lu\n", mode, inode->i_sb->s_id,
inode->i_ino);
+ break;
+ }
}
EXPORT_SYMBOL(init_special_inode);
@@ -2914,7 +2920,7 @@ EXPORT_SYMBOL(mode_strip_sgid);
*/
void dump_inode(struct inode *inode, const char *reason)
{
- pr_warn("%s encountered for inode %px", reason, inode);
+ pr_warn("%s encountered for inode %px (%s)\n", reason, inode, inode->i_sb->s_type->name);
}
EXPORT_SYMBOL(dump_inode);
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 0248cb8db2d3..1c152c2b1b67 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -41,7 +41,7 @@
*
* Returns 0 on success, -errno on error.
*/
-int vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+static int vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
int error = -ENOTTY;
@@ -54,7 +54,6 @@ int vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
out:
return error;
}
-EXPORT_SYMBOL(vfs_ioctl);
static int ioctl_fibmap(struct file *filp, int __user *p)
{
@@ -426,7 +425,7 @@ static int ioctl_file_dedupe_range(struct file *file,
goto out;
}
- size = offsetof(struct file_dedupe_range, info[count]);
+ size = struct_size(same, info, count);
if (size > PAGE_SIZE) {
ret = -ENOMEM;
goto out;
diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c
index e384a69fbece..76eaf64b9d9e 100644
--- a/fs/kernfs/mount.c
+++ b/fs/kernfs/mount.c
@@ -57,7 +57,7 @@ static int kernfs_statfs(struct dentry *dentry, struct kstatfs *buf)
const struct super_operations kernfs_sops = {
.statfs = kernfs_statfs,
- .drop_inode = generic_delete_inode,
+ .drop_inode = inode_just_drop,
.evict_inode = kernfs_evict_inode,
.show_options = kernfs_sop_show_options,
diff --git a/fs/locks.c b/fs/locks.c
index 559f02aa4172..04a3f0e20724 100644
--- a/fs/locks.c
+++ b/fs/locks.c
@@ -2328,8 +2328,8 @@ out:
* To avoid blocking kernel daemons, such as lockd, that need to acquire POSIX
* locks, the ->lock() interface may return asynchronously, before the lock has
* been granted or denied by the underlying filesystem, if (and only if)
- * lm_grant is set. Additionally EXPORT_OP_ASYNC_LOCK in export_operations
- * flags need to be set.
+ * lm_grant is set. Additionally FOP_ASYNC_LOCK in file_operations fop_flags
+ * need to be set.
*
* Callers expecting ->lock() to return asynchronously will only use F_SETLK,
* not F_SETLKW; they will set FL_SLEEP if (and only if) the request is for a
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index df9d11479caf..32db676127a9 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -492,8 +492,14 @@ void minix_set_inode(struct inode *inode, dev_t rdev)
inode->i_op = &minix_symlink_inode_operations;
inode_nohighmem(inode);
inode->i_mapping->a_ops = &minix_aops;
- } else
+ } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) ||
+ S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) {
init_special_inode(inode, inode->i_mode, rdev);
+ } else {
+ printk(KERN_DEBUG "MINIX-fs: Invalid file type 0%04o for inode %lu.\n",
+ inode->i_mode, inode->i_ino);
+ make_bad_inode(inode);
+ }
}
/*
diff --git a/fs/namei.c b/fs/namei.c
index cd43ff89fbaa..44856b70ea3b 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1449,6 +1449,10 @@ static int follow_automount(struct path *path, int *count, unsigned lookup_flags
dentry->d_inode)
return -EISDIR;
+ /* No need to trigger automounts if mountpoint crossing is disabled. */
+ if (lookup_flags & LOOKUP_NO_XDEV)
+ return -EXDEV;
+
if (count && (*count)++ >= MAXSYMLINKS)
return -ELOOP;
@@ -1472,6 +1476,10 @@ static int __traverse_mounts(struct path *path, unsigned flags, bool *jumped,
/* Allow the filesystem to manage the transit without i_rwsem
* being held. */
if (flags & DCACHE_MANAGE_TRANSIT) {
+ if (lookup_flags & LOOKUP_NO_XDEV) {
+ ret = -EXDEV;
+ break;
+ }
ret = path->dentry->d_op->d_manage(path, false);
flags = smp_load_acquire(&path->dentry->d_flags);
if (ret < 0)
@@ -1489,6 +1497,10 @@ static int __traverse_mounts(struct path *path, unsigned flags, bool *jumped,
// here we know it's positive
flags = path->dentry->d_flags;
need_mntput = true;
+ if (unlikely(lookup_flags & LOOKUP_NO_XDEV)) {
+ ret = -EXDEV;
+ break;
+ }
continue;
}
}
@@ -1630,12 +1642,8 @@ static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry,
return -ECHILD;
}
ret = traverse_mounts(path, &jumped, &nd->total_link_count, nd->flags);
- if (jumped) {
- if (unlikely(nd->flags & LOOKUP_NO_XDEV))
- ret = -EXDEV;
- else
- nd->state |= ND_JUMPED;
- }
+ if (jumped)
+ nd->state |= ND_JUMPED;
if (unlikely(ret)) {
dput(path->dentry);
if (path->mnt != nd->path.mnt)
@@ -4828,7 +4836,7 @@ int vfs_link(struct dentry *old_dentry, struct mnt_idmap *idmap,
return -EPERM;
/*
* Updating the link count will likely cause i_uid and i_gid to
- * be writen back improperly if their true value is unknown to
+ * be written back improperly if their true value is unknown to
* the vfs.
*/
if (HAS_UNMAPPED_ID(idmap, inode))
diff --git a/fs/namespace.c b/fs/namespace.c
index 51f77c65c0c6..38609066cf33 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -65,6 +65,15 @@ static int __init set_mphash_entries(char *str)
}
__setup("mphash_entries=", set_mphash_entries);
+static char * __initdata initramfs_options;
+static int __init initramfs_options_setup(char *str)
+{
+ initramfs_options = str;
+ return 1;
+}
+
+__setup("initramfs_options=", initramfs_options_setup);
+
static u64 event;
static DEFINE_XARRAY_FLAGS(mnt_id_xa, XA_FLAGS_ALLOC);
static DEFINE_IDA(mnt_group_ida);
@@ -5711,7 +5720,6 @@ static int grab_requested_root(struct mnt_namespace *ns, struct path *root)
static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id,
struct mnt_namespace *ns)
{
- struct path root __free(path_put) = {};
struct mount *m;
int err;
@@ -5723,7 +5731,7 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id,
if (!s->mnt)
return -ENOENT;
- err = grab_requested_root(ns, &root);
+ err = grab_requested_root(ns, &s->root);
if (err)
return err;
@@ -5732,7 +5740,7 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id,
* mounts to show users.
*/
m = real_mount(s->mnt);
- if (!is_path_reachable(m, m->mnt.mnt_root, &root) &&
+ if (!is_path_reachable(m, m->mnt.mnt_root, &s->root) &&
!ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN))
return -EPERM;
@@ -5740,8 +5748,6 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id,
if (err)
return err;
- s->root = root;
-
/*
* Note that mount properties in mnt->mnt_flags, mnt->mnt_idmap
* can change concurrently as we only hold the read-side of the
@@ -5963,28 +5969,40 @@ retry:
if (!ret)
ret = copy_statmount_to_user(ks);
kvfree(ks->seq.buf);
+ path_put(&ks->root);
if (retry_statmount(ret, &seq_size))
goto retry;
return ret;
}
-static ssize_t do_listmount(struct mnt_namespace *ns, u64 mnt_parent_id,
- u64 last_mnt_id, u64 *mnt_ids, size_t nr_mnt_ids,
- bool reverse)
+struct klistmount {
+ u64 last_mnt_id;
+ u64 mnt_parent_id;
+ u64 *kmnt_ids;
+ u32 nr_mnt_ids;
+ struct mnt_namespace *ns;
+ struct path root;
+};
+
+static ssize_t do_listmount(struct klistmount *kls, bool reverse)
{
- struct path root __free(path_put) = {};
+ struct mnt_namespace *ns = kls->ns;
+ u64 mnt_parent_id = kls->mnt_parent_id;
+ u64 last_mnt_id = kls->last_mnt_id;
+ u64 *mnt_ids = kls->kmnt_ids;
+ size_t nr_mnt_ids = kls->nr_mnt_ids;
struct path orig;
struct mount *r, *first;
ssize_t ret;
rwsem_assert_held(&namespace_sem);
- ret = grab_requested_root(ns, &root);
+ ret = grab_requested_root(ns, &kls->root);
if (ret)
return ret;
if (mnt_parent_id == LSMT_ROOT) {
- orig = root;
+ orig = kls->root;
} else {
orig.mnt = lookup_mnt_in_ns(mnt_parent_id, ns);
if (!orig.mnt)
@@ -5996,7 +6014,7 @@ static ssize_t do_listmount(struct mnt_namespace *ns, u64 mnt_parent_id,
* Don't trigger audit denials. We just want to determine what
* mounts to show users.
*/
- if (!is_path_reachable(real_mount(orig.mnt), orig.dentry, &root) &&
+ if (!is_path_reachable(real_mount(orig.mnt), orig.dentry, &kls->root) &&
!ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN))
return -EPERM;
@@ -6029,14 +6047,45 @@ static ssize_t do_listmount(struct mnt_namespace *ns, u64 mnt_parent_id,
return ret;
}
+static void __free_klistmount_free(const struct klistmount *kls)
+{
+ path_put(&kls->root);
+ kvfree(kls->kmnt_ids);
+ mnt_ns_release(kls->ns);
+}
+
+static inline int prepare_klistmount(struct klistmount *kls, struct mnt_id_req *kreq,
+ size_t nr_mnt_ids)
+{
+
+ u64 last_mnt_id = kreq->param;
+
+ /* The first valid unique mount id is MNT_UNIQUE_ID_OFFSET + 1. */
+ if (last_mnt_id != 0 && last_mnt_id <= MNT_UNIQUE_ID_OFFSET)
+ return -EINVAL;
+
+ kls->last_mnt_id = last_mnt_id;
+
+ kls->nr_mnt_ids = nr_mnt_ids;
+ kls->kmnt_ids = kvmalloc_array(nr_mnt_ids, sizeof(*kls->kmnt_ids),
+ GFP_KERNEL_ACCOUNT);
+ if (!kls->kmnt_ids)
+ return -ENOMEM;
+
+ kls->ns = grab_requested_mnt_ns(kreq);
+ if (!kls->ns)
+ return -ENOENT;
+
+ kls->mnt_parent_id = kreq->mnt_id;
+ return 0;
+}
+
SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req,
u64 __user *, mnt_ids, size_t, nr_mnt_ids, unsigned int, flags)
{
- u64 *kmnt_ids __free(kvfree) = NULL;
+ struct klistmount kls __free(klistmount_free) = {};
const size_t maxcount = 1000000;
- struct mnt_namespace *ns __free(mnt_ns_release) = NULL;
struct mnt_id_req kreq;
- u64 last_mnt_id;
ssize_t ret;
if (flags & ~LISTMOUNT_REVERSE)
@@ -6057,22 +6106,12 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req,
if (ret)
return ret;
- last_mnt_id = kreq.param;
- /* The first valid unique mount id is MNT_UNIQUE_ID_OFFSET + 1. */
- if (last_mnt_id != 0 && last_mnt_id <= MNT_UNIQUE_ID_OFFSET)
- return -EINVAL;
-
- kmnt_ids = kvmalloc_array(nr_mnt_ids, sizeof(*kmnt_ids),
- GFP_KERNEL_ACCOUNT);
- if (!kmnt_ids)
- return -ENOMEM;
-
- ns = grab_requested_mnt_ns(&kreq);
- if (!ns)
- return -ENOENT;
+ ret = prepare_klistmount(&kls, &kreq, nr_mnt_ids);
+ if (ret)
+ return ret;
- if (kreq.mnt_ns_id && (ns != current->nsproxy->mnt_ns) &&
- !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN))
+ if (kreq.mnt_ns_id && (kls.ns != current->nsproxy->mnt_ns) &&
+ !ns_capable_noaudit(kls.ns->user_ns, CAP_SYS_ADMIN))
return -ENOENT;
/*
@@ -6080,12 +6119,11 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req,
* listmount() doesn't care about any mount properties.
*/
scoped_guard(rwsem_read, &namespace_sem)
- ret = do_listmount(ns, kreq.mnt_id, last_mnt_id, kmnt_ids,
- nr_mnt_ids, (flags & LISTMOUNT_REVERSE));
+ ret = do_listmount(&kls, (flags & LISTMOUNT_REVERSE));
if (ret <= 0)
return ret;
- if (copy_to_user(mnt_ids, kmnt_ids, ret * sizeof(*mnt_ids)))
+ if (copy_to_user(mnt_ids, kls.kmnt_ids, ret * sizeof(*mnt_ids)))
return -EFAULT;
return ret;
@@ -6098,7 +6136,7 @@ static void __init init_mount_tree(void)
struct mnt_namespace *ns;
struct path root;
- mnt = vfs_kern_mount(&rootfs_fs_type, 0, "rootfs", NULL);
+ mnt = vfs_kern_mount(&rootfs_fs_type, 0, "rootfs", initramfs_options);
if (IS_ERR(mnt))
panic("Can't create rootfs");
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 49df9debb1a6..43f09d8eb5e3 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -108,7 +108,7 @@ u64 nfs_compat_user_ino64(u64 fileid)
int nfs_drop_inode(struct inode *inode)
{
- return NFS_STALE(inode) || generic_drop_inode(inode);
+ return NFS_STALE(inode) || inode_generic_drop(inode);
}
EXPORT_SYMBOL_GPL(nfs_drop_inode);
diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c
index 5130ec44e5e1..807e2b758a5c 100644
--- a/fs/ocfs2/dlmfs/dlmfs.c
+++ b/fs/ocfs2/dlmfs/dlmfs.c
@@ -547,7 +547,7 @@ static const struct super_operations dlmfs_ops = {
.alloc_inode = dlmfs_alloc_inode,
.free_inode = dlmfs_free_inode,
.evict_inode = dlmfs_evict_inode,
- .drop_inode = generic_delete_inode,
+ .drop_inode = inode_just_drop,
};
static const struct inode_operations dlmfs_file_inode_operations = {
diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c
index f3da840758e7..b46100a4f529 100644
--- a/fs/orangefs/super.c
+++ b/fs/orangefs/super.c
@@ -306,7 +306,7 @@ static const struct super_operations orangefs_s_ops = {
.free_inode = orangefs_free_inode,
.destroy_inode = orangefs_destroy_inode,
.write_inode = orangefs_write_inode,
- .drop_inode = generic_delete_inode,
+ .drop_inode = inode_just_drop,
.statfs = orangefs_statfs,
.show_options = orangefs_show_options,
};
diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c
index df85a76597e9..bd3d7ba8fb95 100644
--- a/fs/overlayfs/super.c
+++ b/fs/overlayfs/super.c
@@ -280,7 +280,7 @@ static const struct super_operations ovl_super_operations = {
.alloc_inode = ovl_alloc_inode,
.free_inode = ovl_free_inode,
.destroy_inode = ovl_destroy_inode,
- .drop_inode = generic_delete_inode,
+ .drop_inode = inode_just_drop,
.put_super = ovl_put_super,
.sync_fs = ovl_sync_fs,
.statfs = ovl_statfs,
diff --git a/fs/pidfs.c b/fs/pidfs.c
index 108e7527f837..d01729c5263a 100644
--- a/fs/pidfs.c
+++ b/fs/pidfs.c
@@ -718,7 +718,7 @@ static void pidfs_evict_inode(struct inode *inode)
}
static const struct super_operations pidfs_sops = {
- .drop_inode = generic_delete_inode,
+ .drop_inode = inode_just_drop,
.evict_inode = pidfs_evict_inode,
.statfs = simple_statfs,
};
diff --git a/fs/pipe.c b/fs/pipe.c
index 731622d0738d..42fead1efe52 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -458,7 +458,8 @@ anon_pipe_write(struct kiocb *iocb, struct iov_iter *from)
mutex_lock(&pipe->mutex);
if (!pipe->readers) {
- send_sig(SIGPIPE, current, 0);
+ if ((iocb->ki_flags & IOCB_NOSIGNAL) == 0)
+ send_sig(SIGPIPE, current, 0);
ret = -EPIPE;
goto out;
}
@@ -498,7 +499,8 @@ anon_pipe_write(struct kiocb *iocb, struct iov_iter *from)
for (;;) {
if (!pipe->readers) {
- send_sig(SIGPIPE, current, 0);
+ if ((iocb->ki_flags & IOCB_NOSIGNAL) == 0)
+ send_sig(SIGPIPE, current, 0);
if (!ret)
ret = -EPIPE;
break;
diff --git a/fs/proc/inode.c b/fs/proc/inode.c
index 129490151be1..d9b7ef122343 100644
--- a/fs/proc/inode.c
+++ b/fs/proc/inode.c
@@ -187,7 +187,7 @@ static int proc_show_options(struct seq_file *seq, struct dentry *root)
const struct super_operations proc_sops = {
.alloc_inode = proc_alloc_inode,
.free_inode = proc_free_inode,
- .drop_inode = generic_delete_inode,
+ .drop_inode = inode_just_drop,
.evict_inode = proc_evict_inode,
.statfs = simple_statfs,
.show_options = proc_show_options,
diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c
index 4403a2e20c16..ea2b597fd92c 100644
--- a/fs/proc/namespaces.c
+++ b/fs/proc/namespaces.c
@@ -12,7 +12,7 @@
#include "internal.h"
-static const struct proc_ns_operations *ns_entries[] = {
+static const struct proc_ns_operations *const ns_entries[] = {
#ifdef CONFIG_NET_NS
&netns_operations,
#endif
@@ -117,7 +117,7 @@ static struct dentry *proc_ns_instantiate(struct dentry *dentry,
static int proc_ns_dir_readdir(struct file *file, struct dir_context *ctx)
{
struct task_struct *task = get_proc_task(file_inode(file));
- const struct proc_ns_operations **entry, **last;
+ const struct proc_ns_operations *const *entry, *const *last;
if (!task)
return -ENOENT;
@@ -151,7 +151,7 @@ static struct dentry *proc_ns_dir_lookup(struct inode *dir,
struct dentry *dentry, unsigned int flags)
{
struct task_struct *task = get_proc_task(dir);
- const struct proc_ns_operations **entry, **last;
+ const struct proc_ns_operations *const *entry, *const *last;
unsigned int len = dentry->d_name.len;
struct dentry *res = ERR_PTR(-ENOENT);
diff --git a/fs/proc/root.c b/fs/proc/root.c
index ed86ac710384..fd1f1c8a939a 100644
--- a/fs/proc/root.c
+++ b/fs/proc/root.c
@@ -38,12 +38,14 @@ enum proc_param {
Opt_gid,
Opt_hidepid,
Opt_subset,
+ Opt_pidns,
};
static const struct fs_parameter_spec proc_fs_parameters[] = {
- fsparam_u32("gid", Opt_gid),
+ fsparam_u32("gid", Opt_gid),
fsparam_string("hidepid", Opt_hidepid),
fsparam_string("subset", Opt_subset),
+ fsparam_file_or_string("pidns", Opt_pidns),
{}
};
@@ -109,11 +111,66 @@ static int proc_parse_subset_param(struct fs_context *fc, char *value)
return 0;
}
+#ifdef CONFIG_PID_NS
+static int proc_parse_pidns_param(struct fs_context *fc,
+ struct fs_parameter *param,
+ struct fs_parse_result *result)
+{
+ struct proc_fs_context *ctx = fc->fs_private;
+ struct pid_namespace *target, *active = task_active_pid_ns(current);
+ struct ns_common *ns;
+ struct file *ns_filp __free(fput) = NULL;
+
+ switch (param->type) {
+ case fs_value_is_file:
+ /* came through fsconfig, steal the file reference */
+ ns_filp = no_free_ptr(param->file);
+ break;
+ case fs_value_is_string:
+ ns_filp = filp_open(param->string, O_RDONLY, 0);
+ break;
+ default:
+ WARN_ON_ONCE(true);
+ break;
+ }
+ if (!ns_filp)
+ ns_filp = ERR_PTR(-EBADF);
+ if (IS_ERR(ns_filp)) {
+ errorfc(fc, "could not get file from pidns argument");
+ return PTR_ERR(ns_filp);
+ }
+
+ if (!proc_ns_file(ns_filp))
+ return invalfc(fc, "pidns argument is not an nsfs file");
+ ns = get_proc_ns(file_inode(ns_filp));
+ if (ns->ops->type != CLONE_NEWPID)
+ return invalfc(fc, "pidns argument is not a pidns file");
+ target = container_of(ns, struct pid_namespace, ns);
+
+ /*
+ * pidns= is shorthand for joining the pidns to get a fsopen fd, so the
+ * permission model should be the same as pidns_install().
+ */
+ if (!ns_capable(target->user_ns, CAP_SYS_ADMIN)) {
+ errorfc(fc, "insufficient permissions to set pidns");
+ return -EPERM;
+ }
+ if (!pidns_is_ancestor(target, active))
+ return invalfc(fc, "cannot set pidns to non-descendant pidns");
+
+ put_pid_ns(ctx->pid_ns);
+ ctx->pid_ns = get_pid_ns(target);
+ put_user_ns(fc->user_ns);
+ fc->user_ns = get_user_ns(ctx->pid_ns->user_ns);
+ return 0;
+}
+#endif /* CONFIG_PID_NS */
+
static int proc_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
struct proc_fs_context *ctx = fc->fs_private;
struct fs_parse_result result;
- int opt;
+ int opt, err;
opt = fs_parse(fc, proc_fs_parameters, param, &result);
if (opt < 0)
@@ -125,14 +182,38 @@ static int proc_parse_param(struct fs_context *fc, struct fs_parameter *param)
break;
case Opt_hidepid:
- if (proc_parse_hidepid_param(fc, param))
- return -EINVAL;
+ err = proc_parse_hidepid_param(fc, param);
+ if (err)
+ return err;
break;
case Opt_subset:
- if (proc_parse_subset_param(fc, param->string) < 0)
- return -EINVAL;
+ err = proc_parse_subset_param(fc, param->string);
+ if (err)
+ return err;
+ break;
+
+ case Opt_pidns:
+#ifdef CONFIG_PID_NS
+ /*
+ * We would have to RCU-protect every proc_pid_ns() or
+ * proc_sb_info() access if we allowed this to be reconfigured
+ * for an existing procfs instance. Luckily, procfs instances
+ * are cheap to create, and mount-beneath would let you
+ * atomically replace an instance even with overmounts.
+ */
+ if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) {
+ errorfc(fc, "cannot reconfigure pidns for existing procfs");
+ return -EBUSY;
+ }
+ err = proc_parse_pidns_param(fc, param, &result);
+ if (err)
+ return err;
break;
+#else
+ errorfc(fc, "pidns mount flag not supported on this system");
+ return -EOPNOTSUPP;
+#endif
default:
return -EINVAL;
@@ -154,6 +235,11 @@ static void proc_apply_options(struct proc_fs_info *fs_info,
fs_info->hide_pid = ctx->hidepid;
if (ctx->mask & (1 << Opt_subset))
fs_info->pidonly = ctx->pidonly;
+ if (ctx->mask & (1 << Opt_pidns) &&
+ !WARN_ON_ONCE(fc->purpose == FS_CONTEXT_FOR_RECONFIGURE)) {
+ put_pid_ns(fs_info->pid_ns);
+ fs_info->pid_ns = get_pid_ns(ctx->pid_ns);
+ }
}
static int proc_fill_super(struct super_block *s, struct fs_context *fc)
diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c
index 1a2e1185426c..b4e55c90f8dc 100644
--- a/fs/pstore/inode.c
+++ b/fs/pstore/inode.c
@@ -282,7 +282,7 @@ static int pstore_reconfigure(struct fs_context *fc)
static const struct super_operations pstore_ops = {
.statfs = simple_statfs,
- .drop_inode = generic_delete_inode,
+ .drop_inode = inode_just_drop,
.evict_inode = pstore_evict_inode,
.show_options = pstore_show_options,
};
diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c
index f8874c3b8c1e..41f9995da7ca 100644
--- a/fs/ramfs/inode.c
+++ b/fs/ramfs/inode.c
@@ -215,7 +215,7 @@ static int ramfs_show_options(struct seq_file *m, struct dentry *root)
static const struct super_operations ramfs_ops = {
.statfs = simple_statfs,
- .drop_inode = generic_delete_inode,
+ .drop_inode = inode_just_drop,
.show_options = ramfs_show_options,
};
diff --git a/fs/read_write.c b/fs/read_write.c
index c5b6265d984b..833bae068770 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -1576,6 +1576,13 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
if (len == 0)
return 0;
+ /*
+ * Make sure return value doesn't overflow in 32bit compat mode. Also
+ * limit the size for all cases except when calling ->copy_file_range().
+ */
+ if (splice || !file_out->f_op->copy_file_range || in_compat_syscall())
+ len = min_t(size_t, MAX_RW_COUNT, len);
+
file_start_write(file_out);
/*
@@ -1589,9 +1596,7 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
len, flags);
} else if (!splice && file_in->f_op->remap_file_range && samesb) {
ret = file_in->f_op->remap_file_range(file_in, pos_in,
- file_out, pos_out,
- min_t(loff_t, MAX_RW_COUNT, len),
- REMAP_FILE_CAN_SHORTEN);
+ file_out, pos_out, len, REMAP_FILE_CAN_SHORTEN);
/* fallback to splice */
if (ret <= 0)
splice = true;
@@ -1624,8 +1629,7 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in,
* to splicing from input file, while file_start_write() is held on
* the output file on a different sb.
*/
- ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out,
- min_t(size_t, len, MAX_RW_COUNT), 0);
+ ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out, len, 0);
done:
if (ret > 0) {
fsnotify_access(file_in);
diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c
index e1848276bab4..b0e84ca5d268 100644
--- a/fs/smb/client/cifsfs.c
+++ b/fs/smb/client/cifsfs.c
@@ -857,7 +857,7 @@ static int cifs_drop_inode(struct inode *inode)
/* no serverino => unconditional eviction */
return !(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) ||
- generic_drop_inode(inode);
+ inode_generic_drop(inode);
}
static const struct super_operations cifs_super_ops = {
diff --git a/fs/super.c b/fs/super.c
index 7f876f32343a..e91718017701 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -2318,13 +2318,15 @@ int sb_init_dio_done_wq(struct super_block *sb)
sb->s_id);
if (!wq)
return -ENOMEM;
+
+ old = NULL;
/*
* This has to be atomic as more DIOs can race to create the workqueue
*/
- old = cmpxchg(&sb->s_dio_done_wq, NULL, wq);
- /* Someone created workqueue before us? Free ours... */
- if (old)
+ if (!try_cmpxchg(&sb->s_dio_done_wq, &old, wq)) {
+ /* Someone created workqueue before us? Free ours... */
destroy_workqueue(wq);
+ }
return 0;
}
EXPORT_SYMBOL_GPL(sb_init_dio_done_wq);
diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c
index f3e3b2068608..733fd1e5a9a2 100644
--- a/fs/ubifs/super.c
+++ b/fs/ubifs/super.c
@@ -335,7 +335,7 @@ static int ubifs_write_inode(struct inode *inode, struct writeback_control *wbc)
static int ubifs_drop_inode(struct inode *inode)
{
- int drop = generic_drop_inode(inode);
+ int drop = inode_generic_drop(inode);
if (!drop)
drop = fscrypt_drop_inode(inode);
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index bb0a82635a77..a05ff68748dc 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -778,7 +778,7 @@ xfs_fs_drop_inode(
return 0;
}
- return generic_drop_inode(inode);
+ return inode_generic_drop(inode);
}
STATIC void
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 601d036a6c78..e90384db5544 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -357,6 +357,7 @@ struct readahead_control;
#define IOCB_APPEND (__force int) RWF_APPEND
#define IOCB_ATOMIC (__force int) RWF_ATOMIC
#define IOCB_DONTCACHE (__force int) RWF_DONTCACHE
+#define IOCB_NOSIGNAL (__force int) RWF_NOSIGNAL
/* non-RWF related bits - start at 16 */
#define IOCB_EVENTFD (1 << 16)
@@ -2053,8 +2054,6 @@ int vfs_fchown(struct file *file, uid_t user, gid_t group);
int vfs_fchmod(struct file *file, umode_t mode);
int vfs_utimes(const struct path *path, struct timespec64 *times);
-int vfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
-
#ifdef CONFIG_COMPAT
extern long compat_ptr_ioctl(struct file *file, unsigned int cmd,
unsigned long arg);
@@ -3282,7 +3281,7 @@ static inline bool is_dot_dotdot(const char *name, size_t len)
/**
* name_contains_dotdot - check if a file name contains ".." path components
- *
+ * @name: File path string to check
* Search for ".." surrounded by either '/' or start/end of string.
*/
static inline bool name_contains_dotdot(const char *name)
@@ -3314,8 +3313,8 @@ extern void address_space_init_once(struct address_space *mapping);
extern struct inode * igrab(struct inode *);
extern ino_t iunique(struct super_block *, ino_t);
extern int inode_needs_sync(struct inode *inode);
-extern int generic_delete_inode(struct inode *inode);
-static inline int generic_drop_inode(struct inode *inode)
+extern int inode_just_drop(struct inode *inode);
+static inline int inode_generic_drop(struct inode *inode)
{
return !inode->i_nlink || inode_unhashed(inode);
}
@@ -3394,7 +3393,6 @@ static inline struct inode *new_inode_pseudo(struct super_block *sb)
extern struct inode *new_inode(struct super_block *sb);
extern void free_inode_nonrcu(struct inode *inode);
extern int setattr_should_drop_suidgid(struct mnt_idmap *, struct inode *);
-extern int file_remove_privs_flags(struct file *file, unsigned int flags);
extern int file_remove_privs(struct file *);
int setattr_should_drop_sgid(struct mnt_idmap *idmap,
const struct inode *inode);
diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h
index 7c67a5811199..17fdc059f8da 100644
--- a/include/linux/pid_namespace.h
+++ b/include/linux/pid_namespace.h
@@ -84,6 +84,9 @@ extern void zap_pid_ns_processes(struct pid_namespace *pid_ns);
extern int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd);
extern void put_pid_ns(struct pid_namespace *ns);
+extern bool pidns_is_ancestor(struct pid_namespace *child,
+ struct pid_namespace *ancestor);
+
#else /* !CONFIG_PID_NS */
#include <linux/err.h>
@@ -118,6 +121,12 @@ static inline int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd)
{
return 0;
}
+
+static inline bool pidns_is_ancestor(struct pid_namespace *child,
+ struct pid_namespace *ancestor)
+{
+ return false;
+}
#endif /* CONFIG_PID_NS */
extern struct pid_namespace *task_active_pid_ns(struct task_struct *tsk);
diff --git a/include/trace/events/filelock.h b/include/trace/events/filelock.h
index b8d1e00a7982..2dfeb158e848 100644
--- a/include/trace/events/filelock.h
+++ b/include/trace/events/filelock.h
@@ -27,7 +27,8 @@
{ FL_SLEEP, "FL_SLEEP" }, \
{ FL_DOWNGRADE_PENDING, "FL_DOWNGRADE_PENDING" }, \
{ FL_UNLOCK_PENDING, "FL_UNLOCK_PENDING" }, \
- { FL_OFDLCK, "FL_OFDLCK" })
+ { FL_OFDLCK, "FL_OFDLCK" }, \
+ { FL_RECLAIM, "FL_RECLAIM"})
#define show_fl_type(val) \
__print_symbolic(val, \
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index 0bd678a4a10e..beb4c2d1e41c 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -430,10 +430,13 @@ typedef int __bitwise __kernel_rwf_t;
/* buffered IO that drops the cache after reading or writing data */
#define RWF_DONTCACHE ((__force __kernel_rwf_t)0x00000080)
+/* prevent pipe and socket writes from raising SIGPIPE */
+#define RWF_NOSIGNAL ((__force __kernel_rwf_t)0x00000100)
+
/* mask of flags supported by the kernel */
#define RWF_SUPPORTED (RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\
RWF_APPEND | RWF_NOAPPEND | RWF_ATOMIC |\
- RWF_DONTCACHE)
+ RWF_DONTCACHE | RWF_NOSIGNAL)
#define PROCFS_IOCTL_MAGIC 'f'
diff --git a/init/Kconfig b/init/Kconfig
index 6a91ec742186..87c868f86a06 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -1504,6 +1504,7 @@ config BOOT_CONFIG_EMBED_FILE
config INITRAMFS_PRESERVE_MTIME
bool "Preserve cpio archive mtimes in initramfs"
+ depends on BLK_DEV_INITRD
default y
help
Each entry in an initramfs cpio archive carries an mtime value. When
diff --git a/init/do_mounts_rd.c b/init/do_mounts_rd.c
index ac021ae6e6fa..19d9f33dcacf 100644
--- a/init/do_mounts_rd.c
+++ b/init/do_mounts_rd.c
@@ -7,6 +7,7 @@
#include <uapi/linux/cramfs_fs.h>
#include <linux/initrd.h>
#include <linux/string.h>
+#include <linux/string_choices.h>
#include <linux/slab.h>
#include "do_mounts.h"
@@ -186,14 +187,12 @@ static unsigned long nr_blocks(struct file *file)
int __init rd_load_image(char *from)
{
int res = 0;
- unsigned long rd_blocks, devblocks;
+ unsigned long rd_blocks, devblocks, nr_disks;
int nblocks, i;
char *buf = NULL;
unsigned short rotate = 0;
decompress_fn decompressor = NULL;
-#if !defined(CONFIG_S390)
char rotator[4] = { '|' , '/' , '-' , '\\' };
-#endif
out_file = filp_open("/dev/ram", O_RDWR, 0);
if (IS_ERR(out_file))
@@ -244,8 +243,9 @@ int __init rd_load_image(char *from)
goto done;
}
- printk(KERN_NOTICE "RAMDISK: Loading %dKiB [%ld disk%s] into ram disk... ",
- nblocks, ((nblocks-1)/devblocks)+1, nblocks>devblocks ? "s" : "");
+ nr_disks = (nblocks - 1) / devblocks + 1;
+ pr_notice("RAMDISK: Loading %dKiB [%ld disk%s] into ram disk... ",
+ nblocks, nr_disks, str_plural(nr_disks));
for (i = 0; i < nblocks; i++) {
if (i && (i % devblocks == 0)) {
pr_cont("done disk #1.\n");
@@ -255,12 +255,10 @@ int __init rd_load_image(char *from)
}
kernel_read(in_file, buf, BLOCK_SIZE, &in_pos);
kernel_write(out_file, buf, BLOCK_SIZE, &out_pos);
-#if !defined(CONFIG_S390)
- if (!(i % 16)) {
+ if (!IS_ENABLED(CONFIG_S390) && !(i % 16)) {
pr_cont("%c\b", rotator[rotate & 0x3]);
rotate++;
}
-#endif
}
pr_cont("done.\n");
diff --git a/init/initramfs.c b/init/initramfs.c
index 097673b97784..6ddbfb17fb8f 100644
--- a/init/initramfs.c
+++ b/init/initramfs.c
@@ -19,6 +19,7 @@
#include <linux/init_syscalls.h>
#include <linux/umh.h>
#include <linux/security.h>
+#include <linux/overflow.h>
#include "do_mounts.h"
#include "initramfs_internal.h"
@@ -108,7 +109,7 @@ static char __init *find_link(int major, int minor, int ino,
q->minor = minor;
q->ino = ino;
q->mode = mode;
- strcpy(q->name, name);
+ strscpy(q->name, name);
q->next = NULL;
*p = q;
hardlink_seen = true;
@@ -152,7 +153,7 @@ static void __init dir_add(const char *name, size_t nlen, time64_t mtime)
{
struct dir_entry *de;
- de = kmalloc(sizeof(struct dir_entry) + nlen, GFP_KERNEL);
+ de = kmalloc(struct_size(de, name, nlen), GFP_KERNEL);
if (!de)
panic_show_mem("can't allocate dir_entry buffer");
INIT_LIST_HEAD(&de->list);
diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c
index 5c2e96b19392..6d021d18afa6 100644
--- a/kernel/bpf/inode.c
+++ b/kernel/bpf/inode.c
@@ -788,7 +788,7 @@ static void bpf_free_inode(struct inode *inode)
const struct super_operations bpf_super_ops = {
.statfs = simple_statfs,
- .drop_inode = generic_delete_inode,
+ .drop_inode = inode_just_drop,
.show_options = bpf_show_options,
.free_inode = bpf_free_inode,
};
diff --git a/kernel/pid.c b/kernel/pid.c
index c45a28c16cd2..d94ce0250501 100644
--- a/kernel/pid.c
+++ b/kernel/pid.c
@@ -680,7 +680,7 @@ static int pid_table_root_permissions(struct ctl_table_header *head,
container_of(head->set, struct pid_namespace, set);
int mode = table->mode;
- if (ns_capable(pidns->user_ns, CAP_SYS_ADMIN) ||
+ if (ns_capable_noaudit(pidns->user_ns, CAP_SYS_ADMIN) ||
uid_eq(current_euid(), make_kuid(pidns->user_ns, 0)))
mode = (mode & S_IRWXU) >> 6;
else if (in_egroup_p(make_kgid(pidns->user_ns, 0)))
diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c
index 7098ed44e717..b7b45c2597ec 100644
--- a/kernel/pid_namespace.c
+++ b/kernel/pid_namespace.c
@@ -390,11 +390,23 @@ static void pidns_put(struct ns_common *ns)
put_pid_ns(to_pid_ns(ns));
}
+bool pidns_is_ancestor(struct pid_namespace *child,
+ struct pid_namespace *ancestor)
+{
+ struct pid_namespace *ns;
+
+ if (child->level < ancestor->level)
+ return false;
+ for (ns = child; ns->level > ancestor->level; ns = ns->parent)
+ ;
+ return ns == ancestor;
+}
+
static int pidns_install(struct nsset *nsset, struct ns_common *ns)
{
struct nsproxy *nsproxy = nsset->nsproxy;
struct pid_namespace *active = task_active_pid_ns(current);
- struct pid_namespace *ancestor, *new = to_pid_ns(ns);
+ struct pid_namespace *new = to_pid_ns(ns);
if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) ||
!ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN))
@@ -408,13 +420,7 @@ static int pidns_install(struct nsset *nsset, struct ns_common *ns)
* this maintains the property that processes and their
* children can not escape their current pid namespace.
*/
- if (new->level < active->level)
- return -EINVAL;
-
- ancestor = new;
- while (ancestor->level > active->level)
- ancestor = ancestor->parent;
- if (ancestor != active)
+ if (!pidns_is_ancestor(new, active))
return -EINVAL;
put_pid_ns(nsproxy->pid_ns_for_children);
diff --git a/mm/shmem.c b/mm/shmem.c
index e2c76a30802b..932727247c64 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -5341,7 +5341,7 @@ static const struct super_operations shmem_ops = {
.get_dquots = shmem_get_dquots,
#endif
.evict_inode = shmem_evict_inode,
- .drop_inode = generic_delete_inode,
+ .drop_inode = inode_just_drop,
.put_super = shmem_put_super,
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
.nr_cached_objects = shmem_unused_huge_count,
diff --git a/net/socket.c b/net/socket.c
index 682969deaed3..bac335ecee4c 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -1176,6 +1176,9 @@ static ssize_t sock_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (sock->type == SOCK_SEQPACKET)
msg.msg_flags |= MSG_EOR;
+ if (iocb->ki_flags & IOCB_NOSIGNAL)
+ msg.msg_flags |= MSG_NOSIGNAL;
+
res = __sock_sendmsg(sock, &msg);
*from = msg.msg_iter;
return res;
diff --git a/tools/testing/selftests/proc/.gitignore b/tools/testing/selftests/proc/.gitignore
index 19bb333e2485..6b78a8382d40 100644
--- a/tools/testing/selftests/proc/.gitignore
+++ b/tools/testing/selftests/proc/.gitignore
@@ -18,6 +18,7 @@
/proc-tid0
/proc-uptime-001
/proc-uptime-002
+/proc-pidns
/read
/self
/setns-dcache
diff --git a/tools/testing/selftests/proc/Makefile b/tools/testing/selftests/proc/Makefile
index 50aba102201a..be3013515aae 100644
--- a/tools/testing/selftests/proc/Makefile
+++ b/tools/testing/selftests/proc/Makefile
@@ -28,5 +28,6 @@ TEST_GEN_PROGS += setns-sysvipc
TEST_GEN_PROGS += thread-self
TEST_GEN_PROGS += proc-multiple-procfs
TEST_GEN_PROGS += proc-fsconfig-hidepid
+TEST_GEN_PROGS += proc-pidns
include ../lib.mk
diff --git a/tools/testing/selftests/proc/proc-pidns.c b/tools/testing/selftests/proc/proc-pidns.c
new file mode 100644
index 000000000000..52500597f951
--- /dev/null
+++ b/tools/testing/selftests/proc/proc-pidns.c
@@ -0,0 +1,211 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Author: Aleksa Sarai <cyphar@cyphar.com>
+ * Copyright (C) 2025 SUSE LLC.
+ */
+
+#include <assert.h>
+#include <errno.h>
+#include <sched.h>
+#include <stdbool.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+#include <stdio.h>
+#include <sys/mount.h>
+#include <sys/stat.h>
+#include <sys/prctl.h>
+
+#include "../kselftest_harness.h"
+
+#define ASSERT_ERRNO(expected, _t, seen) \
+ __EXPECT(expected, #expected, \
+ ({__typeof__(seen) _tmp_seen = (seen); \
+ _tmp_seen >= 0 ? _tmp_seen : -errno; }), #seen, _t, 1)
+
+#define ASSERT_ERRNO_EQ(expected, seen) \
+ ASSERT_ERRNO(expected, ==, seen)
+
+#define ASSERT_SUCCESS(seen) \
+ ASSERT_ERRNO(0, <=, seen)
+
+static int touch(char *path)
+{
+ int fd = open(path, O_WRONLY|O_CREAT|O_CLOEXEC, 0644);
+ if (fd < 0)
+ return -1;
+ return close(fd);
+}
+
+FIXTURE(ns)
+{
+ int host_mntns, host_pidns;
+ int dummy_pidns;
+};
+
+FIXTURE_SETUP(ns)
+{
+ /* Stash the old mntns. */
+ self->host_mntns = open("/proc/self/ns/mnt", O_RDONLY|O_CLOEXEC);
+ ASSERT_SUCCESS(self->host_mntns);
+
+ /* Create a new mount namespace and make it private. */
+ ASSERT_SUCCESS(unshare(CLONE_NEWNS));
+ ASSERT_SUCCESS(mount(NULL, "/", NULL, MS_PRIVATE|MS_REC, NULL));
+
+ /*
+ * Create a proper tmpfs that we can use and will disappear once we
+ * leave this mntns.
+ */
+ ASSERT_SUCCESS(mount("tmpfs", "/tmp", "tmpfs", 0, NULL));
+
+ /*
+ * Create a pidns we can use for later tests. We need to fork off a
+ * child so that we get a usable nsfd that we can bind-mount and open.
+ */
+ ASSERT_SUCCESS(mkdir("/tmp/dummy", 0755));
+ ASSERT_SUCCESS(touch("/tmp/dummy/pidns"));
+ ASSERT_SUCCESS(mkdir("/tmp/dummy/proc", 0755));
+
+ self->host_pidns = open("/proc/self/ns/pid", O_RDONLY|O_CLOEXEC);
+ ASSERT_SUCCESS(self->host_pidns);
+ ASSERT_SUCCESS(unshare(CLONE_NEWPID));
+
+ pid_t pid = fork();
+ ASSERT_SUCCESS(pid);
+ if (!pid) {
+ prctl(PR_SET_PDEATHSIG, SIGKILL);
+ ASSERT_SUCCESS(mount("/proc/self/ns/pid", "/tmp/dummy/pidns", NULL, MS_BIND, NULL));
+ ASSERT_SUCCESS(mount("proc", "/tmp/dummy/proc", "proc", 0, NULL));
+ exit(0);
+ }
+
+ int wstatus;
+ ASSERT_EQ(waitpid(pid, &wstatus, 0), pid);
+ ASSERT_TRUE(WIFEXITED(wstatus));
+ ASSERT_EQ(WEXITSTATUS(wstatus), 0);
+
+ ASSERT_SUCCESS(setns(self->host_pidns, CLONE_NEWPID));
+
+ self->dummy_pidns = open("/tmp/dummy/pidns", O_RDONLY|O_CLOEXEC);
+ ASSERT_SUCCESS(self->dummy_pidns);
+}
+
+FIXTURE_TEARDOWN(ns)
+{
+ ASSERT_SUCCESS(setns(self->host_mntns, CLONE_NEWNS));
+ ASSERT_SUCCESS(close(self->host_mntns));
+
+ ASSERT_SUCCESS(close(self->host_pidns));
+ ASSERT_SUCCESS(close(self->dummy_pidns));
+}
+
+TEST_F(ns, pidns_mount_string_path)
+{
+ ASSERT_SUCCESS(mkdir("/tmp/proc-host", 0755));
+ ASSERT_SUCCESS(mount("proc", "/tmp/proc-host", "proc", 0, "pidns=/proc/self/ns/pid"));
+ ASSERT_SUCCESS(access("/tmp/proc-host/self/", X_OK));
+
+ ASSERT_SUCCESS(mkdir("/tmp/proc-dummy", 0755));
+ ASSERT_SUCCESS(mount("proc", "/tmp/proc-dummy", "proc", 0, "pidns=/tmp/dummy/pidns"));
+ ASSERT_ERRNO_EQ(-ENOENT, access("/tmp/proc-dummy/1/", X_OK));
+ ASSERT_ERRNO_EQ(-ENOENT, access("/tmp/proc-dummy/self/", X_OK));
+}
+
+TEST_F(ns, pidns_fsconfig_string_path)
+{
+ int fsfd = fsopen("proc", FSOPEN_CLOEXEC);
+ ASSERT_SUCCESS(fsfd);
+
+ ASSERT_SUCCESS(fsconfig(fsfd, FSCONFIG_SET_STRING, "pidns", "/tmp/dummy/pidns", 0));
+ ASSERT_SUCCESS(fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0));
+
+ int mountfd = fsmount(fsfd, FSMOUNT_CLOEXEC, 0);
+ ASSERT_SUCCESS(mountfd);
+
+ ASSERT_ERRNO_EQ(-ENOENT, faccessat(mountfd, "1/", X_OK, 0));
+ ASSERT_ERRNO_EQ(-ENOENT, faccessat(mountfd, "self/", X_OK, 0));
+
+ ASSERT_SUCCESS(close(fsfd));
+ ASSERT_SUCCESS(close(mountfd));
+}
+
+TEST_F(ns, pidns_fsconfig_fd)
+{
+ int fsfd = fsopen("proc", FSOPEN_CLOEXEC);
+ ASSERT_SUCCESS(fsfd);
+
+ ASSERT_SUCCESS(fsconfig(fsfd, FSCONFIG_SET_FD, "pidns", NULL, self->dummy_pidns));
+ ASSERT_SUCCESS(fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0));
+
+ int mountfd = fsmount(fsfd, FSMOUNT_CLOEXEC, 0);
+ ASSERT_SUCCESS(mountfd);
+
+ ASSERT_ERRNO_EQ(-ENOENT, faccessat(mountfd, "1/", X_OK, 0));
+ ASSERT_ERRNO_EQ(-ENOENT, faccessat(mountfd, "self/", X_OK, 0));
+
+ ASSERT_SUCCESS(close(fsfd));
+ ASSERT_SUCCESS(close(mountfd));
+}
+
+TEST_F(ns, pidns_reconfigure_remount)
+{
+ ASSERT_SUCCESS(mkdir("/tmp/proc", 0755));
+ ASSERT_SUCCESS(mount("proc", "/tmp/proc", "proc", 0, ""));
+
+ ASSERT_SUCCESS(access("/tmp/proc/1/", X_OK));
+ ASSERT_SUCCESS(access("/tmp/proc/self/", X_OK));
+
+ ASSERT_ERRNO_EQ(-EBUSY, mount(NULL, "/tmp/proc", NULL, MS_REMOUNT, "pidns=/tmp/dummy/pidns"));
+
+ ASSERT_SUCCESS(access("/tmp/proc/1/", X_OK));
+ ASSERT_SUCCESS(access("/tmp/proc/self/", X_OK));
+}
+
+TEST_F(ns, pidns_reconfigure_fsconfig_string_path)
+{
+ int fsfd = fsopen("proc", FSOPEN_CLOEXEC);
+ ASSERT_SUCCESS(fsfd);
+
+ ASSERT_SUCCESS(fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0));
+
+ int mountfd = fsmount(fsfd, FSMOUNT_CLOEXEC, 0);
+ ASSERT_SUCCESS(mountfd);
+
+ ASSERT_SUCCESS(faccessat(mountfd, "1/", X_OK, 0));
+ ASSERT_SUCCESS(faccessat(mountfd, "self/", X_OK, 0));
+
+ ASSERT_ERRNO_EQ(-EBUSY, fsconfig(fsfd, FSCONFIG_SET_STRING, "pidns", "/tmp/dummy/pidns", 0));
+ ASSERT_SUCCESS(fsconfig(fsfd, FSCONFIG_CMD_RECONFIGURE, NULL, NULL, 0)); /* noop */
+
+ ASSERT_SUCCESS(faccessat(mountfd, "1/", X_OK, 0));
+ ASSERT_SUCCESS(faccessat(mountfd, "self/", X_OK, 0));
+
+ ASSERT_SUCCESS(close(fsfd));
+ ASSERT_SUCCESS(close(mountfd));
+}
+
+TEST_F(ns, pidns_reconfigure_fsconfig_fd)
+{
+ int fsfd = fsopen("proc", FSOPEN_CLOEXEC);
+ ASSERT_SUCCESS(fsfd);
+
+ ASSERT_SUCCESS(fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0));
+
+ int mountfd = fsmount(fsfd, FSMOUNT_CLOEXEC, 0);
+ ASSERT_SUCCESS(mountfd);
+
+ ASSERT_SUCCESS(faccessat(mountfd, "1/", X_OK, 0));
+ ASSERT_SUCCESS(faccessat(mountfd, "self/", X_OK, 0));
+
+ ASSERT_ERRNO_EQ(-EBUSY, fsconfig(fsfd, FSCONFIG_SET_FD, "pidns", NULL, self->dummy_pidns));
+ ASSERT_SUCCESS(fsconfig(fsfd, FSCONFIG_CMD_RECONFIGURE, NULL, NULL, 0)); /* noop */
+
+ ASSERT_SUCCESS(faccessat(mountfd, "1/", X_OK, 0));
+ ASSERT_SUCCESS(faccessat(mountfd, "self/", X_OK, 0));
+
+ ASSERT_SUCCESS(close(fsfd));
+ ASSERT_SUCCESS(close(mountfd));
+}
+
+TEST_HARNESS_MAIN