diff options
64 files changed, 582 insertions, 264 deletions
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 5a7a83c411e9..e92c0056e4e0 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -6429,6 +6429,9 @@ rootflags= [KNL] Set root filesystem mount option string + initramfs_options= [KNL] + Specify mount options for for the initramfs mount. + rootfstype= [KNL] Set root filesystem type rootwait [KNL] Wait (indefinitely) for root device to show up. diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst index 85f590254f07..b5db45c0094c 100644 --- a/Documentation/filesystems/porting.rst +++ b/Documentation/filesystems/porting.rst @@ -340,8 +340,8 @@ of those. Caller makes sure async writeback cannot be running for the inode whil ->drop_inode() returns int now; it's called on final iput() with inode->i_lock held and it returns true if filesystems wants the inode to be -dropped. As before, generic_drop_inode() is still the default and it's been -updated appropriately. generic_delete_inode() is also alive and it consists +dropped. As before, inode_generic_drop() is still the default and it's been +updated appropriately. inode_just_drop() is also alive and it consists simply of return 1. Note that all actual eviction work is done by caller after ->drop_inode() returns. diff --git a/Documentation/filesystems/proc.rst b/Documentation/filesystems/proc.rst index 2971551b7235..b7e3147ba3d4 100644 --- a/Documentation/filesystems/proc.rst +++ b/Documentation/filesystems/proc.rst @@ -2362,6 +2362,7 @@ The following mount options are supported: hidepid= Set /proc/<pid>/ access mode. gid= Set the group authorized to learn processes information. subset= Show only the specified subset of procfs. + pidns= Specify a the namespace used by this procfs. ========= ======================================================== hidepid=off or hidepid=0 means classic mode - everybody may access all @@ -2394,6 +2395,13 @@ information about processes information, just add identd to this group. subset=pid hides all top level files and directories in the procfs that are not related to tasks. +pidns= specifies a pid namespace (either as a string path to something like +`/proc/$pid/ns/pid`, or a file descriptor when using `FSCONFIG_SET_FD`) that +will be used by the procfs instance when translating pids. By default, procfs +will use the calling process's active pid namespace. Note that the pid +namespace of an existing procfs instance cannot be modified (attempting to do +so will give an `-EBUSY` error). + Chapter 5: Filesystem behavior ============================== diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst index 486a91633474..7a314eee6305 100644 --- a/Documentation/filesystems/vfs.rst +++ b/Documentation/filesystems/vfs.rst @@ -327,11 +327,11 @@ or bottom half). inode->i_lock spinlock held. This method should be either NULL (normal UNIX filesystem - semantics) or "generic_delete_inode" (for filesystems that do + semantics) or "inode_just_drop" (for filesystems that do not want to cache inodes - causing "delete_inode" to always be called regardless of the value of i_nlink) - The "generic_delete_inode()" behavior is equivalent to the old + The "inode_just_drop()" behavior is equivalent to the old practice of using "force_delete" in the put_inode() case, but does not have the races that the "force_delete()" approach had. diff --git a/block/bdev.c b/block/bdev.c index b77ddd12dc06..810707cca970 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -412,7 +412,7 @@ static const struct super_operations bdev_sops = { .statfs = simple_statfs, .alloc_inode = bdev_alloc_inode, .free_inode = bdev_free_inode, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .evict_inode = bdev_evict_inode, }; diff --git a/drivers/dax/super.c b/drivers/dax/super.c index 54c480e874cb..d7714d8afb0f 100644 --- a/drivers/dax/super.c +++ b/drivers/dax/super.c @@ -388,7 +388,7 @@ static const struct super_operations dax_sops = { .alloc_inode = dax_alloc_inode, .destroy_inode = dax_destroy_inode, .free_inode = dax_free_inode, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, }; static int dax_init_fs_context(struct fs_context *fc) diff --git a/drivers/misc/ibmasm/ibmasmfs.c b/drivers/misc/ibmasm/ibmasmfs.c index c44de892a61e..5372ed2a363e 100644 --- a/drivers/misc/ibmasm/ibmasmfs.c +++ b/drivers/misc/ibmasm/ibmasmfs.c @@ -94,7 +94,7 @@ static int ibmasmfs_init_fs_context(struct fs_context *fc) static const struct super_operations ibmasmfs_s_ops = { .statfs = simple_statfs, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, }; static const struct file_operations *ibmasmfs_dir_ops = &simple_dir_operations; diff --git a/drivers/usb/gadget/function/f_fs.c b/drivers/usb/gadget/function/f_fs.c index 08a251df20c4..5246fa6af3d6 100644 --- a/drivers/usb/gadget/function/f_fs.c +++ b/drivers/usb/gadget/function/f_fs.c @@ -1891,7 +1891,7 @@ static struct dentry *ffs_sb_create_file(struct super_block *sb, /* Super block */ static const struct super_operations ffs_sb_operations = { .statfs = simple_statfs, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, }; struct ffs_sb_fill_data { diff --git a/drivers/usb/gadget/legacy/inode.c b/drivers/usb/gadget/legacy/inode.c index b51e132b0cd2..13c3da49348c 100644 --- a/drivers/usb/gadget/legacy/inode.c +++ b/drivers/usb/gadget/legacy/inode.c @@ -2011,7 +2011,7 @@ gadgetfs_create_file (struct super_block *sb, char const *name, static const struct super_operations gadget_fs_operations = { .statfs = simple_statfs, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, }; static int diff --git a/fs/9p/vfs_super.c b/fs/9p/vfs_super.c index 795c6388744c..1581ebac5bb4 100644 --- a/fs/9p/vfs_super.c +++ b/fs/9p/vfs_super.c @@ -252,7 +252,7 @@ static int v9fs_drop_inode(struct inode *inode) v9ses = v9fs_inode2v9ses(inode); if (v9ses->cache & (CACHE_META|CACHE_LOOSE)) - return generic_drop_inode(inode); + return inode_generic_drop(inode); /* * in case of non cached mode always drop the * inode because we want the inode attribute diff --git a/fs/afs/inode.c b/fs/afs/inode.c index e9538e91f848..e1cb17b85791 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -723,9 +723,9 @@ int afs_drop_inode(struct inode *inode) _enter(""); if (test_bit(AFS_VNODE_PSEUDODIR, &AFS_FS_I(inode)->flags)) - return generic_delete_inode(inode); + return inode_just_drop(inode); else - return generic_drop_inode(inode); + return inode_generic_drop(inode); } /* diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 18db1053cdf0..3923be975e47 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -7973,7 +7973,7 @@ int btrfs_drop_inode(struct inode *inode) if (btrfs_root_refs(&root->root_item) == 0) return 1; else - return generic_drop_inode(inode); + return inode_generic_drop(inode); } static void init_once(void *foo) diff --git a/fs/ceph/super.c b/fs/ceph/super.c index c3eb651862c5..70dc9467f6a0 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -1042,7 +1042,7 @@ static const struct super_operations ceph_super_ops = { .alloc_inode = ceph_alloc_inode, .free_inode = ceph_free_inode, .write_inode = ceph_write_inode, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .evict_inode = ceph_evict_inode, .sync_fs = ceph_sync_fs, .put_super = ceph_put_super, diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c index 740f18b60c9d..456c4a2efb53 100644 --- a/fs/configfs/mount.c +++ b/fs/configfs/mount.c @@ -36,7 +36,7 @@ static void configfs_free_inode(struct inode *inode) static const struct super_operations configfs_ops = { .statfs = simple_statfs, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .free_inode = configfs_free_inode, }; diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c index b002e9b734f9..12daa85ed941 100644 --- a/fs/cramfs/inode.c +++ b/fs/cramfs/inode.c @@ -116,9 +116,18 @@ static struct inode *get_cramfs_inode(struct super_block *sb, inode_nohighmem(inode); inode->i_data.a_ops = &cramfs_aops; break; - default: + case S_IFCHR: + case S_IFBLK: + case S_IFIFO: + case S_IFSOCK: init_special_inode(inode, cramfs_inode->mode, old_decode_dev(cramfs_inode->size)); + break; + default: + printk(KERN_DEBUG "CRAMFS: Invalid file type 0%04o for inode %lu.\n", + inode->i_mode, inode->i_ino); + iget_failed(inode); + return ERR_PTR(-EIO); } inode->i_mode = cramfs_inode->mode; diff --git a/fs/dcache.c b/fs/dcache.c index 60046ae23d51..336bdb4c4b1f 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2509,8 +2509,8 @@ static inline unsigned start_dir_add(struct inode *dir) { preempt_disable_nested(); for (;;) { - unsigned n = dir->i_dir_seq; - if (!(n & 1) && cmpxchg(&dir->i_dir_seq, n, n + 1) == n) + unsigned n = READ_ONCE(dir->i_dir_seq); + if (!(n & 1) && try_cmpxchg(&dir->i_dir_seq, &n, n + 1)) return n; cpu_relax(); } diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index 4bb4002e3cdf..1f4d8ce56667 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -127,7 +127,7 @@ static int efivarfs_unfreeze_fs(struct super_block *sb); static const struct super_operations efivarfs_ops = { .statfs = efivarfs_statfs, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .alloc_inode = efivarfs_alloc_inode, .free_inode = efivarfs_free_inode, .show_options = efivarfs_show_options, diff --git a/fs/eventpoll.c b/fs/eventpoll.c index b22d6f819f78..ee7c4b683ec3 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -46,10 +46,10 @@ * * 1) epnested_mutex (mutex) * 2) ep->mtx (mutex) - * 3) ep->lock (rwlock) + * 3) ep->lock (spinlock) * * The acquire order is the one listed above, from 1 to 3. - * We need a rwlock (ep->lock) because we manipulate objects + * We need a spinlock (ep->lock) because we manipulate objects * from inside the poll callback, that might be triggered from * a wake_up() that in turn might be called from IRQ context. * So we can't sleep inside the poll callback and hence we need @@ -195,7 +195,7 @@ struct eventpoll { struct list_head rdllist; /* Lock which protects rdllist and ovflist */ - rwlock_t lock; + spinlock_t lock; /* RB tree root used to store monitored fd structs */ struct rb_root_cached rbr; @@ -741,10 +741,10 @@ static void ep_start_scan(struct eventpoll *ep, struct list_head *txlist) * in a lockless way. */ lockdep_assert_irqs_enabled(); - write_lock_irq(&ep->lock); + spin_lock_irq(&ep->lock); list_splice_init(&ep->rdllist, txlist); WRITE_ONCE(ep->ovflist, NULL); - write_unlock_irq(&ep->lock); + spin_unlock_irq(&ep->lock); } static void ep_done_scan(struct eventpoll *ep, @@ -752,7 +752,7 @@ static void ep_done_scan(struct eventpoll *ep, { struct epitem *epi, *nepi; - write_lock_irq(&ep->lock); + spin_lock_irq(&ep->lock); /* * During the time we spent inside the "sproc" callback, some * other events might have been queued by the poll callback. @@ -793,7 +793,7 @@ static void ep_done_scan(struct eventpoll *ep, wake_up(&ep->wq); } - write_unlock_irq(&ep->lock); + spin_unlock_irq(&ep->lock); } static void ep_get(struct eventpoll *ep) @@ -868,10 +868,10 @@ static bool __ep_remove(struct eventpoll *ep, struct epitem *epi, bool force) rb_erase_cached(&epi->rbn, &ep->rbr); - write_lock_irq(&ep->lock); + spin_lock_irq(&ep->lock); if (ep_is_linked(epi)) list_del_init(&epi->rdllink); - write_unlock_irq(&ep->lock); + spin_unlock_irq(&ep->lock); wakeup_source_unregister(ep_wakeup_source(epi)); /* @@ -1152,7 +1152,7 @@ static int ep_alloc(struct eventpoll **pep) return -ENOMEM; mutex_init(&ep->mtx); - rwlock_init(&ep->lock); + spin_lock_init(&ep->lock); init_waitqueue_head(&ep->wq); init_waitqueue_head(&ep->poll_wait); INIT_LIST_HEAD(&ep->rdllist); @@ -1240,99 +1240,9 @@ struct file *get_epoll_tfile_raw_ptr(struct file *file, int tfd, #endif /* CONFIG_KCMP */ /* - * Adds a new entry to the tail of the list in a lockless way, i.e. - * multiple CPUs are allowed to call this function concurrently. - * - * Beware: it is necessary to prevent any other modifications of the - * existing list until all changes are completed, in other words - * concurrent list_add_tail_lockless() calls should be protected - * with a read lock, where write lock acts as a barrier which - * makes sure all list_add_tail_lockless() calls are fully - * completed. - * - * Also an element can be locklessly added to the list only in one - * direction i.e. either to the tail or to the head, otherwise - * concurrent access will corrupt the list. - * - * Return: %false if element has been already added to the list, %true - * otherwise. - */ -static inline bool list_add_tail_lockless(struct list_head *new, - struct list_head *head) -{ - struct list_head *prev; - - /* - * This is simple 'new->next = head' operation, but cmpxchg() - * is used in order to detect that same element has been just - * added to the list from another CPU: the winner observes - * new->next == new. - */ - if (!try_cmpxchg(&new->next, &new, head)) - return false; - - /* - * Initially ->next of a new element must be updated with the head - * (we are inserting to the tail) and only then pointers are atomically - * exchanged. XCHG guarantees memory ordering, thus ->next should be - * updated before pointers are actually swapped and pointers are - * swapped before prev->next is updated. - */ - - prev = xchg(&head->prev, new); - - /* - * It is safe to modify prev->next and new->prev, because a new element - * is added only to the tail and new->next is updated before XCHG. - */ - - prev->next = new; - new->prev = prev; - - return true; -} - -/* - * Chains a new epi entry to the tail of the ep->ovflist in a lockless way, - * i.e. multiple CPUs are allowed to call this function concurrently. - * - * Return: %false if epi element has been already chained, %true otherwise. - */ -static inline bool chain_epi_lockless(struct epitem *epi) -{ - struct eventpoll *ep = epi->ep; - - /* Fast preliminary check */ - if (epi->next != EP_UNACTIVE_PTR) - return false; - - /* Check that the same epi has not been just chained from another CPU */ - if (cmpxchg(&epi->next, EP_UNACTIVE_PTR, NULL) != EP_UNACTIVE_PTR) - return false; - - /* Atomically exchange tail */ - epi->next = xchg(&ep->ovflist, epi); - - return true; -} - -/* * This is the callback that is passed to the wait queue wakeup * mechanism. It is called by the stored file descriptors when they * have events to report. - * - * This callback takes a read lock in order not to contend with concurrent - * events from another file descriptor, thus all modifications to ->rdllist - * or ->ovflist are lockless. Read lock is paired with the write lock from - * ep_start/done_scan(), which stops all list modifications and guarantees - * that lists state is seen correctly. - * - * Another thing worth to mention is that ep_poll_callback() can be called - * concurrently for the same @epi from different CPUs if poll table was inited - * with several wait queues entries. Plural wakeup from different CPUs of a - * single wait queue is serialized by wq.lock, but the case when multiple wait - * queues are used should be detected accordingly. This is detected using - * cmpxchg() operation. */ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, void *key) { @@ -1343,7 +1253,7 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v unsigned long flags; int ewake = 0; - read_lock_irqsave(&ep->lock, flags); + spin_lock_irqsave(&ep->lock, flags); ep_set_busy_poll_napi_id(epi); @@ -1372,12 +1282,15 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v * chained in ep->ovflist and requeued later on. */ if (READ_ONCE(ep->ovflist) != EP_UNACTIVE_PTR) { - if (chain_epi_lockless(epi)) + if (epi->next == EP_UNACTIVE_PTR) { + epi->next = READ_ONCE(ep->ovflist); + WRITE_ONCE(ep->ovflist, epi); ep_pm_stay_awake_rcu(epi); + } } else if (!ep_is_linked(epi)) { /* In the usual case, add event to ready list. */ - if (list_add_tail_lockless(&epi->rdllink, &ep->rdllist)) - ep_pm_stay_awake_rcu(epi); + list_add_tail(&epi->rdllink, &ep->rdllist); + ep_pm_stay_awake_rcu(epi); } /* @@ -1410,7 +1323,7 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v pwake++; out_unlock: - read_unlock_irqrestore(&ep->lock, flags); + spin_unlock_irqrestore(&ep->lock, flags); /* We have to call this outside the lock */ if (pwake) @@ -1745,7 +1658,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, } /* We have to drop the new item inside our item list to keep track of it */ - write_lock_irq(&ep->lock); + spin_lock_irq(&ep->lock); /* record NAPI ID of new item if present */ ep_set_busy_poll_napi_id(epi); @@ -1762,7 +1675,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, pwake++; } - write_unlock_irq(&ep->lock); + spin_unlock_irq(&ep->lock); /* We have to call this outside the lock */ if (pwake) @@ -1826,7 +1739,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, * list, push it inside. */ if (ep_item_poll(epi, &pt, 1)) { - write_lock_irq(&ep->lock); + spin_lock_irq(&ep->lock); if (!ep_is_linked(epi)) { list_add_tail(&epi->rdllink, &ep->rdllist); ep_pm_stay_awake(epi); @@ -1837,7 +1750,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, if (waitqueue_active(&ep->poll_wait)) pwake++; } - write_unlock_irq(&ep->lock); + spin_unlock_irq(&ep->lock); } /* We have to call this outside the lock */ @@ -2089,7 +2002,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, init_wait(&wait); wait.func = ep_autoremove_wake_function; - write_lock_irq(&ep->lock); + spin_lock_irq(&ep->lock); /* * Barrierless variant, waitqueue_active() is called under * the same lock on wakeup ep_poll_callback() side, so it @@ -2108,7 +2021,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, if (!eavail) __add_wait_queue_exclusive(&ep->wq, &wait); - write_unlock_irq(&ep->lock); + spin_unlock_irq(&ep->lock); if (!eavail) timed_out = !ep_schedule_timeout(to) || @@ -2124,7 +2037,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, eavail = 1; if (!list_empty_careful(&wait.entry)) { - write_lock_irq(&ep->lock); + spin_lock_irq(&ep->lock); /* * If the thread timed out and is not on the wait queue, * it means that the thread was woken up after its @@ -2135,7 +2048,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, if (timed_out) eavail = list_empty(&wait.entry); __remove_wait_queue(&ep->wq, &wait); - write_unlock_irq(&ep->lock); + spin_unlock_irq(&ep->lock); } } } diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 699c15db28a8..caac4067f777 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -1417,7 +1417,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) static int ext4_drop_inode(struct inode *inode) { - int drop = generic_drop_inode(inode); + int drop = inode_generic_drop(inode); if (!drop) drop = fscrypt_drop_inode(inode); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index e16c4e2830c2..63cf73409da6 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1768,7 +1768,7 @@ static int f2fs_drop_inode(struct inode *inode) trace_f2fs_drop_inode(inode, 0); return 0; } - ret = generic_drop_inode(inode); + ret = inode_generic_drop(inode); if (!ret) ret = fscrypt_drop_inode(inode); trace_f2fs_drop_inode(inode, ret); diff --git a/fs/fcntl.c b/fs/fcntl.c index 5598e4d57422..72f8433d9109 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -355,8 +355,7 @@ static bool rw_hint_valid(u64 hint) } } -static long fcntl_get_rw_hint(struct file *file, unsigned int cmd, - unsigned long arg) +static long fcntl_get_rw_hint(struct file *file, unsigned long arg) { struct inode *inode = file_inode(file); u64 __user *argp = (u64 __user *)arg; @@ -367,8 +366,7 @@ static long fcntl_get_rw_hint(struct file *file, unsigned int cmd, return 0; } -static long fcntl_set_rw_hint(struct file *file, unsigned int cmd, - unsigned long arg) +static long fcntl_set_rw_hint(struct file *file, unsigned long arg) { struct inode *inode = file_inode(file); u64 __user *argp = (u64 __user *)arg; @@ -547,10 +545,10 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, err = memfd_fcntl(filp, cmd, argi); break; case F_GET_RW_HINT: - err = fcntl_get_rw_hint(filp, cmd, arg); + err = fcntl_get_rw_hint(filp, arg); break; case F_SET_RW_HINT: - err = fcntl_set_rw_hint(filp, cmd, arg); + err = fcntl_set_rw_hint(filp, arg); break; default: break; diff --git a/fs/file.c b/fs/file.c index 6d2275c3be9c..28743b742e3c 100644 --- a/fs/file.c +++ b/fs/file.c @@ -1330,7 +1330,10 @@ int replace_fd(unsigned fd, struct file *file, unsigned flags) err = expand_files(files, fd); if (unlikely(err < 0)) goto out_unlock; - return do_dup2(files, file, fd, flags); + err = do_dup2(files, file, fd, flags); + if (err < 0) + return err; + return 0; out_unlock: spin_unlock(&files->file_lock); diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index a07b8cf73ae2..a6cc3d305b84 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1123,7 +1123,7 @@ int cgroup_writeback_by_id(u64 bdi_id, int memcg_id, dirty = dirty * 10 / 8; /* issue the writeback work */ - work = kzalloc(sizeof(*work), GFP_NOWAIT | __GFP_NOWARN); + work = kzalloc(sizeof(*work), GFP_NOWAIT); if (work) { work->nr_pages = dirty; work->sync_mode = WB_SYNC_NONE; diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 7ddfd2b3cc9c..fdecd5a90dee 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1209,7 +1209,7 @@ static const struct super_operations fuse_super_operations = { .free_inode = fuse_free_inode, .evict_inode = fuse_evict_inode, .write_inode = fuse_write_inode, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .umount_begin = fuse_umount_begin, .statfs = fuse_statfs, .sync_fs = fuse_sync_fs, diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index b42e2110084b..644b2d1e7276 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -1050,7 +1050,7 @@ static int gfs2_drop_inode(struct inode *inode) if (test_bit(SDF_EVICTING, &sdp->sd_flags)) return 1; - return generic_drop_inode(inode); + return inode_generic_drop(inode); } /** diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 01e516175bcd..1e1acf5775ab 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -261,7 +261,7 @@ static int hostfs_show_options(struct seq_file *seq, struct dentry *root) static const struct super_operations hostfs_sbops = { .alloc_inode = hostfs_alloc_inode, .free_inode = hostfs_free_inode, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .evict_inode = hostfs_evict_inode, .statfs = hostfs_statfs, .show_options = hostfs_show_options, diff --git a/fs/inode.c b/fs/inode.c index 01ebdc40021e..a66c02123bca 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1838,11 +1838,11 @@ int insert_inode_locked4(struct inode *inode, unsigned long hashval, EXPORT_SYMBOL(insert_inode_locked4); -int generic_delete_inode(struct inode *inode) +int inode_just_drop(struct inode *inode) { return 1; } -EXPORT_SYMBOL(generic_delete_inode); +EXPORT_SYMBOL(inode_just_drop); /* * Called when we're dropping the last reference @@ -1866,7 +1866,7 @@ static void iput_final(struct inode *inode) if (op->drop_inode) drop = op->drop_inode(inode); else - drop = generic_drop_inode(inode); + drop = inode_generic_drop(inode); if (!drop && !(inode->i_state & I_DONTCACHE) && @@ -2189,7 +2189,7 @@ static int __remove_privs(struct mnt_idmap *idmap, return notify_change(idmap, dentry, &newattrs, NULL); } -int file_remove_privs_flags(struct file *file, unsigned int flags) +static int file_remove_privs_flags(struct file *file, unsigned int flags) { struct dentry *dentry = file_dentry(file); struct inode *inode = file_inode(file); @@ -2214,7 +2214,6 @@ int file_remove_privs_flags(struct file *file, unsigned int flags) inode_has_no_xattr(inode); return error; } -EXPORT_SYMBOL_GPL(file_remove_privs_flags); /** * file_remove_privs - remove special file privileges (suid, capabilities) @@ -2519,21 +2518,28 @@ void __init inode_init(void) void init_special_inode(struct inode *inode, umode_t mode, dev_t rdev) { inode->i_mode = mode; - if (S_ISCHR(mode)) { + switch (inode->i_mode & S_IFMT) { + case S_IFCHR: inode->i_fop = &def_chr_fops; inode->i_rdev = rdev; - } else if (S_ISBLK(mode)) { + break; + case S_IFBLK: if (IS_ENABLED(CONFIG_BLOCK)) inode->i_fop = &def_blk_fops; inode->i_rdev = rdev; - } else if (S_ISFIFO(mode)) + break; + case S_IFIFO: inode->i_fop = &pipefifo_fops; - else if (S_ISSOCK(mode)) - ; /* leave it no_open_fops */ - else + break; + case S_IFSOCK: + /* leave it no_open_fops */ + break; + default: printk(KERN_DEBUG "init_special_inode: bogus i_mode (%o) for" " inode %s:%lu\n", mode, inode->i_sb->s_id, inode->i_ino); + break; + } } EXPORT_SYMBOL(init_special_inode); @@ -2914,7 +2920,7 @@ EXPORT_SYMBOL(mode_strip_sgid); */ void dump_inode(struct inode *inode, const char *reason) { - pr_warn("%s encountered for inode %px", reason, inode); + pr_warn("%s encountered for inode %px (%s)\n", reason, inode, inode->i_sb->s_type->name); } EXPORT_SYMBOL(dump_inode); diff --git a/fs/ioctl.c b/fs/ioctl.c index 0248cb8db2d3..1c152c2b1b67 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -41,7 +41,7 @@ * * Returns 0 on success, -errno on error. */ -int vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +static int vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { int error = -ENOTTY; @@ -54,7 +54,6 @@ int vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) out: return error; } -EXPORT_SYMBOL(vfs_ioctl); static int ioctl_fibmap(struct file *filp, int __user *p) { @@ -426,7 +425,7 @@ static int ioctl_file_dedupe_range(struct file *file, goto out; } - size = offsetof(struct file_dedupe_range, info[count]); + size = struct_size(same, info, count); if (size > PAGE_SIZE) { ret = -ENOMEM; goto out; diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index e384a69fbece..76eaf64b9d9e 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -57,7 +57,7 @@ static int kernfs_statfs(struct dentry *dentry, struct kstatfs *buf) const struct super_operations kernfs_sops = { .statfs = kernfs_statfs, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .evict_inode = kernfs_evict_inode, .show_options = kernfs_sop_show_options, diff --git a/fs/locks.c b/fs/locks.c index 559f02aa4172..04a3f0e20724 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -2328,8 +2328,8 @@ out: * To avoid blocking kernel daemons, such as lockd, that need to acquire POSIX * locks, the ->lock() interface may return asynchronously, before the lock has * been granted or denied by the underlying filesystem, if (and only if) - * lm_grant is set. Additionally EXPORT_OP_ASYNC_LOCK in export_operations - * flags need to be set. + * lm_grant is set. Additionally FOP_ASYNC_LOCK in file_operations fop_flags + * need to be set. * * Callers expecting ->lock() to return asynchronously will only use F_SETLK, * not F_SETLKW; they will set FL_SLEEP if (and only if) the request is for a diff --git a/fs/minix/inode.c b/fs/minix/inode.c index df9d11479caf..32db676127a9 100644 --- a/fs/minix/inode.c +++ b/fs/minix/inode.c @@ -492,8 +492,14 @@ void minix_set_inode(struct inode *inode, dev_t rdev) inode->i_op = &minix_symlink_inode_operations; inode_nohighmem(inode); inode->i_mapping->a_ops = &minix_aops; - } else + } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || + S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { init_special_inode(inode, inode->i_mode, rdev); + } else { + printk(KERN_DEBUG "MINIX-fs: Invalid file type 0%04o for inode %lu.\n", + inode->i_mode, inode->i_ino); + make_bad_inode(inode); + } } /* diff --git a/fs/namei.c b/fs/namei.c index cd43ff89fbaa..44856b70ea3b 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1449,6 +1449,10 @@ static int follow_automount(struct path *path, int *count, unsigned lookup_flags dentry->d_inode) return -EISDIR; + /* No need to trigger automounts if mountpoint crossing is disabled. */ + if (lookup_flags & LOOKUP_NO_XDEV) + return -EXDEV; + if (count && (*count)++ >= MAXSYMLINKS) return -ELOOP; @@ -1472,6 +1476,10 @@ static int __traverse_mounts(struct path *path, unsigned flags, bool *jumped, /* Allow the filesystem to manage the transit without i_rwsem * being held. */ if (flags & DCACHE_MANAGE_TRANSIT) { + if (lookup_flags & LOOKUP_NO_XDEV) { + ret = -EXDEV; + break; + } ret = path->dentry->d_op->d_manage(path, false); flags = smp_load_acquire(&path->dentry->d_flags); if (ret < 0) @@ -1489,6 +1497,10 @@ static int __traverse_mounts(struct path *path, unsigned flags, bool *jumped, // here we know it's positive flags = path->dentry->d_flags; need_mntput = true; + if (unlikely(lookup_flags & LOOKUP_NO_XDEV)) { + ret = -EXDEV; + break; + } continue; } } @@ -1630,12 +1642,8 @@ static inline int handle_mounts(struct nameidata *nd, struct dentry *dentry, return -ECHILD; } ret = traverse_mounts(path, &jumped, &nd->total_link_count, nd->flags); - if (jumped) { - if (unlikely(nd->flags & LOOKUP_NO_XDEV)) - ret = -EXDEV; - else - nd->state |= ND_JUMPED; - } + if (jumped) + nd->state |= ND_JUMPED; if (unlikely(ret)) { dput(path->dentry); if (path->mnt != nd->path.mnt) @@ -4828,7 +4836,7 @@ int vfs_link(struct dentry *old_dentry, struct mnt_idmap *idmap, return -EPERM; /* * Updating the link count will likely cause i_uid and i_gid to - * be writen back improperly if their true value is unknown to + * be written back improperly if their true value is unknown to * the vfs. */ if (HAS_UNMAPPED_ID(idmap, inode)) diff --git a/fs/namespace.c b/fs/namespace.c index 51f77c65c0c6..38609066cf33 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -65,6 +65,15 @@ static int __init set_mphash_entries(char *str) } __setup("mphash_entries=", set_mphash_entries); +static char * __initdata initramfs_options; +static int __init initramfs_options_setup(char *str) +{ + initramfs_options = str; + return 1; +} + +__setup("initramfs_options=", initramfs_options_setup); + static u64 event; static DEFINE_XARRAY_FLAGS(mnt_id_xa, XA_FLAGS_ALLOC); static DEFINE_IDA(mnt_group_ida); @@ -5711,7 +5720,6 @@ static int grab_requested_root(struct mnt_namespace *ns, struct path *root) static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id, struct mnt_namespace *ns) { - struct path root __free(path_put) = {}; struct mount *m; int err; @@ -5723,7 +5731,7 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id, if (!s->mnt) return -ENOENT; - err = grab_requested_root(ns, &root); + err = grab_requested_root(ns, &s->root); if (err) return err; @@ -5732,7 +5740,7 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id, * mounts to show users. */ m = real_mount(s->mnt); - if (!is_path_reachable(m, m->mnt.mnt_root, &root) && + if (!is_path_reachable(m, m->mnt.mnt_root, &s->root) && !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; @@ -5740,8 +5748,6 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id, if (err) return err; - s->root = root; - /* * Note that mount properties in mnt->mnt_flags, mnt->mnt_idmap * can change concurrently as we only hold the read-side of the @@ -5963,28 +5969,40 @@ retry: if (!ret) ret = copy_statmount_to_user(ks); kvfree(ks->seq.buf); + path_put(&ks->root); if (retry_statmount(ret, &seq_size)) goto retry; return ret; } -static ssize_t do_listmount(struct mnt_namespace *ns, u64 mnt_parent_id, - u64 last_mnt_id, u64 *mnt_ids, size_t nr_mnt_ids, - bool reverse) +struct klistmount { + u64 last_mnt_id; + u64 mnt_parent_id; + u64 *kmnt_ids; + u32 nr_mnt_ids; + struct mnt_namespace *ns; + struct path root; +}; + +static ssize_t do_listmount(struct klistmount *kls, bool reverse) { - struct path root __free(path_put) = {}; + struct mnt_namespace *ns = kls->ns; + u64 mnt_parent_id = kls->mnt_parent_id; + u64 last_mnt_id = kls->last_mnt_id; + u64 *mnt_ids = kls->kmnt_ids; + size_t nr_mnt_ids = kls->nr_mnt_ids; struct path orig; struct mount *r, *first; ssize_t ret; rwsem_assert_held(&namespace_sem); - ret = grab_requested_root(ns, &root); + ret = grab_requested_root(ns, &kls->root); if (ret) return ret; if (mnt_parent_id == LSMT_ROOT) { - orig = root; + orig = kls->root; } else { orig.mnt = lookup_mnt_in_ns(mnt_parent_id, ns); if (!orig.mnt) @@ -5996,7 +6014,7 @@ static ssize_t do_listmount(struct mnt_namespace *ns, u64 mnt_parent_id, * Don't trigger audit denials. We just want to determine what * mounts to show users. */ - if (!is_path_reachable(real_mount(orig.mnt), orig.dentry, &root) && + if (!is_path_reachable(real_mount(orig.mnt), orig.dentry, &kls->root) && !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN)) return -EPERM; @@ -6029,14 +6047,45 @@ static ssize_t do_listmount(struct mnt_namespace *ns, u64 mnt_parent_id, return ret; } +static void __free_klistmount_free(const struct klistmount *kls) +{ + path_put(&kls->root); + kvfree(kls->kmnt_ids); + mnt_ns_release(kls->ns); +} + +static inline int prepare_klistmount(struct klistmount *kls, struct mnt_id_req *kreq, + size_t nr_mnt_ids) +{ + + u64 last_mnt_id = kreq->param; + + /* The first valid unique mount id is MNT_UNIQUE_ID_OFFSET + 1. */ + if (last_mnt_id != 0 && last_mnt_id <= MNT_UNIQUE_ID_OFFSET) + return -EINVAL; + + kls->last_mnt_id = last_mnt_id; + + kls->nr_mnt_ids = nr_mnt_ids; + kls->kmnt_ids = kvmalloc_array(nr_mnt_ids, sizeof(*kls->kmnt_ids), + GFP_KERNEL_ACCOUNT); + if (!kls->kmnt_ids) + return -ENOMEM; + + kls->ns = grab_requested_mnt_ns(kreq); + if (!kls->ns) + return -ENOENT; + + kls->mnt_parent_id = kreq->mnt_id; + return 0; +} + SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req, u64 __user *, mnt_ids, size_t, nr_mnt_ids, unsigned int, flags) { - u64 *kmnt_ids __free(kvfree) = NULL; + struct klistmount kls __free(klistmount_free) = {}; const size_t maxcount = 1000000; - struct mnt_namespace *ns __free(mnt_ns_release) = NULL; struct mnt_id_req kreq; - u64 last_mnt_id; ssize_t ret; if (flags & ~LISTMOUNT_REVERSE) @@ -6057,22 +6106,12 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req, if (ret) return ret; - last_mnt_id = kreq.param; - /* The first valid unique mount id is MNT_UNIQUE_ID_OFFSET + 1. */ - if (last_mnt_id != 0 && last_mnt_id <= MNT_UNIQUE_ID_OFFSET) - return -EINVAL; - - kmnt_ids = kvmalloc_array(nr_mnt_ids, sizeof(*kmnt_ids), - GFP_KERNEL_ACCOUNT); - if (!kmnt_ids) - return -ENOMEM; - - ns = grab_requested_mnt_ns(&kreq); - if (!ns) - return -ENOENT; + ret = prepare_klistmount(&kls, &kreq, nr_mnt_ids); + if (ret) + return ret; - if (kreq.mnt_ns_id && (ns != current->nsproxy->mnt_ns) && - !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN)) + if (kreq.mnt_ns_id && (kls.ns != current->nsproxy->mnt_ns) && + !ns_capable_noaudit(kls.ns->user_ns, CAP_SYS_ADMIN)) return -ENOENT; /* @@ -6080,12 +6119,11 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req, * listmount() doesn't care about any mount properties. */ scoped_guard(rwsem_read, &namespace_sem) - ret = do_listmount(ns, kreq.mnt_id, last_mnt_id, kmnt_ids, - nr_mnt_ids, (flags & LISTMOUNT_REVERSE)); + ret = do_listmount(&kls, (flags & LISTMOUNT_REVERSE)); if (ret <= 0) return ret; - if (copy_to_user(mnt_ids, kmnt_ids, ret * sizeof(*mnt_ids))) + if (copy_to_user(mnt_ids, kls.kmnt_ids, ret * sizeof(*mnt_ids))) return -EFAULT; return ret; @@ -6098,7 +6136,7 @@ static void __init init_mount_tree(void) struct mnt_namespace *ns; struct path root; - mnt = vfs_kern_mount(&rootfs_fs_type, 0, "rootfs", NULL); + mnt = vfs_kern_mount(&rootfs_fs_type, 0, "rootfs", initramfs_options); if (IS_ERR(mnt)) panic("Can't create rootfs"); diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 49df9debb1a6..43f09d8eb5e3 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -108,7 +108,7 @@ u64 nfs_compat_user_ino64(u64 fileid) int nfs_drop_inode(struct inode *inode) { - return NFS_STALE(inode) || generic_drop_inode(inode); + return NFS_STALE(inode) || inode_generic_drop(inode); } EXPORT_SYMBOL_GPL(nfs_drop_inode); diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index 5130ec44e5e1..807e2b758a5c 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -547,7 +547,7 @@ static const struct super_operations dlmfs_ops = { .alloc_inode = dlmfs_alloc_inode, .free_inode = dlmfs_free_inode, .evict_inode = dlmfs_evict_inode, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, }; static const struct inode_operations dlmfs_file_inode_operations = { diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c index f3da840758e7..b46100a4f529 100644 --- a/fs/orangefs/super.c +++ b/fs/orangefs/super.c @@ -306,7 +306,7 @@ static const struct super_operations orangefs_s_ops = { .free_inode = orangefs_free_inode, .destroy_inode = orangefs_destroy_inode, .write_inode = orangefs_write_inode, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .statfs = orangefs_statfs, .show_options = orangefs_show_options, }; diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index df85a76597e9..bd3d7ba8fb95 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -280,7 +280,7 @@ static const struct super_operations ovl_super_operations = { .alloc_inode = ovl_alloc_inode, .free_inode = ovl_free_inode, .destroy_inode = ovl_destroy_inode, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .put_super = ovl_put_super, .sync_fs = ovl_sync_fs, .statfs = ovl_statfs, diff --git a/fs/pidfs.c b/fs/pidfs.c index 108e7527f837..d01729c5263a 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -718,7 +718,7 @@ static void pidfs_evict_inode(struct inode *inode) } static const struct super_operations pidfs_sops = { - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .evict_inode = pidfs_evict_inode, .statfs = simple_statfs, }; diff --git a/fs/pipe.c b/fs/pipe.c index 731622d0738d..42fead1efe52 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -458,7 +458,8 @@ anon_pipe_write(struct kiocb *iocb, struct iov_iter *from) mutex_lock(&pipe->mutex); if (!pipe->readers) { - send_sig(SIGPIPE, current, 0); + if ((iocb->ki_flags & IOCB_NOSIGNAL) == 0) + send_sig(SIGPIPE, current, 0); ret = -EPIPE; goto out; } @@ -498,7 +499,8 @@ anon_pipe_write(struct kiocb *iocb, struct iov_iter *from) for (;;) { if (!pipe->readers) { - send_sig(SIGPIPE, current, 0); + if ((iocb->ki_flags & IOCB_NOSIGNAL) == 0) + send_sig(SIGPIPE, current, 0); if (!ret) ret = -EPIPE; break; diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 129490151be1..d9b7ef122343 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -187,7 +187,7 @@ static int proc_show_options(struct seq_file *seq, struct dentry *root) const struct super_operations proc_sops = { .alloc_inode = proc_alloc_inode, .free_inode = proc_free_inode, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .evict_inode = proc_evict_inode, .statfs = simple_statfs, .show_options = proc_show_options, diff --git a/fs/proc/namespaces.c b/fs/proc/namespaces.c index 4403a2e20c16..ea2b597fd92c 100644 --- a/fs/proc/namespaces.c +++ b/fs/proc/namespaces.c @@ -12,7 +12,7 @@ #include "internal.h" -static const struct proc_ns_operations *ns_entries[] = { +static const struct proc_ns_operations *const ns_entries[] = { #ifdef CONFIG_NET_NS &netns_operations, #endif @@ -117,7 +117,7 @@ static struct dentry *proc_ns_instantiate(struct dentry *dentry, static int proc_ns_dir_readdir(struct file *file, struct dir_context *ctx) { struct task_struct *task = get_proc_task(file_inode(file)); - const struct proc_ns_operations **entry, **last; + const struct proc_ns_operations *const *entry, *const *last; if (!task) return -ENOENT; @@ -151,7 +151,7 @@ static struct dentry *proc_ns_dir_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) { struct task_struct *task = get_proc_task(dir); - const struct proc_ns_operations **entry, **last; + const struct proc_ns_operations *const *entry, *const *last; unsigned int len = dentry->d_name.len; struct dentry *res = ERR_PTR(-ENOENT); diff --git a/fs/proc/root.c b/fs/proc/root.c index ed86ac710384..fd1f1c8a939a 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -38,12 +38,14 @@ enum proc_param { Opt_gid, Opt_hidepid, Opt_subset, + Opt_pidns, }; static const struct fs_parameter_spec proc_fs_parameters[] = { - fsparam_u32("gid", Opt_gid), + fsparam_u32("gid", Opt_gid), fsparam_string("hidepid", Opt_hidepid), fsparam_string("subset", Opt_subset), + fsparam_file_or_string("pidns", Opt_pidns), {} }; @@ -109,11 +111,66 @@ static int proc_parse_subset_param(struct fs_context *fc, char *value) return 0; } +#ifdef CONFIG_PID_NS +static int proc_parse_pidns_param(struct fs_context *fc, + struct fs_parameter *param, + struct fs_parse_result *result) +{ + struct proc_fs_context *ctx = fc->fs_private; + struct pid_namespace *target, *active = task_active_pid_ns(current); + struct ns_common *ns; + struct file *ns_filp __free(fput) = NULL; + + switch (param->type) { + case fs_value_is_file: + /* came through fsconfig, steal the file reference */ + ns_filp = no_free_ptr(param->file); + break; + case fs_value_is_string: + ns_filp = filp_open(param->string, O_RDONLY, 0); + break; + default: + WARN_ON_ONCE(true); + break; + } + if (!ns_filp) + ns_filp = ERR_PTR(-EBADF); + if (IS_ERR(ns_filp)) { + errorfc(fc, "could not get file from pidns argument"); + return PTR_ERR(ns_filp); + } + + if (!proc_ns_file(ns_filp)) + return invalfc(fc, "pidns argument is not an nsfs file"); + ns = get_proc_ns(file_inode(ns_filp)); + if (ns->ops->type != CLONE_NEWPID) + return invalfc(fc, "pidns argument is not a pidns file"); + target = container_of(ns, struct pid_namespace, ns); + + /* + * pidns= is shorthand for joining the pidns to get a fsopen fd, so the + * permission model should be the same as pidns_install(). + */ + if (!ns_capable(target->user_ns, CAP_SYS_ADMIN)) { + errorfc(fc, "insufficient permissions to set pidns"); + return -EPERM; + } + if (!pidns_is_ancestor(target, active)) + return invalfc(fc, "cannot set pidns to non-descendant pidns"); + + put_pid_ns(ctx->pid_ns); + ctx->pid_ns = get_pid_ns(target); + put_user_ns(fc->user_ns); + fc->user_ns = get_user_ns(ctx->pid_ns->user_ns); + return 0; +} +#endif /* CONFIG_PID_NS */ + static int proc_parse_param(struct fs_context *fc, struct fs_parameter *param) { struct proc_fs_context *ctx = fc->fs_private; struct fs_parse_result result; - int opt; + int opt, err; opt = fs_parse(fc, proc_fs_parameters, param, &result); if (opt < 0) @@ -125,14 +182,38 @@ static int proc_parse_param(struct fs_context *fc, struct fs_parameter *param) break; case Opt_hidepid: - if (proc_parse_hidepid_param(fc, param)) - return -EINVAL; + err = proc_parse_hidepid_param(fc, param); + if (err) + return err; break; case Opt_subset: - if (proc_parse_subset_param(fc, param->string) < 0) - return -EINVAL; + err = proc_parse_subset_param(fc, param->string); + if (err) + return err; + break; + + case Opt_pidns: +#ifdef CONFIG_PID_NS + /* + * We would have to RCU-protect every proc_pid_ns() or + * proc_sb_info() access if we allowed this to be reconfigured + * for an existing procfs instance. Luckily, procfs instances + * are cheap to create, and mount-beneath would let you + * atomically replace an instance even with overmounts. + */ + if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { + errorfc(fc, "cannot reconfigure pidns for existing procfs"); + return -EBUSY; + } + err = proc_parse_pidns_param(fc, param, &result); + if (err) + return err; break; +#else + errorfc(fc, "pidns mount flag not supported on this system"); + return -EOPNOTSUPP; +#endif default: return -EINVAL; @@ -154,6 +235,11 @@ static void proc_apply_options(struct proc_fs_info *fs_info, fs_info->hide_pid = ctx->hidepid; if (ctx->mask & (1 << Opt_subset)) fs_info->pidonly = ctx->pidonly; + if (ctx->mask & (1 << Opt_pidns) && + !WARN_ON_ONCE(fc->purpose == FS_CONTEXT_FOR_RECONFIGURE)) { + put_pid_ns(fs_info->pid_ns); + fs_info->pid_ns = get_pid_ns(ctx->pid_ns); + } } static int proc_fill_super(struct super_block *s, struct fs_context *fc) diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 1a2e1185426c..b4e55c90f8dc 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -282,7 +282,7 @@ static int pstore_reconfigure(struct fs_context *fc) static const struct super_operations pstore_ops = { .statfs = simple_statfs, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .evict_inode = pstore_evict_inode, .show_options = pstore_show_options, }; diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index f8874c3b8c1e..41f9995da7ca 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -215,7 +215,7 @@ static int ramfs_show_options(struct seq_file *m, struct dentry *root) static const struct super_operations ramfs_ops = { .statfs = simple_statfs, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .show_options = ramfs_show_options, }; diff --git a/fs/read_write.c b/fs/read_write.c index c5b6265d984b..833bae068770 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1576,6 +1576,13 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, if (len == 0) return 0; + /* + * Make sure return value doesn't overflow in 32bit compat mode. Also + * limit the size for all cases except when calling ->copy_file_range(). + */ + if (splice || !file_out->f_op->copy_file_range || in_compat_syscall()) + len = min_t(size_t, MAX_RW_COUNT, len); + file_start_write(file_out); /* @@ -1589,9 +1596,7 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, len, flags); } else if (!splice && file_in->f_op->remap_file_range && samesb) { ret = file_in->f_op->remap_file_range(file_in, pos_in, - file_out, pos_out, - min_t(loff_t, MAX_RW_COUNT, len), - REMAP_FILE_CAN_SHORTEN); + file_out, pos_out, len, REMAP_FILE_CAN_SHORTEN); /* fallback to splice */ if (ret <= 0) splice = true; @@ -1624,8 +1629,7 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, * to splicing from input file, while file_start_write() is held on * the output file on a different sb. */ - ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out, - min_t(size_t, len, MAX_RW_COUNT), 0); + ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out, len, 0); done: if (ret > 0) { fsnotify_access(file_in); diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c index e1848276bab4..b0e84ca5d268 100644 --- a/fs/smb/client/cifsfs.c +++ b/fs/smb/client/cifsfs.c @@ -857,7 +857,7 @@ static int cifs_drop_inode(struct inode *inode) /* no serverino => unconditional eviction */ return !(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) || - generic_drop_inode(inode); + inode_generic_drop(inode); } static const struct super_operations cifs_super_ops = { diff --git a/fs/super.c b/fs/super.c index 7f876f32343a..e91718017701 100644 --- a/fs/super.c +++ b/fs/super.c @@ -2318,13 +2318,15 @@ int sb_init_dio_done_wq(struct super_block *sb) sb->s_id); if (!wq) return -ENOMEM; + + old = NULL; /* * This has to be atomic as more DIOs can race to create the workqueue */ - old = cmpxchg(&sb->s_dio_done_wq, NULL, wq); - /* Someone created workqueue before us? Free ours... */ - if (old) + if (!try_cmpxchg(&sb->s_dio_done_wq, &old, wq)) { + /* Someone created workqueue before us? Free ours... */ destroy_workqueue(wq); + } return 0; } EXPORT_SYMBOL_GPL(sb_init_dio_done_wq); diff --git a/fs/ubifs/super.c b/fs/ubifs/super.c index f3e3b2068608..733fd1e5a9a2 100644 --- a/fs/ubifs/super.c +++ b/fs/ubifs/super.c @@ -335,7 +335,7 @@ static int ubifs_write_inode(struct inode *inode, struct writeback_control *wbc) static int ubifs_drop_inode(struct inode *inode) { - int drop = generic_drop_inode(inode); + int drop = inode_generic_drop(inode); if (!drop) drop = fscrypt_drop_inode(inode); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index bb0a82635a77..a05ff68748dc 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -778,7 +778,7 @@ xfs_fs_drop_inode( return 0; } - return generic_drop_inode(inode); + return inode_generic_drop(inode); } STATIC void diff --git a/include/linux/fs.h b/include/linux/fs.h index 601d036a6c78..e90384db5544 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -357,6 +357,7 @@ struct readahead_control; #define IOCB_APPEND (__force int) RWF_APPEND #define IOCB_ATOMIC (__force int) RWF_ATOMIC #define IOCB_DONTCACHE (__force int) RWF_DONTCACHE +#define IOCB_NOSIGNAL (__force int) RWF_NOSIGNAL /* non-RWF related bits - start at 16 */ #define IOCB_EVENTFD (1 << 16) @@ -2053,8 +2054,6 @@ int vfs_fchown(struct file *file, uid_t user, gid_t group); int vfs_fchmod(struct file *file, umode_t mode); int vfs_utimes(const struct path *path, struct timespec64 *times); -int vfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); - #ifdef CONFIG_COMPAT extern long compat_ptr_ioctl(struct file *file, unsigned int cmd, unsigned long arg); @@ -3282,7 +3281,7 @@ static inline bool is_dot_dotdot(const char *name, size_t len) /** * name_contains_dotdot - check if a file name contains ".." path components - * + * @name: File path string to check * Search for ".." surrounded by either '/' or start/end of string. */ static inline bool name_contains_dotdot(const char *name) @@ -3314,8 +3313,8 @@ extern void address_space_init_once(struct address_space *mapping); extern struct inode * igrab(struct inode *); extern ino_t iunique(struct super_block *, ino_t); extern int inode_needs_sync(struct inode *inode); -extern int generic_delete_inode(struct inode *inode); -static inline int generic_drop_inode(struct inode *inode) +extern int inode_just_drop(struct inode *inode); +static inline int inode_generic_drop(struct inode *inode) { return !inode->i_nlink || inode_unhashed(inode); } @@ -3394,7 +3393,6 @@ static inline struct inode *new_inode_pseudo(struct super_block *sb) extern struct inode *new_inode(struct super_block *sb); extern void free_inode_nonrcu(struct inode *inode); extern int setattr_should_drop_suidgid(struct mnt_idmap *, struct inode *); -extern int file_remove_privs_flags(struct file *file, unsigned int flags); extern int file_remove_privs(struct file *); int setattr_should_drop_sgid(struct mnt_idmap *idmap, const struct inode *inode); diff --git a/include/linux/pid_namespace.h b/include/linux/pid_namespace.h index 7c67a5811199..17fdc059f8da 100644 --- a/include/linux/pid_namespace.h +++ b/include/linux/pid_namespace.h @@ -84,6 +84,9 @@ extern void zap_pid_ns_processes(struct pid_namespace *pid_ns); extern int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd); extern void put_pid_ns(struct pid_namespace *ns); +extern bool pidns_is_ancestor(struct pid_namespace *child, + struct pid_namespace *ancestor); + #else /* !CONFIG_PID_NS */ #include <linux/err.h> @@ -118,6 +121,12 @@ static inline int reboot_pid_ns(struct pid_namespace *pid_ns, int cmd) { return 0; } + +static inline bool pidns_is_ancestor(struct pid_namespace *child, + struct pid_namespace *ancestor) +{ + return false; +} #endif /* CONFIG_PID_NS */ extern struct pid_namespace *task_active_pid_ns(struct task_struct *tsk); diff --git a/include/trace/events/filelock.h b/include/trace/events/filelock.h index b8d1e00a7982..2dfeb158e848 100644 --- a/include/trace/events/filelock.h +++ b/include/trace/events/filelock.h @@ -27,7 +27,8 @@ { FL_SLEEP, "FL_SLEEP" }, \ { FL_DOWNGRADE_PENDING, "FL_DOWNGRADE_PENDING" }, \ { FL_UNLOCK_PENDING, "FL_UNLOCK_PENDING" }, \ - { FL_OFDLCK, "FL_OFDLCK" }) + { FL_OFDLCK, "FL_OFDLCK" }, \ + { FL_RECLAIM, "FL_RECLAIM"}) #define show_fl_type(val) \ __print_symbolic(val, \ diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index 0bd678a4a10e..beb4c2d1e41c 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -430,10 +430,13 @@ typedef int __bitwise __kernel_rwf_t; /* buffered IO that drops the cache after reading or writing data */ #define RWF_DONTCACHE ((__force __kernel_rwf_t)0x00000080) +/* prevent pipe and socket writes from raising SIGPIPE */ +#define RWF_NOSIGNAL ((__force __kernel_rwf_t)0x00000100) + /* mask of flags supported by the kernel */ #define RWF_SUPPORTED (RWF_HIPRI | RWF_DSYNC | RWF_SYNC | RWF_NOWAIT |\ RWF_APPEND | RWF_NOAPPEND | RWF_ATOMIC |\ - RWF_DONTCACHE) + RWF_DONTCACHE | RWF_NOSIGNAL) #define PROCFS_IOCTL_MAGIC 'f' diff --git a/init/Kconfig b/init/Kconfig index 6a91ec742186..87c868f86a06 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -1504,6 +1504,7 @@ config BOOT_CONFIG_EMBED_FILE config INITRAMFS_PRESERVE_MTIME bool "Preserve cpio archive mtimes in initramfs" + depends on BLK_DEV_INITRD default y help Each entry in an initramfs cpio archive carries an mtime value. When diff --git a/init/do_mounts_rd.c b/init/do_mounts_rd.c index ac021ae6e6fa..19d9f33dcacf 100644 --- a/init/do_mounts_rd.c +++ b/init/do_mounts_rd.c @@ -7,6 +7,7 @@ #include <uapi/linux/cramfs_fs.h> #include <linux/initrd.h> #include <linux/string.h> +#include <linux/string_choices.h> #include <linux/slab.h> #include "do_mounts.h" @@ -186,14 +187,12 @@ static unsigned long nr_blocks(struct file *file) int __init rd_load_image(char *from) { int res = 0; - unsigned long rd_blocks, devblocks; + unsigned long rd_blocks, devblocks, nr_disks; int nblocks, i; char *buf = NULL; unsigned short rotate = 0; decompress_fn decompressor = NULL; -#if !defined(CONFIG_S390) char rotator[4] = { '|' , '/' , '-' , '\\' }; -#endif out_file = filp_open("/dev/ram", O_RDWR, 0); if (IS_ERR(out_file)) @@ -244,8 +243,9 @@ int __init rd_load_image(char *from) goto done; } - printk(KERN_NOTICE "RAMDISK: Loading %dKiB [%ld disk%s] into ram disk... ", - nblocks, ((nblocks-1)/devblocks)+1, nblocks>devblocks ? "s" : ""); + nr_disks = (nblocks - 1) / devblocks + 1; + pr_notice("RAMDISK: Loading %dKiB [%ld disk%s] into ram disk... ", + nblocks, nr_disks, str_plural(nr_disks)); for (i = 0; i < nblocks; i++) { if (i && (i % devblocks == 0)) { pr_cont("done disk #1.\n"); @@ -255,12 +255,10 @@ int __init rd_load_image(char *from) } kernel_read(in_file, buf, BLOCK_SIZE, &in_pos); kernel_write(out_file, buf, BLOCK_SIZE, &out_pos); -#if !defined(CONFIG_S390) - if (!(i % 16)) { + if (!IS_ENABLED(CONFIG_S390) && !(i % 16)) { pr_cont("%c\b", rotator[rotate & 0x3]); rotate++; } -#endif } pr_cont("done.\n"); diff --git a/init/initramfs.c b/init/initramfs.c index 097673b97784..6ddbfb17fb8f 100644 --- a/init/initramfs.c +++ b/init/initramfs.c @@ -19,6 +19,7 @@ #include <linux/init_syscalls.h> #include <linux/umh.h> #include <linux/security.h> +#include <linux/overflow.h> #include "do_mounts.h" #include "initramfs_internal.h" @@ -108,7 +109,7 @@ static char __init *find_link(int major, int minor, int ino, q->minor = minor; q->ino = ino; q->mode = mode; - strcpy(q->name, name); + strscpy(q->name, name); q->next = NULL; *p = q; hardlink_seen = true; @@ -152,7 +153,7 @@ static void __init dir_add(const char *name, size_t nlen, time64_t mtime) { struct dir_entry *de; - de = kmalloc(sizeof(struct dir_entry) + nlen, GFP_KERNEL); + de = kmalloc(struct_size(de, name, nlen), GFP_KERNEL); if (!de) panic_show_mem("can't allocate dir_entry buffer"); INIT_LIST_HEAD(&de->list); diff --git a/kernel/bpf/inode.c b/kernel/bpf/inode.c index 5c2e96b19392..6d021d18afa6 100644 --- a/kernel/bpf/inode.c +++ b/kernel/bpf/inode.c @@ -788,7 +788,7 @@ static void bpf_free_inode(struct inode *inode) const struct super_operations bpf_super_ops = { .statfs = simple_statfs, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .show_options = bpf_show_options, .free_inode = bpf_free_inode, }; diff --git a/kernel/pid.c b/kernel/pid.c index c45a28c16cd2..d94ce0250501 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -680,7 +680,7 @@ static int pid_table_root_permissions(struct ctl_table_header *head, container_of(head->set, struct pid_namespace, set); int mode = table->mode; - if (ns_capable(pidns->user_ns, CAP_SYS_ADMIN) || + if (ns_capable_noaudit(pidns->user_ns, CAP_SYS_ADMIN) || uid_eq(current_euid(), make_kuid(pidns->user_ns, 0))) mode = (mode & S_IRWXU) >> 6; else if (in_egroup_p(make_kgid(pidns->user_ns, 0))) diff --git a/kernel/pid_namespace.c b/kernel/pid_namespace.c index 7098ed44e717..b7b45c2597ec 100644 --- a/kernel/pid_namespace.c +++ b/kernel/pid_namespace.c @@ -390,11 +390,23 @@ static void pidns_put(struct ns_common *ns) put_pid_ns(to_pid_ns(ns)); } +bool pidns_is_ancestor(struct pid_namespace *child, + struct pid_namespace *ancestor) +{ + struct pid_namespace *ns; + + if (child->level < ancestor->level) + return false; + for (ns = child; ns->level > ancestor->level; ns = ns->parent) + ; + return ns == ancestor; +} + static int pidns_install(struct nsset *nsset, struct ns_common *ns) { struct nsproxy *nsproxy = nsset->nsproxy; struct pid_namespace *active = task_active_pid_ns(current); - struct pid_namespace *ancestor, *new = to_pid_ns(ns); + struct pid_namespace *new = to_pid_ns(ns); if (!ns_capable(new->user_ns, CAP_SYS_ADMIN) || !ns_capable(nsset->cred->user_ns, CAP_SYS_ADMIN)) @@ -408,13 +420,7 @@ static int pidns_install(struct nsset *nsset, struct ns_common *ns) * this maintains the property that processes and their * children can not escape their current pid namespace. */ - if (new->level < active->level) - return -EINVAL; - - ancestor = new; - while (ancestor->level > active->level) - ancestor = ancestor->parent; - if (ancestor != active) + if (!pidns_is_ancestor(new, active)) return -EINVAL; put_pid_ns(nsproxy->pid_ns_for_children); diff --git a/mm/shmem.c b/mm/shmem.c index e2c76a30802b..932727247c64 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -5341,7 +5341,7 @@ static const struct super_operations shmem_ops = { .get_dquots = shmem_get_dquots, #endif .evict_inode = shmem_evict_inode, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .put_super = shmem_put_super, #ifdef CONFIG_TRANSPARENT_HUGEPAGE .nr_cached_objects = shmem_unused_huge_count, diff --git a/net/socket.c b/net/socket.c index 682969deaed3..bac335ecee4c 100644 --- a/net/socket.c +++ b/net/socket.c @@ -1176,6 +1176,9 @@ static ssize_t sock_write_iter(struct kiocb *iocb, struct iov_iter *from) if (sock->type == SOCK_SEQPACKET) msg.msg_flags |= MSG_EOR; + if (iocb->ki_flags & IOCB_NOSIGNAL) + msg.msg_flags |= MSG_NOSIGNAL; + res = __sock_sendmsg(sock, &msg); *from = msg.msg_iter; return res; diff --git a/tools/testing/selftests/proc/.gitignore b/tools/testing/selftests/proc/.gitignore index 19bb333e2485..6b78a8382d40 100644 --- a/tools/testing/selftests/proc/.gitignore +++ b/tools/testing/selftests/proc/.gitignore @@ -18,6 +18,7 @@ /proc-tid0 /proc-uptime-001 /proc-uptime-002 +/proc-pidns /read /self /setns-dcache diff --git a/tools/testing/selftests/proc/Makefile b/tools/testing/selftests/proc/Makefile index 50aba102201a..be3013515aae 100644 --- a/tools/testing/selftests/proc/Makefile +++ b/tools/testing/selftests/proc/Makefile @@ -28,5 +28,6 @@ TEST_GEN_PROGS += setns-sysvipc TEST_GEN_PROGS += thread-self TEST_GEN_PROGS += proc-multiple-procfs TEST_GEN_PROGS += proc-fsconfig-hidepid +TEST_GEN_PROGS += proc-pidns include ../lib.mk diff --git a/tools/testing/selftests/proc/proc-pidns.c b/tools/testing/selftests/proc/proc-pidns.c new file mode 100644 index 000000000000..52500597f951 --- /dev/null +++ b/tools/testing/selftests/proc/proc-pidns.c @@ -0,0 +1,211 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Author: Aleksa Sarai <cyphar@cyphar.com> + * Copyright (C) 2025 SUSE LLC. + */ + +#include <assert.h> +#include <errno.h> +#include <sched.h> +#include <stdbool.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <stdio.h> +#include <sys/mount.h> +#include <sys/stat.h> +#include <sys/prctl.h> + +#include "../kselftest_harness.h" + +#define ASSERT_ERRNO(expected, _t, seen) \ + __EXPECT(expected, #expected, \ + ({__typeof__(seen) _tmp_seen = (seen); \ + _tmp_seen >= 0 ? _tmp_seen : -errno; }), #seen, _t, 1) + +#define ASSERT_ERRNO_EQ(expected, seen) \ + ASSERT_ERRNO(expected, ==, seen) + +#define ASSERT_SUCCESS(seen) \ + ASSERT_ERRNO(0, <=, seen) + +static int touch(char *path) +{ + int fd = open(path, O_WRONLY|O_CREAT|O_CLOEXEC, 0644); + if (fd < 0) + return -1; + return close(fd); +} + +FIXTURE(ns) +{ + int host_mntns, host_pidns; + int dummy_pidns; +}; + +FIXTURE_SETUP(ns) +{ + /* Stash the old mntns. */ + self->host_mntns = open("/proc/self/ns/mnt", O_RDONLY|O_CLOEXEC); + ASSERT_SUCCESS(self->host_mntns); + + /* Create a new mount namespace and make it private. */ + ASSERT_SUCCESS(unshare(CLONE_NEWNS)); + ASSERT_SUCCESS(mount(NULL, "/", NULL, MS_PRIVATE|MS_REC, NULL)); + + /* + * Create a proper tmpfs that we can use and will disappear once we + * leave this mntns. + */ + ASSERT_SUCCESS(mount("tmpfs", "/tmp", "tmpfs", 0, NULL)); + + /* + * Create a pidns we can use for later tests. We need to fork off a + * child so that we get a usable nsfd that we can bind-mount and open. + */ + ASSERT_SUCCESS(mkdir("/tmp/dummy", 0755)); + ASSERT_SUCCESS(touch("/tmp/dummy/pidns")); + ASSERT_SUCCESS(mkdir("/tmp/dummy/proc", 0755)); + + self->host_pidns = open("/proc/self/ns/pid", O_RDONLY|O_CLOEXEC); + ASSERT_SUCCESS(self->host_pidns); + ASSERT_SUCCESS(unshare(CLONE_NEWPID)); + + pid_t pid = fork(); + ASSERT_SUCCESS(pid); + if (!pid) { + prctl(PR_SET_PDEATHSIG, SIGKILL); + ASSERT_SUCCESS(mount("/proc/self/ns/pid", "/tmp/dummy/pidns", NULL, MS_BIND, NULL)); + ASSERT_SUCCESS(mount("proc", "/tmp/dummy/proc", "proc", 0, NULL)); + exit(0); + } + + int wstatus; + ASSERT_EQ(waitpid(pid, &wstatus, 0), pid); + ASSERT_TRUE(WIFEXITED(wstatus)); + ASSERT_EQ(WEXITSTATUS(wstatus), 0); + + ASSERT_SUCCESS(setns(self->host_pidns, CLONE_NEWPID)); + + self->dummy_pidns = open("/tmp/dummy/pidns", O_RDONLY|O_CLOEXEC); + ASSERT_SUCCESS(self->dummy_pidns); +} + +FIXTURE_TEARDOWN(ns) +{ + ASSERT_SUCCESS(setns(self->host_mntns, CLONE_NEWNS)); + ASSERT_SUCCESS(close(self->host_mntns)); + + ASSERT_SUCCESS(close(self->host_pidns)); + ASSERT_SUCCESS(close(self->dummy_pidns)); +} + +TEST_F(ns, pidns_mount_string_path) +{ + ASSERT_SUCCESS(mkdir("/tmp/proc-host", 0755)); + ASSERT_SUCCESS(mount("proc", "/tmp/proc-host", "proc", 0, "pidns=/proc/self/ns/pid")); + ASSERT_SUCCESS(access("/tmp/proc-host/self/", X_OK)); + + ASSERT_SUCCESS(mkdir("/tmp/proc-dummy", 0755)); + ASSERT_SUCCESS(mount("proc", "/tmp/proc-dummy", "proc", 0, "pidns=/tmp/dummy/pidns")); + ASSERT_ERRNO_EQ(-ENOENT, access("/tmp/proc-dummy/1/", X_OK)); + ASSERT_ERRNO_EQ(-ENOENT, access("/tmp/proc-dummy/self/", X_OK)); +} + +TEST_F(ns, pidns_fsconfig_string_path) +{ + int fsfd = fsopen("proc", FSOPEN_CLOEXEC); + ASSERT_SUCCESS(fsfd); + + ASSERT_SUCCESS(fsconfig(fsfd, FSCONFIG_SET_STRING, "pidns", "/tmp/dummy/pidns", 0)); + ASSERT_SUCCESS(fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0)); + + int mountfd = fsmount(fsfd, FSMOUNT_CLOEXEC, 0); + ASSERT_SUCCESS(mountfd); + + ASSERT_ERRNO_EQ(-ENOENT, faccessat(mountfd, "1/", X_OK, 0)); + ASSERT_ERRNO_EQ(-ENOENT, faccessat(mountfd, "self/", X_OK, 0)); + + ASSERT_SUCCESS(close(fsfd)); + ASSERT_SUCCESS(close(mountfd)); +} + +TEST_F(ns, pidns_fsconfig_fd) +{ + int fsfd = fsopen("proc", FSOPEN_CLOEXEC); + ASSERT_SUCCESS(fsfd); + + ASSERT_SUCCESS(fsconfig(fsfd, FSCONFIG_SET_FD, "pidns", NULL, self->dummy_pidns)); + ASSERT_SUCCESS(fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0)); + + int mountfd = fsmount(fsfd, FSMOUNT_CLOEXEC, 0); + ASSERT_SUCCESS(mountfd); + + ASSERT_ERRNO_EQ(-ENOENT, faccessat(mountfd, "1/", X_OK, 0)); + ASSERT_ERRNO_EQ(-ENOENT, faccessat(mountfd, "self/", X_OK, 0)); + + ASSERT_SUCCESS(close(fsfd)); + ASSERT_SUCCESS(close(mountfd)); +} + +TEST_F(ns, pidns_reconfigure_remount) +{ + ASSERT_SUCCESS(mkdir("/tmp/proc", 0755)); + ASSERT_SUCCESS(mount("proc", "/tmp/proc", "proc", 0, "")); + + ASSERT_SUCCESS(access("/tmp/proc/1/", X_OK)); + ASSERT_SUCCESS(access("/tmp/proc/self/", X_OK)); + + ASSERT_ERRNO_EQ(-EBUSY, mount(NULL, "/tmp/proc", NULL, MS_REMOUNT, "pidns=/tmp/dummy/pidns")); + + ASSERT_SUCCESS(access("/tmp/proc/1/", X_OK)); + ASSERT_SUCCESS(access("/tmp/proc/self/", X_OK)); +} + +TEST_F(ns, pidns_reconfigure_fsconfig_string_path) +{ + int fsfd = fsopen("proc", FSOPEN_CLOEXEC); + ASSERT_SUCCESS(fsfd); + + ASSERT_SUCCESS(fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0)); + + int mountfd = fsmount(fsfd, FSMOUNT_CLOEXEC, 0); + ASSERT_SUCCESS(mountfd); + + ASSERT_SUCCESS(faccessat(mountfd, "1/", X_OK, 0)); + ASSERT_SUCCESS(faccessat(mountfd, "self/", X_OK, 0)); + + ASSERT_ERRNO_EQ(-EBUSY, fsconfig(fsfd, FSCONFIG_SET_STRING, "pidns", "/tmp/dummy/pidns", 0)); + ASSERT_SUCCESS(fsconfig(fsfd, FSCONFIG_CMD_RECONFIGURE, NULL, NULL, 0)); /* noop */ + + ASSERT_SUCCESS(faccessat(mountfd, "1/", X_OK, 0)); + ASSERT_SUCCESS(faccessat(mountfd, "self/", X_OK, 0)); + + ASSERT_SUCCESS(close(fsfd)); + ASSERT_SUCCESS(close(mountfd)); +} + +TEST_F(ns, pidns_reconfigure_fsconfig_fd) +{ + int fsfd = fsopen("proc", FSOPEN_CLOEXEC); + ASSERT_SUCCESS(fsfd); + + ASSERT_SUCCESS(fsconfig(fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0)); + + int mountfd = fsmount(fsfd, FSMOUNT_CLOEXEC, 0); + ASSERT_SUCCESS(mountfd); + + ASSERT_SUCCESS(faccessat(mountfd, "1/", X_OK, 0)); + ASSERT_SUCCESS(faccessat(mountfd, "self/", X_OK, 0)); + + ASSERT_ERRNO_EQ(-EBUSY, fsconfig(fsfd, FSCONFIG_SET_FD, "pidns", NULL, self->dummy_pidns)); + ASSERT_SUCCESS(fsconfig(fsfd, FSCONFIG_CMD_RECONFIGURE, NULL, NULL, 0)); /* noop */ + + ASSERT_SUCCESS(faccessat(mountfd, "1/", X_OK, 0)); + ASSERT_SUCCESS(faccessat(mountfd, "self/", X_OK, 0)); + + ASSERT_SUCCESS(close(fsfd)); + ASSERT_SUCCESS(close(mountfd)); +} + +TEST_HARNESS_MAIN |