diff options
36 files changed, 339 insertions, 258 deletions
diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst index 1639e78e3146..12a71ba221b8 100644 --- a/Documentation/filesystems/porting.rst +++ b/Documentation/filesystems/porting.rst @@ -1157,3 +1157,8 @@ in normal case it points into the pathname being looked up. NOTE: if you need something like full path from the root of filesystem, you are still on your own - this assists with simple cases, but it's not magic. + +--- + +** mandatory ** +invalidate_inodes() is gone use evict_inodes() instead. diff --git a/arch/arm64/kernel/elfcore.c b/arch/arm64/kernel/elfcore.c index 2e94d20c4ac7..b735f4c2fe5e 100644 --- a/arch/arm64/kernel/elfcore.c +++ b/arch/arm64/kernel/elfcore.c @@ -27,9 +27,10 @@ static int mte_dump_tag_range(struct coredump_params *cprm, int ret = 1; unsigned long addr; void *tags = NULL; + int locked = 0; for (addr = start; addr < start + len; addr += PAGE_SIZE) { - struct page *page = get_dump_page(addr); + struct page *page = get_dump_page(addr, &locked); /* * get_dump_page() returns NULL when encountering an empty diff --git a/arch/powerpc/platforms/pseries/papr-vpd.c b/arch/powerpc/platforms/pseries/papr-vpd.c index 1574176e3ffc..c86950d7105a 100644 --- a/arch/powerpc/platforms/pseries/papr-vpd.c +++ b/arch/powerpc/platforms/pseries/papr-vpd.c @@ -482,14 +482,13 @@ static long papr_vpd_create_handle(struct papr_location_code __user *ulc) goto free_blob; } - file = anon_inode_getfile("[papr-vpd]", &papr_vpd_handle_ops, - (void *)blob, O_RDONLY); + file = anon_inode_getfile_fmode("[papr-vpd]", &papr_vpd_handle_ops, + (void *)blob, O_RDONLY, + FMODE_LSEEK | FMODE_PREAD); if (IS_ERR(file)) { err = PTR_ERR(file); goto put_fd; } - - file->f_mode |= FMODE_LSEEK | FMODE_PREAD; fd_install(fd, file); return fd; put_fd: diff --git a/drivers/vfio/group.c b/drivers/vfio/group.c index 49559605177e..c321d442f0da 100644 --- a/drivers/vfio/group.c +++ b/drivers/vfio/group.c @@ -266,24 +266,12 @@ static struct file *vfio_device_open_file(struct vfio_device *device) if (ret) goto err_free; - /* - * We can't use anon_inode_getfd() because we need to modify - * the f_mode flags directly to allow more than just ioctls - */ - filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops, - df, O_RDWR); + filep = anon_inode_getfile_fmode("[vfio-device]", &vfio_device_fops, + df, O_RDWR, FMODE_PREAD | FMODE_PWRITE); if (IS_ERR(filep)) { ret = PTR_ERR(filep); goto err_close_device; } - - /* - * TODO: add an anon_inode interface to do this. - * Appears to be missing by lack of need rather than - * explicitly prevented. Now there's need. - */ - filep->f_mode |= (FMODE_PREAD | FMODE_PWRITE); - /* * Use the pseudo fs inode on the device to link all mmaps * to the same address space, allowing us to unmap all vmas diff --git a/fs/autofs/dev-ioctl.c b/fs/autofs/dev-ioctl.c index 6d57efbb8110..c5a6aae12d2c 100644 --- a/fs/autofs/dev-ioctl.c +++ b/fs/autofs/dev-ioctl.c @@ -442,7 +442,6 @@ static int autofs_dev_ioctl_timeout(struct file *fp, sbi->exp_timeout = timeout * HZ; } else { struct dentry *base = fp->f_path.dentry; - struct inode *inode = base->d_inode; int path_len = param->size - AUTOFS_DEV_IOCTL_SIZE - 1; struct dentry *dentry; struct autofs_info *ino; @@ -460,9 +459,7 @@ static int autofs_dev_ioctl_timeout(struct file *fp, "the parent autofs mount timeout which could " "prevent shutdown\n"); - inode_lock_shared(inode); dentry = try_lookup_one_len(param->path, base, path_len); - inode_unlock_shared(inode); if (IS_ERR_OR_NULL(dentry)) return dentry ? PTR_ERR(dentry) : -ENOENT; ino = autofs_dentry_ino(dentry); diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c index fe3de9ad57bf..d9bc67176128 100644 --- a/fs/cachefiles/ondemand.c +++ b/fs/cachefiles/ondemand.c @@ -317,8 +317,9 @@ static int cachefiles_ondemand_get_fd(struct cachefiles_req *req, goto err_free_id; } - anon_file->file = anon_inode_getfile("[cachefiles]", - &cachefiles_ondemand_fd_fops, object, O_WRONLY); + anon_file->file = anon_inode_getfile_fmode("[cachefiles]", + &cachefiles_ondemand_fd_fops, object, + O_WRONLY, FMODE_PWRITE | FMODE_LSEEK); if (IS_ERR(anon_file->file)) { ret = PTR_ERR(anon_file->file); goto err_put_fd; @@ -333,8 +334,6 @@ static int cachefiles_ondemand_get_fd(struct cachefiles_req *req, goto err_put_file; } - anon_file->file->f_mode |= FMODE_PWRITE | FMODE_LSEEK; - load = (void *)req->msg.data; load->fd = anon_file->fd; object->ondemand->ondemand_id = object_id; diff --git a/fs/coredump.c b/fs/coredump.c index 4375c70144d0..d6a92cd6018e 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -926,14 +926,23 @@ int dump_user_range(struct coredump_params *cprm, unsigned long start, { unsigned long addr; struct page *dump_page; + int locked, ret; dump_page = dump_page_alloc(); if (!dump_page) return 0; + ret = 0; + locked = 0; for (addr = start; addr < start + len; addr += PAGE_SIZE) { struct page *page; + if (!locked) { + if (mmap_read_lock_killable(current->mm)) + goto out; + locked = 1; + } + /* * To avoid having to allocate page tables for virtual address * ranges that have never been used yet, and also to make it @@ -941,21 +950,38 @@ int dump_user_range(struct coredump_params *cprm, unsigned long start, * NULL when encountering an empty page table entry that would * otherwise have been filled with the zero page. */ - page = get_dump_page(addr); + page = get_dump_page(addr, &locked); if (page) { + if (locked) { + mmap_read_unlock(current->mm); + locked = 0; + } int stop = !dump_emit_page(cprm, dump_page_copy(page, dump_page)); put_page(page); - if (stop) { - dump_page_free(dump_page); - return 0; - } + if (stop) + goto out; } else { dump_skip(cprm, PAGE_SIZE); } + + if (dump_interrupted()) + goto out; + + if (!need_resched()) + continue; + if (locked) { + mmap_read_unlock(current->mm); + locked = 0; + } cond_resched(); } + ret = 1; +out: + if (locked) + mmap_read_unlock(current->mm); + dump_page_free(dump_page); - return 1; + return ret; } #endif diff --git a/fs/dcache.c b/fs/dcache.c index e3634916ffb9..3ee84f62827a 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2480,7 +2480,8 @@ static inline void end_dir_add(struct inode *dir, unsigned int n, { smp_store_release(&dir->i_dir_seq, n + 2); preempt_enable_nested(); - wake_up_all(d_wait); + if (wq_has_sleeper(d_wait)) + wake_up_all(d_wait); } static void d_wait_lookup(struct dentry *dentry) diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c index 0b1c878317ab..e7b7f426fecf 100644 --- a/fs/ecryptfs/super.c +++ b/fs/ecryptfs/super.c @@ -172,7 +172,6 @@ const struct super_operations ecryptfs_sops = { .destroy_inode = ecryptfs_destroy_inode, .free_inode = ecryptfs_free_inode, .statfs = ecryptfs_statfs, - .remount_fs = NULL, .evict_inode = ecryptfs_evict_inode, .show_options = ecryptfs_show_options }; diff --git a/fs/eventfd.c b/fs/eventfd.c index 76129bfcd663..af42b2c7d235 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -406,14 +406,13 @@ static int do_eventfd(unsigned int count, int flags) if (fd < 0) goto err; - file = anon_inode_getfile("[eventfd]", &eventfd_fops, ctx, flags); + file = anon_inode_getfile_fmode("[eventfd]", &eventfd_fops, + ctx, flags, FMODE_NOWAIT); if (IS_ERR(file)) { put_unused_fd(fd); fd = PTR_ERR(file); goto err; } - - file->f_mode |= FMODE_NOWAIT; fd_install(fd, file); return fd; err: diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 7c0980db77b3..1fc770270ab8 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -438,7 +438,7 @@ static bool ep_busy_loop_end(void *p, unsigned long start_time) * * we must do our busy polling with irqs enabled */ -static bool ep_busy_loop(struct eventpoll *ep, int nonblock) +static bool ep_busy_loop(struct eventpoll *ep) { unsigned int napi_id = READ_ONCE(ep->napi_id); u16 budget = READ_ONCE(ep->busy_poll_budget); @@ -448,7 +448,7 @@ static bool ep_busy_loop(struct eventpoll *ep, int nonblock) budget = BUSY_POLL_BUDGET; if (napi_id >= MIN_NAPI_ID && ep_busy_loop_on(ep)) { - napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, + napi_busy_loop(napi_id, ep_busy_loop_end, ep, prefer_busy_poll, budget); if (ep_events_available(ep)) return true; @@ -560,7 +560,7 @@ static void ep_resume_napi_irqs(struct eventpoll *ep) #else -static inline bool ep_busy_loop(struct eventpoll *ep, int nonblock) +static inline bool ep_busy_loop(struct eventpoll *ep) { return false; } @@ -2047,7 +2047,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, if (timed_out) return 0; - eavail = ep_busy_loop(ep, timed_out); + eavail = ep_busy_loop(ep); if (eavail) continue; diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 0c899cfba578..b5845c4846b8 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -126,10 +126,8 @@ static struct dentry *reconnect_one(struct vfsmount *mnt, int err; parent = ERR_PTR(-EACCES); - inode_lock(dentry->d_inode); if (mnt->mnt_sb->s_export_op->get_parent) parent = mnt->mnt_sb->s_export_op->get_parent(dentry); - inode_unlock(dentry->d_inode); if (IS_ERR(parent)) { dprintk("get_parent of %lu failed, err %ld\n", diff --git a/fs/file.c b/fs/file.c index d868cdb95d1e..40fed4501aab 100644 --- a/fs/file.c +++ b/fs/file.c @@ -418,17 +418,25 @@ struct files_struct *dup_fd(struct files_struct *oldf, struct fd_range *punch_ho old_fds = old_fdt->fd; new_fds = new_fdt->fd; + /* + * We may be racing against fd allocation from other threads using this + * files_struct, despite holding ->file_lock. + * + * alloc_fd() might have already claimed a slot, while fd_install() + * did not populate it yet. Note the latter operates locklessly, so + * the file can show up as we are walking the array below. + * + * At the same time we know no files will disappear as all other + * operations take the lock. + * + * Instead of trying to placate userspace racing with itself, we + * ref the file if we see it and mark the fd slot as unused otherwise. + */ for (i = open_files; i != 0; i--) { - struct file *f = *old_fds++; + struct file *f = rcu_dereference_raw(*old_fds++); if (f) { get_file(f); } else { - /* - * The fd may be claimed in the fd bitmap but not yet - * instantiated in the files array if a sibling thread - * is partway through open(). So make sure that this - * fd is available to the new process. - */ __clear_open_fd(open_files - i, new_fdt); } rcu_assign_pointer(*new_fds++, f); @@ -577,6 +585,7 @@ repeat: __set_open_fd(fd, fdt, flags & O_CLOEXEC); error = fd; + VFS_BUG_ON(rcu_access_pointer(fdt->fd[fd]) != NULL); out: spin_unlock(&files->file_lock); @@ -612,22 +621,14 @@ void put_unused_fd(unsigned int fd) EXPORT_SYMBOL(put_unused_fd); -/* - * Install a file pointer in the fd array. - * - * The VFS is full of places where we drop the files lock between - * setting the open_fds bitmap and installing the file in the file - * array. At any such point, we are vulnerable to a dup2() race - * installing a file in the array before us. We need to detect this and - * fput() the struct file we are about to overwrite in this case. - * - * It should never happen - if we allow dup2() do it, _really_ bad things - * will follow. +/** + * fd_install - install a file pointer in the fd array + * @fd: file descriptor to install the file in + * @file: the file to install * * This consumes the "file" refcount, so callers should treat it * as if they had called fput(file). */ - void fd_install(unsigned int fd, struct file *file) { struct files_struct *files = current->files; @@ -642,7 +643,7 @@ void fd_install(unsigned int fd, struct file *file) rcu_read_unlock_sched(); spin_lock(&files->file_lock); fdt = files_fdtable(files); - WARN_ON(fdt->fd[fd] != NULL); + VFS_BUG_ON(rcu_access_pointer(fdt->fd[fd]) != NULL); rcu_assign_pointer(fdt->fd[fd], file); spin_unlock(&files->file_lock); return; @@ -650,7 +651,7 @@ void fd_install(unsigned int fd, struct file *file) /* coupled with smp_wmb() in expand_fdtable() */ smp_rmb(); fdt = rcu_dereference_sched(files->fdt); - BUG_ON(fdt->fd[fd] != NULL); + VFS_BUG_ON(rcu_access_pointer(fdt->fd[fd]) != NULL); rcu_assign_pointer(fdt->fd[fd], file); rcu_read_unlock_sched(); } @@ -679,7 +680,7 @@ struct file *file_close_fd_locked(struct files_struct *files, unsigned fd) return NULL; fd = array_index_nospec(fd, fdt->max_fds); - file = fdt->fd[fd]; + file = rcu_dereference_raw(fdt->fd[fd]); if (file) { rcu_assign_pointer(fdt->fd[fd], NULL); __put_unused_fd(files, fd); @@ -1182,6 +1183,16 @@ static inline bool file_needs_f_pos_lock(struct file *file) (file_count(file) > 1 || file->f_op->iterate_shared); } +bool file_seek_cur_needs_f_lock(struct file *file) +{ + if (!(file->f_mode & FMODE_ATOMIC_POS) && !file->f_op->iterate_shared) + return false; + + VFS_WARN_ON_ONCE((file_count(file) > 1) && + !mutex_is_locked(&file->f_pos_lock)); + return true; +} + struct fd fdget_pos(unsigned int fd) { struct fd f = fdget(fd); @@ -1230,14 +1241,34 @@ __releases(&files->file_lock) struct fdtable *fdt; /* - * We need to detect attempts to do dup2() over allocated but still - * not finished descriptor. + * dup2() is expected to close the file installed in the target fd slot + * (if any). However, userspace hand-picking a fd may be racing against + * its own threads which happened to allocate it in open() et al but did + * not populate it yet. + * + * Broadly speaking we may be racing against the following: + * fd = get_unused_fd_flags(); // fd slot reserved, ->fd[fd] == NULL + * file = hard_work_goes_here(); + * fd_install(fd, file); // only now ->fd[fd] == file + * + * It is an invariant that a successfully allocated fd has a NULL entry + * in the array until the matching fd_install(). + * + * If we fit the window, we have the fd to populate, yet no target file + * to close. Trying to ignore it and install our new file would violate + * the invariant and make fd_install() overwrite our file. + * + * Things can be done(tm) to handle this. However, the issue does not + * concern legitimate programs and we only need to make sure the kernel + * does not trip over it. + * + * The simplest way out is to return an error if we find ourselves here. * * POSIX is silent on the issue, we return -EBUSY. */ fdt = files_fdtable(files); fd = array_index_nospec(fd, fdt->max_fds); - tofree = fdt->fd[fd]; + tofree = rcu_dereference_raw(fdt->fd[fd]); if (!tofree && fd_is_open(fd, fdt)) goto Ebusy; get_file(file); diff --git a/fs/file_table.c b/fs/file_table.c index 5c00dc38558d..9f0a1a164c82 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -221,7 +221,8 @@ struct file *alloc_empty_file(int flags, const struct cred *cred) /* * Privileged users can go above max_files */ - if (get_nr_files() >= files_stat.max_files && !capable(CAP_SYS_ADMIN)) { + if (unlikely(get_nr_files() >= files_stat.max_files) && + !capable(CAP_SYS_ADMIN)) { /* * percpu_counters are inaccurate. Do an expensive check before * we go and fail. diff --git a/fs/inode.c b/fs/inode.c index 5587aabdaa5e..99318b157a9a 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -327,7 +327,17 @@ static void i_callback(struct rcu_head *head) free_inode_nonrcu(inode); } -static struct inode *alloc_inode(struct super_block *sb) +/** + * alloc_inode - obtain an inode + * @sb: superblock + * + * Allocates a new inode for given superblock. + * Inode wont be chained in superblock s_inodes list + * This means : + * - fs can't be unmount + * - quotas, fsnotify, writeback can't work + */ +struct inode *alloc_inode(struct super_block *sb) { const struct super_operations *ops = sb->s_op; struct inode *inode; @@ -613,18 +623,22 @@ static void inode_wait_for_lru_isolating(struct inode *inode) */ void inode_sb_list_add(struct inode *inode) { - spin_lock(&inode->i_sb->s_inode_list_lock); - list_add(&inode->i_sb_list, &inode->i_sb->s_inodes); - spin_unlock(&inode->i_sb->s_inode_list_lock); + struct super_block *sb = inode->i_sb; + + spin_lock(&sb->s_inode_list_lock); + list_add(&inode->i_sb_list, &sb->s_inodes); + spin_unlock(&sb->s_inode_list_lock); } EXPORT_SYMBOL_GPL(inode_sb_list_add); static inline void inode_sb_list_del(struct inode *inode) { + struct super_block *sb = inode->i_sb; + if (!list_empty(&inode->i_sb_list)) { - spin_lock(&inode->i_sb->s_inode_list_lock); + spin_lock(&sb->s_inode_list_lock); list_del_init(&inode->i_sb_list); - spin_unlock(&inode->i_sb->s_inode_list_lock); + spin_unlock(&sb->s_inode_list_lock); } } @@ -806,23 +820,16 @@ static void evict(struct inode *inode) /* * Wake up waiters in __wait_on_freeing_inode(). * - * Lockless hash lookup may end up finding the inode before we removed - * it above, but only lock it *after* we are done with the wakeup below. - * In this case the potential waiter cannot safely block. + * It is an invariant that any thread we need to wake up is already + * accounted for before remove_inode_hash() acquires ->i_lock -- both + * sides take the lock and sleep is aborted if the inode is found + * unhashed. Thus either the sleeper wins and goes off CPU, or removal + * wins and the sleeper aborts after testing with the lock. * - * The inode being unhashed after the call to remove_inode_hash() is - * used as an indicator whether blocking on it is safe. + * This also means we don't need any fences for the call below. */ - spin_lock(&inode->i_lock); - /* - * Pairs with the barrier in prepare_to_wait_event() to make sure - * ___wait_var_event() either sees the bit cleared or - * waitqueue_active() check in wake_up_var() sees the waiter. - */ - smp_mb__after_spinlock(); inode_wake_up_bit(inode, __I_NEW); BUG_ON(inode->i_state != (I_FREEING | I_CLEAR)); - spin_unlock(&inode->i_lock); destroy_inode(inode); } @@ -900,46 +907,6 @@ again: } EXPORT_SYMBOL_GPL(evict_inodes); -/** - * invalidate_inodes - attempt to free all inodes on a superblock - * @sb: superblock to operate on - * - * Attempts to free all inodes (including dirty inodes) for a given superblock. - */ -void invalidate_inodes(struct super_block *sb) -{ - struct inode *inode, *next; - LIST_HEAD(dispose); - -again: - spin_lock(&sb->s_inode_list_lock); - list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { - spin_lock(&inode->i_lock); - if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { - spin_unlock(&inode->i_lock); - continue; - } - if (atomic_read(&inode->i_count)) { - spin_unlock(&inode->i_lock); - continue; - } - - inode->i_state |= I_FREEING; - inode_lru_list_del(inode); - spin_unlock(&inode->i_lock); - list_add(&inode->i_lru, &dispose); - if (need_resched()) { - spin_unlock(&sb->s_inode_list_lock); - cond_resched(); - dispose_list(&dispose); - goto again; - } - } - spin_unlock(&sb->s_inode_list_lock); - - dispose_list(&dispose); -} - /* * Isolate the inode from the LRU in preparation for freeing it. * @@ -1160,21 +1127,6 @@ unsigned int get_next_ino(void) EXPORT_SYMBOL(get_next_ino); /** - * new_inode_pseudo - obtain an inode - * @sb: superblock - * - * Allocates a new inode for given superblock. - * Inode wont be chained in superblock s_inodes list - * This means : - * - fs can't be unmount - * - quotas, fsnotify, writeback can't work - */ -struct inode *new_inode_pseudo(struct super_block *sb) -{ - return alloc_inode(sb); -} - -/** * new_inode - obtain an inode * @sb: superblock * @@ -1190,7 +1142,7 @@ struct inode *new_inode(struct super_block *sb) { struct inode *inode; - inode = new_inode_pseudo(sb); + inode = alloc_inode(sb); if (inode) inode_sb_list_add(inode); return inode; @@ -1348,8 +1300,8 @@ again: } if (set && unlikely(set(inode, data))) { - inode = NULL; - goto unlock; + spin_unlock(&inode_hash_lock); + return NULL; } /* @@ -1361,14 +1313,14 @@ again: hlist_add_head_rcu(&inode->i_hash, head); spin_unlock(&inode->i_lock); + spin_unlock(&inode_hash_lock); + /* * Add inode to the sb list if it's not already. It has I_NEW at this * point, so it should be safe to test i_sb_list locklessly. */ if (list_empty(&inode->i_sb_list)) inode_sb_list_add(inode); -unlock: - spin_unlock(&inode_hash_lock); return inode; } @@ -1497,8 +1449,8 @@ again: inode->i_state = I_NEW; hlist_add_head_rcu(&inode->i_hash, head); spin_unlock(&inode->i_lock); - inode_sb_list_add(inode); spin_unlock(&inode_hash_lock); + inode_sb_list_add(inode); /* Return the locked inode with I_NEW set, the * caller is responsible for filling in the contents @@ -2953,3 +2905,18 @@ umode_t mode_strip_sgid(struct mnt_idmap *idmap, return mode & ~S_ISGID; } EXPORT_SYMBOL(mode_strip_sgid); + +#ifdef CONFIG_DEBUG_VFS +/* + * Dump an inode. + * + * TODO: add a proper inode dumping routine, this is a stub to get debug off the + * ground. + */ +void dump_inode(struct inode *inode, const char *reason) +{ + pr_warn("%s encountered for inode %px", reason, inode); +} + +EXPORT_SYMBOL(dump_inode); +#endif diff --git a/fs/internal.h b/fs/internal.h index e7f02ae1e098..82127c69e641 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -187,8 +187,8 @@ extern struct open_how build_open_how(int flags, umode_t mode); extern int build_open_flags(const struct open_how *how, struct open_flags *op); struct file *file_close_fd_locked(struct files_struct *files, unsigned fd); -long do_ftruncate(struct file *file, loff_t length, int small); -long do_sys_ftruncate(unsigned int fd, loff_t length, int small); +int do_ftruncate(struct file *file, loff_t length, int small); +int do_sys_ftruncate(unsigned int fd, loff_t length, int small); int chmod_common(const struct path *path, umode_t mode); int do_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group, int flag); @@ -207,7 +207,6 @@ bool in_group_or_capable(struct mnt_idmap *idmap, * fs-writeback.c */ extern long get_nr_dirty_inodes(void); -void invalidate_inodes(struct super_block *sb); /* * dcache.c @@ -338,3 +337,4 @@ static inline bool path_mounted(const struct path *path) return path->mnt->mnt_root == path->dentry; } void file_f_owner_release(struct file *file); +bool file_seek_cur_needs_f_lock(struct file *file); diff --git a/fs/ioctl.c b/fs/ioctl.c index 638a36be31c1..c91fd2b46a77 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -41,7 +41,7 @@ * * Returns 0 on success, -errno on error. */ -long vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +int vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { int error = -ENOTTY; @@ -228,8 +228,8 @@ static int ioctl_fiemap(struct file *filp, struct fiemap __user *ufiemap) return error; } -static long ioctl_file_clone(struct file *dst_file, unsigned long srcfd, - u64 off, u64 olen, u64 destoff) +static int ioctl_file_clone(struct file *dst_file, unsigned long srcfd, + u64 off, u64 olen, u64 destoff) { CLASS(fd, src_file)(srcfd); loff_t cloned; @@ -248,8 +248,8 @@ static long ioctl_file_clone(struct file *dst_file, unsigned long srcfd, return ret; } -static long ioctl_file_clone_range(struct file *file, - struct file_clone_range __user *argp) +static int ioctl_file_clone_range(struct file *file, + struct file_clone_range __user *argp) { struct file_clone_range args; diff --git a/fs/namei.c b/fs/namei.c index ecb7b95c2ca3..2a12238e696a 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -125,6 +125,13 @@ #define EMBEDDED_NAME_MAX (PATH_MAX - offsetof(struct filename, iname)) +static inline void initname(struct filename *name) +{ + name->uptr = NULL; + name->aname = NULL; + atomic_set(&name->refcnt, 1); +} + struct filename * getname_flags(const char __user *filename, int flags) { @@ -203,10 +210,7 @@ getname_flags(const char __user *filename, int flags) return ERR_PTR(-ENAMETOOLONG); } } - - atomic_set(&result->refcnt, 1); - result->uptr = filename; - result->aname = NULL; + initname(result); audit_getname(result); return result; } @@ -218,11 +222,6 @@ struct filename *getname_uflags(const char __user *filename, int uflags) return getname_flags(filename, flags); } -struct filename *getname(const char __user * filename) -{ - return getname_flags(filename, 0); -} - struct filename *__getname_maybe_null(const char __user *pathname) { struct filename *name; @@ -269,25 +268,27 @@ struct filename *getname_kernel(const char * filename) return ERR_PTR(-ENAMETOOLONG); } memcpy((char *)result->name, filename, len); - result->uptr = NULL; - result->aname = NULL; - atomic_set(&result->refcnt, 1); + initname(result); audit_getname(result); - return result; } EXPORT_SYMBOL(getname_kernel); void putname(struct filename *name) { + int refcnt; + if (IS_ERR_OR_NULL(name)) return; - if (WARN_ON_ONCE(!atomic_read(&name->refcnt))) - return; + refcnt = atomic_read(&name->refcnt); + if (refcnt != 1) { + if (WARN_ON_ONCE(!refcnt)) + return; - if (!atomic_dec_and_test(&name->refcnt)) - return; + if (!atomic_dec_and_test(&name->refcnt)) + return; + } if (name->name != name->iname) { __putname(name->name); @@ -2863,15 +2864,14 @@ static int lookup_one_common(struct mnt_idmap *idmap, * Note that this routine is purely a helper for filesystem usage and should * not be called by generic code. * - * The caller must hold base->i_mutex. + * No locks need be held - only a counted reference to @base is needed. + * */ struct dentry *try_lookup_one_len(const char *name, struct dentry *base, int len) { struct qstr this; int err; - WARN_ON_ONCE(!inode_is_locked(base->d_inode)); - err = lookup_one_common(&nop_mnt_idmap, name, base, len, &this); if (err) return ERR_PTR(err); @@ -3415,6 +3415,8 @@ static int may_open(struct mnt_idmap *idmap, const struct path *path, if ((acc_mode & MAY_EXEC) && path_noexec(path)) return -EACCES; break; + default: + VFS_BUG_ON_INODE(1, inode); } error = inode_permission(idmap, inode, MAY_OPEN | acc_mode); diff --git a/fs/open.c b/fs/open.c index 1be20de9f283..201497f65590 100644 --- a/fs/open.c +++ b/fs/open.c @@ -67,11 +67,11 @@ int do_truncate(struct mnt_idmap *idmap, struct dentry *dentry, return ret; } -long vfs_truncate(const struct path *path, loff_t length) +int vfs_truncate(const struct path *path, loff_t length) { struct mnt_idmap *idmap; struct inode *inode; - long error; + int error; inode = path->dentry->d_inode; @@ -123,7 +123,7 @@ mnt_drop_write_and_out: } EXPORT_SYMBOL_GPL(vfs_truncate); -long do_sys_truncate(const char __user *pathname, loff_t length) +int do_sys_truncate(const char __user *pathname, loff_t length) { unsigned int lookup_flags = LOOKUP_FOLLOW; struct path path; @@ -157,7 +157,7 @@ COMPAT_SYSCALL_DEFINE2(truncate, const char __user *, path, compat_off_t, length } #endif -long do_ftruncate(struct file *file, loff_t length, int small) +int do_ftruncate(struct file *file, loff_t length, int small) { struct inode *inode; struct dentry *dentry; @@ -196,7 +196,7 @@ long do_ftruncate(struct file *file, loff_t length, int small) return error; } -long do_sys_ftruncate(unsigned int fd, loff_t length, int small) +int do_sys_ftruncate(unsigned int fd, loff_t length, int small) { if (length < 0) return -EINVAL; @@ -251,7 +251,7 @@ COMPAT_SYSCALL_DEFINE3(ftruncate64, unsigned int, fd, int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); - long ret; + int ret; loff_t sum; if (offset < 0 || len <= 0) @@ -460,7 +460,7 @@ static const struct cred *access_override_creds(void) return override_creds(override_cred); } -static long do_faccessat(int dfd, const char __user *filename, int mode, int flags) +static int do_faccessat(int dfd, const char __user *filename, int mode, int flags) { struct path path; struct inode *inode; @@ -1409,22 +1409,23 @@ struct file *file_open_root(const struct path *root, } EXPORT_SYMBOL(file_open_root); -static long do_sys_openat2(int dfd, const char __user *filename, - struct open_how *how) +static int do_sys_openat2(int dfd, const char __user *filename, + struct open_how *how) { struct open_flags op; - int fd = build_open_flags(how, &op); struct filename *tmp; + int err, fd; - if (fd) - return fd; + err = build_open_flags(how, &op); + if (unlikely(err)) + return err; tmp = getname(filename); if (IS_ERR(tmp)) return PTR_ERR(tmp); fd = get_unused_fd_flags(how->flags); - if (fd >= 0) { + if (likely(fd >= 0)) { struct file *f = do_filp_open(dfd, tmp, &op); if (IS_ERR(f)) { put_unused_fd(fd); @@ -1437,7 +1438,7 @@ static long do_sys_openat2(int dfd, const char __user *filename, return fd; } -long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode) +int do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode) { struct open_how how = build_open_how(flags, mode); return do_sys_openat2(dfd, filename, &how); diff --git a/fs/read_write.c b/fs/read_write.c index a6133241dfb8..bb0ed26a0b3a 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -169,11 +169,16 @@ generic_file_llseek_size(struct file *file, loff_t offset, int whence, if (whence == SEEK_CUR) { /* - * f_lock protects against read/modify/write race with - * other SEEK_CURs. Note that parallel writes and reads - * behave like SEEK_SET. + * If the file requires locking via f_pos_lock we know + * that mutual exclusion for SEEK_CUR on the same file + * is guaranteed. If the file isn't locked, we take + * f_lock to protect against f_pos races with other + * SEEK_CURs. */ - guard(spinlock)(&file->f_lock); + if (file_seek_cur_needs_f_lock(file)) { + guard(spinlock)(&file->f_lock); + return vfs_setpos(file, file->f_pos + offset, maxsize); + } return vfs_setpos(file, file->f_pos + offset, maxsize); } diff --git a/fs/signalfd.c b/fs/signalfd.c index d1a5f43ce466..d469782f97f4 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -277,15 +277,14 @@ static int do_signalfd4(int ufd, sigset_t *mask, int flags) return ufd; } - file = anon_inode_getfile("[signalfd]", &signalfd_fops, ctx, - O_RDWR | (flags & O_NONBLOCK)); + file = anon_inode_getfile_fmode("[signalfd]", &signalfd_fops, + ctx, O_RDWR | (flags & O_NONBLOCK), + FMODE_NOWAIT); if (IS_ERR(file)) { put_unused_fd(ufd); kfree(ctx); return PTR_ERR(file); } - file->f_mode |= FMODE_NOWAIT; - fd_install(ufd, file); } else { CLASS(fd, f)(ufd); diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index 8582cf61242c..9e4f7378f30f 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -388,7 +388,7 @@ cifs_mark_open_files_invalid(struct cifs_tcon *tcon) spin_unlock(&tcon->tc_lock); /* - * BB Add call to invalidate_inodes(sb) for all superblocks mounted + * BB Add call to evict_inodes(sb) for all superblocks mounted * to this tcon. */ } diff --git a/fs/super.c b/fs/super.c index 723176dee229..97a17f9d9023 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1417,7 +1417,7 @@ static void fs_bdev_mark_dead(struct block_device *bdev, bool surprise) if (!surprise) sync_filesystem(sb); shrink_dcache_sb(sb); - invalidate_inodes(sb); + evict_inodes(sb); if (sb->s_op->shutdown) sb->s_op->shutdown(sb); diff --git a/fs/timerfd.c b/fs/timerfd.c index 9f7eb451a60f..753e22e83e0f 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -439,15 +439,15 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags) return ufd; } - file = anon_inode_getfile("[timerfd]", &timerfd_fops, ctx, - O_RDWR | (flags & TFD_SHARED_FCNTL_FLAGS)); + file = anon_inode_getfile_fmode("[timerfd]", &timerfd_fops, ctx, + O_RDWR | (flags & TFD_SHARED_FCNTL_FLAGS), + FMODE_NOWAIT); if (IS_ERR(file)) { put_unused_fd(ufd); kfree(ctx); return PTR_ERR(file); } - file->f_mode |= FMODE_NOWAIT; fd_install(ufd, file); return ufd; } diff --git a/include/linux/fs.h b/include/linux/fs.h index b570138ff114..91b1e191530c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2,6 +2,7 @@ #ifndef _LINUX_FS_H #define _LINUX_FS_H +#include <linux/vfsdebug.h> #include <linux/linkage.h> #include <linux/wait_bit.h> #include <linux/kdev_t.h> @@ -790,19 +791,8 @@ struct inode { static inline void inode_set_cached_link(struct inode *inode, char *link, int linklen) { - int testlen; - - /* - * TODO: patch it into a debug-only check if relevant macros show up. - * In the meantime, since we are suffering strlen even on production kernels - * to find the right length, do a fixup if the wrong value got passed. - */ - testlen = strlen(link); - if (testlen != linklen) { - WARN_ONCE(1, "bad length passed for symlink [%s] (got %d, expected %d)", - link, linklen, testlen); - linklen = testlen; - } + VFS_WARN_ON_INODE(strlen(link) != linklen, inode); + VFS_WARN_ON_INODE(inode->i_opflags & IOP_CACHED_LINK, inode); inode->i_link = link; inode->i_linklen = linklen; inode->i_opflags |= IOP_CACHED_LINK; @@ -1067,7 +1057,6 @@ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index) /** * struct file - Represents a file - * @f_ref: reference count * @f_lock: Protects f_ep, f_flags. Must not be taken from IRQ context. * @f_mode: FMODE_* flags often used in hotpaths * @f_op: file operations @@ -1077,12 +1066,12 @@ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index) * @f_flags: file flags * @f_iocb_flags: iocb flags * @f_cred: stashed credentials of creator/opener + * @f_owner: file owner * @f_path: path of the file * @f_pos_lock: lock protecting file position * @f_pipe: specific to pipes * @f_pos: file position * @f_security: LSM security context of this file - * @f_owner: file owner * @f_wb_err: writeback error * @f_sb_err: per sb writeback errors * @f_ep: link of all epoll hooks for this file @@ -1090,9 +1079,9 @@ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index) * @f_llist: work queue entrypoint * @f_ra: file's readahead state * @f_freeptr: Pointer used by SLAB_TYPESAFE_BY_RCU file cache (don't touch.) + * @f_ref: reference count */ struct file { - file_ref_t f_ref; spinlock_t f_lock; fmode_t f_mode; const struct file_operations *f_op; @@ -1102,6 +1091,7 @@ struct file { unsigned int f_flags; unsigned int f_iocb_flags; const struct cred *f_cred; + struct fown_struct *f_owner; /* --- cacheline 1 boundary (64 bytes) --- */ struct path f_path; union { @@ -1115,7 +1105,6 @@ struct file { void *f_security; #endif /* --- cacheline 2 boundary (128 bytes) --- */ - struct fown_struct *f_owner; errseq_t f_wb_err; errseq_t f_sb_err; #ifdef CONFIG_EPOLL @@ -1127,6 +1116,7 @@ struct file { struct file_ra_state f_ra; freeptr_t f_freeptr; }; + file_ref_t f_ref; /* --- cacheline 3 boundary (192 bytes) --- */ } __randomize_layout __attribute__((aligned(4))); /* lest something weird decides that 2 is OK */ @@ -2039,7 +2029,7 @@ int vfs_fchown(struct file *file, uid_t user, gid_t group); int vfs_fchmod(struct file *file, umode_t mode); int vfs_utimes(const struct path *path, struct timespec64 *times); -extern long vfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +int vfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); #ifdef CONFIG_COMPAT extern long compat_ptr_ioctl(struct file *file, unsigned int cmd, @@ -2791,13 +2781,13 @@ static inline bool is_idmapped_mnt(const struct vfsmount *mnt) return mnt_idmap(mnt) != &nop_mnt_idmap; } -extern long vfs_truncate(const struct path *, loff_t); +int vfs_truncate(const struct path *, loff_t); int do_truncate(struct mnt_idmap *, struct dentry *, loff_t start, unsigned int time_attrs, struct file *filp); extern int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len); -extern long do_sys_open(int dfd, const char __user *filename, int flags, - umode_t mode); +int do_sys_open(int dfd, const char __user *filename, int flags, + umode_t mode); extern struct file *file_open_name(struct filename *, int, umode_t); extern struct file *filp_open(const char *, int, umode_t); extern struct file *file_open_root(const struct path *, @@ -2848,7 +2838,10 @@ extern int filp_close(struct file *, fl_owner_t id); extern struct filename *getname_flags(const char __user *, int); extern struct filename *getname_uflags(const char __user *, int); -extern struct filename *getname(const char __user *); +static inline struct filename *getname(const char __user *name) +{ + return getname_flags(name, 0); +} extern struct filename *getname_kernel(const char *); extern struct filename *__getname_maybe_null(const char __user *); static inline struct filename *getname_maybe_null(const char __user *name, int flags) @@ -2862,6 +2855,12 @@ static inline struct filename *getname_maybe_null(const char __user *name, int f } extern void putname(struct filename *name); +static inline struct filename *refname(struct filename *name) +{ + atomic_inc(&name->refcnt); + return name; +} + extern int finish_open(struct file *file, struct dentry *dentry, int (*open)(struct inode *, struct file *)); extern int finish_no_open(struct file *file, struct dentry *dentry); @@ -3294,7 +3293,11 @@ static inline void __iget(struct inode *inode) extern void iget_failed(struct inode *); extern void clear_inode(struct inode *); extern void __destroy_inode(struct inode *); -extern struct inode *new_inode_pseudo(struct super_block *sb); +struct inode *alloc_inode(struct super_block *sb); +static inline struct inode *new_inode_pseudo(struct super_block *sb) +{ + return alloc_inode(sb); +} extern struct inode *new_inode(struct super_block *sb); extern void free_inode_nonrcu(struct inode *inode); extern int setattr_should_drop_suidgid(struct mnt_idmap *, struct inode *); diff --git a/include/linux/mm.h b/include/linux/mm.h index 1f80baddacc5..2edb8d14d165 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2555,7 +2555,7 @@ int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc, struct task_struct *task, bool bypass_rlim); struct kvec; -struct page *get_dump_page(unsigned long addr); +struct page *get_dump_page(unsigned long addr, int *locked); bool folio_mark_dirty(struct folio *folio); bool folio_mark_dirty_lock(struct folio *folio); diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 47bfc6b1b632..f348e7005306 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -1044,21 +1044,23 @@ static inline pgoff_t page_pgoff(const struct folio *folio, return folio->index + folio_page_idx(folio, page); } -/* - * Return byte-offset into filesystem object for page. +/** + * folio_pos - Returns the byte position of this folio in its file. + * @folio: The folio. */ -static inline loff_t page_offset(struct page *page) +static inline loff_t folio_pos(const struct folio *folio) { - return ((loff_t)page->index) << PAGE_SHIFT; + return ((loff_t)folio->index) * PAGE_SIZE; } -/** - * folio_pos - Returns the byte position of this folio in its file. - * @folio: The folio. +/* + * Return byte-offset into filesystem object for page. */ -static inline loff_t folio_pos(struct folio *folio) +static inline loff_t page_offset(struct page *page) { - return page_offset(&folio->page); + struct folio *folio = page_folio(page); + + return folio_pos(folio) + folio_page_idx(folio, page) * PAGE_SIZE; } /* diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index c6333204d451..bae4490c1dda 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -1266,14 +1266,14 @@ static inline long ksys_lchown(const char __user *filename, uid_t user, AT_SYMLINK_NOFOLLOW); } -extern long do_sys_ftruncate(unsigned int fd, loff_t length, int small); +int do_sys_ftruncate(unsigned int fd, loff_t length, int small); static inline long ksys_ftruncate(unsigned int fd, loff_t length) { return do_sys_ftruncate(fd, length, 1); } -extern long do_sys_truncate(const char __user *pathname, loff_t length); +int do_sys_truncate(const char __user *pathname, loff_t length); static inline long ksys_truncate(const char __user *pathname, loff_t length) { diff --git a/include/linux/vfsdebug.h b/include/linux/vfsdebug.h new file mode 100644 index 000000000000..9cf22d3eb9dd --- /dev/null +++ b/include/linux/vfsdebug.h @@ -0,0 +1,45 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef LINUX_VFS_DEBUG_H +#define LINUX_VFS_DEBUG_H 1 + +#include <linux/bug.h> + +struct inode; + +#ifdef CONFIG_DEBUG_VFS +void dump_inode(struct inode *inode, const char *reason); + +#define VFS_BUG_ON(cond) BUG_ON(cond) +#define VFS_WARN_ON(cond) (void)WARN_ON(cond) +#define VFS_WARN_ON_ONCE(cond) (void)WARN_ON_ONCE(cond) +#define VFS_WARN_ONCE(cond, format...) (void)WARN_ONCE(cond, format) +#define VFS_WARN(cond, format...) (void)WARN(cond, format) + +#define VFS_BUG_ON_INODE(cond, inode) ({ \ + if (unlikely(!!(cond))) { \ + dump_inode(inode, "VFS_BUG_ON_INODE(" #cond")");\ + BUG_ON(1); \ + } \ +}) + +#define VFS_WARN_ON_INODE(cond, inode) ({ \ + int __ret_warn = !!(cond); \ + \ + if (unlikely(__ret_warn)) { \ + dump_inode(inode, "VFS_WARN_ON_INODE(" #cond")");\ + WARN_ON(1); \ + } \ + unlikely(__ret_warn); \ +}) +#else +#define VFS_BUG_ON(cond) BUILD_BUG_ON_INVALID(cond) +#define VFS_WARN_ON(cond) BUILD_BUG_ON_INVALID(cond) +#define VFS_WARN_ON_ONCE(cond) BUILD_BUG_ON_INVALID(cond) +#define VFS_WARN_ONCE(cond, format...) BUILD_BUG_ON_INVALID(cond) +#define VFS_WARN(cond, format...) BUILD_BUG_ON_INVALID(cond) + +#define VFS_BUG_ON_INODE(cond, inode) VFS_BUG_ON(cond) +#define VFS_WARN_ON_INODE(cond, inode) BUILD_BUG_ON_INVALID(cond) +#endif /* CONFIG_DEBUG_VFS */ + +#endif diff --git a/kernel/auditsc.c b/kernel/auditsc.c index 9c853cde9abe..78fd876a5473 100644 --- a/kernel/auditsc.c +++ b/kernel/auditsc.c @@ -2207,10 +2207,8 @@ __audit_reusename(const __user char *uptr) list_for_each_entry(n, &context->names_list, list) { if (!n->name) continue; - if (n->name->uptr == uptr) { - atomic_inc(&n->name->refcnt); - return n->name; - } + if (n->name->uptr == uptr) + return refname(n->name); } return NULL; } @@ -2237,7 +2235,7 @@ void __audit_getname(struct filename *name) n->name = name; n->name_len = AUDIT_NAME_FULL; name->aname = n; - atomic_inc(&name->refcnt); + refname(name); } static inline int audit_copy_fcaps(struct audit_names *name, @@ -2369,7 +2367,7 @@ out_alloc: return; if (name) { n->name = name; - atomic_inc(&name->refcnt); + refname(name); } out: @@ -2496,7 +2494,7 @@ void __audit_inode_child(struct inode *parent, if (found_parent) { found_child->name = found_parent->name; found_child->name_len = AUDIT_NAME_FULL; - atomic_inc(&found_child->name->refcnt); + refname(found_child->name); } } diff --git a/kernel/kcmp.c b/kernel/kcmp.c index 2c596851f8a9..7c1a65bd5f8d 100644 --- a/kernel/kcmp.c +++ b/kernel/kcmp.c @@ -145,7 +145,7 @@ SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type, */ task1 = find_task_by_vpid(pid1); task2 = find_task_by_vpid(pid2); - if (!task1 || !task2) + if (unlikely(!task1 || !task2)) goto err_no_task; get_task_struct(task1); diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c index 5267adeaa403..41e4e8070923 100644 --- a/kernel/watch_queue.c +++ b/kernel/watch_queue.c @@ -269,6 +269,15 @@ long watch_queue_set_size(struct pipe_inode_info *pipe, unsigned int nr_notes) if (ret < 0) goto error; + /* + * pipe_resize_ring() does not update nr_accounted for watch_queue + * pipes, because the above vastly overprovisions. Set nr_accounted on + * and max_usage this pipe to the number that was actually charged to + * the user above via account_pipe_buffers. + */ + pipe->max_usage = nr_pages; + pipe->nr_accounted = nr_pages; + ret = -ENOMEM; pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL); if (!pages) diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index 35796c290ca3..b775ba54dcd8 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -808,6 +808,15 @@ config ARCH_HAS_DEBUG_VM_PGTABLE An architecture should select this when it can successfully build and run DEBUG_VM_PGTABLE. +config DEBUG_VFS + bool "Debug VFS" + depends on DEBUG_KERNEL + help + Enable this to turn on extended checks in the VFS layer that may impact + performance. + + If unsure, say N. + config DEBUG_VM_IRQSOFF def_bool DEBUG_VM && !PREEMPT_RT @@ -2254,6 +2254,7 @@ EXPORT_SYMBOL(fault_in_readable); /** * get_dump_page() - pin user page in memory while writing it to core dump * @addr: user address + * @locked: a pointer to an int denoting whether the mmap sem is held * * Returns struct page pointer of user page pinned for dump, * to be freed afterwards by put_page(). @@ -2266,13 +2267,12 @@ EXPORT_SYMBOL(fault_in_readable); * Called without mmap_lock (takes and releases the mmap_lock by itself). */ #ifdef CONFIG_ELF_CORE -struct page *get_dump_page(unsigned long addr) +struct page *get_dump_page(unsigned long addr, int *locked) { struct page *page; - int locked = 0; int ret; - ret = __get_user_pages_locked(current->mm, addr, 1, &page, &locked, + ret = __get_user_pages_locked(current->mm, addr, 1, &page, locked, FOLL_FORCE | FOLL_DUMP | FOLL_GET); return (ret == 1) ? page : NULL; } diff --git a/security/landlock/fs.c b/security/landlock/fs.c index 71b9dc331aae..582769ae830e 100644 --- a/security/landlock/fs.c +++ b/security/landlock/fs.c @@ -1216,7 +1216,7 @@ static void hook_inode_free_security_rcu(void *inode_security) /* * Release the inodes used in a security policy. * - * Cf. fsnotify_unmount_inodes() and invalidate_inodes() + * Cf. fsnotify_unmount_inodes() and evict_inodes() */ static void hook_sb_delete(struct super_block *const sb) { diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index ba0327e2d0d3..16bee3b78219 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -4224,15 +4224,14 @@ static int kvm_vcpu_ioctl_get_stats_fd(struct kvm_vcpu *vcpu) if (fd < 0) return fd; - file = anon_inode_getfile(name, &kvm_vcpu_stats_fops, vcpu, O_RDONLY); + file = anon_inode_getfile_fmode(name, &kvm_vcpu_stats_fops, vcpu, + O_RDONLY, FMODE_PREAD); if (IS_ERR(file)) { put_unused_fd(fd); return PTR_ERR(file); } kvm_get_kvm(vcpu->kvm); - - file->f_mode |= FMODE_PREAD; fd_install(fd, file); return fd; @@ -5020,16 +5019,14 @@ static int kvm_vm_ioctl_get_stats_fd(struct kvm *kvm) if (fd < 0) return fd; - file = anon_inode_getfile("kvm-vm-stats", - &kvm_vm_stats_fops, kvm, O_RDONLY); + file = anon_inode_getfile_fmode("kvm-vm-stats", + &kvm_vm_stats_fops, kvm, O_RDONLY, FMODE_PREAD); if (IS_ERR(file)) { put_unused_fd(fd); return PTR_ERR(file); } kvm_get_kvm(kvm); - - file->f_mode |= FMODE_PREAD; fd_install(fd, file); return fd; |