summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Documentation/filesystems/porting.rst5
-rw-r--r--arch/arm64/kernel/elfcore.c3
-rw-r--r--arch/powerpc/platforms/pseries/papr-vpd.c7
-rw-r--r--drivers/vfio/group.c16
-rw-r--r--fs/autofs/dev-ioctl.c3
-rw-r--r--fs/cachefiles/ondemand.c7
-rw-r--r--fs/coredump.c38
-rw-r--r--fs/dcache.c3
-rw-r--r--fs/ecryptfs/super.c1
-rw-r--r--fs/eventfd.c5
-rw-r--r--fs/eventpoll.c8
-rw-r--r--fs/exportfs/expfs.c2
-rw-r--r--fs/file.c81
-rw-r--r--fs/file_table.c3
-rw-r--r--fs/inode.c127
-rw-r--r--fs/internal.h6
-rw-r--r--fs/ioctl.c10
-rw-r--r--fs/namei.c42
-rw-r--r--fs/open.c29
-rw-r--r--fs/read_write.c13
-rw-r--r--fs/signalfd.c7
-rw-r--r--fs/smb/client/file.c2
-rw-r--r--fs/super.c2
-rw-r--r--fs/timerfd.c6
-rw-r--r--include/linux/fs.h49
-rw-r--r--include/linux/mm.h2
-rw-r--r--include/linux/pagemap.h20
-rw-r--r--include/linux/syscalls.h4
-rw-r--r--include/linux/vfsdebug.h45
-rw-r--r--kernel/auditsc.c12
-rw-r--r--kernel/kcmp.c2
-rw-r--r--kernel/watch_queue.c9
-rw-r--r--lib/Kconfig.debug9
-rw-r--r--mm/gup.c6
-rw-r--r--security/landlock/fs.c2
-rw-r--r--virt/kvm/kvm_main.c11
36 files changed, 339 insertions, 258 deletions
diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst
index 1639e78e3146..12a71ba221b8 100644
--- a/Documentation/filesystems/porting.rst
+++ b/Documentation/filesystems/porting.rst
@@ -1157,3 +1157,8 @@ in normal case it points into the pathname being looked up.
NOTE: if you need something like full path from the root of filesystem,
you are still on your own - this assists with simple cases, but it's not
magic.
+
+---
+
+** mandatory **
+invalidate_inodes() is gone use evict_inodes() instead.
diff --git a/arch/arm64/kernel/elfcore.c b/arch/arm64/kernel/elfcore.c
index 2e94d20c4ac7..b735f4c2fe5e 100644
--- a/arch/arm64/kernel/elfcore.c
+++ b/arch/arm64/kernel/elfcore.c
@@ -27,9 +27,10 @@ static int mte_dump_tag_range(struct coredump_params *cprm,
int ret = 1;
unsigned long addr;
void *tags = NULL;
+ int locked = 0;
for (addr = start; addr < start + len; addr += PAGE_SIZE) {
- struct page *page = get_dump_page(addr);
+ struct page *page = get_dump_page(addr, &locked);
/*
* get_dump_page() returns NULL when encountering an empty
diff --git a/arch/powerpc/platforms/pseries/papr-vpd.c b/arch/powerpc/platforms/pseries/papr-vpd.c
index 1574176e3ffc..c86950d7105a 100644
--- a/arch/powerpc/platforms/pseries/papr-vpd.c
+++ b/arch/powerpc/platforms/pseries/papr-vpd.c
@@ -482,14 +482,13 @@ static long papr_vpd_create_handle(struct papr_location_code __user *ulc)
goto free_blob;
}
- file = anon_inode_getfile("[papr-vpd]", &papr_vpd_handle_ops,
- (void *)blob, O_RDONLY);
+ file = anon_inode_getfile_fmode("[papr-vpd]", &papr_vpd_handle_ops,
+ (void *)blob, O_RDONLY,
+ FMODE_LSEEK | FMODE_PREAD);
if (IS_ERR(file)) {
err = PTR_ERR(file);
goto put_fd;
}
-
- file->f_mode |= FMODE_LSEEK | FMODE_PREAD;
fd_install(fd, file);
return fd;
put_fd:
diff --git a/drivers/vfio/group.c b/drivers/vfio/group.c
index 49559605177e..c321d442f0da 100644
--- a/drivers/vfio/group.c
+++ b/drivers/vfio/group.c
@@ -266,24 +266,12 @@ static struct file *vfio_device_open_file(struct vfio_device *device)
if (ret)
goto err_free;
- /*
- * We can't use anon_inode_getfd() because we need to modify
- * the f_mode flags directly to allow more than just ioctls
- */
- filep = anon_inode_getfile("[vfio-device]", &vfio_device_fops,
- df, O_RDWR);
+ filep = anon_inode_getfile_fmode("[vfio-device]", &vfio_device_fops,
+ df, O_RDWR, FMODE_PREAD | FMODE_PWRITE);
if (IS_ERR(filep)) {
ret = PTR_ERR(filep);
goto err_close_device;
}
-
- /*
- * TODO: add an anon_inode interface to do this.
- * Appears to be missing by lack of need rather than
- * explicitly prevented. Now there's need.
- */
- filep->f_mode |= (FMODE_PREAD | FMODE_PWRITE);
-
/*
* Use the pseudo fs inode on the device to link all mmaps
* to the same address space, allowing us to unmap all vmas
diff --git a/fs/autofs/dev-ioctl.c b/fs/autofs/dev-ioctl.c
index 6d57efbb8110..c5a6aae12d2c 100644
--- a/fs/autofs/dev-ioctl.c
+++ b/fs/autofs/dev-ioctl.c
@@ -442,7 +442,6 @@ static int autofs_dev_ioctl_timeout(struct file *fp,
sbi->exp_timeout = timeout * HZ;
} else {
struct dentry *base = fp->f_path.dentry;
- struct inode *inode = base->d_inode;
int path_len = param->size - AUTOFS_DEV_IOCTL_SIZE - 1;
struct dentry *dentry;
struct autofs_info *ino;
@@ -460,9 +459,7 @@ static int autofs_dev_ioctl_timeout(struct file *fp,
"the parent autofs mount timeout which could "
"prevent shutdown\n");
- inode_lock_shared(inode);
dentry = try_lookup_one_len(param->path, base, path_len);
- inode_unlock_shared(inode);
if (IS_ERR_OR_NULL(dentry))
return dentry ? PTR_ERR(dentry) : -ENOENT;
ino = autofs_dentry_ino(dentry);
diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c
index fe3de9ad57bf..d9bc67176128 100644
--- a/fs/cachefiles/ondemand.c
+++ b/fs/cachefiles/ondemand.c
@@ -317,8 +317,9 @@ static int cachefiles_ondemand_get_fd(struct cachefiles_req *req,
goto err_free_id;
}
- anon_file->file = anon_inode_getfile("[cachefiles]",
- &cachefiles_ondemand_fd_fops, object, O_WRONLY);
+ anon_file->file = anon_inode_getfile_fmode("[cachefiles]",
+ &cachefiles_ondemand_fd_fops, object,
+ O_WRONLY, FMODE_PWRITE | FMODE_LSEEK);
if (IS_ERR(anon_file->file)) {
ret = PTR_ERR(anon_file->file);
goto err_put_fd;
@@ -333,8 +334,6 @@ static int cachefiles_ondemand_get_fd(struct cachefiles_req *req,
goto err_put_file;
}
- anon_file->file->f_mode |= FMODE_PWRITE | FMODE_LSEEK;
-
load = (void *)req->msg.data;
load->fd = anon_file->fd;
object->ondemand->ondemand_id = object_id;
diff --git a/fs/coredump.c b/fs/coredump.c
index 4375c70144d0..d6a92cd6018e 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -926,14 +926,23 @@ int dump_user_range(struct coredump_params *cprm, unsigned long start,
{
unsigned long addr;
struct page *dump_page;
+ int locked, ret;
dump_page = dump_page_alloc();
if (!dump_page)
return 0;
+ ret = 0;
+ locked = 0;
for (addr = start; addr < start + len; addr += PAGE_SIZE) {
struct page *page;
+ if (!locked) {
+ if (mmap_read_lock_killable(current->mm))
+ goto out;
+ locked = 1;
+ }
+
/*
* To avoid having to allocate page tables for virtual address
* ranges that have never been used yet, and also to make it
@@ -941,21 +950,38 @@ int dump_user_range(struct coredump_params *cprm, unsigned long start,
* NULL when encountering an empty page table entry that would
* otherwise have been filled with the zero page.
*/
- page = get_dump_page(addr);
+ page = get_dump_page(addr, &locked);
if (page) {
+ if (locked) {
+ mmap_read_unlock(current->mm);
+ locked = 0;
+ }
int stop = !dump_emit_page(cprm, dump_page_copy(page, dump_page));
put_page(page);
- if (stop) {
- dump_page_free(dump_page);
- return 0;
- }
+ if (stop)
+ goto out;
} else {
dump_skip(cprm, PAGE_SIZE);
}
+
+ if (dump_interrupted())
+ goto out;
+
+ if (!need_resched())
+ continue;
+ if (locked) {
+ mmap_read_unlock(current->mm);
+ locked = 0;
+ }
cond_resched();
}
+ ret = 1;
+out:
+ if (locked)
+ mmap_read_unlock(current->mm);
+
dump_page_free(dump_page);
- return 1;
+ return ret;
}
#endif
diff --git a/fs/dcache.c b/fs/dcache.c
index e3634916ffb9..3ee84f62827a 100644
--- a/fs/dcache.c
+++ b/fs/dcache.c
@@ -2480,7 +2480,8 @@ static inline void end_dir_add(struct inode *dir, unsigned int n,
{
smp_store_release(&dir->i_dir_seq, n + 2);
preempt_enable_nested();
- wake_up_all(d_wait);
+ if (wq_has_sleeper(d_wait))
+ wake_up_all(d_wait);
}
static void d_wait_lookup(struct dentry *dentry)
diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c
index 0b1c878317ab..e7b7f426fecf 100644
--- a/fs/ecryptfs/super.c
+++ b/fs/ecryptfs/super.c
@@ -172,7 +172,6 @@ const struct super_operations ecryptfs_sops = {
.destroy_inode = ecryptfs_destroy_inode,
.free_inode = ecryptfs_free_inode,
.statfs = ecryptfs_statfs,
- .remount_fs = NULL,
.evict_inode = ecryptfs_evict_inode,
.show_options = ecryptfs_show_options
};
diff --git a/fs/eventfd.c b/fs/eventfd.c
index 76129bfcd663..af42b2c7d235 100644
--- a/fs/eventfd.c
+++ b/fs/eventfd.c
@@ -406,14 +406,13 @@ static int do_eventfd(unsigned int count, int flags)
if (fd < 0)
goto err;
- file = anon_inode_getfile("[eventfd]", &eventfd_fops, ctx, flags);
+ file = anon_inode_getfile_fmode("[eventfd]", &eventfd_fops,
+ ctx, flags, FMODE_NOWAIT);
if (IS_ERR(file)) {
put_unused_fd(fd);
fd = PTR_ERR(file);
goto err;
}
-
- file->f_mode |= FMODE_NOWAIT;
fd_install(fd, file);
return fd;
err:
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 7c0980db77b3..1fc770270ab8 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -438,7 +438,7 @@ static bool ep_busy_loop_end(void *p, unsigned long start_time)
*
* we must do our busy polling with irqs enabled
*/
-static bool ep_busy_loop(struct eventpoll *ep, int nonblock)
+static bool ep_busy_loop(struct eventpoll *ep)
{
unsigned int napi_id = READ_ONCE(ep->napi_id);
u16 budget = READ_ONCE(ep->busy_poll_budget);
@@ -448,7 +448,7 @@ static bool ep_busy_loop(struct eventpoll *ep, int nonblock)
budget = BUSY_POLL_BUDGET;
if (napi_id >= MIN_NAPI_ID && ep_busy_loop_on(ep)) {
- napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end,
+ napi_busy_loop(napi_id, ep_busy_loop_end,
ep, prefer_busy_poll, budget);
if (ep_events_available(ep))
return true;
@@ -560,7 +560,7 @@ static void ep_resume_napi_irqs(struct eventpoll *ep)
#else
-static inline bool ep_busy_loop(struct eventpoll *ep, int nonblock)
+static inline bool ep_busy_loop(struct eventpoll *ep)
{
return false;
}
@@ -2047,7 +2047,7 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events,
if (timed_out)
return 0;
- eavail = ep_busy_loop(ep, timed_out);
+ eavail = ep_busy_loop(ep);
if (eavail)
continue;
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 0c899cfba578..b5845c4846b8 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -126,10 +126,8 @@ static struct dentry *reconnect_one(struct vfsmount *mnt,
int err;
parent = ERR_PTR(-EACCES);
- inode_lock(dentry->d_inode);
if (mnt->mnt_sb->s_export_op->get_parent)
parent = mnt->mnt_sb->s_export_op->get_parent(dentry);
- inode_unlock(dentry->d_inode);
if (IS_ERR(parent)) {
dprintk("get_parent of %lu failed, err %ld\n",
diff --git a/fs/file.c b/fs/file.c
index d868cdb95d1e..40fed4501aab 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -418,17 +418,25 @@ struct files_struct *dup_fd(struct files_struct *oldf, struct fd_range *punch_ho
old_fds = old_fdt->fd;
new_fds = new_fdt->fd;
+ /*
+ * We may be racing against fd allocation from other threads using this
+ * files_struct, despite holding ->file_lock.
+ *
+ * alloc_fd() might have already claimed a slot, while fd_install()
+ * did not populate it yet. Note the latter operates locklessly, so
+ * the file can show up as we are walking the array below.
+ *
+ * At the same time we know no files will disappear as all other
+ * operations take the lock.
+ *
+ * Instead of trying to placate userspace racing with itself, we
+ * ref the file if we see it and mark the fd slot as unused otherwise.
+ */
for (i = open_files; i != 0; i--) {
- struct file *f = *old_fds++;
+ struct file *f = rcu_dereference_raw(*old_fds++);
if (f) {
get_file(f);
} else {
- /*
- * The fd may be claimed in the fd bitmap but not yet
- * instantiated in the files array if a sibling thread
- * is partway through open(). So make sure that this
- * fd is available to the new process.
- */
__clear_open_fd(open_files - i, new_fdt);
}
rcu_assign_pointer(*new_fds++, f);
@@ -577,6 +585,7 @@ repeat:
__set_open_fd(fd, fdt, flags & O_CLOEXEC);
error = fd;
+ VFS_BUG_ON(rcu_access_pointer(fdt->fd[fd]) != NULL);
out:
spin_unlock(&files->file_lock);
@@ -612,22 +621,14 @@ void put_unused_fd(unsigned int fd)
EXPORT_SYMBOL(put_unused_fd);
-/*
- * Install a file pointer in the fd array.
- *
- * The VFS is full of places where we drop the files lock between
- * setting the open_fds bitmap and installing the file in the file
- * array. At any such point, we are vulnerable to a dup2() race
- * installing a file in the array before us. We need to detect this and
- * fput() the struct file we are about to overwrite in this case.
- *
- * It should never happen - if we allow dup2() do it, _really_ bad things
- * will follow.
+/**
+ * fd_install - install a file pointer in the fd array
+ * @fd: file descriptor to install the file in
+ * @file: the file to install
*
* This consumes the "file" refcount, so callers should treat it
* as if they had called fput(file).
*/
-
void fd_install(unsigned int fd, struct file *file)
{
struct files_struct *files = current->files;
@@ -642,7 +643,7 @@ void fd_install(unsigned int fd, struct file *file)
rcu_read_unlock_sched();
spin_lock(&files->file_lock);
fdt = files_fdtable(files);
- WARN_ON(fdt->fd[fd] != NULL);
+ VFS_BUG_ON(rcu_access_pointer(fdt->fd[fd]) != NULL);
rcu_assign_pointer(fdt->fd[fd], file);
spin_unlock(&files->file_lock);
return;
@@ -650,7 +651,7 @@ void fd_install(unsigned int fd, struct file *file)
/* coupled with smp_wmb() in expand_fdtable() */
smp_rmb();
fdt = rcu_dereference_sched(files->fdt);
- BUG_ON(fdt->fd[fd] != NULL);
+ VFS_BUG_ON(rcu_access_pointer(fdt->fd[fd]) != NULL);
rcu_assign_pointer(fdt->fd[fd], file);
rcu_read_unlock_sched();
}
@@ -679,7 +680,7 @@ struct file *file_close_fd_locked(struct files_struct *files, unsigned fd)
return NULL;
fd = array_index_nospec(fd, fdt->max_fds);
- file = fdt->fd[fd];
+ file = rcu_dereference_raw(fdt->fd[fd]);
if (file) {
rcu_assign_pointer(fdt->fd[fd], NULL);
__put_unused_fd(files, fd);
@@ -1182,6 +1183,16 @@ static inline bool file_needs_f_pos_lock(struct file *file)
(file_count(file) > 1 || file->f_op->iterate_shared);
}
+bool file_seek_cur_needs_f_lock(struct file *file)
+{
+ if (!(file->f_mode & FMODE_ATOMIC_POS) && !file->f_op->iterate_shared)
+ return false;
+
+ VFS_WARN_ON_ONCE((file_count(file) > 1) &&
+ !mutex_is_locked(&file->f_pos_lock));
+ return true;
+}
+
struct fd fdget_pos(unsigned int fd)
{
struct fd f = fdget(fd);
@@ -1230,14 +1241,34 @@ __releases(&files->file_lock)
struct fdtable *fdt;
/*
- * We need to detect attempts to do dup2() over allocated but still
- * not finished descriptor.
+ * dup2() is expected to close the file installed in the target fd slot
+ * (if any). However, userspace hand-picking a fd may be racing against
+ * its own threads which happened to allocate it in open() et al but did
+ * not populate it yet.
+ *
+ * Broadly speaking we may be racing against the following:
+ * fd = get_unused_fd_flags(); // fd slot reserved, ->fd[fd] == NULL
+ * file = hard_work_goes_here();
+ * fd_install(fd, file); // only now ->fd[fd] == file
+ *
+ * It is an invariant that a successfully allocated fd has a NULL entry
+ * in the array until the matching fd_install().
+ *
+ * If we fit the window, we have the fd to populate, yet no target file
+ * to close. Trying to ignore it and install our new file would violate
+ * the invariant and make fd_install() overwrite our file.
+ *
+ * Things can be done(tm) to handle this. However, the issue does not
+ * concern legitimate programs and we only need to make sure the kernel
+ * does not trip over it.
+ *
+ * The simplest way out is to return an error if we find ourselves here.
*
* POSIX is silent on the issue, we return -EBUSY.
*/
fdt = files_fdtable(files);
fd = array_index_nospec(fd, fdt->max_fds);
- tofree = fdt->fd[fd];
+ tofree = rcu_dereference_raw(fdt->fd[fd]);
if (!tofree && fd_is_open(fd, fdt))
goto Ebusy;
get_file(file);
diff --git a/fs/file_table.c b/fs/file_table.c
index 5c00dc38558d..9f0a1a164c82 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -221,7 +221,8 @@ struct file *alloc_empty_file(int flags, const struct cred *cred)
/*
* Privileged users can go above max_files
*/
- if (get_nr_files() >= files_stat.max_files && !capable(CAP_SYS_ADMIN)) {
+ if (unlikely(get_nr_files() >= files_stat.max_files) &&
+ !capable(CAP_SYS_ADMIN)) {
/*
* percpu_counters are inaccurate. Do an expensive check before
* we go and fail.
diff --git a/fs/inode.c b/fs/inode.c
index 5587aabdaa5e..99318b157a9a 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -327,7 +327,17 @@ static void i_callback(struct rcu_head *head)
free_inode_nonrcu(inode);
}
-static struct inode *alloc_inode(struct super_block *sb)
+/**
+ * alloc_inode - obtain an inode
+ * @sb: superblock
+ *
+ * Allocates a new inode for given superblock.
+ * Inode wont be chained in superblock s_inodes list
+ * This means :
+ * - fs can't be unmount
+ * - quotas, fsnotify, writeback can't work
+ */
+struct inode *alloc_inode(struct super_block *sb)
{
const struct super_operations *ops = sb->s_op;
struct inode *inode;
@@ -613,18 +623,22 @@ static void inode_wait_for_lru_isolating(struct inode *inode)
*/
void inode_sb_list_add(struct inode *inode)
{
- spin_lock(&inode->i_sb->s_inode_list_lock);
- list_add(&inode->i_sb_list, &inode->i_sb->s_inodes);
- spin_unlock(&inode->i_sb->s_inode_list_lock);
+ struct super_block *sb = inode->i_sb;
+
+ spin_lock(&sb->s_inode_list_lock);
+ list_add(&inode->i_sb_list, &sb->s_inodes);
+ spin_unlock(&sb->s_inode_list_lock);
}
EXPORT_SYMBOL_GPL(inode_sb_list_add);
static inline void inode_sb_list_del(struct inode *inode)
{
+ struct super_block *sb = inode->i_sb;
+
if (!list_empty(&inode->i_sb_list)) {
- spin_lock(&inode->i_sb->s_inode_list_lock);
+ spin_lock(&sb->s_inode_list_lock);
list_del_init(&inode->i_sb_list);
- spin_unlock(&inode->i_sb->s_inode_list_lock);
+ spin_unlock(&sb->s_inode_list_lock);
}
}
@@ -806,23 +820,16 @@ static void evict(struct inode *inode)
/*
* Wake up waiters in __wait_on_freeing_inode().
*
- * Lockless hash lookup may end up finding the inode before we removed
- * it above, but only lock it *after* we are done with the wakeup below.
- * In this case the potential waiter cannot safely block.
+ * It is an invariant that any thread we need to wake up is already
+ * accounted for before remove_inode_hash() acquires ->i_lock -- both
+ * sides take the lock and sleep is aborted if the inode is found
+ * unhashed. Thus either the sleeper wins and goes off CPU, or removal
+ * wins and the sleeper aborts after testing with the lock.
*
- * The inode being unhashed after the call to remove_inode_hash() is
- * used as an indicator whether blocking on it is safe.
+ * This also means we don't need any fences for the call below.
*/
- spin_lock(&inode->i_lock);
- /*
- * Pairs with the barrier in prepare_to_wait_event() to make sure
- * ___wait_var_event() either sees the bit cleared or
- * waitqueue_active() check in wake_up_var() sees the waiter.
- */
- smp_mb__after_spinlock();
inode_wake_up_bit(inode, __I_NEW);
BUG_ON(inode->i_state != (I_FREEING | I_CLEAR));
- spin_unlock(&inode->i_lock);
destroy_inode(inode);
}
@@ -900,46 +907,6 @@ again:
}
EXPORT_SYMBOL_GPL(evict_inodes);
-/**
- * invalidate_inodes - attempt to free all inodes on a superblock
- * @sb: superblock to operate on
- *
- * Attempts to free all inodes (including dirty inodes) for a given superblock.
- */
-void invalidate_inodes(struct super_block *sb)
-{
- struct inode *inode, *next;
- LIST_HEAD(dispose);
-
-again:
- spin_lock(&sb->s_inode_list_lock);
- list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) {
- spin_lock(&inode->i_lock);
- if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) {
- spin_unlock(&inode->i_lock);
- continue;
- }
- if (atomic_read(&inode->i_count)) {
- spin_unlock(&inode->i_lock);
- continue;
- }
-
- inode->i_state |= I_FREEING;
- inode_lru_list_del(inode);
- spin_unlock(&inode->i_lock);
- list_add(&inode->i_lru, &dispose);
- if (need_resched()) {
- spin_unlock(&sb->s_inode_list_lock);
- cond_resched();
- dispose_list(&dispose);
- goto again;
- }
- }
- spin_unlock(&sb->s_inode_list_lock);
-
- dispose_list(&dispose);
-}
-
/*
* Isolate the inode from the LRU in preparation for freeing it.
*
@@ -1160,21 +1127,6 @@ unsigned int get_next_ino(void)
EXPORT_SYMBOL(get_next_ino);
/**
- * new_inode_pseudo - obtain an inode
- * @sb: superblock
- *
- * Allocates a new inode for given superblock.
- * Inode wont be chained in superblock s_inodes list
- * This means :
- * - fs can't be unmount
- * - quotas, fsnotify, writeback can't work
- */
-struct inode *new_inode_pseudo(struct super_block *sb)
-{
- return alloc_inode(sb);
-}
-
-/**
* new_inode - obtain an inode
* @sb: superblock
*
@@ -1190,7 +1142,7 @@ struct inode *new_inode(struct super_block *sb)
{
struct inode *inode;
- inode = new_inode_pseudo(sb);
+ inode = alloc_inode(sb);
if (inode)
inode_sb_list_add(inode);
return inode;
@@ -1348,8 +1300,8 @@ again:
}
if (set && unlikely(set(inode, data))) {
- inode = NULL;
- goto unlock;
+ spin_unlock(&inode_hash_lock);
+ return NULL;
}
/*
@@ -1361,14 +1313,14 @@ again:
hlist_add_head_rcu(&inode->i_hash, head);
spin_unlock(&inode->i_lock);
+ spin_unlock(&inode_hash_lock);
+
/*
* Add inode to the sb list if it's not already. It has I_NEW at this
* point, so it should be safe to test i_sb_list locklessly.
*/
if (list_empty(&inode->i_sb_list))
inode_sb_list_add(inode);
-unlock:
- spin_unlock(&inode_hash_lock);
return inode;
}
@@ -1497,8 +1449,8 @@ again:
inode->i_state = I_NEW;
hlist_add_head_rcu(&inode->i_hash, head);
spin_unlock(&inode->i_lock);
- inode_sb_list_add(inode);
spin_unlock(&inode_hash_lock);
+ inode_sb_list_add(inode);
/* Return the locked inode with I_NEW set, the
* caller is responsible for filling in the contents
@@ -2953,3 +2905,18 @@ umode_t mode_strip_sgid(struct mnt_idmap *idmap,
return mode & ~S_ISGID;
}
EXPORT_SYMBOL(mode_strip_sgid);
+
+#ifdef CONFIG_DEBUG_VFS
+/*
+ * Dump an inode.
+ *
+ * TODO: add a proper inode dumping routine, this is a stub to get debug off the
+ * ground.
+ */
+void dump_inode(struct inode *inode, const char *reason)
+{
+ pr_warn("%s encountered for inode %px", reason, inode);
+}
+
+EXPORT_SYMBOL(dump_inode);
+#endif
diff --git a/fs/internal.h b/fs/internal.h
index e7f02ae1e098..82127c69e641 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -187,8 +187,8 @@ extern struct open_how build_open_how(int flags, umode_t mode);
extern int build_open_flags(const struct open_how *how, struct open_flags *op);
struct file *file_close_fd_locked(struct files_struct *files, unsigned fd);
-long do_ftruncate(struct file *file, loff_t length, int small);
-long do_sys_ftruncate(unsigned int fd, loff_t length, int small);
+int do_ftruncate(struct file *file, loff_t length, int small);
+int do_sys_ftruncate(unsigned int fd, loff_t length, int small);
int chmod_common(const struct path *path, umode_t mode);
int do_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group,
int flag);
@@ -207,7 +207,6 @@ bool in_group_or_capable(struct mnt_idmap *idmap,
* fs-writeback.c
*/
extern long get_nr_dirty_inodes(void);
-void invalidate_inodes(struct super_block *sb);
/*
* dcache.c
@@ -338,3 +337,4 @@ static inline bool path_mounted(const struct path *path)
return path->mnt->mnt_root == path->dentry;
}
void file_f_owner_release(struct file *file);
+bool file_seek_cur_needs_f_lock(struct file *file);
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 638a36be31c1..c91fd2b46a77 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -41,7 +41,7 @@
*
* Returns 0 on success, -errno on error.
*/
-long vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
+int vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
int error = -ENOTTY;
@@ -228,8 +228,8 @@ static int ioctl_fiemap(struct file *filp, struct fiemap __user *ufiemap)
return error;
}
-static long ioctl_file_clone(struct file *dst_file, unsigned long srcfd,
- u64 off, u64 olen, u64 destoff)
+static int ioctl_file_clone(struct file *dst_file, unsigned long srcfd,
+ u64 off, u64 olen, u64 destoff)
{
CLASS(fd, src_file)(srcfd);
loff_t cloned;
@@ -248,8 +248,8 @@ static long ioctl_file_clone(struct file *dst_file, unsigned long srcfd,
return ret;
}
-static long ioctl_file_clone_range(struct file *file,
- struct file_clone_range __user *argp)
+static int ioctl_file_clone_range(struct file *file,
+ struct file_clone_range __user *argp)
{
struct file_clone_range args;
diff --git a/fs/namei.c b/fs/namei.c
index ecb7b95c2ca3..2a12238e696a 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -125,6 +125,13 @@
#define EMBEDDED_NAME_MAX (PATH_MAX - offsetof(struct filename, iname))
+static inline void initname(struct filename *name)
+{
+ name->uptr = NULL;
+ name->aname = NULL;
+ atomic_set(&name->refcnt, 1);
+}
+
struct filename *
getname_flags(const char __user *filename, int flags)
{
@@ -203,10 +210,7 @@ getname_flags(const char __user *filename, int flags)
return ERR_PTR(-ENAMETOOLONG);
}
}
-
- atomic_set(&result->refcnt, 1);
- result->uptr = filename;
- result->aname = NULL;
+ initname(result);
audit_getname(result);
return result;
}
@@ -218,11 +222,6 @@ struct filename *getname_uflags(const char __user *filename, int uflags)
return getname_flags(filename, flags);
}
-struct filename *getname(const char __user * filename)
-{
- return getname_flags(filename, 0);
-}
-
struct filename *__getname_maybe_null(const char __user *pathname)
{
struct filename *name;
@@ -269,25 +268,27 @@ struct filename *getname_kernel(const char * filename)
return ERR_PTR(-ENAMETOOLONG);
}
memcpy((char *)result->name, filename, len);
- result->uptr = NULL;
- result->aname = NULL;
- atomic_set(&result->refcnt, 1);
+ initname(result);
audit_getname(result);
-
return result;
}
EXPORT_SYMBOL(getname_kernel);
void putname(struct filename *name)
{
+ int refcnt;
+
if (IS_ERR_OR_NULL(name))
return;
- if (WARN_ON_ONCE(!atomic_read(&name->refcnt)))
- return;
+ refcnt = atomic_read(&name->refcnt);
+ if (refcnt != 1) {
+ if (WARN_ON_ONCE(!refcnt))
+ return;
- if (!atomic_dec_and_test(&name->refcnt))
- return;
+ if (!atomic_dec_and_test(&name->refcnt))
+ return;
+ }
if (name->name != name->iname) {
__putname(name->name);
@@ -2863,15 +2864,14 @@ static int lookup_one_common(struct mnt_idmap *idmap,
* Note that this routine is purely a helper for filesystem usage and should
* not be called by generic code.
*
- * The caller must hold base->i_mutex.
+ * No locks need be held - only a counted reference to @base is needed.
+ *
*/
struct dentry *try_lookup_one_len(const char *name, struct dentry *base, int len)
{
struct qstr this;
int err;
- WARN_ON_ONCE(!inode_is_locked(base->d_inode));
-
err = lookup_one_common(&nop_mnt_idmap, name, base, len, &this);
if (err)
return ERR_PTR(err);
@@ -3415,6 +3415,8 @@ static int may_open(struct mnt_idmap *idmap, const struct path *path,
if ((acc_mode & MAY_EXEC) && path_noexec(path))
return -EACCES;
break;
+ default:
+ VFS_BUG_ON_INODE(1, inode);
}
error = inode_permission(idmap, inode, MAY_OPEN | acc_mode);
diff --git a/fs/open.c b/fs/open.c
index 1be20de9f283..201497f65590 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -67,11 +67,11 @@ int do_truncate(struct mnt_idmap *idmap, struct dentry *dentry,
return ret;
}
-long vfs_truncate(const struct path *path, loff_t length)
+int vfs_truncate(const struct path *path, loff_t length)
{
struct mnt_idmap *idmap;
struct inode *inode;
- long error;
+ int error;
inode = path->dentry->d_inode;
@@ -123,7 +123,7 @@ mnt_drop_write_and_out:
}
EXPORT_SYMBOL_GPL(vfs_truncate);
-long do_sys_truncate(const char __user *pathname, loff_t length)
+int do_sys_truncate(const char __user *pathname, loff_t length)
{
unsigned int lookup_flags = LOOKUP_FOLLOW;
struct path path;
@@ -157,7 +157,7 @@ COMPAT_SYSCALL_DEFINE2(truncate, const char __user *, path, compat_off_t, length
}
#endif
-long do_ftruncate(struct file *file, loff_t length, int small)
+int do_ftruncate(struct file *file, loff_t length, int small)
{
struct inode *inode;
struct dentry *dentry;
@@ -196,7 +196,7 @@ long do_ftruncate(struct file *file, loff_t length, int small)
return error;
}
-long do_sys_ftruncate(unsigned int fd, loff_t length, int small)
+int do_sys_ftruncate(unsigned int fd, loff_t length, int small)
{
if (length < 0)
return -EINVAL;
@@ -251,7 +251,7 @@ COMPAT_SYSCALL_DEFINE3(ftruncate64, unsigned int, fd,
int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
{
struct inode *inode = file_inode(file);
- long ret;
+ int ret;
loff_t sum;
if (offset < 0 || len <= 0)
@@ -460,7 +460,7 @@ static const struct cred *access_override_creds(void)
return override_creds(override_cred);
}
-static long do_faccessat(int dfd, const char __user *filename, int mode, int flags)
+static int do_faccessat(int dfd, const char __user *filename, int mode, int flags)
{
struct path path;
struct inode *inode;
@@ -1409,22 +1409,23 @@ struct file *file_open_root(const struct path *root,
}
EXPORT_SYMBOL(file_open_root);
-static long do_sys_openat2(int dfd, const char __user *filename,
- struct open_how *how)
+static int do_sys_openat2(int dfd, const char __user *filename,
+ struct open_how *how)
{
struct open_flags op;
- int fd = build_open_flags(how, &op);
struct filename *tmp;
+ int err, fd;
- if (fd)
- return fd;
+ err = build_open_flags(how, &op);
+ if (unlikely(err))
+ return err;
tmp = getname(filename);
if (IS_ERR(tmp))
return PTR_ERR(tmp);
fd = get_unused_fd_flags(how->flags);
- if (fd >= 0) {
+ if (likely(fd >= 0)) {
struct file *f = do_filp_open(dfd, tmp, &op);
if (IS_ERR(f)) {
put_unused_fd(fd);
@@ -1437,7 +1438,7 @@ static long do_sys_openat2(int dfd, const char __user *filename,
return fd;
}
-long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode)
+int do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode)
{
struct open_how how = build_open_how(flags, mode);
return do_sys_openat2(dfd, filename, &how);
diff --git a/fs/read_write.c b/fs/read_write.c
index a6133241dfb8..bb0ed26a0b3a 100644
--- a/fs/read_write.c
+++ b/fs/read_write.c
@@ -169,11 +169,16 @@ generic_file_llseek_size(struct file *file, loff_t offset, int whence,
if (whence == SEEK_CUR) {
/*
- * f_lock protects against read/modify/write race with
- * other SEEK_CURs. Note that parallel writes and reads
- * behave like SEEK_SET.
+ * If the file requires locking via f_pos_lock we know
+ * that mutual exclusion for SEEK_CUR on the same file
+ * is guaranteed. If the file isn't locked, we take
+ * f_lock to protect against f_pos races with other
+ * SEEK_CURs.
*/
- guard(spinlock)(&file->f_lock);
+ if (file_seek_cur_needs_f_lock(file)) {
+ guard(spinlock)(&file->f_lock);
+ return vfs_setpos(file, file->f_pos + offset, maxsize);
+ }
return vfs_setpos(file, file->f_pos + offset, maxsize);
}
diff --git a/fs/signalfd.c b/fs/signalfd.c
index d1a5f43ce466..d469782f97f4 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -277,15 +277,14 @@ static int do_signalfd4(int ufd, sigset_t *mask, int flags)
return ufd;
}
- file = anon_inode_getfile("[signalfd]", &signalfd_fops, ctx,
- O_RDWR | (flags & O_NONBLOCK));
+ file = anon_inode_getfile_fmode("[signalfd]", &signalfd_fops,
+ ctx, O_RDWR | (flags & O_NONBLOCK),
+ FMODE_NOWAIT);
if (IS_ERR(file)) {
put_unused_fd(ufd);
kfree(ctx);
return PTR_ERR(file);
}
- file->f_mode |= FMODE_NOWAIT;
-
fd_install(ufd, file);
} else {
CLASS(fd, f)(ufd);
diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c
index 8582cf61242c..9e4f7378f30f 100644
--- a/fs/smb/client/file.c
+++ b/fs/smb/client/file.c
@@ -388,7 +388,7 @@ cifs_mark_open_files_invalid(struct cifs_tcon *tcon)
spin_unlock(&tcon->tc_lock);
/*
- * BB Add call to invalidate_inodes(sb) for all superblocks mounted
+ * BB Add call to evict_inodes(sb) for all superblocks mounted
* to this tcon.
*/
}
diff --git a/fs/super.c b/fs/super.c
index 723176dee229..97a17f9d9023 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -1417,7 +1417,7 @@ static void fs_bdev_mark_dead(struct block_device *bdev, bool surprise)
if (!surprise)
sync_filesystem(sb);
shrink_dcache_sb(sb);
- invalidate_inodes(sb);
+ evict_inodes(sb);
if (sb->s_op->shutdown)
sb->s_op->shutdown(sb);
diff --git a/fs/timerfd.c b/fs/timerfd.c
index 9f7eb451a60f..753e22e83e0f 100644
--- a/fs/timerfd.c
+++ b/fs/timerfd.c
@@ -439,15 +439,15 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags)
return ufd;
}
- file = anon_inode_getfile("[timerfd]", &timerfd_fops, ctx,
- O_RDWR | (flags & TFD_SHARED_FCNTL_FLAGS));
+ file = anon_inode_getfile_fmode("[timerfd]", &timerfd_fops, ctx,
+ O_RDWR | (flags & TFD_SHARED_FCNTL_FLAGS),
+ FMODE_NOWAIT);
if (IS_ERR(file)) {
put_unused_fd(ufd);
kfree(ctx);
return PTR_ERR(file);
}
- file->f_mode |= FMODE_NOWAIT;
fd_install(ufd, file);
return ufd;
}
diff --git a/include/linux/fs.h b/include/linux/fs.h
index b570138ff114..91b1e191530c 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -2,6 +2,7 @@
#ifndef _LINUX_FS_H
#define _LINUX_FS_H
+#include <linux/vfsdebug.h>
#include <linux/linkage.h>
#include <linux/wait_bit.h>
#include <linux/kdev_t.h>
@@ -790,19 +791,8 @@ struct inode {
static inline void inode_set_cached_link(struct inode *inode, char *link, int linklen)
{
- int testlen;
-
- /*
- * TODO: patch it into a debug-only check if relevant macros show up.
- * In the meantime, since we are suffering strlen even on production kernels
- * to find the right length, do a fixup if the wrong value got passed.
- */
- testlen = strlen(link);
- if (testlen != linklen) {
- WARN_ONCE(1, "bad length passed for symlink [%s] (got %d, expected %d)",
- link, linklen, testlen);
- linklen = testlen;
- }
+ VFS_WARN_ON_INODE(strlen(link) != linklen, inode);
+ VFS_WARN_ON_INODE(inode->i_opflags & IOP_CACHED_LINK, inode);
inode->i_link = link;
inode->i_linklen = linklen;
inode->i_opflags |= IOP_CACHED_LINK;
@@ -1067,7 +1057,6 @@ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index)
/**
* struct file - Represents a file
- * @f_ref: reference count
* @f_lock: Protects f_ep, f_flags. Must not be taken from IRQ context.
* @f_mode: FMODE_* flags often used in hotpaths
* @f_op: file operations
@@ -1077,12 +1066,12 @@ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index)
* @f_flags: file flags
* @f_iocb_flags: iocb flags
* @f_cred: stashed credentials of creator/opener
+ * @f_owner: file owner
* @f_path: path of the file
* @f_pos_lock: lock protecting file position
* @f_pipe: specific to pipes
* @f_pos: file position
* @f_security: LSM security context of this file
- * @f_owner: file owner
* @f_wb_err: writeback error
* @f_sb_err: per sb writeback errors
* @f_ep: link of all epoll hooks for this file
@@ -1090,9 +1079,9 @@ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index)
* @f_llist: work queue entrypoint
* @f_ra: file's readahead state
* @f_freeptr: Pointer used by SLAB_TYPESAFE_BY_RCU file cache (don't touch.)
+ * @f_ref: reference count
*/
struct file {
- file_ref_t f_ref;
spinlock_t f_lock;
fmode_t f_mode;
const struct file_operations *f_op;
@@ -1102,6 +1091,7 @@ struct file {
unsigned int f_flags;
unsigned int f_iocb_flags;
const struct cred *f_cred;
+ struct fown_struct *f_owner;
/* --- cacheline 1 boundary (64 bytes) --- */
struct path f_path;
union {
@@ -1115,7 +1105,6 @@ struct file {
void *f_security;
#endif
/* --- cacheline 2 boundary (128 bytes) --- */
- struct fown_struct *f_owner;
errseq_t f_wb_err;
errseq_t f_sb_err;
#ifdef CONFIG_EPOLL
@@ -1127,6 +1116,7 @@ struct file {
struct file_ra_state f_ra;
freeptr_t f_freeptr;
};
+ file_ref_t f_ref;
/* --- cacheline 3 boundary (192 bytes) --- */
} __randomize_layout
__attribute__((aligned(4))); /* lest something weird decides that 2 is OK */
@@ -2039,7 +2029,7 @@ int vfs_fchown(struct file *file, uid_t user, gid_t group);
int vfs_fchmod(struct file *file, umode_t mode);
int vfs_utimes(const struct path *path, struct timespec64 *times);
-extern long vfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
+int vfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
#ifdef CONFIG_COMPAT
extern long compat_ptr_ioctl(struct file *file, unsigned int cmd,
@@ -2791,13 +2781,13 @@ static inline bool is_idmapped_mnt(const struct vfsmount *mnt)
return mnt_idmap(mnt) != &nop_mnt_idmap;
}
-extern long vfs_truncate(const struct path *, loff_t);
+int vfs_truncate(const struct path *, loff_t);
int do_truncate(struct mnt_idmap *, struct dentry *, loff_t start,
unsigned int time_attrs, struct file *filp);
extern int vfs_fallocate(struct file *file, int mode, loff_t offset,
loff_t len);
-extern long do_sys_open(int dfd, const char __user *filename, int flags,
- umode_t mode);
+int do_sys_open(int dfd, const char __user *filename, int flags,
+ umode_t mode);
extern struct file *file_open_name(struct filename *, int, umode_t);
extern struct file *filp_open(const char *, int, umode_t);
extern struct file *file_open_root(const struct path *,
@@ -2848,7 +2838,10 @@ extern int filp_close(struct file *, fl_owner_t id);
extern struct filename *getname_flags(const char __user *, int);
extern struct filename *getname_uflags(const char __user *, int);
-extern struct filename *getname(const char __user *);
+static inline struct filename *getname(const char __user *name)
+{
+ return getname_flags(name, 0);
+}
extern struct filename *getname_kernel(const char *);
extern struct filename *__getname_maybe_null(const char __user *);
static inline struct filename *getname_maybe_null(const char __user *name, int flags)
@@ -2862,6 +2855,12 @@ static inline struct filename *getname_maybe_null(const char __user *name, int f
}
extern void putname(struct filename *name);
+static inline struct filename *refname(struct filename *name)
+{
+ atomic_inc(&name->refcnt);
+ return name;
+}
+
extern int finish_open(struct file *file, struct dentry *dentry,
int (*open)(struct inode *, struct file *));
extern int finish_no_open(struct file *file, struct dentry *dentry);
@@ -3294,7 +3293,11 @@ static inline void __iget(struct inode *inode)
extern void iget_failed(struct inode *);
extern void clear_inode(struct inode *);
extern void __destroy_inode(struct inode *);
-extern struct inode *new_inode_pseudo(struct super_block *sb);
+struct inode *alloc_inode(struct super_block *sb);
+static inline struct inode *new_inode_pseudo(struct super_block *sb)
+{
+ return alloc_inode(sb);
+}
extern struct inode *new_inode(struct super_block *sb);
extern void free_inode_nonrcu(struct inode *inode);
extern int setattr_should_drop_suidgid(struct mnt_idmap *, struct inode *);
diff --git a/include/linux/mm.h b/include/linux/mm.h
index 1f80baddacc5..2edb8d14d165 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -2555,7 +2555,7 @@ int __account_locked_vm(struct mm_struct *mm, unsigned long pages, bool inc,
struct task_struct *task, bool bypass_rlim);
struct kvec;
-struct page *get_dump_page(unsigned long addr);
+struct page *get_dump_page(unsigned long addr, int *locked);
bool folio_mark_dirty(struct folio *folio);
bool folio_mark_dirty_lock(struct folio *folio);
diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 47bfc6b1b632..f348e7005306 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -1044,21 +1044,23 @@ static inline pgoff_t page_pgoff(const struct folio *folio,
return folio->index + folio_page_idx(folio, page);
}
-/*
- * Return byte-offset into filesystem object for page.
+/**
+ * folio_pos - Returns the byte position of this folio in its file.
+ * @folio: The folio.
*/
-static inline loff_t page_offset(struct page *page)
+static inline loff_t folio_pos(const struct folio *folio)
{
- return ((loff_t)page->index) << PAGE_SHIFT;
+ return ((loff_t)folio->index) * PAGE_SIZE;
}
-/**
- * folio_pos - Returns the byte position of this folio in its file.
- * @folio: The folio.
+/*
+ * Return byte-offset into filesystem object for page.
*/
-static inline loff_t folio_pos(struct folio *folio)
+static inline loff_t page_offset(struct page *page)
{
- return page_offset(&folio->page);
+ struct folio *folio = page_folio(page);
+
+ return folio_pos(folio) + folio_page_idx(folio, page) * PAGE_SIZE;
}
/*
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index c6333204d451..bae4490c1dda 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -1266,14 +1266,14 @@ static inline long ksys_lchown(const char __user *filename, uid_t user,
AT_SYMLINK_NOFOLLOW);
}
-extern long do_sys_ftruncate(unsigned int fd, loff_t length, int small);
+int do_sys_ftruncate(unsigned int fd, loff_t length, int small);
static inline long ksys_ftruncate(unsigned int fd, loff_t length)
{
return do_sys_ftruncate(fd, length, 1);
}
-extern long do_sys_truncate(const char __user *pathname, loff_t length);
+int do_sys_truncate(const char __user *pathname, loff_t length);
static inline long ksys_truncate(const char __user *pathname, loff_t length)
{
diff --git a/include/linux/vfsdebug.h b/include/linux/vfsdebug.h
new file mode 100644
index 000000000000..9cf22d3eb9dd
--- /dev/null
+++ b/include/linux/vfsdebug.h
@@ -0,0 +1,45 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef LINUX_VFS_DEBUG_H
+#define LINUX_VFS_DEBUG_H 1
+
+#include <linux/bug.h>
+
+struct inode;
+
+#ifdef CONFIG_DEBUG_VFS
+void dump_inode(struct inode *inode, const char *reason);
+
+#define VFS_BUG_ON(cond) BUG_ON(cond)
+#define VFS_WARN_ON(cond) (void)WARN_ON(cond)
+#define VFS_WARN_ON_ONCE(cond) (void)WARN_ON_ONCE(cond)
+#define VFS_WARN_ONCE(cond, format...) (void)WARN_ONCE(cond, format)
+#define VFS_WARN(cond, format...) (void)WARN(cond, format)
+
+#define VFS_BUG_ON_INODE(cond, inode) ({ \
+ if (unlikely(!!(cond))) { \
+ dump_inode(inode, "VFS_BUG_ON_INODE(" #cond")");\
+ BUG_ON(1); \
+ } \
+})
+
+#define VFS_WARN_ON_INODE(cond, inode) ({ \
+ int __ret_warn = !!(cond); \
+ \
+ if (unlikely(__ret_warn)) { \
+ dump_inode(inode, "VFS_WARN_ON_INODE(" #cond")");\
+ WARN_ON(1); \
+ } \
+ unlikely(__ret_warn); \
+})
+#else
+#define VFS_BUG_ON(cond) BUILD_BUG_ON_INVALID(cond)
+#define VFS_WARN_ON(cond) BUILD_BUG_ON_INVALID(cond)
+#define VFS_WARN_ON_ONCE(cond) BUILD_BUG_ON_INVALID(cond)
+#define VFS_WARN_ONCE(cond, format...) BUILD_BUG_ON_INVALID(cond)
+#define VFS_WARN(cond, format...) BUILD_BUG_ON_INVALID(cond)
+
+#define VFS_BUG_ON_INODE(cond, inode) VFS_BUG_ON(cond)
+#define VFS_WARN_ON_INODE(cond, inode) BUILD_BUG_ON_INVALID(cond)
+#endif /* CONFIG_DEBUG_VFS */
+
+#endif
diff --git a/kernel/auditsc.c b/kernel/auditsc.c
index 9c853cde9abe..78fd876a5473 100644
--- a/kernel/auditsc.c
+++ b/kernel/auditsc.c
@@ -2207,10 +2207,8 @@ __audit_reusename(const __user char *uptr)
list_for_each_entry(n, &context->names_list, list) {
if (!n->name)
continue;
- if (n->name->uptr == uptr) {
- atomic_inc(&n->name->refcnt);
- return n->name;
- }
+ if (n->name->uptr == uptr)
+ return refname(n->name);
}
return NULL;
}
@@ -2237,7 +2235,7 @@ void __audit_getname(struct filename *name)
n->name = name;
n->name_len = AUDIT_NAME_FULL;
name->aname = n;
- atomic_inc(&name->refcnt);
+ refname(name);
}
static inline int audit_copy_fcaps(struct audit_names *name,
@@ -2369,7 +2367,7 @@ out_alloc:
return;
if (name) {
n->name = name;
- atomic_inc(&name->refcnt);
+ refname(name);
}
out:
@@ -2496,7 +2494,7 @@ void __audit_inode_child(struct inode *parent,
if (found_parent) {
found_child->name = found_parent->name;
found_child->name_len = AUDIT_NAME_FULL;
- atomic_inc(&found_child->name->refcnt);
+ refname(found_child->name);
}
}
diff --git a/kernel/kcmp.c b/kernel/kcmp.c
index 2c596851f8a9..7c1a65bd5f8d 100644
--- a/kernel/kcmp.c
+++ b/kernel/kcmp.c
@@ -145,7 +145,7 @@ SYSCALL_DEFINE5(kcmp, pid_t, pid1, pid_t, pid2, int, type,
*/
task1 = find_task_by_vpid(pid1);
task2 = find_task_by_vpid(pid2);
- if (!task1 || !task2)
+ if (unlikely(!task1 || !task2))
goto err_no_task;
get_task_struct(task1);
diff --git a/kernel/watch_queue.c b/kernel/watch_queue.c
index 5267adeaa403..41e4e8070923 100644
--- a/kernel/watch_queue.c
+++ b/kernel/watch_queue.c
@@ -269,6 +269,15 @@ long watch_queue_set_size(struct pipe_inode_info *pipe, unsigned int nr_notes)
if (ret < 0)
goto error;
+ /*
+ * pipe_resize_ring() does not update nr_accounted for watch_queue
+ * pipes, because the above vastly overprovisions. Set nr_accounted on
+ * and max_usage this pipe to the number that was actually charged to
+ * the user above via account_pipe_buffers.
+ */
+ pipe->max_usage = nr_pages;
+ pipe->nr_accounted = nr_pages;
+
ret = -ENOMEM;
pages = kcalloc(nr_pages, sizeof(struct page *), GFP_KERNEL);
if (!pages)
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index 35796c290ca3..b775ba54dcd8 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -808,6 +808,15 @@ config ARCH_HAS_DEBUG_VM_PGTABLE
An architecture should select this when it can successfully
build and run DEBUG_VM_PGTABLE.
+config DEBUG_VFS
+ bool "Debug VFS"
+ depends on DEBUG_KERNEL
+ help
+ Enable this to turn on extended checks in the VFS layer that may impact
+ performance.
+
+ If unsure, say N.
+
config DEBUG_VM_IRQSOFF
def_bool DEBUG_VM && !PREEMPT_RT
diff --git a/mm/gup.c b/mm/gup.c
index 3883b307780e..855ab860f88b 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -2254,6 +2254,7 @@ EXPORT_SYMBOL(fault_in_readable);
/**
* get_dump_page() - pin user page in memory while writing it to core dump
* @addr: user address
+ * @locked: a pointer to an int denoting whether the mmap sem is held
*
* Returns struct page pointer of user page pinned for dump,
* to be freed afterwards by put_page().
@@ -2266,13 +2267,12 @@ EXPORT_SYMBOL(fault_in_readable);
* Called without mmap_lock (takes and releases the mmap_lock by itself).
*/
#ifdef CONFIG_ELF_CORE
-struct page *get_dump_page(unsigned long addr)
+struct page *get_dump_page(unsigned long addr, int *locked)
{
struct page *page;
- int locked = 0;
int ret;
- ret = __get_user_pages_locked(current->mm, addr, 1, &page, &locked,
+ ret = __get_user_pages_locked(current->mm, addr, 1, &page, locked,
FOLL_FORCE | FOLL_DUMP | FOLL_GET);
return (ret == 1) ? page : NULL;
}
diff --git a/security/landlock/fs.c b/security/landlock/fs.c
index 71b9dc331aae..582769ae830e 100644
--- a/security/landlock/fs.c
+++ b/security/landlock/fs.c
@@ -1216,7 +1216,7 @@ static void hook_inode_free_security_rcu(void *inode_security)
/*
* Release the inodes used in a security policy.
*
- * Cf. fsnotify_unmount_inodes() and invalidate_inodes()
+ * Cf. fsnotify_unmount_inodes() and evict_inodes()
*/
static void hook_sb_delete(struct super_block *const sb)
{
diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index ba0327e2d0d3..16bee3b78219 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -4224,15 +4224,14 @@ static int kvm_vcpu_ioctl_get_stats_fd(struct kvm_vcpu *vcpu)
if (fd < 0)
return fd;
- file = anon_inode_getfile(name, &kvm_vcpu_stats_fops, vcpu, O_RDONLY);
+ file = anon_inode_getfile_fmode(name, &kvm_vcpu_stats_fops, vcpu,
+ O_RDONLY, FMODE_PREAD);
if (IS_ERR(file)) {
put_unused_fd(fd);
return PTR_ERR(file);
}
kvm_get_kvm(vcpu->kvm);
-
- file->f_mode |= FMODE_PREAD;
fd_install(fd, file);
return fd;
@@ -5020,16 +5019,14 @@ static int kvm_vm_ioctl_get_stats_fd(struct kvm *kvm)
if (fd < 0)
return fd;
- file = anon_inode_getfile("kvm-vm-stats",
- &kvm_vm_stats_fops, kvm, O_RDONLY);
+ file = anon_inode_getfile_fmode("kvm-vm-stats",
+ &kvm_vm_stats_fops, kvm, O_RDONLY, FMODE_PREAD);
if (IS_ERR(file)) {
put_unused_fd(fd);
return PTR_ERR(file);
}
kvm_get_kvm(kvm);
-
- file->f_mode |= FMODE_PREAD;
fd_install(fd, file);
return fd;