Diffstat (limited to 'fs')
65 files changed, 1054 insertions, 710 deletions
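Much of this series converts pseudo-filesystems away from the old d_add()/d_instantiate() + dget() idiom to the d_make_persistent()/d_make_discardable() helpers added in fs/dcache.c below, with simple_start_creating()/simple_done_creating() handling the parent-directory lookup and locking (as used by binfmt_misc and devpts here). As orientation before the diff body, a minimal sketch of the intended calling pattern follows; the "examplefs_" names, the new_inode()/i_mode details and the error handling are illustrative only, and the helpers' semantics are taken from how this series itself uses them, not from a documented API.

static int examplefs_add_entry(struct dentry *parent, const char *name)
{
	struct dentry *dentry = simple_start_creating(parent, name);
	struct inode *inode;

	if (IS_ERR(dentry))
		return PTR_ERR(dentry);

	inode = new_inode(parent->d_sb);
	if (!inode) {
		simple_done_creating(dentry);
		return -ENOMEM;
	}
	inode->i_mode = S_IFREG | 0644;

	/* hash, instantiate and pin in one step; replaces d_add() + dget() */
	d_make_persistent(dentry, inode);
	simple_done_creating(dentry);
	return 0;
}

static void examplefs_remove_entry(struct dentry *dentry)
{
	/*
	 * Drop the pin taken by d_make_persistent(); replaces the final
	 * dput().  Filesystem-specific unlink bookkeeping (nlink, fsnotify)
	 * is omitted here.
	 */
	d_make_discardable(dentry);
}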
diff --git a/fs/autofs/dev-ioctl.c b/fs/autofs/dev-ioctl.c index a58f9248b0f5..6743b3b64217 100644 --- a/fs/autofs/dev-ioctl.c +++ b/fs/autofs/dev-ioctl.c @@ -432,16 +432,6 @@ static int autofs_dev_ioctl_timeout(struct file *fp, if (!autofs_type_indirect(sbi->type)) return -EINVAL; - /* An expire timeout greater than the superblock timeout - * could be a problem at shutdown but the super block - * timeout itself can change so all we can really do is - * warn the user. - */ - if (timeout >= sbi->exp_timeout) - pr_warn("per-mount expire timeout is greater than " - "the parent autofs mount timeout which could " - "prevent shutdown\n"); - dentry = try_lookup_noperm(&QSTR_LEN(param->path, path_len), base); if (IS_ERR_OR_NULL(dentry)) @@ -470,6 +460,18 @@ static int autofs_dev_ioctl_timeout(struct file *fp, ino->flags |= AUTOFS_INF_EXPIRE_SET; ino->exp_timeout = timeout * HZ; } + + /* An expire timeout greater than the superblock timeout + * could be a problem at shutdown but the super block + * timeout itself can change so all we can really do is + * warn the user. + */ + if (ino->flags & AUTOFS_INF_EXPIRE_SET && + ino->exp_timeout > sbi->exp_timeout) + pr_warn("per-mount expire timeout is greater than " + "the parent autofs mount timeout which could " + "prevent shutdown\n"); + dput(dentry); } diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c index 732aee76a24c..b932b1719dfc 100644 --- a/fs/autofs/inode.c +++ b/fs/autofs/inode.c @@ -55,7 +55,7 @@ void autofs_kill_sb(struct super_block *sb) } pr_debug("shutting down\n"); - kill_litter_super(sb); + kill_anon_super(sb); if (sbi) kfree_rcu(sbi, rcu); } diff --git a/fs/autofs/root.c b/fs/autofs/root.c index d10df9d89d1c..2c31002b314a 100644 --- a/fs/autofs/root.c +++ b/fs/autofs/root.c @@ -602,9 +602,8 @@ static int autofs_dir_symlink(struct mnt_idmap *idmap, } inode->i_private = cp; inode->i_size = size; - d_add(dentry, inode); - dget(dentry); + d_make_persistent(dentry, inode); p_ino = autofs_dentry_ino(dentry->d_parent); p_ino->count++; @@ -631,12 +630,11 @@ static int autofs_dir_symlink(struct mnt_idmap *idmap, static int autofs_dir_unlink(struct inode *dir, struct dentry *dentry) { struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb); - struct autofs_info *ino = autofs_dentry_ino(dentry); struct autofs_info *p_ino; p_ino = autofs_dentry_ino(dentry->d_parent); p_ino->count--; - dput(ino->dentry); + d_make_discardable(dentry); d_inode(dentry)->i_size = 0; clear_nlink(d_inode(dentry)); @@ -718,7 +716,7 @@ static int autofs_dir_rmdir(struct inode *dir, struct dentry *dentry) p_ino = autofs_dentry_ino(dentry->d_parent); p_ino->count--; - dput(ino->dentry); + d_make_discardable(dentry); d_inode(dentry)->i_size = 0; clear_nlink(d_inode(dentry)); @@ -748,12 +746,11 @@ static struct dentry *autofs_dir_mkdir(struct mnt_idmap *idmap, inode = autofs_get_inode(dir->i_sb, S_IFDIR | mode); if (!inode) return ERR_PTR(-ENOMEM); - d_add(dentry, inode); if (sbi->version < 5) autofs_set_leaf_automount_flags(dentry); - dget(dentry); + d_make_persistent(dentry, inode); p_ino = autofs_dentry_ino(dentry->d_parent); p_ino->count++; inc_nlink(dir); diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index d7aec5b87c2b..8cb1a94339b8 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -765,14 +765,41 @@ static const struct file_operations bm_entry_operations = { /* /register */ +/* add to filesystem */ +static int add_entry(Node *e, struct super_block *sb) +{ + struct dentry *dentry = simple_start_creating(sb->s_root, e->name); + struct inode *inode; + struct binfmt_misc 
*misc; + + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + + inode = bm_get_inode(sb, S_IFREG | 0644); + if (unlikely(!inode)) { + simple_done_creating(dentry); + return -ENOMEM; + } + + refcount_set(&e->users, 1); + e->dentry = dentry; + inode->i_private = e; + inode->i_fop = &bm_entry_operations; + + d_make_persistent(dentry, inode); + misc = i_binfmt_misc(inode); + write_lock(&misc->entries_lock); + list_add(&e->list, &misc->entries); + write_unlock(&misc->entries_lock); + simple_done_creating(dentry); + return 0; +} + static ssize_t bm_register_write(struct file *file, const char __user *buffer, size_t count, loff_t *ppos) { Node *e; - struct inode *inode; struct super_block *sb = file_inode(file)->i_sb; - struct dentry *root = sb->s_root, *dentry; - struct binfmt_misc *misc; int err = 0; struct file *f = NULL; @@ -800,39 +827,7 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer, e->interp_file = f; } - inode_lock(d_inode(root)); - dentry = lookup_noperm(&QSTR(e->name), root); - err = PTR_ERR(dentry); - if (IS_ERR(dentry)) - goto out; - - err = -EEXIST; - if (d_really_is_positive(dentry)) - goto out2; - - inode = bm_get_inode(sb, S_IFREG | 0644); - - err = -ENOMEM; - if (!inode) - goto out2; - - refcount_set(&e->users, 1); - e->dentry = dget(dentry); - inode->i_private = e; - inode->i_fop = &bm_entry_operations; - - d_instantiate(dentry, inode); - misc = i_binfmt_misc(inode); - write_lock(&misc->entries_lock); - list_add(&e->list, &misc->entries); - write_unlock(&misc->entries_lock); - - err = 0; -out2: - dput(dentry); -out: - inode_unlock(d_inode(root)); - + err = add_entry(e, sb); if (err) { if (f) { exe_file_allow_write_access(f); @@ -1027,7 +1022,7 @@ static struct file_system_type bm_fs_type = { .name = "binfmt_misc", .init_fs_context = bm_init_fs_context, .fs_flags = FS_USERNS_MOUNT, - .kill_sb = kill_litter_super, + .kill_sb = kill_anon_super, }; MODULE_ALIAS_FS("binfmt_misc"); diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 7dda6cc68379..6b3357287b42 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -475,8 +475,8 @@ static noinline int add_ra_bio_pages(struct inode *inode, continue; } - folio = filemap_alloc_folio(mapping_gfp_constraint(mapping, - ~__GFP_FS), 0); + folio = filemap_alloc_folio(mapping_gfp_constraint(mapping, ~__GFP_FS), + 0, NULL); if (!folio) break; diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c index 06dfcb461f53..a2ac3fb68bc8 100644 --- a/fs/btrfs/verity.c +++ b/fs/btrfs/verity.c @@ -736,7 +736,7 @@ again: } folio = filemap_alloc_folio(mapping_gfp_constraint(inode->i_mapping, ~__GFP_FS), - 0); + 0, NULL); if (!folio) return ERR_PTR(-ENOMEM); diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index 81f4f06bc87e..ba95f636a5ab 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -400,8 +400,14 @@ static void remove_dir(struct dentry * d) configfs_remove_dirent(d); - if (d_really_is_positive(d)) - simple_rmdir(d_inode(parent),d); + if (d_really_is_positive(d)) { + if (likely(simple_empty(d))) { + __simple_rmdir(d_inode(parent),d); + dput(d); + } else { + pr_warn("remove_dir (%pd): attributes remain", d); + } + } pr_debug(" o %pd removing done (%d)\n", d, d_count(d)); @@ -598,7 +604,7 @@ static void detach_attrs(struct config_item * item) static int populate_attrs(struct config_item *item) { const struct config_item_type *t = item->ci_type; - struct configfs_group_operations *ops; + const struct configfs_group_operations *ops; struct configfs_attribute *attr; struct configfs_bin_attribute 
*bin_attr; int error = 0; diff --git a/fs/configfs/file.c b/fs/configfs/file.c index 0ad32150611e..affe4742bbb5 100644 --- a/fs/configfs/file.c +++ b/fs/configfs/file.c @@ -30,7 +30,7 @@ struct configfs_buffer { size_t count; loff_t pos; char * page; - struct configfs_item_operations * ops; + const struct configfs_item_operations *ops; struct mutex mutex; int needs_read_fill; bool read_in_progress; diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index 1d2e3a5738d1..bcda3372e141 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -211,7 +211,8 @@ void configfs_drop_dentry(struct configfs_dirent * sd, struct dentry * parent) dget_dlock(dentry); __d_drop(dentry); spin_unlock(&dentry->d_lock); - simple_unlink(d_inode(parent), dentry); + __simple_unlink(d_inode(parent), dentry); + dput(dentry); } else spin_unlock(&dentry->d_lock); } diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c index 456c4a2efb53..4929f3431189 100644 --- a/fs/configfs/mount.c +++ b/fs/configfs/mount.c @@ -116,7 +116,7 @@ static struct file_system_type configfs_fs_type = { .owner = THIS_MODULE, .name = "configfs", .init_fs_context = configfs_init_fs_context, - .kill_sb = kill_litter_super, + .kill_sb = kill_anon_super, }; MODULE_ALIAS_FS("configfs"); @@ -24,7 +24,7 @@ #include <linux/mmu_notifier.h> #include <linux/iomap.h> #include <linux/rmap.h> -#include <asm/pgalloc.h> +#include <linux/pgalloc.h> #define CREATE_TRACE_POINTS #include <trace/events/fs_dax.h> diff --git a/fs/dcache.c b/fs/dcache.c index 9143fd502def..dc2fff4811d1 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -870,6 +870,24 @@ locked: return false; } +static void finish_dput(struct dentry *dentry) + __releases(dentry->d_lock) + __releases(RCU) +{ + while (lock_for_kill(dentry)) { + rcu_read_unlock(); + dentry = __dentry_kill(dentry); + if (!dentry) + return; + if (retain_dentry(dentry, true)) { + spin_unlock(&dentry->d_lock); + return; + } + rcu_read_lock(); + } + rcu_read_unlock(); + spin_unlock(&dentry->d_lock); +} /* * This is dput @@ -907,22 +925,21 @@ void dput(struct dentry *dentry) rcu_read_unlock(); return; } - while (lock_for_kill(dentry)) { - rcu_read_unlock(); - dentry = __dentry_kill(dentry); - if (!dentry) - return; - if (retain_dentry(dentry, true)) { - spin_unlock(&dentry->d_lock); - return; - } - rcu_read_lock(); - } - rcu_read_unlock(); - spin_unlock(&dentry->d_lock); + finish_dput(dentry); } EXPORT_SYMBOL(dput); +void d_make_discardable(struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + WARN_ON(!(dentry->d_flags & DCACHE_PERSISTENT)); + dentry->d_flags &= ~DCACHE_PERSISTENT; + dentry->d_lockref.count--; + rcu_read_lock(); + finish_dput(dentry); +} +EXPORT_SYMBOL(d_make_discardable); + static void to_shrink_list(struct dentry *dentry, struct list_head *list) __must_hold(&dentry->d_lock) { @@ -1087,6 +1104,15 @@ struct dentry *d_find_alias_rcu(struct inode *inode) return de; } +void d_dispose_if_unused(struct dentry *dentry, struct list_head *dispose) +{ + spin_lock(&dentry->d_lock); + if (!dentry->d_lockref.count) + to_shrink_list(dentry, dispose); + spin_unlock(&dentry->d_lock); +} +EXPORT_SYMBOL(d_dispose_if_unused); + /* * Try to kill dentries associated with this inode. * WARNING: you must own a reference to inode. 
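The newly exported d_dispose_if_unused() above and shrink_dentry_list() (exported in the next hunk) are meant to be used together the way d_prune_aliases() uses them. A minimal sketch of that pattern, with the surrounding function and the locking context illustrative only:

	LIST_HEAD(dispose);
	struct dentry *alias;

	/* collect aliases of @inode that nobody holds a count on, then reap them */
	spin_lock(&inode->i_lock);
	hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias)
		d_dispose_if_unused(alias, &dispose);
	spin_unlock(&inode->i_lock);
	shrink_dentry_list(&dispose);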
@@ -1097,12 +1123,8 @@ void d_prune_aliases(struct inode *inode) struct dentry *dentry; spin_lock(&inode->i_lock); - hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) { - spin_lock(&dentry->d_lock); - if (!dentry->d_lockref.count) - to_shrink_list(dentry, &dispose); - spin_unlock(&dentry->d_lock); - } + hlist_for_each_entry(dentry, &inode->i_dentry, d_u.d_alias) + d_dispose_if_unused(dentry, &dispose); spin_unlock(&inode->i_lock); shrink_dentry_list(&dispose); } @@ -1142,6 +1164,7 @@ void shrink_dentry_list(struct list_head *list) shrink_kill(dentry); } } +EXPORT_SYMBOL(shrink_dentry_list); static enum lru_status dentry_lru_isolate(struct list_head *item, struct list_lru_one *lru, void *arg) @@ -1512,6 +1535,15 @@ out: return ret; } +static enum d_walk_ret select_collect_umount(void *_data, struct dentry *dentry) +{ + if (dentry->d_flags & DCACHE_PERSISTENT) { + dentry->d_flags &= ~DCACHE_PERSISTENT; + dentry->d_lockref.count--; + } + return select_collect(_data, dentry); +} + static enum d_walk_ret select_collect2(void *_data, struct dentry *dentry) { struct select_data *data = _data; @@ -1540,18 +1572,20 @@ out: } /** - * shrink_dcache_parent - prune dcache + * shrink_dcache_tree - prune dcache * @parent: parent of entries to prune + * @for_umount: true if we want to unpin the persistent ones * * Prune the dcache to remove unused children of the parent dentry. */ -void shrink_dcache_parent(struct dentry *parent) +static void shrink_dcache_tree(struct dentry *parent, bool for_umount) { for (;;) { struct select_data data = {.start = parent}; INIT_LIST_HEAD(&data.dispose); - d_walk(parent, &data, select_collect); + d_walk(parent, &data, + for_umount ? select_collect_umount : select_collect); if (!list_empty(&data.dispose)) { shrink_dentry_list(&data.dispose); @@ -1576,6 +1610,11 @@ void shrink_dcache_parent(struct dentry *parent) shrink_dentry_list(&data.dispose); } } + +void shrink_dcache_parent(struct dentry *parent) +{ + shrink_dcache_tree(parent, false); +} EXPORT_SYMBOL(shrink_dcache_parent); static enum d_walk_ret umount_check(void *_data, struct dentry *dentry) @@ -1602,7 +1641,7 @@ static enum d_walk_ret umount_check(void *_data, struct dentry *dentry) static void do_one_tree(struct dentry *dentry) { - shrink_dcache_parent(dentry); + shrink_dcache_tree(dentry, true); d_walk(dentry, dentry, umount_check); d_drop(dentry); dput(dentry); @@ -1924,7 +1963,6 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode) unsigned add_flags = d_flags_for_inode(inode); WARN_ON(d_in_lookup(dentry)); - spin_lock(&dentry->d_lock); /* * The negative counter only tracks dentries on the LRU. Don't dec if * d_lru is on another list. 
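This and the following hunk move the d_lock acquisition out of __d_instantiate(); callers are now expected to hold both locks themselves, in inode-then-dentry order. A short sketch of the resulting contract, matching what d_instantiate() and d_instantiate_new() do below:

	spin_lock(&inode->i_lock);
	spin_lock(&dentry->d_lock);
	__d_instantiate(dentry, inode);
	spin_unlock(&dentry->d_lock);
	spin_unlock(&inode->i_lock);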
@@ -1937,7 +1975,6 @@ static void __d_instantiate(struct dentry *dentry, struct inode *inode) __d_set_inode_and_type(dentry, inode, add_flags); raw_write_seqcount_end(&dentry->d_seq); fsnotify_update_flags(dentry); - spin_unlock(&dentry->d_lock); } /** @@ -1961,7 +1998,9 @@ void d_instantiate(struct dentry *entry, struct inode * inode) if (inode) { security_d_instantiate(entry, inode); spin_lock(&inode->i_lock); + spin_lock(&entry->d_lock); __d_instantiate(entry, inode); + spin_unlock(&entry->d_lock); spin_unlock(&inode->i_lock); } } @@ -1980,7 +2019,9 @@ void d_instantiate_new(struct dentry *entry, struct inode *inode) lockdep_annotate_inode_mutex_key(inode); security_d_instantiate(entry, inode); spin_lock(&inode->i_lock); + spin_lock(&entry->d_lock); __d_instantiate(entry, inode); + spin_unlock(&entry->d_lock); WARN_ON(!(inode_state_read(inode) & I_NEW)); inode_state_clear(inode, I_NEW | I_CREATING); inode_wake_up_bit(inode, __I_NEW); @@ -2742,6 +2783,24 @@ void d_add(struct dentry *entry, struct inode *inode) } EXPORT_SYMBOL(d_add); +struct dentry *d_make_persistent(struct dentry *dentry, struct inode *inode) +{ + WARN_ON(!hlist_unhashed(&dentry->d_u.d_alias)); + WARN_ON(!inode); + security_d_instantiate(dentry, inode); + spin_lock(&inode->i_lock); + spin_lock(&dentry->d_lock); + __d_instantiate(dentry, inode); + dentry->d_flags |= DCACHE_PERSISTENT; + dget_dlock(dentry); + if (d_unhashed(dentry)) + __d_rehash(dentry); + spin_unlock(&dentry->d_lock); + spin_unlock(&inode->i_lock); + return dentry; +} +EXPORT_SYMBOL(d_make_persistent); + static void swap_names(struct dentry *dentry, struct dentry *target) { if (unlikely(dname_external(target))) { @@ -3111,26 +3170,6 @@ bool is_subdir(struct dentry *new_dentry, struct dentry *old_dentry) } EXPORT_SYMBOL(is_subdir); -static enum d_walk_ret d_genocide_kill(void *data, struct dentry *dentry) -{ - struct dentry *root = data; - if (dentry != root) { - if (d_unhashed(dentry) || !dentry->d_inode) - return D_WALK_SKIP; - - if (!(dentry->d_flags & DCACHE_GENOCIDE)) { - dentry->d_flags |= DCACHE_GENOCIDE; - dentry->d_lockref.count--; - } - } - return D_WALK_CONTINUE; -} - -void d_genocide(struct dentry *parent) -{ - d_walk(parent, parent, d_genocide_kill); -} - void d_mark_tmpfile(struct file *file, struct inode *inode) { struct dentry *dentry = file->f_path.dentry; diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 532bd7c46baf..4b263c328ed2 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -35,7 +35,7 @@ static struct vfsmount *debugfs_mount; static int debugfs_mount_count; static bool debugfs_registered; -static unsigned int debugfs_allow __ro_after_init = DEFAULT_DEBUGFS_ALLOW_BITS; +static bool debugfs_enabled __ro_after_init = IS_ENABLED(CONFIG_DEBUG_FS_ALLOW_ALL); /* * Don't allow access attributes to be changed whilst the kernel is locked down @@ -287,9 +287,6 @@ static int debugfs_get_tree(struct fs_context *fc) { int err; - if (!(debugfs_allow & DEBUGFS_ALLOW_API)) - return -EPERM; - err = get_tree_single(fc, debugfs_fill_super); if (err) return err; @@ -329,7 +326,7 @@ static struct file_system_type debug_fs_type = { .name = "debugfs", .init_fs_context = debugfs_init_fs_context, .parameters = debugfs_param_specs, - .kill_sb = kill_litter_super, + .kill_sb = kill_anon_super, }; MODULE_ALIAS_FS("debugfs"); @@ -368,7 +365,7 @@ static struct dentry *debugfs_start_creating(const char *name, struct dentry *dentry; int error; - if (!(debugfs_allow & DEBUGFS_ALLOW_API)) + if (!debugfs_enabled) return ERR_PTR(-EPERM); if 
(!debugfs_initialized()) @@ -405,16 +402,15 @@ static struct dentry *debugfs_start_creating(const char *name, static struct dentry *debugfs_failed_creating(struct dentry *dentry) { - inode_unlock(d_inode(dentry->d_parent)); - dput(dentry); + simple_done_creating(dentry); simple_release_fs(&debugfs_mount, &debugfs_mount_count); return ERR_PTR(-ENOMEM); } static struct dentry *debugfs_end_creating(struct dentry *dentry) { - inode_unlock(d_inode(dentry->d_parent)); - return dentry; + simple_done_creating(dentry); + return dentry; // borrowed } static struct dentry *__debugfs_create_file(const char *name, umode_t mode, @@ -434,11 +430,6 @@ static struct dentry *__debugfs_create_file(const char *name, umode_t mode, if (IS_ERR(dentry)) return dentry; - if (!(debugfs_allow & DEBUGFS_ALLOW_API)) { - debugfs_failed_creating(dentry); - return ERR_PTR(-EPERM); - } - inode = debugfs_get_inode(dentry->d_sb); if (unlikely(!inode)) { pr_err("out of free dentries, can not create file '%s'\n", @@ -456,7 +447,7 @@ static struct dentry *__debugfs_create_file(const char *name, umode_t mode, DEBUGFS_I(inode)->raw = real_fops; DEBUGFS_I(inode)->aux = (void *)aux; - d_instantiate(dentry, inode); + d_make_persistent(dentry, inode); fsnotify_create(d_inode(dentry->d_parent), dentry); return debugfs_end_creating(dentry); } @@ -584,11 +575,6 @@ struct dentry *debugfs_create_dir(const char *name, struct dentry *parent) if (IS_ERR(dentry)) return dentry; - if (!(debugfs_allow & DEBUGFS_ALLOW_API)) { - debugfs_failed_creating(dentry); - return ERR_PTR(-EPERM); - } - inode = debugfs_get_inode(dentry->d_sb); if (unlikely(!inode)) { pr_err("out of free dentries, can not create directory '%s'\n", @@ -602,7 +588,7 @@ struct dentry *debugfs_create_dir(const char *name, struct dentry *parent) /* directory inodes start off with i_nlink == 2 (for "." entry) */ inc_nlink(inode); - d_instantiate(dentry, inode); + d_make_persistent(dentry, inode); inc_nlink(d_inode(dentry->d_parent)); fsnotify_mkdir(d_inode(dentry->d_parent), dentry); return debugfs_end_creating(dentry); @@ -631,11 +617,6 @@ struct dentry *debugfs_create_automount(const char *name, if (IS_ERR(dentry)) return dentry; - if (!(debugfs_allow & DEBUGFS_ALLOW_API)) { - debugfs_failed_creating(dentry); - return ERR_PTR(-EPERM); - } - inode = debugfs_get_inode(dentry->d_sb); if (unlikely(!inode)) { pr_err("out of free dentries, can not create automount '%s'\n", @@ -649,7 +630,7 @@ struct dentry *debugfs_create_automount(const char *name, DEBUGFS_I(inode)->automount = f; /* directory inodes start off with i_nlink == 2 (for "." 
entry) */ inc_nlink(inode); - d_instantiate(dentry, inode); + d_make_persistent(dentry, inode); inc_nlink(d_inode(dentry->d_parent)); fsnotify_mkdir(d_inode(dentry->d_parent), dentry); return debugfs_end_creating(dentry); @@ -704,7 +685,7 @@ struct dentry *debugfs_create_symlink(const char *name, struct dentry *parent, inode->i_mode = S_IFLNK | S_IRWXUGO; inode->i_op = &debugfs_symlink_inode_operations; inode->i_link = link; - d_instantiate(dentry, inode); + d_make_persistent(dentry, inode); return debugfs_end_creating(dentry); } EXPORT_SYMBOL_GPL(debugfs_create_symlink); @@ -899,21 +880,25 @@ static int __init debugfs_kernel(char *str) { if (str) { if (!strcmp(str, "on")) - debugfs_allow = DEBUGFS_ALLOW_API | DEBUGFS_ALLOW_MOUNT; - else if (!strcmp(str, "no-mount")) - debugfs_allow = DEBUGFS_ALLOW_API; + debugfs_enabled = true; else if (!strcmp(str, "off")) - debugfs_allow = 0; + debugfs_enabled = false; + else if (!strcmp(str, "no-mount")) { + pr_notice("debugfs=no-mount is a deprecated alias " + "for debugfs=off\n"); + debugfs_enabled = false; + } } return 0; } early_param("debugfs", debugfs_kernel); + static int __init debugfs_init(void) { int retval; - if (!(debugfs_allow & DEBUGFS_ALLOW_MOUNT)) + if (!debugfs_enabled) return -EPERM; retval = sysfs_create_mount_point(kernel_kobj, "debug"); diff --git a/fs/debugfs/internal.h b/fs/debugfs/internal.h index 427987f81571..c95699b27a56 100644 --- a/fs/debugfs/internal.h +++ b/fs/debugfs/internal.h @@ -55,17 +55,4 @@ enum { HAS_IOCTL = 16 }; -#define DEBUGFS_ALLOW_API BIT(0) -#define DEBUGFS_ALLOW_MOUNT BIT(1) - -#ifdef CONFIG_DEBUG_FS_ALLOW_ALL -#define DEFAULT_DEBUGFS_ALLOW_BITS (DEBUGFS_ALLOW_MOUNT | DEBUGFS_ALLOW_API) -#endif -#ifdef CONFIG_DEBUG_FS_DISALLOW_MOUNT -#define DEFAULT_DEBUGFS_ALLOW_BITS (DEBUGFS_ALLOW_API) -#endif -#ifdef CONFIG_DEBUG_FS_ALLOW_NONE -#define DEFAULT_DEBUGFS_ALLOW_BITS (0) -#endif - #endif /* _DEBUGFS_INTERNAL_H_ */ diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index fdf22264a8e9..9f3de528c358 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -102,7 +102,7 @@ struct pts_fs_info { struct ida allocated_ptys; struct pts_mount_opts mount_opts; struct super_block *sb; - struct dentry *ptmx_dentry; + struct inode *ptmx_inode; // borrowed }; static inline struct pts_fs_info *DEVPTS_SB(struct super_block *sb) @@ -259,7 +259,6 @@ static int devpts_parse_param(struct fs_context *fc, struct fs_parameter *param) static int mknod_ptmx(struct super_block *sb, struct fs_context *fc) { int mode; - int rc = -ENOMEM; struct dentry *dentry; struct inode *inode; struct dentry *root = sb->s_root; @@ -268,18 +267,10 @@ static int mknod_ptmx(struct super_block *sb, struct fs_context *fc) kuid_t ptmx_uid = current_fsuid(); kgid_t ptmx_gid = current_fsgid(); - inode_lock(d_inode(root)); - - /* If we have already created ptmx node, return */ - if (fsi->ptmx_dentry) { - rc = 0; - goto out; - } - - dentry = d_alloc_name(root, "ptmx"); - if (!dentry) { + dentry = simple_start_creating(root, "ptmx"); + if (IS_ERR(dentry)) { pr_err("Unable to alloc dentry for ptmx node\n"); - goto out; + return PTR_ERR(dentry); } /* @@ -287,9 +278,9 @@ static int mknod_ptmx(struct super_block *sb, struct fs_context *fc) */ inode = new_inode(sb); if (!inode) { + simple_done_creating(dentry); pr_err("Unable to alloc inode for ptmx node\n"); - dput(dentry); - goto out; + return -ENOMEM; } inode->i_ino = 2; @@ -299,23 +290,18 @@ static int mknod_ptmx(struct super_block *sb, struct fs_context *fc) init_special_inode(inode, mode, MKDEV(TTYAUX_MAJOR, 
2)); inode->i_uid = ptmx_uid; inode->i_gid = ptmx_gid; + fsi->ptmx_inode = inode; - d_add(dentry, inode); + d_make_persistent(dentry, inode); - fsi->ptmx_dentry = dentry; - rc = 0; -out: - inode_unlock(d_inode(root)); - return rc; + simple_done_creating(dentry); + + return 0; } static void update_ptmx_mode(struct pts_fs_info *fsi) { - struct inode *inode; - if (fsi->ptmx_dentry) { - inode = d_inode(fsi->ptmx_dentry); - inode->i_mode = S_IFCHR|fsi->mount_opts.ptmxmode; - } + fsi->ptmx_inode->i_mode = S_IFCHR|fsi->mount_opts.ptmxmode; } static int devpts_reconfigure(struct fs_context *fc) @@ -461,7 +447,7 @@ static void devpts_kill_sb(struct super_block *sb) if (fsi) ida_destroy(&fsi->allocated_ptys); kfree(fsi); - kill_litter_super(sb); + kill_anon_super(sb); } static struct file_system_type devpts_fs_type = { @@ -534,16 +520,15 @@ struct dentry *devpts_pty_new(struct pts_fs_info *fsi, int index, void *priv) sprintf(s, "%d", index); dentry = d_alloc_name(root, s); - if (dentry) { - dentry->d_fsdata = priv; - d_add(dentry, inode); - fsnotify_create(d_inode(root), dentry); - } else { + if (!dentry) { iput(inode); - dentry = ERR_PTR(-ENOMEM); + return ERR_PTR(-ENOMEM); } - - return dentry; + dentry->d_fsdata = priv; + d_make_persistent(dentry, inode); + fsnotify_create(d_inode(root), dentry); + dput(dentry); + return dentry; // borrowed } /** @@ -573,7 +558,7 @@ void devpts_pty_kill(struct dentry *dentry) drop_nlink(dentry->d_inode); d_drop(dentry); fsnotify_unlink(d_inode(dentry->d_parent), dentry); - dput(dentry); /* d_alloc_name() in devpts_pty_new() */ + d_make_discardable(dentry); } static int __init init_devpts_fs(void) diff --git a/fs/efivarfs/inode.c b/fs/efivarfs/inode.c index 2891614abf8d..95dcad83da11 100644 --- a/fs/efivarfs/inode.c +++ b/fs/efivarfs/inode.c @@ -113,8 +113,7 @@ static int efivarfs_create(struct mnt_idmap *idmap, struct inode *dir, inode->i_private = var; - d_instantiate(dentry, inode); - dget(dentry); + d_make_persistent(dentry, inode); return 0; } @@ -126,9 +125,7 @@ static int efivarfs_unlink(struct inode *dir, struct dentry *dentry) if (efivar_entry_delete(var)) return -EINVAL; - drop_nlink(d_inode(dentry)); - dput(dentry); - return 0; + return simple_unlink(dir, dentry); }; const struct inode_operations efivarfs_dir_inode_operations = { diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index 6de97565d5f7..9da992925920 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -278,7 +278,8 @@ static int efivarfs_create_dentry(struct super_block *sb, efi_char16_t *name16, inode->i_private = entry; i_size_write(inode, size + sizeof(__u32)); /* attributes + data */ inode_unlock(inode); - d_add(dentry, inode); + d_make_persistent(dentry, inode); + dput(dentry); return 0; @@ -522,7 +523,7 @@ static void efivarfs_kill_sb(struct super_block *sb) struct efivarfs_fs_info *sfi = sb->s_fs_info; blocking_notifier_chain_unregister(&efivar_ops_nh, &sfi->nb); - kill_litter_super(sb); + kill_anon_super(sb); kfree(sfi); } diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 461a929e0825..65da21504632 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -562,7 +562,7 @@ static void z_erofs_bind_cache(struct z_erofs_frontend *fe) * Allocate a managed folio for cached I/O, or it may be * then filled with a file-backed folio for in-place I/O */ - newfolio = filemap_alloc_folio(gfp, 0); + newfolio = filemap_alloc_folio(gfp, 0, NULL); if (!newfolio) continue; newfolio->private = Z_EROFS_PREALLOCATED_FOLIO; diff --git a/fs/exfat/balloc.c b/fs/exfat/balloc.c index 
2d2d510f2372..5429041c7eaf 100644 --- a/fs/exfat/balloc.c +++ b/fs/exfat/balloc.c @@ -106,7 +106,7 @@ static int exfat_allocate_bitmap(struct super_block *sb, (PAGE_SHIFT - sb->s_blocksize_bits); for (i = 0; i < sbi->map_sectors; i++) { /* Trigger the next readahead in advance. */ - if (0 == (i % max_ra_count)) { + if (max_ra_count && 0 == (i % max_ra_count)) { blk_start_plug(&plug); for (j = i; j < min(max_ra_count, sbi->map_sectors - i) + i; j++) sb_breadahead(sb, sector + j); @@ -183,11 +183,10 @@ void exfat_free_bitmap(struct exfat_sb_info *sbi) kvfree(sbi->vol_amap); } -int exfat_set_bitmap(struct inode *inode, unsigned int clu, bool sync) +int exfat_set_bitmap(struct super_block *sb, unsigned int clu, bool sync) { int i, b; unsigned int ent_idx; - struct super_block *sb = inode->i_sb; struct exfat_sb_info *sbi = EXFAT_SB(sb); if (!is_valid_cluster(sbi, clu)) @@ -202,11 +201,10 @@ int exfat_set_bitmap(struct inode *inode, unsigned int clu, bool sync) return 0; } -int exfat_clear_bitmap(struct inode *inode, unsigned int clu, bool sync) +int exfat_clear_bitmap(struct super_block *sb, unsigned int clu, bool sync) { int i, b; unsigned int ent_idx; - struct super_block *sb = inode->i_sb; struct exfat_sb_info *sbi = EXFAT_SB(sb); if (!is_valid_cluster(sbi, clu)) @@ -226,6 +224,28 @@ int exfat_clear_bitmap(struct inode *inode, unsigned int clu, bool sync) return 0; } +bool exfat_test_bitmap(struct super_block *sb, unsigned int clu) +{ + int i, b; + unsigned int ent_idx; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + + if (!sbi->vol_amap) + return true; + + if (!is_valid_cluster(sbi, clu)) + return false; + + ent_idx = CLUSTER_TO_BITMAP_ENT(clu); + i = BITMAP_OFFSET_SECTOR_INDEX(sb, ent_idx); + b = BITMAP_OFFSET_BIT_IN_SECTOR(sb, ent_idx); + + if (!test_bit_le(b, sbi->vol_amap[i]->b_data)) + return false; + + return true; +} + /* * If the value of "clu" is 0, it means cluster 2 which is the first cluster of * the cluster heap. 
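exfat_test_bitmap() added above is a read-only check of the allocation bitmap; the callers added later in this patch (exfat_find_location() in dir.c and __exfat_fill_super()) use it to reject on-disk entries that point at clusters the bitmap says are free. A hedged sketch of the calling pattern, with the error handling illustrative:

	if (!exfat_test_bitmap(sb, clu)) {
		exfat_err(sb, "entry points at unallocated cluster %u", clu);
		return -EIO;
	}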
diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c index 7229146fe2bf..3045a58e124a 100644 --- a/fs/exfat/dir.c +++ b/fs/exfat/dir.c @@ -604,6 +604,11 @@ static int exfat_find_location(struct super_block *sb, struct exfat_chain *p_dir if (ret) return ret; + if (!exfat_test_bitmap(sb, clu)) { + exfat_err(sb, "failed to test cluster bit(%u)", clu); + return -EIO; + } + /* byte offset in cluster */ off = EXFAT_CLU_OFFSET(off, sbi); diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h index 38210fb6901c..176fef62574c 100644 --- a/fs/exfat/exfat_fs.h +++ b/fs/exfat/exfat_fs.h @@ -452,8 +452,9 @@ int exfat_count_num_clusters(struct super_block *sb, /* balloc.c */ int exfat_load_bitmap(struct super_block *sb); void exfat_free_bitmap(struct exfat_sb_info *sbi); -int exfat_set_bitmap(struct inode *inode, unsigned int clu, bool sync); -int exfat_clear_bitmap(struct inode *inode, unsigned int clu, bool sync); +int exfat_set_bitmap(struct super_block *sb, unsigned int clu, bool sync); +int exfat_clear_bitmap(struct super_block *sb, unsigned int clu, bool sync); +bool exfat_test_bitmap(struct super_block *sb, unsigned int clu); unsigned int exfat_find_free_bitmap(struct super_block *sb, unsigned int clu); int exfat_count_used_clusters(struct super_block *sb, unsigned int *ret_count); int exfat_trim_fs(struct inode *inode, struct fstrim_range *range); diff --git a/fs/exfat/fatent.c b/fs/exfat/fatent.c index 825083634ba2..c9c5f2e3a05e 100644 --- a/fs/exfat/fatent.c +++ b/fs/exfat/fatent.c @@ -205,7 +205,7 @@ static int __exfat_free_cluster(struct inode *inode, struct exfat_chain *p_chain cur_cmap_i = next_cmap_i; } - err = exfat_clear_bitmap(inode, clu, (sync && IS_DIRSYNC(inode))); + err = exfat_clear_bitmap(sb, clu, (sync && IS_DIRSYNC(inode))); if (err) break; clu++; @@ -233,7 +233,7 @@ static int __exfat_free_cluster(struct inode *inode, struct exfat_chain *p_chain cur_cmap_i = next_cmap_i; } - if (exfat_clear_bitmap(inode, clu, (sync && IS_DIRSYNC(inode)))) + if (exfat_clear_bitmap(sb, clu, (sync && IS_DIRSYNC(inode)))) break; if (sbi->options.discard) { @@ -409,7 +409,7 @@ int exfat_alloc_cluster(struct inode *inode, unsigned int num_alloc, } /* update allocation bitmap */ - if (exfat_set_bitmap(inode, new_clu, sync_bmap)) { + if (exfat_set_bitmap(sb, new_clu, sync_bmap)) { ret = -EIO; goto free_cluster; } diff --git a/fs/exfat/file.c b/fs/exfat/file.c index adc37b4d7fc2..536c8078f0c1 100644 --- a/fs/exfat/file.c +++ b/fs/exfat/file.c @@ -25,6 +25,8 @@ static int exfat_cont_expand(struct inode *inode, loff_t size) struct exfat_sb_info *sbi = EXFAT_SB(sb); struct exfat_chain clu; + truncate_pagecache(inode, i_size_read(inode)); + ret = inode_newsize_ok(inode, size); if (ret) return ret; @@ -639,6 +641,9 @@ static ssize_t exfat_file_write_iter(struct kiocb *iocb, struct iov_iter *iter) inode_lock(inode); + if (pos > i_size_read(inode)) + truncate_pagecache(inode, i_size_read(inode)); + valid_size = ei->valid_size; ret = generic_write_checks(iocb, iter); diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c index 745dce29ddb5..dfe957493d49 100644 --- a/fs/exfat/namei.c +++ b/fs/exfat/namei.c @@ -645,16 +645,6 @@ static int exfat_find(struct inode *dir, const struct qstr *qname, info->valid_size = le64_to_cpu(ep2->dentry.stream.valid_size); info->size = le64_to_cpu(ep2->dentry.stream.size); - if (info->valid_size < 0) { - exfat_fs_error(sb, "data valid size is invalid(%lld)", info->valid_size); - return -EIO; - } - - if (unlikely(EXFAT_B_TO_CLU_ROUND_UP(info->size, sbi) > sbi->used_clusters)) { - 
exfat_fs_error(sb, "data size is invalid(%lld)", info->size); - return -EIO; - } - info->start_clu = le32_to_cpu(ep2->dentry.stream.start_clu); if (!is_valid_cluster(sbi, info->start_clu) && info->size) { exfat_warn(sb, "start_clu is invalid cluster(0x%x)", @@ -692,6 +682,16 @@ static int exfat_find(struct inode *dir, const struct qstr *qname, 0); exfat_put_dentry_set(&es, false); + if (info->valid_size < 0) { + exfat_fs_error(sb, "data valid size is invalid(%lld)", info->valid_size); + return -EIO; + } + + if (unlikely(EXFAT_B_TO_CLU_ROUND_UP(info->size, sbi) > sbi->used_clusters)) { + exfat_fs_error(sb, "data size is invalid(%lld)", info->size); + return -EIO; + } + if (ei->start_clu == EXFAT_FREE_CLUSTER) { exfat_fs_error(sb, "non-zero size file starts with zero cluster (size : %llu, p_dir : %u, entry : 0x%08x)", diff --git a/fs/exfat/super.c b/fs/exfat/super.c index 74d451f732c7..10e872a99663 100644 --- a/fs/exfat/super.c +++ b/fs/exfat/super.c @@ -629,6 +629,17 @@ static int __exfat_fill_super(struct super_block *sb, goto free_bh; } + if (!exfat_test_bitmap(sb, sbi->root_dir)) { + exfat_warn(sb, "failed to test first cluster bit of root dir(%u)", + sbi->root_dir); + /* + * The first cluster bit of the root directory should never + * be unset except when storage is corrupted. This bit is + * set to allow operations after mount. + */ + exfat_set_bitmap(sb, sbi->root_dir, false); + } + ret = exfat_count_used_clusters(sb, &sbi->used_clusters); if (ret) { exfat_err(sb, "failed to scan clusters"); @@ -813,10 +824,21 @@ static int exfat_init_fs_context(struct fs_context *fc) ratelimit_state_init(&sbi->ratelimit, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); - sbi->options.fs_uid = current_uid(); - sbi->options.fs_gid = current_gid(); - sbi->options.fs_fmask = current->fs->umask; - sbi->options.fs_dmask = current->fs->umask; + if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE && fc->root) { + struct super_block *sb = fc->root->d_sb; + struct exfat_mount_options *cur_opts = &EXFAT_SB(sb)->options; + + sbi->options.fs_uid = cur_opts->fs_uid; + sbi->options.fs_gid = cur_opts->fs_gid; + sbi->options.fs_fmask = cur_opts->fs_fmask; + sbi->options.fs_dmask = cur_opts->fs_dmask; + } else { + sbi->options.fs_uid = current_uid(); + sbi->options.fs_gid = current_gid(); + sbi->options.fs_fmask = current->fs->umask; + sbi->options.fs_dmask = current->fs->umask; + } + sbi->options.allow_utime = -1; sbi->options.errors = EXFAT_ERRORS_RO; exfat_set_iocharset(&sbi->options, exfat_default_iocharset); diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index be53e06caf3d..d7e6f563b3e4 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -1947,7 +1947,7 @@ static void f2fs_cache_compressed_page(struct f2fs_sb_info *sbi, return; } - cfolio = filemap_alloc_folio(__GFP_NOWARN | __GFP_IO, 0); + cfolio = filemap_alloc_folio(__GFP_NOWARN | __GFP_IO, 0, NULL); if (!cfolio) return; diff --git a/fs/fuse/control.c b/fs/fuse/control.c index bb407705603c..140bd5730d99 100644 --- a/fs/fuse/control.c +++ b/fs/fuse/control.c @@ -205,8 +205,7 @@ static const struct file_operations fuse_conn_congestion_threshold_ops = { static struct dentry *fuse_ctl_add_dentry(struct dentry *parent, struct fuse_conn *fc, - const char *name, - int mode, int nlink, + const char *name, int mode, const struct inode_operations *iop, const struct file_operations *fop) { @@ -232,10 +231,19 @@ static struct dentry *fuse_ctl_add_dentry(struct dentry *parent, if (iop) inode->i_op = iop; inode->i_fop = fop; - set_nlink(inode, nlink); + if 
(S_ISDIR(mode)) { + inc_nlink(d_inode(parent)); + inc_nlink(inode); + } inode->i_private = fc; - d_add(dentry, inode); - + d_make_persistent(dentry, inode); + dput(dentry); + + /* + * We are returning a borrowed reference here - it's only good while + * fuse_mutex is held. Actually it's d_make_persistent() return + * value... + */ return dentry; } @@ -252,22 +260,21 @@ int fuse_ctl_add_conn(struct fuse_conn *fc) return 0; parent = fuse_control_sb->s_root; - inc_nlink(d_inode(parent)); sprintf(name, "%u", fc->dev); - parent = fuse_ctl_add_dentry(parent, fc, name, S_IFDIR | 0500, 2, + parent = fuse_ctl_add_dentry(parent, fc, name, S_IFDIR | 0500, &simple_dir_inode_operations, &simple_dir_operations); if (!parent) goto err; - if (!fuse_ctl_add_dentry(parent, fc, "waiting", S_IFREG | 0400, 1, + if (!fuse_ctl_add_dentry(parent, fc, "waiting", S_IFREG | 0400, NULL, &fuse_ctl_waiting_ops) || - !fuse_ctl_add_dentry(parent, fc, "abort", S_IFREG | 0200, 1, + !fuse_ctl_add_dentry(parent, fc, "abort", S_IFREG | 0200, NULL, &fuse_ctl_abort_ops) || !fuse_ctl_add_dentry(parent, fc, "max_background", S_IFREG | 0600, - 1, NULL, &fuse_conn_max_background_ops) || + NULL, &fuse_conn_max_background_ops) || !fuse_ctl_add_dentry(parent, fc, "congestion_threshold", - S_IFREG | 0600, 1, NULL, + S_IFREG | 0600, NULL, &fuse_conn_congestion_threshold_ops)) goto err; @@ -289,18 +296,13 @@ static void remove_one(struct dentry *dentry) */ void fuse_ctl_remove_conn(struct fuse_conn *fc) { - struct dentry *dentry; char name[32]; if (!fuse_control_sb || fc->no_control) return; sprintf(name, "%u", fc->dev); - dentry = lookup_noperm_positive_unlocked(&QSTR(name), fuse_control_sb->s_root); - if (!IS_ERR(dentry)) { - simple_recursive_removal(dentry, remove_one); - dput(dentry); // paired with lookup_noperm_positive_unlocked() - } + simple_remove_by_name(fuse_control_sb->s_root, name, remove_one); } static int fuse_ctl_fill_super(struct super_block *sb, struct fs_context *fsc) @@ -350,7 +352,7 @@ static void fuse_ctl_kill_sb(struct super_block *sb) fuse_control_sb = NULL; mutex_unlock(&fuse_mutex); - kill_litter_super(sb); + kill_anon_super(sb); } static struct file_system_type fuse_ctl_fs_type = { diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 132f38619d70..6d59cbc877c6 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -846,7 +846,7 @@ void fuse_copy_init(struct fuse_copy_state *cs, bool write, } /* Unmap and put previous page of userspace buffer */ -static void fuse_copy_finish(struct fuse_copy_state *cs) +void fuse_copy_finish(struct fuse_copy_state *cs) { if (cs->currbuf) { struct pipe_buffer *buf = cs->currbuf; @@ -2041,13 +2041,14 @@ static int fuse_notify_resend(struct fuse_conn *fc) /* * Increments the fuse connection epoch. This will result of dentries from - * previous epochs to be invalidated. - * - * XXX optimization: add call to shrink_dcache_sb()? + * previous epochs to be invalidated. Additionally, if inval_wq is set, a work + * queue is scheduled to trigger the invalidation. 
*/ static int fuse_notify_inc_epoch(struct fuse_conn *fc) { atomic_inc(&fc->epoch); + if (inval_wq) + schedule_work(&fc->epoch_work); return 0; } diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index f8c93dc45768..5ceb217ced1b 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -86,6 +86,7 @@ static void fuse_uring_req_end(struct fuse_ring_ent *ent, struct fuse_req *req, lockdep_assert_not_held(&queue->lock); spin_lock(&queue->lock); ent->fuse_req = NULL; + list_del_init(&req->list); if (test_bit(FR_BACKGROUND, &req->flags)) { queue->active_background--; spin_lock(&fc->bg_lock); @@ -598,12 +599,14 @@ static int fuse_uring_copy_from_ring(struct fuse_ring *ring, cs.is_uring = true; cs.req = req; - return fuse_copy_out_args(&cs, args, ring_in_out.payload_sz); + err = fuse_copy_out_args(&cs, args, ring_in_out.payload_sz); + fuse_copy_finish(&cs); + return err; } - /* - * Copy data from the req to the ring buffer - */ +/* + * Copy data from the req to the ring buffer + */ static int fuse_uring_args_to_ring(struct fuse_ring *ring, struct fuse_req *req, struct fuse_ring_ent *ent) { @@ -649,6 +652,7 @@ static int fuse_uring_args_to_ring(struct fuse_ring *ring, struct fuse_req *req, /* copy the payload */ err = fuse_copy_args(&cs, num_args, args->in_pages, (struct fuse_arg *)in_args, 0); + fuse_copy_finish(&cs); if (err) { pr_info_ratelimited("%s fuse_copy_args failed\n", __func__); return err; diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 87a63ae93a45..4b6b3d2758ff 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -27,6 +27,67 @@ module_param(allow_sys_admin_access, bool, 0644); MODULE_PARM_DESC(allow_sys_admin_access, "Allow users with CAP_SYS_ADMIN in initial userns to bypass allow_other access check"); +struct dentry_bucket { + struct rb_root tree; + spinlock_t lock; +}; + +#define HASH_BITS 5 +#define HASH_SIZE (1 << HASH_BITS) +static struct dentry_bucket dentry_hash[HASH_SIZE]; +struct delayed_work dentry_tree_work; + +/* Minimum invalidation work queue frequency */ +#define FUSE_DENTRY_INVAL_FREQ_MIN 5 + +unsigned __read_mostly inval_wq; +static int inval_wq_set(const char *val, const struct kernel_param *kp) +{ + unsigned int num; + unsigned int old = inval_wq; + int ret; + + if (!val) + return -EINVAL; + + ret = kstrtouint(val, 0, &num); + if (ret) + return ret; + + if ((num < FUSE_DENTRY_INVAL_FREQ_MIN) && (num != 0)) + return -EINVAL; + + /* This should prevent overflow in secs_to_jiffies() */ + if (num > USHRT_MAX) + return -EINVAL; + + *((unsigned int *)kp->arg) = num; + + if (num && !old) + schedule_delayed_work(&dentry_tree_work, + secs_to_jiffies(num)); + else if (!num && old) + cancel_delayed_work_sync(&dentry_tree_work); + + return 0; +} +static const struct kernel_param_ops inval_wq_ops = { + .set = inval_wq_set, + .get = param_get_uint, +}; +module_param_cb(inval_wq, &inval_wq_ops, &inval_wq, 0644); +__MODULE_PARM_TYPE(inval_wq, "uint"); +MODULE_PARM_DESC(inval_wq, + "Dentries invalidation work queue period in secs (>= " + __stringify(FUSE_DENTRY_INVAL_FREQ_MIN) ")."); + +static inline struct dentry_bucket *get_dentry_bucket(struct dentry *dentry) +{ + int i = hash_ptr(dentry, HASH_BITS); + + return &dentry_hash[i]; +} + static void fuse_advise_use_readdirplus(struct inode *dir) { struct fuse_inode *fi = get_fuse_inode(dir); @@ -34,33 +95,151 @@ static void fuse_advise_use_readdirplus(struct inode *dir) set_bit(FUSE_I_ADVISE_RDPLUS, &fi->state); } -#if BITS_PER_LONG >= 64 -static inline void __fuse_dentry_settime(struct dentry *entry, u64 time) +struct 
fuse_dentry { + u64 time; + union { + struct rcu_head rcu; + struct rb_node node; + }; + struct dentry *dentry; +}; + +static void __fuse_dentry_tree_del_node(struct fuse_dentry *fd, + struct dentry_bucket *bucket) { - entry->d_fsdata = (void *) time; + if (!RB_EMPTY_NODE(&fd->node)) { + rb_erase(&fd->node, &bucket->tree); + RB_CLEAR_NODE(&fd->node); + } } -static inline u64 fuse_dentry_time(const struct dentry *entry) +static void fuse_dentry_tree_del_node(struct dentry *dentry) { - return (u64)entry->d_fsdata; + struct fuse_dentry *fd = dentry->d_fsdata; + struct dentry_bucket *bucket = get_dentry_bucket(dentry); + + spin_lock(&bucket->lock); + __fuse_dentry_tree_del_node(fd, bucket); + spin_unlock(&bucket->lock); } -#else -union fuse_dentry { - u64 time; - struct rcu_head rcu; -}; +static void fuse_dentry_tree_add_node(struct dentry *dentry) +{ + struct fuse_dentry *fd = dentry->d_fsdata; + struct dentry_bucket *bucket; + struct fuse_dentry *cur; + struct rb_node **p, *parent = NULL; + + if (!inval_wq) + return; + + bucket = get_dentry_bucket(dentry); + + spin_lock(&bucket->lock); + + __fuse_dentry_tree_del_node(fd, bucket); + + p = &bucket->tree.rb_node; + while (*p) { + parent = *p; + cur = rb_entry(*p, struct fuse_dentry, node); + if (fd->time < cur->time) + p = &(*p)->rb_left; + else + p = &(*p)->rb_right; + } + rb_link_node(&fd->node, parent, p); + rb_insert_color(&fd->node, &bucket->tree); + spin_unlock(&bucket->lock); +} + +/* + * work queue which, when enabled, will periodically check for expired dentries + * in the dentries tree. + */ +static void fuse_dentry_tree_work(struct work_struct *work) +{ + LIST_HEAD(dispose); + struct fuse_dentry *fd; + struct rb_node *node; + int i; + + for (i = 0; i < HASH_SIZE; i++) { + spin_lock(&dentry_hash[i].lock); + node = rb_first(&dentry_hash[i].tree); + while (node) { + fd = rb_entry(node, struct fuse_dentry, node); + if (time_after64(get_jiffies_64(), fd->time)) { + rb_erase(&fd->node, &dentry_hash[i].tree); + RB_CLEAR_NODE(&fd->node); + spin_unlock(&dentry_hash[i].lock); + d_dispose_if_unused(fd->dentry, &dispose); + cond_resched(); + spin_lock(&dentry_hash[i].lock); + } else + break; + node = rb_first(&dentry_hash[i].tree); + } + spin_unlock(&dentry_hash[i].lock); + shrink_dentry_list(&dispose); + } + + if (inval_wq) + schedule_delayed_work(&dentry_tree_work, + secs_to_jiffies(inval_wq)); +} + +void fuse_epoch_work(struct work_struct *work) +{ + struct fuse_conn *fc = container_of(work, struct fuse_conn, + epoch_work); + struct fuse_mount *fm; + struct inode *inode; + + down_read(&fc->killsb); + + inode = fuse_ilookup(fc, FUSE_ROOT_ID, &fm); + if (inode) { + iput(inode); + /* Remove all possible active references to cached inodes */ + shrink_dcache_sb(fm->sb); + } else + pr_warn("Failed to get root inode"); + + up_read(&fc->killsb); +} + +void fuse_dentry_tree_init(void) +{ + int i; + + for (i = 0; i < HASH_SIZE; i++) { + spin_lock_init(&dentry_hash[i].lock); + dentry_hash[i].tree = RB_ROOT; + } + INIT_DELAYED_WORK(&dentry_tree_work, fuse_dentry_tree_work); +} + +void fuse_dentry_tree_cleanup(void) +{ + int i; + + inval_wq = 0; + cancel_delayed_work_sync(&dentry_tree_work); + + for (i = 0; i < HASH_SIZE; i++) + WARN_ON_ONCE(!RB_EMPTY_ROOT(&dentry_hash[i].tree)); +} static inline void __fuse_dentry_settime(struct dentry *dentry, u64 time) { - ((union fuse_dentry *) dentry->d_fsdata)->time = time; + ((struct fuse_dentry *) dentry->d_fsdata)->time = time; } static inline u64 fuse_dentry_time(const struct dentry *entry) { - return ((union 
fuse_dentry *) entry->d_fsdata)->time; + return ((struct fuse_dentry *) entry->d_fsdata)->time; } -#endif static void fuse_dentry_settime(struct dentry *dentry, u64 time) { @@ -81,6 +260,7 @@ static void fuse_dentry_settime(struct dentry *dentry, u64 time) } __fuse_dentry_settime(dentry, time); + fuse_dentry_tree_add_node(dentry); } /* @@ -283,21 +463,36 @@ invalid: goto out; } -#if BITS_PER_LONG < 64 static int fuse_dentry_init(struct dentry *dentry) { - dentry->d_fsdata = kzalloc(sizeof(union fuse_dentry), - GFP_KERNEL_ACCOUNT | __GFP_RECLAIMABLE); + struct fuse_dentry *fd; - return dentry->d_fsdata ? 0 : -ENOMEM; + fd = kzalloc(sizeof(struct fuse_dentry), + GFP_KERNEL_ACCOUNT | __GFP_RECLAIMABLE); + if (!fd) + return -ENOMEM; + + fd->dentry = dentry; + RB_CLEAR_NODE(&fd->node); + dentry->d_fsdata = fd; + + return 0; } + +static void fuse_dentry_prune(struct dentry *dentry) +{ + struct fuse_dentry *fd = dentry->d_fsdata; + + if (!RB_EMPTY_NODE(&fd->node)) + fuse_dentry_tree_del_node(dentry); +} + static void fuse_dentry_release(struct dentry *dentry) { - union fuse_dentry *fd = dentry->d_fsdata; + struct fuse_dentry *fd = dentry->d_fsdata; kfree_rcu(fd, rcu); } -#endif static int fuse_dentry_delete(const struct dentry *dentry) { @@ -331,10 +526,9 @@ static struct vfsmount *fuse_dentry_automount(struct path *path) const struct dentry_operations fuse_dentry_operations = { .d_revalidate = fuse_dentry_revalidate, .d_delete = fuse_dentry_delete, -#if BITS_PER_LONG < 64 .d_init = fuse_dentry_init, + .d_prune = fuse_dentry_prune, .d_release = fuse_dentry_release, -#endif .d_automount = fuse_dentry_automount, }; @@ -471,7 +665,7 @@ static int get_security_context(struct dentry *entry, umode_t mode, u32 total_len = sizeof(*header); int err, nr_ctx = 0; const char *name = NULL; - size_t namelen; + size_t namesize; err = security_dentry_init_security(entry, mode, &entry->d_name, &name, &lsmctx); @@ -482,12 +676,12 @@ static int get_security_context(struct dentry *entry, umode_t mode, if (lsmctx.len) { nr_ctx = 1; - namelen = strlen(name) + 1; + namesize = strlen(name) + 1; err = -EIO; - if (WARN_ON(namelen > XATTR_NAME_MAX + 1 || + if (WARN_ON(namesize > XATTR_NAME_MAX + 1 || lsmctx.len > S32_MAX)) goto out_err; - total_len += FUSE_REC_ALIGN(sizeof(*fctx) + namelen + + total_len += FUSE_REC_ALIGN(sizeof(*fctx) + namesize + lsmctx.len); } @@ -504,8 +698,8 @@ static int get_security_context(struct dentry *entry, umode_t mode, fctx->size = lsmctx.len; ptr += sizeof(*fctx); - strcpy(ptr, name); - ptr += namelen; + strscpy(ptr, name, namesize); + ptr += namesize; memcpy(ptr, lsmctx.context, lsmctx.len); } diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 7bcb650a9f26..01bc894e9c2b 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -110,7 +110,9 @@ static void fuse_file_put(struct fuse_file *ff, bool sync) fuse_file_io_release(ff, ra->inode); if (!args) { - /* Do nothing when server does not implement 'open' */ + /* Do nothing when server does not implement 'opendir' */ + } else if (args->opcode == FUSE_RELEASE && ff->fm->fc->no_open) { + fuse_release_end(ff->fm, args, 0); } else if (sync) { fuse_simple_request(ff->fm, args); fuse_release_end(ff->fm, args, 0); @@ -131,8 +133,17 @@ struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid, struct fuse_file *ff; int opcode = isdir ? FUSE_OPENDIR : FUSE_OPEN; bool open = isdir ? 
!fc->no_opendir : !fc->no_open; + bool release = !isdir || open; - ff = fuse_file_alloc(fm, open); + /* + * ff->args->release_args still needs to be allocated (so we can hold an + * inode reference while there are pending inflight file operations when + * ->release() is called, see fuse_prepare_release()) even if + * fc->no_open is set else it becomes possible for reclaim to deadlock + * if while servicing the readahead request the server triggers reclaim + * and reclaim evicts the inode of the file being read ahead. + */ + ff = fuse_file_alloc(fm, release); if (!ff) return ERR_PTR(-ENOMEM); @@ -152,13 +163,14 @@ struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid, fuse_file_free(ff); return ERR_PTR(err); } else { - /* No release needed */ - kfree(ff->args); - ff->args = NULL; - if (isdir) + if (isdir) { + /* No release needed */ + kfree(ff->args); + ff->args = NULL; fc->no_opendir = 1; - else + } else { fc->no_open = 1; + } } } @@ -1652,7 +1664,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, if (!ia) return -ENOMEM; - if (fopen_direct_io && fc->direct_io_allow_mmap) { + if (fopen_direct_io) { res = filemap_write_and_wait_range(mapping, pos, pos + count - 1); if (res) { fuse_io_free(ia); @@ -1726,6 +1738,15 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, if (res > 0) *ppos = pos; + if (res > 0 && write && fopen_direct_io) { + /* + * As in generic_file_direct_write(), invalidate after the + * write, to invalidate read-ahead cache that may have competed + * with the write. + */ + invalidate_inode_pages2_range(mapping, idx_from, idx_to); + } + return res > 0 ? res : err; } EXPORT_SYMBOL_GPL(fuse_direct_io); diff --git a/fs/fuse/fuse_dev_i.h b/fs/fuse/fuse_dev_i.h index 6e8373f97040..134bf44aff0d 100644 --- a/fs/fuse/fuse_dev_i.h +++ b/fs/fuse/fuse_dev_i.h @@ -62,6 +62,7 @@ void fuse_dev_end_requests(struct list_head *head); void fuse_copy_init(struct fuse_copy_state *cs, bool write, struct iov_iter *iter); +void fuse_copy_finish(struct fuse_copy_state *cs); int fuse_copy_args(struct fuse_copy_state *cs, unsigned int numargs, unsigned int argpages, struct fuse_arg *args, int zeroing); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index f616c1991fed..7f16049387d1 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -54,6 +54,13 @@ /** Frequency (in jiffies) of request timeout checks, if opted into */ extern const unsigned long fuse_timeout_timer_freq; +/* + * Dentries invalidation workqueue period, in seconds. The value of this + * parameter shall be >= FUSE_DENTRY_INVAL_FREQ_MIN seconds, or 0 (zero), in + * which case no workqueue will be created. 
+ */ +extern unsigned inval_wq __read_mostly; + /** Maximum of max_pages received in init_out */ extern unsigned int fuse_max_pages_limit; /* @@ -232,6 +239,11 @@ enum { FUSE_I_BTIME, /* Wants or already has page cache IO */ FUSE_I_CACHE_IO_MODE, + /* + * Client has exclusive access to the inode, either because fs is local + * or the fuse server has an exclusive "lease" on distributed fs + */ + FUSE_I_EXCLUSIVE, }; struct fuse_conn; @@ -642,6 +654,8 @@ struct fuse_conn { /** Current epoch for up-to-date dentries */ atomic_t epoch; + struct work_struct epoch_work; + struct rcu_head rcu; /** The user id for this mount */ @@ -1038,7 +1052,7 @@ static inline struct fuse_conn *get_fuse_conn(struct inode *inode) return get_fuse_mount_super(inode->i_sb)->fc; } -static inline struct fuse_inode *get_fuse_inode(struct inode *inode) +static inline struct fuse_inode *get_fuse_inode(const struct inode *inode) { return container_of(inode, struct fuse_inode, inode); } @@ -1080,6 +1094,13 @@ static inline bool fuse_is_bad(struct inode *inode) return unlikely(test_bit(FUSE_I_BAD, &get_fuse_inode(inode)->state)); } +static inline bool fuse_inode_is_exclusive(const struct inode *inode) +{ + const struct fuse_inode *fi = get_fuse_inode(inode); + + return test_bit(FUSE_I_EXCLUSIVE, &fi->state); +} + static inline struct folio **fuse_folios_alloc(unsigned int nfolios, gfp_t flags, struct fuse_folio_desc **desc) { @@ -1269,6 +1290,11 @@ void fuse_wait_aborted(struct fuse_conn *fc); /* Check if any requests timed out */ void fuse_check_timeout(struct work_struct *work); +void fuse_dentry_tree_init(void); +void fuse_dentry_tree_cleanup(void); + +void fuse_epoch_work(struct work_struct *work); + /** * Invalidate inode attributes */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 1a397be53f49..819e50d66622 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -977,6 +977,7 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm, refcount_set(&fc->count, 1); atomic_set(&fc->dev_count, 1); atomic_set(&fc->epoch, 1); + INIT_WORK(&fc->epoch_work, fuse_epoch_work); init_waitqueue_head(&fc->blocked_waitq); fuse_iqueue_init(&fc->iq, fiq_ops, fiq_priv); INIT_LIST_HEAD(&fc->bg_queue); @@ -1021,26 +1022,28 @@ static void delayed_release(struct rcu_head *p) void fuse_conn_put(struct fuse_conn *fc) { - if (refcount_dec_and_test(&fc->count)) { - struct fuse_iqueue *fiq = &fc->iq; - struct fuse_sync_bucket *bucket; - - if (IS_ENABLED(CONFIG_FUSE_DAX)) - fuse_dax_conn_free(fc); - if (fc->timeout.req_timeout) - cancel_delayed_work_sync(&fc->timeout.work); - if (fiq->ops->release) - fiq->ops->release(fiq); - put_pid_ns(fc->pid_ns); - bucket = rcu_dereference_protected(fc->curr_bucket, 1); - if (bucket) { - WARN_ON(atomic_read(&bucket->count) != 1); - kfree(bucket); - } - if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) - fuse_backing_files_free(fc); - call_rcu(&fc->rcu, delayed_release); + struct fuse_iqueue *fiq = &fc->iq; + struct fuse_sync_bucket *bucket; + + if (!refcount_dec_and_test(&fc->count)) + return; + + if (IS_ENABLED(CONFIG_FUSE_DAX)) + fuse_dax_conn_free(fc); + if (fc->timeout.req_timeout) + cancel_delayed_work_sync(&fc->timeout.work); + cancel_work_sync(&fc->epoch_work); + if (fiq->ops->release) + fiq->ops->release(fiq); + put_pid_ns(fc->pid_ns); + bucket = rcu_dereference_protected(fc->curr_bucket, 1); + if (bucket) { + WARN_ON(atomic_read(&bucket->count) != 1); + kfree(bucket); } + if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) + fuse_backing_files_free(fc); + call_rcu(&fc->rcu, delayed_release); } 
EXPORT_SYMBOL_GPL(fuse_conn_put); @@ -2283,6 +2286,8 @@ static int __init fuse_init(void) if (res) goto err_sysfs_cleanup; + fuse_dentry_tree_init(); + sanitize_global_limit(&max_user_bgreq); sanitize_global_limit(&max_user_congthresh); @@ -2302,6 +2307,7 @@ static void __exit fuse_exit(void) { pr_debug("exit\n"); + fuse_dentry_tree_cleanup(); fuse_ctl_cleanup(); fuse_sysfs_cleanup(); fuse_fs_cleanup(); diff --git a/fs/hostfs/hostfs.h b/fs/hostfs/hostfs.h index 15b2f094d36e..aa02599b770f 100644 --- a/fs/hostfs/hostfs.h +++ b/fs/hostfs/hostfs.h @@ -3,40 +3,8 @@ #define __UM_FS_HOSTFS #include <os.h> +#include <generated/asm-offsets.h> -/* - * These are exactly the same definitions as in fs.h, but the names are - * changed so that this file can be included in both kernel and user files. - */ - -#define HOSTFS_ATTR_MODE 1 -#define HOSTFS_ATTR_UID 2 -#define HOSTFS_ATTR_GID 4 -#define HOSTFS_ATTR_SIZE 8 -#define HOSTFS_ATTR_ATIME 16 -#define HOSTFS_ATTR_MTIME 32 -#define HOSTFS_ATTR_CTIME 64 -#define HOSTFS_ATTR_ATIME_SET 128 -#define HOSTFS_ATTR_MTIME_SET 256 - -/* This one is unused by hostfs. */ -#define HOSTFS_ATTR_FORCE 512 /* Not a change, but a change it */ -#define HOSTFS_ATTR_ATTR_FLAG 1024 - -/* - * If you are very careful, you'll notice that these two are missing: - * - * #define ATTR_KILL_SUID 2048 - * #define ATTR_KILL_SGID 4096 - * - * and this is because they were added in 2.5 development. - * Actually, they are not needed by most ->setattr() methods - they are set by - * callers of notify_change() to notify that the setuid/setgid bits must be - * dropped. - * notify_change() will delete those flags, make sure attr->ia_valid & ATTR_MODE - * is on, and remove the appropriate bits from attr->ia_mode (attr is a - * "struct iattr *"). -BlaisorBlade - */ struct hostfs_timespec { long long tv_sec; long long tv_nsec; diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index f42548ee9083..3b4c152c5c73 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -96,8 +96,15 @@ static const struct fs_parameter_spec hugetlb_fs_parameters[] = { #define PGOFF_LOFFT_MAX \ (((1UL << (PAGE_SHIFT + 1)) - 1) << (BITS_PER_LONG - (PAGE_SHIFT + 1))) -static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) +static int hugetlb_file_mmap_prepare_success(const struct vm_area_struct *vma) { + /* Unfortunate we have to reassign vma->vm_private_data. */ + return hugetlb_vma_lock_alloc((struct vm_area_struct *)vma); +} + +static int hugetlbfs_file_mmap_prepare(struct vm_area_desc *desc) +{ + struct file *file = desc->file; struct inode *inode = file_inode(file); loff_t len, vma_len; int ret; @@ -112,8 +119,8 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) * way when do_mmap unwinds (may be important on powerpc * and ia64). */ - vm_flags_set(vma, VM_HUGETLB | VM_DONTEXPAND); - vma->vm_ops = &hugetlb_vm_ops; + desc->vm_flags |= VM_HUGETLB | VM_DONTEXPAND; + desc->vm_ops = &hugetlb_vm_ops; /* * page based offset in vm_pgoff could be sufficiently large to @@ -122,16 +129,16 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) * sizeof(unsigned long). So, only check in those instances. 
*/ if (sizeof(unsigned long) == sizeof(loff_t)) { - if (vma->vm_pgoff & PGOFF_LOFFT_MAX) + if (desc->pgoff & PGOFF_LOFFT_MAX) return -EINVAL; } /* must be huge page aligned */ - if (vma->vm_pgoff & (~huge_page_mask(h) >> PAGE_SHIFT)) + if (desc->pgoff & (~huge_page_mask(h) >> PAGE_SHIFT)) return -EINVAL; - vma_len = (loff_t)(vma->vm_end - vma->vm_start); - len = vma_len + ((loff_t)vma->vm_pgoff << PAGE_SHIFT); + vma_len = (loff_t)vma_desc_size(desc); + len = vma_len + ((loff_t)desc->pgoff << PAGE_SHIFT); /* check for overflow */ if (len < vma_len) return -EINVAL; @@ -141,7 +148,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) ret = -ENOMEM; - vm_flags = vma->vm_flags; + vm_flags = desc->vm_flags; /* * for SHM_HUGETLB, the pages are reserved in the shmget() call so skip * reserving here. Note: only for SHM hugetlbfs file, the inode @@ -151,17 +158,30 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) vm_flags |= VM_NORESERVE; if (hugetlb_reserve_pages(inode, - vma->vm_pgoff >> huge_page_order(h), - len >> huge_page_shift(h), vma, - vm_flags) < 0) + desc->pgoff >> huge_page_order(h), + len >> huge_page_shift(h), desc, + vm_flags) < 0) goto out; ret = 0; - if (vma->vm_flags & VM_WRITE && inode->i_size < len) + if ((desc->vm_flags & VM_WRITE) && inode->i_size < len) i_size_write(inode, len); out: inode_unlock(inode); + if (!ret) { + /* Allocate the VMA lock after we set it up. */ + desc->action.success_hook = hugetlb_file_mmap_prepare_success; + /* + * We cannot permit the rmap finding this VMA in the time + * between the VMA being inserted into the VMA tree and the + * completion/success hook being invoked. + * + * This is because we establish a per-VMA hugetlb lock which can + * be raced by rmap. 
+ */ + desc->action.hide_from_rmap_until_complete = true; + } return ret; } @@ -184,8 +204,7 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, if (addr) addr0 = ALIGN(addr, huge_page_size(h)); - return mm_get_unmapped_area_vmflags(current->mm, file, addr0, len, pgoff, - flags, 0); + return mm_get_unmapped_area_vmflags(file, addr0, len, pgoff, flags, 0); } /* @@ -975,8 +994,7 @@ static int hugetlbfs_mknod(struct mnt_idmap *idmap, struct inode *dir, if (!inode) return -ENOSPC; inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); - d_instantiate(dentry, inode); - dget(dentry);/* Extra count - pin the dentry in core */ + d_make_persistent(dentry, inode); return 0; } @@ -1023,10 +1041,9 @@ static int hugetlbfs_symlink(struct mnt_idmap *idmap, if (inode) { int l = strlen(symname)+1; error = page_symlink(inode, symname, l); - if (!error) { - d_instantiate(dentry, inode); - dget(dentry); - } else + if (!error) + d_make_persistent(dentry, inode); + else iput(inode); } inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); @@ -1221,7 +1238,7 @@ static void init_once(void *foo) static const struct file_operations hugetlbfs_file_operations = { .read_iter = hugetlbfs_read_iter, - .mmap = hugetlbfs_file_mmap, + .mmap_prepare = hugetlbfs_file_mmap_prepare, .fsync = noop_fsync, .get_unmapped_area = hugetlb_get_unmapped_area, .llseek = default_llseek, @@ -1483,7 +1500,7 @@ static struct file_system_type hugetlbfs_fs_type = { .name = "hugetlbfs", .init_fs_context = hugetlbfs_init_fs_context, .parameters = hugetlb_fs_parameters, - .kill_sb = kill_litter_super, + .kill_sb = kill_anon_super, .fs_flags = FS_ALLOW_IDMAP, }; diff --git a/fs/inode.c b/fs/inode.c index cc8265cfe80e..521383223d8a 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1968,7 +1968,7 @@ void iput(struct inode *inode) retry: lockdep_assert_not_held(&inode->i_lock); - VFS_BUG_ON_INODE(inode_state_read_once(inode) & I_CLEAR, inode); + VFS_BUG_ON_INODE(inode_state_read_once(inode) & (I_FREEING | I_CLEAR), inode); /* * Note this assert is technically racy as if the count is bogusly * equal to one, then two CPUs racing to further drop it can both @@ -2010,6 +2010,7 @@ EXPORT_SYMBOL(iput); */ void iput_not_last(struct inode *inode) { + VFS_BUG_ON_INODE(inode_state_read_once(inode) & (I_FREEING | I_CLEAR), inode); VFS_BUG_ON_INODE(atomic_read(&inode->i_count) < 2, inode); WARN_ON(atomic_sub_return(1, &inode->i_count) == 0); diff --git a/fs/internal.h b/fs/internal.h index d08d5e2235e9..ab638d41ab81 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -230,7 +230,6 @@ extern void shrink_dcache_for_umount(struct super_block *); extern struct dentry *__d_lookup(const struct dentry *, const struct qstr *); extern struct dentry *__d_lookup_rcu(const struct dentry *parent, const struct qstr *name, unsigned *seq); -extern void d_genocide(struct dentry *); /* * pipe.c diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index a670ba3e565e..5c0efd6b239f 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -675,11 +675,14 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, if (parent) { ret = security_kernfs_init_security(parent, kn); if (ret) - goto err_out3; + goto err_out4; } return kn; + err_out4: + simple_xattrs_free(&kn->iattr->xattrs, NULL); + kmem_cache_free(kernfs_iattrs_cache, kn->iattr); err_out3: spin_lock(&root->kernfs_idr_lock); idr_remove(&root->ino_idr, (u32)kernfs_ino(kn)); diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index 76eaf64b9d9e..3ac52e141766 100644 --- a/fs/kernfs/mount.c +++ 
b/fs/kernfs/mount.c @@ -298,6 +298,7 @@ static int kernfs_fill_super(struct super_block *sb, struct kernfs_fs_context *k if (info->root->flags & KERNFS_ROOT_SUPPORT_EXPORTOP) sb->s_export_op = &kernfs_export_ops; sb->s_time_gran = 1; + sb->s_maxbytes = MAX_LFS_FILESIZE; /* sysfs dentries and inodes don't require IO to create */ sb->s_shrink->seeks = 0; diff --git a/fs/libfs.c b/fs/libfs.c index 2d6657947abd..9264523be85c 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -630,7 +630,7 @@ static void __simple_recursive_removal(struct dentry *dentry, if (callback) callback(victim); fsnotify_delete(inode, d_inode(victim), victim); - dput(victim); // unpin it + d_make_discardable(victim); } if (victim == dentry) { inode_set_mtime_to_ts(inode, @@ -655,6 +655,19 @@ void simple_recursive_removal(struct dentry *dentry, } EXPORT_SYMBOL(simple_recursive_removal); +void simple_remove_by_name(struct dentry *parent, const char *name, + void (*callback)(struct dentry *)) +{ + struct dentry *dentry; + + dentry = lookup_noperm_positive_unlocked(&QSTR(name), parent); + if (!IS_ERR(dentry)) { + simple_recursive_removal(dentry, callback); + dput(dentry); // paired with lookup_noperm_positive_unlocked() + } +} +EXPORT_SYMBOL(simple_remove_by_name); + /* caller holds parent directory with I_MUTEX_PARENT */ void locked_recursive_removal(struct dentry *dentry, void (*callback)(struct dentry *)) @@ -752,8 +765,7 @@ int simple_link(struct dentry *old_dentry, struct inode *dir, struct dentry *den inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode))); inc_nlink(inode); ihold(inode); - dget(dentry); - d_instantiate(dentry, inode); + d_make_persistent(dentry, inode); return 0; } EXPORT_SYMBOL(simple_link); @@ -779,14 +791,28 @@ out: } EXPORT_SYMBOL(simple_empty); -int simple_unlink(struct inode *dir, struct dentry *dentry) +void __simple_unlink(struct inode *dir, struct dentry *dentry) { struct inode *inode = d_inode(dentry); inode_set_mtime_to_ts(dir, inode_set_ctime_to_ts(dir, inode_set_ctime_current(inode))); drop_nlink(inode); - dput(dentry); +} +EXPORT_SYMBOL(__simple_unlink); + +void __simple_rmdir(struct inode *dir, struct dentry *dentry) +{ + drop_nlink(d_inode(dentry)); + __simple_unlink(dir, dentry); + drop_nlink(dir); +} +EXPORT_SYMBOL(__simple_rmdir); + +int simple_unlink(struct inode *dir, struct dentry *dentry) +{ + __simple_unlink(dir, dentry); + d_make_discardable(dentry); return 0; } EXPORT_SYMBOL(simple_unlink); @@ -796,9 +822,8 @@ int simple_rmdir(struct inode *dir, struct dentry *dentry) if (!simple_empty(dentry)) return -ENOTEMPTY; - drop_nlink(d_inode(dentry)); - simple_unlink(dir, dentry); - drop_nlink(dir); + __simple_rmdir(dir, dentry); + d_make_discardable(dentry); return 0; } EXPORT_SYMBOL(simple_rmdir); @@ -1066,7 +1091,8 @@ int simple_fill_super(struct super_block *s, unsigned long magic, simple_inode_init_ts(inode); inode->i_fop = files->ops; inode->i_ino = i; - d_add(dentry, inode); + d_make_persistent(dentry, inode); + dput(dentry); } return 0; } @@ -2312,3 +2338,11 @@ struct dentry *simple_start_creating(struct dentry *parent, const char *name) return start_dirop(parent, &qname, LOOKUP_CREATE | LOOKUP_EXCL); } EXPORT_SYMBOL(simple_start_creating); + +/* parent must have been held exclusive since simple_start_creating() */ +void simple_done_creating(struct dentry *child) +{ + inode_unlock(child->d_parent->d_inode); + dput(child); +} +EXPORT_SYMBOL(simple_done_creating); diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c index 49ed90c6b9f2..f33bfa7b58e6 100644 --- a/fs/nfs/localio.c +++ 
b/fs/nfs/localio.c @@ -615,8 +615,11 @@ static void nfs_local_read_aio_complete(struct kiocb *kiocb, long ret) nfs_local_pgio_aio_complete(iocb); /* Calls nfs_local_read_aio_complete_work */ } -static void do_nfs_local_call_read(struct nfs_local_kiocb *iocb, struct file *filp) +static void nfs_local_call_read(struct work_struct *work) { + struct nfs_local_kiocb *iocb = + container_of(work, struct nfs_local_kiocb, work); + struct file *filp = iocb->kiocb.ki_filp; bool force_done = false; ssize_t status; int n_iters; @@ -633,7 +636,9 @@ static void do_nfs_local_call_read(struct nfs_local_kiocb *iocb, struct file *fi } else iocb->kiocb.ki_flags &= ~IOCB_DIRECT; - status = filp->f_op->read_iter(&iocb->kiocb, &iocb->iters[i]); + scoped_with_creds(filp->f_cred) + status = filp->f_op->read_iter(&iocb->kiocb, &iocb->iters[i]); + if (status != -EIOCBQUEUED) { if (unlikely(status >= 0 && status < iocb->iters[i].count)) force_done = true; /* Partial read */ @@ -645,16 +650,6 @@ static void do_nfs_local_call_read(struct nfs_local_kiocb *iocb, struct file *fi } } -static void nfs_local_call_read(struct work_struct *work) -{ - struct nfs_local_kiocb *iocb = - container_of(work, struct nfs_local_kiocb, work); - struct file *filp = iocb->kiocb.ki_filp; - - scoped_with_creds(filp->f_cred) - do_nfs_local_call_read(iocb, filp); -} - static int nfs_local_do_read(struct nfs_local_kiocb *iocb, const struct rpc_call_ops *call_ops) @@ -822,13 +817,18 @@ static void nfs_local_write_aio_complete(struct kiocb *kiocb, long ret) nfs_local_pgio_aio_complete(iocb); /* Calls nfs_local_write_aio_complete_work */ } -static ssize_t do_nfs_local_call_write(struct nfs_local_kiocb *iocb, - struct file *filp) +static void nfs_local_call_write(struct work_struct *work) { + struct nfs_local_kiocb *iocb = + container_of(work, struct nfs_local_kiocb, work); + struct file *filp = iocb->kiocb.ki_filp; + unsigned long old_flags = current->flags; bool force_done = false; ssize_t status; int n_iters; + current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO; + file_start_write(filp); n_iters = atomic_read(&iocb->n_iters); for (int i = 0; i < n_iters ; i++) { @@ -842,7 +842,9 @@ static ssize_t do_nfs_local_call_write(struct nfs_local_kiocb *iocb, } else iocb->kiocb.ki_flags &= ~IOCB_DIRECT; - status = filp->f_op->write_iter(&iocb->kiocb, &iocb->iters[i]); + scoped_with_creds(filp->f_cred) + status = filp->f_op->write_iter(&iocb->kiocb, &iocb->iters[i]); + if (status != -EIOCBQUEUED) { if (unlikely(status >= 0 && status < iocb->iters[i].count)) force_done = true; /* Partial write */ @@ -854,22 +856,6 @@ static ssize_t do_nfs_local_call_write(struct nfs_local_kiocb *iocb, } file_end_write(filp); - return status; -} - -static void nfs_local_call_write(struct work_struct *work) -{ - struct nfs_local_kiocb *iocb = - container_of(work, struct nfs_local_kiocb, work); - struct file *filp = iocb->kiocb.ki_filp; - unsigned long old_flags = current->flags; - ssize_t status; - - current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO; - - scoped_with_creds(filp->f_cred) - status = do_nfs_local_call_write(iocb, filp); - current->flags = old_flags; } diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 2b79129703d5..5ce9a49e76ba 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1137,11 +1137,11 @@ static struct dentry *nfsd_mkdir(struct dentry *parent, struct nfsdfs_client *nc inode->i_private = ncl; kref_get(&ncl->cl_ref); } - d_instantiate(dentry, inode); + d_make_persistent(dentry, inode); inc_nlink(dir); fsnotify_mkdir(dir, dentry); - 
inode_unlock(dir); - return dentry; + simple_done_creating(dentry); + return dentry; // borrowed } #if IS_ENABLED(CONFIG_SUNRPC_GSS) @@ -1170,9 +1170,9 @@ static void _nfsd_symlink(struct dentry *parent, const char *name, inode->i_link = (char *)content; inode->i_size = strlen(content); - d_instantiate(dentry, inode); + d_make_persistent(dentry, inode); fsnotify_create(dir, dentry); - inode_unlock(dir); + simple_done_creating(dentry); } #else static inline void _nfsd_symlink(struct dentry *parent, const char *name, @@ -1228,11 +1228,11 @@ static int nfsdfs_create_files(struct dentry *root, kref_get(&ncl->cl_ref); inode->i_fop = files->ops; inode->i_private = ncl; - d_instantiate(dentry, inode); + d_make_persistent(dentry, inode); fsnotify_create(dir, dentry); if (fdentries) - fdentries[i] = dentry; - inode_unlock(dir); + fdentries[i] = dentry; // borrowed + simple_done_creating(dentry); } return 0; } @@ -1346,7 +1346,7 @@ static void nfsd_umount(struct super_block *sb) nfsd_shutdown_threads(net); - kill_litter_super(sb); + kill_anon_super(sb); put_net(net); } diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c index 5016bccc2ac5..2e7b2e566ebe 100644 --- a/fs/ntfs3/file.c +++ b/fs/ntfs3/file.c @@ -379,7 +379,7 @@ static int ntfs_file_mmap_prepare(struct vm_area_desc *desc) if (rw) { u64 to = min_t(loff_t, i_size_read(inode), - from + desc->end - desc->start); + from + vma_desc_size(desc)); if (is_sparsed(ni)) { /* Allocate clusters for rw map. */ diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index cccaa1d6fbba..339f0b11cdc8 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -441,8 +441,7 @@ static struct dentry *dlmfs_mkdir(struct mnt_idmap * idmap, ip->ip_conn = conn; inc_nlink(dir); - d_instantiate(dentry, inode); - dget(dentry); /* Extra count - pin the dentry in core */ + d_make_persistent(dentry, inode); status = 0; bail: @@ -480,8 +479,7 @@ static int dlmfs_create(struct mnt_idmap *idmap, goto bail; } - d_instantiate(dentry, inode); - dget(dentry); /* Extra count - pin the dentry in core */ + d_make_persistent(dentry, inode); bail: return status; } @@ -574,7 +572,7 @@ static int dlmfs_init_fs_context(struct fs_context *fc) static struct file_system_type dlmfs_fs_type = { .owner = THIS_MODULE, .name = "ocfs2_dlmfs", - .kill_sb = kill_litter_super, + .kill_sb = kill_anon_super, .init_fs_context = dlmfs_init_fs_context, }; MODULE_ALIAS_FS("ocfs2_dlmfs"); diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 06b860b9ded6..ff3dbd1ca61f 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -581,7 +581,8 @@ out_cleanup_unlocked: goto out_dput; } -static const struct cred *ovl_override_creator_creds(struct dentry *dentry, struct inode *inode, umode_t mode) +static const struct cred *ovl_override_creator_creds(const struct cred *original_creds, + struct dentry *dentry, struct inode *inode, umode_t mode) { int err; @@ -596,7 +597,7 @@ static const struct cred *ovl_override_creator_creds(struct dentry *dentry, stru override_cred->fsgid = inode->i_gid; err = security_dentry_create_files_as(dentry, mode, &dentry->d_name, - current->cred, override_cred); + original_creds, override_cred); if (err) return ERR_PTR(err); @@ -614,8 +615,11 @@ static void ovl_revert_creator_creds(const struct cred *old_cred) DEFINE_CLASS(ovl_override_creator_creds, const struct cred *, if (!IS_ERR_OR_NULL(_T)) ovl_revert_creator_creds(_T), - ovl_override_creator_creds(dentry, inode, mode), - struct dentry *dentry, struct inode *inode, umode_t mode) + 
ovl_override_creator_creds(original_creds, dentry, inode, mode), + const struct cred *original_creds, + struct dentry *dentry, + struct inode *inode, + umode_t mode) static int ovl_create_handle_whiteouts(struct dentry *dentry, struct inode *inode, @@ -633,7 +637,7 @@ static int ovl_create_or_link(struct dentry *dentry, struct inode *inode, int err; struct dentry *parent = dentry->d_parent; - with_ovl_creds(dentry->d_sb) { + scoped_class(override_creds_ovl, original_creds, dentry->d_sb) { /* * When linking a file with copy up origin into a new parent, mark the * new parent dir "impure". @@ -661,7 +665,7 @@ static int ovl_create_or_link(struct dentry *dentry, struct inode *inode, if (attr->hardlink) return ovl_create_handle_whiteouts(dentry, inode, attr); - scoped_class(ovl_override_creator_creds, cred, dentry, inode, attr->mode) { + scoped_class(ovl_override_creator_creds, cred, original_creds, dentry, inode, attr->mode) { if (IS_ERR(cred)) return PTR_ERR(cred); return ovl_create_handle_whiteouts(dentry, inode, attr); @@ -1364,8 +1368,8 @@ static int ovl_create_tmpfile(struct file *file, struct dentry *dentry, int flags = file->f_flags | OVL_OPEN_FLAGS; int err; - with_ovl_creds(dentry->d_sb) { - scoped_class(ovl_override_creator_creds, cred, dentry, inode, mode) { + scoped_class(override_creds_ovl, original_creds, dentry->d_sb) { + scoped_class(ovl_override_creator_creds, cred, original_creds, dentry, inode, mode) { if (IS_ERR(cred)) return PTR_ERR(cred); diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 28b2f707cfbc..ba9146f22a2c 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -128,9 +128,17 @@ static int ovl_dentry_revalidate_common(struct dentry *dentry, unsigned int i; int ret = 1; - /* Careful in RCU mode */ - if (!inode) + if (!inode) { + /* + * Lookup of negative dentries will call ovl_dentry_init_flags() + * with NULL upperdentry and NULL oe, resulting in the + * DCACHE_OP*_REVALIDATE flags being cleared. Hence the only + * way to get a negative inode is due to a race with dentry + * destruction. 
+ */ + WARN_ON(!(flags & LOOKUP_RCU)); return -ECHILD; + } oe = OVL_I_E(inode); lowerstack = ovl_lowerstack(oe); diff --git a/fs/pipe.c b/fs/pipe.c index 2d0fed2ecbfd..9e6a01475815 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -1481,31 +1481,16 @@ static struct file_system_type pipe_fs_type = { }; #ifdef CONFIG_SYSCTL -static int do_proc_dopipe_max_size_conv(unsigned long *lvalp, - unsigned int *valp, - int write, void *data) -{ - if (write) { - unsigned int val; - - val = round_pipe_size(*lvalp); - if (val == 0) - return -EINVAL; - - *valp = val; - } else { - unsigned int val = *valp; - *lvalp = (unsigned long) val; - } - - return 0; -} +static SYSCTL_USER_TO_KERN_UINT_CONV(_pipe_maxsz, round_pipe_size) +static SYSCTL_UINT_CONV_CUSTOM(_pipe_maxsz, + sysctl_user_to_kern_uint_conv_pipe_maxsz, + sysctl_kern_to_user_uint_conv, true) static int proc_dopipe_max_size(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { - return do_proc_douintvec(table, write, buffer, lenp, ppos, - do_proc_dopipe_max_size_conv, NULL); + return proc_douintvec_conv(table, write, buffer, lenp, ppos, + do_proc_uint_conv_pipe_maxsz); } static const struct ctl_table fs_pipe_sysctls[] = { @@ -1515,6 +1500,7 @@ static const struct ctl_table fs_pipe_sysctls[] = { .maxlen = sizeof(pipe_max_size), .mode = 0644, .proc_handler = proc_dopipe_max_size, + .extra1 = SYSCTL_ONE, }, { .procname = "pipe-user-pages-hard", diff --git a/fs/proc/base.c b/fs/proc/base.c index 407b41cb6e7c..4eec684baca9 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -3578,14 +3578,12 @@ int proc_pid_readdir(struct file *file, struct dir_context *ctx) return 0; if (pos == TGID_OFFSET - 2) { - struct inode *inode = d_inode(fs_info->proc_self); - if (!dir_emit(ctx, "self", 4, inode->i_ino, DT_LNK)) + if (!dir_emit(ctx, "self", 4, self_inum, DT_LNK)) return 0; ctx->pos = pos = pos + 1; } if (pos == TGID_OFFSET - 1) { - struct inode *inode = d_inode(fs_info->proc_thread_self); - if (!dir_emit(ctx, "thread-self", 11, inode->i_ino, DT_LNK)) + if (!dir_emit(ctx, "thread-self", 11, thread_self_inum, DT_LNK)) return 0; ctx->pos = pos = pos + 1; } diff --git a/fs/proc/inode.c b/fs/proc/inode.c index d9b7ef122343..2d3425cfa94b 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -443,7 +443,7 @@ pde_get_unmapped_area(struct proc_dir_entry *pde, struct file *file, unsigned lo return pde->proc_ops->proc_get_unmapped_area(file, orig_addr, len, pgoff, flags); #ifdef CONFIG_MMU - return mm_get_unmapped_area(current->mm, file, orig_addr, len, pgoff, flags); + return mm_get_unmapped_area(file, orig_addr, len, pgoff, flags); #endif return orig_addr; diff --git a/fs/proc/internal.h b/fs/proc/internal.h index d1598576506c..c1e8eb984da8 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -373,6 +373,7 @@ static inline void proc_tty_init(void) {} extern struct proc_dir_entry proc_root; extern void proc_self_init(void); +extern unsigned self_inum, thread_self_inum; /* * task_[no]mmu.c diff --git a/fs/proc/root.c b/fs/proc/root.c index 1e24e085c7d5..d8ca41d823e4 100644 --- a/fs/proc/root.c +++ b/fs/proc/root.c @@ -347,17 +347,11 @@ static void proc_kill_sb(struct super_block *sb) { struct proc_fs_info *fs_info = proc_sb_info(sb); - if (!fs_info) { - kill_anon_super(sb); - return; - } - - dput(fs_info->proc_self); - dput(fs_info->proc_thread_self); - kill_anon_super(sb); - put_pid_ns(fs_info->pid_ns); - kfree_rcu(fs_info, rcu); + if (fs_info) { + put_pid_ns(fs_info->pid_ns); + kfree_rcu(fs_info, rcu); + } } static struct file_system_type 
proc_fs_type = { diff --git a/fs/proc/self.c b/fs/proc/self.c index b46fbfd22681..62d2c0cfe35c 100644 --- a/fs/proc/self.c +++ b/fs/proc/self.c @@ -31,12 +31,11 @@ static const struct inode_operations proc_self_inode_operations = { .get_link = proc_self_get_link, }; -static unsigned self_inum __ro_after_init; +unsigned self_inum __ro_after_init; int proc_setup_self(struct super_block *s) { struct inode *root_inode = d_inode(s->s_root); - struct proc_fs_info *fs_info = proc_sb_info(s); struct dentry *self; int ret = -ENOMEM; @@ -51,18 +50,15 @@ int proc_setup_self(struct super_block *s) inode->i_uid = GLOBAL_ROOT_UID; inode->i_gid = GLOBAL_ROOT_GID; inode->i_op = &proc_self_inode_operations; - d_add(self, inode); + d_make_persistent(self, inode); ret = 0; - } else { - dput(self); } + dput(self); } inode_unlock(root_inode); if (ret) pr_err("proc_fill_super: can't allocate /proc/self\n"); - else - fs_info->proc_self = self; return ret; } diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index fc35a0543f01..81dfc26bfae8 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -14,7 +14,7 @@ #include <linux/rmap.h> #include <linux/swap.h> #include <linux/sched/mm.h> -#include <linux/swapops.h> +#include <linux/leafops.h> #include <linux/mmu_notifier.h> #include <linux/page_idle.h> #include <linux/shmem_fs.h> @@ -1017,14 +1017,16 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, young = pte_young(ptent); dirty = pte_dirty(ptent); present = true; - } else if (is_swap_pte(ptent)) { - swp_entry_t swpent = pte_to_swp_entry(ptent); + } else if (pte_none(ptent)) { + smaps_pte_hole_lookup(addr, walk); + } else { + const softleaf_t entry = softleaf_from_pte(ptent); - if (!non_swap_entry(swpent)) { + if (softleaf_is_swap(entry)) { int mapcount; mss->swap += PAGE_SIZE; - mapcount = swp_swapcount(swpent); + mapcount = swp_swapcount(entry); if (mapcount >= 2) { u64 pss_delta = (u64)PAGE_SIZE << PSS_SHIFT; @@ -1033,14 +1035,11 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr, } else { mss->swap_pss += (u64)PAGE_SIZE << PSS_SHIFT; } - } else if (is_pfn_swap_entry(swpent)) { - if (is_device_private_entry(swpent)) + } else if (softleaf_has_pfn(entry)) { + if (softleaf_is_device_private(entry)) present = true; - page = pfn_swap_entry_to_page(swpent); + page = softleaf_to_page(entry); } - } else { - smaps_pte_hole_lookup(addr, walk); - return; } if (!page) @@ -1060,14 +1059,16 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr, bool present = false; struct folio *folio; + if (pmd_none(*pmd)) + return; if (pmd_present(*pmd)) { page = vm_normal_page_pmd(vma, addr, *pmd); present = true; - } else if (unlikely(thp_migration_supported() && is_swap_pmd(*pmd))) { - swp_entry_t entry = pmd_to_swp_entry(*pmd); + } else if (unlikely(thp_migration_supported())) { + const softleaf_t entry = softleaf_from_pmd(*pmd); - if (is_pfn_swap_entry(entry)) - page = pfn_swap_entry_to_page(entry); + if (softleaf_has_pfn(entry)) + page = softleaf_to_page(entry); } if (IS_ERR_OR_NULL(page)) return; @@ -1146,6 +1147,7 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) [ilog2(VM_MAYSHARE)] = "ms", [ilog2(VM_GROWSDOWN)] = "gd", [ilog2(VM_PFNMAP)] = "pf", + [ilog2(VM_MAYBE_GUARD)] = "gu", [ilog2(VM_LOCKED)] = "lo", [ilog2(VM_IO)] = "io", [ilog2(VM_SEQ_READ)] = "sr", @@ -1181,10 +1183,10 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) [ilog2(VM_PKEY_BIT0)] = "", [ilog2(VM_PKEY_BIT1)] = "", [ilog2(VM_PKEY_BIT2)] = "", -#if VM_PKEY_BIT3 
+#if CONFIG_ARCH_PKEY_BITS > 3 [ilog2(VM_PKEY_BIT3)] = "", #endif -#if VM_PKEY_BIT4 +#if CONFIG_ARCH_PKEY_BITS > 4 [ilog2(VM_PKEY_BIT4)] = "", #endif #endif /* CONFIG_ARCH_HAS_PKEYS */ @@ -1230,11 +1232,11 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask, if (pte_present(ptent)) { folio = page_folio(pte_page(ptent)); present = true; - } else if (is_swap_pte(ptent)) { - swp_entry_t swpent = pte_to_swp_entry(ptent); + } else { + const softleaf_t entry = softleaf_from_pte(ptent); - if (is_pfn_swap_entry(swpent)) - folio = pfn_swap_entry_folio(swpent); + if (softleaf_has_pfn(entry)) + folio = softleaf_to_folio(entry); } if (folio) { @@ -1582,8 +1584,6 @@ struct clear_refs_private { enum clear_refs_types type; }; -#ifdef CONFIG_MEM_SOFT_DIRTY - static inline bool pte_is_pinned(struct vm_area_struct *vma, unsigned long addr, pte_t pte) { struct folio *folio; @@ -1603,6 +1603,8 @@ static inline bool pte_is_pinned(struct vm_area_struct *vma, unsigned long addr, static inline void clear_soft_dirty(struct vm_area_struct *vma, unsigned long addr, pte_t *pte) { + if (!pgtable_supports_soft_dirty()) + return; /* * The soft-dirty tracker uses #PF-s to catch writes * to pages, so write-protect the pte as well. See the @@ -1611,6 +1613,9 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma, */ pte_t ptent = ptep_get(pte); + if (pte_none(ptent)) + return; + if (pte_present(ptent)) { pte_t old_pte; @@ -1620,24 +1625,21 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma, ptent = pte_wrprotect(old_pte); ptent = pte_clear_soft_dirty(ptent); ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent); - } else if (is_swap_pte(ptent)) { + } else { ptent = pte_swp_clear_soft_dirty(ptent); set_pte_at(vma->vm_mm, addr, pte, ptent); } } -#else -static inline void clear_soft_dirty(struct vm_area_struct *vma, - unsigned long addr, pte_t *pte) -{ -} -#endif -#if defined(CONFIG_MEM_SOFT_DIRTY) && defined(CONFIG_TRANSPARENT_HUGEPAGE) +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { pmd_t old, pmd = *pmdp; + if (!pgtable_supports_soft_dirty()) + return; + if (pmd_present(pmd)) { /* See comment in change_huge_pmd() */ old = pmdp_invalidate(vma, addr, pmdp); @@ -1650,7 +1652,7 @@ static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, pmd = pmd_clear_soft_dirty(pmd); set_pmd_at(vma->vm_mm, addr, pmdp, pmd); - } else if (is_migration_entry(pmd_to_swp_entry(pmd))) { + } else if (pmd_is_migration_entry(pmd)) { pmd = pmd_swp_clear_soft_dirty(pmd); set_pmd_at(vma->vm_mm, addr, pmdp, pmd); } @@ -1923,6 +1925,9 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm, struct page *page = NULL; struct folio *folio; + if (pte_none(pte)) + goto out; + if (pte_present(pte)) { if (pm->show_pfn) frame = pte_pfn(pte); @@ -1932,32 +1937,34 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm, flags |= PM_SOFT_DIRTY; if (pte_uffd_wp(pte)) flags |= PM_UFFD_WP; - } else if (is_swap_pte(pte)) { - swp_entry_t entry; + } else { + softleaf_t entry; + if (pte_swp_soft_dirty(pte)) flags |= PM_SOFT_DIRTY; if (pte_swp_uffd_wp(pte)) flags |= PM_UFFD_WP; - entry = pte_to_swp_entry(pte); + entry = softleaf_from_pte(pte); if (pm->show_pfn) { pgoff_t offset; + /* * For PFN swap offsets, keeping the offset field * to be PFN only to be compatible with old smaps. 
*/ - if (is_pfn_swap_entry(entry)) - offset = swp_offset_pfn(entry); + if (softleaf_has_pfn(entry)) + offset = softleaf_to_pfn(entry); else offset = swp_offset(entry); frame = swp_type(entry) | (offset << MAX_SWAPFILES_SHIFT); } flags |= PM_SWAP; - if (is_pfn_swap_entry(entry)) - page = pfn_swap_entry_to_page(entry); - if (pte_marker_entry_uffd_wp(entry)) + if (softleaf_has_pfn(entry)) + page = softleaf_to_page(entry); + if (softleaf_is_uffd_wp_marker(entry)) flags |= PM_UFFD_WP; - if (is_guard_swp_entry(entry)) + if (softleaf_is_guard_marker(entry)) flags |= PM_GUARD_REGION; } @@ -1969,96 +1976,110 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm, __folio_page_mapped_exclusively(folio, page)) flags |= PM_MMAP_EXCLUSIVE; } + +out: if (vma->vm_flags & VM_SOFTDIRTY) flags |= PM_SOFT_DIRTY; return make_pme(frame, flags); } -static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, - struct mm_walk *walk) +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +static int pagemap_pmd_range_thp(pmd_t *pmdp, unsigned long addr, + unsigned long end, struct vm_area_struct *vma, + struct pagemapread *pm) { - struct vm_area_struct *vma = walk->vma; - struct pagemapread *pm = walk->private; - spinlock_t *ptl; - pte_t *pte, *orig_pte; + unsigned int idx = (addr & ~PMD_MASK) >> PAGE_SHIFT; + u64 flags = 0, frame = 0; + pmd_t pmd = *pmdp; + struct page *page = NULL; + struct folio *folio = NULL; int err = 0; -#ifdef CONFIG_TRANSPARENT_HUGEPAGE - ptl = pmd_trans_huge_lock(pmdp, vma); - if (ptl) { - unsigned int idx = (addr & ~PMD_MASK) >> PAGE_SHIFT; - u64 flags = 0, frame = 0; - pmd_t pmd = *pmdp; - struct page *page = NULL; - struct folio *folio = NULL; + if (vma->vm_flags & VM_SOFTDIRTY) + flags |= PM_SOFT_DIRTY; - if (vma->vm_flags & VM_SOFTDIRTY) - flags |= PM_SOFT_DIRTY; + if (pmd_none(pmd)) + goto populate_pagemap; - if (pmd_present(pmd)) { - page = pmd_page(pmd); + if (pmd_present(pmd)) { + page = pmd_page(pmd); - flags |= PM_PRESENT; - if (pmd_soft_dirty(pmd)) - flags |= PM_SOFT_DIRTY; - if (pmd_uffd_wp(pmd)) - flags |= PM_UFFD_WP; - if (pm->show_pfn) - frame = pmd_pfn(pmd) + idx; - } -#ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION - else if (is_swap_pmd(pmd)) { - swp_entry_t entry = pmd_to_swp_entry(pmd); - unsigned long offset; - - if (pm->show_pfn) { - if (is_pfn_swap_entry(entry)) - offset = swp_offset_pfn(entry) + idx; - else - offset = swp_offset(entry) + idx; - frame = swp_type(entry) | - (offset << MAX_SWAPFILES_SHIFT); - } - flags |= PM_SWAP; - if (pmd_swp_soft_dirty(pmd)) - flags |= PM_SOFT_DIRTY; - if (pmd_swp_uffd_wp(pmd)) - flags |= PM_UFFD_WP; - VM_BUG_ON(!is_pmd_migration_entry(pmd)); - page = pfn_swap_entry_to_page(entry); - } -#endif + flags |= PM_PRESENT; + if (pmd_soft_dirty(pmd)) + flags |= PM_SOFT_DIRTY; + if (pmd_uffd_wp(pmd)) + flags |= PM_UFFD_WP; + if (pm->show_pfn) + frame = pmd_pfn(pmd) + idx; + } else if (thp_migration_supported()) { + const softleaf_t entry = softleaf_from_pmd(pmd); + unsigned long offset; - if (page) { - folio = page_folio(page); - if (!folio_test_anon(folio)) - flags |= PM_FILE; + if (pm->show_pfn) { + if (softleaf_has_pfn(entry)) + offset = softleaf_to_pfn(entry) + idx; + else + offset = swp_offset(entry) + idx; + frame = swp_type(entry) | + (offset << MAX_SWAPFILES_SHIFT); } + flags |= PM_SWAP; + if (pmd_swp_soft_dirty(pmd)) + flags |= PM_SOFT_DIRTY; + if (pmd_swp_uffd_wp(pmd)) + flags |= PM_UFFD_WP; + VM_WARN_ON_ONCE(!pmd_is_migration_entry(pmd)); + page = softleaf_to_page(entry); + } + + if (page) { + folio = page_folio(page); 
+ if (!folio_test_anon(folio)) + flags |= PM_FILE; + } - for (; addr != end; addr += PAGE_SIZE, idx++) { - u64 cur_flags = flags; - pagemap_entry_t pme; +populate_pagemap: + for (; addr != end; addr += PAGE_SIZE, idx++) { + u64 cur_flags = flags; + pagemap_entry_t pme; - if (folio && (flags & PM_PRESENT) && - __folio_page_mapped_exclusively(folio, page)) - cur_flags |= PM_MMAP_EXCLUSIVE; + if (folio && (flags & PM_PRESENT) && + __folio_page_mapped_exclusively(folio, page)) + cur_flags |= PM_MMAP_EXCLUSIVE; - pme = make_pme(frame, cur_flags); - err = add_to_pagemap(&pme, pm); - if (err) - break; - if (pm->show_pfn) { - if (flags & PM_PRESENT) - frame++; - else if (flags & PM_SWAP) - frame += (1 << MAX_SWAPFILES_SHIFT); - } + pme = make_pme(frame, cur_flags); + err = add_to_pagemap(&pme, pm); + if (err) + break; + if (pm->show_pfn) { + if (flags & PM_PRESENT) + frame++; + else if (flags & PM_SWAP) + frame += (1 << MAX_SWAPFILES_SHIFT); } + } + return err; +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + +static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, + struct mm_walk *walk) +{ + struct vm_area_struct *vma = walk->vma; + struct pagemapread *pm = walk->private; + spinlock_t *ptl; + pte_t *pte, *orig_pte; + int err = 0; + +#ifdef CONFIG_TRANSPARENT_HUGEPAGE + ptl = pmd_trans_huge_lock(pmdp, vma); + if (ptl) { + err = pagemap_pmd_range_thp(pmdp, addr, end, vma, pm); spin_unlock(ptl); return err; } -#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ +#endif /* * We can assume that @vma always points to a valid one and @end never @@ -2310,12 +2331,16 @@ static unsigned long pagemap_page_category(struct pagemap_scan_private *p, struct vm_area_struct *vma, unsigned long addr, pte_t pte) { - unsigned long categories = 0; + unsigned long categories; + + if (pte_none(pte)) + return 0; if (pte_present(pte)) { struct page *page; - categories |= PAGE_IS_PRESENT; + categories = PAGE_IS_PRESENT; + if (!pte_uffd_wp(pte)) categories |= PAGE_IS_WRITTEN; @@ -2329,19 +2354,20 @@ static unsigned long pagemap_page_category(struct pagemap_scan_private *p, categories |= PAGE_IS_PFNZERO; if (pte_soft_dirty(pte)) categories |= PAGE_IS_SOFT_DIRTY; - } else if (is_swap_pte(pte)) { - swp_entry_t swp; + } else { + softleaf_t entry; + + categories = PAGE_IS_SWAPPED; - categories |= PAGE_IS_SWAPPED; if (!pte_swp_uffd_wp_any(pte)) categories |= PAGE_IS_WRITTEN; - swp = pte_to_swp_entry(pte); - if (is_guard_swp_entry(swp)) + entry = softleaf_from_pte(pte); + if (softleaf_is_guard_marker(entry)) categories |= PAGE_IS_GUARD; else if ((p->masks_of_interest & PAGE_IS_FILE) && - is_pfn_swap_entry(swp) && - !folio_test_anon(pfn_swap_entry_folio(swp))) + softleaf_has_pfn(entry) && + !folio_test_anon(softleaf_to_folio(entry))) categories |= PAGE_IS_FILE; if (pte_swp_soft_dirty(pte)) @@ -2360,12 +2386,12 @@ static void make_uffd_wp_pte(struct vm_area_struct *vma, old_pte = ptep_modify_prot_start(vma, addr, pte); ptent = pte_mkuffd_wp(old_pte); ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent); - } else if (is_swap_pte(ptent)) { - ptent = pte_swp_mkuffd_wp(ptent); - set_pte_at(vma->vm_mm, addr, pte, ptent); - } else { + } else if (pte_none(ptent)) { set_pte_at(vma->vm_mm, addr, pte, make_pte_marker(PTE_MARKER_UFFD_WP)); + } else { + ptent = pte_swp_mkuffd_wp(ptent); + set_pte_at(vma->vm_mm, addr, pte, ptent); } } @@ -2376,6 +2402,9 @@ static unsigned long pagemap_thp_category(struct pagemap_scan_private *p, { unsigned long categories = PAGE_IS_HUGE; + if (pmd_none(pmd)) + return categories; + if 
(pmd_present(pmd)) { struct page *page; @@ -2393,9 +2422,7 @@ static unsigned long pagemap_thp_category(struct pagemap_scan_private *p, categories |= PAGE_IS_PFNZERO; if (pmd_soft_dirty(pmd)) categories |= PAGE_IS_SOFT_DIRTY; - } else if (is_swap_pmd(pmd)) { - swp_entry_t swp; - + } else { categories |= PAGE_IS_SWAPPED; if (!pmd_swp_uffd_wp(pmd)) categories |= PAGE_IS_WRITTEN; @@ -2403,9 +2430,10 @@ static unsigned long pagemap_thp_category(struct pagemap_scan_private *p, categories |= PAGE_IS_SOFT_DIRTY; if (p->masks_of_interest & PAGE_IS_FILE) { - swp = pmd_to_swp_entry(pmd); - if (is_pfn_swap_entry(swp) && - !folio_test_anon(pfn_swap_entry_folio(swp))) + const softleaf_t entry = softleaf_from_pmd(pmd); + + if (softleaf_has_pfn(entry) && + !folio_test_anon(softleaf_to_folio(entry))) categories |= PAGE_IS_FILE; } } @@ -2422,7 +2450,7 @@ static void make_uffd_wp_pmd(struct vm_area_struct *vma, old = pmdp_invalidate_ad(vma, addr, pmdp); pmd = pmd_mkuffd_wp(old); set_pmd_at(vma->vm_mm, addr, pmdp, pmd); - } else if (is_migration_entry(pmd_to_swp_entry(pmd))) { + } else if (pmd_is_migration_entry(pmd)) { pmd = pmd_swp_mkuffd_wp(pmd); set_pmd_at(vma->vm_mm, addr, pmdp, pmd); } @@ -2434,6 +2462,9 @@ static unsigned long pagemap_hugetlb_category(pte_t pte) { unsigned long categories = PAGE_IS_HUGE; + if (pte_none(pte)) + return categories; + /* * According to pagemap_hugetlb_range(), file-backed HugeTLB * page cannot be swapped. So PAGE_IS_FILE is not checked for @@ -2441,6 +2472,7 @@ static unsigned long pagemap_hugetlb_category(pte_t pte) */ if (pte_present(pte)) { categories |= PAGE_IS_PRESENT; + if (!huge_pte_uffd_wp(pte)) categories |= PAGE_IS_WRITTEN; if (!PageAnon(pte_page(pte))) @@ -2449,8 +2481,9 @@ static unsigned long pagemap_hugetlb_category(pte_t pte) categories |= PAGE_IS_PFNZERO; if (pte_soft_dirty(pte)) categories |= PAGE_IS_SOFT_DIRTY; - } else if (is_swap_pte(pte)) { + } else { categories |= PAGE_IS_SWAPPED; + if (!pte_swp_uffd_wp_any(pte)) categories |= PAGE_IS_WRITTEN; if (pte_swp_soft_dirty(pte)) @@ -2464,22 +2497,25 @@ static void make_uffd_wp_huge_pte(struct vm_area_struct *vma, unsigned long addr, pte_t *ptep, pte_t ptent) { - unsigned long psize; + const unsigned long psize = huge_page_size(hstate_vma(vma)); + softleaf_t entry; - if (is_hugetlb_entry_hwpoisoned(ptent) || is_pte_marker(ptent)) + if (huge_pte_none(ptent)) { + set_huge_pte_at(vma->vm_mm, addr, ptep, + make_pte_marker(PTE_MARKER_UFFD_WP), psize); return; + } - psize = huge_page_size(hstate_vma(vma)); + entry = softleaf_from_pte(ptent); + if (softleaf_is_hwpoison(entry) || softleaf_is_marker(entry)) + return; - if (is_hugetlb_entry_migration(ptent)) + if (softleaf_is_migration(entry)) set_huge_pte_at(vma->vm_mm, addr, ptep, pte_swp_mkuffd_wp(ptent), psize); - else if (!huge_pte_none(ptent)) + else huge_ptep_modify_prot_commit(vma, addr, ptep, ptent, huge_pte_mkuffd_wp(ptent)); - else - set_huge_pte_at(vma->vm_mm, addr, ptep, - make_pte_marker(PTE_MARKER_UFFD_WP), psize); } #endif /* CONFIG_HUGETLB_PAGE */ diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c index 0e5050d6ab64..d6113dbe58e0 100644 --- a/fs/proc/thread_self.c +++ b/fs/proc/thread_self.c @@ -31,12 +31,11 @@ static const struct inode_operations proc_thread_self_inode_operations = { .get_link = proc_thread_self_get_link, }; -static unsigned thread_self_inum __ro_after_init; +unsigned thread_self_inum __ro_after_init; int proc_setup_thread_self(struct super_block *s) { struct inode *root_inode = d_inode(s->s_root); - struct proc_fs_info 
*fs_info = proc_sb_info(s); struct dentry *thread_self; int ret = -ENOMEM; @@ -51,19 +50,15 @@ int proc_setup_thread_self(struct super_block *s) inode->i_uid = GLOBAL_ROOT_UID; inode->i_gid = GLOBAL_ROOT_GID; inode->i_op = &proc_thread_self_inode_operations; - d_add(thread_self, inode); + d_make_persistent(thread_self, inode); ret = 0; - } else { - dput(thread_self); } + dput(thread_self); } inode_unlock(root_inode); if (ret) pr_err("proc_fill_super: can't allocate /proc/thread-self\n"); - else - fs_info->proc_thread_self = thread_self; - return ret; } diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index b4e55c90f8dc..71deffcc3356 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -373,7 +373,7 @@ int pstore_mkfile(struct dentry *root, struct pstore_record *record) if (!dentry) return -ENOMEM; - private->dentry = dentry; + private->dentry = dentry; // borrowed private->record = record; inode->i_size = private->total_size = size; inode->i_private = private; @@ -382,7 +382,8 @@ int pstore_mkfile(struct dentry *root, struct pstore_record *record) inode_set_mtime_to_ts(inode, inode_set_ctime_to_ts(inode, record->time)); - d_add(dentry, no_free_ptr(inode)); + d_make_persistent(dentry, no_free_ptr(inode)); + dput(dentry); list_add(&(no_free_ptr(private))->list, &records_list); @@ -465,7 +466,7 @@ static void pstore_kill_sb(struct super_block *sb) guard(mutex)(&pstore_sb_lock); WARN_ON(pstore_sb && pstore_sb != sb); - kill_litter_super(sb); + kill_anon_super(sb); pstore_sb = NULL; guard(mutex)(&records_list_lock); diff --git a/fs/pstore/ram.c b/fs/pstore/ram.c index bc68b4de5287..39936d6da0dd 100644 --- a/fs/pstore/ram.c +++ b/fs/pstore/ram.c @@ -864,6 +864,8 @@ static int ramoops_probe(struct platform_device *pdev) ramoops_console_size = pdata->console_size; ramoops_pmsg_size = pdata->pmsg_size; ramoops_ftrace_size = pdata->ftrace_size; + mem_type = pdata->mem_type; + ramoops_ecc = pdata->ecc_info.ecc_size; pr_info("using 0x%lx@0x%llx, ecc: %d\n", cxt->size, (unsigned long long)cxt->phys_addr, diff --git a/fs/ramfs/file-mmu.c b/fs/ramfs/file-mmu.c index b11f5b20b78b..c3ed1c5117b2 100644 --- a/fs/ramfs/file-mmu.c +++ b/fs/ramfs/file-mmu.c @@ -35,7 +35,7 @@ static unsigned long ramfs_mmu_get_unmapped_area(struct file *file, unsigned long addr, unsigned long len, unsigned long pgoff, unsigned long flags) { - return mm_get_unmapped_area(current->mm, file, addr, len, pgoff, flags); + return mm_get_unmapped_area(file, addr, len, pgoff, flags); } const struct file_operations ramfs_file_operations = { diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index 41f9995da7ca..505d10a0cb36 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -110,8 +110,7 @@ ramfs_mknod(struct mnt_idmap *idmap, struct inode *dir, goto out; } - d_instantiate(dentry, inode); - dget(dentry); /* Extra count - pin the dentry in core */ + d_make_persistent(dentry, inode); error = 0; inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); } @@ -154,8 +153,7 @@ static int ramfs_symlink(struct mnt_idmap *idmap, struct inode *dir, error = page_symlink(inode, symname, l); if (!error) { - d_instantiate(dentry, inode); - dget(dentry); + d_make_persistent(dentry, inode); inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); } else @@ -313,7 +311,7 @@ int ramfs_init_fs_context(struct fs_context *fc) void ramfs_kill_sb(struct super_block *sb) { kfree(sb->s_fs_info); - kill_litter_super(sb); + kill_anon_super(sb); } static struct file_system_type ramfs_fs_type = { diff --git a/fs/resctrl/pseudo_lock.c 
b/fs/resctrl/pseudo_lock.c index 87bbc2605de1..0bfc13c5b96d 100644 --- a/fs/resctrl/pseudo_lock.c +++ b/fs/resctrl/pseudo_lock.c @@ -995,10 +995,11 @@ static const struct vm_operations_struct pseudo_mmap_ops = { .mremap = pseudo_lock_dev_mremap, }; -static int pseudo_lock_dev_mmap(struct file *filp, struct vm_area_struct *vma) +static int pseudo_lock_dev_mmap_prepare(struct vm_area_desc *desc) { - unsigned long vsize = vma->vm_end - vma->vm_start; - unsigned long off = vma->vm_pgoff << PAGE_SHIFT; + unsigned long off = desc->pgoff << PAGE_SHIFT; + unsigned long vsize = vma_desc_size(desc); + struct file *filp = desc->file; struct pseudo_lock_region *plr; struct rdtgroup *rdtgrp; unsigned long physical; @@ -1043,7 +1044,7 @@ static int pseudo_lock_dev_mmap(struct file *filp, struct vm_area_struct *vma) * Ensure changes are carried directly to the memory being mapped, * do not allow copy-on-write mapping. */ - if (!(vma->vm_flags & VM_SHARED)) { + if (!(desc->vm_flags & VM_SHARED)) { mutex_unlock(&rdtgroup_mutex); return -EINVAL; } @@ -1055,12 +1056,9 @@ static int pseudo_lock_dev_mmap(struct file *filp, struct vm_area_struct *vma) memset(plr->kmem + off, 0, vsize); - if (remap_pfn_range(vma, vma->vm_start, physical + vma->vm_pgoff, - vsize, vma->vm_page_prot)) { - mutex_unlock(&rdtgroup_mutex); - return -EAGAIN; - } - vma->vm_ops = &pseudo_mmap_ops; + desc->vm_ops = &pseudo_mmap_ops; + mmap_action_remap_full(desc, physical + desc->pgoff); + mutex_unlock(&rdtgroup_mutex); return 0; } @@ -1071,7 +1069,7 @@ static const struct file_operations pseudo_lock_dev_fops = { .write = NULL, .open = pseudo_lock_dev_open, .release = pseudo_lock_dev_release, - .mmap = pseudo_lock_dev_mmap, + .mmap_prepare = pseudo_lock_dev_mmap_prepare, }; int rdt_pseudo_lock_init(void) diff --git a/fs/super.c b/fs/super.c index 7c66b96b59be..3d85265d1400 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1189,7 +1189,7 @@ static void filesystems_freeze_callback(struct super_block *sb, void *freeze_all if (!sb->s_op->freeze_fs && !sb->s_op->freeze_super) return; - if (freeze_all_ptr && !(sb->s_type->fs_flags & FS_POWER_FREEZE)) + if (!freeze_all_ptr && !(sb->s_type->fs_flags & FS_POWER_FREEZE)) return; if (!get_active_super(sb)) @@ -1292,14 +1292,6 @@ void kill_anon_super(struct super_block *sb) } EXPORT_SYMBOL(kill_anon_super); -void kill_litter_super(struct super_block *sb) -{ - if (sb->s_root) - d_genocide(sb->s_root); - kill_anon_super(sb); -} -EXPORT_SYMBOL(kill_litter_super); - int set_anon_super_fc(struct super_block *sb, struct fs_context *fc) { return set_anon_super(sb, NULL); diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c index e142bac4f9f8..e1e639f515a0 100644 --- a/fs/sysfs/group.c +++ b/fs/sysfs/group.c @@ -36,6 +36,9 @@ static umode_t __first_visible(const struct attribute_group *grp, struct kobject if (grp->attrs && grp->attrs[0] && grp->is_visible) return grp->is_visible(kobj, grp->attrs[0], 0); + if (grp->attrs && grp->attrs[0] && grp->is_visible_const) + return grp->is_visible_const(kobj, grp->attrs[0], 0); + if (grp->bin_attrs && grp->bin_attrs[0] && grp->is_bin_visible) return grp->is_bin_visible(kobj, grp->bin_attrs[0], 0); @@ -61,8 +64,11 @@ static int create_files(struct kernfs_node *parent, struct kobject *kobj, */ if (update) kernfs_remove_by_name(parent, (*attr)->name); - if (grp->is_visible) { - mode = grp->is_visible(kobj, *attr, i); + if (grp->is_visible || grp->is_visible_const) { + if (grp->is_visible) + mode = grp->is_visible(kobj, *attr, i); + else + mode = grp->is_visible_const(kobj, *attr, 
i); mode &= ~SYSFS_GROUP_INVISIBLE; if (!mode) continue; diff --git a/fs/tracefs/event_inode.c b/fs/tracefs/event_inode.c index 8705c77a9e75..61cbdafa2411 100644 --- a/fs/tracefs/event_inode.c +++ b/fs/tracefs/event_inode.c @@ -757,7 +757,7 @@ struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry const struct eventfs_entry *entries, int size, void *data) { - struct dentry *dentry = tracefs_start_creating(name, parent); + struct dentry *dentry; struct eventfs_root_inode *rei; struct eventfs_inode *ei; struct tracefs_inode *ti; @@ -768,6 +768,7 @@ struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry if (security_locked_down(LOCKDOWN_TRACEFS)) return NULL; + dentry = tracefs_start_creating(name, parent); if (IS_ERR(dentry)) return ERR_CAST(dentry); @@ -822,7 +823,7 @@ struct eventfs_inode *eventfs_create_events_dir(const char *name, struct dentry * something not worth much. Keeping directory links at 1 * tells userspace not to trust the link number. */ - d_instantiate(dentry, inode); + d_make_persistent(dentry, inode); /* The dentry of the "events" parent does keep track though */ inc_nlink(dentry->d_parent->d_inode); fsnotify_mkdir(dentry->d_parent->d_inode, dentry); @@ -909,5 +910,5 @@ void eventfs_remove_events_dir(struct eventfs_inode *ei) * and destroyed dynamically. */ d_invalidate(dentry); - dput(dentry); + d_make_discardable(dentry); } diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index 0c023941a316..d9d8932a7b9c 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -538,7 +538,7 @@ static struct file_system_type trace_fs_type = { .name = "tracefs", .init_fs_context = tracefs_init_fs_context, .parameters = tracefs_param_specs, - .kill_sb = kill_litter_super, + .kill_sb = kill_anon_super, }; MODULE_ALIAS_FS("tracefs"); @@ -571,16 +571,15 @@ struct dentry *tracefs_start_creating(const char *name, struct dentry *parent) struct dentry *tracefs_failed_creating(struct dentry *dentry) { - inode_unlock(d_inode(dentry->d_parent)); - dput(dentry); + simple_done_creating(dentry); simple_release_fs(&tracefs_mount, &tracefs_mount_count); return NULL; } struct dentry *tracefs_end_creating(struct dentry *dentry) { - inode_unlock(d_inode(dentry->d_parent)); - return dentry; + simple_done_creating(dentry); + return dentry; // borrowed } /* Find the inode that this will use for default */ @@ -661,7 +660,7 @@ struct dentry *tracefs_create_file(const char *name, umode_t mode, inode->i_private = data; inode->i_uid = d_inode(dentry->d_parent)->i_uid; inode->i_gid = d_inode(dentry->d_parent)->i_gid; - d_instantiate(dentry, inode); + d_make_persistent(dentry, inode); fsnotify_create(d_inode(dentry->d_parent), dentry); return tracefs_end_creating(dentry); } @@ -692,7 +691,7 @@ static struct dentry *__create_dir(const char *name, struct dentry *parent, /* directory inodes start off with i_nlink == 2 (for "." 
entry) */ inc_nlink(inode); - d_instantiate(dentry, inode); + d_make_persistent(dentry, inode); inc_nlink(d_inode(dentry->d_parent)); fsnotify_mkdir(d_inode(dentry->d_parent), dentry); return tracefs_end_creating(dentry); diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index e6e74b384087..c5ba1f4487bd 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -29,7 +29,7 @@ #include <linux/ioctl.h> #include <linux/security.h> #include <linux/hugetlb.h> -#include <linux/swapops.h> +#include <linux/leafops.h> #include <linux/miscdevice.h> #include <linux/uio.h> @@ -233,40 +233,48 @@ static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, { struct vm_area_struct *vma = vmf->vma; pte_t *ptep, pte; - bool ret = true; assert_fault_locked(vmf); ptep = hugetlb_walk(vma, vmf->address, vma_mmu_pagesize(vma)); if (!ptep) - goto out; + return true; - ret = false; pte = huge_ptep_get(vma->vm_mm, vmf->address, ptep); /* * Lockless access: we're in a wait_event so it's ok if it - * changes under us. PTE markers should be handled the same as none - * ptes here. + * changes under us. + */ + + /* Entry is still missing, wait for userspace to resolve the fault. */ + if (huge_pte_none(pte)) + return true; + /* UFFD PTE markers require userspace to resolve the fault. */ + if (pte_is_uffd_marker(pte)) + return true; + /* + * If VMA has UFFD WP faults enabled and WP fault, wait for userspace to + * resolve the fault. */ - if (huge_pte_none_mostly(pte)) - ret = true; if (!huge_pte_write(pte) && (reason & VM_UFFD_WP)) - ret = true; -out: - return ret; + return true; + + return false; } #else static inline bool userfaultfd_huge_must_wait(struct userfaultfd_ctx *ctx, struct vm_fault *vmf, unsigned long reason) { - return false; /* should never get here */ + /* Should never get here. */ + VM_WARN_ON_ONCE(1); + return false; } #endif /* CONFIG_HUGETLB_PAGE */ /* - * Verify the pagetables are still not ok after having reigstered into + * Verify the pagetables are still not ok after having registered into * the fault_pending_wqh to avoid userland having to UFFDIO_WAKE any * userfault that has already been resolved, if userfaultfd_read_iter and * UFFDIO_COPY|ZEROPAGE are being run simultaneously on two different @@ -284,53 +292,63 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx, pmd_t *pmd, _pmd; pte_t *pte; pte_t ptent; - bool ret = true; + bool ret; assert_fault_locked(vmf); pgd = pgd_offset(mm, address); if (!pgd_present(*pgd)) - goto out; + return true; p4d = p4d_offset(pgd, address); if (!p4d_present(*p4d)) - goto out; + return true; pud = pud_offset(p4d, address); if (!pud_present(*pud)) - goto out; + return true; pmd = pmd_offset(pud, address); again: _pmd = pmdp_get_lockless(pmd); if (pmd_none(_pmd)) - goto out; + return true; - ret = false; + /* + * A race could arise which would result in a softleaf entry such as + * migration entry unexpectedly being present in the PMD, so explicitly + * check for this and bail out if so. + */ if (!pmd_present(_pmd)) - goto out; + return false; - if (pmd_trans_huge(_pmd)) { - if (!pmd_write(_pmd) && (reason & VM_UFFD_WP)) - ret = true; - goto out; - } + if (pmd_trans_huge(_pmd)) + return !pmd_write(_pmd) && (reason & VM_UFFD_WP); pte = pte_offset_map(pmd, address); - if (!pte) { - ret = true; + if (!pte) goto again; - } + /* * Lockless access: we're in a wait_event so it's ok if it - * changes under us. PTE markers should be handled the same as none - * ptes here. + * changes under us. 
*/ ptent = ptep_get(pte); - if (pte_none_mostly(ptent)) - ret = true; + + ret = true; + /* Entry is still missing, wait for userspace to resolve the fault. */ + if (pte_none(ptent)) + goto out; + /* UFFD PTE markers require userspace to resolve the fault. */ + if (pte_is_uffd_marker(ptent)) + goto out; + /* + * If VMA has UFFD WP faults enabled and WP fault, wait for userspace to + * resolve the fault. + */ if (!pte_write(ptent) && (reason & VM_UFFD_WP)) - ret = true; - pte_unmap(pte); + goto out; + ret = false; out: + pte_unmap(pte); return ret; } @@ -490,12 +508,13 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) set_current_state(blocking_state); spin_unlock_irq(&ctx->fault_pending_wqh.lock); - if (!is_vm_hugetlb_page(vma)) - must_wait = userfaultfd_must_wait(ctx, vmf, reason); - else + if (is_vm_hugetlb_page(vma)) { must_wait = userfaultfd_huge_must_wait(ctx, vmf, reason); - if (is_vm_hugetlb_page(vma)) hugetlb_vma_unlock_read(vma); + } else { + must_wait = userfaultfd_must_wait(ctx, vmf, reason); + } + release_fault_lock(vmf); if (likely(must_wait && !READ_ONCE(ctx->released))) { @@ -1270,9 +1289,9 @@ static int userfaultfd_register(struct userfaultfd_ctx *ctx, if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MISSING) vm_flags |= VM_UFFD_MISSING; if (uffdio_register.mode & UFFDIO_REGISTER_MODE_WP) { -#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP - goto out; -#endif + if (!pgtable_supports_uffd_wp()) + goto out; + vm_flags |= VM_UFFD_WP; } if (uffdio_register.mode & UFFDIO_REGISTER_MODE_MINOR) { @@ -1980,14 +1999,14 @@ static int userfaultfd_api(struct userfaultfd_ctx *ctx, uffdio_api.features &= ~(UFFD_FEATURE_MINOR_HUGETLBFS | UFFD_FEATURE_MINOR_SHMEM); #endif -#ifndef CONFIG_HAVE_ARCH_USERFAULTFD_WP - uffdio_api.features &= ~UFFD_FEATURE_PAGEFAULT_FLAG_WP; -#endif -#ifndef CONFIG_PTE_MARKER_UFFD_WP - uffdio_api.features &= ~UFFD_FEATURE_WP_HUGETLBFS_SHMEM; - uffdio_api.features &= ~UFFD_FEATURE_WP_UNPOPULATED; - uffdio_api.features &= ~UFFD_FEATURE_WP_ASYNC; -#endif + if (!pgtable_supports_uffd_wp()) + uffdio_api.features &= ~UFFD_FEATURE_PAGEFAULT_FLAG_WP; + + if (!uffd_supports_wp_marker()) { + uffdio_api.features &= ~UFFD_FEATURE_WP_HUGETLBFS_SHMEM; + uffdio_api.features &= ~UFFD_FEATURE_WP_UNPOPULATED; + uffdio_api.features &= ~UFFD_FEATURE_WP_ASYNC; + } ret = -EINVAL; if (features & ~uffdio_api.features) |

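[Illustrative sketch, not part of the patch.] A recurring conversion throughout the hunks above replaces the open-coded pinning idiom — d_instantiate(dentry, inode) or d_add(dentry, inode) followed by dget(dentry) — with d_make_persistent(dentry, inode), and the matching dput() on unlink/rmdir with d_make_discardable(dentry); once no dentries are pinned by bare extra references, the affected filesystems can switch from kill_litter_super() to kill_anon_super(). A sketch of how a simple in-memory filesystem's mknod/unlink pair reads after the conversion, assuming d_make_persistent()/d_make_discardable() carry exactly the combined semantics implied by those replacements (foofs and foofs_get_inode() are hypothetical stand-ins, not in-tree code):

	#include <linux/fs.h>
	#include <linux/dcache.h>

	/* Hypothetical helper: allocate and initialise the backing inode for foofs. */
	static struct inode *foofs_get_inode(struct super_block *sb, const struct inode *dir,
					     umode_t mode, dev_t dev);

	static int foofs_mknod(struct mnt_idmap *idmap, struct inode *dir,
			       struct dentry *dentry, umode_t mode, dev_t dev)
	{
		struct inode *inode = foofs_get_inode(dir->i_sb, dir, mode, dev);

		if (!inode)
			return -ENOSPC;

		/* was: d_instantiate(dentry, inode); dget(dentry);  (extra count pinned the dentry) */
		d_make_persistent(dentry, inode);
		inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir));
		return 0;
	}

	static int foofs_unlink(struct inode *dir, struct dentry *dentry)
	{
		/* simple_unlink() now drops the pin via d_make_discardable() instead of dput() */
		return simple_unlink(dir, dentry);
	}

With the pin tracked this way there is nothing left for d_genocide() to undo at unmount, which is why the series can remove kill_litter_super() from fs/super.c and the d_genocide() declaration from fs/internal.h.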