Diffstat (limited to 'fs')
274 files changed, 6818 insertions, 3784 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 77e9c4387c1d..a020a8f00a1a 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -438,8 +438,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, v9ses->flags &= ~V9FS_ACCESS_MASK; v9ses->flags |= V9FS_ACCESS_USER; } - /*FIXME !! */ - /* for legacy mode, fall back to V9FS_ACCESS_ANY */ + /* FIXME: for legacy mode, fall back to V9FS_ACCESS_ANY */ if (!(v9fs_proto_dotu(v9ses) || v9fs_proto_dotl(v9ses)) && ((v9ses->flags&V9FS_ACCESS_MASK) == V9FS_ACCESS_USER)) { @@ -450,7 +449,7 @@ struct p9_fid *v9fs_session_init(struct v9fs_session_info *v9ses, if (!v9fs_proto_dotl(v9ses) || !((v9ses->flags & V9FS_ACCESS_MASK) == V9FS_ACCESS_CLIENT)) { /* - * We support ACL checks on clinet only if the protocol is + * We support ACL checks on client only if the protocol is * 9P2000.L and access is V9FS_ACCESS_CLIENT. */ v9ses->flags &= ~V9FS_ACL_MASK; @@ -561,7 +560,7 @@ static ssize_t caches_show(struct kobject *kobj, spin_lock(&v9fs_sessionlist_lock); list_for_each_entry(v9ses, &v9fs_sessionlist, slist) { if (v9ses->cachetag) { - n = snprintf(buf, limit, "%s\n", v9ses->cachetag); + n = snprintf(buf + count, limit, "%s\n", v9ses->cachetag); if (n < 0) { count = n; break; @@ -597,13 +596,16 @@ static const struct attribute_group v9fs_attr_group = { static int __init v9fs_sysfs_init(void) { + int ret; + v9fs_kobj = kobject_create_and_add("9p", fs_kobj); if (!v9fs_kobj) return -ENOMEM; - if (sysfs_create_group(v9fs_kobj, &v9fs_attr_group)) { + ret = sysfs_create_group(v9fs_kobj, &v9fs_attr_group); + if (ret) { kobject_put(v9fs_kobj); - return -ENOMEM; + return ret; } return 0; @@ -669,7 +671,7 @@ static int __init init_v9fs(void) int err; pr_info("Installing v9fs 9p2000 file system support\n"); - /* TODO: Setup list of registered trasnport modules */ + /* TODO: Setup list of registered transport modules */ err = v9fs_init_inode_cache(); if (err < 0) { diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c index 04795508a795..f3248a3e5402 100644 --- a/fs/9p/vfs_dentry.c +++ b/fs/9p/vfs_dentry.c @@ -66,6 +66,7 @@ static int __v9fs_lookup_revalidate(struct dentry *dentry, unsigned int flags) struct p9_fid *fid; struct inode *inode; struct v9fs_inode *v9inode; + unsigned int cached; if (flags & LOOKUP_RCU) return -ECHILD; @@ -75,13 +76,22 @@ static int __v9fs_lookup_revalidate(struct dentry *dentry, unsigned int flags) goto out_valid; v9inode = V9FS_I(inode); - if (v9inode->cache_validity & V9FS_INO_INVALID_ATTR) { + struct v9fs_session_info *v9ses = v9fs_inode2v9ses(inode); + + cached = v9ses->cache & (CACHE_META | CACHE_LOOSE); + + if (!cached || v9inode->cache_validity & V9FS_INO_INVALID_ATTR) { int retval; struct v9fs_session_info *v9ses; fid = v9fs_fid_lookup(dentry); - if (IS_ERR(fid)) + if (IS_ERR(fid)) { + p9_debug( + P9_DEBUG_VFS, + "v9fs_fid_lookup: dentry = %pd (%p), got error %pe\n", + dentry, dentry, fid); return PTR_ERR(fid); + } v9ses = v9fs_inode2v9ses(inode); if (v9fs_proto_dotl(v9ses)) @@ -90,12 +100,25 @@ static int __v9fs_lookup_revalidate(struct dentry *dentry, unsigned int flags) retval = v9fs_refresh_inode(fid, inode); p9_fid_put(fid); - if (retval == -ENOENT) + if (retval == -ENOENT) { + p9_debug(P9_DEBUG_VFS, "dentry: %pd (%p) invalidated due to ENOENT\n", + dentry, dentry); + return 0; + } + if (v9inode->cache_validity & V9FS_INO_INVALID_ATTR) { + p9_debug(P9_DEBUG_VFS, "dentry: %pd (%p) invalidated due to type change\n", + dentry, dentry); return 0; - if (retval < 0) + } + if (retval < 0) { + p9_debug(P9_DEBUG_VFS, + "refresh 
inode: dentry = %pd (%p), got error %pe\n", + dentry, dentry, ERR_PTR(retval)); return retval; + } } out_valid: + p9_debug(P9_DEBUG_VFS, "dentry: %pd (%p) is valid\n", dentry, dentry); return 1; } @@ -127,6 +150,8 @@ const struct dentry_operations v9fs_cached_dentry_operations = { }; const struct dentry_operations v9fs_dentry_operations = { + .d_revalidate = v9fs_lookup_revalidate, + .d_weak_revalidate = __v9fs_lookup_revalidate, .d_release = v9fs_dentry_release, .d_unalias_trylock = v9fs_dentry_unalias_trylock, .d_unalias_unlock = v9fs_dentry_unalias_unlock, diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 399d455d50d6..69f378a83775 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -768,22 +768,18 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry, struct v9fs_inode __maybe_unused *v9inode; struct v9fs_session_info *v9ses; struct p9_fid *fid; - struct dentry *res = NULL; struct inode *inode; int p9_omode; if (d_in_lookup(dentry)) { - res = v9fs_vfs_lookup(dir, dentry, 0); - if (IS_ERR(res)) - return PTR_ERR(res); - - if (res) - dentry = res; + struct dentry *res = v9fs_vfs_lookup(dir, dentry, 0); + if (res || d_really_is_positive(dentry)) + return finish_no_open(file, res); } /* Only creates */ - if (!(flags & O_CREAT) || d_really_is_positive(dentry)) - return finish_no_open(file, res); + if (!(flags & O_CREAT)) + return finish_no_open(file, NULL); v9ses = v9fs_inode2v9ses(dir); perm = unixmode2p9mode(v9ses, mode); @@ -795,17 +791,17 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry, "write-only file with writeback enabled, creating w/ O_RDWR\n"); } fid = v9fs_create(v9ses, dir, dentry, NULL, perm, p9_omode); - if (IS_ERR(fid)) { - err = PTR_ERR(fid); - goto error; - } + if (IS_ERR(fid)) + return PTR_ERR(fid); v9fs_invalidate_inode_attr(dir); inode = d_inode(dentry); v9inode = V9FS_I(inode); err = finish_open(file, dentry, generic_file_open); - if (err) - goto error; + if (unlikely(err)) { + p9_fid_put(fid); + return err; + } file->private_data = fid; #ifdef CONFIG_9P_FSCACHE @@ -818,13 +814,7 @@ v9fs_vfs_atomic_open(struct inode *dir, struct dentry *dentry, v9fs_open_fid_add(inode, &fid); file->f_mode |= FMODE_CREATED; -out: - dput(res); - return err; - -error: - p9_fid_put(fid); - goto out; + return 0; } /** @@ -1349,8 +1339,14 @@ int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode) * Don't update inode if the file type is different */ umode = p9mode2unixmode(v9ses, st, &rdev); - if (inode_wrong_type(inode, umode)) + if (inode_wrong_type(inode, umode)) { + /* + * Do this as a way of letting the caller know the inode should not + * be reused + */ + v9fs_invalidate_inode_attr(inode); goto out; + } /* * We don't want to refresh inode->i_size, diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 5b5fda617b80..0b404e8484d2 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -238,20 +238,16 @@ v9fs_vfs_atomic_open_dotl(struct inode *dir, struct dentry *dentry, struct p9_fid *dfid = NULL, *ofid = NULL; struct v9fs_session_info *v9ses; struct posix_acl *pacl = NULL, *dacl = NULL; - struct dentry *res = NULL; if (d_in_lookup(dentry)) { - res = v9fs_vfs_lookup(dir, dentry, 0); - if (IS_ERR(res)) - return PTR_ERR(res); - - if (res) - dentry = res; + struct dentry *res = v9fs_vfs_lookup(dir, dentry, 0); + if (res || d_really_is_positive(dentry)) + return finish_no_open(file, res); } /* Only creates */ - if (!(flags & O_CREAT) || d_really_is_positive(dentry)) - return finish_no_open(file, res); + if (!(flags & 
O_CREAT)) + return finish_no_open(file, NULL); v9ses = v9fs_inode2v9ses(dir); @@ -337,7 +333,6 @@ out: p9_fid_put(ofid); p9_fid_put(fid); v9fs_put_acl(dacl, pacl); - dput(res); return err; } @@ -902,8 +897,14 @@ int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode) /* * Don't update inode if the file type is different */ - if (inode_wrong_type(inode, st->st_mode)) + if (inode_wrong_type(inode, st->st_mode)) { + /* + * Do this as a way of letting the caller know the inode should not + * be reused + */ + v9fs_invalidate_inode_attr(inode); goto out; + } /* * We don't want to refresh inode->i_size, diff --git a/fs/afs/dir_edit.c b/fs/afs/dir_edit.c index 4b1342c72089..fd3aa9f97ce6 100644 --- a/fs/afs/dir_edit.c +++ b/fs/afs/dir_edit.c @@ -239,7 +239,7 @@ static void afs_edit_init_block(union afs_xdr_dir_block *meta, * The caller must hold the inode locked. */ void afs_edit_dir_add(struct afs_vnode *vnode, - struct qstr *name, struct afs_fid *new_fid, + const struct qstr *name, struct afs_fid *new_fid, enum afs_edit_dir_reason why) { union afs_xdr_dir_block *meta, *block; @@ -391,7 +391,7 @@ error: * The caller must hold the inode locked. */ void afs_edit_dir_remove(struct afs_vnode *vnode, - struct qstr *name, enum afs_edit_dir_reason why) + const struct qstr *name, enum afs_edit_dir_reason why) { union afs_xdr_dir_block *meta, *block, *pblock; union afs_xdr_dirent *de, *pde; diff --git a/fs/afs/dir_search.c b/fs/afs/dir_search.c index b25bd892db4d..d2516e55b5ed 100644 --- a/fs/afs/dir_search.c +++ b/fs/afs/dir_search.c @@ -188,7 +188,7 @@ bad: /* * Search the appropriate hash chain in the contents of an AFS directory. */ -int afs_dir_search(struct afs_vnode *dvnode, struct qstr *name, +int afs_dir_search(struct afs_vnode *dvnode, const struct qstr *name, struct afs_fid *_fid, afs_dataversion_t *_dir_version) { struct afs_dir_iter iter = { .dvnode = dvnode, }; diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 444a3ea4fdf6..a45ae5c2ef8a 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -1099,9 +1099,9 @@ int afs_single_writepages(struct address_space *mapping, /* * dir_edit.c */ -extern void afs_edit_dir_add(struct afs_vnode *, struct qstr *, struct afs_fid *, +extern void afs_edit_dir_add(struct afs_vnode *, const struct qstr *, struct afs_fid *, enum afs_edit_dir_reason); -extern void afs_edit_dir_remove(struct afs_vnode *, struct qstr *, enum afs_edit_dir_reason); +extern void afs_edit_dir_remove(struct afs_vnode *, const struct qstr *, enum afs_edit_dir_reason); void afs_edit_dir_update(struct afs_vnode *vnode, const struct qstr *name, struct afs_vnode *new_dvnode, enum afs_edit_dir_reason why); void afs_mkdir_init_dir(struct afs_vnode *dvnode, struct afs_vnode *parent_vnode); @@ -1114,7 +1114,7 @@ bool afs_dir_init_iter(struct afs_dir_iter *iter, const struct qstr *name); union afs_xdr_dir_block *afs_dir_find_block(struct afs_dir_iter *iter, size_t block); int afs_dir_search_bucket(struct afs_dir_iter *iter, const struct qstr *name, struct afs_fid *_fid); -int afs_dir_search(struct afs_vnode *dvnode, struct qstr *name, +int afs_dir_search(struct afs_vnode *dvnode, const struct qstr *name, struct afs_fid *_fid, afs_dataversion_t *_dir_version); /* diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index 9434a5399f2b..1ad048e6e164 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -137,7 +137,8 @@ static int afs_mntpt_set_params(struct fs_context *fc, struct dentry *mntpt) ret = -EINVAL; if (content[size - 1] == '.') - ret = vfs_parse_fs_string(fc, "source", content, 
size - 1); + ret = vfs_parse_fs_qstr(fc, "source", + &QSTR_LEN(content, size - 1)); do_delayed_call(&cleanup); if (ret < 0) return ret; diff --git a/fs/attr.c b/fs/attr.c index 5425c1dbbff9..795f231d00e8 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -286,20 +286,12 @@ static void setattr_copy_mgtime(struct inode *inode, const struct iattr *attr) unsigned int ia_valid = attr->ia_valid; struct timespec64 now; - if (ia_valid & ATTR_CTIME) { - /* - * In the case of an update for a write delegation, we must respect - * the value in ia_ctime and not use the current time. - */ - if (ia_valid & ATTR_DELEG) - now = inode_set_ctime_deleg(inode, attr->ia_ctime); - else - now = inode_set_ctime_current(inode); - } else { - /* If ATTR_CTIME isn't set, then ATTR_MTIME shouldn't be either. */ - WARN_ON_ONCE(ia_valid & ATTR_MTIME); + if (ia_valid & ATTR_CTIME_SET) + now = inode_set_ctime_deleg(inode, attr->ia_ctime); + else if (ia_valid & ATTR_CTIME) + now = inode_set_ctime_current(inode); + else now = current_time(inode); - } if (ia_valid & ATTR_ATIME_SET) inode_set_atime_to_ts(inode, attr->ia_atime); @@ -359,12 +351,11 @@ void setattr_copy(struct mnt_idmap *idmap, struct inode *inode, inode_set_atime_to_ts(inode, attr->ia_atime); if (ia_valid & ATTR_MTIME) inode_set_mtime_to_ts(inode, attr->ia_mtime); - if (ia_valid & ATTR_CTIME) { - if (ia_valid & ATTR_DELEG) - inode_set_ctime_deleg(inode, attr->ia_ctime); - else - inode_set_ctime_to_ts(inode, attr->ia_ctime); - } + + if (ia_valid & ATTR_CTIME_SET) + inode_set_ctime_deleg(inode, attr->ia_ctime); + else if (ia_valid & ATTR_CTIME) + inode_set_ctime_to_ts(inode, attr->ia_ctime); } EXPORT_SYMBOL(setattr_copy); @@ -463,15 +454,18 @@ int notify_change(struct mnt_idmap *idmap, struct dentry *dentry, now = current_time(inode); - attr->ia_ctime = now; - if (!(ia_valid & ATTR_ATIME_SET)) - attr->ia_atime = now; - else + if (ia_valid & ATTR_ATIME_SET) attr->ia_atime = timestamp_truncate(attr->ia_atime, inode); - if (!(ia_valid & ATTR_MTIME_SET)) - attr->ia_mtime = now; else + attr->ia_atime = now; + if (ia_valid & ATTR_CTIME_SET) + attr->ia_ctime = timestamp_truncate(attr->ia_ctime, inode); + else + attr->ia_ctime = now; + if (ia_valid & ATTR_MTIME_SET) attr->ia_mtime = timestamp_truncate(attr->ia_mtime, inode); + else + attr->ia_mtime = now; if (ia_valid & ATTR_KILL_PRIV) { error = security_inode_need_killpriv(dentry); diff --git a/fs/bpf_fs_kfuncs.c b/fs/bpf_fs_kfuncs.c index 1e36a12b88f7..5ace2511fec5 100644 --- a/fs/bpf_fs_kfuncs.c +++ b/fs/bpf_fs_kfuncs.c @@ -79,7 +79,7 @@ __bpf_kfunc void bpf_put_file(struct file *file) * pathname in *buf*, including the NUL termination character. On error, a * negative integer is returned. 
*/ -__bpf_kfunc int bpf_path_d_path(struct path *path, char *buf, size_t buf__sz) +__bpf_kfunc int bpf_path_d_path(const struct path *path, char *buf, size_t buf__sz) { int len; char *ret; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index ebbf55f8864b..0aa7e5d1b05f 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3397,7 +3397,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device if (fs_info->sectorsize > PAGE_SIZE) btrfs_warn(fs_info, - "support for block size %u with page size %zu is experimental, some features may be missing", + "support for block size %u with page size %lu is experimental, some features may be missing", fs_info->sectorsize, PAGE_SIZE); /* * Handle the space caching options appropriately now that we have the diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index d062ac521051..230d9326b685 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -23,7 +23,11 @@ static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len, int type; if (parent && (len < BTRFS_FID_SIZE_CONNECTABLE)) { - *max_len = BTRFS_FID_SIZE_CONNECTABLE; + if (btrfs_root_id(BTRFS_I(inode)->root) != + btrfs_root_id(BTRFS_I(parent)->root)) + *max_len = BTRFS_FID_SIZE_CONNECTABLE_ROOT; + else + *max_len = BTRFS_FID_SIZE_CONNECTABLE; return FILEID_INVALID; } else if (len < BTRFS_FID_SIZE_NON_CONNECTABLE) { *max_len = BTRFS_FID_SIZE_NON_CONNECTABLE; @@ -45,6 +49,8 @@ static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len, parent_root_id = btrfs_root_id(BTRFS_I(parent)->root); if (parent_root_id != fid->root_objectid) { + if (*max_len < BTRFS_FID_SIZE_CONNECTABLE_ROOT) + return FILEID_INVALID; fid->parent_root_objectid = parent_root_id; len = BTRFS_FID_SIZE_CONNECTABLE_ROOT; type = FILEID_BTRFS_WITH_PARENT_ROOT; diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index c123a3ef154a..755ec6dfd51c 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -973,7 +973,7 @@ static void btrfs_readahead_expand(struct readahead_control *ractl, { const u64 ra_pos = readahead_pos(ractl); const u64 ra_end = ra_pos + readahead_length(ractl); - const u64 em_end = em->start + em->ram_bytes; + const u64 em_end = em->start + em->len; /* No expansion for holes and inline extents. */ if (em->disk_bytenr > EXTENT_MAP_LAST_BYTE) diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index dad0b492a663..d86541073d42 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -1106,14 +1106,15 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans, * If ret is 1 (no key found), it means this is an empty block group, * without any extents allocated from it and there's no block group * item (key BTRFS_BLOCK_GROUP_ITEM_KEY) located in the extent tree - * because we are using the block group tree feature, so block group - * items are stored in the block group tree. It also means there are no - * extents allocated for block groups with a start offset beyond this - * block group's end offset (this is the last, highest, block group). + * because we are using the block group tree feature (so block group + * items are stored in the block group tree) or this is a new block + * group created in the current transaction and its block group item + * was not yet inserted in the extent tree (that happens in + * btrfs_create_pending_block_groups() -> insert_block_group_item()). 
+ * It also means there are no extents allocated for block groups with a + * start offset beyond this block group's end offset (this is the last, + * highest, block group). */ - if (!btrfs_fs_compat_ro(trans->fs_info, BLOCK_GROUP_TREE)) - ASSERT(ret == 0); - start = block_group->start; end = block_group->start + block_group->length; while (ret == 0) { diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 185bef0df1c2..8cb7d5a462ef 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3740,7 +3740,7 @@ static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg) prealloc = kzalloc(sizeof(*prealloc), GFP_KERNEL); if (!prealloc) { ret = -ENOMEM; - goto drop_write; + goto out; } } diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 8dd8de6b9fb8..0765e06d00b8 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3780,6 +3780,7 @@ out: /* * Mark start of chunk relocation that is cancellable. Check if the cancellation * has been requested meanwhile and don't start in that case. + * NOTE: if this returns an error, reloc_chunk_end() must not be called. * * Return: * 0 success @@ -3796,10 +3797,8 @@ static int reloc_chunk_start(struct btrfs_fs_info *fs_info) if (atomic_read(&fs_info->reloc_cancel_req) > 0) { btrfs_info(fs_info, "chunk relocation canceled on start"); - /* - * On cancel, clear all requests but let the caller mark - * the end after cleanup operations. - */ + /* On cancel, clear all requests. */ + clear_and_wake_up_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags); atomic_set(&fs_info->reloc_cancel_req, 0); return -ECANCELED; } @@ -3808,9 +3807,11 @@ static int reloc_chunk_start(struct btrfs_fs_info *fs_info) /* * Mark end of chunk relocation that is cancellable and wake any waiters. + * NOTE: call only if a previous call to reloc_chunk_start() succeeded. */ static void reloc_chunk_end(struct btrfs_fs_info *fs_info) { + ASSERT(test_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)); /* Requested after start, clear bit first so any waiters can continue */ if (atomic_read(&fs_info->reloc_cancel_req) > 0) btrfs_info(fs_info, "chunk relocation canceled during operation"); @@ -4023,9 +4024,9 @@ out: if (err && rw) btrfs_dec_block_group_ro(rc->block_group); iput(rc->data_inode); + reloc_chunk_end(fs_info); out_put_bg: btrfs_put_block_group(bg); - reloc_chunk_end(fs_info); free_reloc_control(rc); return err; } @@ -4208,8 +4209,8 @@ out_clean: ret = ret2; out_unset: unset_reloc_control(rc); -out_end: reloc_chunk_end(fs_info); +out_end: free_reloc_control(rc); out: free_reloc_roots(&reloc_roots); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 4691d0bdb2e8..651b11884f82 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -694,7 +694,7 @@ static void *scrub_stripe_get_kaddr(struct scrub_stripe *stripe, int sector_nr) /* stripe->folios[] is allocated by us and no highmem is allowed. */ ASSERT(folio); - ASSERT(!folio_test_partial_kmap(folio)); + ASSERT(!folio_test_highmem(folio)); return folio_address(folio) + offset_in_folio(folio, offset); } @@ -707,7 +707,7 @@ static phys_addr_t scrub_stripe_get_paddr(struct scrub_stripe *stripe, int secto /* stripe->folios[] is allocated by us and no highmem is allowed. */ ASSERT(folio); - ASSERT(!folio_test_partial_kmap(folio)); + ASSERT(!folio_test_highmem(folio)); /* And the range must be contained inside the folio. 
*/ ASSERT(offset_in_folio(folio, offset) + fs_info->sectorsize <= folio_size(folio)); return page_to_phys(folio_page(folio, 0)) + offset_in_folio(folio, offset); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 9230e5066fc6..6144e66661f5 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -178,7 +178,6 @@ struct send_ctx { u64 cur_inode_rdev; u64 cur_inode_last_extent; u64 cur_inode_next_write_offset; - struct fs_path cur_inode_path; bool cur_inode_new; bool cur_inode_new_gen; bool cur_inode_deleted; @@ -305,6 +304,9 @@ struct send_ctx { struct btrfs_lru_cache dir_created_cache; struct btrfs_lru_cache dir_utimes_cache; + + /* Must be last as it ends in a flexible-array member. */ + struct fs_path cur_inode_path; }; struct pending_dir_move { diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index d6e496436539..aadc02374b2a 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1900,8 +1900,6 @@ static int btrfs_get_tree_super(struct fs_context *fc) return PTR_ERR(sb); } - set_device_specific_options(fs_info); - if (sb->s_root) { /* * Not the first mount of the fs thus got an existing super block. @@ -1946,6 +1944,7 @@ static int btrfs_get_tree_super(struct fs_context *fc) deactivate_locked_super(sb); return -EACCES; } + set_device_specific_options(fs_info); bdev = fs_devices->latest_dev->bdev; snprintf(sb->s_id, sizeof(sb->s_id), "%pg", bdev); shrinker_debugfs_rename(sb->s_shrink, "sb-btrfs:%s", sb->s_id); diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index ca30b15ea452..c10b4c242acf 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -1797,7 +1797,7 @@ static int check_inode_extref(struct extent_buffer *leaf, struct btrfs_inode_extref *extref = (struct btrfs_inode_extref *)ptr; u16 namelen; - if (unlikely(ptr + sizeof(*extref)) > end) { + if (unlikely(ptr + sizeof(*extref) > end)) { inode_ref_err(leaf, slot, "inode extref overflow, ptr %lu end %lu inode_extref size %zu", ptr, end, sizeof(*extref)); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 6aad6b65522b..621e0df097e3 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -5829,7 +5829,7 @@ struct btrfs_dir_list { * See process_dir_items_leaf() for details about why it is needed. * This is a recursive operation - if an existing dentry corresponds to a * directory, that directory's new entries are logged too (same behaviour as - * ext3/4, xfs, f2fs, reiserfs, nilfs2). Note that when logging the inodes + * ext3/4, xfs, f2fs, nilfs2). 
Note that when logging the inodes * the dentries point to we do not acquire their VFS lock, otherwise lockdep * complains about the following circular lock dependency / possible deadlock: * diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index e00036672f33..0ea0df18a8e4 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1753,7 +1753,7 @@ out: !fs_info->stripe_root) { btrfs_err(fs_info, "zoned: data %s needs raid-stripe-tree", btrfs_bg_type_to_raid_name(map->type)); - return -EINVAL; + ret = -EINVAL; } if (unlikely(cache->alloc_offset > cache->zone_capacity)) { diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 32973c62c1a2..d18c0eaef9b7 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -1260,8 +1260,7 @@ static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc, spin_unlock(&fsc->async_unlink_conflict_lock); spin_lock(&dentry->d_lock); - di->flags &= ~CEPH_DENTRY_ASYNC_UNLINK; - wake_up_bit(&di->flags, CEPH_DENTRY_ASYNC_UNLINK_BIT); + clear_and_wake_up_bit(CEPH_DENTRY_ASYNC_UNLINK_BIT, &di->flags); spin_unlock(&dentry->d_lock); synchronize_rcu(); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 978acd3d4b32..99b30f784ee2 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -579,8 +579,7 @@ static void wake_async_create_waiters(struct inode *inode, spin_lock(&ci->i_ceph_lock); if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) { - ci->i_ceph_flags &= ~CEPH_I_ASYNC_CREATE; - wake_up_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT); + clear_and_wake_up_bit(CEPH_ASYNC_CREATE_BIT, &ci->i_ceph_flags); if (ci->i_ceph_flags & CEPH_I_ASYNC_CHECK_CAPS) { ci->i_ceph_flags &= ~CEPH_I_ASYNC_CHECK_CAPS; @@ -762,8 +761,7 @@ static int ceph_finish_async_create(struct inode *dir, struct inode *inode, } spin_lock(&dentry->d_lock); - di->flags &= ~CEPH_DENTRY_ASYNC_CREATE; - wake_up_bit(&di->flags, CEPH_DENTRY_ASYNC_CREATE_BIT); + clear_and_wake_up_bit(CEPH_DENTRY_ASYNC_CREATE_BIT, &di->flags); spin_unlock(&dentry->d_lock); return ret; @@ -2121,10 +2119,10 @@ again: if (ceph_inode_is_shutdown(inode)) return -ESTALE; - if (direct_lock) - ceph_start_io_direct(inode); - else - ceph_start_io_read(inode); + ret = direct_lock ? ceph_start_io_direct(inode) : + ceph_start_io_read(inode); + if (ret) + return ret; if (!(fi->flags & CEPH_F_SYNC) && !direct_lock) want |= CEPH_CAP_FILE_CACHE; @@ -2277,7 +2275,9 @@ static ssize_t ceph_splice_read(struct file *in, loff_t *ppos, (fi->flags & CEPH_F_SYNC)) return copy_splice_read(in, ppos, pipe, len, flags); - ceph_start_io_read(inode); + ret = ceph_start_io_read(inode); + if (ret) + return ret; want = CEPH_CAP_FILE_CACHE; if (fi->fmode & CEPH_FILE_MODE_LAZY) @@ -2356,10 +2356,10 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) direct_lock = true; retry_snap: - if (direct_lock) - ceph_start_io_direct(inode); - else - ceph_start_io_write(inode); + err = direct_lock ? 
ceph_start_io_direct(inode) : + ceph_start_io_write(inode); + if (err) + goto out_unlocked; if (iocb->ki_flags & IOCB_APPEND) { err = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false); @@ -2878,7 +2878,7 @@ static ssize_t ceph_do_objects_copy(struct ceph_inode_info *src_ci, u64 *src_off struct ceph_object_id src_oid, dst_oid; struct ceph_osd_client *osdc; struct ceph_osd_request *req; - size_t bytes = 0; + ssize_t bytes = 0; u64 src_objnum, src_objoff, dst_objnum, dst_objoff; u32 src_objlen, dst_objlen; u32 object_size = src_ci->i_layout.object_size; @@ -2928,7 +2928,7 @@ static ssize_t ceph_do_objects_copy(struct ceph_inode_info *src_ci, u64 *src_off "OSDs don't support copy-from2; disabling copy offload\n"); } doutc(cl, "returned %d\n", ret); - if (!bytes) + if (bytes <= 0) bytes = ret; goto out; } diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 949f0badc944..a6e260d9e420 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1794,6 +1794,11 @@ retry_lookup: goto done; } + if (unlikely(!in)) { + err = -EINVAL; + goto done; + } + /* attach proper inode */ if (d_really_is_negative(dn)) { ceph_dir_clear_ordered(dir); @@ -1829,6 +1834,12 @@ retry_lookup: doutc(cl, " linking snapped dir %p to dn %p\n", in, req->r_dentry); ceph_dir_clear_ordered(dir); + + if (unlikely(!in)) { + err = -EINVAL; + goto done; + } + ihold(in); err = splice_dentry(&req->r_dentry, in); if (err < 0) diff --git a/fs/ceph/io.c b/fs/ceph/io.c index c456509b31c3..2d10f49c93a9 100644 --- a/fs/ceph/io.c +++ b/fs/ceph/io.c @@ -21,14 +21,23 @@ /* Call with exclusively locked inode->i_rwsem */ static void ceph_block_o_direct(struct ceph_inode_info *ci, struct inode *inode) { + bool is_odirect; + lockdep_assert_held_write(&inode->i_rwsem); - if (READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT) { - spin_lock(&ci->i_ceph_lock); - ci->i_ceph_flags &= ~CEPH_I_ODIRECT; - spin_unlock(&ci->i_ceph_lock); - inode_dio_wait(inode); + spin_lock(&ci->i_ceph_lock); + /* ensure that bit state is consistent */ + smp_mb__before_atomic(); + is_odirect = READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT; + if (is_odirect) { + clear_bit(CEPH_I_ODIRECT_BIT, &ci->i_ceph_flags); + /* ensure modified bit is visible */ + smp_mb__after_atomic(); } + spin_unlock(&ci->i_ceph_lock); + + if (is_odirect) + inode_dio_wait(inode); } /** @@ -47,20 +56,35 @@ static void ceph_block_o_direct(struct ceph_inode_info *ci, struct inode *inode) * Note that buffered writes and truncates both take a write lock on * inode->i_rwsem, meaning that those are serialised w.r.t. the reads. */ -void -ceph_start_io_read(struct inode *inode) +int ceph_start_io_read(struct inode *inode) { struct ceph_inode_info *ci = ceph_inode(inode); + bool is_odirect; + int err; /* Be an optimist! */ - down_read(&inode->i_rwsem); - if (!(READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT)) - return; + err = down_read_killable(&inode->i_rwsem); + if (err) + return err; + + spin_lock(&ci->i_ceph_lock); + /* ensure that bit state is consistent */ + smp_mb__before_atomic(); + is_odirect = READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT; + spin_unlock(&ci->i_ceph_lock); + if (!is_odirect) + return 0; up_read(&inode->i_rwsem); + /* Slow path.... */ - down_write(&inode->i_rwsem); + err = down_write_killable(&inode->i_rwsem); + if (err) + return err; + ceph_block_o_direct(ci, inode); downgrade_write(&inode->i_rwsem); + + return 0; } /** @@ -83,11 +107,12 @@ ceph_end_io_read(struct inode *inode) * Declare that a buffered write operation is about to start, and ensure * that we block all direct I/O. 
*/ -void -ceph_start_io_write(struct inode *inode) +int ceph_start_io_write(struct inode *inode) { - down_write(&inode->i_rwsem); - ceph_block_o_direct(ceph_inode(inode), inode); + int err = down_write_killable(&inode->i_rwsem); + if (!err) + ceph_block_o_direct(ceph_inode(inode), inode); + return err; } /** @@ -106,12 +131,22 @@ ceph_end_io_write(struct inode *inode) /* Call with exclusively locked inode->i_rwsem */ static void ceph_block_buffered(struct ceph_inode_info *ci, struct inode *inode) { + bool is_odirect; + lockdep_assert_held_write(&inode->i_rwsem); - if (!(READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT)) { - spin_lock(&ci->i_ceph_lock); - ci->i_ceph_flags |= CEPH_I_ODIRECT; - spin_unlock(&ci->i_ceph_lock); + spin_lock(&ci->i_ceph_lock); + /* ensure that bit state is consistent */ + smp_mb__before_atomic(); + is_odirect = READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT; + if (!is_odirect) { + set_bit(CEPH_I_ODIRECT_BIT, &ci->i_ceph_flags); + /* ensure modified bit is visible */ + smp_mb__after_atomic(); + } + spin_unlock(&ci->i_ceph_lock); + + if (!is_odirect) { /* FIXME: unmap_mapping_range? */ filemap_write_and_wait(inode->i_mapping); } @@ -133,20 +168,35 @@ static void ceph_block_buffered(struct ceph_inode_info *ci, struct inode *inode) * Note that buffered writes and truncates both take a write lock on * inode->i_rwsem, meaning that those are serialised w.r.t. O_DIRECT. */ -void -ceph_start_io_direct(struct inode *inode) +int ceph_start_io_direct(struct inode *inode) { struct ceph_inode_info *ci = ceph_inode(inode); + bool is_odirect; + int err; /* Be an optimist! */ - down_read(&inode->i_rwsem); - if (READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT) - return; + err = down_read_killable(&inode->i_rwsem); + if (err) + return err; + + spin_lock(&ci->i_ceph_lock); + /* ensure that bit state is consistent */ + smp_mb__before_atomic(); + is_odirect = READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT; + spin_unlock(&ci->i_ceph_lock); + if (is_odirect) + return 0; up_read(&inode->i_rwsem); + /* Slow path.... 
*/ - down_write(&inode->i_rwsem); + err = down_write_killable(&inode->i_rwsem); + if (err) + return err; + ceph_block_buffered(ci, inode); downgrade_write(&inode->i_rwsem); + + return 0; } /** diff --git a/fs/ceph/io.h b/fs/ceph/io.h index fa594cd77348..79029825e8b8 100644 --- a/fs/ceph/io.h +++ b/fs/ceph/io.h @@ -2,11 +2,13 @@ #ifndef _FS_CEPH_IO_H #define _FS_CEPH_IO_H -void ceph_start_io_read(struct inode *inode); +#include <linux/compiler_attributes.h> + +int __must_check ceph_start_io_read(struct inode *inode); void ceph_end_io_read(struct inode *inode); -void ceph_start_io_write(struct inode *inode); +int __must_check ceph_start_io_write(struct inode *inode); void ceph_end_io_write(struct inode *inode); -void ceph_start_io_direct(struct inode *inode); +int __must_check ceph_start_io_direct(struct inode *inode); void ceph_end_io_direct(struct inode *inode); #endif /* FS_CEPH_IO_H */ diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index e861de3c79b9..15cde055f3da 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -246,21 +246,28 @@ static long ceph_ioctl_lazyio(struct file *file) struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc; struct ceph_client *cl = mdsc->fsc->client; + bool is_file_already_lazy = false; + spin_lock(&ci->i_ceph_lock); if ((fi->fmode & CEPH_FILE_MODE_LAZY) == 0) { - spin_lock(&ci->i_ceph_lock); fi->fmode |= CEPH_FILE_MODE_LAZY; ci->i_nr_by_mode[ffs(CEPH_FILE_MODE_LAZY)]++; __ceph_touch_fmode(ci, mdsc, fi->fmode); - spin_unlock(&ci->i_ceph_lock); + } else { + is_file_already_lazy = true; + } + spin_unlock(&ci->i_ceph_lock); + + if (is_file_already_lazy) { + doutc(cl, "file %p %p %llx.%llx already lazy\n", file, inode, + ceph_vinop(inode)); + } else { doutc(cl, "file %p %p %llx.%llx marked lazy\n", file, inode, ceph_vinop(inode)); ceph_check_caps(ci, 0); - } else { - doutc(cl, "file %p %p %llx.%llx already lazy\n", file, inode, - ceph_vinop(inode)); } + return 0; } diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index ebf4ac0055dd..dd764f9c64b9 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -221,7 +221,10 @@ static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc, if (err && err != -ERESTARTSYS) return err; - wait_for_completion_killable(&req->r_safe_completion); + err = wait_for_completion_killable(&req->r_safe_completion); + if (err) + return err; + return 0; } diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 73da2648fa0f..1740047aef0f 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -979,14 +979,15 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, if (mds >= mdsc->max_sessions) { int newmax = 1 << get_count_order(mds + 1); struct ceph_mds_session **sa; + size_t ptr_size = sizeof(struct ceph_mds_session *); doutc(cl, "realloc to %d\n", newmax); - sa = kcalloc(newmax, sizeof(void *), GFP_NOFS); + sa = kcalloc(newmax, ptr_size, GFP_NOFS); if (!sa) goto fail_realloc; if (mdsc->sessions) { memcpy(sa, mdsc->sessions, - mdsc->max_sessions * sizeof(void *)); + mdsc->max_sessions * ptr_size); kfree(mdsc->sessions); } mdsc->sessions = sa; @@ -2532,6 +2533,7 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req, struct ceph_mount_options *opt = req->r_mdsc->fsc->mount_options; size_t size = sizeof(struct ceph_mds_reply_dir_entry); unsigned int num_entries; + u64 bytes_count; int order; spin_lock(&ci->i_ceph_lock); @@ -2540,7 +2542,11 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req, num_entries = 
max(num_entries, 1U); num_entries = min(num_entries, opt->max_readdir); - order = get_order(size * num_entries); + bytes_count = (u64)size * num_entries; + if (unlikely(bytes_count > ULONG_MAX)) + bytes_count = ULONG_MAX; + + order = get_order((unsigned long)bytes_count); while (order >= 0) { rinfo->dir_entries = (void*)__get_free_pages(GFP_KERNEL | __GFP_NOWARN | @@ -2550,7 +2556,7 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req, break; order--; } - if (!rinfo->dir_entries) + if (!rinfo->dir_entries || unlikely(order < 0)) return -ENOMEM; num_entries = (PAGE_SIZE << order) / size; @@ -5649,11 +5655,19 @@ static int ceph_mds_auth_match(struct ceph_mds_client *mdsc, u32 caller_uid = from_kuid(&init_user_ns, cred->fsuid); u32 caller_gid = from_kgid(&init_user_ns, cred->fsgid); struct ceph_client *cl = mdsc->fsc->client; + const char *fs_name = mdsc->fsc->mount_options->mds_namespace; const char *spath = mdsc->fsc->mount_options->server_path; bool gid_matched = false; u32 gid, tlen, len; int i, j; + doutc(cl, "fsname check fs_name=%s match.fs_name=%s\n", + fs_name, auth->match.fs_name ? auth->match.fs_name : ""); + if (auth->match.fs_name && strcmp(auth->match.fs_name, fs_name)) { + /* fsname mismatch, try next one */ + return 0; + } + doutc(cl, "match.uid %lld\n", auth->match.uid); if (auth->match.uid != MDS_AUTH_UID_ANY) { if (auth->match.uid != caller_uid) diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index 8109aba66e02..2c7b151a7c95 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -353,10 +353,22 @@ struct ceph_mdsmap *ceph_mdsmap_decode(struct ceph_mds_client *mdsc, void **p, __decode_and_drop_type(p, end, u8, bad_ext); } if (mdsmap_ev >= 8) { + u32 fsname_len; /* enabled */ ceph_decode_8_safe(p, end, m->m_enabled, bad_ext); /* fs_name */ - ceph_decode_skip_string(p, end, bad_ext); + ceph_decode_32_safe(p, end, fsname_len, bad_ext); + + /* validate fsname against mds_namespace */ + if (!namespace_equals(mdsc->fsc->mount_options, *p, + fsname_len)) { + pr_warn_client(cl, "fsname %*pE doesn't match mds_namespace %s\n", + (int)fsname_len, (char *)*p, + mdsc->fsc->mount_options->mds_namespace); + goto bad; + } + /* skip fsname after validation */ + ceph_decode_skip_n(p, end, fsname_len, bad); } /* damaged */ if (mdsmap_ev >= 9) { diff --git a/fs/ceph/super.c b/fs/ceph/super.c index db6c2db68f96..ad0cf177e75a 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -246,20 +246,6 @@ static void canonicalize_path(char *path) path[j] = '\0'; } -/* - * Check if the mds namespace in ceph_mount_options matches - * the passed in namespace string. First time match (when - * ->mds_namespace is NULL) is treated specially, since - * ->mds_namespace needs to be initialized by the caller. - */ -static int namespace_equals(struct ceph_mount_options *fsopt, - const char *namespace, size_t len) -{ - return !(fsopt->mds_namespace && - (strlen(fsopt->mds_namespace) != len || - strncmp(fsopt->mds_namespace, namespace, len))); -} - static int ceph_parse_old_source(const char *dev_name, const char *dev_name_end, struct fs_context *fc) { diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 25d8bacbcf44..a1f781c46b41 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -104,6 +104,20 @@ struct ceph_mount_options { struct fscrypt_dummy_policy dummy_enc_policy; }; +/* + * Check if the mds namespace in ceph_mount_options matches + * the passed in namespace string. 
First time match (when + * ->mds_namespace is NULL) is treated specially, since + * ->mds_namespace needs to be initialized by the caller. + */ +static inline int namespace_equals(struct ceph_mount_options *fsopt, + const char *namespace, size_t len) +{ + return !(fsopt->mds_namespace && + (strlen(fsopt->mds_namespace) != len || + strncmp(fsopt->mds_namespace, namespace, len))); +} + /* mount state */ enum { CEPH_MOUNT_MOUNTING, @@ -639,7 +653,8 @@ static inline struct inode *ceph_find_inode(struct super_block *sb, #define CEPH_I_FLUSH_SNAPS (1 << 8) /* need flush snapss */ #define CEPH_I_ERROR_WRITE (1 << 9) /* have seen write errors */ #define CEPH_I_ERROR_FILELOCK (1 << 10) /* have seen file lock errors */ -#define CEPH_I_ODIRECT (1 << 11) /* inode in direct I/O mode */ +#define CEPH_I_ODIRECT_BIT (11) /* inode in direct I/O mode */ +#define CEPH_I_ODIRECT (1 << CEPH_I_ODIRECT_BIT) #define CEPH_ASYNC_CREATE_BIT (12) /* async create in flight for this */ #define CEPH_I_ASYNC_CREATE (1 << CEPH_ASYNC_CREATE_BIT) #define CEPH_I_SHUTDOWN (1 << 13) /* inode is no longer usable */ diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index f327fbb9a0ca..81f4f06bc87e 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -1601,10 +1601,7 @@ static int configfs_dir_open(struct inode *inode, struct file *file) err = -ENOENT; if (configfs_dirent_is_ready(parent_sd)) { file->private_data = configfs_new_dirent(parent_sd, NULL, 0, NULL); - if (IS_ERR(file->private_data)) - err = PTR_ERR(file->private_data); - else - err = 0; + err = PTR_ERR_OR_ZERO(file->private_data); } inode_unlock(d_inode(dentry)); diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c index 69133ec1fac2..f3f79c67add5 100644 --- a/fs/configfs/symlink.c +++ b/fs/configfs/symlink.c @@ -114,26 +114,21 @@ static int create_link(struct config_item *parent_item, } -static int get_target(const char *symname, struct path *path, - struct config_item **target, struct super_block *sb) +static int get_target(const char *symname, struct config_item **target, + struct super_block *sb) { + struct path path __free(path_put) = {}; int ret; - ret = kern_path(symname, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, path); - if (!ret) { - if (path->dentry->d_sb == sb) { - *target = configfs_get_config_item(path->dentry); - if (!*target) { - ret = -ENOENT; - path_put(path); - } - } else { - ret = -EPERM; - path_put(path); - } - } - - return ret; + ret = kern_path(symname, LOOKUP_FOLLOW|LOOKUP_DIRECTORY, &path); + if (ret) + return ret; + if (path.dentry->d_sb != sb) + return -EPERM; + *target = configfs_get_config_item(path.dentry); + if (!*target) + return -ENOENT; + return 0; } @@ -141,7 +136,6 @@ int configfs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const char *symname) { int ret; - struct path path; struct configfs_dirent *sd; struct config_item *parent_item; struct config_item *target_item = NULL; @@ -188,7 +182,7 @@ int configfs_symlink(struct mnt_idmap *idmap, struct inode *dir, * AV, a thoroughly annoyed bastard. 
*/ inode_unlock(dir); - ret = get_target(symname, &path, &target_item, dentry->d_sb); + ret = get_target(symname, &target_item, dentry->d_sb); inode_lock(dir); if (ret) goto out_put; @@ -210,7 +204,6 @@ int configfs_symlink(struct mnt_idmap *idmap, struct inode *dir, } config_item_put(target_item); - path_put(&path); out_put: config_item_put(parent_item); diff --git a/fs/coredump.c b/fs/coredump.c index b5fc06a092a4..5c1c381ee380 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -1468,7 +1468,7 @@ static int proc_dostring_coredump(const struct ctl_table *table, int write, ssize_t retval; char old_core_pattern[CORENAME_MAX_SIZE]; - if (write) + if (!write) return proc_dostring(table, write, buffer, lenp, ppos); retval = strscpy(old_core_pattern, core_pattern, CORENAME_MAX_SIZE); @@ -1725,7 +1725,7 @@ dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, if (iov_iter_rw(iter) == WRITE) { lockdep_assert_held_write(&iomi.inode->i_rwsem); iomi.flags |= IOMAP_WRITE; - } else { + } else if (!sb_rdonly(iomi.inode->i_sb)) { lockdep_assert_held(&iomi.inode->i_rwsem); } diff --git a/fs/dcache.c b/fs/dcache.c index 65cc11939654..035cccbc9276 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -1390,6 +1390,7 @@ struct check_mount { unsigned int mounted; }; +/* locks: mount_locked_reader && dentry->d_lock */ static enum d_walk_ret path_check_mount(void *data, struct dentry *dentry) { struct check_mount *info = data; @@ -1416,9 +1417,8 @@ int path_has_submounts(const struct path *parent) { struct check_mount data = { .mnt = parent->mnt, .mounted = 0 }; - read_seqlock_excl(&mount_lock); + guard(mount_locked_reader)(); d_walk(parent->dentry, &data, path_check_mount); - read_sequnlock_excl(&mount_lock); return data.mounted; } @@ -1717,13 +1717,13 @@ static struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) dname = dentry->d_shortname.string; } - dentry->d_name.len = name->len; - dentry->d_name.hash = name->hash; + dentry->__d_name.len = name->len; + dentry->__d_name.hash = name->hash; memcpy(dname, name->name, name->len); dname[name->len] = 0; /* Make sure we always see the terminating NUL character */ - smp_store_release(&dentry->d_name.name, dname); /* ^^^ */ + smp_store_release(&dentry->__d_name.name, dname); /* ^^^ */ dentry->d_flags = 0; lockref_init(&dentry->d_lockref); @@ -2557,6 +2557,8 @@ struct dentry *d_alloc_parallel(struct dentry *parent, spin_lock(&parent->d_lock); new->d_parent = dget_dlock(parent); hlist_add_head(&new->d_sib, &parent->d_children); + if (parent->d_flags & DCACHE_DISCONNECTED) + new->d_flags |= DCACHE_DISCONNECTED; spin_unlock(&parent->d_lock); retry: @@ -2743,15 +2745,15 @@ static void swap_names(struct dentry *dentry, struct dentry *target) /* * Both external: swap the pointers */ - swap(target->d_name.name, dentry->d_name.name); + swap(target->__d_name.name, dentry->__d_name.name); } else { /* * dentry:internal, target:external. Steal target's * storage and make target internal. */ - dentry->d_name.name = target->d_name.name; + dentry->__d_name.name = target->__d_name.name; target->d_shortname = dentry->d_shortname; - target->d_name.name = target->d_shortname.string; + target->__d_name.name = target->d_shortname.string; } } else { if (unlikely(dname_external(dentry))) { @@ -2759,9 +2761,9 @@ static void swap_names(struct dentry *dentry, struct dentry *target) * dentry:external, target:internal. 
Give dentry's * storage to target and make dentry internal */ - target->d_name.name = dentry->d_name.name; + target->__d_name.name = dentry->__d_name.name; dentry->d_shortname = target->d_shortname; - dentry->d_name.name = dentry->d_shortname.string; + dentry->__d_name.name = dentry->d_shortname.string; } else { /* * Both are internal. @@ -2771,7 +2773,7 @@ static void swap_names(struct dentry *dentry, struct dentry *target) target->d_shortname.words[i]); } } - swap(dentry->d_name.hash_len, target->d_name.hash_len); + swap(dentry->__d_name.hash_len, target->__d_name.hash_len); } static void copy_name(struct dentry *dentry, struct dentry *target) @@ -2781,11 +2783,11 @@ static void copy_name(struct dentry *dentry, struct dentry *target) old_name = external_name(dentry); if (unlikely(dname_external(target))) { atomic_inc(&external_name(target)->count); - dentry->d_name = target->d_name; + dentry->__d_name = target->__d_name; } else { dentry->d_shortname = target->d_shortname; - dentry->d_name.name = dentry->d_shortname.string; - dentry->d_name.hash_len = target->d_name.hash_len; + dentry->__d_name.name = dentry->d_shortname.string; + dentry->__d_name.hash_len = target->__d_name.hash_len; } if (old_name && likely(atomic_dec_and_test(&old_name->count))) kfree_rcu(old_name, head); @@ -3134,7 +3136,7 @@ void d_mark_tmpfile(struct file *file, struct inode *inode) !d_unlinked(dentry)); spin_lock(&dentry->d_parent->d_lock); spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); - dentry->d_name.len = sprintf(dentry->d_shortname.string, "#%llu", + dentry->__d_name.len = sprintf(dentry->d_shortname.string, "#%llu", (unsigned long long)inode->i_ino); spin_unlock(&dentry->d_lock); spin_unlock(&dentry->d_parent->d_lock); diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c index 1dfd5b81d831..6648a924e31a 100644 --- a/fs/ecryptfs/dentry.c +++ b/fs/ecryptfs/dentry.c @@ -59,14 +59,6 @@ static int ecryptfs_d_revalidate(struct inode *dir, const struct qstr *name, return rc; } -struct kmem_cache *ecryptfs_dentry_info_cache; - -static void ecryptfs_dentry_free_rcu(struct rcu_head *head) -{ - kmem_cache_free(ecryptfs_dentry_info_cache, - container_of(head, struct ecryptfs_dentry_info, rcu)); -} - /** * ecryptfs_d_release * @dentry: The ecryptfs dentry @@ -75,11 +67,7 @@ static void ecryptfs_dentry_free_rcu(struct rcu_head *head) */ static void ecryptfs_d_release(struct dentry *dentry) { - struct ecryptfs_dentry_info *p = dentry->d_fsdata; - if (p) { - path_put(&p->lower_path); - call_rcu(&p->rcu, ecryptfs_dentry_free_rcu); - } + dput(dentry->d_fsdata); } const struct dentry_operations ecryptfs_dops = { diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index 1f562e75d0e4..9e6ab0b41337 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -258,13 +258,6 @@ struct ecryptfs_inode_info { struct ecryptfs_crypt_stat crypt_stat; }; -/* dentry private data. Each dentry must keep track of a lower - * vfsmount too. */ -struct ecryptfs_dentry_info { - struct path lower_path; - struct rcu_head rcu; -}; - /** * ecryptfs_global_auth_tok - A key used to encrypt all new files under the mountpoint * @flags: Status flags @@ -348,6 +341,7 @@ struct ecryptfs_mount_crypt_stat { /* superblock private data. 
*/ struct ecryptfs_sb_info { struct super_block *wsi_sb; + struct vfsmount *lower_mnt; struct ecryptfs_mount_crypt_stat mount_crypt_stat; }; @@ -494,22 +488,25 @@ ecryptfs_set_superblock_lower(struct super_block *sb, } static inline void -ecryptfs_set_dentry_private(struct dentry *dentry, - struct ecryptfs_dentry_info *dentry_info) +ecryptfs_set_dentry_lower(struct dentry *dentry, + struct dentry *lower_dentry) { - dentry->d_fsdata = dentry_info; + dentry->d_fsdata = lower_dentry; } static inline struct dentry * ecryptfs_dentry_to_lower(struct dentry *dentry) { - return ((struct ecryptfs_dentry_info *)dentry->d_fsdata)->lower_path.dentry; + return dentry->d_fsdata; } -static inline const struct path * -ecryptfs_dentry_to_lower_path(struct dentry *dentry) +static inline struct path +ecryptfs_lower_path(struct dentry *dentry) { - return &((struct ecryptfs_dentry_info *)dentry->d_fsdata)->lower_path; + return (struct path){ + .mnt = ecryptfs_superblock_to_private(dentry->d_sb)->lower_mnt, + .dentry = ecryptfs_dentry_to_lower(dentry) + }; } #define ecryptfs_printk(type, fmt, arg...) \ @@ -532,7 +529,6 @@ extern unsigned int ecryptfs_number_of_users; extern struct kmem_cache *ecryptfs_auth_tok_list_item_cache; extern struct kmem_cache *ecryptfs_file_info_cache; -extern struct kmem_cache *ecryptfs_dentry_info_cache; extern struct kmem_cache *ecryptfs_inode_info_cache; extern struct kmem_cache *ecryptfs_sb_info_cache; extern struct kmem_cache *ecryptfs_header_cache; @@ -557,7 +553,6 @@ int ecryptfs_encrypt_and_encode_filename( size_t *encoded_name_size, struct ecryptfs_mount_crypt_stat *mount_crypt_stat, const char *name, size_t name_size); -struct dentry *ecryptfs_lower_dentry(struct dentry *this_dentry); void ecryptfs_dump_hex(char *data, int bytes); int virt_to_scatterlist(const void *addr, int size, struct scatterlist *sg, int sg_size); diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index 5f8f96da09fe..7929411837cf 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -33,13 +33,12 @@ static ssize_t ecryptfs_read_update_atime(struct kiocb *iocb, struct iov_iter *to) { ssize_t rc; - const struct path *path; struct file *file = iocb->ki_filp; rc = generic_file_read_iter(iocb, to); if (rc >= 0) { - path = ecryptfs_dentry_to_lower_path(file->f_path.dentry); - touch_atime(path); + struct path path = ecryptfs_lower_path(file->f_path.dentry); + touch_atime(&path); } return rc; } @@ -59,12 +58,11 @@ static ssize_t ecryptfs_splice_read_update_atime(struct file *in, loff_t *ppos, size_t len, unsigned int flags) { ssize_t rc; - const struct path *path; rc = filemap_splice_read(in, ppos, pipe, len, flags); if (rc >= 0) { - path = ecryptfs_dentry_to_lower_path(in->f_path.dentry); - touch_atime(path); + struct path path = ecryptfs_lower_path(in->f_path.dentry); + touch_atime(&path); } return rc; } @@ -283,6 +281,7 @@ static int ecryptfs_dir_open(struct inode *inode, struct file *file) * ecryptfs_lookup() */ struct ecryptfs_file_info *file_info; struct file *lower_file; + struct path path; /* Released in ecryptfs_release or end of function if failure */ file_info = kmem_cache_zalloc(ecryptfs_file_info_cache, GFP_KERNEL); @@ -292,8 +291,8 @@ static int ecryptfs_dir_open(struct inode *inode, struct file *file) "Error attempting to allocate memory\n"); return -ENOMEM; } - lower_file = dentry_open(ecryptfs_dentry_to_lower_path(ecryptfs_dentry), - file->f_flags, current_cred()); + path = ecryptfs_lower_path(ecryptfs_dentry); + lower_file = dentry_open(&path, file->f_flags, current_cred()); if 
(IS_ERR(lower_file)) { printk(KERN_ERR "%s: Error attempting to initialize " "the lower file for the dentry with name " diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index abd954c6a14e..ed1394da8d6b 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -327,24 +327,15 @@ static int ecryptfs_i_size_read(struct dentry *dentry, struct inode *inode) static struct dentry *ecryptfs_lookup_interpose(struct dentry *dentry, struct dentry *lower_dentry) { - const struct path *path = ecryptfs_dentry_to_lower_path(dentry->d_parent); + struct dentry *lower_parent = ecryptfs_dentry_to_lower(dentry->d_parent); struct inode *inode, *lower_inode; - struct ecryptfs_dentry_info *dentry_info; int rc = 0; - dentry_info = kmem_cache_alloc(ecryptfs_dentry_info_cache, GFP_KERNEL); - if (!dentry_info) { - dput(lower_dentry); - return ERR_PTR(-ENOMEM); - } - fsstack_copy_attr_atime(d_inode(dentry->d_parent), - d_inode(path->dentry)); + d_inode(lower_parent)); BUG_ON(!d_count(lower_dentry)); - ecryptfs_set_dentry_private(dentry, dentry_info); - dentry_info->lower_path.mnt = mntget(path->mnt); - dentry_info->lower_path.dentry = lower_dentry; + ecryptfs_set_dentry_lower(dentry, lower_dentry); /* * negative dentry can go positive under us here - its parent is not @@ -1021,10 +1012,10 @@ static int ecryptfs_getattr(struct mnt_idmap *idmap, { struct dentry *dentry = path->dentry; struct kstat lower_stat; + struct path lower_path = ecryptfs_lower_path(dentry); int rc; - rc = vfs_getattr_nosec(ecryptfs_dentry_to_lower_path(dentry), - &lower_stat, request_mask, flags); + rc = vfs_getattr_nosec(&lower_path, &lower_stat, request_mask, flags); if (!rc) { fsstack_copy_attr_all(d_inode(dentry), ecryptfs_inode_to_lower(d_inode(dentry))); diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index eab1beb846d3..16ea14dd2c62 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -106,15 +106,14 @@ static int ecryptfs_init_lower_file(struct dentry *dentry, struct file **lower_file) { const struct cred *cred = current_cred(); - const struct path *path = ecryptfs_dentry_to_lower_path(dentry); + struct path path = ecryptfs_lower_path(dentry); int rc; - rc = ecryptfs_privileged_open(lower_file, path->dentry, path->mnt, - cred); + rc = ecryptfs_privileged_open(lower_file, path.dentry, path.mnt, cred); if (rc) { printk(KERN_ERR "Error opening lower file " "for lower_dentry [0x%p] and lower_mnt [0x%p]; " - "rc = [%d]\n", path->dentry, path->mnt, rc); + "rc = [%d]\n", path.dentry, path.mnt, rc); (*lower_file) = NULL; } return rc; @@ -437,7 +436,6 @@ static int ecryptfs_get_tree(struct fs_context *fc) struct ecryptfs_fs_context *ctx = fc->fs_private; struct ecryptfs_sb_info *sbi = fc->s_fs_info; struct ecryptfs_mount_crypt_stat *mount_crypt_stat; - struct ecryptfs_dentry_info *root_info; const char *err = "Getting sb failed"; struct inode *inode; struct path path; @@ -543,14 +541,8 @@ static int ecryptfs_get_tree(struct fs_context *fc) goto out_free; } - rc = -ENOMEM; - root_info = kmem_cache_zalloc(ecryptfs_dentry_info_cache, GFP_KERNEL); - if (!root_info) - goto out_free; - - /* ->kill_sb() will take care of root_info */ - ecryptfs_set_dentry_private(s->s_root, root_info); - root_info->lower_path = path; + ecryptfs_set_dentry_lower(s->s_root, path.dentry); + ecryptfs_superblock_to_private(s)->lower_mnt = path.mnt; s->s_flags |= SB_ACTIVE; fc->root = dget(s->s_root); @@ -580,6 +572,7 @@ static void ecryptfs_kill_block_super(struct super_block *sb) kill_anon_super(sb); if (!sb_info) return; + mntput(sb_info->lower_mnt); 
ecryptfs_destroy_mount_crypt_stat(&sb_info->mount_crypt_stat); kmem_cache_free(ecryptfs_sb_info_cache, sb_info); } @@ -668,11 +661,6 @@ static struct ecryptfs_cache_info { .size = sizeof(struct ecryptfs_file_info), }, { - .cache = &ecryptfs_dentry_info_cache, - .name = "ecryptfs_dentry_info_cache", - .size = sizeof(struct ecryptfs_dentry_info), - }, - { .cache = &ecryptfs_inode_info_cache, .name = "ecryptfs_inode_cache", .size = sizeof(struct ecryptfs_inode_info), diff --git a/fs/exec.c b/fs/exec.c index 6b70c6726d31..4298e7e08d5d 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -2048,7 +2048,7 @@ static int proc_dointvec_minmax_coredump(const struct ctl_table *table, int writ { int error = proc_dointvec_minmax(table, write, buffer, lenp, ppos); - if (!error && !write) + if (!error && write) validate_coredump_safety(); return error; } diff --git a/fs/exfat/balloc.c b/fs/exfat/balloc.c index cc01556c9d9b..2d2d510f2372 100644 --- a/fs/exfat/balloc.c +++ b/fs/exfat/balloc.c @@ -7,6 +7,7 @@ #include <linux/slab.h> #include <linux/bitmap.h> #include <linux/buffer_head.h> +#include <linux/backing-dev.h> #include "exfat_raw.h" #include "exfat_fs.h" @@ -26,13 +27,58 @@ /* * Allocation Bitmap Management Functions */ +static bool exfat_test_bitmap_range(struct super_block *sb, unsigned int clu, + unsigned int count) +{ + struct exfat_sb_info *sbi = EXFAT_SB(sb); + unsigned int start = clu; + unsigned int end = clu + count; + unsigned int ent_idx, i, b; + unsigned int bit_offset, bits_to_check; + __le_long *bitmap_le; + unsigned long mask, word; + + if (!is_valid_cluster(sbi, start) || !is_valid_cluster(sbi, end - 1)) + return false; + + while (start < end) { + ent_idx = CLUSTER_TO_BITMAP_ENT(start); + i = BITMAP_OFFSET_SECTOR_INDEX(sb, ent_idx); + b = BITMAP_OFFSET_BIT_IN_SECTOR(sb, ent_idx); + + bitmap_le = (__le_long *)sbi->vol_amap[i]->b_data; + + /* Calculate how many bits we can check in the current word */ + bit_offset = b % BITS_PER_LONG; + bits_to_check = min(end - start, + (unsigned int)(BITS_PER_LONG - bit_offset)); + + /* Create a bitmask for the range of bits to check */ + if (bits_to_check >= BITS_PER_LONG) + mask = ~0UL; + else + mask = ((1UL << bits_to_check) - 1) << bit_offset; + word = lel_to_cpu(bitmap_le[b / BITS_PER_LONG]); + + /* Check if all bits in the mask are set */ + if ((word & mask) != mask) + return false; + + start += bits_to_check; + } + + return true; +} + static int exfat_allocate_bitmap(struct super_block *sb, struct exfat_dentry *ep) { struct exfat_sb_info *sbi = EXFAT_SB(sb); + struct blk_plug plug; long long map_size; - unsigned int i, need_map_size; + unsigned int i, j, need_map_size; sector_t sector; + unsigned int max_ra_count; sbi->map_clu = le32_to_cpu(ep->dentry.bitmap.start_clu); map_size = le64_to_cpu(ep->dentry.bitmap.size); @@ -56,22 +102,37 @@ static int exfat_allocate_bitmap(struct super_block *sb, return -ENOMEM; sector = exfat_cluster_to_sector(sbi, sbi->map_clu); + max_ra_count = min(sb->s_bdi->ra_pages, sb->s_bdi->io_pages) << + (PAGE_SHIFT - sb->s_blocksize_bits); for (i = 0; i < sbi->map_sectors; i++) { - sbi->vol_amap[i] = sb_bread(sb, sector + i); - if (!sbi->vol_amap[i]) { - /* release all buffers and free vol_amap */ - int j = 0; - - while (j < i) - brelse(sbi->vol_amap[j++]); - - kvfree(sbi->vol_amap); - sbi->vol_amap = NULL; - return -EIO; + /* Trigger the next readahead in advance. 
*/ + if (0 == (i % max_ra_count)) { + blk_start_plug(&plug); + for (j = i; j < min(max_ra_count, sbi->map_sectors - i) + i; j++) + sb_breadahead(sb, sector + j); + blk_finish_plug(&plug); } + + sbi->vol_amap[i] = sb_bread(sb, sector + i); + if (!sbi->vol_amap[i]) + goto err_out; } + if (exfat_test_bitmap_range(sb, sbi->map_clu, + EXFAT_B_TO_CLU_ROUND_UP(map_size, sbi)) == false) + goto err_out; + return 0; + +err_out: + j = 0; + /* release all buffers and free vol_amap */ + while (j < i) + brelse(sbi->vol_amap[j++]); + + kvfree(sbi->vol_amap); + sbi->vol_amap = NULL; + return -EIO; } int exfat_load_bitmap(struct super_block *sb) diff --git a/fs/exfat/dir.c b/fs/exfat/dir.c index ee060e26f51d..7229146fe2bf 100644 --- a/fs/exfat/dir.c +++ b/fs/exfat/dir.c @@ -1244,3 +1244,163 @@ int exfat_count_dir_entries(struct super_block *sb, struct exfat_chain *p_dir) return count; } + +static int exfat_get_volume_label_dentry(struct super_block *sb, + struct exfat_entry_set_cache *es) +{ + int i; + int dentry = 0; + unsigned int type; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + struct exfat_hint_femp hint_femp; + struct exfat_inode_info *ei = EXFAT_I(sb->s_root->d_inode); + struct exfat_chain clu; + struct exfat_dentry *ep; + struct buffer_head *bh; + + hint_femp.eidx = EXFAT_HINT_NONE; + exfat_chain_set(&clu, sbi->root_dir, 0, ALLOC_FAT_CHAIN); + + while (clu.dir != EXFAT_EOF_CLUSTER) { + for (i = 0; i < sbi->dentries_per_clu; i++, dentry++) { + ep = exfat_get_dentry(sb, &clu, i, &bh); + if (!ep) + return -EIO; + + type = exfat_get_entry_type(ep); + if (hint_femp.eidx == EXFAT_HINT_NONE) { + if (type == TYPE_DELETED || type == TYPE_UNUSED) { + hint_femp.cur = clu; + hint_femp.eidx = dentry; + hint_femp.count = 1; + } + } + + if (type == TYPE_UNUSED) { + brelse(bh); + goto not_found; + } + + if (type != TYPE_VOLUME) { + brelse(bh); + continue; + } + + memset(es, 0, sizeof(*es)); + es->sb = sb; + es->bh = es->__bh; + es->bh[0] = bh; + es->num_bh = 1; + es->start_off = EXFAT_DEN_TO_B(i) % sb->s_blocksize; + + return 0; + } + + if (exfat_get_next_cluster(sb, &(clu.dir))) + return -EIO; + } + +not_found: + if (hint_femp.eidx == EXFAT_HINT_NONE) { + hint_femp.cur.dir = EXFAT_EOF_CLUSTER; + hint_femp.eidx = dentry; + hint_femp.count = 0; + } + + ei->hint_femp = hint_femp; + + return -ENOENT; +} + +int exfat_read_volume_label(struct super_block *sb, struct exfat_uni_name *label_out) +{ + int ret, i; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + struct exfat_entry_set_cache es; + struct exfat_dentry *ep; + + mutex_lock(&sbi->s_lock); + + memset(label_out, 0, sizeof(*label_out)); + ret = exfat_get_volume_label_dentry(sb, &es); + if (ret < 0) { + /* + * ENOENT signifies that a volume label dentry doesn't exist + * We will treat this as an empty volume label and not fail. 
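+ * (E.g. FS_IOC_GETFSLABEL on a volume whose label was never set, or was deleted, then reports success with an empty label string rather than an error.)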
+ */ + if (ret == -ENOENT) + ret = 0; + + goto unlock; + } + + ep = exfat_get_dentry_cached(&es, 0); + label_out->name_len = ep->dentry.volume_label.char_count; + if (label_out->name_len > EXFAT_VOLUME_LABEL_LEN) { + ret = -EIO; + exfat_put_dentry_set(&es, false); + goto unlock; + } + + for (i = 0; i < label_out->name_len; i++) + label_out->name[i] = le16_to_cpu(ep->dentry.volume_label.volume_label[i]); + + exfat_put_dentry_set(&es, false); +unlock: + mutex_unlock(&sbi->s_lock); + return ret; +} + +int exfat_write_volume_label(struct super_block *sb, + struct exfat_uni_name *label) +{ + int ret, i; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + struct inode *root_inode = sb->s_root->d_inode; + struct exfat_entry_set_cache es; + struct exfat_chain clu; + struct exfat_dentry *ep; + + if (label->name_len > EXFAT_VOLUME_LABEL_LEN) + return -EINVAL; + + mutex_lock(&sbi->s_lock); + + ret = exfat_get_volume_label_dentry(sb, &es); + if (ret == -ENOENT) { + if (label->name_len == 0) { + /* No volume label dentry, no need to clear */ + ret = 0; + goto unlock; + } + + ret = exfat_find_empty_entry(root_inode, &clu, 1, &es); + } + + if (ret < 0) + goto unlock; + + ep = exfat_get_dentry_cached(&es, 0); + + if (label->name_len == 0 && ep->dentry.volume_label.char_count == 0) { + /* volume label had been cleared */ + exfat_put_dentry_set(&es, 0); + goto unlock; + } + + memset(ep, 0, sizeof(*ep)); + ep->type = EXFAT_VOLUME; + + for (i = 0; i < label->name_len; i++) + ep->dentry.volume_label.volume_label[i] = + cpu_to_le16(label->name[i]); + + ep->dentry.volume_label.char_count = label->name_len; + es.modified = true; + + ret = exfat_put_dentry_set(&es, IS_DIRSYNC(root_inode)); + +unlock: + mutex_unlock(&sbi->s_lock); + return ret; +} diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h index f8ead4d47ef0..38210fb6901c 100644 --- a/fs/exfat/exfat_fs.h +++ b/fs/exfat/exfat_fs.h @@ -29,7 +29,6 @@ enum exfat_error_mode { enum { NLS_NAME_NO_LOSSY = 0, /* no lossy */ NLS_NAME_LOSSY = 1 << 0, /* just detected incorrect filename(s) */ - NLS_NAME_OVERLEN = 1 << 1, /* the length is over than its limit */ }; #define EXFAT_HASH_BITS 8 @@ -477,6 +476,9 @@ int exfat_force_shutdown(struct super_block *sb, u32 flags); /* namei.c */ extern const struct dentry_operations exfat_dentry_ops; extern const struct dentry_operations exfat_utf8_dentry_ops; +int exfat_find_empty_entry(struct inode *inode, + struct exfat_chain *p_dir, int num_entries, + struct exfat_entry_set_cache *es); /* cache.c */ int exfat_cache_init(void); @@ -517,6 +519,10 @@ int exfat_get_empty_dentry_set(struct exfat_entry_set_cache *es, unsigned int num_entries); int exfat_put_dentry_set(struct exfat_entry_set_cache *es, int sync); int exfat_count_dir_entries(struct super_block *sb, struct exfat_chain *p_dir); +int exfat_read_volume_label(struct super_block *sb, + struct exfat_uni_name *label_out); +int exfat_write_volume_label(struct super_block *sb, + struct exfat_uni_name *label); /* inode.c */ extern const struct inode_operations exfat_file_inode_operations; diff --git a/fs/exfat/exfat_raw.h b/fs/exfat/exfat_raw.h index 971a1ccd0e89..4082fa7b8c14 100644 --- a/fs/exfat/exfat_raw.h +++ b/fs/exfat/exfat_raw.h @@ -80,6 +80,7 @@ #define BOOTSEC_OLDBPB_LEN 53 #define EXFAT_FILE_NAME_LEN 15 +#define EXFAT_VOLUME_LABEL_LEN 11 #define EXFAT_MIN_SECT_SIZE_BITS 9 #define EXFAT_MAX_SECT_SIZE_BITS 12 @@ -160,6 +161,11 @@ struct exfat_dentry { __le64 size; } __packed upcase; /* up-case table directory entry */ struct { + __u8 char_count; + __le16 
volume_label[EXFAT_VOLUME_LABEL_LEN]; + __u8 reserved[8]; + } __packed volume_label; /* volume label directory entry */ + struct { __u8 flags; __u8 vendor_guid[16]; __u8 vendor_defined[14]; diff --git a/fs/exfat/fatent.c b/fs/exfat/fatent.c index 232cc7f8ab92..825083634ba2 100644 --- a/fs/exfat/fatent.c +++ b/fs/exfat/fatent.c @@ -89,35 +89,36 @@ int exfat_ent_get(struct super_block *sb, unsigned int loc, int err; if (!is_valid_cluster(sbi, loc)) { - exfat_fs_error(sb, "invalid access to FAT (entry 0x%08x)", + exfat_fs_error_ratelimit(sb, + "invalid access to FAT (entry 0x%08x)", loc); return -EIO; } err = __exfat_ent_get(sb, loc, content); if (err) { - exfat_fs_error(sb, + exfat_fs_error_ratelimit(sb, "failed to access to FAT (entry 0x%08x, err:%d)", loc, err); return err; } if (*content == EXFAT_FREE_CLUSTER) { - exfat_fs_error(sb, + exfat_fs_error_ratelimit(sb, "invalid access to FAT free cluster (entry 0x%08x)", loc); return -EIO; } if (*content == EXFAT_BAD_CLUSTER) { - exfat_fs_error(sb, + exfat_fs_error_ratelimit(sb, "invalid access to FAT bad cluster (entry 0x%08x)", loc); return -EIO; } if (*content != EXFAT_EOF_CLUSTER && !is_valid_cluster(sbi, *content)) { - exfat_fs_error(sb, + exfat_fs_error_ratelimit(sb, "invalid access to FAT (entry 0x%08x) bogus content (0x%08x)", loc, *content); return -EIO; diff --git a/fs/exfat/file.c b/fs/exfat/file.c index 538d2b6ac2ec..adc37b4d7fc2 100644 --- a/fs/exfat/file.c +++ b/fs/exfat/file.c @@ -486,6 +486,55 @@ static int exfat_ioctl_shutdown(struct super_block *sb, unsigned long arg) return exfat_force_shutdown(sb, flags); } +static int exfat_ioctl_get_volume_label(struct super_block *sb, unsigned long arg) +{ + int ret; + char label[FSLABEL_MAX] = {0}; + struct exfat_uni_name uniname; + + ret = exfat_read_volume_label(sb, &uniname); + if (ret < 0) + return ret; + + ret = exfat_utf16_to_nls(sb, &uniname, label, uniname.name_len); + if (ret < 0) + return ret; + + if (copy_to_user((char __user *)arg, label, ret + 1)) + return -EFAULT; + + return 0; +} + +static int exfat_ioctl_set_volume_label(struct super_block *sb, + unsigned long arg) +{ + int ret = 0, lossy, label_len; + char label[FSLABEL_MAX] = {0}; + struct exfat_uni_name uniname; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(label, (char __user *)arg, FSLABEL_MAX)) + return -EFAULT; + + memset(&uniname, 0, sizeof(uniname)); + label_len = strnlen(label, FSLABEL_MAX - 1); + if (label[0]) { + ret = exfat_nls_to_utf16(sb, label, label_len, + &uniname, &lossy); + if (ret < 0) + return ret; + else if (lossy & NLS_NAME_LOSSY) + return -EINVAL; + } + + uniname.name_len = ret; + + return exfat_write_volume_label(sb, &uniname); +} + long exfat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -500,6 +549,10 @@ long exfat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return exfat_ioctl_shutdown(inode->i_sb, arg); case FITRIM: return exfat_ioctl_fitrim(inode, arg); + case FS_IOC_GETFSLABEL: + return exfat_ioctl_get_volume_label(inode->i_sb, arg); + case FS_IOC_SETFSLABEL: + return exfat_ioctl_set_volume_label(inode->i_sb, arg); default: return -ENOTTY; } diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c index c10844e1e16c..f9501c3a3666 100644 --- a/fs/exfat/inode.c +++ b/fs/exfat/inode.c @@ -25,7 +25,7 @@ int __exfat_write_inode(struct inode *inode, int sync) struct super_block *sb = inode->i_sb; struct exfat_sb_info *sbi = EXFAT_SB(sb); struct exfat_inode_info *ei = EXFAT_I(inode); - bool is_dir = 
(ei->type == TYPE_DIR) ? true : false; + bool is_dir = (ei->type == TYPE_DIR); struct timespec64 ts; if (inode->i_ino == EXFAT_ROOT_INO) diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c index f5f1c4e8a29f..745dce29ddb5 100644 --- a/fs/exfat/namei.c +++ b/fs/exfat/namei.c @@ -300,7 +300,7 @@ static int exfat_check_max_dentries(struct inode *inode) * the directory entry index in p_dir is returned on succeeds * -error code is returned on failure */ -static int exfat_find_empty_entry(struct inode *inode, +int exfat_find_empty_entry(struct inode *inode, struct exfat_chain *p_dir, int num_entries, struct exfat_entry_set_cache *es) { @@ -442,7 +442,7 @@ static int __exfat_resolve_path(struct inode *inode, const unsigned char *path, return namelen; /* return error value */ if ((lossy && !lookup) || !namelen) - return (lossy & NLS_NAME_OVERLEN) ? -ENAMETOOLONG : -EINVAL; + return -EINVAL; return 0; } @@ -587,7 +587,7 @@ unlock: } /* lookup a file */ -static int exfat_find(struct inode *dir, struct qstr *qname, +static int exfat_find(struct inode *dir, const struct qstr *qname, struct exfat_dir_entry *info) { int ret, dentry, count; @@ -642,10 +642,14 @@ static int exfat_find(struct inode *dir, struct qstr *qname, info->type = exfat_get_entry_type(ep); info->attr = le16_to_cpu(ep->dentry.file.attr); - info->size = le64_to_cpu(ep2->dentry.stream.valid_size); info->valid_size = le64_to_cpu(ep2->dentry.stream.valid_size); info->size = le64_to_cpu(ep2->dentry.stream.size); + if (info->valid_size < 0) { + exfat_fs_error(sb, "data valid size is invalid(%lld)", info->valid_size); + return -EIO; + } + if (unlikely(EXFAT_B_TO_CLU_ROUND_UP(info->size, sbi) > sbi->used_clusters)) { exfat_fs_error(sb, "data size is invalid(%lld)", info->size); return -EIO; diff --git a/fs/exfat/nls.c b/fs/exfat/nls.c index 1729bf42eb51..57db08a5271c 100644 --- a/fs/exfat/nls.c +++ b/fs/exfat/nls.c @@ -616,9 +616,6 @@ static int exfat_nls_to_ucs2(struct super_block *sb, unilen++; } - if (p_cstring[i] != '\0') - lossy |= NLS_NAME_OVERLEN; - *uniname = '\0'; p_uniname->name_len = unilen; p_uniname->name_hash = exfat_calc_chksum16(upname, unilen << 1, 0, @@ -789,7 +786,7 @@ int exfat_create_upcase_table(struct super_block *sb) return ret; } - if (exfat_get_next_cluster(sb, &(clu.dir))) + if (exfat_get_next_cluster(sb, &clu.dir)) return -EIO; } diff --git a/fs/exfat/super.c b/fs/exfat/super.c index 8926e63f5bb7..7f9592856bf7 100644 --- a/fs/exfat/super.c +++ b/fs/exfat/super.c @@ -31,6 +31,16 @@ static void exfat_free_iocharset(struct exfat_sb_info *sbi) kfree(sbi->options.iocharset); } +static void exfat_set_iocharset(struct exfat_mount_options *opts, + char *iocharset) +{ + opts->iocharset = iocharset; + if (!strcmp(opts->iocharset, "utf8")) + opts->utf8 = 1; + else + opts->utf8 = 0; +} + static void exfat_put_super(struct super_block *sb) { struct exfat_sb_info *sbi = EXFAT_SB(sb); @@ -243,11 +253,11 @@ static const struct fs_parameter_spec exfat_parameters[] = { fsparam_u32oct("allow_utime", Opt_allow_utime), fsparam_string("iocharset", Opt_charset), fsparam_enum("errors", Opt_errors, exfat_param_enums), - fsparam_flag("discard", Opt_discard), + fsparam_flag_no("discard", Opt_discard), fsparam_flag("keep_last_dots", Opt_keep_last_dots), fsparam_flag("sys_tz", Opt_sys_tz), fsparam_s32("time_offset", Opt_time_offset), - fsparam_flag("zero_size_dir", Opt_zero_size_dir), + fsparam_flag_no("zero_size_dir", Opt_zero_size_dir), __fsparam(NULL, "utf8", Opt_utf8, fs_param_deprecated, NULL), __fsparam(NULL, "debug", Opt_debug, 
fs_param_deprecated, @@ -292,14 +302,14 @@ static int exfat_parse_param(struct fs_context *fc, struct fs_parameter *param) break; case Opt_charset: exfat_free_iocharset(sbi); - opts->iocharset = param->string; + exfat_set_iocharset(opts, param->string); param->string = NULL; break; case Opt_errors: opts->errors = result.uint_32; break; case Opt_discard: - opts->discard = 1; + opts->discard = !result.negated; break; case Opt_keep_last_dots: opts->keep_last_dots = 1; @@ -317,7 +327,7 @@ static int exfat_parse_param(struct fs_context *fc, struct fs_parameter *param) opts->time_offset = result.int_32; break; case Opt_zero_size_dir: - opts->zero_size_dir = true; + opts->zero_size_dir = !result.negated; break; case Opt_utf8: case Opt_debug: @@ -664,8 +674,8 @@ static int exfat_fill_super(struct super_block *sb, struct fs_context *fc) /* set up enough so that it can read an inode */ exfat_hash_init(sb); - if (!strcmp(sbi->options.iocharset, "utf8")) - opts->utf8 = 1; + if (sbi->options.utf8) + set_default_d_op(sb, &exfat_utf8_dentry_ops); else { sbi->nls_io = load_nls(sbi->options.iocharset); if (!sbi->nls_io) { @@ -674,12 +684,8 @@ static int exfat_fill_super(struct super_block *sb, struct fs_context *fc) err = -EINVAL; goto free_table; } - } - - if (sbi->options.utf8) - set_default_d_op(sb, &exfat_utf8_dentry_ops); - else set_default_d_op(sb, &exfat_dentry_ops); + } root_inode = new_inode(sb); if (!root_inode) { @@ -742,12 +748,44 @@ static void exfat_free(struct fs_context *fc) static int exfat_reconfigure(struct fs_context *fc) { struct super_block *sb = fc->root->d_sb; + struct exfat_sb_info *remount_sbi = fc->s_fs_info; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + struct exfat_mount_options *new_opts = &remount_sbi->options; + struct exfat_mount_options *cur_opts = &sbi->options; + fc->sb_flags |= SB_NODIRATIME; sync_filesystem(sb); - mutex_lock(&EXFAT_SB(sb)->s_lock); + mutex_lock(&sbi->s_lock); exfat_clear_volume_dirty(sb); - mutex_unlock(&EXFAT_SB(sb)->s_lock); + mutex_unlock(&sbi->s_lock); + + if (new_opts->allow_utime == (unsigned short)-1) + new_opts->allow_utime = ~new_opts->fs_dmask & 0022; + + /* + * Since the old settings of these mount options are cached in + * inodes or dentries, they cannot be modified dynamically. 
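+ * (For example, dentries already hashed under the old iocharset's hash ops would no longer match lookups done under a new charset, so the checks below reject such a remount with -EINVAL instead of applying it partially.)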
+ */ + if (strcmp(new_opts->iocharset, cur_opts->iocharset) || + new_opts->keep_last_dots != cur_opts->keep_last_dots || + new_opts->sys_tz != cur_opts->sys_tz || + new_opts->time_offset != cur_opts->time_offset || + !uid_eq(new_opts->fs_uid, cur_opts->fs_uid) || + !gid_eq(new_opts->fs_gid, cur_opts->fs_gid) || + new_opts->fs_fmask != cur_opts->fs_fmask || + new_opts->fs_dmask != cur_opts->fs_dmask || + new_opts->allow_utime != cur_opts->allow_utime) + return -EINVAL; + + if (new_opts->discard != cur_opts->discard && + new_opts->discard && + !bdev_max_discard_sectors(sb->s_bdev)) { + exfat_warn(sb, "remounting with \"discard\" option, but the device does not support discard"); + return -EINVAL; + } + + swap(*cur_opts, *new_opts); return 0; } @@ -777,8 +815,8 @@ static int exfat_init_fs_context(struct fs_context *fc) sbi->options.fs_fmask = current->fs->umask; sbi->options.fs_dmask = current->fs->umask; sbi->options.allow_utime = -1; - sbi->options.iocharset = exfat_default_iocharset; sbi->options.errors = EXFAT_ERRORS_RO; + exfat_set_iocharset(&sbi->options, exfat_default_iocharset); fc->s_fs_info = sbi; fc->ops = &exfat_context_ops; diff --git a/fs/ext4/Kconfig b/fs/ext4/Kconfig index c9ca41d91a6c..01873c2a34ad 100644 --- a/fs/ext4/Kconfig +++ b/fs/ext4/Kconfig @@ -1,31 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only -# Ext3 configs are here for backward compatibility with old configs which may -# have EXT3_FS set but not EXT4_FS set and thus would result in non-bootable -# kernels after the removal of ext3 driver. -config EXT3_FS - tristate "The Extended 3 (ext3) filesystem" - select EXT4_FS - help - This config option is here only for backward compatibility. ext3 - filesystem is now handled by the ext4 driver. - -config EXT3_FS_POSIX_ACL - bool "Ext3 POSIX Access Control Lists" - depends on EXT3_FS - select EXT4_FS_POSIX_ACL - select FS_POSIX_ACL - help - This config option is here only for backward compatibility. ext3 - filesystem is now handled by the ext4 driver. - -config EXT3_FS_SECURITY - bool "Ext3 Security Labels" - depends on EXT3_FS - select EXT4_FS_SECURITY - help - This config option is here only for backward compatibility. ext3 - filesystem is now handled by the ext4 driver. 
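# The ext4.h change below widens the default reserved uid/gid to 32 bits: each legacy __le16 field gains a *_hi companion, and ext4_get_resuid() recombines them as lo | (hi << 16). Worked example with illustrative values: s_def_resuid = 0x86a0 and s_def_resuid_hi = 0x0001 give 0x86a0 | (0x0001 << 16) = 0x186a0, i.e. uid 100000.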
- config EXT4_FS tristate "The Extended 4 (ext4) filesystem" select BUFFER_HEAD diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 6cb784a56b3b..57087da6c7be 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -1450,7 +1450,9 @@ struct ext4_super_block { __le16 s_encoding; /* Filename charset encoding */ __le16 s_encoding_flags; /* Filename charset encoding flags */ __le32 s_orphan_file_inum; /* Inode for tracking orphan inodes */ - __le32 s_reserved[94]; /* Padding to the end of the block */ + __le16 s_def_resuid_hi; + __le16 s_def_resgid_hi; + __le32 s_reserved[93]; /* Padding to the end of the block */ __le32 s_checksum; /* crc32c(superblock) */ }; @@ -1820,6 +1822,18 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) ino <= le32_to_cpu(EXT4_SB(sb)->s_es->s_inodes_count)); } +static inline int ext4_get_resuid(struct ext4_super_block *es) +{ + return le16_to_cpu(es->s_def_resuid) | + le16_to_cpu(es->s_def_resuid_hi) << 16; +} + +static inline int ext4_get_resgid(struct ext4_super_block *es) +{ + return le16_to_cpu(es->s_def_resgid) | + le16_to_cpu(es->s_def_resgid_hi) << 16; +} + /* * Returns: sbi->field[index] * Used to access an array element from the following sbi fields which require @@ -1990,6 +2004,16 @@ static inline bool ext4_verity_in_progress(struct inode *inode) #define NEXT_ORPHAN(inode) EXT4_I(inode)->i_dtime /* + * Check whether the inode is tracked as orphan (either in orphan file or + * orphan list). + */ +static inline bool ext4_inode_orphan_tracked(struct inode *inode) +{ + return ext4_test_inode_state(inode, EXT4_STATE_ORPHAN_FILE) || + !list_empty(&EXT4_I(inode)->i_orphan); +} + +/* * Codes for operating systems */ #define EXT4_OS_LINUX 0 @@ -3142,6 +3166,8 @@ extern struct buffer_head *ext4_sb_bread(struct super_block *sb, sector_t block, blk_opf_t op_flags); extern struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb, sector_t block); +extern struct buffer_head *ext4_sb_bread_nofail(struct super_block *sb, + sector_t block); extern void ext4_read_bh_nowait(struct buffer_head *bh, blk_opf_t op_flags, bh_end_io_t *end_io, bool simu_fail); extern int ext4_read_bh(struct buffer_head *bh, blk_opf_t op_flags, diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index b3e9b7bd7978..a0e66bc10093 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -280,9 +280,16 @@ int __ext4_forget(const char *where, unsigned int line, handle_t *handle, bh, is_metadata, inode->i_mode, test_opt(inode->i_sb, DATA_FLAGS)); - /* In the no journal case, we can just do a bforget and return */ + /* + * In the no journal case, we should wait for the ongoing buffer + * to complete and do a forget. 
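+ * (Presumably this closes the window where a write already in flight for the buffer could complete after the block has been freed and reallocated, clobbering the new owner's data; plain bforget() does not wait for writeback.)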
+ */ if (!ext4_handle_valid(handle)) { - bforget(bh); + if (bh) { + clear_buffer_dirty(bh); + wait_on_buffer(bh); + __bforget(bh); + } return 0; } diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 42bee1d4f9f9..fa66b08de999 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -663,7 +663,7 @@ void ext4_fc_track_range(handle_t *handle, struct inode *inode, ext4_lblk_t star static void ext4_fc_submit_bh(struct super_block *sb, bool is_tail) { - blk_opf_t write_flags = REQ_SYNC; + blk_opf_t write_flags = JBD2_JOURNAL_REQ_FLAGS; struct buffer_head *bh = EXT4_SB(sb)->s_fc_bh; /* Add REQ_FUA | REQ_PREFLUSH only its tail */ diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 93240e35ee36..7a8b30932189 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -354,7 +354,7 @@ static void ext4_inode_extension_cleanup(struct inode *inode, bool need_trunc) * to cleanup the orphan list in ext4_handle_inode_extension(). Do it * now. */ - if (!list_empty(&EXT4_I(inode)->i_orphan) && inode->i_nlink) { + if (ext4_inode_orphan_tracked(inode) && inode->i_nlink) { handle_t *handle = ext4_journal_start(inode, EXT4_HT_INODE, 2); if (IS_ERR(handle)) { diff --git a/fs/ext4/fsmap.c b/fs/ext4/fsmap.c index 91185c40f755..22fc333244ef 100644 --- a/fs/ext4/fsmap.c +++ b/fs/ext4/fsmap.c @@ -74,7 +74,8 @@ static int ext4_getfsmap_dev_compare(const void *p1, const void *p2) static bool ext4_getfsmap_rec_before_low_key(struct ext4_getfsmap_info *info, struct ext4_fsmap *rec) { - return rec->fmr_physical < info->gfi_low.fmr_physical; + return rec->fmr_physical + rec->fmr_length <= + info->gfi_low.fmr_physical; } /* @@ -200,15 +201,18 @@ static int ext4_getfsmap_meta_helper(struct super_block *sb, ext4_group_first_block_no(sb, agno)); fs_end = fs_start + EXT4_C2B(sbi, len); - /* Return relevant extents from the meta_list */ + /* + * Return relevant extents from the meta_list. We emit all extents that + * partially/fully overlap with the query range + */ list_for_each_entry_safe(p, tmp, &info->gfi_meta_list, fmr_list) { - if (p->fmr_physical < info->gfi_next_fsblk) { + if (p->fmr_physical + p->fmr_length <= info->gfi_next_fsblk) { list_del(&p->fmr_list); kfree(p); continue; } - if (p->fmr_physical <= fs_start || - p->fmr_physical + p->fmr_length <= fs_end) { + if (p->fmr_physical <= fs_end && + p->fmr_physical + p->fmr_length > fs_start) { /* Emit the retained free extent record if present */ if (info->gfi_lastfree.fmr_owner) { error = ext4_getfsmap_helper(sb, info, diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index d45124318200..da76353b3a57 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -1025,7 +1025,7 @@ static void ext4_free_branches(handle_t *handle, struct inode *inode, } /* Go read the buffer for the next level down */ - bh = ext4_sb_bread(inode->i_sb, nr, 0); + bh = ext4_sb_bread_nofail(inode->i_sb, nr); /* * A read failure? 
Report error and clear slot diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 5b7a15db4953..e99306a8f47c 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -3872,47 +3872,12 @@ static int ext4_iomap_overwrite_begin(struct inode *inode, loff_t offset, return ret; } -static inline bool ext4_want_directio_fallback(unsigned flags, ssize_t written) -{ - /* must be a directio to fall back to buffered */ - if ((flags & (IOMAP_WRITE | IOMAP_DIRECT)) != - (IOMAP_WRITE | IOMAP_DIRECT)) - return false; - - /* atomic writes are all-or-nothing */ - if (flags & IOMAP_ATOMIC) - return false; - - /* can only try again if we wrote nothing */ - return written == 0; -} - -static int ext4_iomap_end(struct inode *inode, loff_t offset, loff_t length, - ssize_t written, unsigned flags, struct iomap *iomap) -{ - /* - * Check to see whether an error occurred while writing out the data to - * the allocated blocks. If so, return the magic error code for - * non-atomic write so that we fallback to buffered I/O and attempt to - * complete the remainder of the I/O. - * For non-atomic writes, any blocks that may have been - * allocated in preparation for the direct I/O will be reused during - * buffered I/O. For atomic write, we never fallback to buffered-io. - */ - if (ext4_want_directio_fallback(flags, written)) - return -ENOTBLK; - - return 0; -} - const struct iomap_ops ext4_iomap_ops = { .iomap_begin = ext4_iomap_begin, - .iomap_end = ext4_iomap_end, }; const struct iomap_ops ext4_iomap_overwrite_ops = { .iomap_begin = ext4_iomap_overwrite_begin, - .iomap_end = ext4_iomap_end, }; static int ext4_iomap_begin_report(struct inode *inode, loff_t offset, @@ -4287,7 +4252,11 @@ int ext4_can_truncate(struct inode *inode) * We have to make sure i_disksize gets properly updated before we truncate * page cache due to hole punching or zero range. Otherwise i_disksize update * can get lost as it may have been postponed to submission of writeback but - * that will never happen after we truncate page cache. + * that will never happen if we remove the folio containing i_size from the + * page cache. Also if we punch hole within i_size but above i_disksize, + * following ext4_page_mkwrite() may mistakenly allocate written blocks over + * the hole and thus introduce allocated blocks beyond i_disksize which is + * not allowed (e2fsck would complain in case of crash). */ int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset, loff_t len) @@ -4298,9 +4267,11 @@ int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset, loff_t size = i_size_read(inode); WARN_ON(!inode_is_locked(inode)); - if (offset > size || offset + len < size) + if (offset > size) return 0; + if (offset + len < size) + size = offset + len; if (EXT4_I(inode)->i_disksize >= size) return 0; @@ -4748,7 +4719,7 @@ static int ext4_fill_raw_inode(struct inode *inode, struct ext4_inode *raw_inode * old inodes get re-used with the upper 16 bits of the * uid/gid intact. 
*/ - if (ei->i_dtime && list_empty(&ei->i_orphan)) { + if (ei->i_dtime && !ext4_inode_orphan_tracked(inode)) { raw_inode->i_uid_high = 0; raw_inode->i_gid_high = 0; } else { @@ -5348,6 +5319,14 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, } ei->i_flags = le32_to_cpu(raw_inode->i_flags); ext4_set_inode_flags(inode, true); + /* Detect invalid flag combination - can't have both inline data and extents */ + if (ext4_test_inode_flag(inode, EXT4_INODE_INLINE_DATA) && + ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { + ext4_error_inode(inode, function, line, 0, + "inode has both inline data and extents flags"); + ret = -EFSCORRUPTED; + goto bad_inode; + } inode->i_blocks = ext4_inode_blocks(raw_inode, ei); ei->i_file_acl = le32_to_cpu(raw_inode->i_file_acl_lo); if (ext4_has_feature_64bit(sb)) diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 84e3c73952d7..a93a7baae990 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -27,14 +27,16 @@ #include "fsmap.h" #include <trace/events/ext4.h> -typedef void ext4_update_sb_callback(struct ext4_super_block *es, - const void *arg); +typedef void ext4_update_sb_callback(struct ext4_sb_info *sbi, + struct ext4_super_block *es, + const void *arg); /* * Superblock modification callback function for changing file system * label */ -static void ext4_sb_setlabel(struct ext4_super_block *es, const void *arg) +static void ext4_sb_setlabel(struct ext4_sb_info *sbi, + struct ext4_super_block *es, const void *arg) { /* Sanity check, this should never happen */ BUILD_BUG_ON(sizeof(es->s_volume_name) < EXT4_LABEL_MAX); @@ -46,7 +48,8 @@ static void ext4_sb_setlabel(struct ext4_super_block *es, const void *arg) * Superblock modification callback function for changing file system * UUID. */ -static void ext4_sb_setuuid(struct ext4_super_block *es, const void *arg) +static void ext4_sb_setuuid(struct ext4_sb_info *sbi, + struct ext4_super_block *es, const void *arg) { memcpy(es->s_uuid, (__u8 *)arg, UUID_SIZE); } @@ -71,7 +74,7 @@ int ext4_update_primary_sb(struct super_block *sb, handle_t *handle, goto out_err; lock_buffer(bh); - func(es, arg); + func(sbi, es, arg); ext4_superblock_csum_set(sb); unlock_buffer(bh); @@ -149,7 +152,7 @@ static int ext4_update_backup_sb(struct super_block *sb, unlock_buffer(bh); goto out_bh; } - func(es, arg); + func(EXT4_SB(sb), es, arg); if (ext4_has_feature_metadata_csum(sb)) es->s_checksum = ext4_superblock_csum(es); set_buffer_uptodate(bh); @@ -1230,6 +1233,295 @@ static int ext4_ioctl_setuuid(struct file *filp, return ret; } + +#define TUNE_OPS_SUPPORTED (EXT4_TUNE_FL_ERRORS_BEHAVIOR | \ + EXT4_TUNE_FL_MNT_COUNT | EXT4_TUNE_FL_MAX_MNT_COUNT | \ + EXT4_TUNE_FL_CHECKINTRVAL | EXT4_TUNE_FL_LAST_CHECK_TIME | \ + EXT4_TUNE_FL_RESERVED_BLOCKS | EXT4_TUNE_FL_RESERVED_UID | \ + EXT4_TUNE_FL_RESERVED_GID | EXT4_TUNE_FL_DEFAULT_MNT_OPTS | \ + EXT4_TUNE_FL_DEF_HASH_ALG | EXT4_TUNE_FL_RAID_STRIDE | \ + EXT4_TUNE_FL_RAID_STRIPE_WIDTH | EXT4_TUNE_FL_MOUNT_OPTS | \ + EXT4_TUNE_FL_FEATURES | EXT4_TUNE_FL_EDIT_FEATURES | \ + EXT4_TUNE_FL_FORCE_FSCK | EXT4_TUNE_FL_ENCODING | \ + EXT4_TUNE_FL_ENCODING_FLAGS) + +#define EXT4_TUNE_SET_COMPAT_SUPP \ + (EXT4_FEATURE_COMPAT_DIR_INDEX | \ + EXT4_FEATURE_COMPAT_STABLE_INODES) +#define EXT4_TUNE_SET_INCOMPAT_SUPP \ + (EXT4_FEATURE_INCOMPAT_EXTENTS | \ + EXT4_FEATURE_INCOMPAT_EA_INODE | \ + EXT4_FEATURE_INCOMPAT_ENCRYPT | \ + EXT4_FEATURE_INCOMPAT_CSUM_SEED | \ + EXT4_FEATURE_INCOMPAT_LARGEDIR | \ + EXT4_FEATURE_INCOMPAT_CASEFOLD) +#define EXT4_TUNE_SET_RO_COMPAT_SUPP \ + 
(EXT4_FEATURE_RO_COMPAT_LARGE_FILE | \ + EXT4_FEATURE_RO_COMPAT_DIR_NLINK | \ + EXT4_FEATURE_RO_COMPAT_EXTRA_ISIZE | \ + EXT4_FEATURE_RO_COMPAT_PROJECT | \ + EXT4_FEATURE_RO_COMPAT_VERITY) + +#define EXT4_TUNE_CLEAR_COMPAT_SUPP (0) +#define EXT4_TUNE_CLEAR_INCOMPAT_SUPP (0) +#define EXT4_TUNE_CLEAR_RO_COMPAT_SUPP (0) + +#define SB_ENC_SUPP_MASK (SB_ENC_STRICT_MODE_FL | \ + SB_ENC_NO_COMPAT_FALLBACK_FL) + +static int ext4_ioctl_get_tune_sb(struct ext4_sb_info *sbi, + struct ext4_tune_sb_params __user *params) +{ + struct ext4_tune_sb_params ret; + struct ext4_super_block *es = sbi->s_es; + + memset(&ret, 0, sizeof(ret)); + ret.set_flags = TUNE_OPS_SUPPORTED; + ret.errors_behavior = le16_to_cpu(es->s_errors); + ret.mnt_count = le16_to_cpu(es->s_mnt_count); + ret.max_mnt_count = le16_to_cpu(es->s_max_mnt_count); + ret.checkinterval = le32_to_cpu(es->s_checkinterval); + ret.last_check_time = le32_to_cpu(es->s_lastcheck); + ret.reserved_blocks = ext4_r_blocks_count(es); + ret.blocks_count = ext4_blocks_count(es); + ret.reserved_uid = ext4_get_resuid(es); + ret.reserved_gid = ext4_get_resgid(es); + ret.default_mnt_opts = le32_to_cpu(es->s_default_mount_opts); + ret.def_hash_alg = es->s_def_hash_version; + ret.raid_stride = le16_to_cpu(es->s_raid_stride); + ret.raid_stripe_width = le32_to_cpu(es->s_raid_stripe_width); + ret.encoding = le16_to_cpu(es->s_encoding); + ret.encoding_flags = le16_to_cpu(es->s_encoding_flags); + strscpy_pad(ret.mount_opts, es->s_mount_opts); + ret.feature_compat = le32_to_cpu(es->s_feature_compat); + ret.feature_incompat = le32_to_cpu(es->s_feature_incompat); + ret.feature_ro_compat = le32_to_cpu(es->s_feature_ro_compat); + ret.set_feature_compat_mask = EXT4_TUNE_SET_COMPAT_SUPP; + ret.set_feature_incompat_mask = EXT4_TUNE_SET_INCOMPAT_SUPP; + ret.set_feature_ro_compat_mask = EXT4_TUNE_SET_RO_COMPAT_SUPP; + ret.clear_feature_compat_mask = EXT4_TUNE_CLEAR_COMPAT_SUPP; + ret.clear_feature_incompat_mask = EXT4_TUNE_CLEAR_INCOMPAT_SUPP; + ret.clear_feature_ro_compat_mask = EXT4_TUNE_CLEAR_RO_COMPAT_SUPP; + if (copy_to_user(params, &ret, sizeof(ret))) + return -EFAULT; + return 0; +} + +static void ext4_sb_setparams(struct ext4_sb_info *sbi, + struct ext4_super_block *es, const void *arg) +{ + const struct ext4_tune_sb_params *params = arg; + + if (params->set_flags & EXT4_TUNE_FL_ERRORS_BEHAVIOR) + es->s_errors = cpu_to_le16(params->errors_behavior); + if (params->set_flags & EXT4_TUNE_FL_MNT_COUNT) + es->s_mnt_count = cpu_to_le16(params->mnt_count); + if (params->set_flags & EXT4_TUNE_FL_MAX_MNT_COUNT) + es->s_max_mnt_count = cpu_to_le16(params->max_mnt_count); + if (params->set_flags & EXT4_TUNE_FL_CHECKINTRVAL) + es->s_checkinterval = cpu_to_le32(params->checkinterval); + if (params->set_flags & EXT4_TUNE_FL_LAST_CHECK_TIME) + es->s_lastcheck = cpu_to_le32(params->last_check_time); + if (params->set_flags & EXT4_TUNE_FL_RESERVED_BLOCKS) { + ext4_fsblk_t blk = params->reserved_blocks; + + es->s_r_blocks_count_lo = cpu_to_le32((u32)blk); + es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32); + } + if (params->set_flags & EXT4_TUNE_FL_RESERVED_UID) { + int uid = params->reserved_uid; + + es->s_def_resuid = cpu_to_le16(uid & 0xFFFF); + es->s_def_resuid_hi = cpu_to_le16(uid >> 16); + } + if (params->set_flags & EXT4_TUNE_FL_RESERVED_GID) { + int gid = params->reserved_gid; + + es->s_def_resgid = cpu_to_le16(gid & 0xFFFF); + es->s_def_resgid_hi = cpu_to_le16(gid >> 16); + } + if (params->set_flags & EXT4_TUNE_FL_DEFAULT_MNT_OPTS) + es->s_default_mount_opts = 
cpu_to_le32(params->default_mnt_opts); + if (params->set_flags & EXT4_TUNE_FL_DEF_HASH_ALG) + es->s_def_hash_version = params->def_hash_alg; + if (params->set_flags & EXT4_TUNE_FL_RAID_STRIDE) + es->s_raid_stride = cpu_to_le16(params->raid_stride); + if (params->set_flags & EXT4_TUNE_FL_RAID_STRIPE_WIDTH) + es->s_raid_stripe_width = + cpu_to_le32(params->raid_stripe_width); + if (params->set_flags & EXT4_TUNE_FL_ENCODING) + es->s_encoding = cpu_to_le16(params->encoding); + if (params->set_flags & EXT4_TUNE_FL_ENCODING_FLAGS) + es->s_encoding_flags = cpu_to_le16(params->encoding_flags); + if (params->set_flags & EXT4_TUNE_FL_MOUNT_OPTS) + strscpy_pad(es->s_mount_opts, params->mount_opts); + if (params->set_flags & EXT4_TUNE_FL_EDIT_FEATURES) { + es->s_feature_compat |= + cpu_to_le32(params->set_feature_compat_mask); + es->s_feature_incompat |= + cpu_to_le32(params->set_feature_incompat_mask); + es->s_feature_ro_compat |= + cpu_to_le32(params->set_feature_ro_compat_mask); + es->s_feature_compat &= + ~cpu_to_le32(params->clear_feature_compat_mask); + es->s_feature_incompat &= + ~cpu_to_le32(params->clear_feature_incompat_mask); + es->s_feature_ro_compat &= + ~cpu_to_le32(params->clear_feature_ro_compat_mask); + if (params->set_feature_compat_mask & + EXT4_FEATURE_COMPAT_DIR_INDEX) + es->s_def_hash_version = sbi->s_def_hash_version; + if (params->set_feature_incompat_mask & + EXT4_FEATURE_INCOMPAT_CSUM_SEED) + es->s_checksum_seed = cpu_to_le32(sbi->s_csum_seed); + } + if (params->set_flags & EXT4_TUNE_FL_FORCE_FSCK) + es->s_state |= cpu_to_le16(EXT4_ERROR_FS); +} + +static int ext4_ioctl_set_tune_sb(struct file *filp, + struct ext4_tune_sb_params __user *in) +{ + struct ext4_tune_sb_params params; + struct super_block *sb = file_inode(filp)->i_sb; + struct ext4_sb_info *sbi = EXT4_SB(sb); + struct ext4_super_block *es = sbi->s_es; + int enabling_casefold = 0; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&params, in, sizeof(params))) + return -EFAULT; + + if ((params.set_flags & ~TUNE_OPS_SUPPORTED) != 0) + return -EOPNOTSUPP; + + if ((params.set_flags & EXT4_TUNE_FL_ERRORS_BEHAVIOR) && + (params.errors_behavior > EXT4_ERRORS_PANIC)) + return -EINVAL; + + if ((params.set_flags & EXT4_TUNE_FL_RESERVED_BLOCKS) && + (params.reserved_blocks > ext4_blocks_count(sbi->s_es) / 2)) + return -EINVAL; + if ((params.set_flags & EXT4_TUNE_FL_DEF_HASH_ALG) && + ((params.def_hash_alg > DX_HASH_LAST) || + (params.def_hash_alg == DX_HASH_SIPHASH))) + return -EINVAL; + if ((params.set_flags & EXT4_TUNE_FL_FEATURES) && + (params.set_flags & EXT4_TUNE_FL_EDIT_FEATURES)) + return -EINVAL; + + if (params.set_flags & EXT4_TUNE_FL_FEATURES) { + params.set_feature_compat_mask = + params.feature_compat & + ~le32_to_cpu(es->s_feature_compat); + params.set_feature_incompat_mask = + params.feature_incompat & + ~le32_to_cpu(es->s_feature_incompat); + params.set_feature_ro_compat_mask = + params.feature_ro_compat & + ~le32_to_cpu(es->s_feature_ro_compat); + params.clear_feature_compat_mask = + ~params.feature_compat & + le32_to_cpu(es->s_feature_compat); + params.clear_feature_incompat_mask = + ~params.feature_incompat & + le32_to_cpu(es->s_feature_incompat); + params.clear_feature_ro_compat_mask = + ~params.feature_ro_compat & + le32_to_cpu(es->s_feature_ro_compat); + params.set_flags |= EXT4_TUNE_FL_EDIT_FEATURES; + } + if (params.set_flags & EXT4_TUNE_FL_EDIT_FEATURES) { + if ((params.set_feature_compat_mask & + ~EXT4_TUNE_SET_COMPAT_SUPP) || + (params.set_feature_incompat_mask & + ~EXT4_TUNE_SET_INCOMPAT_SUPP) || +
(params.set_feature_ro_compat_mask & + ~EXT4_TUNE_SET_RO_COMPAT_SUPP) || + (params.clear_feature_compat_mask & + ~EXT4_TUNE_CLEAR_COMPAT_SUPP) || + (params.clear_feature_incompat_mask & + ~EXT4_TUNE_CLEAR_INCOMPAT_SUPP) || + (params.clear_feature_ro_compat_mask & + ~EXT4_TUNE_CLEAR_RO_COMPAT_SUPP)) + return -EOPNOTSUPP; + + /* + * Filter out the features that are already set from + * the set_mask. + */ + params.set_feature_compat_mask &= + ~le32_to_cpu(es->s_feature_compat); + params.set_feature_incompat_mask &= + ~le32_to_cpu(es->s_feature_incompat); + params.set_feature_ro_compat_mask &= + ~le32_to_cpu(es->s_feature_ro_compat); + if ((params.set_feature_incompat_mask & + EXT4_FEATURE_INCOMPAT_CASEFOLD)) { + enabling_casefold = 1; + if (!(params.set_flags & EXT4_TUNE_FL_ENCODING)) { + params.encoding = EXT4_ENC_UTF8_12_1; + params.set_flags |= EXT4_TUNE_FL_ENCODING; + } + if (!(params.set_flags & EXT4_TUNE_FL_ENCODING_FLAGS)) { + params.encoding_flags = 0; + params.set_flags |= EXT4_TUNE_FL_ENCODING_FLAGS; + } + } + if ((params.set_feature_compat_mask & + EXT4_FEATURE_COMPAT_DIR_INDEX)) { + uuid_t uu; + + memcpy(&uu, sbi->s_hash_seed, UUID_SIZE); + if (uuid_is_null(&uu)) + generate_random_uuid((char *) + &sbi->s_hash_seed); + if (params.set_flags & EXT4_TUNE_FL_DEF_HASH_ALG) + sbi->s_def_hash_version = params.def_hash_alg; + else if (sbi->s_def_hash_version == 0) + sbi->s_def_hash_version = DX_HASH_HALF_MD4; + if (!(es->s_flags & + cpu_to_le32(EXT2_FLAGS_UNSIGNED_HASH)) && + !(es->s_flags & + cpu_to_le32(EXT2_FLAGS_SIGNED_HASH))) { +#ifdef __CHAR_UNSIGNED__ + sbi->s_hash_unsigned = 3; +#else + sbi->s_hash_unsigned = 0; +#endif + } + } + } + if (params.set_flags & EXT4_TUNE_FL_ENCODING) { + if (!enabling_casefold) + return -EINVAL; + if (params.encoding == 0) + params.encoding = EXT4_ENC_UTF8_12_1; + else if (params.encoding != EXT4_ENC_UTF8_12_1) + return -EINVAL; + } + if (params.set_flags & EXT4_TUNE_FL_ENCODING_FLAGS) { + if (!enabling_casefold) + return -EINVAL; + if (params.encoding_flags & ~SB_ENC_SUPP_MASK) + return -EINVAL; + } + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + ret = ext4_update_superblocks_fn(sb, ext4_sb_setparams, &params); + mnt_drop_write_file(filp); + + if (params.set_flags & EXT4_TUNE_FL_DEF_HASH_ALG) + sbi->s_def_hash_version = params.def_hash_alg; + + return ret; +} + static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -1616,6 +1908,11 @@ resizefs_out: return ext4_ioctl_getuuid(EXT4_SB(sb), (void __user *)arg); case EXT4_IOC_SETFSUUID: return ext4_ioctl_setuuid(filp, (const void __user *)arg); + case EXT4_IOC_GET_TUNE_SB_PARAM: + return ext4_ioctl_get_tune_sb(EXT4_SB(sb), + (void __user *)arg); + case EXT4_IOC_SET_TUNE_SB_PARAM: + return ext4_ioctl_set_tune_sb(filp, (void __user *)arg); default: return -ENOTTY; } @@ -1703,7 +2000,8 @@ long ext4_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) } #endif -static void set_overhead(struct ext4_super_block *es, const void *arg) +static void set_overhead(struct ext4_sb_info *sbi, + struct ext4_super_block *es, const void *arg) { es->s_overhead_clusters = cpu_to_le32(*((unsigned long *) arg)); } diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 8b18802e83eb..9087183602e4 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3655,16 +3655,26 @@ static void ext4_discard_work(struct work_struct *work) static inline void ext4_mb_avg_fragment_size_destroy(struct ext4_sb_info *sbi) { + if
(!sbi->s_mb_avg_fragment_size) + return; + for (int i = 0; i < MB_NUM_ORDERS(sbi->s_sb); i++) xa_destroy(&sbi->s_mb_avg_fragment_size[i]); + kfree(sbi->s_mb_avg_fragment_size); + sbi->s_mb_avg_fragment_size = NULL; } static inline void ext4_mb_largest_free_orders_destroy(struct ext4_sb_info *sbi) { + if (!sbi->s_mb_largest_free_orders) + return; + for (int i = 0; i < MB_NUM_ORDERS(sbi->s_sb); i++) xa_destroy(&sbi->s_mb_largest_free_orders[i]); + kfree(sbi->s_mb_largest_free_orders); + sbi->s_mb_largest_free_orders = NULL; } int ext4_mb_init(struct super_block *sb) diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index 51661570cf3b..ab1ff51302fb 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -231,9 +231,9 @@ static int kmmpd(void *data) * Adjust the mmp_check_interval depending on how much time * it took for the MMP block to be written. */ - mmp_check_interval = max(min(EXT4_MMP_CHECK_MULT * diff / HZ, - EXT4_MMP_MAX_CHECK_INTERVAL), - EXT4_MMP_MIN_CHECK_INTERVAL); + mmp_check_interval = clamp(EXT4_MMP_CHECK_MULT * diff / HZ, + EXT4_MMP_MIN_CHECK_INTERVAL, + EXT4_MMP_MAX_CHECK_INTERVAL); mmp->mmp_check_interval = cpu_to_le16(mmp_check_interval); } diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index adae3caf175a..4b091c21908f 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -225,7 +225,7 @@ static int mext_page_mkuptodate(struct folio *folio, size_t from, size_t to) do { if (bh_offset(bh) + blocksize <= from) continue; - if (bh_offset(bh) > to) + if (bh_offset(bh) >= to) break; wait_on_buffer(bh); if (buffer_uptodate(bh)) diff --git a/fs/ext4/orphan.c b/fs/ext4/orphan.c index 524d4658fa40..82d5e7501455 100644 --- a/fs/ext4/orphan.c +++ b/fs/ext4/orphan.c @@ -109,11 +109,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) && !inode_is_locked(inode)); - /* - * Inode orphaned in orphan file or in orphan list? - */ - if (ext4_test_inode_state(inode, EXT4_STATE_ORPHAN_FILE) || - !list_empty(&EXT4_I(inode)->i_orphan)) + if (ext4_inode_orphan_tracked(inode)) return 0; /* @@ -517,7 +513,7 @@ void ext4_release_orphan_info(struct super_block *sb) return; for (i = 0; i < oi->of_blocks; i++) brelse(oi->of_binfo[i].ob_bh); - kfree(oi->of_binfo); + kvfree(oi->of_binfo); } static struct ext4_orphan_block_tail *ext4_orphan_block_tail( @@ -587,9 +583,20 @@ int ext4_init_orphan_info(struct super_block *sb) ext4_msg(sb, KERN_ERR, "get orphan inode failed"); return PTR_ERR(inode); } + /* + * This is just an artificial limit to prevent corrupted fs from + * consuming absurd amounts of memory when pinning blocks of orphan + * file in memory. 
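+ * (Worked bound: with the 8 MiB cap below and a 4 KiB block size, of_blocks is at most 8 MiB / 4 KiB = 2048, so the of_binfo array allocated with kvmalloc_array() stays in the tens-of-kilobytes range.)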
+ */ + if (inode->i_size > 8 << 20) { + ext4_msg(sb, KERN_ERR, "orphan file too big: %llu", + (unsigned long long)inode->i_size); + ret = -EFSCORRUPTED; + goto out_put; + } oi->of_blocks = inode->i_size >> sb->s_blocksize_bits; oi->of_csum_seed = EXT4_I(inode)->i_csum_seed; - oi->of_binfo = kmalloc_array(oi->of_blocks, + oi->of_binfo = kvmalloc_array(oi->of_blocks, sizeof(struct ext4_orphan_block), GFP_KERNEL); if (!oi->of_binfo) { @@ -630,7 +637,7 @@ int ext4_init_orphan_info(struct super_block *sb) out_free: for (i--; i >= 0; i--) brelse(oi->of_binfo[i].ob_bh); - kfree(oi->of_binfo); + kvfree(oi->of_binfo); out_put: iput(inode); return ret; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 7f2d4014d128..33e7c08c9529 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -265,6 +265,15 @@ struct buffer_head *ext4_sb_bread_unmovable(struct super_block *sb, return __ext4_sb_bread_gfp(sb, block, 0, gfp); } +struct buffer_head *ext4_sb_bread_nofail(struct super_block *sb, + sector_t block) +{ + gfp_t gfp = mapping_gfp_constraint(sb->s_bdev->bd_mapping, + ~__GFP_FS) | __GFP_MOVABLE | __GFP_NOFAIL; + + return __ext4_sb_bread_gfp(sb, block, 0, gfp); +} + void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block) { struct buffer_head *bh = bdev_getblk(sb->s_bdev, block, @@ -1438,9 +1447,9 @@ static void ext4_free_in_core_inode(struct inode *inode) static void ext4_destroy_inode(struct inode *inode) { - if (!list_empty(&(EXT4_I(inode)->i_orphan))) { + if (ext4_inode_orphan_tracked(inode)) { ext4_msg(inode->i_sb, KERN_ERR, - "Inode %lu (%p): orphan list check failed!", + "Inode %lu (%p): inode tracked as orphan!", inode->i_ino, EXT4_I(inode)); print_hex_dump(KERN_INFO, "", DUMP_PREFIX_ADDRESS, 16, 4, EXT4_I(inode), sizeof(struct ext4_inode_info), @@ -2466,7 +2475,7 @@ static int parse_apply_sb_mount_options(struct super_block *sb, struct ext4_fs_context *m_ctx) { struct ext4_sb_info *sbi = EXT4_SB(sb); - char *s_mount_opts = NULL; + char s_mount_opts[65]; struct ext4_fs_context *s_ctx = NULL; struct fs_context *fc = NULL; int ret = -ENOMEM; @@ -2474,15 +2483,11 @@ static int parse_apply_sb_mount_options(struct super_block *sb, if (!sbi->s_es->s_mount_opts[0]) return 0; - s_mount_opts = kstrndup(sbi->s_es->s_mount_opts, - sizeof(sbi->s_es->s_mount_opts), - GFP_KERNEL); - if (!s_mount_opts) - return ret; + strscpy_pad(s_mount_opts, sbi->s_es->s_mount_opts); fc = kzalloc(sizeof(struct fs_context), GFP_KERNEL); if (!fc) - goto out_free; + return -ENOMEM; s_ctx = kzalloc(sizeof(struct ext4_fs_context), GFP_KERNEL); if (!s_ctx) @@ -2514,11 +2519,8 @@ parse_failed: ret = 0; out_free: - if (fc) { - ext4_fc_free(fc); - kfree(fc); - } - kfree(s_mount_opts); + ext4_fc_free(fc); + kfree(fc); return ret; } @@ -2964,11 +2966,11 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, } if (nodefs || !uid_eq(sbi->s_resuid, make_kuid(&init_user_ns, EXT4_DEF_RESUID)) || - le16_to_cpu(es->s_def_resuid) != EXT4_DEF_RESUID) + ext4_get_resuid(es) != EXT4_DEF_RESUID) SEQ_OPTS_PRINT("resuid=%u", from_kuid_munged(&init_user_ns, sbi->s_resuid)); if (nodefs || !gid_eq(sbi->s_resgid, make_kgid(&init_user_ns, EXT4_DEF_RESGID)) || - le16_to_cpu(es->s_def_resgid) != EXT4_DEF_RESGID) + ext4_get_resgid(es) != EXT4_DEF_RESGID) SEQ_OPTS_PRINT("resgid=%u", from_kgid_munged(&init_user_ns, sbi->s_resgid)); def_errors = nodefs ? 
-1 : le16_to_cpu(es->s_errors); @@ -5283,8 +5285,8 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) ext4_set_def_opts(sb, es); - sbi->s_resuid = make_kuid(&init_user_ns, le16_to_cpu(es->s_def_resuid)); - sbi->s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid)); + sbi->s_resuid = make_kuid(&init_user_ns, ext4_get_resuid(es)); + sbi->s_resgid = make_kgid(&init_user_ns, ext4_get_resgid(es)); sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ; sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME; sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 5a6fe1513fd2..ce7253b3f549 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -251,6 +251,10 @@ check_xattrs(struct inode *inode, struct buffer_head *bh, err_str = "invalid ea_ino"; goto errout; } + if (ea_ino && !size) { + err_str = "invalid size in ea xattr"; + goto errout; + } if (size > EXT4_XATTR_SIZE_MAX) { err_str = "e_value size too large"; goto errout; } @@ -1019,7 +1023,7 @@ static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode, int ref_change) { struct ext4_iloc iloc; - s64 ref_count; + u64 ref_count; int ret; inode_lock_nested(ea_inode, I_MUTEX_XATTR); @@ -1029,13 +1033,17 @@ static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode, goto out; ref_count = ext4_xattr_inode_get_ref(ea_inode); + if ((ref_count == 0 && ref_change < 0) || (ref_count == U64_MAX && ref_change > 0)) { + ext4_error_inode(ea_inode, __func__, __LINE__, 0, + "EA inode %lu ref wraparound: ref_count=%llu ref_change=%d", + ea_inode->i_ino, ref_count, ref_change); + ret = -EFSCORRUPTED; + goto out; + } ref_count += ref_change; ext4_xattr_inode_set_ref(ea_inode, ref_count); if (ref_change > 0) { - WARN_ONCE(ref_count <= 0, "EA inode %lu ref_count=%lld", - ea_inode->i_ino, ref_count); - if (ref_count == 1) { WARN_ONCE(ea_inode->i_nlink, "EA inode %lu i_nlink=%u", ea_inode->i_ino, ea_inode->i_nlink); @@ -1044,9 +1052,6 @@ static int ext4_xattr_inode_update_ref(handle_t *handle, struct inode *ea_inode, ext4_orphan_del(handle, ea_inode); } } else { - WARN_ONCE(ref_count < 0, "EA inode %lu ref_count=%lld", - ea_inode->i_ino, ref_count); - if (ref_count == 0) { WARN_ONCE(ea_inode->i_nlink != 1, "EA inode %lu i_nlink=%u", @@ -1530,7 +1535,7 @@ ext4_xattr_inode_cache_find(struct inode *inode, const void *value, WARN_ON_ONCE(ext4_handle_valid(journal_current_handle()) && !(current->flags & PF_MEMALLOC_NOFS)); - ea_data = kvmalloc(value_len, GFP_KERNEL); + ea_data = kvmalloc(value_len, GFP_NOFS); if (!ea_data) { mb_cache_entry_put(ea_inode_cache, ce); return NULL; } diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index db3831f7f2f5..bbe07e3a6c75 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1442,6 +1442,34 @@ u64 f2fs_get_sectors_written(struct f2fs_sb_info *sbi) return get_sectors_written(sbi->sb->s_bdev); } +static inline void stat_cp_time(struct cp_control *cpc, enum cp_time type) +{ + cpc->stats.times[type] = ktime_get(); +} + +static inline void check_cp_time(struct f2fs_sb_info *sbi, struct cp_control *cpc) +{ + unsigned long long sb_diff, cur_diff; + enum cp_time ct; + + sb_diff = (u64)ktime_ms_delta(sbi->cp_stats.times[CP_TIME_END], + sbi->cp_stats.times[CP_TIME_START]); + cur_diff = (u64)ktime_ms_delta(cpc->stats.times[CP_TIME_END], + cpc->stats.times[CP_TIME_START]); + + if (cur_diff > sb_diff) { + sbi->cp_stats = cpc->stats; + if (cur_diff < CP_LONG_LATENCY_THRESHOLD) + return; + + f2fs_warn(sbi,
"checkpoint was blocked for %llu ms", cur_diff); + for (ct = CP_TIME_START; ct < CP_TIME_MAX - 1; ct++) + f2fs_warn(sbi, "Step#%d: %llu ms", ct, + (u64)ktime_ms_delta(cpc->stats.times[ct + 1], + cpc->stats.times[ct])); + } +} + static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); @@ -1459,6 +1487,8 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* Flush all the NAT/SIT pages */ f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); + stat_cp_time(cpc, CP_TIME_SYNC_META); + /* start to update checkpoint, cp ver is already updated previously */ ckpt->elapsed_time = cpu_to_le64(get_mtime(sbi, true)); ckpt->free_segment_count = cpu_to_le32(free_segments(sbi)); @@ -1555,20 +1585,26 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* Here, we have one bio having CP pack except cp pack 2 page */ f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); + stat_cp_time(cpc, CP_TIME_SYNC_CP_META); + /* Wait for all dirty meta pages to be submitted for IO */ f2fs_wait_on_all_pages(sbi, F2FS_DIRTY_META); + stat_cp_time(cpc, CP_TIME_WAIT_DIRTY_META); /* wait for previous submitted meta pages writeback */ f2fs_wait_on_all_pages(sbi, F2FS_WB_CP_DATA); + stat_cp_time(cpc, CP_TIME_WAIT_CP_DATA); /* flush all device cache */ err = f2fs_flush_device_cache(sbi); if (err) return err; + stat_cp_time(cpc, CP_TIME_FLUSH_DEVICE); /* barrier and flush checkpoint cp pack 2 page if it can */ commit_checkpoint(sbi, ckpt, start_blk); f2fs_wait_on_all_pages(sbi, F2FS_WB_CP_DATA); + stat_cp_time(cpc, CP_TIME_WAIT_LAST_CP); /* * invalidate intermediate page cache borrowed from meta inode which are @@ -1613,6 +1649,8 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) unsigned long long ckpt_ver; int err = 0; + stat_cp_time(cpc, CP_TIME_START); + if (f2fs_readonly(sbi->sb) || f2fs_hw_is_readonly(sbi)) return -EROFS; @@ -1624,6 +1662,8 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) if (cpc->reason != CP_RESIZE) f2fs_down_write(&sbi->cp_global_sem); + stat_cp_time(cpc, CP_TIME_LOCK); + if (!is_sbi_flag_set(sbi, SBI_IS_DIRTY) && ((cpc->reason & CP_FASTBOOT) || (cpc->reason & CP_SYNC) || ((cpc->reason & CP_DISCARD) && !sbi->discard_blks))) @@ -1639,6 +1679,8 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) if (err) goto out; + stat_cp_time(cpc, CP_TIME_OP_LOCK); + trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish block_ops"); f2fs_flush_merged_writes(sbi); @@ -1678,6 +1720,8 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) f2fs_flush_sit_entries(sbi, cpc); + stat_cp_time(cpc, CP_TIME_FLUSH_META); + /* save inmem log status */ f2fs_save_inmem_curseg(sbi); @@ -1695,6 +1739,8 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) stat_inc_cp_count(sbi); stop: unblock_operations(sbi); + stat_cp_time(cpc, CP_TIME_END); + check_cp_time(sbi, cpc); if (cpc->reason & CP_RECOVERY) f2fs_notice(sbi, "checkpoint: version = %llx", ckpt_ver); @@ -1778,6 +1824,7 @@ static void __checkpoint_and_complete_reqs(struct f2fs_sb_info *sbi) llist_for_each_entry_safe(req, next, dispatch_list, llnode) { diff = (u64)ktime_ms_delta(ktime_get(), req->queue_time); req->ret = ret; + req->delta_time = diff; complete(&req->wait); sum_diff += diff; @@ -1873,6 +1920,12 @@ int f2fs_issue_checkpoint(struct f2fs_sb_info *sbi) else flush_remained_ckpt_reqs(sbi, &req); + if 
(unlikely(req.delta_time >= CP_LONG_LATENCY_THRESHOLD)) { + f2fs_warn_ratelimited(sbi, + "blocked on checkpoint for %u ms", cprc->peak_time); + dump_stack(); + } + return req.ret; } diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 5c1f47e45dab..6ad8d3bc6df7 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -1215,9 +1215,11 @@ int f2fs_truncate_partial_cluster(struct inode *inode, u64 from, bool lock) { void *fsdata = NULL; struct page *pagep; + struct page **rpages; int log_cluster_size = F2FS_I(inode)->i_log_cluster_size; pgoff_t start_idx = from >> (PAGE_SHIFT + log_cluster_size) << log_cluster_size; + int i; int err; err = f2fs_is_compressed_cluster(inode, start_idx); @@ -1238,27 +1240,30 @@ int f2fs_truncate_partial_cluster(struct inode *inode, u64 from, bool lock) if (err <= 0) return err; - if (err > 0) { - struct page **rpages = fsdata; - int cluster_size = F2FS_I(inode)->i_cluster_size; - int i; - - for (i = cluster_size - 1; i >= 0; i--) { - struct folio *folio = page_folio(rpages[i]); - loff_t start = folio->index << PAGE_SHIFT; - - if (from <= start) { - folio_zero_segment(folio, 0, folio_size(folio)); - } else { - folio_zero_segment(folio, from - start, - folio_size(folio)); - break; - } - } + rpages = fsdata; + + for (i = (1 << log_cluster_size) - 1; i >= 0; i--) { + struct folio *folio = page_folio(rpages[i]); + loff_t start = (loff_t)folio->index << PAGE_SHIFT; + loff_t offset = from > start ? from - start : 0; - f2fs_compress_write_end(inode, fsdata, start_idx, true); + folio_zero_segment(folio, offset, folio_size(folio)); + + if (from >= start) + break; } - return 0; + + f2fs_compress_write_end(inode, fsdata, start_idx, true); + + err = filemap_write_and_wait_range(inode->i_mapping, + round_down(from, 1 << log_cluster_size << PAGE_SHIFT), + LLONG_MAX); + if (err) + return err; + + truncate_pagecache(inode, from); + + return f2fs_do_truncate_blocks(inode, round_up(from, PAGE_SIZE), lock); } static int f2fs_write_compressed_pages(struct compress_ctx *cc, diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 7961e0ddfca3..775aa4f63aa3 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -733,9 +733,11 @@ static bool page_is_mergeable(struct f2fs_sb_info *sbi, struct bio *bio, static bool io_type_is_mergeable(struct f2fs_bio_info *io, struct f2fs_io_info *fio) { + blk_opf_t mask = ~(REQ_PREFLUSH | REQ_FUA); + if (io->fio.op != fio->op) return false; - return io->fio.op_flags == fio->op_flags; + return (io->fio.op_flags & mask) == (fio->op_flags & mask); } static bool io_is_mergeable(struct f2fs_sb_info *sbi, struct bio *bio, @@ -911,7 +913,7 @@ alloc_new: if (fio->io_wbc) wbc_account_cgroup_owner(fio->io_wbc, folio, folio_size(folio)); - inc_page_count(fio->sbi, WB_DATA_TYPE(data_folio, false)); + inc_page_count(fio->sbi, WB_DATA_TYPE(folio, false)); *fio->last_block = fio->new_blkaddr; *fio->bio = bio; @@ -1083,7 +1085,7 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, } /* This can handle encryption stuffs */ -static int f2fs_submit_page_read(struct inode *inode, struct folio *folio, +static void f2fs_submit_page_read(struct inode *inode, struct folio *folio, block_t blkaddr, blk_opf_t op_flags, bool for_write) { @@ -1092,23 +1094,16 @@ static int f2fs_submit_page_read(struct inode *inode, struct folio *folio, bio = f2fs_grab_read_bio(inode, blkaddr, 1, op_flags, folio->index, for_write); - if (IS_ERR(bio)) - return PTR_ERR(bio); /* wait for GCed page writeback via META_MAPPING */ f2fs_wait_on_block_writeback(inode, blkaddr); - 
if (!bio_add_folio(bio, folio, PAGE_SIZE, 0)) { - iostat_update_and_unbind_ctx(bio); - if (bio->bi_private) - mempool_free(bio->bi_private, bio_post_read_ctx_pool); - bio_put(bio); - return -EFAULT; - } + if (!bio_add_folio(bio, folio, PAGE_SIZE, 0)) + f2fs_bug_on(sbi, 1); + inc_page_count(sbi, F2FS_RD_DATA); f2fs_update_iostat(sbi, NULL, FS_DATA_READ_IO, F2FS_BLKSIZE); f2fs_submit_read_bio(sbi, bio, DATA); - return 0; } static void __set_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr) @@ -1265,10 +1260,8 @@ got_it: return folio; } - err = f2fs_submit_page_read(inode, folio, dn.data_blkaddr, + f2fs_submit_page_read(inode, folio, dn.data_blkaddr, op_flags, for_write); - if (err) - goto put_err; return folio; put_err: @@ -1504,8 +1497,8 @@ static bool f2fs_map_blocks_cached(struct inode *inode, struct f2fs_dev_info *dev = &sbi->devs[bidx]; map->m_bdev = dev->bdev; - map->m_pblk -= dev->start_blk; map->m_len = min(map->m_len, dev->end_blk + 1 - map->m_pblk); + map->m_pblk -= dev->start_blk; } else { map->m_bdev = inode->i_sb->s_bdev; } @@ -1572,6 +1565,9 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int flag) pgofs = (pgoff_t)map->m_lblk; end = pgofs + maxblocks; + if (flag == F2FS_GET_BLOCK_PRECACHE) + mode = LOOKUP_NODE_RA; + next_dnode: if (map->m_may_create) { if (f2fs_lfs_mode(sbi)) @@ -1778,12 +1774,13 @@ sync_out: if (map->m_flags & F2FS_MAP_MAPPED) { unsigned int ofs = start_pgofs - map->m_lblk; - f2fs_update_read_extent_cache_range(&dn, - start_pgofs, map->m_pblk + ofs, - map->m_len - ofs); + if (map->m_len > ofs) + f2fs_update_read_extent_cache_range(&dn, + start_pgofs, map->m_pblk + ofs, + map->m_len - ofs); } if (map->m_next_extent) - *map->m_next_extent = pgofs + 1; + *map->m_next_extent = is_hole ? pgofs + 1 : pgofs; } f2fs_put_dnode(&dn); unlock_out: @@ -2145,16 +2142,10 @@ submit_and_realloc: f2fs_submit_read_bio(F2FS_I_SB(inode), bio, DATA); bio = NULL; } - if (bio == NULL) { + if (bio == NULL) bio = f2fs_grab_read_bio(inode, block_nr, nr_pages, f2fs_ra_op_flags(rac), index, false); - if (IS_ERR(bio)) { - ret = PTR_ERR(bio); - bio = NULL; - goto out; - } - } /* * If the page is under writeback, we need to wait for @@ -2303,18 +2294,10 @@ submit_and_realloc: bio = NULL; } - if (!bio) { + if (!bio) bio = f2fs_grab_read_bio(inode, blkaddr, nr_pages - i, f2fs_ra_op_flags(rac), folio->index, for_write); - if (IS_ERR(bio)) { - ret = PTR_ERR(bio); - f2fs_decompress_end_io(dic, ret, true); - f2fs_put_dnode(&dn); - *bio_ret = NULL; - return ret; - } - } if (!bio_add_folio(bio, folio, blocksize, 0)) goto submit_and_realloc; @@ -3639,11 +3622,9 @@ repeat: err = -EFSCORRUPTED; goto put_folio; } - err = f2fs_submit_page_read(use_cow ? + f2fs_submit_page_read(use_cow ? 
F2FS_I(inode)->cow_inode : inode, folio, blkaddr, 0, true); - if (err) - goto put_folio; folio_lock(folio); if (unlikely(folio->mapping != mapping)) { diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index fffd7749d6d1..48f4f98afb01 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -16,6 +16,21 @@ #include "xattr.h" #include <trace/events/f2fs.h> +static inline bool f2fs_should_fallback_to_linear(struct inode *dir) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); + + switch (F2FS_OPTION(sbi).lookup_mode) { + case LOOKUP_PERF: + return false; + case LOOKUP_COMPAT: + return true; + case LOOKUP_AUTO: + return !sb_no_casefold_compat_fallback(sbi->sb); + } + return false; +} + #if IS_ENABLED(CONFIG_UNICODE) extern struct kmem_cache *f2fs_cf_name_slab; #endif @@ -366,7 +381,7 @@ start_find_entry: out: #if IS_ENABLED(CONFIG_UNICODE) - if (!sb_no_casefold_compat_fallback(dir->i_sb) && + if (f2fs_should_fallback_to_linear(dir) && IS_CASEFOLDED(dir) && !de && use_hash) { use_hash = false; goto start_find_entry; diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 199c1e7a83ef..33e09c453c70 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -604,7 +604,13 @@ static struct extent_node *__insert_extent_tree(struct f2fs_sb_info *sbi, p = &(*p)->rb_right; leftmost = false; } else { + f2fs_err_ratelimited(sbi, "%s: corrupted extent, type: %d, " + "extent node in rb tree [%u, %u, %u], age [%llu, %llu], " + "extent node to insert [%u, %u, %u], age [%llu, %llu]", + __func__, et->type, en->ei.fofs, en->ei.blk, en->ei.len, en->ei.age, + en->ei.last_blocks, ei->fofs, ei->blk, ei->len, ei->age, ei->last_blocks); f2fs_bug_on(sbi, 1); + return NULL; } } @@ -664,6 +670,15 @@ static void __update_extent_tree_range(struct inode *inode, if (!et) return; + if (unlikely(len == 0)) { + f2fs_err_ratelimited(sbi, "%s: extent len is zero, type: %d, " + "extent [%u, %u, %u], age [%llu, %llu]", + __func__, type, tei->fofs, tei->blk, tei->len, + tei->age, tei->last_blocks); + f2fs_bug_on(sbi, 1); + return; + } + if (type == EX_READ) trace_f2fs_update_read_extent_tree_range(inode, fofs, len, tei->blk, 0); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 6e465bbc85ee..5b4e9548a231 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -131,6 +131,7 @@ extern const char *f2fs_fault_name[FAULT_MAX]; * string rather than using the MS_LAZYTIME flag, so this must remain. */ #define F2FS_MOUNT_LAZYTIME 0x40000000 +#define F2FS_MOUNT_RESERVE_NODE 0x80000000 #define F2FS_OPTION(sbi) ((sbi)->mount_opt) #define clear_opt(sbi, option) (F2FS_OPTION(sbi).opt &= ~F2FS_MOUNT_##option) @@ -155,6 +156,18 @@ enum blkzone_allocation_policy { BLKZONE_ALLOC_PRIOR_CONV, /* Prioritize writing to conventional zones */ }; +enum bggc_io_aware_policy { + AWARE_ALL_IO, /* skip background GC if there is any kind of pending IO */ + AWARE_READ_IO, /* skip background GC if there is pending read IO */ + AWARE_NONE, /* don't consider pending IO for background GC */ +}; + +enum device_allocation_policy { + ALLOCATE_FORWARD_NOHINT, + ALLOCATE_FORWARD_WITHIN_HINT, + ALLOCATE_FORWARD_FROM_HINT, +}; + /* * An implementation of an rwsem that is explicitly unfair to readers.
This * prevents priority inversion when a low-priority reader acquires the read lock @@ -172,6 +185,7 @@ struct f2fs_rwsem { struct f2fs_mount_info { unsigned int opt; block_t root_reserved_blocks; /* root reserved blocks */ + block_t root_reserved_nodes; /* root reserved nodes */ kuid_t s_resuid; /* reserved blocks for uid */ kgid_t s_resgid; /* reserved blocks for gid */ int active_logs; /* # of active logs */ @@ -212,6 +226,7 @@ struct f2fs_mount_info { int compress_mode; /* compression mode */ unsigned char extensions[COMPRESS_EXT_NUM][F2FS_EXTENSION_LEN]; /* extensions */ unsigned char noextensions[COMPRESS_EXT_NUM][F2FS_EXTENSION_LEN]; /* extensions */ + unsigned int lookup_mode; }; #define F2FS_FEATURE_ENCRYPT 0x00000001 @@ -266,14 +281,36 @@ enum { #define DEF_CP_INTERVAL 60 /* 60 secs */ #define DEF_IDLE_INTERVAL 5 /* 5 secs */ #define DEF_DISABLE_INTERVAL 5 /* 5 secs */ +#define DEF_ENABLE_INTERVAL 16 /* 16 secs */ #define DEF_DISABLE_QUICK_INTERVAL 1 /* 1 secs */ #define DEF_UMOUNT_DISCARD_TIMEOUT 5 /* 5 secs */ +enum cp_time { + CP_TIME_START, /* begin */ + CP_TIME_LOCK, /* after cp_global_sem */ + CP_TIME_OP_LOCK, /* after block_operation */ + CP_TIME_FLUSH_META, /* after flush sit/nat */ + CP_TIME_SYNC_META, /* after sync_meta_pages */ + CP_TIME_SYNC_CP_META, /* after sync cp meta pages */ + CP_TIME_WAIT_DIRTY_META,/* after wait on dirty meta */ + CP_TIME_WAIT_CP_DATA, /* after wait on cp data */ + CP_TIME_FLUSH_DEVICE, /* after flush device cache */ + CP_TIME_WAIT_LAST_CP, /* after wait on last cp pack */ + CP_TIME_END, /* after unblock_operation */ + CP_TIME_MAX, +}; + +/* time cost stats of checkpoint */ +struct cp_stats { + ktime_t times[CP_TIME_MAX]; +}; + struct cp_control { int reason; __u64 trim_start; __u64 trim_end; __u64 trim_minlen; + struct cp_stats stats; }; /* @@ -334,7 +371,10 @@ struct ckpt_req { struct completion wait; /* completion for checkpoint done */ struct llist_node llnode; /* llist_node to be linked in wait queue */ int ret; /* return code of checkpoint */ - ktime_t queue_time; /* request queued time */ + union { + ktime_t queue_time; /* request queued time */ + ktime_t delta_time; /* time in queue */ + }; }; struct ckpt_req_control { @@ -350,6 +390,9 @@ struct ckpt_req_control { unsigned int peak_time; /* peak wait time in msec until now */ }; +/* a time threshold that checkpoint was blocked for, unit: ms */ +#define CP_LONG_LATENCY_THRESHOLD 5000 + /* for the bitmap indicate blocks to be discarded */ struct discard_entry { struct list_head list; /* list head */ @@ -1375,6 +1418,7 @@ enum { DISCARD_TIME, GC_TIME, DISABLE_TIME, + ENABLE_TIME, UMOUNT_DISCARD_TIMEOUT, MAX_TIME, }; @@ -1454,6 +1498,12 @@ enum { TOTAL_CALL = FOREGROUND, }; +enum f2fs_lookup_mode { + LOOKUP_PERF, + LOOKUP_COMPAT, + LOOKUP_AUTO, +}; + static inline int f2fs_test_bit(unsigned int nr, char *addr); static inline void f2fs_set_bit(unsigned int nr, char *addr); static inline void f2fs_clear_bit(unsigned int nr, char *addr); @@ -1643,6 +1693,7 @@ struct f2fs_sb_info { unsigned long last_time[MAX_TIME]; /* to store time in jiffies */ long interval_time[MAX_TIME]; /* to store thresholds */ struct ckpt_req_control cprc_info; /* for checkpoint request control */ + struct cp_stats cp_stats; /* for time stat of checkpoint */ struct inode_management im[MAX_INO_ENTRY]; /* manage inode cache */ @@ -1810,6 +1861,9 @@ struct f2fs_sb_info { spinlock_t dev_lock; /* protect dirty_device */ bool aligned_blksize; /* all devices has the same logical blksize */ unsigned int first_seq_zone_segno; 
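The CP_TIME_* markers recorded by stat_cp_time() turn one checkpoint into an ordered series of timestamps, and the per-step cost reported above is simply the ktime_ms_delta() between consecutive entries. A minimal standalone sketch of the same record-then-diff pattern, using POSIX clock_gettime() instead of kernel ktime (step names here are illustrative, not the f2fs API):

#include <stdio.h>
#include <time.h>

enum step { STEP_START, STEP_LOCK, STEP_FLUSH_META, STEP_END, STEP_MAX };

static long long now_ms(void)
{
	struct timespec ts;

	clock_gettime(CLOCK_MONOTONIC, &ts);
	return ts.tv_sec * 1000LL + ts.tv_nsec / 1000000;
}

int main(void)
{
	long long times[STEP_MAX];

	times[STEP_START] = now_ms();
	/* ... acquire locks (stand-in for cp_global_sem) ... */
	times[STEP_LOCK] = now_ms();
	/* ... flush metadata (stand-in for the SIT/NAT flush) ... */
	times[STEP_FLUSH_META] = now_ms();
	times[STEP_END] = now_ms();

	/* same report shape as the "Step#%d: %llu ms" warning */
	for (int s = STEP_START; s < STEP_MAX - 1; s++)
		printf("Step#%d: %lld ms\n", s, times[s + 1] - times[s]);
	return 0;
}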
/* first segno in sequential zone */ + unsigned int bggc_io_aware; /* adjust BG_GC priority based on pending IO */ + unsigned int allocate_section_hint; /* the boundary position between devices */ + unsigned int allocate_section_policy; /* determine the section writing priority */ /* For write statistics */ u64 sectors_written_start; @@ -2362,13 +2416,11 @@ static inline bool f2fs_has_xattr_block(unsigned int ofs) return ofs == XATTR_NODE_OFFSET; } -static inline bool __allow_reserved_blocks(struct f2fs_sb_info *sbi, +static inline bool __allow_reserved_root(struct f2fs_sb_info *sbi, struct inode *inode, bool cap) { if (!inode) return true; - if (!test_opt(sbi, RESERVE_ROOT)) - return false; if (IS_NOQUOTA(inode)) return true; if (uid_eq(F2FS_OPTION(sbi).s_resuid, current_fsuid())) @@ -2389,7 +2441,7 @@ static inline unsigned int get_available_block_count(struct f2fs_sb_info *sbi, avail_user_block_count = sbi->user_block_count - sbi->current_reserved_blocks; - if (!__allow_reserved_blocks(sbi, inode, cap)) + if (test_opt(sbi, RESERVE_ROOT) && !__allow_reserved_root(sbi, inode, cap)) avail_user_block_count -= F2FS_OPTION(sbi).root_reserved_blocks; if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) { @@ -2747,7 +2799,7 @@ static inline int inc_valid_node_count(struct f2fs_sb_info *sbi, struct inode *inode, bool is_inode) { block_t valid_block_count; - unsigned int valid_node_count; + unsigned int valid_node_count, avail_user_node_count; unsigned int avail_user_block_count; int err; @@ -2769,15 +2821,20 @@ static inline int inc_valid_node_count(struct f2fs_sb_info *sbi, spin_lock(&sbi->stat_lock); valid_block_count = sbi->total_valid_block_count + 1; - avail_user_block_count = get_available_block_count(sbi, inode, false); + avail_user_block_count = get_available_block_count(sbi, inode, + test_opt(sbi, RESERVE_NODE)); if (unlikely(valid_block_count > avail_user_block_count)) { spin_unlock(&sbi->stat_lock); goto enospc; } + avail_user_node_count = sbi->total_node_count - F2FS_RESERVED_NODE_NUM; + if (test_opt(sbi, RESERVE_NODE) && + !__allow_reserved_root(sbi, inode, true)) + avail_user_node_count -= F2FS_OPTION(sbi).root_reserved_nodes; valid_node_count = sbi->total_valid_node_count + 1; - if (unlikely(valid_node_count > sbi->total_node_count)) { + if (unlikely(valid_node_count > avail_user_node_count)) { spin_unlock(&sbi->stat_lock); goto enospc; } @@ -3004,13 +3061,10 @@ static inline bool is_idle(struct f2fs_sb_info *sbi, int type) if (sbi->gc_mode == GC_URGENT_HIGH) return true; - if (zoned_gc) { - if (is_inflight_read_io(sbi)) - return false; - } else { - if (is_inflight_io(sbi, type)) - return false; - } + if (sbi->bggc_io_aware == AWARE_READ_IO && is_inflight_read_io(sbi)) + return false; + if (sbi->bggc_io_aware == AWARE_ALL_IO && is_inflight_io(sbi, type)) + return false; if (sbi->gc_mode == GC_URGENT_MID) return true; @@ -3770,6 +3824,7 @@ void f2fs_hash_filename(const struct inode *dir, struct f2fs_filename *fname); * node.c */ struct node_info; +enum node_type; int f2fs_check_nid_range(struct f2fs_sb_info *sbi, nid_t nid); bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type); @@ -3792,7 +3847,8 @@ int f2fs_remove_inode_page(struct inode *inode); struct folio *f2fs_new_inode_folio(struct inode *inode); struct folio *f2fs_new_node_folio(struct dnode_of_data *dn, unsigned int ofs); void f2fs_ra_node_page(struct f2fs_sb_info *sbi, nid_t nid); -struct folio *f2fs_get_node_folio(struct f2fs_sb_info *sbi, pgoff_t nid); +struct folio *f2fs_get_node_folio(struct
f2fs_sb_info *sbi, pgoff_t nid, + enum node_type node_type); struct folio *f2fs_get_inode_folio(struct f2fs_sb_info *sbi, pgoff_t ino); struct folio *f2fs_get_xnode_folio(struct f2fs_sb_info *sbi, pgoff_t xnid); int f2fs_move_node_folio(struct folio *node_folio, int gc_type); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 42faaed6a02d..ffa045b39c01 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -35,15 +35,23 @@ #include <trace/events/f2fs.h> #include <uapi/linux/f2fs.h> -static void f2fs_zero_post_eof_page(struct inode *inode, loff_t new_size) +static void f2fs_zero_post_eof_page(struct inode *inode, + loff_t new_size, bool lock) { loff_t old_size = i_size_read(inode); if (old_size >= new_size) return; + if (mapping_empty(inode->i_mapping)) + return; + + if (lock) + filemap_invalidate_lock(inode->i_mapping); /* zero or drop pages only in range of [old_size, new_size] */ - truncate_pagecache(inode, old_size); + truncate_inode_pages_range(inode->i_mapping, old_size, new_size); + if (lock) + filemap_invalidate_unlock(inode->i_mapping); } static vm_fault_t f2fs_filemap_fault(struct vm_fault *vmf) @@ -114,9 +122,7 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) f2fs_bug_on(sbi, f2fs_has_inline_data(inode)); - filemap_invalidate_lock(inode->i_mapping); - f2fs_zero_post_eof_page(inode, (folio->index + 1) << PAGE_SHIFT); - filemap_invalidate_unlock(inode->i_mapping); + f2fs_zero_post_eof_page(inode, (folio->index + 1) << PAGE_SHIFT, true); file_update_time(vmf->vma->vm_file); filemap_invalidate_lock_shared(inode->i_mapping); @@ -904,8 +910,16 @@ int f2fs_truncate(struct inode *inode) /* we should check inline_data size */ if (!f2fs_may_inline_data(inode)) { err = f2fs_convert_inline_inode(inode); - if (err) + if (err) { + /* + * Always truncate page #0 to avoid page cache + * leak in evict() path. 
+ */ + truncate_inode_pages_range(inode->i_mapping, + F2FS_BLK_TO_BYTES(0), + F2FS_BLK_END_BYTES(0)); return err; + } } err = f2fs_truncate_blocks(inode, i_size_read(inode), true); @@ -1141,7 +1155,7 @@ int f2fs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, filemap_invalidate_lock(inode->i_mapping); if (attr->ia_size > old_size) - f2fs_zero_post_eof_page(inode, attr->ia_size); + f2fs_zero_post_eof_page(inode, attr->ia_size, false); truncate_setsize(inode, attr->ia_size); if (attr->ia_size <= old_size) @@ -1260,9 +1274,7 @@ static int f2fs_punch_hole(struct inode *inode, loff_t offset, loff_t len) if (ret) return ret; - filemap_invalidate_lock(inode->i_mapping); - f2fs_zero_post_eof_page(inode, offset + len); - filemap_invalidate_unlock(inode->i_mapping); + f2fs_zero_post_eof_page(inode, offset + len, true); pg_start = ((unsigned long long) offset) >> PAGE_SHIFT; pg_end = ((unsigned long long) offset + len) >> PAGE_SHIFT; @@ -1547,7 +1559,7 @@ static int f2fs_do_collapse(struct inode *inode, loff_t offset, loff_t len) f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); filemap_invalidate_lock(inode->i_mapping); - f2fs_zero_post_eof_page(inode, offset + len); + f2fs_zero_post_eof_page(inode, offset + len, false); f2fs_lock_op(sbi); f2fs_drop_extent_tree(inode); @@ -1670,9 +1682,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, if (ret) return ret; - filemap_invalidate_lock(mapping); - f2fs_zero_post_eof_page(inode, offset + len); - filemap_invalidate_unlock(mapping); + f2fs_zero_post_eof_page(inode, offset + len, true); pg_start = ((unsigned long long) offset) >> PAGE_SHIFT; pg_end = ((unsigned long long) offset + len) >> PAGE_SHIFT; @@ -1806,7 +1816,7 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); filemap_invalidate_lock(mapping); - f2fs_zero_post_eof_page(inode, offset + len); + f2fs_zero_post_eof_page(inode, offset + len, false); truncate_pagecache(inode, offset); while (!ret && idx > pg_start) { @@ -1864,9 +1874,7 @@ static int f2fs_expand_inode_data(struct inode *inode, loff_t offset, if (err) return err; - filemap_invalidate_lock(inode->i_mapping); - f2fs_zero_post_eof_page(inode, offset + len); - filemap_invalidate_unlock(inode->i_mapping); + f2fs_zero_post_eof_page(inode, offset + len, true); f2fs_balance_fs(sbi, true); @@ -4914,9 +4922,8 @@ static ssize_t f2fs_write_checks(struct kiocb *iocb, struct iov_iter *from) if (err) return err; - filemap_invalidate_lock(inode->i_mapping); - f2fs_zero_post_eof_page(inode, iocb->ki_pos + iov_iter_count(from)); - filemap_invalidate_unlock(inode->i_mapping); + f2fs_zero_post_eof_page(inode, + iocb->ki_pos + iov_iter_count(from), true); return count; } diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 098e9f71421e..a7708cf80c04 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1071,7 +1071,7 @@ next_step: } /* phase == 2 */ - node_folio = f2fs_get_node_folio(sbi, nid); + node_folio = f2fs_get_node_folio(sbi, nid, NODE_TYPE_REGULAR); if (IS_ERR(node_folio)) continue; @@ -1145,7 +1145,7 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, nid = le32_to_cpu(sum->nid); ofs_in_node = le16_to_cpu(sum->ofs_in_node); - node_folio = f2fs_get_node_folio(sbi, nid); + node_folio = f2fs_get_node_folio(sbi, nid, NODE_TYPE_REGULAR); if (IS_ERR(node_folio)) return false; @@ -1794,6 +1794,13 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, struct folio *sum_folio = filemap_get_folio(META_MAPPING(sbi), 
GET_SUM_BLOCK(sbi, segno)); + if (is_cursec(sbi, GET_SEC_FROM_SEG(sbi, segno))) { + f2fs_err(sbi, "%s: segment %u is used by log", + __func__, segno); + f2fs_bug_on(sbi, 1); + goto skip; + } + if (get_valid_blocks(sbi, segno, false) == 0) goto freed; if (gc_type == BG_GC && __is_large_section(sbi) && @@ -1805,7 +1812,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, sum = folio_address(sum_folio); if (type != GET_SUM_TYPE((&sum->footer))) { - f2fs_err(sbi, "Inconsistent segment (%u) type [%d, %d] in SSA and SIT", + f2fs_err(sbi, "Inconsistent segment (%u) type [%d, %d] in SIT and SSA", segno, type, GET_SUM_TYPE((&sum->footer))); f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_CORRUPTED_SUMMARY); @@ -2068,6 +2075,13 @@ int f2fs_gc_range(struct f2fs_sb_info *sbi, .iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS), }; + /* + * avoid migrating empty section, as it can be allocated by + * log in parallel. + */ + if (!get_valid_blocks(sbi, segno, true)) + continue; + if (is_cursec(sbi, GET_SEC_FROM_SEG(sbi, segno))) continue; @@ -2182,6 +2196,8 @@ static void update_fs_metadata(struct f2fs_sb_info *sbi, int secs) SM_I(sbi)->segment_count = (int)SM_I(sbi)->segment_count + segs; MAIN_SEGS(sbi) = (int)MAIN_SEGS(sbi) + segs; MAIN_SECS(sbi) += secs; + if (sbi->allocate_section_hint > MAIN_SECS(sbi)) + sbi->allocate_section_hint = MAIN_SECS(sbi); FREE_I(sbi)->free_sections = (int)FREE_I(sbi)->free_sections + secs; FREE_I(sbi)->free_segments = (int)FREE_I(sbi)->free_segments + segs; F2FS_CKPT(sbi)->user_block_count = cpu_to_le64(user_block_count + blks); @@ -2189,6 +2205,9 @@ static void update_fs_metadata(struct f2fs_sb_info *sbi, int secs) if (f2fs_is_multi_device(sbi)) { int last_dev = sbi->s_ndevs - 1; + sbi->allocate_section_hint = FDEV(0).total_segments / + SEGS_PER_SEC(sbi); + FDEV(last_dev).total_segments = (int)FDEV(last_dev).total_segments + segs; FDEV(last_dev).end_blk = diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 27743b93e186..482a362f2625 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -27,12 +27,17 @@ static struct kmem_cache *free_nid_slab; static struct kmem_cache *nat_entry_set_slab; static struct kmem_cache *fsync_node_entry_slab; +static inline bool is_invalid_nid(struct f2fs_sb_info *sbi, nid_t nid) +{ + return nid < F2FS_ROOT_INO(sbi) || nid >= NM_I(sbi)->max_nid; +} + /* * Check whether the given nid is within node id range. 
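The new is_invalid_nid() helper gives f2fs_check_nid_range() and the free-nid allocator one shared definition of a usable node id: anything below the root ino or at/above max_nid is treated as corruption. A hedged standalone sketch of that bounds check, with made-up constants standing in for F2FS_ROOT_INO(sbi) and NM_I(sbi)->max_nid:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

typedef unsigned int nid_t;

/* illustrative stand-ins for F2FS_ROOT_INO(sbi) and NM_I(sbi)->max_nid */
#define ROOT_INO 3u
#define MAX_NID  1024u

static bool is_invalid_nid(nid_t nid)
{
	/* only nids in [ROOT_INO, MAX_NID) are usable */
	return nid < ROOT_INO || nid >= MAX_NID;
}

int main(void)
{
	const nid_t samples[] = { 0, 3, 1023, 1024 };

	for (size_t i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		printf("nid %u -> %s\n", samples[i],
		       is_invalid_nid(samples[i]) ? "invalid" : "valid");
	return 0;
}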
*/ int f2fs_check_nid_range(struct f2fs_sb_info *sbi, nid_t nid) { - if (unlikely(nid < F2FS_ROOT_INO(sbi) || nid >= NM_I(sbi)->max_nid)) { + if (unlikely(is_invalid_nid(sbi, nid))) { set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "%s: out-of-range nid=%x, run fsck to fix.", __func__, nid); @@ -871,7 +876,8 @@ int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) } if (!done) { - nfolio[i] = f2fs_get_node_folio(sbi, nids[i]); + nfolio[i] = f2fs_get_node_folio(sbi, nids[i], + NODE_TYPE_NON_INODE); if (IS_ERR(nfolio[i])) { err = PTR_ERR(nfolio[i]); f2fs_folio_put(nfolio[0], false); @@ -989,7 +995,7 @@ static int truncate_dnode(struct dnode_of_data *dn) return 1; /* get direct node */ - folio = f2fs_get_node_folio(sbi, dn->nid); + folio = f2fs_get_node_folio(sbi, dn->nid, NODE_TYPE_NON_INODE); if (PTR_ERR(folio) == -ENOENT) return 1; else if (IS_ERR(folio)) @@ -1033,7 +1039,8 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs, trace_f2fs_truncate_nodes_enter(dn->inode, dn->nid, dn->data_blkaddr); - folio = f2fs_get_node_folio(F2FS_I_SB(dn->inode), dn->nid); + folio = f2fs_get_node_folio(F2FS_I_SB(dn->inode), dn->nid, + NODE_TYPE_NON_INODE); if (IS_ERR(folio)) { trace_f2fs_truncate_nodes_exit(dn->inode, PTR_ERR(folio)); return PTR_ERR(folio); @@ -1111,7 +1118,8 @@ static int truncate_partial_nodes(struct dnode_of_data *dn, /* get indirect nodes in the path */ for (i = 0; i < idx + 1; i++) { /* reference count'll be increased */ - folios[i] = f2fs_get_node_folio(F2FS_I_SB(dn->inode), nid[i]); + folios[i] = f2fs_get_node_folio(F2FS_I_SB(dn->inode), nid[i], + NODE_TYPE_NON_INODE); if (IS_ERR(folios[i])) { err = PTR_ERR(folios[i]); idx = i - 1; @@ -1496,21 +1504,37 @@ static int sanity_check_node_footer(struct f2fs_sb_info *sbi, struct folio *folio, pgoff_t nid, enum node_type ntype) { - if (unlikely(nid != nid_of_node(folio) || - (ntype == NODE_TYPE_INODE && !IS_INODE(folio)) || - (ntype == NODE_TYPE_XATTR && - !f2fs_has_xattr_block(ofs_of_node(folio))) || - time_to_inject(sbi, FAULT_INCONSISTENT_FOOTER))) { - f2fs_warn(sbi, "inconsistent node block, node_type:%d, nid:%lu, " - "node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]", - ntype, nid, nid_of_node(folio), ino_of_node(folio), - ofs_of_node(folio), cpver_of_node(folio), - next_blkaddr_of_node(folio)); - set_sbi_flag(sbi, SBI_NEED_FSCK); - f2fs_handle_error(sbi, ERROR_INCONSISTENT_FOOTER); - return -EFSCORRUPTED; + if (unlikely(nid != nid_of_node(folio))) + goto out_err; + + switch (ntype) { + case NODE_TYPE_INODE: + if (!IS_INODE(folio)) + goto out_err; + break; + case NODE_TYPE_XATTR: + if (!f2fs_has_xattr_block(ofs_of_node(folio))) + goto out_err; + break; + case NODE_TYPE_NON_INODE: + if (IS_INODE(folio)) + goto out_err; + break; + default: + break; } + if (time_to_inject(sbi, FAULT_INCONSISTENT_FOOTER)) + goto out_err; return 0; +out_err: + f2fs_warn(sbi, "inconsistent node block, node_type:%d, nid:%lu, " + "node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]", + ntype, nid, nid_of_node(folio), ino_of_node(folio), + ofs_of_node(folio), cpver_of_node(folio), + next_blkaddr_of_node(folio)); + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_handle_error(sbi, ERROR_INCONSISTENT_FOOTER); + return -EFSCORRUPTED; } static struct folio *__get_node_folio(struct f2fs_sb_info *sbi, pgoff_t nid, @@ -1546,7 +1570,7 @@ repeat: if (unlikely(!folio_test_uptodate(folio))) { err = -EIO; - goto out_err; + goto out_put_err; } if (!f2fs_inode_chksum_verify(sbi, folio)) { @@ -1567,9 +1591,10 @@ out_put_err: 
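Rewriting sanity_check_node_footer() as a switch lets each expected node type carry its own predicate, so NODE_TYPE_NON_INODE becomes one new case instead of a longer compound condition. The shape of that dispatch, sketched standalone with invented stand-in fields rather than the real footer accessors:

#include <stdbool.h>
#include <stdio.h>

enum node_type { NODE_REGULAR, NODE_INODE, NODE_XATTR, NODE_NON_INODE };

/* invented stand-in for the footer fields the real check reads */
struct node { bool is_inode; bool has_xattr_block; };

/* 0 on success, -1 standing in for -EFSCORRUPTED */
static int check_footer(const struct node *n, enum node_type expected)
{
	switch (expected) {
	case NODE_INODE:
		if (!n->is_inode)
			return -1;
		break;
	case NODE_XATTR:
		if (!n->has_xattr_block)
			return -1;
		break;
	case NODE_NON_INODE:
		if (n->is_inode)
			return -1;
		break;
	default:	/* NODE_REGULAR accepts any footer */
		break;
	}
	return 0;
}

int main(void)
{
	const struct node inode_blk = { .is_inode = true };

	printf("%d\n", check_footer(&inode_blk, NODE_NON_INODE));	/* -1 */
	printf("%d\n", check_footer(&inode_blk, NODE_INODE));	/* 0 */
	return 0;
}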
return ERR_PTR(err); } -struct folio *f2fs_get_node_folio(struct f2fs_sb_info *sbi, pgoff_t nid) +struct folio *f2fs_get_node_folio(struct f2fs_sb_info *sbi, pgoff_t nid, + enum node_type node_type) { - return __get_node_folio(sbi, nid, NULL, 0, NODE_TYPE_REGULAR); + return __get_node_folio(sbi, nid, NULL, 0, node_type); } struct folio *f2fs_get_inode_folio(struct f2fs_sb_info *sbi, pgoff_t ino) @@ -2634,6 +2659,16 @@ retry: f2fs_bug_on(sbi, list_empty(&nm_i->free_nid_list)); i = list_first_entry(&nm_i->free_nid_list, struct free_nid, list); + + if (unlikely(is_invalid_nid(sbi, i->nid))) { + spin_unlock(&nm_i->nid_list_lock); + f2fs_err(sbi, "Corrupted nid %u in free_nid_list", + i->nid); + f2fs_stop_checkpoint(sbi, false, + STOP_CP_REASON_CORRUPTED_NID); + return false; + } + *nid = i->nid; __move_free_nid(sbi, i, FREE_NID, PREALLOC_NID); diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 030390543b54..9cb8dcf8d417 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -57,6 +57,7 @@ enum node_type { NODE_TYPE_REGULAR, NODE_TYPE_INODE, NODE_TYPE_XATTR, + NODE_TYPE_NON_INODE, }; /* diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 4cb3a91801b4..215e442db72c 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -548,7 +548,7 @@ got_it: } /* Get the node page */ - node_folio = f2fs_get_node_folio(sbi, nid); + node_folio = f2fs_get_node_folio(sbi, nid, NODE_TYPE_REGULAR); if (IS_ERR(node_folio)) return PTR_ERR(node_folio); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index cc82d42ef14c..b45eace879d7 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2774,6 +2774,8 @@ static int get_new_segment(struct f2fs_sb_info *sbi, unsigned int total_zones = MAIN_SECS(sbi) / sbi->secs_per_zone; unsigned int hint = GET_SEC_FROM_SEG(sbi, *newseg); unsigned int old_zoneno = GET_ZONE_FROM_SEG(sbi, *newseg); + unsigned int alloc_policy = sbi->allocate_section_policy; + unsigned int alloc_hint = sbi->allocate_section_hint; bool init = true; int i; int ret = 0; @@ -2807,6 +2809,21 @@ static int get_new_segment(struct f2fs_sb_info *sbi, } #endif + /* + * Prevent allocate_section_hint from exceeding MAIN_SECS() + * due to desynchronization. 
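The hint handling added to get_new_segment() (continuing in the hunk below) never forces a particular segment; it only moves the search start. The boundary is first clamped against MAIN_SECS(), then ALLOCATE_FORWARD_FROM_HINT raises the start to the boundary while ALLOCATE_FORWARD_WITHIN_HINT wraps it back to zero once it crosses. A small sketch of that arithmetic, with invented section counts:

#include <stdio.h>

enum policy { NOHINT, WITHIN_HINT, FROM_HINT };

static unsigned int adjust_hint(enum policy p, unsigned int hint,
				unsigned int boundary, unsigned int main_secs)
{
	/* clamp a stale boundary, mirroring the MAIN_SECS() guard */
	if (p != NOHINT && boundary > main_secs)
		boundary = main_secs;

	if (p == FROM_HINT && hint < boundary)
		return boundary;	/* search only at/after the boundary */
	if (p == WITHIN_HINT && hint >= boundary)
		return 0;		/* restart the search inside [0, boundary) */
	return hint;
}

int main(void)
{
	printf("%u\n", adjust_hint(FROM_HINT, 10, 100, 512));	/* 100 */
	printf("%u\n", adjust_hint(WITHIN_HINT, 150, 100, 512));	/* 0 */
	printf("%u\n", adjust_hint(NOHINT, 150, 100, 512));	/* 150 */
	return 0;
}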
+ */ + if (alloc_policy != ALLOCATE_FORWARD_NOHINT && + alloc_hint > MAIN_SECS(sbi)) + alloc_hint = MAIN_SECS(sbi); + + if (alloc_policy == ALLOCATE_FORWARD_FROM_HINT && + hint < alloc_hint) + hint = alloc_hint; + else if (alloc_policy == ALLOCATE_FORWARD_WITHIN_HINT && + hint >= alloc_hint) + hint = 0; + find_other_zone: secno = find_next_zero_bit(free_i->free_secmap, MAIN_SECS(sbi), hint); @@ -3672,7 +3689,8 @@ static int __get_segment_type_6(struct f2fs_io_info *fio) if (file_is_hot(inode) || is_inode_flag_set(inode, FI_HOT_DATA) || - f2fs_is_cow_file(inode)) + f2fs_is_cow_file(inode) || + is_inode_flag_set(inode, FI_NEED_IPU)) return CURSEG_HOT_DATA; return f2fs_rw_hint_to_seg_type(F2FS_I_SB(inode), inode->i_write_hint); @@ -3936,12 +3954,18 @@ static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) int seg_type = log_type_to_seg_type(type); bool keep_order = (f2fs_lfs_mode(fio->sbi) && seg_type == CURSEG_COLD_DATA); + int err; if (keep_order) f2fs_down_read(&fio->sbi->io_order_lock); - if (f2fs_allocate_data_block(fio->sbi, folio, fio->old_blkaddr, - &fio->new_blkaddr, sum, type, fio)) { + err = f2fs_allocate_data_block(fio->sbi, folio, fio->old_blkaddr, + &fio->new_blkaddr, sum, type, fio); + if (unlikely(err)) { + f2fs_err_ratelimited(fio->sbi, + "%s Failed to allocate data block, ino:%u, index:%lu, type:%d, old_blkaddr:0x%x, new_blkaddr:0x%x, err:%d", + __func__, fio->ino, folio->index, type, + fio->old_blkaddr, fio->new_blkaddr, err); if (fscrypt_inode_uses_fs_layer_crypto(folio->mapping->host)) fscrypt_finalize_bounce_page(&fio->encrypted_page); folio_end_writeback(folio); diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 5e2ee5c686b1..1ce2c8abaf48 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -600,6 +600,16 @@ static inline int reserved_sections(struct f2fs_sb_info *sbi) return GET_SEC_FROM_SEG(sbi, reserved_segments(sbi)); } +static inline unsigned int get_left_section_blocks(struct f2fs_sb_info *sbi, + enum log_type type, unsigned int segno) +{ + if (f2fs_lfs_mode(sbi) && __is_large_section(sbi)) + return CAP_BLKS_PER_SEC(sbi) - SEGS_TO_BLKS(sbi, + (segno - GET_START_SEG_FROM_SEC(sbi, segno))) - + CURSEG_I(sbi, type)->next_blkoff; + return CAP_BLKS_PER_SEC(sbi) - get_ckpt_valid_blocks(sbi, segno, true); +} + static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi, unsigned int node_blocks, unsigned int data_blocks, unsigned int dent_blocks) @@ -614,14 +624,7 @@ static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi, if (unlikely(segno == NULL_SEGNO)) return false; - if (f2fs_lfs_mode(sbi) && __is_large_section(sbi)) { - left_blocks = CAP_BLKS_PER_SEC(sbi) - - SEGS_TO_BLKS(sbi, (segno - GET_START_SEG_FROM_SEC(sbi, segno))) - - CURSEG_I(sbi, i)->next_blkoff; - } else { - left_blocks = CAP_BLKS_PER_SEC(sbi) - - get_ckpt_valid_blocks(sbi, segno, true); - } + left_blocks = get_left_section_blocks(sbi, i, segno); blocks = i <= CURSEG_COLD_DATA ? 
data_blocks : node_blocks; if (blocks > left_blocks) @@ -634,14 +637,7 @@ static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi, if (unlikely(segno == NULL_SEGNO)) return false; - if (f2fs_lfs_mode(sbi) && __is_large_section(sbi)) { - left_blocks = CAP_BLKS_PER_SEC(sbi) - - SEGS_TO_BLKS(sbi, (segno - GET_START_SEG_FROM_SEC(sbi, segno))) - - CURSEG_I(sbi, CURSEG_HOT_DATA)->next_blkoff; - } else { - left_blocks = CAP_BLKS_PER_SEC(sbi) - - get_ckpt_valid_blocks(sbi, segno, true); - } + left_blocks = get_left_section_blocks(sbi, CURSEG_HOT_DATA, segno); if (dent_blocks > left_blocks) return false; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 2619cbbd7d2d..db7afb806411 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -143,6 +143,7 @@ enum { Opt_extent_cache, Opt_data_flush, Opt_reserve_root, + Opt_reserve_node, Opt_resgid, Opt_resuid, Opt_mode, @@ -181,6 +182,7 @@ enum { Opt_nat_bits, Opt_jqfmt, Opt_checkpoint, + Opt_lookup_mode, Opt_err, }; @@ -244,6 +246,13 @@ static const struct constant_table f2fs_param_errors[] = { {} }; +static const struct constant_table f2fs_param_lookup_mode[] = { + {"perf", LOOKUP_PERF}, + {"compat", LOOKUP_COMPAT}, + {"auto", LOOKUP_AUTO}, + {} +}; + static const struct fs_parameter_spec f2fs_param_specs[] = { fsparam_enum("background_gc", Opt_gc_background, f2fs_param_background_gc), fsparam_flag("disable_roll_forward", Opt_disable_roll_forward), @@ -265,6 +274,7 @@ static const struct fs_parameter_spec f2fs_param_specs[] = { fsparam_flag_no("extent_cache", Opt_extent_cache), fsparam_flag("data_flush", Opt_data_flush), fsparam_u32("reserve_root", Opt_reserve_root), + fsparam_u32("reserve_node", Opt_reserve_node), fsparam_gid("resgid", Opt_resgid), fsparam_uid("resuid", Opt_resuid), fsparam_enum("mode", Opt_mode, f2fs_param_mode), @@ -300,6 +310,7 @@ static const struct fs_parameter_spec f2fs_param_specs[] = { fsparam_enum("memory", Opt_memory_mode, f2fs_param_memory_mode), fsparam_flag("age_extent_cache", Opt_age_extent_cache), fsparam_enum("errors", Opt_errors, f2fs_param_errors), + fsparam_enum("lookup_mode", Opt_lookup_mode, f2fs_param_lookup_mode), {} }; @@ -336,6 +347,8 @@ static match_table_t f2fs_checkpoint_tokens = { #define F2FS_SPEC_discard_unit (1 << 21) #define F2FS_SPEC_memory_mode (1 << 22) #define F2FS_SPEC_errors (1 << 23) +#define F2FS_SPEC_lookup_mode (1 << 24) +#define F2FS_SPEC_reserve_node (1 << 25) struct f2fs_fs_context { struct f2fs_mount_info info; @@ -437,22 +450,30 @@ static void f2fs_destroy_casefold_cache(void) { } static inline void limit_reserve_root(struct f2fs_sb_info *sbi) { - block_t limit = min((sbi->user_block_count >> 3), + block_t block_limit = min((sbi->user_block_count >> 3), sbi->user_block_count - sbi->reserved_blocks); + block_t node_limit = sbi->total_node_count >> 3; /* limit is 12.5% */ if (test_opt(sbi, RESERVE_ROOT) && - F2FS_OPTION(sbi).root_reserved_blocks > limit) { - F2FS_OPTION(sbi).root_reserved_blocks = limit; + F2FS_OPTION(sbi).root_reserved_blocks > block_limit) { + F2FS_OPTION(sbi).root_reserved_blocks = block_limit; f2fs_info(sbi, "Reduce reserved blocks for root = %u", F2FS_OPTION(sbi).root_reserved_blocks); } - if (!test_opt(sbi, RESERVE_ROOT) && + if (test_opt(sbi, RESERVE_NODE) && + F2FS_OPTION(sbi).root_reserved_nodes > node_limit) { + F2FS_OPTION(sbi).root_reserved_nodes = node_limit; + f2fs_info(sbi, "Reduce reserved nodes for root = %u", + F2FS_OPTION(sbi).root_reserved_nodes); + } + if (!test_opt(sbi, RESERVE_ROOT) && !test_opt(sbi, RESERVE_NODE) && 
(!uid_eq(F2FS_OPTION(sbi).s_resuid, make_kuid(&init_user_ns, F2FS_DEF_RESUID)) || !gid_eq(F2FS_OPTION(sbi).s_resgid, make_kgid(&init_user_ns, F2FS_DEF_RESGID)))) - f2fs_info(sbi, "Ignore s_resuid=%u, s_resgid=%u w/o reserve_root", + f2fs_info(sbi, "Ignore s_resuid=%u, s_resgid=%u w/o reserve_root" + " and reserve_node", from_kuid_munged(&init_user_ns, F2FS_OPTION(sbi).s_resuid), from_kgid_munged(&init_user_ns, @@ -847,6 +868,11 @@ static int f2fs_parse_param(struct fs_context *fc, struct fs_parameter *param) F2FS_CTX_INFO(ctx).root_reserved_blocks = result.uint_32; ctx->spec_mask |= F2FS_SPEC_reserve_root; break; + case Opt_reserve_node: + ctx_set_opt(ctx, F2FS_MOUNT_RESERVE_NODE); + F2FS_CTX_INFO(ctx).root_reserved_nodes = result.uint_32; + ctx->spec_mask |= F2FS_SPEC_reserve_node; + break; case Opt_resuid: F2FS_CTX_INFO(ctx).s_resuid = result.uid; ctx->spec_mask |= F2FS_SPEC_resuid; @@ -994,6 +1020,10 @@ static int f2fs_parse_param(struct fs_context *fc, struct fs_parameter *param) ctx_set_opt(ctx, F2FS_MOUNT_DISABLE_CHECKPOINT); break; case Opt_checkpoint_enable: + F2FS_CTX_INFO(ctx).unusable_cap_perc = 0; + ctx->spec_mask |= F2FS_SPEC_checkpoint_disable_cap_perc; + F2FS_CTX_INFO(ctx).unusable_cap = 0; + ctx->spec_mask |= F2FS_SPEC_checkpoint_disable_cap; ctx_clear_opt(ctx, F2FS_MOUNT_DISABLE_CHECKPOINT); break; default: @@ -1149,6 +1179,10 @@ static int f2fs_parse_param(struct fs_context *fc, struct fs_parameter *param) case Opt_nat_bits: ctx_set_opt(ctx, F2FS_MOUNT_NAT_BITS); break; + case Opt_lookup_mode: + F2FS_CTX_INFO(ctx).lookup_mode = result.uint_32; + ctx->spec_mask |= F2FS_SPEC_lookup_mode; + break; } return 0; } @@ -1191,7 +1225,11 @@ static int f2fs_check_quota_consistency(struct fs_context *fc, goto err_jquota_change; if (old_qname) { - if (strcmp(old_qname, new_qname) == 0) { + if (!new_qname) { + f2fs_info(sbi, "remove qf_name %s", + old_qname); + continue; + } else if (strcmp(old_qname, new_qname) == 0) { ctx->qname_mask &= ~(1 << i); continue; } @@ -1430,6 +1468,14 @@ static int f2fs_check_opt_consistency(struct fs_context *fc, ctx_clear_opt(ctx, F2FS_MOUNT_RESERVE_ROOT); ctx->opt_mask &= ~F2FS_MOUNT_RESERVE_ROOT; } + if (test_opt(sbi, RESERVE_NODE) && + (ctx->opt_mask & F2FS_MOUNT_RESERVE_NODE) && + ctx_test_opt(ctx, F2FS_MOUNT_RESERVE_NODE)) { + f2fs_info(sbi, "Preserve previous reserve_node=%u", + F2FS_OPTION(sbi).root_reserved_nodes); + ctx_clear_opt(ctx, F2FS_MOUNT_RESERVE_NODE); + ctx->opt_mask &= ~F2FS_MOUNT_RESERVE_NODE; + } err = f2fs_check_test_dummy_encryption(fc, sb); if (err) @@ -1629,6 +1675,9 @@ static void f2fs_apply_options(struct fs_context *fc, struct super_block *sb) if (ctx->spec_mask & F2FS_SPEC_reserve_root) F2FS_OPTION(sbi).root_reserved_blocks = F2FS_CTX_INFO(ctx).root_reserved_blocks; + if (ctx->spec_mask & F2FS_SPEC_reserve_node) + F2FS_OPTION(sbi).root_reserved_nodes = + F2FS_CTX_INFO(ctx).root_reserved_nodes; if (ctx->spec_mask & F2FS_SPEC_resgid) F2FS_OPTION(sbi).s_resgid = F2FS_CTX_INFO(ctx).s_resgid; if (ctx->spec_mask & F2FS_SPEC_resuid) @@ -1658,6 +1707,8 @@ static void f2fs_apply_options(struct fs_context *fc, struct super_block *sb) F2FS_OPTION(sbi).memory_mode = F2FS_CTX_INFO(ctx).memory_mode; if (ctx->spec_mask & F2FS_SPEC_errors) F2FS_OPTION(sbi).errors = F2FS_CTX_INFO(ctx).errors; + if (ctx->spec_mask & F2FS_SPEC_lookup_mode) + F2FS_OPTION(sbi).lookup_mode = F2FS_CTX_INFO(ctx).lookup_mode; f2fs_apply_compression(fc, sb); f2fs_apply_test_dummy_encryption(fc, sb); @@ -1769,7 +1820,7 @@ static int f2fs_drop_inode(struct inode *inode) 
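limit_reserve_root() caps the new reserve_node option exactly as it caps reserve_root: the reservation may never exceed 12.5% of the pool, computed as a right shift by three. The arithmetic as a standalone sketch:

#include <stdio.h>

/* cap a root reservation at 12.5% of the pool, as limit_reserve_root() does */
static unsigned int clamp_reservation(unsigned int requested, unsigned int pool)
{
	unsigned int limit = pool >> 3;	/* pool / 8 == 12.5% */

	return requested > limit ? limit : requested;
}

int main(void)
{
	/* e.g. 1,000,000 total nodes -> at most 125,000 reservable by root */
	printf("%u\n", clamp_reservation(200000, 1000000));	/* 125000 */
	printf("%u\n", clamp_reservation(50000, 1000000));	/* 50000 */
	return 0;
}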
sb_end_intwrite(inode->i_sb); spin_lock(&inode->i_lock); - iput(inode); + atomic_dec(&inode->i_count); } trace_f2fs_drop_inode(inode, 0); return 0; @@ -2349,9 +2400,11 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) else if (F2FS_OPTION(sbi).fs_mode == FS_MODE_FRAGMENT_BLK) seq_puts(seq, "fragment:block"); seq_printf(seq, ",active_logs=%u", F2FS_OPTION(sbi).active_logs); - if (test_opt(sbi, RESERVE_ROOT)) - seq_printf(seq, ",reserve_root=%u,resuid=%u,resgid=%u", + if (test_opt(sbi, RESERVE_ROOT) || test_opt(sbi, RESERVE_NODE)) + seq_printf(seq, ",reserve_root=%u,reserve_node=%u,resuid=%u," + "resgid=%u", F2FS_OPTION(sbi).root_reserved_blocks, + F2FS_OPTION(sbi).root_reserved_nodes, from_kuid_munged(&init_user_ns, F2FS_OPTION(sbi).s_resuid), from_kgid_munged(&init_user_ns, @@ -2422,6 +2475,13 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) if (test_opt(sbi, NAT_BITS)) seq_puts(seq, ",nat_bits"); + if (F2FS_OPTION(sbi).lookup_mode == LOOKUP_PERF) + seq_show_option(seq, "lookup_mode", "perf"); + else if (F2FS_OPTION(sbi).lookup_mode == LOOKUP_COMPAT) + seq_show_option(seq, "lookup_mode", "compat"); + else if (F2FS_OPTION(sbi).lookup_mode == LOOKUP_AUTO) + seq_show_option(seq, "lookup_mode", "auto"); + return 0; } @@ -2486,6 +2546,8 @@ static void default_options(struct f2fs_sb_info *sbi, bool remount) #endif f2fs_build_fault_attr(sbi, 0, 0, FAULT_ALL); + + F2FS_OPTION(sbi).lookup_mode = LOOKUP_PERF; } #ifdef CONFIG_QUOTA @@ -2566,21 +2628,39 @@ out_unlock: restore_flag: sbi->gc_mode = gc_mode; sbi->sb->s_flags = s_flags; /* Restore SB_RDONLY status */ + f2fs_info(sbi, "f2fs_disable_checkpoint() finish, err:%d", err); return err; } static void f2fs_enable_checkpoint(struct f2fs_sb_info *sbi) { - int retry = DEFAULT_RETRY_IO_COUNT; + unsigned int nr_pages = get_pages(sbi, F2FS_DIRTY_DATA) / 16; + long long start, writeback, end; + + f2fs_info(sbi, "f2fs_enable_checkpoint() starts, meta: %lld, node: %lld, data: %lld", + get_pages(sbi, F2FS_DIRTY_META), + get_pages(sbi, F2FS_DIRTY_NODES), + get_pages(sbi, F2FS_DIRTY_DATA)); + + f2fs_update_time(sbi, ENABLE_TIME); + + start = ktime_get(); /* we should flush all the data to keep data consistency */ - do { - sync_inodes_sb(sbi->sb); + while (get_pages(sbi, F2FS_DIRTY_DATA)) { + writeback_inodes_sb_nr(sbi->sb, nr_pages, WB_REASON_SYNC); f2fs_io_schedule_timeout(DEFAULT_IO_TIMEOUT); - } while (get_pages(sbi, F2FS_DIRTY_DATA) && retry--); - if (unlikely(retry < 0)) - f2fs_warn(sbi, "checkpoint=enable has some unwritten data."); + if (f2fs_time_over(sbi, ENABLE_TIME)) + break; + } + writeback = ktime_get(); + + sync_inodes_sb(sbi->sb); + + if (unlikely(get_pages(sbi, F2FS_DIRTY_DATA))) + f2fs_warn(sbi, "checkpoint=enable has some unwritten data: %lld", + get_pages(sbi, F2FS_DIRTY_DATA)); f2fs_down_write(&sbi->gc_lock); f2fs_dirty_to_prefree(sbi); @@ -2593,6 +2673,12 @@ static void f2fs_enable_checkpoint(struct f2fs_sb_info *sbi) /* Let's ensure there's no pending checkpoint anymore */ f2fs_flush_ckpt_thread(sbi); + + end = ktime_get(); + + f2fs_info(sbi, "f2fs_enable_checkpoint() finishes, writeback:%llu, sync:%llu", + ktime_ms_delta(writeback, start), + ktime_ms_delta(end, writeback)); } static int __f2fs_remount(struct fs_context *fc, struct super_block *sb) @@ -4156,6 +4242,8 @@ static void init_sb_info(struct f2fs_sb_info *sbi) sbi->total_node_count = SEGS_TO_BLKS(sbi, ((le32_to_cpu(raw_super->segment_count_nat) / 2) * NAT_ENTRY_PER_BLOCK)); + sbi->allocate_section_hint = 
le32_to_cpu(raw_super->section_count); + sbi->allocate_section_policy = ALLOCATE_FORWARD_NOHINT; F2FS_ROOT_INO(sbi) = le32_to_cpu(raw_super->root_ino); F2FS_NODE_INO(sbi) = le32_to_cpu(raw_super->node_ino); F2FS_META_INO(sbi) = le32_to_cpu(raw_super->meta_ino); @@ -4179,6 +4267,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi) sbi->interval_time[DISCARD_TIME] = DEF_IDLE_INTERVAL; sbi->interval_time[GC_TIME] = DEF_IDLE_INTERVAL; sbi->interval_time[DISABLE_TIME] = DEF_DISABLE_INTERVAL; + sbi->interval_time[ENABLE_TIME] = DEF_ENABLE_INTERVAL; sbi->interval_time[UMOUNT_DISCARD_TIMEOUT] = DEF_UMOUNT_DISCARD_TIMEOUT; clear_sbi_flag(sbi, SBI_NEED_FSCK); @@ -4637,9 +4726,11 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) logical_blksize = bdev_logical_block_size(sbi->sb->s_bdev); sbi->aligned_blksize = true; + sbi->bggc_io_aware = AWARE_ALL_IO; #ifdef CONFIG_BLK_DEV_ZONED sbi->max_open_zones = UINT_MAX; sbi->blkzone_alloc_policy = BLKZONE_ALLOC_PRIOR_SEQ; + sbi->bggc_io_aware = AWARE_READ_IO; #endif for (i = 0; i < max_devices; i++) { @@ -4667,6 +4758,8 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) SEGS_TO_BLKS(sbi, FDEV(i).total_segments) - 1 + le32_to_cpu(raw_super->segment0_blkaddr); + sbi->allocate_section_hint = FDEV(i).total_segments / + SEGS_PER_SEC(sbi); } else { FDEV(i).start_blk = FDEV(i - 1).end_blk + 1; FDEV(i).end_blk = FDEV(i).start_blk + diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index f736052dea50..6d2a4fba68a2 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -281,6 +281,22 @@ static ssize_t encoding_flags_show(struct f2fs_attr *a, le16_to_cpu(F2FS_RAW_SUPER(sbi)->s_encoding_flags)); } +static ssize_t effective_lookup_mode_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + switch (F2FS_OPTION(sbi).lookup_mode) { + case LOOKUP_PERF: + return sysfs_emit(buf, "perf\n"); + case LOOKUP_COMPAT: + return sysfs_emit(buf, "compat\n"); + case LOOKUP_AUTO: + if (sb_no_casefold_compat_fallback(sbi->sb)) + return sysfs_emit(buf, "auto:perf\n"); + return sysfs_emit(buf, "auto:compat\n"); + } + return 0; +} + static ssize_t mounted_time_sec_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { @@ -866,6 +882,27 @@ out: return count; } + if (!strcmp(a->attr.name, "bggc_io_aware")) { + if (t < AWARE_ALL_IO || t > AWARE_NONE) + return -EINVAL; + sbi->bggc_io_aware = t; + return count; + } + + if (!strcmp(a->attr.name, "allocate_section_hint")) { + if (t < 0 || t > MAIN_SECS(sbi)) + return -EINVAL; + sbi->allocate_section_hint = t; + return count; + } + + if (!strcmp(a->attr.name, "allocate_section_policy")) { + if (t < ALLOCATE_FORWARD_NOHINT || t > ALLOCATE_FORWARD_FROM_HINT) + return -EINVAL; + sbi->allocate_section_policy = t; + return count; + } + *ui = (unsigned int)t; return count; @@ -1138,6 +1175,8 @@ F2FS_SBI_GENERAL_RW_ATTR(max_victim_search); F2FS_SBI_GENERAL_RW_ATTR(migration_granularity); F2FS_SBI_GENERAL_RW_ATTR(migration_window_granularity); F2FS_SBI_GENERAL_RW_ATTR(dir_level); +F2FS_SBI_GENERAL_RW_ATTR(allocate_section_hint); +F2FS_SBI_GENERAL_RW_ATTR(allocate_section_policy); #ifdef CONFIG_F2FS_IOSTAT F2FS_SBI_GENERAL_RW_ATTR(iostat_enable); F2FS_SBI_GENERAL_RW_ATTR(iostat_period_ms); @@ -1175,6 +1214,7 @@ F2FS_SBI_GENERAL_RW_ATTR(blkzone_alloc_policy); #endif F2FS_SBI_GENERAL_RW_ATTR(carve_out); F2FS_SBI_GENERAL_RW_ATTR(reserved_pin_section); +F2FS_SBI_GENERAL_RW_ATTR(bggc_io_aware); /* STAT_INFO ATTR */ #ifdef CONFIG_F2FS_STAT_FS @@ -1211,6 +1251,7 @@ F2FS_GENERAL_RO_ATTR(current_reserved_blocks); 
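Each of the new sysfs knobs range-checks the written integer against its enum before storing it and rejects anything else with -EINVAL. A compact userspace illustration of that store-side validation (a stand-in, not the sysfs code itself):

#include <errno.h>
#include <stdio.h>

enum io_aware { AWARE_ALL_IO, AWARE_READ_IO, AWARE_NONE };

/* 0 on success, -EINVAL if the value falls outside the enum's range */
static int store_io_aware(long t, enum io_aware *out)
{
	if (t < AWARE_ALL_IO || t > AWARE_NONE)
		return -EINVAL;
	*out = (enum io_aware)t;
	return 0;
}

int main(void)
{
	enum io_aware v = AWARE_ALL_IO;

	printf("%d\n", store_io_aware(1, &v));	/* 0, v == AWARE_READ_IO */
	printf("%d\n", store_io_aware(7, &v));	/* -EINVAL, v unchanged */
	return 0;
}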
F2FS_GENERAL_RO_ATTR(unusable); F2FS_GENERAL_RO_ATTR(encoding); F2FS_GENERAL_RO_ATTR(encoding_flags); +F2FS_GENERAL_RO_ATTR(effective_lookup_mode); F2FS_GENERAL_RO_ATTR(mounted_time_sec); F2FS_GENERAL_RO_ATTR(main_blkaddr); F2FS_GENERAL_RO_ATTR(pending_discard); @@ -1303,6 +1344,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(discard_idle_interval), ATTR_LIST(gc_idle_interval), ATTR_LIST(umount_discard_timeout), + ATTR_LIST(bggc_io_aware), #ifdef CONFIG_F2FS_IOSTAT ATTR_LIST(iostat_enable), ATTR_LIST(iostat_period_ms), @@ -1329,6 +1371,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(current_reserved_blocks), ATTR_LIST(encoding), ATTR_LIST(encoding_flags), + ATTR_LIST(effective_lookup_mode), ATTR_LIST(mounted_time_sec), #ifdef CONFIG_F2FS_STAT_FS ATTR_LIST(cp_foreground_calls), @@ -1371,6 +1414,8 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(max_read_extent_count), ATTR_LIST(carve_out), ATTR_LIST(reserved_pin_section), + ATTR_LIST(allocate_section_hint), + ATTR_LIST(allocate_section_policy), NULL, }; ATTRIBUTE_GROUPS(f2fs); @@ -1723,12 +1768,15 @@ static int __maybe_unused disk_map_seq_show(struct seq_file *seq, seq_printf(seq, " Main : 0x%010x (%10d)\n", SM_I(sbi)->main_blkaddr, le32_to_cpu(F2FS_RAW_SUPER(sbi)->segment_count_main)); - seq_printf(seq, " # of Sections : %12d\n", - le32_to_cpu(F2FS_RAW_SUPER(sbi)->section_count)); + seq_printf(seq, " Block size : %12lu KB\n", F2FS_BLKSIZE >> 10); + seq_printf(seq, " Segment size : %12d MB\n", + (BLKS_PER_SEG(sbi) << (F2FS_BLKSIZE_BITS - 10)) >> 10); seq_printf(seq, " Segs/Sections : %12d\n", SEGS_PER_SEC(sbi)); seq_printf(seq, " Section size : %12d MB\n", - SEGS_PER_SEC(sbi) << 1); + (BLKS_PER_SEC(sbi) << (F2FS_BLKSIZE_BITS - 10)) >> 10); + seq_printf(seq, " # of Sections : %12d\n", + le32_to_cpu(F2FS_RAW_SUPER(sbi)->section_count)); if (!f2fs_is_multi_device(sbi)) return 0; @@ -1742,6 +1790,69 @@ static int __maybe_unused disk_map_seq_show(struct seq_file *seq, return 0; } +static int __maybe_unused donation_list_seq_show(struct seq_file *seq, + void *offset) +{ + struct super_block *sb = seq->private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct inode *inode; + struct f2fs_inode_info *fi; + struct dentry *dentry; + char *buf, *path; + int i; + + buf = f2fs_getname(sbi); + if (!buf) + return 0; + + seq_printf(seq, "Donation List\n"); + seq_printf(seq, " # of files : %u\n", sbi->donate_files); + seq_printf(seq, " %-50s %10s %20s %20s %22s\n", + "File path", "Status", "Donation offset (kb)", + "Donation size (kb)", "File cached size (kb)"); + seq_printf(seq, "---\n"); + + for (i = 0; i < sbi->donate_files; i++) { + spin_lock(&sbi->inode_lock[DONATE_INODE]); + if (list_empty(&sbi->inode_list[DONATE_INODE])) { + spin_unlock(&sbi->inode_lock[DONATE_INODE]); + break; + } + fi = list_first_entry(&sbi->inode_list[DONATE_INODE], + struct f2fs_inode_info, gdonate_list); + list_move_tail(&fi->gdonate_list, &sbi->inode_list[DONATE_INODE]); + inode = igrab(&fi->vfs_inode); + spin_unlock(&sbi->inode_lock[DONATE_INODE]); + + if (!inode) + continue; + + inode_lock_shared(inode); + + dentry = d_find_alias(inode); + if (!dentry) { + path = NULL; + } else { + path = dentry_path_raw(dentry, buf, PATH_MAX); + if (IS_ERR(path)) + goto next; + } + seq_printf(seq, " %-50s %10s %20llu %20llu %22llu\n", + path ? path : "<unlinked>", + is_inode_flag_set(inode, FI_DONATE_FINISHED) ? 
+ "Evicted" : "Donated", + (loff_t)fi->donate_start << (PAGE_SHIFT - 10), + (loff_t)(fi->donate_end + 1) << (PAGE_SHIFT - 10), + (loff_t)inode->i_mapping->nrpages << (PAGE_SHIFT - 10)); +next: + dput(dentry); + inode_unlock_shared(inode); + iput(inode); + } + f2fs_putname(buf); + return 0; +} + #ifdef CONFIG_F2FS_FAULT_INJECTION static int __maybe_unused inject_stats_seq_show(struct seq_file *seq, void *offset) @@ -1851,6 +1962,8 @@ int f2fs_register_sysfs(struct f2fs_sb_info *sbi) discard_plist_seq_show, sb); proc_create_single_data("disk_map", 0444, sbi->s_proc, disk_map_seq_show, sb); + proc_create_single_data("donation_list", 0444, sbi->s_proc, + donation_list_seq_show, sb); #ifdef CONFIG_F2FS_FAULT_INJECTION proc_create_single_data("inject_stats", 0444, sbi->s_proc, inject_stats_seq_show, sb); diff --git a/fs/file_attr.c b/fs/file_attr.c index 12424d4945d0..1dcec88c0680 100644 --- a/fs/file_attr.c +++ b/fs/file_attr.c @@ -84,7 +84,7 @@ int vfs_fileattr_get(struct dentry *dentry, struct file_kattr *fa) int error; if (!inode->i_op->fileattr_get) - return -EOPNOTSUPP; + return -ENOIOCTLCMD; error = security_inode_file_getattr(dentry, fa); if (error) @@ -270,7 +270,7 @@ int vfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, int err; if (!inode->i_op->fileattr_set) - return -EOPNOTSUPP; + return -ENOIOCTLCMD; if (!inode_owner_or_capable(idmap, inode)) return -EPERM; @@ -312,8 +312,6 @@ int ioctl_getflags(struct file *file, unsigned int __user *argp) int err; err = vfs_fileattr_get(file->f_path.dentry, &fa); - if (err == -EOPNOTSUPP) - err = -ENOIOCTLCMD; if (!err) err = put_user(fa.flags, argp); return err; @@ -335,8 +333,6 @@ int ioctl_setflags(struct file *file, unsigned int __user *argp) fileattr_fill_flags(&fa, flags); err = vfs_fileattr_set(idmap, dentry, &fa); mnt_drop_write_file(file); - if (err == -EOPNOTSUPP) - err = -ENOIOCTLCMD; } } return err; @@ -349,8 +345,6 @@ int ioctl_fsgetxattr(struct file *file, void __user *argp) int err; err = vfs_fileattr_get(file->f_path.dentry, &fa); - if (err == -EOPNOTSUPP) - err = -ENOIOCTLCMD; if (!err) err = copy_fsxattr_to_user(&fa, argp); @@ -371,8 +365,6 @@ int ioctl_fssetxattr(struct file *file, void __user *argp) if (!err) { err = vfs_fileattr_set(idmap, dentry, &fa); mnt_drop_write_file(file); - if (err == -EOPNOTSUPP) - err = -ENOIOCTLCMD; } } return err; @@ -424,6 +416,8 @@ SYSCALL_DEFINE5(file_getattr, int, dfd, const char __user *, filename, } error = vfs_fileattr_get(filepath.dentry, &fa); + if (error == -ENOIOCTLCMD || error == -ENOTTY) + error = -EOPNOTSUPP; if (error) return error; @@ -491,6 +485,8 @@ SYSCALL_DEFINE5(file_setattr, int, dfd, const char __user *, filename, if (!error) { error = vfs_fileattr_set(mnt_idmap(filepath.mnt), filepath.dentry, &fa); + if (error == -ENOIOCTLCMD || error == -ENOTTY) + error = -EOPNOTSUPP; mnt_drop_write(filepath.mnt); } diff --git a/fs/file_table.c b/fs/file_table.c index 81c72576e548..cd4a3db4659a 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -54,7 +54,7 @@ struct backing_file { #define backing_file(f) container_of(f, struct backing_file, file) -struct path *backing_file_user_path(const struct file *f) +const struct path *backing_file_user_path(const struct file *f) { return &backing_file(f)->user_path; } @@ -171,7 +171,7 @@ static int init_file(struct file *f, int flags, const struct cred *cred) * the respective member when opening the file. 
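The fileattr changes move errno translation outward: vfs_fileattr_get()/vfs_fileattr_set() now return -ENOIOCTLCMD directly, the ioctl helpers pass it through unchanged, and only the file_getattr()/file_setattr() syscalls map it (and -ENOTTY) to the userspace-visible -EOPNOTSUPP. Sketched as a plain function; ENOIOCTLCMD is kernel-internal, so it is defined locally here:

#include <errno.h>
#include <stdio.h>

#define ENOIOCTLCMD 515	/* kernel-internal errno, never shown to userspace */

/* map "no handler" errors to a userspace-visible errno, as the
 * file_getattr()/file_setattr() syscalls now do */
static int syscall_fixup(int error)
{
	if (error == -ENOIOCTLCMD || error == -ENOTTY)
		return -EOPNOTSUPP;
	return error;
}

int main(void)
{
	printf("%d\n", syscall_fixup(-ENOIOCTLCMD));	/* -EOPNOTSUPP */
	printf("%d\n", syscall_fixup(-ENOENT));	/* unrelated errors pass through */
	return 0;
}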
*/ mutex_init(&f->f_pos_lock); - memset(&f->f_path, 0, sizeof(f->f_path)); + memset(&f->__f_path, 0, sizeof(f->f_path)); memset(&f->f_ra, 0, sizeof(f->f_ra)); f->f_flags = flags; @@ -192,7 +192,7 @@ static int init_file(struct file *f, int flags, const struct cred *cred) f->f_sb_err = 0; /* - * We're SLAB_TYPESAFE_BY_RCU so initialize f_count last. While + * We're SLAB_TYPESAFE_BY_RCU so initialize f_ref last. While * fget-rcu pattern users need to be able to handle spurious * refcount bumps we should reinitialize the reused file first. */ @@ -319,7 +319,7 @@ struct file *alloc_empty_backing_file(int flags, const struct cred *cred) static void file_init_path(struct file *file, const struct path *path, const struct file_operations *fop) { - file->f_path = *path; + file->__f_path = *path; file->f_inode = path->dentry->d_inode; file->f_mapping = path->dentry->d_inode->i_mapping; file->f_wb_err = filemap_sample_wb_err(file->f_mapping); diff --git a/fs/fs_context.c b/fs/fs_context.c index 666e61753aed..93b7ebf8d927 100644 --- a/fs/fs_context.c +++ b/fs/fs_context.c @@ -161,25 +161,24 @@ int vfs_parse_fs_param(struct fs_context *fc, struct fs_parameter *param) EXPORT_SYMBOL(vfs_parse_fs_param); /** - * vfs_parse_fs_string - Convenience function to just parse a string. + * vfs_parse_fs_qstr - Convenience function to just parse a string. * @fc: Filesystem context. * @key: Parameter name. * @value: Default value. - * @v_size: Maximum number of bytes in the value. */ -int vfs_parse_fs_string(struct fs_context *fc, const char *key, - const char *value, size_t v_size) +int vfs_parse_fs_qstr(struct fs_context *fc, const char *key, + const struct qstr *value) { int ret; struct fs_parameter param = { .key = key, .type = fs_value_is_flag, - .size = v_size, + .size = value ? value->len : 0, }; if (value) { - param.string = kmemdup_nul(value, v_size, GFP_KERNEL); + param.string = kmemdup_nul(value->name, value->len, GFP_KERNEL); if (!param.string) return -ENOMEM; param.type = fs_value_is_string; @@ -189,7 +188,7 @@ int vfs_parse_fs_string(struct fs_context *fc, const char *key, kfree(param.string); return ret; } -EXPORT_SYMBOL(vfs_parse_fs_string); +EXPORT_SYMBOL(vfs_parse_fs_qstr); /** * vfs_parse_monolithic_sep - Parse key[=val][,key[=val]]* mount data @@ -218,16 +217,14 @@ int vfs_parse_monolithic_sep(struct fs_context *fc, void *data, while ((key = sep(&options)) != NULL) { if (*key) { - size_t v_len = 0; char *value = strchr(key, '='); if (value) { if (unlikely(value == key)) continue; *value++ = 0; - v_len = strlen(value); } - ret = vfs_parse_fs_string(fc, key, value, v_len); + ret = vfs_parse_fs_string(fc, key, value); if (ret < 0) break; } diff --git a/fs/fuse/Kconfig b/fs/fuse/Kconfig index a774166264de..3a4ae632c94a 100644 --- a/fs/fuse/Kconfig +++ b/fs/fuse/Kconfig @@ -13,7 +13,7 @@ config FUSE_FS although chances are your distribution already has that library installed if you've installed the "fuse" package itself. - See <file:Documentation/filesystems/fuse.rst> for more information. + See <file:Documentation/filesystems/fuse/fuse.rst> for more information. See <file:Documentation/Changes> for needed library/utility version. 
If you want to develop a userspace FS, or if you want to use diff --git a/fs/fuse/Makefile b/fs/fuse/Makefile index 3f0f312a31c1..22ad9538dfc4 100644 --- a/fs/fuse/Makefile +++ b/fs/fuse/Makefile @@ -10,10 +10,11 @@ obj-$(CONFIG_FUSE_FS) += fuse.o obj-$(CONFIG_CUSE) += cuse.o obj-$(CONFIG_VIRTIO_FS) += virtiofs.o -fuse-y := dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o ioctl.o +fuse-y := trace.o # put trace.o first so we see ftrace errors sooner +fuse-y += dev.o dir.o file.o inode.o control.o xattr.o acl.o readdir.o ioctl.o fuse-y += iomode.o fuse-$(CONFIG_FUSE_DAX) += dax.o -fuse-$(CONFIG_FUSE_PASSTHROUGH) += passthrough.o +fuse-$(CONFIG_FUSE_PASSTHROUGH) += passthrough.o backing.o fuse-$(CONFIG_SYSCTL) += sysctl.o fuse-$(CONFIG_FUSE_IO_URING) += dev_uring.o diff --git a/fs/fuse/backing.c b/fs/fuse/backing.c new file mode 100644 index 000000000000..4afda419dd14 --- /dev/null +++ b/fs/fuse/backing.c @@ -0,0 +1,179 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * FUSE passthrough to backing file. + * + * Copyright (c) 2023 CTERA Networks. + */ + +#include "fuse_i.h" + +#include <linux/file.h> + +struct fuse_backing *fuse_backing_get(struct fuse_backing *fb) +{ + if (fb && refcount_inc_not_zero(&fb->count)) + return fb; + return NULL; +} + +static void fuse_backing_free(struct fuse_backing *fb) +{ + pr_debug("%s: fb=0x%p\n", __func__, fb); + + if (fb->file) + fput(fb->file); + put_cred(fb->cred); + kfree_rcu(fb, rcu); +} + +void fuse_backing_put(struct fuse_backing *fb) +{ + if (fb && refcount_dec_and_test(&fb->count)) + fuse_backing_free(fb); +} + +void fuse_backing_files_init(struct fuse_conn *fc) +{ + idr_init(&fc->backing_files_map); +} + +static int fuse_backing_id_alloc(struct fuse_conn *fc, struct fuse_backing *fb) +{ + int id; + + idr_preload(GFP_KERNEL); + spin_lock(&fc->lock); + /* FIXME: xarray might be space inefficient */ + id = idr_alloc_cyclic(&fc->backing_files_map, fb, 1, 0, GFP_ATOMIC); + spin_unlock(&fc->lock); + idr_preload_end(); + + WARN_ON_ONCE(id == 0); + return id; +} + +static struct fuse_backing *fuse_backing_id_remove(struct fuse_conn *fc, + int id) +{ + struct fuse_backing *fb; + + spin_lock(&fc->lock); + fb = idr_remove(&fc->backing_files_map, id); + spin_unlock(&fc->lock); + + return fb; +} + +static int fuse_backing_id_free(int id, void *p, void *data) +{ + struct fuse_backing *fb = p; + + WARN_ON_ONCE(refcount_read(&fb->count) != 1); + fuse_backing_free(fb); + return 0; +} + +void fuse_backing_files_free(struct fuse_conn *fc) +{ + idr_for_each(&fc->backing_files_map, fuse_backing_id_free, NULL); + idr_destroy(&fc->backing_files_map); +} + +int fuse_backing_open(struct fuse_conn *fc, struct fuse_backing_map *map) +{ + struct file *file; + struct super_block *backing_sb; + struct fuse_backing *fb = NULL; + int res; + + pr_debug("%s: fd=%d flags=0x%x\n", __func__, map->fd, map->flags); + + /* TODO: relax CAP_SYS_ADMIN once backing files are visible to lsof */ + res = -EPERM; + if (!fc->passthrough || !capable(CAP_SYS_ADMIN)) + goto out; + + res = -EINVAL; + if (map->flags || map->padding) + goto out; + + file = fget_raw(map->fd); + res = -EBADF; + if (!file) + goto out; + + /* read/write/splice/mmap passthrough only relevant for regular files */ + res = d_is_dir(file->f_path.dentry) ? 
-EISDIR : -EINVAL; + if (!d_is_reg(file->f_path.dentry)) + goto out_fput; + + backing_sb = file_inode(file)->i_sb; + res = -ELOOP; + if (backing_sb->s_stack_depth >= fc->max_stack_depth) + goto out_fput; + + fb = kmalloc(sizeof(struct fuse_backing), GFP_KERNEL); + res = -ENOMEM; + if (!fb) + goto out_fput; + + fb->file = file; + fb->cred = prepare_creds(); + refcount_set(&fb->count, 1); + + res = fuse_backing_id_alloc(fc, fb); + if (res < 0) { + fuse_backing_free(fb); + fb = NULL; + } + +out: + pr_debug("%s: fb=0x%p, ret=%i\n", __func__, fb, res); + + return res; + +out_fput: + fput(file); + goto out; +} + +int fuse_backing_close(struct fuse_conn *fc, int backing_id) +{ + struct fuse_backing *fb = NULL; + int err; + + pr_debug("%s: backing_id=%d\n", __func__, backing_id); + + /* TODO: relax CAP_SYS_ADMIN once backing files are visible to lsof */ + err = -EPERM; + if (!fc->passthrough || !capable(CAP_SYS_ADMIN)) + goto out; + + err = -EINVAL; + if (backing_id <= 0) + goto out; + + err = -ENOENT; + fb = fuse_backing_id_remove(fc, backing_id); + if (!fb) + goto out; + + fuse_backing_put(fb); + err = 0; +out: + pr_debug("%s: fb=0x%p, err=%i\n", __func__, fb, err); + + return err; +} + +struct fuse_backing *fuse_backing_lookup(struct fuse_conn *fc, int backing_id) +{ + struct fuse_backing *fb; + + rcu_read_lock(); + fb = idr_find(&fc->backing_files_map, backing_id); + fb = fuse_backing_get(fb); + rcu_read_unlock(); + + return fb; +} diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index b39844d75a80..28c96961e85d 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -52,6 +52,7 @@ #include <linux/user_namespace.h> #include "fuse_i.h" +#include "fuse_dev_i.h" #define CUSE_CONNTBL_LEN 64 @@ -547,7 +548,7 @@ static int cuse_channel_open(struct inode *inode, struct file *file) */ static int cuse_channel_release(struct inode *inode, struct file *file) { - struct fuse_dev *fud = file->private_data; + struct fuse_dev *fud = __fuse_get_dev(file); struct cuse_conn *cc = fc_to_cc(fud->fc); /* remove from the conntbl, no more access from this point on */ diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index ad8645c0f9fe..132f38619d70 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -25,7 +25,6 @@ #include <linux/sched.h> #include <linux/seq_file.h> -#define CREATE_TRACE_POINTS #include "fuse_trace.h" MODULE_ALIAS_MISCDEV(FUSE_MINOR); @@ -207,8 +206,9 @@ static struct fuse_req *fuse_get_req(struct mnt_idmap *idmap, if (fuse_block_alloc(fc, for_background)) { err = -EINTR; - if (wait_event_killable_exclusive(fc->blocked_waitq, - !fuse_block_alloc(fc, for_background))) + if (wait_event_state_exclusive(fc->blocked_waitq, + !fuse_block_alloc(fc, for_background), + (TASK_KILLABLE | TASK_FREEZABLE))) goto out; } /* Matches smp_wmb() in fuse_set_initialized() */ @@ -322,6 +322,7 @@ unsigned int fuse_req_hash(u64 unique) { return hash_long(unique & ~FUSE_INT_REQ_BIT, FUSE_PQ_HASH_BITS); } +EXPORT_SYMBOL_GPL(fuse_req_hash); /* * A new request is available, wake fiq->waitq @@ -369,12 +370,32 @@ void fuse_dev_queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req) } } +static inline void fuse_request_assign_unique_locked(struct fuse_iqueue *fiq, + struct fuse_req *req) +{ + if (req->in.h.opcode != FUSE_NOTIFY_REPLY) + req->in.h.unique = fuse_get_unique_locked(fiq); + + /* tracepoint captures in.h.unique and in.h.len */ + trace_fuse_request_send(req); +} + +inline void fuse_request_assign_unique(struct fuse_iqueue *fiq, + struct fuse_req *req) +{ + if (req->in.h.opcode != FUSE_NOTIFY_REPLY) + req->in.h.unique = 
fuse_get_unique(fiq); + + /* tracepoint captures in.h.unique and in.h.len */ + trace_fuse_request_send(req); +} +EXPORT_SYMBOL_GPL(fuse_request_assign_unique); + static void fuse_dev_queue_req(struct fuse_iqueue *fiq, struct fuse_req *req) { spin_lock(&fiq->lock); if (fiq->connected) { - if (req->in.h.opcode != FUSE_NOTIFY_REPLY) - req->in.h.unique = fuse_get_unique_locked(fiq); + fuse_request_assign_unique_locked(fiq, req); list_add_tail(&req->list, &fiq->pending); fuse_dev_wake_and_unlock(fiq); } else { @@ -397,7 +418,6 @@ static void fuse_send_one(struct fuse_iqueue *fiq, struct fuse_req *req) req->in.h.len = sizeof(struct fuse_in_header) + fuse_len_args(req->args->in_numargs, (struct fuse_arg *) req->args->in_args); - trace_fuse_request_send(req); fiq->ops->send_req(fiq, req); } @@ -687,10 +707,10 @@ static bool fuse_request_queue_background_uring(struct fuse_conn *fc, { struct fuse_iqueue *fiq = &fc->iq; - req->in.h.unique = fuse_get_unique(fiq); req->in.h.len = sizeof(struct fuse_in_header) + fuse_len_args(req->args->in_numargs, (struct fuse_arg *) req->args->in_args); + fuse_request_assign_unique(fiq, req); return fuse_uring_queue_bq_req(req); } @@ -1528,14 +1548,34 @@ static int fuse_dev_open(struct inode *inode, struct file *file) return 0; } +struct fuse_dev *fuse_get_dev(struct file *file) +{ + struct fuse_dev *fud = __fuse_get_dev(file); + int err; + + if (likely(fud)) + return fud; + + err = wait_event_interruptible(fuse_dev_waitq, + READ_ONCE(file->private_data) != FUSE_DEV_SYNC_INIT); + if (err) + return ERR_PTR(err); + + fud = __fuse_get_dev(file); + if (!fud) + return ERR_PTR(-EPERM); + + return fud; +} + static ssize_t fuse_dev_read(struct kiocb *iocb, struct iov_iter *to) { struct fuse_copy_state cs; struct file *file = iocb->ki_filp; struct fuse_dev *fud = fuse_get_dev(file); - if (!fud) - return -EPERM; + if (IS_ERR(fud)) + return PTR_ERR(fud); if (!user_backed_iter(to)) return -EINVAL; @@ -1555,8 +1595,8 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos, struct fuse_copy_state cs; struct fuse_dev *fud = fuse_get_dev(in); - if (!fud) - return -EPERM; + if (IS_ERR(fud)) + return PTR_ERR(fud); bufs = kvmalloc_array(pipe->max_usage, sizeof(struct pipe_buffer), GFP_KERNEL); @@ -1600,35 +1640,31 @@ static int fuse_notify_poll(struct fuse_conn *fc, unsigned int size, struct fuse_copy_state *cs) { struct fuse_notify_poll_wakeup_out outarg; - int err = -EINVAL; + int err; if (size != sizeof(outarg)) - goto err; + return -EINVAL; err = fuse_copy_one(cs, &outarg, sizeof(outarg)); if (err) - goto err; + return err; fuse_copy_finish(cs); return fuse_notify_poll_wakeup(fc, &outarg); - -err: - fuse_copy_finish(cs); - return err; } static int fuse_notify_inval_inode(struct fuse_conn *fc, unsigned int size, struct fuse_copy_state *cs) { struct fuse_notify_inval_inode_out outarg; - int err = -EINVAL; + int err; if (size != sizeof(outarg)) - goto err; + return -EINVAL; err = fuse_copy_one(cs, &outarg, sizeof(outarg)); if (err) - goto err; + return err; fuse_copy_finish(cs); down_read(&fc->killsb); @@ -1636,10 +1672,6 @@ static int fuse_notify_inval_inode(struct fuse_conn *fc, unsigned int size, outarg.off, outarg.len); up_read(&fc->killsb); return err; - -err: - fuse_copy_finish(cs); - return err; } static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size, @@ -1647,29 +1679,26 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size, { struct fuse_notify_inval_entry_out outarg; int err; - char *buf = NULL; + char *buf; struct 
qstr name; - err = -EINVAL; if (size < sizeof(outarg)) - goto err; + return -EINVAL; err = fuse_copy_one(cs, &outarg, sizeof(outarg)); if (err) - goto err; + return err; - err = -ENAMETOOLONG; if (outarg.namelen > fc->name_max) - goto err; + return -ENAMETOOLONG; err = -EINVAL; if (size != sizeof(outarg) + outarg.namelen + 1) - goto err; + return -EINVAL; - err = -ENOMEM; buf = kzalloc(outarg.namelen + 1, GFP_KERNEL); if (!buf) - goto err; + return -ENOMEM; name.name = buf; name.len = outarg.namelen; @@ -1682,12 +1711,8 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size, down_read(&fc->killsb); err = fuse_reverse_inval_entry(fc, outarg.parent, 0, &name, outarg.flags); up_read(&fc->killsb); - kfree(buf); - return err; - err: kfree(buf); - fuse_copy_finish(cs); return err; } @@ -1696,29 +1721,25 @@ static int fuse_notify_delete(struct fuse_conn *fc, unsigned int size, { struct fuse_notify_delete_out outarg; int err; - char *buf = NULL; + char *buf; struct qstr name; - err = -EINVAL; if (size < sizeof(outarg)) - goto err; + return -EINVAL; err = fuse_copy_one(cs, &outarg, sizeof(outarg)); if (err) - goto err; + return err; - err = -ENAMETOOLONG; if (outarg.namelen > fc->name_max) - goto err; + return -ENAMETOOLONG; - err = -EINVAL; if (size != sizeof(outarg) + outarg.namelen + 1) - goto err; + return -EINVAL; - err = -ENOMEM; buf = kzalloc(outarg.namelen + 1, GFP_KERNEL); if (!buf) - goto err; + return -ENOMEM; name.name = buf; name.len = outarg.namelen; @@ -1731,12 +1752,8 @@ static int fuse_notify_delete(struct fuse_conn *fc, unsigned int size, down_read(&fc->killsb); err = fuse_reverse_inval_entry(fc, outarg.parent, outarg.child, &name, 0); up_read(&fc->killsb); - kfree(buf); - return err; - err: kfree(buf); - fuse_copy_finish(cs); return err; } @@ -1754,17 +1771,15 @@ static int fuse_notify_store(struct fuse_conn *fc, unsigned int size, loff_t file_size; loff_t end; - err = -EINVAL; if (size < sizeof(outarg)) - goto out_finish; + return -EINVAL; err = fuse_copy_one(cs, &outarg, sizeof(outarg)); if (err) - goto out_finish; + return err; - err = -EINVAL; if (size - sizeof(outarg) != outarg.size) - goto out_finish; + return -EINVAL; nodeid = outarg.nodeid; @@ -1824,8 +1839,6 @@ out_iput: iput(inode); out_up_killsb: up_read(&fc->killsb); -out_finish: - fuse_copy_finish(cs); return err; } @@ -1940,13 +1953,12 @@ static int fuse_notify_retrieve(struct fuse_conn *fc, unsigned int size, u64 nodeid; int err; - err = -EINVAL; if (size != sizeof(outarg)) - goto copy_finish; + return -EINVAL; err = fuse_copy_one(cs, &outarg, sizeof(outarg)); if (err) - goto copy_finish; + return err; fuse_copy_finish(cs); @@ -1962,10 +1974,6 @@ static int fuse_notify_retrieve(struct fuse_conn *fc, unsigned int size, up_read(&fc->killsb); return err; - -copy_finish: - fuse_copy_finish(cs); - return err; } /* @@ -2044,6 +2052,42 @@ static int fuse_notify_inc_epoch(struct fuse_conn *fc) return 0; } +static int fuse_notify_prune(struct fuse_conn *fc, unsigned int size, + struct fuse_copy_state *cs) +{ + struct fuse_notify_prune_out outarg; + const unsigned int batch = 512; + u64 *nodeids __free(kfree) = kmalloc(sizeof(u64) * batch, GFP_KERNEL); + unsigned int num, i; + int err; + + if (!nodeids) + return -ENOMEM; + + if (size < sizeof(outarg)) + return -EINVAL; + + err = fuse_copy_one(cs, &outarg, sizeof(outarg)); + if (err) + return err; + + if (size - sizeof(outarg) != outarg.count * sizeof(u64)) + return -EINVAL; + + for (; outarg.count; outarg.count -= num) { + num = min(batch, 
outarg.count); + err = fuse_copy_one(cs, nodeids, num * sizeof(u64)); + if (err) + return err; + + scoped_guard(rwsem_read, &fc->killsb) { + for (i = 0; i < num; i++) + fuse_try_prune_one_inode(fc, nodeids[i]); + } + } + return 0; +} + static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, unsigned int size, struct fuse_copy_state *cs) { @@ -2075,8 +2119,10 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, case FUSE_NOTIFY_INC_EPOCH: return fuse_notify_inc_epoch(fc); + case FUSE_NOTIFY_PRUNE: + return fuse_notify_prune(fc, size, cs); + default: - fuse_copy_finish(cs); return -EINVAL; } } @@ -2156,7 +2202,7 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud, */ if (!oh.unique) { err = fuse_notify(fc, oh.error, nbytes - sizeof(oh), cs); - goto out; + goto copy_finish; } err = -EINVAL; @@ -2229,7 +2275,7 @@ copy_finish: static ssize_t fuse_dev_write(struct kiocb *iocb, struct iov_iter *from) { struct fuse_copy_state cs; - struct fuse_dev *fud = fuse_get_dev(iocb->ki_filp); + struct fuse_dev *fud = __fuse_get_dev(iocb->ki_filp); if (!fud) return -EPERM; @@ -2251,11 +2297,10 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, unsigned idx; struct pipe_buffer *bufs; struct fuse_copy_state cs; - struct fuse_dev *fud; + struct fuse_dev *fud = __fuse_get_dev(out); size_t rem; ssize_t ret; - fud = fuse_get_dev(out); if (!fud) return -EPERM; @@ -2341,7 +2386,7 @@ static __poll_t fuse_dev_poll(struct file *file, poll_table *wait) struct fuse_iqueue *fiq; struct fuse_dev *fud = fuse_get_dev(file); - if (!fud) + if (IS_ERR(fud)) return EPOLLERR; fiq = &fud->fc->iq; @@ -2394,7 +2439,7 @@ static void end_polls(struct fuse_conn *fc) * The same effect is usually achievable through killing the filesystem daemon * and all users of the filesystem. The exception is the combination of an * asynchronous request and the tricky deadlock (see - * Documentation/filesystems/fuse.rst). + * Documentation/filesystems/fuse/fuse.rst). * * Aborting requests under I/O goes as follows: 1: Separate out unlocked * requests, they should be finished off immediately. Locked requests will be @@ -2488,7 +2533,7 @@ void fuse_wait_aborted(struct fuse_conn *fc) int fuse_dev_release(struct inode *inode, struct file *file) { - struct fuse_dev *fud = fuse_get_dev(file); + struct fuse_dev *fud = __fuse_get_dev(file); if (fud) { struct fuse_conn *fc = fud->fc; @@ -2519,8 +2564,8 @@ static int fuse_dev_fasync(int fd, struct file *file, int on) { struct fuse_dev *fud = fuse_get_dev(file); - if (!fud) - return -EPERM; + if (IS_ERR(fud)) + return PTR_ERR(fud); /* No locking - fasync_helper does its own locking */ return fasync_helper(fd, file, on, &fud->fc->iq.fasync); @@ -2530,7 +2575,7 @@ static int fuse_device_clone(struct fuse_conn *fc, struct file *new) { struct fuse_dev *fud; - if (new->private_data) + if (__fuse_get_dev(new)) return -EINVAL; fud = fuse_dev_alloc_install(fc); @@ -2561,7 +2606,7 @@ static long fuse_dev_ioctl_clone(struct file *file, __u32 __user *argp) * uses the same ioctl handler. 
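 * Editor's note: __fuse_get_dev() (the raw accessor) is used below
 * instead of the waiting fuse_get_dev() variant; it merely masks off
 * the FUSE_DEV_SYNC_INIT marker bit and yields NULL for a device whose
 * FUSE_INIT has not completed, so cloning never sleeps on a foreign
 * device.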
*/ if (fd_file(f)->f_op == file->f_op) - fud = fuse_get_dev(fd_file(f)); + fud = __fuse_get_dev(fd_file(f)); res = -EINVAL; if (fud) { @@ -2579,8 +2624,8 @@ static long fuse_dev_ioctl_backing_open(struct file *file, struct fuse_dev *fud = fuse_get_dev(file); struct fuse_backing_map map; - if (!fud) - return -EPERM; + if (IS_ERR(fud)) + return PTR_ERR(fud); if (!IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) return -EOPNOTSUPP; @@ -2596,8 +2641,8 @@ static long fuse_dev_ioctl_backing_close(struct file *file, __u32 __user *argp) struct fuse_dev *fud = fuse_get_dev(file); int backing_id; - if (!fud) - return -EPERM; + if (IS_ERR(fud)) + return PTR_ERR(fud); if (!IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) return -EOPNOTSUPP; @@ -2608,6 +2653,19 @@ static long fuse_dev_ioctl_backing_close(struct file *file, __u32 __user *argp) return fuse_backing_close(fud->fc, backing_id); } +static long fuse_dev_ioctl_sync_init(struct file *file) +{ + int err = -EINVAL; + + mutex_lock(&fuse_mutex); + if (!__fuse_get_dev(file)) { + WRITE_ONCE(file->private_data, FUSE_DEV_SYNC_INIT); + err = 0; + } + mutex_unlock(&fuse_mutex); + return err; +} + static long fuse_dev_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { @@ -2623,6 +2681,9 @@ static long fuse_dev_ioctl(struct file *file, unsigned int cmd, case FUSE_DEV_IOC_BACKING_CLOSE: return fuse_dev_ioctl_backing_close(file, argp); + case FUSE_DEV_IOC_SYNC_INIT: + return fuse_dev_ioctl_sync_init(file); + default: return -ENOTTY; } @@ -2631,7 +2692,7 @@ static long fuse_dev_ioctl(struct file *file, unsigned int cmd, #ifdef CONFIG_PROC_FS static void fuse_dev_show_fdinfo(struct seq_file *seq, struct file *file) { - struct fuse_dev *fud = fuse_get_dev(file); + struct fuse_dev *fud = __fuse_get_dev(file); if (!fud) return; diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index a30c44234a4e..f6b12aebb8bb 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -7,6 +7,7 @@ #include "fuse_i.h" #include "dev_uring_i.h" #include "fuse_dev_i.h" +#include "fuse_trace.h" #include <linux/fs.h> #include <linux/io_uring/cmd.h> @@ -1139,9 +1140,9 @@ int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) return -EINVAL; fud = fuse_get_dev(cmd->file); - if (!fud) { + if (IS_ERR(fud)) { pr_info_ratelimited("No fuse device found\n"); - return -ENOTCONN; + return PTR_ERR(fud); } fc = fud->fc; @@ -1268,8 +1269,7 @@ void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req) if (!queue) goto err; - if (req->in.h.opcode != FUSE_NOTIFY_REPLY) - req->in.h.unique = fuse_get_unique(fiq); + fuse_request_assign_unique(fiq, req); spin_lock(&queue->lock); err = -ENOTCONN; diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 5c569c3cb53f..ecaec0fea3a1 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -739,22 +739,18 @@ static int fuse_atomic_open(struct inode *dir, struct dentry *entry, int err; struct mnt_idmap *idmap = file_mnt_idmap(file); struct fuse_conn *fc = get_fuse_conn(dir); - struct dentry *res = NULL; if (fuse_is_bad(dir)) return -EIO; if (d_in_lookup(entry)) { - res = fuse_lookup(dir, entry, 0); - if (IS_ERR(res)) - return PTR_ERR(res); - - if (res) - entry = res; + struct dentry *res = fuse_lookup(dir, entry, 0); + if (res || d_really_is_positive(entry)) + return finish_no_open(file, res); } - if (!(flags & O_CREAT) || d_really_is_positive(entry)) - goto no_open; + if (!(flags & O_CREAT)) + return finish_no_open(file, NULL); /* Only creates */ file->f_mode |= FMODE_CREATED; @@ -768,16 +764,13 @@ static int fuse_atomic_open(struct inode *dir, 
struct dentry *entry, goto mknod; } else if (err == -EEXIST) fuse_invalidate_entry(entry); -out_dput: - dput(res); return err; mknod: err = fuse_mknod(idmap, dir, entry, mode, 0); if (err) - goto out_dput; -no_open: - return finish_no_open(file, res); + return err; + return finish_no_open(file, NULL); } /* diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 4adcf09d4b01..f1ef77a0be05 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -356,8 +356,14 @@ void fuse_file_release(struct inode *inode, struct fuse_file *ff, * Make the release synchronous if this is a fuseblk mount, * synchronous RELEASE is allowed (and desirable) in this case * because the server can be trusted not to screw up. + * + * Always use the asynchronous file put because the current thread + * might be the fuse server. This can happen if a process starts some + * aio and closes the fd before the aio completes. Since aio takes its + * own ref to the file, the IO completion has to drop the ref, which is + * how the fuse server can end up closing its clients' files. */ - fuse_file_put(ff, ff->fm->fc->destroy); + fuse_file_put(ff, false); } void fuse_release_common(struct file *file, bool isdir) @@ -865,22 +871,20 @@ static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args, struct fuse_args_pages *ap = &ia->ap; size_t count = ia->read.in.size; size_t num_read = args->out_args[0].size; - struct address_space *mapping = NULL; - - for (i = 0; mapping == NULL && i < ap->num_folios; i++) - mapping = ap->folios[i]->mapping; + struct address_space *mapping; + struct inode *inode; - if (mapping) { - struct inode *inode = mapping->host; + WARN_ON_ONCE(!ap->num_folios); + mapping = ap->folios[0]->mapping; + inode = mapping->host; - /* - * Short read means EOF. If file size is larger, truncate it - */ - if (!err && num_read < count) - fuse_short_read(inode, ia->read.attr_ver, num_read, ap); + /* + * Short read means EOF. If file size is larger, truncate it + */ + if (!err && num_read < count) + fuse_short_read(inode, ia->read.attr_ver, num_read, ap); - fuse_invalidate_atime(inode); - } + fuse_invalidate_atime(inode); for (i = 0; i < ap->num_folios; i++) { folio_end_read(ap->folios[i], !err); @@ -1175,7 +1179,6 @@ static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia, num = min(iov_iter_count(ii), fc->max_write); ap->args.in_pages = true; - ap->descs[0].offset = offset; while (num && ap->num_folios < max_folios) { size_t tmp; @@ -1823,19 +1826,15 @@ static void fuse_writepage_finish(struct fuse_writepage_args *wpa) struct fuse_args_pages *ap = &wpa->ia.ap; struct inode *inode = wpa->inode; struct fuse_inode *fi = get_fuse_inode(inode); - struct backing_dev_info *bdi = inode_to_bdi(inode); int i; - for (i = 0; i < ap->num_folios; i++) { + for (i = 0; i < ap->num_folios; i++) /* * Benchmarks showed that ending writeback within the * scope of the fi->lock alleviates xarray lock * contention and noticeably improves performance. 
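 * Editor's note: the WB_WRITEBACK bookkeeping this loop used to do
 * (dec_wb_stat()/wb_writeout_inc(), removed just below) moves out of
 * fuse entirely; with BDI_CAP_WRITEBACK_ACCT no longer cleared in
 * fuse_bdi_init() later in this diff, the generic writeback code is
 * expected to maintain those counters instead.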
*/ iomap_finish_folio_write(inode, ap->folios[i], 1); - dec_wb_stat(&bdi->wb, WB_WRITEBACK); - wb_writeout_inc(&bdi->wb); - } wake_up(&fi->page_waitq); } @@ -2010,14 +2009,11 @@ static void fuse_writepage_add_to_bucket(struct fuse_conn *fc, static void fuse_writepage_args_page_fill(struct fuse_writepage_args *wpa, struct folio *folio, uint32_t folio_index, loff_t offset, unsigned len) { - struct inode *inode = folio->mapping->host; struct fuse_args_pages *ap = &wpa->ia.ap; ap->folios[folio_index] = folio; ap->descs[folio_index].offset = offset; ap->descs[folio_index].length = len; - - inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); } static struct fuse_writepage_args *fuse_writepage_args_setup(struct folio *folio, @@ -2960,10 +2956,12 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in, .nodeid_out = ff_out->nodeid, .fh_out = ff_out->fh, .off_out = pos_out, - .len = min_t(size_t, len, UINT_MAX & PAGE_MASK), + .len = len, .flags = flags }; struct fuse_write_out outarg; + struct fuse_copy_file_range_out outarg_64; + u64 bytes_copied; ssize_t err; /* mark unstable when write-back is not used, and file_out gets * extended */ @@ -3013,33 +3011,51 @@ static ssize_t __fuse_copy_file_range(struct file *file_in, loff_t pos_in, if (is_unstable) set_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state); - args.opcode = FUSE_COPY_FILE_RANGE; + args.opcode = FUSE_COPY_FILE_RANGE_64; args.nodeid = ff_in->nodeid; args.in_numargs = 1; args.in_args[0].size = sizeof(inarg); args.in_args[0].value = &inarg; args.out_numargs = 1; - args.out_args[0].size = sizeof(outarg); - args.out_args[0].value = &outarg; + args.out_args[0].size = sizeof(outarg_64); + args.out_args[0].value = &outarg_64; + if (fc->no_copy_file_range_64) { +fallback: + /* Fall back to old op that can't handle large copy length */ + args.opcode = FUSE_COPY_FILE_RANGE; + args.out_args[0].size = sizeof(outarg); + args.out_args[0].value = &outarg; + inarg.len = len = min_t(size_t, len, UINT_MAX & PAGE_MASK); + } err = fuse_simple_request(fm, &args); if (err == -ENOSYS) { - fc->no_copy_file_range = 1; - err = -EOPNOTSUPP; + if (fc->no_copy_file_range_64) { + fc->no_copy_file_range = 1; + err = -EOPNOTSUPP; + } else { + fc->no_copy_file_range_64 = 1; + goto fallback; + } } - if (!err && outarg.size > len) - err = -EIO; - if (err) goto out; + bytes_copied = fc->no_copy_file_range_64 ? 
+ outarg.size : outarg_64.bytes_copied; + + if (bytes_copied > len) { + err = -EIO; + goto out; + } + truncate_inode_pages_range(inode_out->i_mapping, ALIGN_DOWN(pos_out, PAGE_SIZE), - ALIGN(pos_out + outarg.size, PAGE_SIZE) - 1); + ALIGN(pos_out + bytes_copied, PAGE_SIZE) - 1); file_update_time(file_out); - fuse_write_update_attr(inode_out, pos_out + outarg.size, outarg.size); + fuse_write_update_attr(inode_out, pos_out + bytes_copied, bytes_copied); - err = outarg.size; + err = bytes_copied; out: if (is_unstable) clear_bit(FUSE_I_SIZE_UNSTABLE, &fi_out->state); diff --git a/fs/fuse/fuse_dev_i.h b/fs/fuse/fuse_dev_i.h index 5a9bd771a319..6e8373f97040 100644 --- a/fs/fuse/fuse_dev_i.h +++ b/fs/fuse/fuse_dev_i.h @@ -12,6 +12,8 @@ #define FUSE_INT_REQ_BIT (1ULL << 0) #define FUSE_REQ_ID_STEP (1ULL << 1) +extern struct wait_queue_head fuse_dev_waitq; + struct fuse_arg; struct fuse_args; struct fuse_pqueue; @@ -37,15 +39,22 @@ struct fuse_copy_state { } ring; }; -static inline struct fuse_dev *fuse_get_dev(struct file *file) +#define FUSE_DEV_SYNC_INIT ((struct fuse_dev *) 1) +#define FUSE_DEV_PTR_MASK (~1UL) + +static inline struct fuse_dev *__fuse_get_dev(struct file *file) { /* * Lockless access is OK, because file->private data is set * once during mount and is valid until the file is released. */ - return READ_ONCE(file->private_data); + struct fuse_dev *fud = READ_ONCE(file->private_data); + + return (typeof(fud)) ((unsigned long) fud & FUSE_DEV_PTR_MASK); } +struct fuse_dev *fuse_get_dev(struct file *file); + unsigned int fuse_req_hash(u64 unique); struct fuse_req *fuse_request_find(struct fuse_pqueue *fpq, u64 unique); diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index cc428d04be3e..c2f2a48156d6 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -856,6 +856,9 @@ struct fuse_conn { /** Does the filesystem support copy_file_range? */ unsigned no_copy_file_range:1; + /** Does the filesystem support copy_file_range_64? */ + unsigned no_copy_file_range_64:1; + /* Send DESTROY request */ unsigned int destroy:1; @@ -901,6 +904,9 @@ struct fuse_conn { /* Is link not implemented by fs? */ unsigned int no_link:1; + /* Is synchronous FUSE_INIT allowed? */ + unsigned int sync_init:1; + /* Use io_uring for communication */ unsigned int io_uring; @@ -1255,6 +1261,11 @@ int fuse_simple_background(struct fuse_mount *fm, struct fuse_args *args, gfp_t gfp_flags); /** + * Assign a unique id to a fuse request + */ +void fuse_request_assign_unique(struct fuse_iqueue *fiq, struct fuse_req *req); + +/** * End a finished request */ void fuse_request_end(struct fuse_req *req); @@ -1315,7 +1326,7 @@ struct fuse_dev *fuse_dev_alloc_install(struct fuse_conn *fc); struct fuse_dev *fuse_dev_alloc(void); void fuse_dev_install(struct fuse_dev *fud, struct fuse_conn *fc); void fuse_dev_free(struct fuse_dev *fud); -void fuse_send_init(struct fuse_mount *fm); +int fuse_send_init(struct fuse_mount *fm); /** * Fill in superblock and initialize fuse connection @@ -1407,6 +1418,12 @@ int fuse_reverse_inval_inode(struct fuse_conn *fc, u64 nodeid, int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid, u64 child_nodeid, struct qstr *name, u32 flags); +/* + * Try to prune this inode. If neither the inode itself nor dentries associated + * with this inode have any external reference, then the inode can be freed. 
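+ * Called from the FUSE_NOTIFY_PRUNE handler in dev.c with fc->killsb
+ * held for reading; pruning is best-effort and quietly does nothing
+ * when the nodeid cannot be looked up.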
+ */ +void fuse_try_prune_one_inode(struct fuse_conn *fc, u64 nodeid); + int fuse_do_open(struct fuse_mount *fm, u64 nodeid, struct file *file, bool isdir); @@ -1512,29 +1529,11 @@ struct fuse_file *fuse_file_open(struct fuse_mount *fm, u64 nodeid, void fuse_file_release(struct inode *inode, struct fuse_file *ff, unsigned int open_flags, fl_owner_t id, bool isdir); -/* passthrough.c */ -static inline struct fuse_backing *fuse_inode_backing(struct fuse_inode *fi) -{ -#ifdef CONFIG_FUSE_PASSTHROUGH - return READ_ONCE(fi->fb); -#else - return NULL; -#endif -} - -static inline struct fuse_backing *fuse_inode_backing_set(struct fuse_inode *fi, - struct fuse_backing *fb) -{ -#ifdef CONFIG_FUSE_PASSTHROUGH - return xchg(&fi->fb, fb); -#else - return NULL; -#endif -} - +/* backing.c */ #ifdef CONFIG_FUSE_PASSTHROUGH struct fuse_backing *fuse_backing_get(struct fuse_backing *fb); void fuse_backing_put(struct fuse_backing *fb); +struct fuse_backing *fuse_backing_lookup(struct fuse_conn *fc, int backing_id); #else static inline struct fuse_backing *fuse_backing_get(struct fuse_backing *fb) @@ -1545,6 +1544,11 @@ static inline struct fuse_backing *fuse_backing_get(struct fuse_backing *fb) static inline void fuse_backing_put(struct fuse_backing *fb) { } +static inline struct fuse_backing *fuse_backing_lookup(struct fuse_conn *fc, + int backing_id) +{ + return NULL; +} #endif void fuse_backing_files_init(struct fuse_conn *fc); @@ -1552,9 +1556,27 @@ void fuse_backing_files_free(struct fuse_conn *fc); int fuse_backing_open(struct fuse_conn *fc, struct fuse_backing_map *map); int fuse_backing_close(struct fuse_conn *fc, int backing_id); -struct fuse_backing *fuse_passthrough_open(struct file *file, - struct inode *inode, - int backing_id); +/* passthrough.c */ +static inline struct fuse_backing *fuse_inode_backing(struct fuse_inode *fi) +{ +#ifdef CONFIG_FUSE_PASSTHROUGH + return READ_ONCE(fi->fb); +#else + return NULL; +#endif +} + +static inline struct fuse_backing *fuse_inode_backing_set(struct fuse_inode *fi, + struct fuse_backing *fb) +{ +#ifdef CONFIG_FUSE_PASSTHROUGH + return xchg(&fi->fb, fb); +#else + return NULL; +#endif +} + +struct fuse_backing *fuse_passthrough_open(struct file *file, int backing_id); void fuse_passthrough_release(struct fuse_file *ff, struct fuse_backing *fb); static inline struct file *fuse_file_passthrough(struct fuse_file *ff) diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 7485a41af892..d1babf56f254 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -7,6 +7,7 @@ */ #include "fuse_i.h" +#include "fuse_dev_i.h" #include "dev_uring_i.h" #include <linux/dax.h> @@ -34,6 +35,7 @@ MODULE_LICENSE("GPL"); static struct kmem_cache *fuse_inode_cachep; struct list_head fuse_conn_list; DEFINE_MUTEX(fuse_mutex); +DECLARE_WAIT_QUEUE_HEAD(fuse_dev_waitq); static int set_global_limit(const char *val, const struct kernel_param *kp); @@ -101,14 +103,11 @@ static struct inode *fuse_alloc_inode(struct super_block *sb) if (!fi) return NULL; - fi->i_time = 0; + /* Initialize private data (i.e. 
everything except fi->inode) */ + BUILD_BUG_ON(offsetof(struct fuse_inode, inode) != 0); + memset((void *) fi + sizeof(fi->inode), 0, sizeof(*fi) - sizeof(fi->inode)); + fi->inval_mask = ~0; - fi->nodeid = 0; - fi->nlookup = 0; - fi->attr_version = 0; - fi->orig_ino = 0; - fi->state = 0; - fi->submount_lookup = NULL; mutex_init(&fi->mutex); spin_lock_init(&fi->lock); fi->forget = fuse_alloc_forget(); @@ -586,6 +585,17 @@ int fuse_reverse_inval_inode(struct fuse_conn *fc, u64 nodeid, return 0; } +void fuse_try_prune_one_inode(struct fuse_conn *fc, u64 nodeid) +{ + struct inode *inode; + + inode = fuse_ilookup(fc, nodeid, NULL); + if (!inode) + return; + d_prune_aliases(inode); + iput(inode); +} + bool fuse_lock_inode(struct inode *inode) { bool locked = false; @@ -1469,7 +1479,7 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, wake_up_all(&fc->blocked_waitq); } -void fuse_send_init(struct fuse_mount *fm) +static struct fuse_init_args *fuse_new_init(struct fuse_mount *fm) { struct fuse_init_args *ia; u64 flags; @@ -1528,10 +1538,30 @@ void fuse_send_init(struct fuse_mount *fm) ia->args.out_args[0].value = &ia->out; ia->args.force = true; ia->args.nocreds = true; - ia->args.end = process_init_reply; - if (fuse_simple_background(fm, &ia->args, GFP_KERNEL) != 0) - process_init_reply(fm, &ia->args, -ENOTCONN); + return ia; +} + +int fuse_send_init(struct fuse_mount *fm) +{ + struct fuse_init_args *ia = fuse_new_init(fm); + int err; + + if (fm->fc->sync_init) { + err = fuse_simple_request(fm, &ia->args); + /* Ignore size of init reply */ + if (err > 0) + err = 0; + } else { + ia->args.end = process_init_reply; + err = fuse_simple_background(fm, &ia->args, GFP_KERNEL); + if (!err) + return 0; + } + process_init_reply(fm, &ia->args, err); + if (fm->fc->conn_error) + return -ENOTCONN; + return 0; } EXPORT_SYMBOL_GPL(fuse_send_init); @@ -1561,8 +1591,6 @@ static int fuse_bdi_init(struct fuse_conn *fc, struct super_block *sb) if (err) return err; - /* fuse does it's own writeback accounting */ - sb->s_bdi->capabilities &= ~BDI_CAP_WRITEBACK_ACCT; sb->s_bdi->capabilities |= BDI_CAP_STRICTLIMIT; /* @@ -1821,6 +1849,7 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) !sb_set_blocksize(sb, PAGE_SIZE)) goto err; #endif + fc->sync_fs = 1; } else { sb->s_blocksize = PAGE_SIZE; sb->s_blocksize_bits = PAGE_SHIFT; @@ -1872,8 +1901,12 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) mutex_lock(&fuse_mutex); err = -EINVAL; - if (ctx->fudptr && *ctx->fudptr) - goto err_unlock; + if (ctx->fudptr && *ctx->fudptr) { + if (*ctx->fudptr == FUSE_DEV_SYNC_INIT) + fc->sync_init = 1; + else + goto err_unlock; + } err = fuse_ctl_add_conn(fc); if (err) @@ -1881,8 +1914,10 @@ int fuse_fill_super_common(struct super_block *sb, struct fuse_fs_context *ctx) list_add_tail(&fc->entry, &fuse_conn_list); sb->s_root = root_dentry; - if (ctx->fudptr) + if (ctx->fudptr) { *ctx->fudptr = fud; + wake_up_all(&fuse_dev_waitq); + } mutex_unlock(&fuse_mutex); return 0; @@ -1903,6 +1938,7 @@ EXPORT_SYMBOL_GPL(fuse_fill_super_common); static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) { struct fuse_fs_context *ctx = fsc->fs_private; + struct fuse_mount *fm; int err; if (!ctx->file || !ctx->rootmode_present || @@ -1923,8 +1959,10 @@ static int fuse_fill_super(struct super_block *sb, struct fs_context *fsc) return err; /* file->private_data shall be visible on all CPUs after this */ smp_mb(); - 
fuse_send_init(get_fuse_mount_super(sb)); - return 0; + + fm = get_fuse_mount_super(sb); + + return fuse_send_init(fm); } /* @@ -1985,7 +2023,7 @@ static int fuse_get_tree(struct fs_context *fsc) * Allow creating a fuse mount with an already initialized fuse * connection */ - fud = READ_ONCE(ctx->file->private_data); + fud = __fuse_get_dev(ctx->file); if (ctx->file->f_op == &fuse_dev_operations && fud) { fsc->sget_key = fud->fc; sb = sget_fc(fsc, fuse_test_super, fuse_set_no_super); diff --git a/fs/fuse/ioctl.c b/fs/fuse/ioctl.c index 57032eadca6c..fdc175e93f74 100644 --- a/fs/fuse/ioctl.c +++ b/fs/fuse/ioctl.c @@ -536,8 +536,6 @@ int fuse_fileattr_get(struct dentry *dentry, struct file_kattr *fa) cleanup: fuse_priv_ioctl_cleanup(inode, ff); - if (err == -ENOTTY) - err = -EOPNOTSUPP; return err; } @@ -574,7 +572,5 @@ int fuse_fileattr_set(struct mnt_idmap *idmap, cleanup: fuse_priv_ioctl_cleanup(inode, ff); - if (err == -ENOTTY) - err = -EOPNOTSUPP; return err; } diff --git a/fs/fuse/iomode.c b/fs/fuse/iomode.c index c99e285f3183..3728933188f3 100644 --- a/fs/fuse/iomode.c +++ b/fs/fuse/iomode.c @@ -177,8 +177,7 @@ static int fuse_file_passthrough_open(struct inode *inode, struct file *file) (ff->open_flags & ~FOPEN_PASSTHROUGH_MASK)) return -EINVAL; - fb = fuse_passthrough_open(file, inode, - ff->args->open_outarg.backing_id); + fb = fuse_passthrough_open(file, ff->args->open_outarg.backing_id); if (IS_ERR(fb)) return PTR_ERR(fb); diff --git a/fs/fuse/passthrough.c b/fs/fuse/passthrough.c index eb97ac009e75..72de97c03d0e 100644 --- a/fs/fuse/passthrough.c +++ b/fs/fuse/passthrough.c @@ -144,171 +144,12 @@ ssize_t fuse_passthrough_mmap(struct file *file, struct vm_area_struct *vma) return backing_file_mmap(backing_file, vma, &ctx); } -struct fuse_backing *fuse_backing_get(struct fuse_backing *fb) -{ - if (fb && refcount_inc_not_zero(&fb->count)) - return fb; - return NULL; -} - -static void fuse_backing_free(struct fuse_backing *fb) -{ - pr_debug("%s: fb=0x%p\n", __func__, fb); - - if (fb->file) - fput(fb->file); - put_cred(fb->cred); - kfree_rcu(fb, rcu); -} - -void fuse_backing_put(struct fuse_backing *fb) -{ - if (fb && refcount_dec_and_test(&fb->count)) - fuse_backing_free(fb); -} - -void fuse_backing_files_init(struct fuse_conn *fc) -{ - idr_init(&fc->backing_files_map); -} - -static int fuse_backing_id_alloc(struct fuse_conn *fc, struct fuse_backing *fb) -{ - int id; - - idr_preload(GFP_KERNEL); - spin_lock(&fc->lock); - /* FIXME: xarray might be space inefficient */ - id = idr_alloc_cyclic(&fc->backing_files_map, fb, 1, 0, GFP_ATOMIC); - spin_unlock(&fc->lock); - idr_preload_end(); - - WARN_ON_ONCE(id == 0); - return id; -} - -static struct fuse_backing *fuse_backing_id_remove(struct fuse_conn *fc, - int id) -{ - struct fuse_backing *fb; - - spin_lock(&fc->lock); - fb = idr_remove(&fc->backing_files_map, id); - spin_unlock(&fc->lock); - - return fb; -} - -static int fuse_backing_id_free(int id, void *p, void *data) -{ - struct fuse_backing *fb = p; - - WARN_ON_ONCE(refcount_read(&fb->count) != 1); - fuse_backing_free(fb); - return 0; -} - -void fuse_backing_files_free(struct fuse_conn *fc) -{ - idr_for_each(&fc->backing_files_map, fuse_backing_id_free, NULL); - idr_destroy(&fc->backing_files_map); -} - -int fuse_backing_open(struct fuse_conn *fc, struct fuse_backing_map *map) -{ - struct file *file; - struct super_block *backing_sb; - struct fuse_backing *fb = NULL; - int res; - - pr_debug("%s: fd=%d flags=0x%x\n", __func__, map->fd, map->flags); - - /* TODO: relax CAP_SYS_ADMIN 
once backing files are visible to lsof */ - res = -EPERM; - if (!fc->passthrough || !capable(CAP_SYS_ADMIN)) - goto out; - - res = -EINVAL; - if (map->flags || map->padding) - goto out; - - file = fget_raw(map->fd); - res = -EBADF; - if (!file) - goto out; - - /* read/write/splice/mmap passthrough only relevant for regular files */ - res = d_is_dir(file->f_path.dentry) ? -EISDIR : -EINVAL; - if (!d_is_reg(file->f_path.dentry)) - goto out_fput; - - backing_sb = file_inode(file)->i_sb; - res = -ELOOP; - if (backing_sb->s_stack_depth >= fc->max_stack_depth) - goto out_fput; - - fb = kmalloc(sizeof(struct fuse_backing), GFP_KERNEL); - res = -ENOMEM; - if (!fb) - goto out_fput; - - fb->file = file; - fb->cred = prepare_creds(); - refcount_set(&fb->count, 1); - - res = fuse_backing_id_alloc(fc, fb); - if (res < 0) { - fuse_backing_free(fb); - fb = NULL; - } - -out: - pr_debug("%s: fb=0x%p, ret=%i\n", __func__, fb, res); - - return res; - -out_fput: - fput(file); - goto out; -} - -int fuse_backing_close(struct fuse_conn *fc, int backing_id) -{ - struct fuse_backing *fb = NULL; - int err; - - pr_debug("%s: backing_id=%d\n", __func__, backing_id); - - /* TODO: relax CAP_SYS_ADMIN once backing files are visible to lsof */ - err = -EPERM; - if (!fc->passthrough || !capable(CAP_SYS_ADMIN)) - goto out; - - err = -EINVAL; - if (backing_id <= 0) - goto out; - - err = -ENOENT; - fb = fuse_backing_id_remove(fc, backing_id); - if (!fb) - goto out; - - fuse_backing_put(fb); - err = 0; -out: - pr_debug("%s: fb=0x%p, err=%i\n", __func__, fb, err); - - return err; -} - /* * Setup passthrough to a backing file. * * Returns an fb object with elevated refcount to be stored in fuse inode. */ -struct fuse_backing *fuse_passthrough_open(struct file *file, - struct inode *inode, - int backing_id) +struct fuse_backing *fuse_passthrough_open(struct file *file, int backing_id) { struct fuse_file *ff = file->private_data; struct fuse_conn *fc = ff->fm->fc; @@ -320,12 +161,8 @@ struct fuse_backing *fuse_passthrough_open(struct file *file, if (backing_id <= 0) goto out; - rcu_read_lock(); - fb = idr_find(&fc->backing_files_map, backing_id); - fb = fuse_backing_get(fb); - rcu_read_unlock(); - err = -ENOENT; + fb = fuse_backing_lookup(fc, backing_id); if (!fb) goto out; diff --git a/fs/fuse/trace.c b/fs/fuse/trace.c new file mode 100644 index 000000000000..93bd72efc98c --- /dev/null +++ b/fs/fuse/trace.c @@ -0,0 +1,13 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (C) 2025 Oracle. All Rights Reserved. + * Author: Darrick J. Wong <djwong@kernel.org> + */ +#include "dev_uring_i.h" +#include "fuse_i.h" +#include "fuse_dev_i.h" + +#include <linux/pagemap.h> + +#define CREATE_TRACE_POINTS +#include "fuse_trace.h" diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index 76c8fd0bfc75..6bc7c97b017d 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -20,6 +20,7 @@ #include <linux/cleanup.h> #include <linux/uio.h> #include "fuse_i.h" +#include "fuse_dev_i.h" /* Used to help calculate the FUSE connection's max_pages limit for a request's * size. 
Parts of the struct fuse_req are sliced into scattergather lists in @@ -761,7 +762,6 @@ static void copy_args_from_argbuf(struct fuse_args *args, struct fuse_req *req) static void virtio_fs_request_complete(struct fuse_req *req, struct virtio_fs_vq *fsvq) { - struct fuse_pqueue *fpq = &fsvq->fud->pq; struct fuse_args *args; struct fuse_args_pages *ap; unsigned int len, i, thislen; @@ -790,9 +790,7 @@ static void virtio_fs_request_complete(struct fuse_req *req, } } - spin_lock(&fpq->lock); clear_bit(FR_SENT, &req->flags); - spin_unlock(&fpq->lock); fuse_request_end(req); spin_lock(&fsvq->lock); @@ -1384,7 +1382,7 @@ static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq, unsigned int out_sgs = 0; unsigned int in_sgs = 0; unsigned int total_sgs; - unsigned int i; + unsigned int i, hash; int ret; bool notify; struct fuse_pqueue *fpq; @@ -1444,8 +1442,9 @@ static int virtio_fs_enqueue_req(struct virtio_fs_vq *fsvq, /* Request successfully sent. */ fpq = &fsvq->fud->pq; + hash = fuse_req_hash(req->in.h.unique); spin_lock(&fpq->lock); - list_add_tail(&req->list, fpq->processing); + list_add_tail(&req->list, &fpq->processing[hash]); spin_unlock(&fpq->lock); set_bit(FR_SENT, &req->flags); /* matches barrier in request_wait_answer() */ @@ -1480,8 +1479,7 @@ static void virtio_fs_send_req(struct fuse_iqueue *fiq, struct fuse_req *req) struct virtio_fs_vq *fsvq; int ret; - if (req->in.h.opcode != FUSE_NOTIFY_REPLY) - req->in.h.unique = fuse_get_unique(fiq); + fuse_request_assign_unique(fiq, req); clear_bit(FR_PENDING, &req->flags); diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 8760e7e20c9d..8a7ed80d9f2d 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -1368,27 +1368,19 @@ static int gfs2_atomic_open(struct inode *dir, struct dentry *dentry, struct file *file, unsigned flags, umode_t mode) { - struct dentry *d; bool excl = !!(flags & O_EXCL); - if (!d_in_lookup(dentry)) - goto skip_lookup; - - d = __gfs2_lookup(dir, dentry, file); - if (IS_ERR(d)) - return PTR_ERR(d); - if (d != NULL) - dentry = d; - if (d_really_is_positive(dentry)) { - if (!(file->f_mode & FMODE_OPENED)) + if (d_in_lookup(dentry)) { + struct dentry *d = __gfs2_lookup(dir, dentry, file); + if (file->f_mode & FMODE_OPENED) { + if (IS_ERR(d)) + return PTR_ERR(d); + dput(d); + return excl && (flags & O_CREAT) ? -EEXIST : 0; + } + if (d || d_really_is_positive(dentry)) return finish_no_open(file, d); - dput(d); - return excl && (flags & O_CREAT) ? 
-EEXIST : 0; } - - BUG_ON(d != NULL); - -skip_lookup: if (!(flags & O_CREAT)) return -ENOENT; diff --git a/fs/hpfs/anode.c b/fs/hpfs/anode.c index c14c9a035ee0..a4f5321eafae 100644 --- a/fs/hpfs/anode.c +++ b/fs/hpfs/anode.c @@ -27,7 +27,7 @@ secno hpfs_bplus_lookup(struct super_block *s, struct inode *inode, a = le32_to_cpu(btree->u.internal[i].down); brelse(bh); if (!(anode = hpfs_map_anode(s, a, &bh))) return -1; - btree = &anode->btree; + btree = GET_BTREE_PTR(&anode->btree); goto go_down; } hpfs_error(s, "sector %08x not found in internal anode %08x", sec, a); @@ -69,12 +69,13 @@ secno hpfs_add_sector_to_btree(struct super_block *s, secno node, int fnod, unsi int n; unsigned fs; int c1, c2 = 0; + if (fnod) { if (!(fnode = hpfs_map_fnode(s, node, &bh))) return -1; - btree = &fnode->btree; + btree = GET_BTREE_PTR(&fnode->btree); } else { if (!(anode = hpfs_map_anode(s, node, &bh))) return -1; - btree = &anode->btree; + btree = GET_BTREE_PTR(&anode->btree); } a = node; go_down: @@ -91,7 +92,7 @@ secno hpfs_add_sector_to_btree(struct super_block *s, secno node, int fnod, unsi if (hpfs_sb(s)->sb_chk) if (hpfs_stop_cycles(s, a, &c1, &c2, "hpfs_add_sector_to_btree #1")) return -1; if (!(anode = hpfs_map_anode(s, a, &bh))) return -1; - btree = &anode->btree; + btree = GET_BTREE_PTR(&anode->btree); goto go_down; } if (n >= 0) { @@ -151,7 +152,7 @@ secno hpfs_add_sector_to_btree(struct super_block *s, secno node, int fnod, unsi } brelse(bh); bh = bh1; - btree = &anode->btree; + btree = GET_BTREE_PTR(&anode->btree); } btree->n_free_nodes--; n = btree->n_used_nodes++; le16_add_cpu(&btree->first_free, 12); @@ -168,10 +169,10 @@ secno hpfs_add_sector_to_btree(struct super_block *s, secno node, int fnod, unsi if (hpfs_stop_cycles(s, up, &c1, &c2, "hpfs_add_sector_to_btree #2")) return -1; if (up != node || !fnod) { if (!(anode = hpfs_map_anode(s, up, &bh))) return -1; - btree = &anode->btree; + btree = GET_BTREE_PTR(&anode->btree); } else { if (!(fnode = hpfs_map_fnode(s, up, &bh))) return -1; - btree = &fnode->btree; + btree = GET_BTREE_PTR(&fnode->btree); } if (btree->n_free_nodes) { btree->n_free_nodes--; n = btree->n_used_nodes++; @@ -206,8 +207,8 @@ secno hpfs_add_sector_to_btree(struct super_block *s, secno node, int fnod, unsi anode->btree.n_used_nodes = 1; anode->btree.n_free_nodes = 59; anode->btree.first_free = cpu_to_le16(16); - anode->btree.u.internal[0].down = cpu_to_le32(a); - anode->btree.u.internal[0].file_secno = cpu_to_le32(-1); + GET_BTREE_PTR(&anode->btree)->u.internal[0].down = cpu_to_le32(a); + GET_BTREE_PTR(&anode->btree)->u.internal[0].file_secno = cpu_to_le32(-1); mark_buffer_dirty(bh); brelse(bh); if ((anode = hpfs_map_anode(s, a, &bh))) { @@ -229,20 +230,20 @@ secno hpfs_add_sector_to_btree(struct super_block *s, secno node, int fnod, unsi brelse(bh2); return -1; } - btree = &anode->btree; + btree = GET_BTREE_PTR(&anode->btree); } else { if (!(fnode = hpfs_map_fnode(s, node, &bh))) { brelse(bh2); return -1; } - btree = &fnode->btree; + btree = GET_BTREE_PTR(&fnode->btree); } ranode->up = cpu_to_le32(node); memcpy(&ranode->btree, btree, le16_to_cpu(btree->first_free)); if (fnod) ranode->btree.flags |= BP_fnode_parent; - ranode->btree.n_free_nodes = (bp_internal(&ranode->btree) ? 60 : 40) - ranode->btree.n_used_nodes; - if (bp_internal(&ranode->btree)) for (n = 0; n < ranode->btree.n_used_nodes; n++) { + GET_BTREE_PTR(&ranode->btree)->n_free_nodes = (bp_internal(GET_BTREE_PTR(&ranode->btree)) ? 
60 : 40) - GET_BTREE_PTR(&ranode->btree)->n_used_nodes; + if (bp_internal(GET_BTREE_PTR(&ranode->btree))) for (n = 0; n < GET_BTREE_PTR(&ranode->btree)->n_used_nodes; n++) { struct anode *unode; if ((unode = hpfs_map_anode(s, le32_to_cpu(ranode->u.internal[n].down), &bh1))) { unode->up = cpu_to_le32(ra); @@ -291,7 +292,7 @@ void hpfs_remove_btree(struct super_block *s, struct bplus_header *btree) if (hpfs_stop_cycles(s, ano, &d1, &d2, "hpfs_remove_btree #1")) return; if (!(anode = hpfs_map_anode(s, ano, &bh))) return; - btree1 = &anode->btree; + btree1 = GET_BTREE_PTR(&anode->btree); level++; pos = 0; } @@ -307,7 +308,7 @@ void hpfs_remove_btree(struct super_block *s, struct bplus_header *btree) ano = le32_to_cpu(anode->up); if (--level) { if (!(anode = hpfs_map_anode(s, ano, &bh))) return; - btree1 = &anode->btree; + btree1 = GET_BTREE_PTR(&anode->btree); } else btree1 = btree; for (i = 0; i < btree1->n_used_nodes; i++) { if (le32_to_cpu(btree1->u.internal[i].down) == oano) { @@ -332,7 +333,7 @@ static secno anode_lookup(struct super_block *s, anode_secno a, unsigned sec) struct anode *anode; struct buffer_head *bh; if (!(anode = hpfs_map_anode(s, a, &bh))) return -1; - return hpfs_bplus_lookup(s, NULL, &anode->btree, sec, bh); + return hpfs_bplus_lookup(s, NULL, GET_BTREE_PTR(&anode->btree), sec, bh); } int hpfs_ea_read(struct super_block *s, secno a, int ano, unsigned pos, @@ -388,7 +389,7 @@ void hpfs_ea_remove(struct super_block *s, secno a, int ano, unsigned len) struct buffer_head *bh; if (ano) { if (!(anode = hpfs_map_anode(s, a, &bh))) return; - hpfs_remove_btree(s, &anode->btree); + hpfs_remove_btree(s, GET_BTREE_PTR(&anode->btree)); brelse(bh); hpfs_free_sectors(s, a, 1); } else hpfs_free_sectors(s, a, (len + 511) >> 9); @@ -407,10 +408,10 @@ void hpfs_truncate_btree(struct super_block *s, secno f, int fno, unsigned secs) int c1, c2 = 0; if (fno) { if (!(fnode = hpfs_map_fnode(s, f, &bh))) return; - btree = &fnode->btree; + btree = GET_BTREE_PTR(&fnode->btree); } else { if (!(anode = hpfs_map_anode(s, f, &bh))) return; - btree = &anode->btree; + btree = GET_BTREE_PTR(&anode->btree); } if (!secs) { hpfs_remove_btree(s, btree); @@ -448,7 +449,7 @@ void hpfs_truncate_btree(struct super_block *s, secno f, int fno, unsigned secs) if (hpfs_stop_cycles(s, node, &c1, &c2, "hpfs_truncate_btree")) return; if (!(anode = hpfs_map_anode(s, node, &bh))) return; - btree = &anode->btree; + btree = GET_BTREE_PTR(&anode->btree); } nodes = btree->n_used_nodes + btree->n_free_nodes; for (i = 0; i < btree->n_used_nodes; i++) @@ -485,7 +486,7 @@ void hpfs_remove_fnode(struct super_block *s, fnode_secno fno) struct extended_attribute *ea; struct extended_attribute *ea_end; if (!(fnode = hpfs_map_fnode(s, fno, &bh))) return; - if (!fnode_is_dir(fnode)) hpfs_remove_btree(s, &fnode->btree); + if (!fnode_is_dir(fnode)) hpfs_remove_btree(s, GET_BTREE_PTR(&fnode->btree)); else hpfs_remove_dtree(s, le32_to_cpu(fnode->u.external[0].disk_secno)); ea_end = fnode_end_ea(fnode); for (ea = fnode_ea(fnode); ea < ea_end; ea = next_ea(ea)) diff --git a/fs/hpfs/ea.c b/fs/hpfs/ea.c index 102ba18e561f..2149d3ca530b 100644 --- a/fs/hpfs/ea.c +++ b/fs/hpfs/ea.c @@ -41,7 +41,7 @@ void hpfs_ea_ext_remove(struct super_block *s, secno a, int ano, unsigned len) struct buffer_head *bh; struct anode *anode; if ((anode = hpfs_map_anode(s, a, &bh))) { - hpfs_remove_btree(s, &anode->btree); + hpfs_remove_btree(s, GET_BTREE_PTR(&anode->btree)); brelse(bh); hpfs_free_sectors(s, a, 1); } diff --git a/fs/hpfs/file.c b/fs/hpfs/file.c 
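[Editor's sketch, not part of the patch] The hpfs hunks above, and the hpfs.h change below, all follow one pattern: struct bplus_header is split with struct_group_tagged() so that a fixed-size head (struct bplus_header_fixed) can be embedded in fnode and anode, the flexible-array union stays only in the full struct, and GET_BTREE_PTR() recovers the full header via container_of(). A self-contained illustration of that pattern; hdr, hdr_fixed, owner and FULL_HDR are made-up names, not from the patch:

#include <linux/build_bug.h>	/* static_assert() */
#include <linux/container_of.h>
#include <linux/stddef.h>	/* struct_group_tagged(), offsetof() */
#include <linux/types.h>

struct hdr {
	/* Fixed-size head; also emits a standalone struct hdr_fixed tag. */
	struct_group_tagged(hdr_fixed, fixed,
		u8 flags;
		__le16 first_free;
	);
	u32 slots[];	/* flexible array exists only in the full struct */
};

/* Embedding the tagged head keeps struct owner a fixed-size object. */
struct owner {
	__le32 self;
	struct hdr_fixed hdr;
	u32 storage[8];	/* backing space that slots[] will address */
};

/* Same trick as GET_BTREE_PTR(): recover the full header. */
#define FULL_HDR(ptr) container_of(ptr, struct hdr, fixed)

static_assert(offsetof(struct hdr, slots) == sizeof(struct hdr_fixed),
	      "flex array must start right after the tagged group");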
index 263b5bbe1849..29e876705369 100644 --- a/fs/hpfs/file.c +++ b/fs/hpfs/file.c @@ -51,7 +51,9 @@ static secno hpfs_bmap(struct inode *inode, unsigned file_secno, unsigned *n_sec return hpfs_inode->i_disk_sec + n; } if (!(fnode = hpfs_map_fnode(inode->i_sb, inode->i_ino, &bh))) return 0; - disk_secno = hpfs_bplus_lookup(inode->i_sb, inode, &fnode->btree, file_secno, bh); + disk_secno = hpfs_bplus_lookup(inode->i_sb, inode, + GET_BTREE_PTR(&fnode->btree), + file_secno, bh); if (disk_secno == -1) return 0; if (hpfs_chk_sectors(inode->i_sb, disk_secno, 1, "bmap")) return 0; n = file_secno - hpfs_inode->i_file_sec; diff --git a/fs/hpfs/hpfs.h b/fs/hpfs/hpfs.h index 281dec8f636b..353f73c914d9 100644 --- a/fs/hpfs/hpfs.h +++ b/fs/hpfs/hpfs.h @@ -394,27 +394,45 @@ enum { BP_binary_search = 0x40, BP_internal = 0x80 }; + +/** + * GET_BTREE_PTR() - Get a pointer to struct bplus_header + * + * Wrapper around container_of() to retrieve a pointer to struct + * bplus_header from a pointer to struct bplus_header_fixed. + * + * @ptr: Pointer to struct bplus_header_fixed. + * + */ +#define GET_BTREE_PTR(ptr) \ + container_of(ptr, struct bplus_header, __hdr) + struct bplus_header { - u8 flags; /* bit 0 - high bit of first free entry offset + /* New members MUST be added within the struct_group() macro below. */ + struct_group_tagged(bplus_header_fixed, __hdr, + u8 flags; /* bit 0 - high bit of first free entry offset bit 5 - we're pointed to by an fnode, the data btree or some ea or the main ea bootage pointer ea_secno bit 6 - suggest binary search (unused) bit 7 - 1 -> (internal) tree of anodes 0 -> (leaf) list of extents */ - u8 fill[3]; - u8 n_free_nodes; /* free nodes in following array */ - u8 n_used_nodes; /* used nodes in following array */ - __le16 first_free; /* offset from start of header to + u8 fill[3]; + u8 n_free_nodes; /* free nodes in following array */ + u8 n_used_nodes; /* used nodes in following array */ + __le16 first_free; /* offset from start of header to first free node in array */ - union { - /* (internal) 2-word entries giving subtree pointers */ - DECLARE_FLEX_ARRAY(struct bplus_internal_node, internal); - /* (external) 3-word entries giving sector runs */ - DECLARE_FLEX_ARRAY(struct bplus_leaf_node, external); - } u; + ); + union { + /* (internal) 2-word entries giving subtree pointers */ + DECLARE_FLEX_ARRAY(struct bplus_internal_node, internal); + /* (external) 3-word entries giving sector runs */ + DECLARE_FLEX_ARRAY(struct bplus_leaf_node, external); + } u; }; +static_assert(offsetof(struct bplus_header, u.internal) == sizeof(struct bplus_header_fixed), + "struct member likely outside of struct_group_tagged()"); static inline bool bp_internal(struct bplus_header *bp) { @@ -453,7 +471,7 @@ struct fnode __le16 flags; /* bit 1 set -> ea_secno is an anode */ /* bit 8 set -> directory. first & only extent points to dnode. 
*/ - struct bplus_header btree; /* b+ tree, 8 extents or 12 subtrees */ + struct bplus_header_fixed btree; /* b+ tree, 8 extents or 12 subtrees */ union { struct bplus_leaf_node external[8]; struct bplus_internal_node internal[12]; @@ -495,7 +513,7 @@ struct anode __le32 self; /* pointer to this anode */ __le32 up; /* parent anode or fnode */ - struct bplus_header btree; /* b+tree, 40 extents or 60 subtrees */ + struct bplus_header_fixed btree; /* b+tree, 40 extents or 60 subtrees */ union { struct bplus_leaf_node external[40]; struct bplus_internal_node internal[60]; diff --git a/fs/hpfs/map.c b/fs/hpfs/map.c index ecd9fccd1663..be73233502f8 100644 --- a/fs/hpfs/map.c +++ b/fs/hpfs/map.c @@ -178,14 +178,14 @@ struct fnode *hpfs_map_fnode(struct super_block *s, ino_t ino, struct buffer_hea } if (!fnode_is_dir(fnode)) { if ((unsigned)fnode->btree.n_used_nodes + (unsigned)fnode->btree.n_free_nodes != - (bp_internal(&fnode->btree) ? 12 : 8)) { + (bp_internal(GET_BTREE_PTR(&fnode->btree)) ? 12 : 8)) { hpfs_error(s, "bad number of nodes in fnode %08lx", (unsigned long)ino); goto bail; } if (le16_to_cpu(fnode->btree.first_free) != - 8 + fnode->btree.n_used_nodes * (bp_internal(&fnode->btree) ? 8 : 12)) { + 8 + fnode->btree.n_used_nodes * (bp_internal(GET_BTREE_PTR(&fnode->btree)) ? 8 : 12)) { hpfs_error(s, "bad first_free pointer in fnode %08lx", (unsigned long)ino); @@ -233,12 +233,12 @@ struct anode *hpfs_map_anode(struct super_block *s, anode_secno ano, struct buff goto bail; } if ((unsigned)anode->btree.n_used_nodes + (unsigned)anode->btree.n_free_nodes != - (bp_internal(&anode->btree) ? 60 : 40)) { + (bp_internal(GET_BTREE_PTR(&anode->btree)) ? 60 : 40)) { hpfs_error(s, "bad number of nodes in anode %08x", ano); goto bail; } if (le16_to_cpu(anode->btree.first_free) != - 8 + anode->btree.n_used_nodes * (bp_internal(&anode->btree) ? 8 : 12)) { + 8 + anode->btree.n_used_nodes * (bp_internal(GET_BTREE_PTR(&anode->btree)) ? 
8 : 12)) { hpfs_error(s, "bad first_free pointer in anode %08x", ano); goto bail; } } diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c index e3cdc421dfba..353e13a615f5 100644 --- a/fs/hpfs/namei.c +++ b/fs/hpfs/namei.c @@ -52,8 +52,10 @@ static struct dentry *hpfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, dee.fnode = cpu_to_le32(fno); dee.creation_date = dee.write_date = dee.read_date = cpu_to_le32(local_get_seconds(dir->i_sb)); result = new_inode(dir->i_sb); - if (!result) + if (!result) { + err = -ENOMEM; goto bail2; + } hpfs_init_inode(result); result->i_ino = fno; hpfs_i(result)->i_parent_dir = dir->i_ino; @@ -153,9 +155,10 @@ static int hpfs_create(struct mnt_idmap *idmap, struct inode *dir, dee.creation_date = dee.write_date = dee.read_date = cpu_to_le32(local_get_seconds(dir->i_sb)); result = new_inode(dir->i_sb); - if (!result) + if (!result) { + err = -ENOMEM; goto bail1; - + } hpfs_init_inode(result); result->i_ino = fno; result->i_mode |= S_IFREG; @@ -239,9 +242,10 @@ static int hpfs_mknod(struct mnt_idmap *idmap, struct inode *dir, dee.creation_date = dee.write_date = dee.read_date = cpu_to_le32(local_get_seconds(dir->i_sb)); result = new_inode(dir->i_sb); - if (!result) + if (!result) { + err = -ENOMEM; goto bail1; - + } hpfs_init_inode(result); result->i_ino = fno; hpfs_i(result)->i_parent_dir = dir->i_ino; @@ -314,8 +318,10 @@ static int hpfs_symlink(struct mnt_idmap *idmap, struct inode *dir, dee.creation_date = dee.write_date = dee.read_date = cpu_to_le32(local_get_seconds(dir->i_sb)); result = new_inode(dir->i_sb); - if (!result) + if (!result) { + err = -ENOMEM; goto bail1; + } result->i_ino = fno; hpfs_init_inode(result); hpfs_i(result)->i_parent_dir = dir->i_ino; diff --git a/fs/hpfs/super.c b/fs/hpfs/super.c index 42b779b4d87f..8ab85e7ac91e 100644 --- a/fs/hpfs/super.c +++ b/fs/hpfs/super.c @@ -404,15 +404,11 @@ static int hpfs_parse_param(struct fs_context *fc, struct fs_parameter *param) break; case Opt_timeshift: { - int m = 1; char *rhs = param->string; int timeshift; - if (*rhs == '-') m = -1; - if (*rhs == '+' || *rhs == '-') rhs++; - timeshift = simple_strtoul(rhs, &rhs, 0) * m; - if (*rhs) - return -EINVAL; + if (kstrtoint(rhs, 0, &timeshift)) + return -EINVAL; ctx->timeshift = timeshift; break; } diff --git a/fs/internal.h b/fs/internal.h index a33d18ee5b74..9b2b4d116880 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -53,7 +53,7 @@ extern int finish_clean_context(struct fs_context *fc); * namei.c */ extern int filename_lookup(int dfd, struct filename *name, unsigned flags, - struct path *path, struct path *root); + struct path *path, const struct path *root); int do_rmdir(int dfd, struct filename *name); int do_unlinkat(int dfd, struct filename *name); int may_linkat(struct mnt_idmap *idmap, const struct path *link); @@ -84,9 +84,9 @@ void mnt_put_write_access_file(struct file *file); extern void dissolve_on_fput(struct vfsmount *); extern bool may_mount(void); -int path_mount(const char *dev_name, struct path *path, +int path_mount(const char *dev_name, const struct path *path, const char *type_page, unsigned long flags, void *data_page); -int path_umount(struct path *path, int flags); +int path_umount(const struct path *path, int flags); int show_path(struct seq_file *m, struct dentry *root); diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index 9802b2cc29bb..5d5d63efbd57 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -433,7 +433,8 @@ static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio) bio->bi_private = 
dio; bio->bi_end_io = iomap_dio_bio_end_io; - ret = bio_iov_iter_get_bdev_pages(bio, dio->submit.iter, iomap->bdev); + ret = bio_iov_iter_get_pages(bio, dio->submit.iter, + bdev_logical_block_size(iomap->bdev) - 1); if (unlikely(ret)) { /* * We have to stop part way through an IO. We must fall diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 38861ca04899..2d0719bf6d87 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -131,7 +131,7 @@ __flush_batch(journal_t *journal, int *batch_count) blk_start_plug(&plug); for (i = 0; i < *batch_count; i++) - write_dirty_buffer(journal->j_chkpt_bhs[i], REQ_SYNC); + write_dirty_buffer(journal->j_chkpt_bhs[i], JBD2_JOURNAL_REQ_FLAGS); blk_finish_plug(&plug); for (i = 0; i < *batch_count; i++) { diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index c7867139af69..3e510564de6e 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1659,6 +1659,7 @@ int jbd2_journal_forget(handle_t *handle, struct buffer_head *bh) int drop_reserve = 0; int err = 0; int was_modified = 0; + int wait_for_writeback = 0; if (is_handle_aborted(handle)) return -EROFS; @@ -1782,18 +1783,22 @@ int jbd2_journal_forget(handle_t *handle, struct buffer_head *bh) } /* - * The buffer is still not written to disk, we should - * attach this buffer to current transaction so that the - * buffer can be checkpointed only after the current - * transaction commits. + * The buffer has not yet been written to disk. We should + * either clear the buffer or ensure that the ongoing I/O + * is completed, and attach this buffer to current + * transaction so that the buffer can be checkpointed only + * after the current transaction commits. */ clear_buffer_dirty(bh); + wait_for_writeback = 1; __jbd2_journal_file_buffer(jh, transaction, BJ_Forget); spin_unlock(&journal->j_list_lock); } drop: __brelse(bh); spin_unlock(&jh->b_state_lock); + if (wait_for_writeback) + wait_on_buffer(bh); jbd2_journal_put_journal_head(jh); if (drop_reserve) { /* no need to reserve log space for this block -bzzz */ diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index fcedeb514e14..21f3d029da7d 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -59,9 +59,15 @@ struct inode *jfs_iget(struct super_block *sb, unsigned long ino) */ inode->i_link[inode->i_size] = '\0'; } - } else { + } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || + S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { inode->i_op = &jfs_file_inode_operations; init_special_inode(inode, inode->i_mode, inode->i_rdev); + } else { + printk(KERN_DEBUG "JFS: Invalid file type 0%04o for inode %lu.\n", + inode->i_mode, inode->i_ino); + iget_failed(inode); + return ERR_PTR(-EIO); } unlock_new_inode(inode); return inode; diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c index ab11849cf9cc..0ab83bb7bbdf 100644 --- a/fs/jfs/jfs_dtree.c +++ b/fs/jfs/jfs_dtree.c @@ -2903,7 +2903,7 @@ int jfs_readdir(struct file *file, struct dir_context *ctx) stbl = DT_GETSTBL(p); for (i = index; i < p->header.nextindex; i++) { - if (stbl[i] < 0 || stbl[i] > 127) { + if (stbl[i] < 0 || stbl[i] >= DTPAGEMAXSLOT) { jfs_err("JFS: Invalid stbl[%d] = %d for inode %ld, block = %lld", i, stbl[i], (long)ip->i_ino, (long long)bn); free_page(dirent_buf); @@ -3108,7 +3108,7 @@ static int dtReadFirst(struct inode *ip, struct btstack * btstack) /* get the leftmost entry */ stbl = DT_GETSTBL(p); - if (stbl[0] < 0 || stbl[0] > 127) { + if (stbl[0] < 0 || stbl[0] >= DTPAGEMAXSLOT) { DT_PUTPAGE(mp); jfs_error(ip->i_sb, "stbl[0] out of bound\n"); 
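/*
 * [Editorial aside, not part of the patch: the jfs_dtree.c hunks here
 * replace the magic bound 127 with DTPAGEMAXSLOT. Assuming DTPAGEMAXSLOT
 * is the per-page slot count (128 in the JFS headers), "stbl[i] > 127"
 * and "stbl[i] >= DTPAGEMAXSLOT" reject exactly the same values; the gain
 * is that the bound now follows the named constant. A hypothetical helper
 * expressing the invariant:
 *
 *	static inline bool stbl_entry_valid(s8 idx)
 *	{
 *		return idx >= 0 && idx < DTPAGEMAXSLOT;
 *	}
 * ]
 */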
return -EIO; diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index 270808b6219b..b343c5ea1159 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -1199,7 +1199,6 @@ static int open_dummy_log(struct super_block *sb) init_waitqueue_head(&dummy_log->syncwait); dummy_log->no_integrity = 1; /* Make up some stuff */ - dummy_log->base = 0; dummy_log->size = 1024; rc = lmLogInit(dummy_log); if (rc) { diff --git a/fs/jfs/jfs_mount.c b/fs/jfs/jfs_mount.c index 98f9a432c336..52e6b58c5dbd 100644 --- a/fs/jfs/jfs_mount.c +++ b/fs/jfs/jfs_mount.c @@ -325,13 +325,13 @@ static int chkSuper(struct super_block *sb) if ((j_sb->s_flag & cpu_to_le32(JFS_BAD_SAIT)) != cpu_to_le32(JFS_BAD_SAIT)) { expected_AIM_bytesize = 2 * PSIZE; - AIM_bytesize = lengthPXD(&(j_sb->s_aim2)) * bsize; + AIM_bytesize = lengthPXD(&j_sb->s_aim2) * bsize; expected_AIT_bytesize = 4 * PSIZE; - AIT_bytesize = lengthPXD(&(j_sb->s_ait2)) * bsize; - AIM_byte_addr = addressPXD(&(j_sb->s_aim2)) * bsize; - AIT_byte_addr = addressPXD(&(j_sb->s_ait2)) * bsize; + AIT_bytesize = lengthPXD(&j_sb->s_ait2) * bsize; + AIM_byte_addr = addressPXD(&j_sb->s_aim2) * bsize; + AIT_byte_addr = addressPXD(&j_sb->s_ait2) * bsize; byte_addr_diff0 = AIT_byte_addr - AIM_byte_addr; - fsckwsp_addr = addressPXD(&(j_sb->s_fsckpxd)) * bsize; + fsckwsp_addr = addressPXD(&j_sb->s_fsckpxd) * bsize; byte_addr_diff1 = fsckwsp_addr - AIT_byte_addr; if ((AIM_bytesize != expected_AIM_bytesize) || (AIT_bytesize != expected_AIT_bytesize) || diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c index be17e3c43582..7840a03e5bcb 100644 --- a/fs/jfs/jfs_txnmgr.c +++ b/fs/jfs/jfs_txnmgr.c @@ -272,14 +272,15 @@ int txInit(void) if (TxBlock == NULL) return -ENOMEM; - for (k = 1; k < nTxBlock - 1; k++) { - TxBlock[k].next = k + 1; + for (k = 0; k < nTxBlock; k++) { init_waitqueue_head(&TxBlock[k].gcwait); init_waitqueue_head(&TxBlock[k].waitor); } + + for (k = 1; k < nTxBlock - 1; k++) { + TxBlock[k].next = k + 1; + } TxBlock[k].next = 0; - init_waitqueue_head(&TxBlock[k].gcwait); - init_waitqueue_head(&TxBlock[k].waitor); TxAnchor.freetid = 1; init_waitqueue_head(&TxAnchor.freewait); diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index e80262a51884..d68afa196535 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -216,8 +216,7 @@ out_err: if (warned++ == 0) printk(KERN_WARNING "lockd_up: makesock failed, error=%d\n", err); - svc_xprt_destroy_all(serv, net); - svc_rpcb_cleanup(serv, net); + svc_xprt_destroy_all(serv, net, true); return err; } @@ -255,8 +254,7 @@ static void lockd_down_net(struct svc_serv *serv, struct net *net) nlm_shutdown_hosts_net(net); cancel_delayed_work_sync(&ln->grace_period_end); locks_end_grace(&ln->lockd_manager); - svc_xprt_destroy_all(serv, net); - svc_rpcb_cleanup(serv, net); + svc_xprt_destroy_all(serv, net, true); } } else { pr_err("%s: no users! 
net=%x\n", diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index c1315df4b350..a31dc9588eb8 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -980,7 +980,7 @@ nlmsvc_grant_reply(struct nlm_cookie *cookie, __be32 status) struct file_lock *fl; int error; - dprintk("grant_reply: looking for cookie %x, s=%d \n", + dprintk("grant_reply: looking for cookie %x, s=%d\n", *(unsigned int *)(cookie->data), status); if (!(block = nlmsvc_find_block(cookie))) return; diff --git a/fs/mount.h b/fs/mount.h index 79c85639a7ba..f13a28752d0b 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -58,7 +58,10 @@ struct mount { #endif struct list_head mnt_mounts; /* list of children, anchored here */ struct list_head mnt_child; /* and going through their mnt_child */ - struct list_head mnt_instance; /* mount instance on sb->s_mounts */ + struct mount *mnt_next_for_sb; /* the next two fields are hlist_node, */ + struct mount * __aligned(1) *mnt_pprev_for_sb; + /* except that LSB of pprev is stolen */ +#define WRITE_HOLD 1 /* ... for use by mnt_hold_writers() */ const char *mnt_devname; /* Name of device e.g. /dev/dsk/hda1 */ struct list_head mnt_list; struct list_head mnt_expire; /* link in fs-specific expiry list */ @@ -148,6 +151,11 @@ static inline void get_mnt_ns(struct mnt_namespace *ns) extern seqlock_t mount_lock; +DEFINE_LOCK_GUARD_0(mount_writer, write_seqlock(&mount_lock), + write_sequnlock(&mount_lock)) +DEFINE_LOCK_GUARD_0(mount_locked_reader, read_seqlock_excl(&mount_lock), + read_sequnlock_excl(&mount_lock)) + struct proc_mounts { struct mnt_namespace *ns; struct path root; @@ -224,4 +232,33 @@ static inline void mnt_notify_add(struct mount *m) } #endif +static inline struct mount *topmost_overmount(struct mount *m) +{ + while (m->overmount) + m = m->overmount; + return m; +} + +static inline bool __test_write_hold(struct mount * __aligned(1) *val) +{ + return (unsigned long)val & WRITE_HOLD; +} + +static inline bool test_write_hold(const struct mount *m) +{ + return __test_write_hold(m->mnt_pprev_for_sb); +} + +static inline void set_write_hold(struct mount *m) +{ + m->mnt_pprev_for_sb = (void *)((unsigned long)m->mnt_pprev_for_sb + | WRITE_HOLD); +} + +static inline void clear_write_hold(struct mount *m) +{ + m->mnt_pprev_for_sb = (void *)((unsigned long)m->mnt_pprev_for_sb + & ~WRITE_HOLD); +} + struct mnt_namespace *mnt_ns_from_dentry(struct dentry *dentry); diff --git a/fs/namei.c b/fs/namei.c index 507ca0d7878d..7377020a2cba 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2695,7 +2695,7 @@ static int path_lookupat(struct nameidata *nd, unsigned flags, struct path *path } int filename_lookup(int dfd, struct filename *name, unsigned flags, - struct path *path, struct path *root) + struct path *path, const struct path *root) { int retval; struct nameidata nd; @@ -3651,8 +3651,8 @@ static struct dentry *atomic_open(struct nameidata *nd, struct dentry *dentry, if (nd->flags & LOOKUP_DIRECTORY) open_flag |= O_DIRECTORY; - file->f_path.dentry = DENTRY_NOT_SET; - file->f_path.mnt = nd->path.mnt; + file->__f_path.dentry = DENTRY_NOT_SET; + file->__f_path.mnt = nd->path.mnt; error = dir->i_op->atomic_open(dir, dentry, file, open_to_namei_flags(open_flag), mode); d_lookup_done(dentry); @@ -4020,8 +4020,8 @@ int vfs_tmpfile(struct mnt_idmap *idmap, child = d_alloc(parentpath->dentry, &slash_name); if (unlikely(!child)) return -ENOMEM; - file->f_path.mnt = parentpath->mnt; - file->f_path.dentry = child; + file->__f_path.mnt = parentpath->mnt; + file->__f_path.dentry = child; mode = 
vfs_prepare_mode(idmap, dir, mode, mode, mode); error = dir->i_op->tmpfile(idmap, dir, file, mode); dput(child); @@ -4256,7 +4256,7 @@ struct dentry *start_creating_path(int dfd, const char *pathname, } EXPORT_SYMBOL(start_creating_path); -void end_creating_path(struct path *path, struct dentry *dentry) +void end_creating_path(const struct path *path, struct dentry *dentry) { if (!IS_ERR(dentry)) dput(dentry); diff --git a/fs/namespace.c b/fs/namespace.c index dc01b14c58cd..d82910f33dc4 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -91,6 +91,14 @@ static HLIST_HEAD(unmounted); /* protected by namespace_sem */ static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */ static struct mnt_namespace *emptied_ns; /* protected by namespace_sem */ +static inline void namespace_lock(void); +static void namespace_unlock(void); +DEFINE_LOCK_GUARD_0(namespace_excl, namespace_lock(), namespace_unlock()) +DEFINE_LOCK_GUARD_0(namespace_shared, down_read(&namespace_sem), + up_read(&namespace_sem)) + +DEFINE_FREE(mntput, struct vfsmount *, if (!IS_ERR(_T)) mntput(_T)) + #ifdef CONFIG_FSNOTIFY LIST_HEAD(notify_list); /* protected by namespace_sem */ #endif @@ -363,7 +371,7 @@ out_free_cache: * mnt_want/drop_write() will _keep_ the filesystem * r/w. */ -bool __mnt_is_readonly(struct vfsmount *mnt) +bool __mnt_is_readonly(const struct vfsmount *mnt) { return (mnt->mnt_flags & MNT_READONLY) || sb_rdonly(mnt->mnt_sb); } @@ -403,7 +411,7 @@ static unsigned int mnt_get_writers(struct mount *mnt) #endif } -static int mnt_is_readonly(struct vfsmount *mnt) +static int mnt_is_readonly(const struct vfsmount *mnt) { if (READ_ONCE(mnt->mnt_sb->s_readonly_remount)) return 1; @@ -444,31 +452,31 @@ int mnt_get_write_access(struct vfsmount *m) mnt_inc_writers(mnt); /* * The store to mnt_inc_writers must be visible before we pass - * MNT_WRITE_HOLD loop below, so that the slowpath can see our - * incremented count after it has set MNT_WRITE_HOLD. + * WRITE_HOLD loop below, so that the slowpath can see our + * incremented count after it has set WRITE_HOLD. */ smp_mb(); might_lock(&mount_lock.lock); - while (READ_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) { + while (__test_write_hold(READ_ONCE(mnt->mnt_pprev_for_sb))) { if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { cpu_relax(); } else { /* * This prevents priority inversion, if the task - * setting MNT_WRITE_HOLD got preempted on a remote + * setting WRITE_HOLD got preempted on a remote * CPU, and it prevents livelock if the task setting - * MNT_WRITE_HOLD has a lower priority and is bound to + * WRITE_HOLD has a lower priority and is bound to * the same CPU as the task that is spinning here. */ preempt_enable(); - lock_mount_hash(); - unlock_mount_hash(); + read_seqlock_excl(&mount_lock); + read_sequnlock_excl(&mount_lock); preempt_disable(); } } /* * The barrier pairs with the barrier in sb_start_ro_state_change() making - * sure that if we see MNT_WRITE_HOLD cleared, we will also see + * sure that if we see WRITE_HOLD cleared, we will also see * s_readonly_remount set (or even SB_RDONLY / MNT_READONLY flags) in * mnt_is_readonly() and bail in case we are racing with remount * read-only. @@ -606,16 +614,16 @@ EXPORT_SYMBOL(mnt_drop_write_file); * a call to mnt_unhold_writers() in order to stop preventing write access to * @mnt. * - * Context: This function expects lock_mount_hash() to be held serializing - * setting MNT_WRITE_HOLD. + * Context: This function expects to be in mount_locked_reader scope serializing + * setting WRITE_HOLD. * Return: On success 0 is returned. 
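 *
 * [Editorial aside, not part of the patch: the store/check handshake the
 * comments above describe is symmetric. A simplified sketch, using the
 * names introduced by this patch:
 *
 *	// writer fast path (mnt_get_write_access)
 *	mnt_inc_writers(mnt);					// A
 *	smp_mb();						// pairs with B
 *	while (__test_write_hold(READ_ONCE(mnt->mnt_pprev_for_sb)))
 *		cpu_relax();
 *
 *	// remount-r/o path (mnt_hold_writers)
 *	set_write_hold(mnt);					// B
 *	smp_mb();						// pairs with A
 *	if (mnt_get_writers(mnt) > 0)	// any writer that passed A is seen
 *		return -EBUSY;
 * ]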
* On error, -EBUSY is returned. */ static inline int mnt_hold_writers(struct mount *mnt) { - mnt->mnt.mnt_flags |= MNT_WRITE_HOLD; + set_write_hold(mnt); /* - * After storing MNT_WRITE_HOLD, we'll read the counters. This store + * After storing WRITE_HOLD, we'll read the counters. This store * should be visible before we do. */ smp_mb(); @@ -631,9 +639,9 @@ static inline int mnt_hold_writers(struct mount *mnt) * sum up each counter, if we read a counter before it is incremented, * but then read another CPU's count which it has been subsequently * decremented from -- we would see more decrements than we should. - * MNT_WRITE_HOLD protects against this scenario, because + * WRITE_HOLD protects against this scenario, because * mnt_want_write first increments count, then smp_mb, then spins on - * MNT_WRITE_HOLD, so it can't be decremented by another CPU while + * WRITE_HOLD, so it can't be decremented by another CPU while * we're counting up here. */ if (mnt_get_writers(mnt) > 0) @@ -649,19 +657,42 @@ static inline int mnt_hold_writers(struct mount *mnt) * Stop preventing write access to @mnt allowing callers to gain write access * to @mnt again. * - * This function can only be called after a successful call to - * mnt_hold_writers(). + * This function can only be called after a call to mnt_hold_writers(). * - * Context: This function expects lock_mount_hash() to be held. + * Context: This function expects to be in the same mount_locked_reader scope + * as the matching mnt_hold_writers(). */ static inline void mnt_unhold_writers(struct mount *mnt) { + if (!test_write_hold(mnt)) + return; /* - * MNT_READONLY must become visible before ~MNT_WRITE_HOLD, so writers + * MNT_READONLY must become visible before ~WRITE_HOLD, so writers * that become unheld will see MNT_READONLY. */ smp_wmb(); - mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD; + clear_write_hold(mnt); +} + +static inline void mnt_del_instance(struct mount *m) +{ + struct mount **p = m->mnt_pprev_for_sb; + struct mount *next = m->mnt_next_for_sb; + + if (next) + next->mnt_pprev_for_sb = p; + *p = next; +} + +static inline void mnt_add_instance(struct mount *m, struct super_block *s) +{ + struct mount *first = s->s_mounts; + + if (first) + first->mnt_pprev_for_sb = &m->mnt_next_for_sb; + m->mnt_next_for_sb = first; + m->mnt_pprev_for_sb = &s->s_mounts; + s->s_mounts = m; } static int mnt_make_readonly(struct mount *mnt) @@ -677,17 +708,17 @@ static int mnt_make_readonly(struct mount *mnt) int sb_prepare_remount_readonly(struct super_block *sb) { - struct mount *mnt; int err = 0; - /* Racy optimization. Recheck the counter under MNT_WRITE_HOLD */ + /* Racy optimization. 
Recheck the counter under WRITE_HOLD */ if (atomic_long_read(&sb->s_remove_count)) return -EBUSY; - lock_mount_hash(); - list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) { - if (!(mnt->mnt.mnt_flags & MNT_READONLY)) { - err = mnt_hold_writers(mnt); + guard(mount_locked_reader)(); + + for (struct mount *m = sb->s_mounts; m; m = m->mnt_next_for_sb) { + if (!(m->mnt.mnt_flags & MNT_READONLY)) { + err = mnt_hold_writers(m); if (err) break; } @@ -697,11 +728,10 @@ int sb_prepare_remount_readonly(struct super_block *sb) if (!err) sb_start_ro_state_change(sb); - list_for_each_entry(mnt, &sb->s_mounts, mnt_instance) { - if (mnt->mnt.mnt_flags & MNT_WRITE_HOLD) - mnt->mnt.mnt_flags &= ~MNT_WRITE_HOLD; + for (struct mount *m = sb->s_mounts; m; m = m->mnt_next_for_sb) { + if (test_write_hold(m)) + clear_write_hold(m); } - unlock_mount_hash(); return err; } @@ -760,24 +790,16 @@ static bool legitimize_mnt(struct vfsmount *bastard, unsigned seq) } /** - * __lookup_mnt - find first child mount + * __lookup_mnt - mount hash lookup * @mnt: parent mount - * @dentry: mountpoint - * - * If @mnt has a child mount @c mounted @dentry find and return it. + * @dentry: dentry of mountpoint * - * Note that the child mount @c need not be unique. There are cases - * where shadow mounts are created. For example, during mount - * propagation when a source mount @mnt whose root got overmounted by a - * mount @o after path lookup but before @namespace_sem could be - * acquired gets copied and propagated. So @mnt gets copied including - * @o. When @mnt is propagated to a destination mount @d that already - * has another mount @n mounted at the same mountpoint then the source - * mount @mnt will be tucked beneath @n, i.e., @n will be mounted on - * @mnt and @mnt mounted on @d. Now both @n and @o are mounted at @mnt - * on @dentry. + * If @mnt has a child mount @c mounted on @dentry find and return it. + * Caller must either hold the spinlock component of @mount_lock or + * hold rcu_read_lock(), sample the seqcount component before the call + * and recheck it afterwards. * - * Return: The first child of @mnt mounted @dentry or NULL. + * Return: The child of @mnt mounted on @dentry or %NULL. */ struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry) { @@ -790,21 +812,12 @@ struct mount *__lookup_mnt(struct vfsmount *mnt, struct dentry *dentry) return NULL; } -/* - * lookup_mnt - Return the first child mount mounted at path - * - * "First" means first mounted chronologically. If you create the - * following mounts: - * - * mount /dev/sda1 /mnt - * mount /dev/sda2 /mnt - * mount /dev/sda3 /mnt - * - * Then lookup_mnt() on the base /mnt dentry in the root mount will - * return successively the root dentry and vfsmount of /dev/sda1, then - * /dev/sda2, then /dev/sda3, then NULL. +/** + * lookup_mnt - Return the child mount mounted at given location + * @path: location in the namespace * - * lookup_mnt takes a reference to the found vfsmount. + * Acquires and returns a new reference to mount at given location + * or %NULL if nothing is mounted there. 
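 *
 * [Editorial aside, not part of the patch: lookup_mnt() is the
 * reference-taking wrapper; the raw calling convention documented for
 * __lookup_mnt() above looks roughly like:
 *
 *	unsigned seq;
 *	struct mount *m;
 *
 *	rcu_read_lock();
 *	do {
 *		seq = read_seqbegin(&mount_lock);
 *		m = __lookup_mnt(path->mnt, path->dentry);
 *	} while (read_seqretry(&mount_lock, seq));
 *	rcu_read_unlock();
 * ]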
*/ struct vfsmount *lookup_mnt(const struct path *path) { @@ -841,22 +854,20 @@ bool __is_local_mountpoint(const struct dentry *dentry) { struct mnt_namespace *ns = current->nsproxy->mnt_ns; struct mount *mnt, *n; - bool is_covered = false; - down_read(&namespace_sem); - rbtree_postorder_for_each_entry_safe(mnt, n, &ns->mounts, mnt_node) { - is_covered = (mnt->mnt_mountpoint == dentry); - if (is_covered) - break; - } - up_read(&namespace_sem); + guard(namespace_shared)(); - return is_covered; + rbtree_postorder_for_each_entry_safe(mnt, n, &ns->mounts, mnt_node) + if (mnt->mnt_mountpoint == dentry) + return true; + + return false; } struct pinned_mountpoint { struct hlist_node node; struct mountpoint *mp; + struct mount *parent; }; static bool lookup_mountpoint(struct dentry *dentry, struct pinned_mountpoint *m) @@ -947,7 +958,7 @@ static void unpin_mountpoint(struct pinned_mountpoint *m) } } -static inline int check_mnt(struct mount *mnt) +static inline int check_mnt(const struct mount *mnt) { return mnt->mnt_ns == current->nsproxy->mnt_ns; } @@ -1149,6 +1160,20 @@ static void commit_tree(struct mount *mnt) touch_mnt_namespace(n); } +static void setup_mnt(struct mount *m, struct dentry *root) +{ + struct super_block *s = root->d_sb; + + atomic_inc(&s->s_active); + m->mnt.mnt_sb = s; + m->mnt.mnt_root = dget(root); + m->mnt_mountpoint = m->mnt.mnt_root; + m->mnt_parent = m; + + guard(mount_locked_reader)(); + mnt_add_instance(m, s); +} + /** * vfs_create_mount - Create a mount for a configured superblock * @fc: The configuration context with the superblock attached @@ -1172,15 +1197,8 @@ struct vfsmount *vfs_create_mount(struct fs_context *fc) if (fc->sb_flags & SB_KERNMOUNT) mnt->mnt.mnt_flags = MNT_INTERNAL; - atomic_inc(&fc->root->d_sb->s_active); - mnt->mnt.mnt_sb = fc->root->d_sb; - mnt->mnt.mnt_root = dget(fc->root); - mnt->mnt_mountpoint = mnt->mnt.mnt_root; - mnt->mnt_parent = mnt; + setup_mnt(mnt, fc->root); - lock_mount_hash(); - list_add_tail(&mnt->mnt_instance, &mnt->mnt.mnt_sb->s_mounts); - unlock_mount_hash(); return &mnt->mnt; } EXPORT_SYMBOL(vfs_create_mount); @@ -1221,8 +1239,7 @@ struct vfsmount *vfs_kern_mount(struct file_system_type *type, return ERR_CAST(fc); if (name) - ret = vfs_parse_fs_string(fc, "source", - name, strlen(name)); + ret = vfs_parse_fs_string(fc, "source", name); if (!ret) ret = parse_monolithic_mount_data(fc, data); if (!ret) @@ -1238,7 +1255,6 @@ EXPORT_SYMBOL_GPL(vfs_kern_mount); static struct mount *clone_mnt(struct mount *old, struct dentry *root, int flag) { - struct super_block *sb = old->mnt.mnt_sb; struct mount *mnt; int err; @@ -1263,16 +1279,9 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, if (mnt->mnt_group_id) set_mnt_shared(mnt); - atomic_inc(&sb->s_active); mnt->mnt.mnt_idmap = mnt_idmap_get(mnt_idmap(&old->mnt)); - mnt->mnt.mnt_sb = sb; - mnt->mnt.mnt_root = dget(root); - mnt->mnt_mountpoint = mnt->mnt.mnt_root; - mnt->mnt_parent = mnt; - lock_mount_hash(); - list_add_tail(&mnt->mnt_instance, &sb->s_mounts); - unlock_mount_hash(); + setup_mnt(mnt, root); if (flag & CL_PRIVATE) // we are done with it return mnt; @@ -1378,7 +1387,7 @@ static void mntput_no_expire(struct mount *mnt) mnt->mnt.mnt_flags |= MNT_DOOMED; rcu_read_unlock(); - list_del(&mnt->mnt_instance); + mnt_del_instance(mnt); if (unlikely(!list_empty(&mnt->mnt_expire))) list_del(&mnt->mnt_expire); @@ -1719,8 +1728,6 @@ static inline void namespace_lock(void) down_write(&namespace_sem); } -DEFINE_GUARD(namespace_lock, struct rw_semaphore *, 
namespace_lock(), namespace_unlock()) - enum umount_tree_flags { UMOUNT_SYNC = 1, UMOUNT_PROPAGATE = 2, @@ -1785,6 +1792,8 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how) if (how & UMOUNT_PROPAGATE) propagate_umount(&tmp_list); + bulk_make_private(&tmp_list); + while (!list_empty(&tmp_list)) { struct mnt_namespace *ns; bool disconnect; @@ -1809,7 +1818,6 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how) umount_mnt(p); } } - change_mnt_propagation(p, MS_PRIVATE); if (disconnect) hlist_add_head(&p->mnt_umount, &unmounted); @@ -1969,10 +1977,11 @@ void __detach_mounts(struct dentry *dentry) struct pinned_mountpoint mp = {}; struct mount *mnt; - namespace_lock(); - lock_mount_hash(); + guard(namespace_excl)(); + guard(mount_writer)(); + if (!lookup_mountpoint(dentry, &mp)) - goto out_unlock; + return; event++; while (mp.node.next) { @@ -1984,9 +1993,6 @@ void __detach_mounts(struct dentry *dentry) else umount_tree(mnt, UMOUNT_CONNECTED); } unpin_mountpoint(&mp); -out_unlock: - unlock_mount_hash(); - namespace_unlock(); } /* @@ -2025,7 +2031,7 @@ static int can_umount(const struct path *path, int flags) } // caller is responsible for flags being sane -int path_umount(struct path *path, int flags) +int path_umount(const struct path *path, int flags) { struct mount *mnt = real_mount(path->mnt); int ret; @@ -2238,7 +2244,7 @@ static inline bool extend_array(struct path **res, struct path **to_free, return p; } -struct path *collect_paths(const struct path *path, +const struct path *collect_paths(const struct path *path, struct path *prealloc, unsigned count) { struct mount *root = real_mount(path->mnt); @@ -2246,7 +2252,7 @@ struct path *collect_paths(const struct path *path, struct path *res = prealloc, *to_free = NULL; unsigned n = 0; - guard(rwsem_read)(&namespace_sem); + guard(namespace_shared)(); if (!check_mnt(root)) return ERR_PTR(-EINVAL); @@ -2272,9 +2278,9 @@ struct path *collect_paths(const struct path *path, return res; } -void drop_collected_paths(struct path *paths, struct path *prealloc) +void drop_collected_paths(const struct path *paths, const struct path *prealloc) { - for (struct path *p = paths; p->mnt; p++) + for (const struct path *p = paths; p->mnt; p++) path_put(p); if (paths != prealloc) kfree(paths); @@ -2301,7 +2307,7 @@ void dissolve_on_fput(struct vfsmount *mnt) return; } - scoped_guard(namespace_lock, &namespace_sem) { + scoped_guard(namespace_excl) { if (!anon_ns_root(m)) return; @@ -2312,6 +2318,7 @@ void dissolve_on_fput(struct vfsmount *mnt) } } +/* locks: namespace_shared && pinned(mnt) || mount_locked_reader */ static bool __has_locked_children(struct mount *mnt, struct dentry *dentry) { struct mount *child; @@ -2328,12 +2335,8 @@ static bool __has_locked_children(struct mount *mnt, struct dentry *dentry) bool has_locked_children(struct mount *mnt, struct dentry *dentry) { - bool res; - - read_seqlock_excl(&mount_lock); - res = __has_locked_children(mnt, dentry); - read_sequnlock_excl(&mount_lock); - return res; + guard(mount_locked_reader)(); + return __has_locked_children(mnt, dentry); } /* @@ -2341,21 +2344,15 @@ bool has_locked_children(struct mount *mnt, struct dentry *dentry) * specified subtree. Such references can act as pins for mount namespaces * that aren't checked by the mount-cycle checking code, thereby allowing * cycles to be made. 
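 *
 * [Editorial aside, not part of the patch: the "locks:" annotations refer
 * to the scope guards built with DEFINE_LOCK_GUARD_0 in fs/mount.h and
 * fs/namespace.c. Via <linux/cleanup.h> they are used as, e.g.:
 *
 *	guard(mount_locked_reader)();	// read_seqlock_excl(&mount_lock);
 *	...				// read_sequnlock_excl() runs on
 *					// every exit from the scope
 *
 *	scoped_guard(namespace_shared)	// down_read(&namespace_sem) around
 *		ret = do_statmount(...);	// just this statement
 * ]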
+ * + * locks: mount_locked_reader || namespace_shared && pinned(subtree) */ static bool check_for_nsfs_mounts(struct mount *subtree) { - struct mount *p; - bool ret = false; - - lock_mount_hash(); - for (p = subtree; p; p = next_mnt(p, subtree)) + for (struct mount *p = subtree; p; p = next_mnt(p, subtree)) if (mnt_ns_loop(p->mnt.mnt_root)) - goto out; - - ret = true; -out: - unlock_mount_hash(); - return ret; + return false; + return true; } /** @@ -2375,7 +2372,7 @@ struct vfsmount *clone_private_mount(const struct path *path) struct mount *old_mnt = real_mount(path->mnt); struct mount *new_mnt; - guard(rwsem_read)(&namespace_sem); + guard(namespace_shared)(); if (IS_MNT_UNBINDABLE(old_mnt)) return ERR_PTR(-EINVAL); @@ -2496,8 +2493,7 @@ enum mnt_tree_flags_t { /** * attach_recursive_mnt - attach a source mount tree * @source_mnt: mount tree to be attached - * @dest_mnt: mount that @source_mnt will be mounted on - * @dest_mp: the mountpoint @source_mnt will be mounted at + * @dest: the context for mounting at the place where the tree should go * * NOTE: in the table below explains the semantics when a source mount * of a given type is attached to a destination mount of a given type. @@ -2560,10 +2556,11 @@ enum mnt_tree_flags_t { * Otherwise a negative error code is returned. */ static int attach_recursive_mnt(struct mount *source_mnt, - struct mount *dest_mnt, - struct mountpoint *dest_mp) + const struct pinned_mountpoint *dest) { struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns; + struct mount *dest_mnt = dest->parent; + struct mountpoint *dest_mp = dest->mp; HLIST_HEAD(tree_list); struct mnt_namespace *ns = dest_mnt->mnt_ns; struct pinned_mountpoint root = {}; @@ -2643,10 +2640,9 @@ static int attach_recursive_mnt(struct mount *source_mnt, child->mnt_mountpoint); commit_tree(child); if (q) { + struct mount *r = topmost_overmount(child); struct mountpoint *mp = root.mp; - struct mount *r = child; - while (unlikely(r->overmount)) - r = r->overmount; + if (unlikely(shorter) && child != source_mnt) mp = shorter; mnt_change_mountpoint(r, mp, q); @@ -2675,110 +2671,120 @@ static int attach_recursive_mnt(struct mount *source_mnt, return err; } +static inline struct mount *where_to_mount(const struct path *path, + struct dentry **dentry, + bool beneath) +{ + struct mount *m; + + if (unlikely(beneath)) { + m = topmost_overmount(real_mount(path->mnt)); + *dentry = m->mnt_mountpoint; + return m->mnt_parent; + } + m = __lookup_mnt(path->mnt, path->dentry); + if (unlikely(m)) { + m = topmost_overmount(m); + *dentry = m->mnt.mnt_root; + return m; + } + *dentry = path->dentry; + return real_mount(path->mnt); +} + /** - * do_lock_mount - lock mount and mountpoint - * @path: target path - * @beneath: whether the intention is to mount beneath @path - * - * Follow the mount stack on @path until the top mount @mnt is found. If - * the initial @path->{mnt,dentry} is a mountpoint lookup the first - * mount stacked on top of it. Then simply follow @{mnt,mnt->mnt_root} - * until nothing is stacked on top of it anymore. + * do_lock_mount - acquire environment for mounting + * @path: target path + * @res: context to set up + * @beneath: whether the intention is to mount beneath @path * - * Acquire the inode_lock() on the top mount's ->mnt_root to protect - * against concurrent removal of the new mountpoint from another mount - * namespace. 
+ * To mount something at given location, we need + * namespace_sem locked exclusive + * inode of dentry we are mounting on locked exclusive + * struct mountpoint for that dentry + * struct mount we are mounting on * - * If @beneath is requested, acquire inode_lock() on @mnt's mountpoint - * @mp on @mnt->mnt_parent must be acquired. This protects against a - * concurrent unlink of @mp->mnt_dentry from another mount namespace - * where @mnt doesn't have a child mount mounted @mp. A concurrent - * removal of @mnt->mnt_root doesn't matter as nothing will be mounted - * on top of it for @beneath. + * Results are stored in caller-supplied context (pinned_mountpoint); + * on success we have res->parent and res->mp pointing to parent and + * mountpoint respectively and res->node inserted into the ->m_list + * of the mountpoint, making sure the mountpoint won't disappear. + * On failure we have res->parent set to ERR_PTR(-E...), res->mp + * left NULL, res->node - empty. + * In case of success do_lock_mount returns with locks acquired (in + * proper order - inode lock nests outside of namespace_sem). * - * In addition, @beneath needs to make sure that @mnt hasn't been - * unmounted or moved from its current mountpoint in between dropping - * @mount_lock and acquiring @namespace_sem. For the !@beneath case @mnt - * being unmounted would be detected later by e.g., calling - * check_mnt(mnt) in the function it's called from. For the @beneath - * case however, it's useful to detect it directly in do_lock_mount(). - * If @mnt hasn't been unmounted then @mnt->mnt_mountpoint still points - * to @mnt->mnt_mp->m_dentry. But if @mnt has been unmounted it will - * point to @mnt->mnt_root and @mnt->mnt_mp will be NULL. + * Request to mount on overmounted location is treated as "mount on + * top of whatever's overmounting it"; request to mount beneath + * a location - "mount immediately beneath the topmost mount at that + * place". * - * Return: Either the target mountpoint on the top mount or the top - * mount's mountpoint. + * In all cases the location must not have been unmounted and the + * chosen mountpoint must be allowed to be mounted on. For "beneath" + * case we also require the location to be at the root of a mount + * that has a parent (i.e. is not a root of some namespace). */ -static int do_lock_mount(struct path *path, struct pinned_mountpoint *pinned, bool beneath) +static void do_lock_mount(const struct path *path, + struct pinned_mountpoint *res, + bool beneath) { - struct vfsmount *mnt = path->mnt; - struct dentry *dentry; - struct path under = {}; - int err = -ENOENT; + int err; - for (;;) { - struct mount *m = real_mount(mnt); + if (unlikely(beneath) && !path_mounted(path)) { + res->parent = ERR_PTR(-EINVAL); + return; + } - if (beneath) { - path_put(&under); - read_seqlock_excl(&mount_lock); - under.mnt = mntget(&m->mnt_parent->mnt); - under.dentry = dget(m->mnt_mountpoint); - read_sequnlock_excl(&mount_lock); - dentry = under.dentry; - } else { - dentry = path->dentry; + do { + struct dentry *dentry, *d; + struct mount *m, *n; + + scoped_guard(mount_locked_reader) { + m = where_to_mount(path, &dentry, beneath); + if (&m->mnt != path->mnt) { + mntget(&m->mnt); + dget(dentry); + } } inode_lock(dentry->d_inode); namespace_lock(); - if (unlikely(cant_mount(dentry) || !is_mounted(mnt))) - break; // not to be mounted on + // check if the chain of mounts (if any) has changed. 
+ scoped_guard(mount_locked_reader) + n = where_to_mount(path, &d, beneath); - if (beneath && unlikely(m->mnt_mountpoint != dentry || - &m->mnt_parent->mnt != under.mnt)) { - namespace_unlock(); - inode_unlock(dentry->d_inode); - continue; // got moved - } + if (unlikely(n != m || dentry != d)) + err = -EAGAIN; // something moved, retry + else if (unlikely(cant_mount(dentry) || !is_mounted(path->mnt))) + err = -ENOENT; // not to be mounted on + else if (beneath && &m->mnt == path->mnt && !m->overmount) + err = -EINVAL; + else + err = get_mountpoint(dentry, res); - mnt = lookup_mnt(path); - if (unlikely(mnt)) { + if (unlikely(err)) { + res->parent = ERR_PTR(err); namespace_unlock(); inode_unlock(dentry->d_inode); - path_put(path); - path->mnt = mnt; - path->dentry = dget(mnt->mnt_root); - continue; // got overmounted + } else { + res->parent = m; } - err = get_mountpoint(dentry, pinned); - if (err) - break; - if (beneath) { - /* - * @under duplicates the references that will stay - * at least until namespace_unlock(), so the path_put() - * below is safe (and OK to do under namespace_lock - - * we are not dropping the final references here). - */ - path_put(&under); + /* + * Drop the temporary references. This is subtle - on success + * we are doing that under namespace_sem, which would normally + * be forbidden. However, in that case we are guaranteed that + * refcounts won't reach zero, since we know that path->mnt + * is mounted and thus all mounts reachable from it are pinned + * and stable, along with their mountpoints and roots. + */ + if (&m->mnt != path->mnt) { + dput(dentry); + mntput(&m->mnt); } - return 0; - } - namespace_unlock(); - inode_unlock(dentry->d_inode); - if (beneath) - path_put(&under); - return err; -} - -static inline int lock_mount(struct path *path, struct pinned_mountpoint *m) -{ - return do_lock_mount(path, m, false); + } while (err == -EAGAIN); } -static void unlock_mount(struct pinned_mountpoint *m) +static void __unlock_mount(struct pinned_mountpoint *m) { inode_unlock(m->mp->m_dentry->d_inode); read_seqlock_excl(&mount_lock); @@ -2787,16 +2793,30 @@ static void unlock_mount(struct pinned_mountpoint *m) namespace_unlock(); } -static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp) +static inline void unlock_mount(struct pinned_mountpoint *m) +{ + if (!IS_ERR(m->parent)) + __unlock_mount(m); +} + +#define LOCK_MOUNT_MAYBE_BENEATH(mp, path, beneath) \ + struct pinned_mountpoint mp __cleanup(unlock_mount) = {}; \ + do_lock_mount((path), &mp, (beneath)) +#define LOCK_MOUNT(mp, path) LOCK_MOUNT_MAYBE_BENEATH(mp, (path), false) +#define LOCK_MOUNT_EXACT(mp, path) \ + struct pinned_mountpoint mp __cleanup(unlock_mount) = {}; \ + lock_mount_exact((path), &mp) + +static int graft_tree(struct mount *mnt, const struct pinned_mountpoint *mp) { if (mnt->mnt.mnt_sb->s_flags & SB_NOUSER) return -EINVAL; - if (d_is_dir(mp->m_dentry) != + if (d_is_dir(mp->mp->m_dentry) != d_is_dir(mnt->mnt.mnt_root)) return -ENOTDIR; - return attach_recursive_mnt(mnt, p, mp); + return attach_recursive_mnt(mnt, mp); } static int may_change_propagation(const struct mount *m) @@ -2832,13 +2852,13 @@ static int flags_to_propagation_type(int ms_flags) /* * recursively change the type of the mountpoint. 
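 *
 * [Editorial aside, not part of the patch: the LOCK_MOUNT*() macros
 * defined above bundle declaration, locking and cleanup; the pattern the
 * converted callers follow is roughly:
 *
 *	LOCK_MOUNT(mp, path);		// declares struct pinned_mountpoint mp
 *	if (IS_ERR(mp.parent))		// on failure no locks are held
 *		return PTR_ERR(mp.parent);
 *	...				// namespace_sem + mountpoint inode held
 *	return graft_tree(mnt, &mp);	// __cleanup(unlock_mount) drops both
 * ]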
*/ -static int do_change_type(struct path *path, int ms_flags) +static int do_change_type(const struct path *path, int ms_flags) { struct mount *m; struct mount *mnt = real_mount(path->mnt); int recurse = ms_flags & MS_REC; int type; - int err = 0; + int err; if (!path_mounted(path)) return -EINVAL; @@ -2847,23 +2867,22 @@ static int do_change_type(struct path *path, int ms_flags) if (!type) return -EINVAL; - namespace_lock(); + guard(namespace_excl)(); + err = may_change_propagation(mnt); if (err) - goto out_unlock; + return err; if (type == MS_SHARED) { err = invent_group_ids(mnt, recurse); if (err) - goto out_unlock; + return err; } for (m = mnt; m; m = (recurse ? next_mnt(m, mnt) : NULL)) change_mnt_propagation(m, type); - out_unlock: - namespace_unlock(); - return err; + return 0; } /* may_copy_tree() - check if a mount tree can be copied @@ -2909,7 +2928,7 @@ static int do_change_type(struct path *path, int ms_flags) * * Returns true if the mount tree can be copied, false otherwise. */ -static inline bool may_copy_tree(struct path *path) +static inline bool may_copy_tree(const struct path *path) { struct mount *mnt = real_mount(path->mnt); const struct dentry_operations *d_op; @@ -2931,7 +2950,7 @@ static inline bool may_copy_tree(struct path *path) } -static struct mount *__do_loopback(struct path *old_path, int recurse) +static struct mount *__do_loopback(const struct path *old_path, int recurse) { struct mount *old = real_mount(old_path->mnt); @@ -2953,12 +2972,11 @@ static struct mount *__do_loopback(struct path *old_path, int recurse) /* * do loopback mount. */ -static int do_loopback(struct path *path, const char *old_name, - int recurse) +static int do_loopback(const struct path *path, const char *old_name, + int recurse) { - struct path old_path; - struct mount *mnt = NULL, *parent; - struct pinned_mountpoint mp = {}; + struct path old_path __free(path_put) = {}; + struct mount *mnt = NULL; int err; if (!old_name || !*old_name) return -EINVAL; @@ -2966,49 +2984,40 @@ static int do_loopback(struct path *path, const char *old_name, if (err) return err; - err = -EINVAL; if (mnt_ns_loop(old_path.dentry)) - goto out; + return -EINVAL; - err = lock_mount(path, &mp); - if (err) - goto out; + LOCK_MOUNT(mp, path); + if (IS_ERR(mp.parent)) + return PTR_ERR(mp.parent); - parent = real_mount(path->mnt); - if (!check_mnt(parent)) - goto out2; + if (!check_mnt(mp.parent)) + return -EINVAL; mnt = __do_loopback(&old_path, recurse); - if (IS_ERR(mnt)) { - err = PTR_ERR(mnt); - goto out2; - } + if (IS_ERR(mnt)) + return PTR_ERR(mnt); - err = graft_tree(mnt, parent, mp.mp); + err = graft_tree(mnt, &mp); if (err) { lock_mount_hash(); umount_tree(mnt, UMOUNT_SYNC); unlock_mount_hash(); } -out2: - unlock_mount(&mp); -out: - path_put(&old_path); return err; } -static struct file *open_detached_copy(struct path *path, bool recursive) +static struct mnt_namespace *get_detached_copy(const struct path *path, bool recursive) { struct mnt_namespace *ns, *mnt_ns = current->nsproxy->mnt_ns, *src_mnt_ns; struct user_namespace *user_ns = mnt_ns->user_ns; struct mount *mnt, *p; - struct file *file; ns = alloc_mnt_ns(user_ns, true); if (IS_ERR(ns)) - return ERR_CAST(ns); + return ns; - namespace_lock(); + guard(namespace_excl)(); /* * Record the sequence number of the source mount namespace. 
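/*
 * [Editorial aside, not part of the patch: get_detached_copy() /
 * open_detached_copy() are the backend of open_tree(2) with
 * OPEN_TREE_CLONE; the copy lives in an anonymous mount namespace and is
 * dissolved when the last reference to the O_PATH file is dropped. A
 * hedged userspace illustration:
 *
 *	int fd = syscall(SYS_open_tree, AT_FDCWD, "/mnt/data",
 *			 OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC | AT_RECURSIVE);
 *	// fd pins a detached copy of the subtree; attach it elsewhere
 *	// with move_mount(2), or just close(fd) to dissolve it.
 * ]
 */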
@@ -3025,23 +3034,28 @@ static struct file *open_detached_copy(struct path *path, bool recursive) mnt = __do_loopback(path, recursive); if (IS_ERR(mnt)) { - namespace_unlock(); - free_mnt_ns(ns); + emptied_ns = ns; return ERR_CAST(mnt); } - lock_mount_hash(); for (p = mnt; p; p = next_mnt(p, mnt)) { mnt_add_to_ns(ns, p); ns->nr_mounts++; } ns->root = mnt; - mntget(&mnt->mnt); - unlock_mount_hash(); - namespace_unlock(); + return ns; +} + +static struct file *open_detached_copy(struct path *path, bool recursive) +{ + struct mnt_namespace *ns = get_detached_copy(path, recursive); + struct file *file; + + if (IS_ERR(ns)) + return ERR_CAST(ns); mntput(path->mnt); - path->mnt = &mnt->mnt; + path->mnt = mntget(&ns->root->mnt); file = dentry_open(path, O_PATH, current_cred()); if (IS_ERR(file)) dissolve_on_fput(path->mnt); @@ -3158,7 +3172,8 @@ static void set_mount_attributes(struct mount *mnt, unsigned int mnt_flags) touch_mnt_namespace(mnt->mnt_ns); } -static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount *mnt) +static void mnt_warn_timestamp_expiry(const struct path *mountpoint, + struct vfsmount *mnt) { struct super_block *sb = mnt->mnt_sb; @@ -3192,7 +3207,7 @@ static void mnt_warn_timestamp_expiry(struct path *mountpoint, struct vfsmount * * superblock it refers to. This is triggered by specifying MS_REMOUNT|MS_BIND * to mount(2). */ -static int do_reconfigure_mnt(struct path *path, unsigned int mnt_flags) +static int do_reconfigure_mnt(const struct path *path, unsigned int mnt_flags) { struct super_block *sb = path->mnt->mnt_sb; struct mount *mnt = real_mount(path->mnt); @@ -3229,7 +3244,7 @@ static int do_reconfigure_mnt(struct path *path, unsigned int mnt_flags) * If you've mounted a non-root directory somewhere and want to do remount * on it - tough luck. 
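 *
 * [Editorial aside, not part of the patch: from userspace the two remount
 * flavours routed to do_reconfigure_mnt() vs. do_remount() look like:
 *
 *	// per-mountpoint flags only (MS_REMOUNT|MS_BIND)
 *	mount(NULL, "/mnt", NULL, MS_REMOUNT | MS_BIND | MS_RDONLY, NULL);
 *
 *	// reconfigure the superblock itself (affects all its mounts)
 *	mount(NULL, "/mnt", NULL, MS_REMOUNT | MS_RDONLY, NULL);
 * ]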
*/ -static int do_remount(struct path *path, int sb_flags, +static int do_remount(const struct path *path, int sb_flags, int mnt_flags, void *data) { int err; @@ -3287,49 +3302,46 @@ static inline int tree_contains_unbindable(struct mount *mnt) return 0; } -static int do_set_group(struct path *from_path, struct path *to_path) +static int do_set_group(const struct path *from_path, const struct path *to_path) { - struct mount *from, *to; + struct mount *from = real_mount(from_path->mnt); + struct mount *to = real_mount(to_path->mnt); int err; - from = real_mount(from_path->mnt); - to = real_mount(to_path->mnt); - - namespace_lock(); + guard(namespace_excl)(); err = may_change_propagation(from); if (err) - goto out; + return err; err = may_change_propagation(to); if (err) - goto out; + return err; - err = -EINVAL; /* To and From paths should be mount roots */ if (!path_mounted(from_path)) - goto out; + return -EINVAL; if (!path_mounted(to_path)) - goto out; + return -EINVAL; /* Setting sharing groups is only allowed across same superblock */ if (from->mnt.mnt_sb != to->mnt.mnt_sb) - goto out; + return -EINVAL; /* From mount root should be wider than To mount root */ if (!is_subdir(to->mnt.mnt_root, from->mnt.mnt_root)) - goto out; + return -EINVAL; /* From mount should not have locked children in place of To's root */ if (__has_locked_children(from, to->mnt.mnt_root)) - goto out; + return -EINVAL; /* Setting sharing groups is only allowed on private mounts */ if (IS_MNT_SHARED(to) || IS_MNT_SLAVE(to)) - goto out; + return -EINVAL; /* From should not be private */ if (!IS_MNT_SHARED(from) && !IS_MNT_SLAVE(from)) - goto out; + return -EINVAL; if (IS_MNT_SLAVE(from)) { hlist_add_behind(&to->mnt_slave, &from->mnt_slave); @@ -3341,11 +3353,7 @@ static int do_set_group(struct path *from_path, struct path *to_path) list_add(&to->mnt_share, &from->mnt_share); set_mnt_shared(to); } - - err = 0; -out: - namespace_unlock(); - return err; + return 0; } /** @@ -3389,17 +3397,15 @@ static bool mount_is_ancestor(const struct mount *p1, const struct mount *p2) /** * can_move_mount_beneath - check that we can mount beneath the top mount - * @from: mount to mount beneath - * @to: mount under which to mount - * @mp: mountpoint of @to + * @mnt_from: mount we are trying to move + * @mnt_to: mount under which to mount + * @mp: mountpoint of @mnt_to * - * - Make sure that @to->dentry is actually the root of a mount under - * which we can mount another mount. * - Make sure that nothing can be mounted beneath the caller's current * root or the rootfs of the namespace. * - Make sure that the caller can unmount the topmost mount ensuring * that the caller could reveal the underlying mountpoint. - * - Ensure that nothing has been mounted on top of @from before we + * - Ensure that nothing has been mounted on top of @mnt_from before we * grabbed @namespace_sem to avoid creating pointless shadow mounts. * - Prevent mounting beneath a mount if the propagation relationship * between the source mount, parent mount, and top mount would lead to @@ -3408,25 +3414,17 @@ static bool mount_is_ancestor(const struct mount *p1, const struct mount *p2) * Context: This function expects namespace_lock() to be held. * Return: On success 0, and on error a negative error code is returned. 
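 *
 * [Editorial aside, not part of the patch: the beneath case corresponds
 * to move_mount(2) with MOVE_MOUNT_BENEATH, roughly:
 *
 *	syscall(SYS_move_mount, from_dfd, "", to_dfd, "",
 *		MOVE_MOUNT_F_EMPTY_PATH | MOVE_MOUNT_T_EMPTY_PATH |
 *		MOVE_MOUNT_BENEATH);
 *
 * i.e. the source is inserted under the topmost mount at the target, so
 * that unmounting the top reveals it.]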
*/ -static int can_move_mount_beneath(const struct path *from, - const struct path *to, +static int can_move_mount_beneath(const struct mount *mnt_from, + const struct mount *mnt_to, const struct mountpoint *mp) { - struct mount *mnt_from = real_mount(from->mnt), - *mnt_to = real_mount(to->mnt), - *parent_mnt_to = mnt_to->mnt_parent; - - if (!mnt_has_parent(mnt_to)) - return -EINVAL; - - if (!path_mounted(to)) - return -EINVAL; + struct mount *parent_mnt_to = mnt_to->mnt_parent; if (IS_MNT_LOCKED(mnt_to)) return -EINVAL; /* Avoid creating shadow mounts during mount propagation. */ - if (path_overmounted(from)) + if (mnt_from->overmount) return -EINVAL; /* @@ -3517,97 +3515,83 @@ static inline bool may_use_mount(struct mount *mnt) return check_anonymous_mnt(mnt); } -static int do_move_mount(struct path *old_path, - struct path *new_path, enum mnt_tree_flags_t flags) +static int do_move_mount(const struct path *old_path, + const struct path *new_path, + enum mnt_tree_flags_t flags) { - struct mnt_namespace *ns; - struct mount *p; - struct mount *old; - struct mount *parent; - struct pinned_mountpoint mp; + struct mount *old = real_mount(old_path->mnt); int err; bool beneath = flags & MNT_TREE_BENEATH; - err = do_lock_mount(new_path, &mp, beneath); - if (err) - return err; + if (!path_mounted(old_path)) + return -EINVAL; - old = real_mount(old_path->mnt); - p = real_mount(new_path->mnt); - parent = old->mnt_parent; - ns = old->mnt_ns; + if (d_is_dir(new_path->dentry) != d_is_dir(old_path->dentry)) + return -EINVAL; - err = -EINVAL; + LOCK_MOUNT_MAYBE_BENEATH(mp, new_path, beneath); + if (IS_ERR(mp.parent)) + return PTR_ERR(mp.parent); if (check_mnt(old)) { /* if the source is in our namespace... */ /* ... it should be detachable from parent */ if (!mnt_has_parent(old) || IS_MNT_LOCKED(old)) - goto out; + return -EINVAL; + /* ... which should not be shared */ + if (IS_MNT_SHARED(old->mnt_parent)) + return -EINVAL; /* ... and the target should be in our namespace */ - if (!check_mnt(p)) - goto out; - /* parent of the source should not be shared */ - if (IS_MNT_SHARED(parent)) - goto out; + if (!check_mnt(mp.parent)) + return -EINVAL; } else { /* * otherwise the source must be the root of some anon namespace. */ if (!anon_ns_root(old)) - goto out; + return -EINVAL; /* * Bail out early if the target is within the same namespace - * subsequent checks would've rejected that, but they lose * some corner cases if we check it early. */ - if (ns == p->mnt_ns) - goto out; + if (old->mnt_ns == mp.parent->mnt_ns) + return -EINVAL; /* * Target should be either in our namespace or in an acceptable * anon namespace, sensu check_anonymous_mnt(). */ - if (!may_use_mount(p)) - goto out; + if (!may_use_mount(mp.parent)) + return -EINVAL; } - if (!path_mounted(old_path)) - goto out; - - if (d_is_dir(new_path->dentry) != - d_is_dir(old_path->dentry)) - goto out; - if (beneath) { - err = can_move_mount_beneath(old_path, new_path, mp.mp); - if (err) - goto out; + struct mount *over = real_mount(new_path->mnt); - err = -EINVAL; - p = p->mnt_parent; + if (mp.parent != over->mnt_parent) + over = mp.parent->overmount; + err = can_move_mount_beneath(old, over, mp.mp); + if (err) + return err; } /* * Don't move a mount tree containing unbindable mounts to a destination * mount which is shared. 
*/ - if (IS_MNT_SHARED(p) && tree_contains_unbindable(old)) - goto out; - err = -ELOOP; + if (IS_MNT_SHARED(mp.parent) && tree_contains_unbindable(old)) + return -EINVAL; if (!check_for_nsfs_mounts(old)) - goto out; - if (mount_is_ancestor(old, p)) - goto out; + return -ELOOP; + if (mount_is_ancestor(old, mp.parent)) + return -ELOOP; - err = attach_recursive_mnt(old, p, mp.mp); -out: - unlock_mount(&mp); - return err; + return attach_recursive_mnt(old, &mp); } -static int do_move_mount_old(struct path *path, const char *old_name) +static int do_move_mount_old(const struct path *path, const char *old_name) { - struct path old_path; + struct path old_path __free(path_put) = {}; int err; if (!old_name || !*old_name) @@ -3617,18 +3601,19 @@ static int do_move_mount_old(struct path *path, const char *old_name) if (err) return err; - err = do_move_mount(&old_path, path, 0); - path_put(&old_path); - return err; + return do_move_mount(&old_path, path, 0); } /* * add a mount into a namespace's mount tree */ -static int do_add_mount(struct mount *newmnt, struct mountpoint *mp, - const struct path *path, int mnt_flags) +static int do_add_mount(struct mount *newmnt, const struct pinned_mountpoint *mp, + int mnt_flags) { - struct mount *parent = real_mount(path->mnt); + struct mount *parent = mp->parent; + + if (IS_ERR(parent)) + return PTR_ERR(parent); mnt_flags &= ~MNT_INTERNAL_FLAGS; @@ -3642,14 +3627,15 @@ static int do_add_mount(struct mount *newmnt, struct mountpoint *mp, } /* Refuse the same filesystem on the same mount point */ - if (path->mnt->mnt_sb == newmnt->mnt.mnt_sb && path_mounted(path)) + if (parent->mnt.mnt_sb == newmnt->mnt.mnt_sb && + parent->mnt.mnt_root == mp->mp->m_dentry) return -EBUSY; if (d_is_symlink(newmnt->mnt.mnt_root)) return -EINVAL; newmnt->mnt.mnt_flags = mnt_flags; - return graft_tree(newmnt, parent, mp); + return graft_tree(newmnt, mp); } static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags); @@ -3658,41 +3644,32 @@ static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags * Create a new mount using a superblock configuration and request it * be added to the namespace tree. 
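 *
 * [Editorial aside, not part of the patch: the rewrite below leans on
 * fc_mount(), which - quoted from memory, treat as a sketch - combines
 * the vfs_get_tree() + vfs_create_mount() steps that used to be
 * open-coded here:
 *
 *	struct vfsmount *fc_mount(struct fs_context *fc)
 *	{
 *		int err = vfs_get_tree(fc);
 *		if (!err) {
 *			up_write(&fc->root->d_sb->s_umount);
 *			return vfs_create_mount(fc);
 *		}
 *		return ERR_PTR(err);
 *	}
 * ]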
*/ -static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint, +static int do_new_mount_fc(struct fs_context *fc, const struct path *mountpoint, unsigned int mnt_flags) { - struct vfsmount *mnt; - struct pinned_mountpoint mp = {}; - struct super_block *sb = fc->root->d_sb; + struct super_block *sb; + struct vfsmount *mnt __free(mntput) = fc_mount(fc); int error; - error = security_sb_kern_mount(sb); - if (!error && mount_too_revealing(sb, &mnt_flags)) { - errorfcp(fc, "VFS", "Mount too revealing"); - error = -EPERM; - } + if (IS_ERR(mnt)) + return PTR_ERR(mnt); - if (unlikely(error)) { - fc_drop_locked(fc); + sb = fc->root->d_sb; + error = security_sb_kern_mount(sb); + if (unlikely(error)) return error; - } - up_write(&sb->s_umount); - - mnt = vfs_create_mount(fc); - if (IS_ERR(mnt)) - return PTR_ERR(mnt); + if (unlikely(mount_too_revealing(sb, &mnt_flags))) { + errorfcp(fc, "VFS", "Mount too revealing"); + return -EPERM; + } mnt_warn_timestamp_expiry(mountpoint, mnt); - error = lock_mount(mountpoint, &mp); - if (!error) { - error = do_add_mount(real_mount(mnt), mp.mp, - mountpoint, mnt_flags); - unlock_mount(&mp); - } - if (error < 0) - mntput(mnt); + LOCK_MOUNT(mp, mountpoint); + error = do_add_mount(real_mount(mnt), &mp, mnt_flags); + if (!error) + retain_and_null_ptr(mnt); // consumed on success return error; } @@ -3700,8 +3677,9 @@ static int do_new_mount_fc(struct fs_context *fc, struct path *mountpoint, * create a new mount for userspace and request it to be added into the * namespace's tree */ -static int do_new_mount(struct path *path, const char *fstype, int sb_flags, - int mnt_flags, const char *name, void *data) +static int do_new_mount(const struct path *path, const char *fstype, + int sb_flags, int mnt_flags, + const char *name, void *data) { struct file_system_type *type; struct fs_context *fc; @@ -3738,27 +3716,46 @@ static int do_new_mount(struct path *path, const char *fstype, int sb_flags, fc->oldapi = true; if (subtype) - err = vfs_parse_fs_string(fc, "subtype", - subtype, strlen(subtype)); + err = vfs_parse_fs_string(fc, "subtype", subtype); if (!err && name) - err = vfs_parse_fs_string(fc, "source", name, strlen(name)); + err = vfs_parse_fs_string(fc, "source", name); if (!err) err = parse_monolithic_mount_data(fc, data); if (!err && !mount_capable(fc)) err = -EPERM; if (!err) - err = vfs_get_tree(fc); - if (!err) err = do_new_mount_fc(fc, path, mnt_flags); put_fs_context(fc); return err; } -int finish_automount(struct vfsmount *m, const struct path *path) +static void lock_mount_exact(const struct path *path, + struct pinned_mountpoint *mp) { struct dentry *dentry = path->dentry; - struct pinned_mountpoint mp = {}; + int err; + + inode_lock(dentry->d_inode); + namespace_lock(); + if (unlikely(cant_mount(dentry))) + err = -ENOENT; + else if (path_overmounted(path)) + err = -EBUSY; + else + err = get_mountpoint(dentry, mp); + if (unlikely(err)) { + namespace_unlock(); + inode_unlock(dentry->d_inode); + mp->parent = ERR_PTR(err); + } else { + mp->parent = real_mount(path->mnt); + } +} + +int finish_automount(struct vfsmount *__m, const struct path *path) +{ + struct vfsmount *m __free(mntput) = __m; struct mount *mnt; int err; @@ -3769,43 +3766,21 @@ int finish_automount(struct vfsmount *m, const struct path *path) mnt = real_mount(m); - if (m->mnt_sb == path->mnt->mnt_sb && - m->mnt_root == dentry) { - err = -ELOOP; - goto discard; - } + if (m->mnt_root == path->dentry) + return -ELOOP; /* - * we don't want to use lock_mount() - in this case finding 
something + * we don't want to use LOCK_MOUNT() - in this case finding something + * that overmounts our mountpoint means "quietly drop what we've + * got", not "try to mount it on top". */ - inode_lock(dentry->d_inode); - namespace_lock(); - if (unlikely(cant_mount(dentry))) { - err = -ENOENT; - goto discard_locked; - } - if (path_overmounted(path)) { - err = 0; - goto discard_locked; - } - err = get_mountpoint(dentry, &mp); - if (err) - goto discard_locked; - - err = do_add_mount(mnt, mp.mp, path, - path->mnt->mnt_flags | MNT_SHRINKABLE); - unlock_mount(&mp); - if (unlikely(err)) - goto discard; - return 0; + LOCK_MOUNT_EXACT(mp, path); + if (mp.parent == ERR_PTR(-EBUSY)) + return 0; -discard_locked: - namespace_unlock(); - inode_unlock(dentry->d_inode); -discard: - mntput(m); + err = do_add_mount(mnt, &mp, path->mnt->mnt_flags | MNT_SHRINKABLE); + if (likely(!err)) + retain_and_null_ptr(m); return err; } @@ -3816,9 +3791,8 @@ discard: */ void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list) { - read_seqlock_excl(&mount_lock); + guard(mount_locked_reader)(); list_add_tail(&real_mount(mnt)->mnt_expire, expiry_list); - read_sequnlock_excl(&mount_lock); } EXPORT_SYMBOL(mnt_set_expiry); @@ -3835,8 +3809,8 @@ void mark_mounts_for_expiry(struct list_head *mounts) if (list_empty(mounts)) return; - namespace_lock(); - lock_mount_hash(); + guard(namespace_excl)(); + guard(mount_writer)(); /* extract from the expiration list every vfsmount that matches the * following criteria: @@ -3858,8 +3832,6 @@ void mark_mounts_for_expiry(struct list_head *mounts) touch_mnt_namespace(mnt->mnt_ns); umount_tree(mnt, UMOUNT_PROPAGATE|UMOUNT_SYNC); } - unlock_mount_hash(); - namespace_unlock(); } EXPORT_SYMBOL_GPL(mark_mounts_for_expiry); @@ -3987,7 +3959,7 @@ static char *copy_mount_string(const void __user *data) * Therefore, if this magic number is present, it carries no information * and must be discarded. 
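 *
 * [Editorial aside, not part of the patch: the strip described above is
 * two lines in path_mount(), roughly:
 *
 *	if ((flags & MS_MGC_MSK) == MS_MGC_VAL)
 *		flags &= ~MS_MGC_MSK;
 * ]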
*/ -int path_mount(const char *dev_name, struct path *path, +int path_mount(const char *dev_name, const struct path *path, const char *type_page, unsigned long flags, void *data_page) { unsigned int mnt_flags = 0, sb_flags; @@ -4069,15 +4041,13 @@ int path_mount(const char *dev_name, struct path *path, int do_mount(const char *dev_name, const char __user *dir_name, const char *type_page, unsigned long flags, void *data_page) { - struct path path; + struct path path __free(path_put) = {}; int ret; ret = user_path_at(AT_FDCWD, dir_name, LOOKUP_FOLLOW, &path); if (ret) return ret; - ret = path_mount(dev_name, &path, type_page, flags, data_page); - path_put(&path); - return ret; + return path_mount(dev_name, &path, type_page, flags, data_page); } static struct ucounts *inc_mnt_namespaces(struct user_namespace *ns) @@ -4138,7 +4108,8 @@ struct mnt_namespace *copy_mnt_ns(u64 flags, struct mnt_namespace *ns, struct user_namespace *user_ns, struct fs_struct *new_fs) { struct mnt_namespace *new_ns; - struct vfsmount *rootmnt = NULL, *pwdmnt = NULL; + struct vfsmount *rootmnt __free(mntput) = NULL; + struct vfsmount *pwdmnt __free(mntput) = NULL; struct mount *p, *q; struct mount *old; struct mount *new; @@ -4157,23 +4128,19 @@ struct mnt_namespace *copy_mnt_ns(u64 flags, struct mnt_namespace *ns, if (IS_ERR(new_ns)) return new_ns; - namespace_lock(); + guard(namespace_excl)(); /* First pass: copy the tree topology */ copy_flags = CL_COPY_UNBINDABLE | CL_EXPIRE; if (user_ns != ns->user_ns) copy_flags |= CL_SLAVE; new = copy_tree(old, old->mnt.mnt_root, copy_flags); if (IS_ERR(new)) { - namespace_unlock(); - ns_common_free(ns); - dec_mnt_namespaces(new_ns->ucounts); - mnt_ns_release(new_ns); + emptied_ns = new_ns; return ERR_CAST(new); } if (user_ns != ns->user_ns) { - lock_mount_hash(); + guard(mount_writer)(); lock_mnt_tree(new); - unlock_mount_hash(); } new_ns->root = new; @@ -4205,13 +4172,6 @@ struct mnt_namespace *copy_mnt_ns(u64 flags, struct mnt_namespace *ns, while (p->mnt.mnt_root != q->mnt.mnt_root) p = next_mnt(skip_mnt_tree(p), old); } - namespace_unlock(); - - if (rootmnt) - mntput(rootmnt); - if (pwdmnt) - mntput(pwdmnt); - ns_tree_add_raw(new_ns); return new_ns; } @@ -4436,7 +4396,8 @@ err_unlock: return ret; } -static inline int vfs_move_mount(struct path *from_path, struct path *to_path, +static inline int vfs_move_mount(const struct path *from_path, + const struct path *to_path, enum mnt_tree_flags_t mflags) { int ret; @@ -4542,7 +4503,7 @@ SYSCALL_DEFINE5(move_mount, /* * Return true if path is reachable from root * - * namespace_sem or mount_lock is held + * locks: mount_locked_reader || namespace_shared && is_mounted(mnt) */ bool is_path_reachable(struct mount *mnt, struct dentry *dentry, const struct path *root) @@ -4556,11 +4517,8 @@ bool is_path_reachable(struct mount *mnt, struct dentry *dentry, bool path_is_under(const struct path *path1, const struct path *path2) { - bool res; - read_seqlock_excl(&mount_lock); - res = is_path_reachable(real_mount(path1->mnt), path1->dentry, path2); - read_sequnlock_excl(&mount_lock); - return res; + guard(mount_locked_reader)(); + return is_path_reachable(real_mount(path1->mnt), path1->dentry, path2); } EXPORT_SYMBOL(path_is_under); @@ -4592,9 +4550,10 @@ EXPORT_SYMBOL(path_is_under); SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, const char __user *, put_old) { - struct path new, old, root; + struct path new __free(path_put) = {}; + struct path old __free(path_put) = {}; + struct path root __free(path_put) = {}; struct mount 
*new_mnt, *root_mnt, *old_mnt, *root_parent, *ex_parent; - struct pinned_mountpoint old_mp = {}; int error; if (!may_mount()) @@ -4603,57 +4562,54 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, error = user_path_at(AT_FDCWD, new_root, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &new); if (error) - goto out0; + return error; error = user_path_at(AT_FDCWD, put_old, LOOKUP_FOLLOW | LOOKUP_DIRECTORY, &old); if (error) - goto out1; + return error; error = security_sb_pivotroot(&old, &new); if (error) - goto out2; + return error; get_fs_root(current->fs, &root); - error = lock_mount(&old, &old_mp); - if (error) - goto out3; - error = -EINVAL; + LOCK_MOUNT(old_mp, &old); + old_mnt = old_mp.parent; + if (IS_ERR(old_mnt)) + return PTR_ERR(old_mnt); + new_mnt = real_mount(new.mnt); root_mnt = real_mount(root.mnt); - old_mnt = real_mount(old.mnt); ex_parent = new_mnt->mnt_parent; root_parent = root_mnt->mnt_parent; if (IS_MNT_SHARED(old_mnt) || IS_MNT_SHARED(ex_parent) || IS_MNT_SHARED(root_parent)) - goto out4; + return -EINVAL; if (!check_mnt(root_mnt) || !check_mnt(new_mnt)) - goto out4; + return -EINVAL; if (new_mnt->mnt.mnt_flags & MNT_LOCKED) - goto out4; - error = -ENOENT; + return -EINVAL; if (d_unlinked(new.dentry)) - goto out4; - error = -EBUSY; + return -ENOENT; if (new_mnt == root_mnt || old_mnt == root_mnt) - goto out4; /* loop, on the same file system */ - error = -EINVAL; + return -EBUSY; /* loop, on the same file system */ if (!path_mounted(&root)) - goto out4; /* not a mountpoint */ + return -EINVAL; /* not a mountpoint */ if (!mnt_has_parent(root_mnt)) - goto out4; /* absolute root */ + return -EINVAL; /* absolute root */ if (!path_mounted(&new)) - goto out4; /* not a mountpoint */ + return -EINVAL; /* not a mountpoint */ if (!mnt_has_parent(new_mnt)) - goto out4; /* absolute root */ + return -EINVAL; /* absolute root */ /* make sure we can reach put_old from new_root */ - if (!is_path_reachable(old_mnt, old.dentry, &new)) - goto out4; + if (!is_path_reachable(old_mnt, old_mp.mp->m_dentry, &new)) + return -EINVAL; /* make certain new is below the root */ if (!is_path_reachable(new_mnt, new.dentry, &root)) - goto out4; + return -EINVAL; lock_mount_hash(); umount_mnt(new_mnt); if (root_mnt->mnt.mnt_flags & MNT_LOCKED) { @@ -4672,17 +4628,7 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, mnt_notify_add(root_mnt); mnt_notify_add(new_mnt); chroot_fs_refs(&root, &new); - error = 0; -out4: - unlock_mount(&old_mp); -out3: - path_put(&root); -out2: - path_put(&old); -out1: - path_put(&new); -out0: - return error; + return 0; } static unsigned int recalc_flags(struct mount_kattr *kattr, struct mount *mnt) @@ -4772,8 +4718,10 @@ static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt) if (!mnt_allow_writers(kattr, m)) { err = mnt_hold_writers(m); - if (err) + if (err) { + m = next_mnt(m, mnt); break; + } } if (!(kattr->kflags & MOUNT_KATTR_RECURSE)) @@ -4781,25 +4729,9 @@ static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt) } if (err) { - struct mount *p; - - /* - * If we had to call mnt_hold_writers() MNT_WRITE_HOLD will - * be set in @mnt_flags. The loop unsets MNT_WRITE_HOLD for all - * mounts and needs to take care to include the first mount. - */ - for (p = mnt; p; p = next_mnt(p, mnt)) { - /* If we had to hold writers unblock them. */ - if (p->mnt.mnt_flags & MNT_WRITE_HOLD) - mnt_unhold_writers(p); - - /* - * We're done once the first mount we changed got - * MNT_WRITE_HOLD unset. 
- */ - if (p == m) - break; - } + /* undo all mnt_hold_writers() we'd done */ + for (struct mount *p = mnt; p != m; p = next_mnt(p, mnt)) + mnt_unhold_writers(p); } return err; } @@ -4830,8 +4762,7 @@ static void mount_setattr_commit(struct mount_kattr *kattr, struct mount *mnt) WRITE_ONCE(m->mnt.mnt_flags, flags); /* If we had to hold writers unblock them. */ - if (m->mnt.mnt_flags & MNT_WRITE_HOLD) - mnt_unhold_writers(m); + mnt_unhold_writers(m); if (kattr->propagation) change_mnt_propagation(m, kattr->propagation); @@ -4841,7 +4772,7 @@ static void mount_setattr_commit(struct mount_kattr *kattr, struct mount *mnt) touch_mnt_namespace(mnt->mnt_ns); } -static int do_mount_setattr(struct path *path, struct mount_kattr *kattr) +static int do_mount_setattr(const struct path *path, struct mount_kattr *kattr) { struct mount *mnt = real_mount(path->mnt); int err = 0; @@ -5639,6 +5570,7 @@ static int grab_requested_root(struct mnt_namespace *ns, struct path *root) STATMOUNT_MNT_UIDMAP | \ STATMOUNT_MNT_GIDMAP) +/* locks: namespace_shared */ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id, struct mnt_namespace *ns) { @@ -5885,7 +5817,7 @@ retry: if (ret) return ret; - scoped_guard(rwsem_read, &namespace_sem) + scoped_guard(namespace_shared) ret = do_statmount(ks, kreq.mnt_id, kreq.mnt_ns_id, ns); if (!ret) @@ -5906,6 +5838,7 @@ struct klistmount { struct path root; }; +/* locks: namespace_shared */ static ssize_t do_listmount(struct klistmount *kls, bool reverse) { struct mnt_namespace *ns = kls->ns; @@ -6040,7 +5973,7 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req, * We only need to guard against mount topology changes as * listmount() doesn't care about any mount properties. */ - scoped_guard(rwsem_read, &namespace_sem) + scoped_guard(namespace_shared) ret = do_listmount(&kls, (flags & LISTMOUNT_REVERSE)); if (ret <= 0) return ret; @@ -6127,12 +6060,10 @@ void put_mnt_ns(struct mnt_namespace *ns) { if (!ns_ref_put(ns)) return; - namespace_lock(); + guard(namespace_excl)(); emptied_ns = ns; - lock_mount_hash(); + guard(mount_writer)(); umount_tree(ns->root, 0); - unlock_mount_hash(); - namespace_unlock(); } struct vfsmount *kern_mount(struct file_system_type *type) @@ -6181,25 +6112,18 @@ bool our_mnt(struct vfsmount *mnt) bool current_chrooted(void) { /* Does the current process have a non-standard root */ - struct path ns_root; - struct path fs_root; - bool chrooted; - - /* Find the namespace root */ - ns_root.mnt = &current->nsproxy->mnt_ns->root->mnt; - ns_root.dentry = ns_root.mnt->mnt_root; - path_get(&ns_root); - while (d_mountpoint(ns_root.dentry) && follow_down_one(&ns_root)) - ; + struct path fs_root __free(path_put) = {}; + struct mount *root; get_fs_root(current->fs, &fs_root); - chrooted = !path_equal(&fs_root, &ns_root); + /* Find the namespace root */ + + guard(mount_locked_reader)(); - path_put(&fs_root); - path_put(&ns_root); + root = topmost_overmount(current->nsproxy->mnt_ns->root); - return chrooted; + return fs_root.mnt != &root->mnt || !path_mounted(&fs_root); } static bool mnt_already_visible(struct mnt_namespace *ns, @@ -6208,9 +6132,8 @@ static bool mnt_already_visible(struct mnt_namespace *ns, { int new_flags = *new_mnt_flags; struct mount *mnt, *n; - bool visible = false; - down_read(&namespace_sem); + guard(namespace_shared)(); rbtree_postorder_for_each_entry_safe(mnt, n, &ns->mounts, mnt_node) { struct mount *child; int mnt_flags; @@ -6257,13 +6180,10 @@ static bool mnt_already_visible(struct mnt_namespace *ns, /* Preserve the
locked attributes */ *new_mnt_flags |= mnt_flags & (MNT_LOCK_READONLY | \ MNT_LOCK_ATIME); - visible = true; - goto found; + return true; next: ; } -found: - up_read(&namespace_sem); - return visible; + return false; } static bool mount_too_revealing(const struct super_block *sb, int *new_mnt_flags) diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 5d6edafbed20..0e4c67373e4f 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -676,7 +676,7 @@ bl_alloc_lseg(struct pnfs_layout_hdr *lo, struct nfs4_layoutget_res *lgr, struct pnfs_layout_segment *lseg; struct xdr_buf buf; struct xdr_stream xdr; - struct page *scratch; + struct folio *scratch; int status, i; uint32_t count; __be32 *p; @@ -689,13 +689,13 @@ bl_alloc_lseg(struct pnfs_layout_hdr *lo, struct nfs4_layoutget_res *lgr, return ERR_PTR(-ENOMEM); status = -ENOMEM; - scratch = alloc_page(gfp_mask); + scratch = folio_alloc(gfp_mask, 0); if (!scratch) goto out; xdr_init_decode_pages(&xdr, &buf, lgr->layoutp->pages, lgr->layoutp->len); - xdr_set_scratch_page(&xdr, scratch); + xdr_set_scratch_folio(&xdr, scratch); status = -EIO; p = xdr_inline_decode(&xdr, 4); @@ -744,7 +744,7 @@ process_extents: } out_free_scratch: - __free_page(scratch); + folio_put(scratch); out: dprintk("%s returns %d\n", __func__, status); switch (status) { diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c index 44306ac22353..ab76120705e2 100644 --- a/fs/nfs/blocklayout/dev.c +++ b/fs/nfs/blocklayout/dev.c @@ -541,16 +541,16 @@ bl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, struct pnfs_block_dev *top; struct xdr_stream xdr; struct xdr_buf buf; - struct page *scratch; + struct folio *scratch; int nr_volumes, ret, i; __be32 *p; - scratch = alloc_page(gfp_mask); + scratch = folio_alloc(gfp_mask, 0); if (!scratch) goto out; xdr_init_decode_pages(&xdr, &buf, pdev->pages, pdev->pglen); - xdr_set_scratch_page(&xdr, scratch); + xdr_set_scratch_folio(&xdr, scratch); p = xdr_inline_decode(&xdr, sizeof(__be32)); if (!p) @@ -582,7 +582,7 @@ bl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, out_free_volumes: kfree(volumes); out_free_scratch: - __free_page(scratch); + folio_put(scratch); out: return node; } diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 86bdc7d23fb9..c8b837006bb2 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -136,7 +136,7 @@ static void nfs_callback_down_net(u32 minorversion, struct svc_serv *serv, struc return; dprintk("NFS: destroy per-net callback data; net=%x\n", net->ns.inum); - svc_xprt_destroy_all(serv, net); + svc_xprt_destroy_all(serv, net, false); } static int nfs_callback_up_net(int minorversion, struct svc_serv *serv, @@ -153,7 +153,7 @@ static int nfs_callback_up_net(int minorversion, struct svc_serv *serv, ret = svc_bind(serv, net); if (ret < 0) { printk(KERN_WARNING "NFS: bind callback service failed\n"); - goto err_bind; + goto err; } ret = 0; @@ -166,13 +166,11 @@ static int nfs_callback_up_net(int minorversion, struct svc_serv *serv, if (ret < 0) { printk(KERN_ERR "NFS: callback service start failed\n"); - goto err_socks; + goto err; } return 0; -err_socks: - svc_rpcb_cleanup(serv, net); -err_bind: +err: nn->cb_users[minorversion]--; dprintk("NFS: Couldn't create callback socket: err = %d; " "net = %x\n", ret, net->ns.inum); diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index d81217923936..46d9c65d50f8 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -829,17 +829,17 @@ static int 
nfs_readdir_folio_filler(struct nfs_readdir_descriptor *desc, struct address_space *mapping = desc->file->f_mapping; struct folio *new, *folio = *arrays; struct xdr_stream stream; - struct page *scratch; + struct folio *scratch; struct xdr_buf buf; u64 cookie; int status; - scratch = alloc_page(GFP_KERNEL); + scratch = folio_alloc(GFP_KERNEL, 0); if (scratch == NULL) return -ENOMEM; xdr_init_decode_pages(&stream, &buf, xdr_pages, buflen); - xdr_set_scratch_page(&stream, scratch); + xdr_set_scratch_folio(&stream, scratch); do { status = nfs_readdir_entry_decode(desc, entry, &stream); @@ -891,7 +891,7 @@ static int nfs_readdir_folio_filler(struct nfs_readdir_descriptor *desc, if (folio != *arrays) nfs_readdir_folio_unlock_and_put(folio); - put_page(scratch); + folio_put(scratch); return status; } @@ -2198,8 +2198,6 @@ no_open: else dput(dentry); } - if (IS_ERR(res)) - return PTR_ERR(res); return finish_no_open(file, res); } EXPORT_SYMBOL_GPL(nfs_atomic_open); @@ -2260,7 +2258,7 @@ int nfs_atomic_open_v23(struct inode *dir, struct dentry *dentry, struct file *file, unsigned int open_flags, umode_t mode) { - + struct dentry *res = NULL; /* Same as look+open from lookup_open(), but with different O_TRUNC * handling. */ @@ -2275,21 +2273,15 @@ int nfs_atomic_open_v23(struct inode *dir, struct dentry *dentry, if (error) return error; return finish_open(file, dentry, NULL); - } else if (d_in_lookup(dentry)) { + } + if (d_in_lookup(dentry)) { /* The only flags nfs_lookup considers are * LOOKUP_EXCL and LOOKUP_RENAME_TARGET, and * we want those to be zero so the lookup isn't skipped. */ - struct dentry *res = nfs_lookup(dir, dentry, 0); - - d_lookup_done(dentry); - if (unlikely(res)) { - if (IS_ERR(res)) - return PTR_ERR(res); - return finish_no_open(file, res); - } + res = nfs_lookup(dir, dentry, 0); } - return finish_no_open(file, NULL); + return finish_no_open(file, res); } EXPORT_SYMBOL_GPL(nfs_atomic_open_v23); diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 8059ece82468..d020aab40c64 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -161,6 +161,8 @@ nfs_file_read(struct kiocb *iocb, struct iov_iter *to) struct inode *inode = file_inode(iocb->ki_filp); ssize_t result; + trace_nfs_file_read(iocb, to); + if (iocb->ki_flags & IOCB_DIRECT) return nfs_file_direct_read(iocb, to, false); @@ -361,6 +363,8 @@ static bool nfs_want_read_modify_write(struct file *file, struct folio *folio, if (pnfs_ld_read_whole_page(file_inode(file))) return true; + if (folio_test_dropbehind(folio)) + return false; /* Open for reading too? 
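 * (Editorial aside, not part of the patch: folio_test_dropbehind() marks an
 *  uncached, "dontcache" folio that the VM drops from the page cache as soon
 *  as writeback completes; priming such a folio with a read-modify-write
 *  cycle would be wasted I/O, which is why the hunk above bails out early.
 *  It pairs with the FOP_DONTCACHE fop_flags bit this patch adds to
 *  nfs_file_operations further down.)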
*/ if (file->f_mode & FMODE_READ) return true; @@ -380,22 +384,23 @@ static int nfs_write_begin(const struct kiocb *iocb, loff_t pos, unsigned len, struct folio **foliop, void **fsdata) { - fgf_t fgp = FGP_WRITEBEGIN; struct folio *folio; struct file *file = iocb->ki_filp; int once_thru = 0; int ret; + trace_nfs_write_begin(file_inode(file), pos, len); + dfprintk(PAGECACHE, "NFS: write_begin(%pD2(%lu), %u@%lld)\n", file, mapping->host->i_ino, len, (long long) pos); nfs_truncate_last_folio(mapping, i_size_read(mapping->host), pos); - fgp |= fgf_set_order(len); start: - folio = __filemap_get_folio(mapping, pos >> PAGE_SHIFT, fgp, - mapping_gfp_mask(mapping)); - if (IS_ERR(folio)) - return PTR_ERR(folio); + folio = write_begin_get_folio(iocb, mapping, pos >> PAGE_SHIFT, len); + if (IS_ERR(folio)) { + ret = PTR_ERR(folio); + goto out; + } *foliop = folio; ret = nfs_flush_incompatible(file, folio); @@ -405,11 +410,14 @@ start: } else if (!once_thru && nfs_want_read_modify_write(file, folio, pos, len)) { once_thru = 1; + folio_clear_dropbehind(folio); ret = nfs_read_folio(file, folio); folio_put(folio); if (!ret) goto start; } +out: + trace_nfs_write_begin_done(file_inode(file), pos, len, ret); return ret; } @@ -423,6 +431,7 @@ static int nfs_write_end(const struct kiocb *iocb, unsigned offset = offset_in_folio(folio, pos); int status; + trace_nfs_write_end(file_inode(file), pos, len); dfprintk(PAGECACHE, "NFS: write_end(%pD2(%lu), %u@%lld)\n", file, mapping->host->i_ino, len, (long long) pos); @@ -451,13 +460,16 @@ static int nfs_write_end(const struct kiocb *iocb, folio_unlock(folio); folio_put(folio); - if (status < 0) + if (status < 0) { + trace_nfs_write_end_done(file_inode(file), pos, len, status); return status; + } NFS_I(mapping->host)->write_io += copied; if (nfs_ctx_key_to_expire(ctx, mapping->host)) nfs_wb_all(mapping->host); + trace_nfs_write_end_done(file_inode(file), pos, len, copied); return copied; } @@ -690,6 +702,8 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) errseq_t since; int error; + trace_nfs_file_write(iocb, from); + result = nfs_key_timeout_notify(file, inode); if (result) return result; @@ -949,5 +963,6 @@ const struct file_operations nfs_file_operations = { .splice_write = iter_file_splice_write, .check_flags = nfs_check_flags, .setlease = simple_nosetlease, + .fop_flags = FOP_DONTCACHE, }; EXPORT_SYMBOL_GPL(nfs_file_operations); diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index d39a1f58e18d..5c4551117c58 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -646,19 +646,19 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo, { struct xdr_stream stream; struct xdr_buf buf; - struct page *scratch; + struct folio *scratch; __be32 *p; uint32_t nfl_util; int i; dprintk("%s: set_layout_map Begin\n", __func__); - scratch = alloc_page(gfp_flags); + scratch = folio_alloc(gfp_flags, 0); if (!scratch) return -ENOMEM; xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len); - xdr_set_scratch_page(&stream, scratch); + xdr_set_scratch_folio(&stream, scratch); /* 20 = ufl_util (4), first_stripe_index (4), pattern_offset (8), * num_fh (4) */ @@ -724,11 +724,11 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo, fl->fh_array[i]->size); } - __free_page(scratch); + folio_put(scratch); return 0; out_err: - __free_page(scratch); + folio_put(scratch); return -EIO; } diff --git a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c index 
29d9234d5c08..df79aeb68db4 100644 --- a/fs/nfs/filelayout/filelayoutdev.c +++ b/fs/nfs/filelayout/filelayoutdev.c @@ -73,18 +73,18 @@ nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, struct nfs4_file_layout_dsaddr *dsaddr = NULL; struct xdr_stream stream; struct xdr_buf buf; - struct page *scratch; + struct folio *scratch; struct list_head dsaddrs; struct nfs4_pnfs_ds_addr *da; struct net *net = server->nfs_client->cl_net; /* set up xdr stream */ - scratch = alloc_page(gfp_flags); + scratch = folio_alloc(gfp_flags, 0); if (!scratch) goto out_err; xdr_init_decode_pages(&stream, &buf, pdev->pages, pdev->pglen); - xdr_set_scratch_page(&stream, scratch); + xdr_set_scratch_folio(&stream, scratch); /* Get the stripe count (number of stripe index) */ p = xdr_inline_decode(&stream, 4); @@ -186,7 +186,7 @@ nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, } } - __free_page(scratch); + folio_put(scratch); return dsaddr; out_err_drain_dsaddrs: @@ -204,7 +204,7 @@ out_err_free_deviceid: out_err_free_stripe_indices: kfree(stripe_indices); out_err_free_scratch: - __free_page(scratch); + folio_put(scratch); out_err: dprintk("%s ERROR: returning NULL\n", __func__); return NULL; diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 9edb5f9b0c4e..9056f05a67dc 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -47,7 +47,7 @@ ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr *lo, int dev_limit, enum nfs4_ff_op_type type); static void ff_layout_encode_ff_layoutupdate(struct xdr_stream *xdr, const struct nfs42_layoutstat_devinfo *devinfo, - struct nfs4_ff_layout_mirror *mirror); + struct nfs4_ff_layout_ds_stripe *dss_info); static struct pnfs_layout_hdr * ff_layout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags) @@ -164,31 +164,32 @@ decode_name(struct xdr_stream *xdr, u32 *id) } static struct nfsd_file * -ff_local_open_fh(struct pnfs_layout_segment *lseg, u32 ds_idx, +ff_local_open_fh(struct pnfs_layout_segment *lseg, u32 ds_idx, u32 dss_id, struct nfs_client *clp, const struct cred *cred, struct nfs_fh *fh, fmode_t mode) { #if IS_ENABLED(CONFIG_NFS_LOCALIO) struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, ds_idx); - return nfs_local_open_fh(clp, cred, fh, &mirror->nfl, mode); + return nfs_local_open_fh(clp, cred, fh, &mirror->dss[dss_id].nfl, mode); #else return NULL; #endif } -static bool ff_mirror_match_fh(const struct nfs4_ff_layout_mirror *m1, - const struct nfs4_ff_layout_mirror *m2) +static bool ff_dss_match_fh(const struct nfs4_ff_layout_ds_stripe *dss1, + const struct nfs4_ff_layout_ds_stripe *dss2) { int i, j; - if (m1->fh_versions_cnt != m2->fh_versions_cnt) + if (dss1->fh_versions_cnt != dss2->fh_versions_cnt) return false; - for (i = 0; i < m1->fh_versions_cnt; i++) { + + for (i = 0; i < dss1->fh_versions_cnt; i++) { bool found_fh = false; - for (j = 0; j < m2->fh_versions_cnt; j++) { - if (nfs_compare_fh(&m1->fh_versions[i], - &m2->fh_versions[j]) == 0) { + for (j = 0; j < dss2->fh_versions_cnt; j++) { + if (nfs_compare_fh(&dss1->fh_versions[i], + &dss2->fh_versions[j]) == 0) { found_fh = true; break; } @@ -199,6 +200,38 @@ static bool ff_mirror_match_fh(const struct nfs4_ff_layout_mirror *m1, return true; } +static bool ff_mirror_match_fh(const struct nfs4_ff_layout_mirror *m1, + const struct nfs4_ff_layout_mirror *m2) +{ + u32 dss_id; + + if (m1->dss_count != m2->dss_count) + return false; + + for (dss_id = 0; dss_id < 
m1->dss_count; dss_id++) + if (!ff_dss_match_fh(&m1->dss[dss_id], &m2->dss[dss_id])) + return false; + + return true; +} + +static bool ff_mirror_match_devid(const struct nfs4_ff_layout_mirror *m1, + const struct nfs4_ff_layout_mirror *m2) +{ + u32 dss_id; + + if (m1->dss_count != m2->dss_count) + return false; + + for (dss_id = 0; dss_id < m1->dss_count; dss_id++) + if (memcmp(&m1->dss[dss_id].devid, + &m2->dss[dss_id].devid, + sizeof(m1->dss[dss_id].devid)) != 0) + return false; + + return true; +} + static struct nfs4_ff_layout_mirror * ff_layout_add_mirror(struct pnfs_layout_hdr *lo, struct nfs4_ff_layout_mirror *mirror) @@ -209,7 +242,7 @@ ff_layout_add_mirror(struct pnfs_layout_hdr *lo, spin_lock(&inode->i_lock); list_for_each_entry(pos, &ff_layout->mirrors, mirrors) { - if (memcmp(&mirror->devid, &pos->devid, sizeof(pos->devid)) != 0) + if (!ff_mirror_match_devid(mirror, pos)) continue; if (!ff_mirror_match_fh(mirror, pos)) continue; @@ -237,32 +270,52 @@ ff_layout_remove_mirror(struct nfs4_ff_layout_mirror *mirror) mirror->layout = NULL; } -static struct nfs4_ff_layout_mirror *ff_layout_alloc_mirror(gfp_t gfp_flags) +static struct nfs4_ff_layout_mirror *ff_layout_alloc_mirror(u32 dss_count, + gfp_t gfp_flags) { struct nfs4_ff_layout_mirror *mirror; mirror = kzalloc(sizeof(*mirror), gfp_flags); - if (mirror != NULL) { - spin_lock_init(&mirror->lock); - refcount_set(&mirror->ref, 1); - INIT_LIST_HEAD(&mirror->mirrors); - nfs_localio_file_init(&mirror->nfl); + if (mirror == NULL) + return NULL; + + spin_lock_init(&mirror->lock); + refcount_set(&mirror->ref, 1); + INIT_LIST_HEAD(&mirror->mirrors); + + mirror->dss_count = dss_count; + mirror->dss = + kcalloc(dss_count, sizeof(struct nfs4_ff_layout_ds_stripe), + gfp_flags); + if (mirror->dss == NULL) { + kfree(mirror); + return NULL; } + + for (u32 dss_id = 0; dss_id < mirror->dss_count; dss_id++) + nfs_localio_file_init(&mirror->dss[dss_id].nfl); + return mirror; } static void ff_layout_free_mirror(struct nfs4_ff_layout_mirror *mirror) { - const struct cred *cred; + const struct cred *cred; + u32 dss_id; ff_layout_remove_mirror(mirror); - kfree(mirror->fh_versions); - nfs_close_local_fh(&mirror->nfl); - cred = rcu_access_pointer(mirror->ro_cred); - put_cred(cred); - cred = rcu_access_pointer(mirror->rw_cred); - put_cred(cred); - nfs4_ff_layout_put_deviceid(mirror->mirror_ds); + + for (dss_id = 0; dss_id < mirror->dss_count; dss_id++) { + kfree(mirror->dss[dss_id].fh_versions); + cred = rcu_access_pointer(mirror->dss[dss_id].ro_cred); + put_cred(cred); + cred = rcu_access_pointer(mirror->dss[dss_id].rw_cred); + put_cred(cred); + nfs_close_local_fh(&mirror->dss[dss_id].nfl); + nfs4_ff_layout_put_deviceid(mirror->dss[dss_id].mirror_ds); + } + + kfree(mirror->dss); kfree(mirror); } @@ -366,14 +419,24 @@ ff_layout_add_lseg(struct pnfs_layout_hdr *lo, free_me); } +static u32 ff_mirror_efficiency_sum(const struct nfs4_ff_layout_mirror *mirror) +{ + u32 dss_id, sum = 0; + + for (dss_id = 0; dss_id < mirror->dss_count; dss_id++) + sum += mirror->dss[dss_id].efficiency; + + return sum; +} + static void ff_layout_sort_mirrors(struct nfs4_ff_layout_segment *fls) { int i, j; for (i = 0; i < fls->mirror_array_cnt - 1; i++) { for (j = i + 1; j < fls->mirror_array_cnt; j++) - if (fls->mirror_array[i]->efficiency < - fls->mirror_array[j]->efficiency) + if (ff_mirror_efficiency_sum(fls->mirror_array[i]) < + ff_mirror_efficiency_sum(fls->mirror_array[j])) swap(fls->mirror_array[i], fls->mirror_array[j]); } @@ -388,20 +451,21 @@ ff_layout_alloc_lseg(struct 
pnfs_layout_hdr *lh, struct nfs4_ff_layout_segment *fls = NULL; struct xdr_stream stream; struct xdr_buf buf; - struct page *scratch; + struct folio *scratch; u64 stripe_unit; u32 mirror_array_cnt; __be32 *p; int i, rc; + struct nfs4_ff_layout_ds_stripe *dss_info; dprintk("--> %s\n", __func__); - scratch = alloc_page(gfp_flags); + scratch = folio_alloc(gfp_flags, 0); if (!scratch) return ERR_PTR(-ENOMEM); xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len); - xdr_set_scratch_page(&stream, scratch); + xdr_set_scratch_folio(&stream, scratch); /* stripe unit and mirror_array_cnt */ rc = -EIO; @@ -427,116 +491,134 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh, fls->mirror_array_cnt = mirror_array_cnt; fls->stripe_unit = stripe_unit; + u32 dss_count = 0; for (i = 0; i < fls->mirror_array_cnt; i++) { struct nfs4_ff_layout_mirror *mirror; struct cred *kcred; const struct cred __rcu *cred; kuid_t uid; kgid_t gid; - u32 ds_count, fh_count, id; - int j; + u32 fh_count, id; + int j, dss_id; rc = -EIO; p = xdr_inline_decode(&stream, 4); if (!p) goto out_err_free; - ds_count = be32_to_cpup(p); - /* FIXME: allow for striping? */ - if (ds_count != 1) + // Ensure all mirrors have same stripe count. + if (dss_count == 0) + dss_count = be32_to_cpup(p); + else if (dss_count != be32_to_cpup(p)) goto out_err_free; - fls->mirror_array[i] = ff_layout_alloc_mirror(gfp_flags); + if (dss_count > NFS4_FLEXFILE_LAYOUT_MAX_STRIPE_CNT || + dss_count == 0) + goto out_err_free; + + if (dss_count > 1 && stripe_unit == 0) + goto out_err_free; + + fls->mirror_array[i] = ff_layout_alloc_mirror(dss_count, gfp_flags); if (fls->mirror_array[i] == NULL) { rc = -ENOMEM; goto out_err_free; } - fls->mirror_array[i]->ds_count = ds_count; + for (dss_id = 0; dss_id < dss_count; dss_id++) { + dss_info = &fls->mirror_array[i]->dss[dss_id]; + dss_info->mirror = fls->mirror_array[i]; - /* deviceid */ - rc = decode_deviceid(&stream, &fls->mirror_array[i]->devid); - if (rc) - goto out_err_free; + /* deviceid */ + rc = decode_deviceid(&stream, &dss_info->devid); + if (rc) + goto out_err_free; - /* efficiency */ - rc = -EIO; - p = xdr_inline_decode(&stream, 4); - if (!p) - goto out_err_free; - fls->mirror_array[i]->efficiency = be32_to_cpup(p); + /* efficiency */ + rc = -EIO; + p = xdr_inline_decode(&stream, 4); + if (!p) + goto out_err_free; + dss_info->efficiency = be32_to_cpup(p); - /* stateid */ - rc = decode_pnfs_stateid(&stream, &fls->mirror_array[i]->stateid); - if (rc) - goto out_err_free; + /* stateid */ + rc = decode_pnfs_stateid(&stream, &dss_info->stateid); + if (rc) + goto out_err_free; - /* fh */ - rc = -EIO; - p = xdr_inline_decode(&stream, 4); - if (!p) - goto out_err_free; - fh_count = be32_to_cpup(p); + /* fh */ + rc = -EIO; + p = xdr_inline_decode(&stream, 4); + if (!p) + goto out_err_free; + fh_count = be32_to_cpup(p); - fls->mirror_array[i]->fh_versions = - kcalloc(fh_count, sizeof(struct nfs_fh), - gfp_flags); - if (fls->mirror_array[i]->fh_versions == NULL) { - rc = -ENOMEM; - goto out_err_free; - } + dss_info->fh_versions = + kcalloc(fh_count, sizeof(struct nfs_fh), + gfp_flags); + if (dss_info->fh_versions == NULL) { + rc = -ENOMEM; + goto out_err_free; + } + + for (j = 0; j < fh_count; j++) { + rc = decode_nfs_fh(&stream, + &dss_info->fh_versions[j]); + if (rc) + goto out_err_free; + } - for (j = 0; j < fh_count; j++) { - rc = decode_nfs_fh(&stream, - &fls->mirror_array[i]->fh_versions[j]); + dss_info->fh_versions_cnt = fh_count; + + /* user */ + rc = decode_name(&stream, &id); if (rc) 
goto out_err_free; - } - fls->mirror_array[i]->fh_versions_cnt = fh_count; + uid = make_kuid(&init_user_ns, id); - /* user */ - rc = decode_name(&stream, &id); - if (rc) - goto out_err_free; + /* group */ + rc = decode_name(&stream, &id); + if (rc) + goto out_err_free; - uid = make_kuid(&init_user_ns, id); + gid = make_kgid(&init_user_ns, id); - /* group */ - rc = decode_name(&stream, &id); - if (rc) - goto out_err_free; + if (gfp_flags & __GFP_FS) + kcred = prepare_kernel_cred(&init_task); + else { + unsigned int nofs_flags = memalloc_nofs_save(); - gid = make_kgid(&init_user_ns, id); + kcred = prepare_kernel_cred(&init_task); + memalloc_nofs_restore(nofs_flags); + } + rc = -ENOMEM; + if (!kcred) + goto out_err_free; + kcred->fsuid = uid; + kcred->fsgid = gid; + cred = RCU_INITIALIZER(kcred); - if (gfp_flags & __GFP_FS) - kcred = prepare_kernel_cred(&init_task); - else { - unsigned int nofs_flags = memalloc_nofs_save(); - kcred = prepare_kernel_cred(&init_task); - memalloc_nofs_restore(nofs_flags); + if (lgr->range.iomode == IOMODE_READ) + rcu_assign_pointer(dss_info->ro_cred, cred); + else + rcu_assign_pointer(dss_info->rw_cred, cred); } - rc = -ENOMEM; - if (!kcred) - goto out_err_free; - kcred->fsuid = uid; - kcred->fsgid = gid; - cred = RCU_INITIALIZER(kcred); - - if (lgr->range.iomode == IOMODE_READ) - rcu_assign_pointer(fls->mirror_array[i]->ro_cred, cred); - else - rcu_assign_pointer(fls->mirror_array[i]->rw_cred, cred); mirror = ff_layout_add_mirror(lh, fls->mirror_array[i]); if (mirror != fls->mirror_array[i]) { - /* swap cred ptrs so free_mirror will clean up old */ - if (lgr->range.iomode == IOMODE_READ) { - cred = xchg(&mirror->ro_cred, cred); - rcu_assign_pointer(fls->mirror_array[i]->ro_cred, cred); - } else { - cred = xchg(&mirror->rw_cred, cred); - rcu_assign_pointer(fls->mirror_array[i]->rw_cred, cred); + for (dss_id = 0; dss_id < dss_count; dss_id++) { + dss_info = &fls->mirror_array[i]->dss[dss_id]; + /* swap cred ptrs so free_mirror will clean up old */ + if (lgr->range.iomode == IOMODE_READ) { + cred = xchg(&mirror->dss[dss_id].ro_cred, + dss_info->ro_cred); + rcu_assign_pointer(dss_info->ro_cred, cred); + } else { + cred = xchg(&mirror->dss[dss_id].rw_cred, + dss_info->rw_cred); + rcu_assign_pointer(dss_info->rw_cred, cred); + } } ff_layout_free_mirror(fls->mirror_array[i]); fls->mirror_array[i] = mirror; @@ -564,7 +646,7 @@ out_sort_mirrors: ret = &fls->generic_hdr; dprintk("<-- %s (success)\n", __func__); out_free_page: - __free_page(scratch); + folio_put(scratch); return ret; out_err_free: _ff_layout_free_lseg(fls); @@ -593,6 +675,26 @@ ff_layout_free_lseg(struct pnfs_layout_segment *lseg) _ff_layout_free_lseg(fls); } +static u32 calc_commit_idx(struct pnfs_layout_segment *lseg, + u32 mirror_idx, u32 dss_id) +{ + struct nfs4_ff_layout_segment *flseg = FF_LAYOUT_LSEG(lseg); + + return (mirror_idx * flseg->mirror_array[0]->dss_count) + dss_id; +} + +static u32 calc_mirror_idx_from_commit(struct pnfs_layout_segment *lseg, + u32 commit_index) +{ + return commit_index / FF_LAYOUT_LSEG(lseg)->mirror_array[0]->dss_count; +} + +static u32 calc_dss_id_from_commit(struct pnfs_layout_segment *lseg, + u32 commit_index) +{ + return commit_index % FF_LAYOUT_LSEG(lseg)->mirror_array[0]->dss_count; +} + static void nfs4_ff_start_busy_timer(struct nfs4_ff_busy_timer *timer, ktime_t now) { @@ -617,6 +719,7 @@ nfs4_ff_end_busy_timer(struct nfs4_ff_busy_timer *timer, ktime_t now) static bool nfs4_ff_layoutstat_start_io(struct nfs4_ff_layout_mirror *mirror, + u32 dss_id, struct 
nfs4_ff_layoutstat *layoutstat, ktime_t now) { @@ -624,8 +727,8 @@ nfs4_ff_layoutstat_start_io(struct nfs4_ff_layout_mirror *mirror, struct nfs4_flexfile_layout *ffl = FF_LAYOUT_FROM_HDR(mirror->layout); nfs4_ff_start_busy_timer(&layoutstat->busy_timer, now); - if (!mirror->start_time) - mirror->start_time = now; + if (!mirror->dss[dss_id].start_time) + mirror->dss[dss_id].start_time = now; if (mirror->report_interval != 0) report_interval = (s64)mirror->report_interval * 1000LL; else if (layoutstats_timer != 0) @@ -675,13 +778,16 @@ nfs4_ff_layout_stat_io_update_completed(struct nfs4_ff_layoutstat *layoutstat, static void nfs4_ff_layout_stat_io_start_read(struct inode *inode, struct nfs4_ff_layout_mirror *mirror, + u32 dss_id, __u64 requested, ktime_t now) { bool report; spin_lock(&mirror->lock); - report = nfs4_ff_layoutstat_start_io(mirror, &mirror->read_stat, now); - nfs4_ff_layout_stat_io_update_requested(&mirror->read_stat, requested); + report = nfs4_ff_layoutstat_start_io( + mirror, dss_id, &mirror->dss[dss_id].read_stat, now); + nfs4_ff_layout_stat_io_update_requested( + &mirror->dss[dss_id].read_stat, requested); set_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags); spin_unlock(&mirror->lock); @@ -692,11 +798,12 @@ nfs4_ff_layout_stat_io_start_read(struct inode *inode, static void nfs4_ff_layout_stat_io_end_read(struct rpc_task *task, struct nfs4_ff_layout_mirror *mirror, + u32 dss_id, __u64 requested, __u64 completed) { spin_lock(&mirror->lock); - nfs4_ff_layout_stat_io_update_completed(&mirror->read_stat, + nfs4_ff_layout_stat_io_update_completed(&mirror->dss[dss_id].read_stat, requested, completed, ktime_get(), task->tk_start); set_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags); @@ -706,13 +813,20 @@ nfs4_ff_layout_stat_io_end_read(struct rpc_task *task, static void nfs4_ff_layout_stat_io_start_write(struct inode *inode, struct nfs4_ff_layout_mirror *mirror, + u32 dss_id, __u64 requested, ktime_t now) { bool report; spin_lock(&mirror->lock); - report = nfs4_ff_layoutstat_start_io(mirror , &mirror->write_stat, now); - nfs4_ff_layout_stat_io_update_requested(&mirror->write_stat, requested); + report = nfs4_ff_layoutstat_start_io( + mirror, + dss_id, + &mirror->dss[dss_id].write_stat, + now); + nfs4_ff_layout_stat_io_update_requested( + &mirror->dss[dss_id].write_stat, + requested); set_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags); spin_unlock(&mirror->lock); @@ -723,6 +837,7 @@ nfs4_ff_layout_stat_io_start_write(struct inode *inode, static void nfs4_ff_layout_stat_io_end_write(struct rpc_task *task, struct nfs4_ff_layout_mirror *mirror, + u32 dss_id, __u64 requested, __u64 completed, enum nfs3_stable_how committed) @@ -731,25 +846,25 @@ nfs4_ff_layout_stat_io_end_write(struct rpc_task *task, requested = completed = 0; spin_lock(&mirror->lock); - nfs4_ff_layout_stat_io_update_completed(&mirror->write_stat, + nfs4_ff_layout_stat_io_update_completed(&mirror->dss[dss_id].write_stat, requested, completed, ktime_get(), task->tk_start); set_bit(NFS4_FF_MIRROR_STAT_AVAIL, &mirror->flags); spin_unlock(&mirror->lock); } static void -ff_layout_mark_ds_unreachable(struct pnfs_layout_segment *lseg, u32 idx) +ff_layout_mark_ds_unreachable(struct pnfs_layout_segment *lseg, u32 idx, u32 dss_id) { - struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx); + struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx, dss_id); if (devid) nfs4_mark_deviceid_unavailable(devid); } static void -ff_layout_mark_ds_reachable(struct pnfs_layout_segment *lseg, u32 idx) 
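/* Editorial note, not part of the patch: this signature change is the
 * pattern repeated throughout the flexfiles rework. Every helper that used
 * to take a mirror index alone now also takes a dss_id, because the
 * per-data-server state (deviceid, filehandles, stateid, creds, I/O stats)
 * moved from struct nfs4_ff_layout_mirror into the new per-stripe array
 * mirror->dss[dss_id]. */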
+ff_layout_mark_ds_reachable(struct pnfs_layout_segment *lseg, u32 idx, u32 dss_id) { - struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx); + struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx, dss_id); if (devid) nfs4_mark_deviceid_available(devid); @@ -758,6 +873,7 @@ ff_layout_mark_ds_reachable(struct pnfs_layout_segment *lseg, u32 idx) static struct nfs4_pnfs_ds * ff_layout_choose_ds_for_read(struct pnfs_layout_segment *lseg, u32 start_idx, u32 *best_idx, + u32 offset, u32 *dss_id, bool check_device) { struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg); @@ -768,12 +884,16 @@ ff_layout_choose_ds_for_read(struct pnfs_layout_segment *lseg, /* mirrors are initially sorted by efficiency */ for (idx = start_idx; idx < fls->mirror_array_cnt; idx++) { mirror = FF_LAYOUT_COMP(lseg, idx); - ds = nfs4_ff_layout_prepare_ds(lseg, mirror, false); + *dss_id = nfs4_ff_layout_calc_dss_id( + fls->stripe_unit, + fls->mirror_array[idx]->dss_count, + offset); + ds = nfs4_ff_layout_prepare_ds(lseg, mirror, *dss_id, false); if (IS_ERR(ds)) continue; if (check_device && - nfs4_test_deviceid_unavailable(&mirror->mirror_ds->id_node)) { + nfs4_test_deviceid_unavailable(&mirror->dss[*dss_id].mirror_ds->id_node)) { // reinitialize the error state in case if this is the last iteration ds = ERR_PTR(-EINVAL); continue; @@ -788,42 +908,52 @@ ff_layout_choose_ds_for_read(struct pnfs_layout_segment *lseg, static struct nfs4_pnfs_ds * ff_layout_choose_any_ds_for_read(struct pnfs_layout_segment *lseg, - u32 start_idx, u32 *best_idx) + u32 start_idx, u32 *best_idx, + u32 offset, u32 *dss_id) { - return ff_layout_choose_ds_for_read(lseg, start_idx, best_idx, false); + return ff_layout_choose_ds_for_read(lseg, start_idx, best_idx, + offset, dss_id, false); } static struct nfs4_pnfs_ds * ff_layout_choose_valid_ds_for_read(struct pnfs_layout_segment *lseg, - u32 start_idx, u32 *best_idx) + u32 start_idx, u32 *best_idx, + u32 offset, u32 *dss_id) { - return ff_layout_choose_ds_for_read(lseg, start_idx, best_idx, true); + return ff_layout_choose_ds_for_read(lseg, start_idx, best_idx, + offset, dss_id, true); } static struct nfs4_pnfs_ds * ff_layout_choose_best_ds_for_read(struct pnfs_layout_segment *lseg, - u32 start_idx, u32 *best_idx) + u32 start_idx, u32 *best_idx, + u32 offset, u32 *dss_id) { struct nfs4_pnfs_ds *ds; - ds = ff_layout_choose_valid_ds_for_read(lseg, start_idx, best_idx); + ds = ff_layout_choose_valid_ds_for_read(lseg, start_idx, best_idx, + offset, dss_id); if (!IS_ERR(ds)) return ds; - return ff_layout_choose_any_ds_for_read(lseg, start_idx, best_idx); + return ff_layout_choose_any_ds_for_read(lseg, start_idx, best_idx, + offset, dss_id); } static struct nfs4_pnfs_ds * ff_layout_get_ds_for_read(struct nfs_pageio_descriptor *pgio, - u32 *best_idx) + u32 *best_idx, + u32 offset, + u32 *dss_id) { struct pnfs_layout_segment *lseg = pgio->pg_lseg; struct nfs4_pnfs_ds *ds; ds = ff_layout_choose_best_ds_for_read(lseg, pgio->pg_mirror_idx, - best_idx); + best_idx, offset, dss_id); if (!IS_ERR(ds) || !pgio->pg_mirror_idx) return ds; - return ff_layout_choose_best_ds_for_read(lseg, 0, best_idx); + return ff_layout_choose_best_ds_for_read(lseg, 0, best_idx, + offset, dss_id); } static void @@ -842,6 +972,56 @@ ff_layout_pg_get_read(struct nfs_pageio_descriptor *pgio, } } +static bool +ff_layout_lseg_is_striped(const struct nfs4_ff_layout_segment *fls) +{ + return fls->mirror_array[0]->dss_count > 1; +} + +/* + * ff_layout_pg_test(). 
Called by nfs_can_coalesce_requests() + * + * Return 0 if @req cannot be coalesced into @pgio, otherwise return the number + * of bytes (maximum @req->wb_bytes) that can be coalesced. + */ +static size_t +ff_layout_pg_test(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, + struct nfs_page *req) +{ + unsigned int size; + u64 p_stripe, r_stripe; + u32 stripe_offset; + u64 segment_offset = pgio->pg_lseg->pls_range.offset; + u32 stripe_unit = FF_LAYOUT_LSEG(pgio->pg_lseg)->stripe_unit; + + /* calls nfs_generic_pg_test */ + size = pnfs_generic_pg_test(pgio, prev, req); + if (!size) + return 0; + else if (!ff_layout_lseg_is_striped(FF_LAYOUT_LSEG(pgio->pg_lseg))) + return size; + + /* see if req and prev are in the same stripe */ + if (prev) { + p_stripe = (u64)req_offset(prev) - segment_offset; + r_stripe = (u64)req_offset(req) - segment_offset; + do_div(p_stripe, stripe_unit); + do_div(r_stripe, stripe_unit); + + if (p_stripe != r_stripe) + return 0; + } + + /* calculate remaining bytes in the current stripe */ + div_u64_rem((u64)req_offset(req) - segment_offset, + stripe_unit, + &stripe_offset); + WARN_ON_ONCE(stripe_offset > stripe_unit); + if (stripe_offset >= stripe_unit) + return 0; + return min(stripe_unit - (unsigned int)stripe_offset, size); +} + static void ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) @@ -849,7 +1029,7 @@ ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_pgio_mirror *pgm; struct nfs4_ff_layout_mirror *mirror; struct nfs4_pnfs_ds *ds; - u32 ds_idx; + u32 ds_idx, dss_id; if (NFS_SERVER(pgio->pg_inode)->flags & (NFS_MOUNT_SOFT|NFS_MOUNT_SOFTERR)) @@ -870,7 +1050,8 @@ retry: /* Reset wb_nio, since getting layout segment was successful */ req->wb_nio = 0; - ds = ff_layout_get_ds_for_read(pgio, &ds_idx); + ds = ff_layout_get_ds_for_read(pgio, &ds_idx, + req_offset(req), &dss_id); if (IS_ERR(ds)) { if (!ff_layout_no_fallback_to_mds(pgio->pg_lseg)) goto out_mds; @@ -882,7 +1063,7 @@ retry: mirror = FF_LAYOUT_COMP(pgio->pg_lseg, ds_idx); pgm = &pgio->pg_mirrors[0]; - pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].rsize; + pgm->pg_bsize = mirror->dss[dss_id].mirror_ds->ds_versions[0].rsize; pgio->pg_mirror_idx = ds_idx; return; @@ -919,7 +1100,7 @@ ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs4_ff_layout_mirror *mirror; struct nfs_pgio_mirror *pgm; struct nfs4_pnfs_ds *ds; - u32 i; + u32 i, dss_id; retry: pnfs_generic_pg_check_layout(pgio, req); @@ -944,7 +1125,12 @@ retry: for (i = 0; i < pgio->pg_mirror_count; i++) { mirror = FF_LAYOUT_COMP(pgio->pg_lseg, i); - ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, mirror, true); + dss_id = nfs4_ff_layout_calc_dss_id( + FF_LAYOUT_LSEG(pgio->pg_lseg)->stripe_unit, + mirror->dss_count, + req_offset(req)); + ds = nfs4_ff_layout_prepare_ds(pgio->pg_lseg, mirror, + dss_id, true); if (IS_ERR(ds)) { if (!ff_layout_no_fallback_to_mds(pgio->pg_lseg)) goto out_mds; @@ -954,7 +1140,7 @@ retry: goto retry; } pgm = &pgio->pg_mirrors[i]; - pgm->pg_bsize = mirror->mirror_ds->ds_versions[0].wsize; + pgm->pg_bsize = mirror->dss[dss_id].mirror_ds->ds_versions[0].wsize; } if (NFS_SERVER(pgio->pg_inode)->flags & @@ -1020,14 +1206,14 @@ ff_layout_pg_get_mirror_write(struct nfs_pageio_descriptor *desc, u32 idx) static const struct nfs_pageio_ops ff_layout_pg_read_ops = { .pg_init = ff_layout_pg_init_read, - .pg_test = pnfs_generic_pg_test, + .pg_test = ff_layout_pg_test, .pg_doio = pnfs_generic_pg_readpages, .pg_cleanup = pnfs_generic_pg_cleanup, }; static const 
struct nfs_pageio_ops ff_layout_pg_write_ops = { .pg_init = ff_layout_pg_init_write, - .pg_test = pnfs_generic_pg_test, + .pg_test = ff_layout_pg_test, .pg_doio = pnfs_generic_pg_writepages, .pg_get_mirror_count = ff_layout_pg_get_mirror_count_write, .pg_cleanup = pnfs_generic_pg_cleanup, @@ -1075,9 +1261,11 @@ static void ff_layout_resend_pnfs_read(struct nfs_pgio_header *hdr) { u32 idx = hdr->pgio_mirror_idx + 1; u32 new_idx = 0; + u32 dss_id = 0; struct nfs4_pnfs_ds *ds; - ds = ff_layout_choose_any_ds_for_read(hdr->lseg, idx, &new_idx); + ds = ff_layout_choose_any_ds_for_read(hdr->lseg, idx, &new_idx, + hdr->args.offset, &dss_id); if (IS_ERR(ds)) pnfs_error_mark_layout_for_return(hdr->inode, hdr->lseg); else @@ -1114,11 +1302,11 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task, struct nfs4_state *state, struct nfs_client *clp, struct pnfs_layout_segment *lseg, - u32 idx) + u32 idx, u32 dss_id) { struct pnfs_layout_hdr *lo = lseg->pls_layout; struct inode *inode = lo->plh_inode; - struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx); + struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx, dss_id); struct nfs4_slot_table *tbl = &clp->cl_session->fc_slot_table; switch (op_status) { @@ -1215,9 +1403,9 @@ static int ff_layout_async_handle_error_v3(struct rpc_task *task, u32 op_status, struct nfs_client *clp, struct pnfs_layout_segment *lseg, - u32 idx) + u32 idx, u32 dss_id) { - struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx); + struct nfs4_deviceid_node *devid = FF_LAYOUT_DEVID_NODE(lseg, idx, dss_id); switch (op_status) { case NFS_OK: @@ -1281,12 +1469,12 @@ static int ff_layout_async_handle_error(struct rpc_task *task, struct nfs4_state *state, struct nfs_client *clp, struct pnfs_layout_segment *lseg, - u32 idx) + u32 idx, u32 dss_id) { int vers = clp->cl_nfs_mod->rpc_vers->number; if (task->tk_status >= 0) { - ff_layout_mark_ds_reachable(lseg, idx); + ff_layout_mark_ds_reachable(lseg, idx, dss_id); return 0; } @@ -1297,10 +1485,10 @@ static int ff_layout_async_handle_error(struct rpc_task *task, switch (vers) { case 3: return ff_layout_async_handle_error_v3(task, op_status, clp, - lseg, idx); + lseg, idx, dss_id); case 4: return ff_layout_async_handle_error_v4(task, op_status, state, - clp, lseg, idx); + clp, lseg, idx, dss_id); default: /* should never happen */ WARN_ON_ONCE(1); @@ -1309,7 +1497,7 @@ static int ff_layout_async_handle_error(struct rpc_task *task, } static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg, - u32 idx, u64 offset, u64 length, + u32 idx, u32 dss_id, u64 offset, u64 length, u32 *op_status, int opnum, int error) { struct nfs4_ff_layout_mirror *mirror; @@ -1347,7 +1535,7 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg, mirror = FF_LAYOUT_COMP(lseg, idx); err = ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout), - mirror, offset, length, status, opnum, + mirror, dss_id, offset, length, status, opnum, nfs_io_gfp_mask()); switch (status) { @@ -1356,7 +1544,7 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg, case NFS4ERR_PERM: break; case NFS4ERR_NXIO: - ff_layout_mark_ds_unreachable(lseg, idx); + ff_layout_mark_ds_unreachable(lseg, idx, dss_id); /* * Don't return the layout if this is a read and we still * have layouts to try @@ -1376,10 +1564,16 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg, static int ff_layout_read_done_cb(struct rpc_task *task, struct nfs_pgio_header *hdr) { + struct 
nfs4_ff_layout_segment *flseg = FF_LAYOUT_LSEG(hdr->lseg); + u32 dss_id = nfs4_ff_layout_calc_dss_id( + flseg->stripe_unit, + flseg->mirror_array[hdr->pgio_mirror_idx]->dss_count, + hdr->args.offset); int err; if (task->tk_status < 0) { - ff_layout_io_track_ds_error(hdr->lseg, hdr->pgio_mirror_idx, + ff_layout_io_track_ds_error(hdr->lseg, + hdr->pgio_mirror_idx, dss_id, hdr->args.offset, hdr->args.count, &hdr->res.op_status, OP_READ, task->tk_status); @@ -1389,7 +1583,8 @@ static int ff_layout_read_done_cb(struct rpc_task *task, err = ff_layout_async_handle_error(task, hdr->res.op_status, hdr->args.context->state, hdr->ds_clp, hdr->lseg, - hdr->pgio_mirror_idx); + hdr->pgio_mirror_idx, + dss_id); trace_nfs4_pnfs_read(hdr, err); clear_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags); @@ -1445,23 +1640,47 @@ ff_layout_set_layoutcommit(struct inode *inode, static void ff_layout_read_record_layoutstats_start(struct rpc_task *task, struct nfs_pgio_header *hdr) { + struct nfs4_ff_layout_mirror *mirror; + u32 dss_id; + if (test_and_set_bit(NFS_IOHDR_STAT, &hdr->flags)) return; - nfs4_ff_layout_stat_io_start_read(hdr->inode, - FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx), - hdr->args.count, - task->tk_start); + + mirror = FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx); + dss_id = nfs4_ff_layout_calc_dss_id( + FF_LAYOUT_LSEG(hdr->lseg)->stripe_unit, + mirror->dss_count, + hdr->args.offset); + + nfs4_ff_layout_stat_io_start_read( + hdr->inode, + mirror, + dss_id, + hdr->args.count, + task->tk_start); } static void ff_layout_read_record_layoutstats_done(struct rpc_task *task, struct nfs_pgio_header *hdr) { + struct nfs4_ff_layout_mirror *mirror; + u32 dss_id; + if (!test_and_clear_bit(NFS_IOHDR_STAT, &hdr->flags)) return; - nfs4_ff_layout_stat_io_end_read(task, - FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx), - hdr->args.count, - hdr->res.count); + + mirror = FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx); + dss_id = nfs4_ff_layout_calc_dss_id( + FF_LAYOUT_LSEG(hdr->lseg)->stripe_unit, + mirror->dss_count, + hdr->args.offset); + + nfs4_ff_layout_stat_io_end_read( + task, + mirror, + dss_id, + hdr->args.count, + hdr->res.count); set_bit(NFS_LSEG_LAYOUTRETURN, &hdr->lseg->pls_flags); } @@ -1549,11 +1768,17 @@ static void ff_layout_read_release(void *data) static int ff_layout_write_done_cb(struct rpc_task *task, struct nfs_pgio_header *hdr) { + struct nfs4_ff_layout_segment *flseg = FF_LAYOUT_LSEG(hdr->lseg); + u32 dss_id = nfs4_ff_layout_calc_dss_id( + flseg->stripe_unit, + flseg->mirror_array[hdr->pgio_mirror_idx]->dss_count, + hdr->args.offset); loff_t end_offs = 0; int err; if (task->tk_status < 0) { - ff_layout_io_track_ds_error(hdr->lseg, hdr->pgio_mirror_idx, + ff_layout_io_track_ds_error(hdr->lseg, + hdr->pgio_mirror_idx, dss_id, hdr->args.offset, hdr->args.count, &hdr->res.op_status, OP_WRITE, task->tk_status); @@ -1563,7 +1788,8 @@ static int ff_layout_write_done_cb(struct rpc_task *task, err = ff_layout_async_handle_error(task, hdr->res.op_status, hdr->args.context->state, hdr->ds_clp, hdr->lseg, - hdr->pgio_mirror_idx); + hdr->pgio_mirror_idx, + dss_id); trace_nfs4_pnfs_write(hdr, err); clear_bit(NFS_IOHDR_RESEND_PNFS, &hdr->flags); @@ -1601,9 +1827,11 @@ static int ff_layout_commit_done_cb(struct rpc_task *task, struct nfs_commit_data *data) { int err; + u32 idx = calc_mirror_idx_from_commit(data->lseg, data->ds_commit_index); + u32 dss_id = calc_dss_id_from_commit(data->lseg, data->ds_commit_index); if (task->tk_status < 0) { - ff_layout_io_track_ds_error(data->lseg, data->ds_commit_index, 
+ ff_layout_io_track_ds_error(data->lseg, idx, dss_id, data->args.offset, data->args.count, &data->res.op_status, OP_COMMIT, task->tk_status); @@ -1611,8 +1839,8 @@ static int ff_layout_commit_done_cb(struct rpc_task *task, } err = ff_layout_async_handle_error(task, data->res.op_status, - NULL, data->ds_clp, data->lseg, - data->ds_commit_index); + NULL, data->ds_clp, data->lseg, idx, + dss_id); trace_nfs4_pnfs_commit_ds(data, err); switch (err) { @@ -1631,30 +1859,54 @@ static int ff_layout_commit_done_cb(struct rpc_task *task, } ff_layout_set_layoutcommit(data->inode, data->lseg, data->lwb); - return 0; } static void ff_layout_write_record_layoutstats_start(struct rpc_task *task, struct nfs_pgio_header *hdr) { + struct nfs4_ff_layout_mirror *mirror; + u32 dss_id; + if (test_and_set_bit(NFS_IOHDR_STAT, &hdr->flags)) return; - nfs4_ff_layout_stat_io_start_write(hdr->inode, - FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx), - hdr->args.count, - task->tk_start); + + mirror = FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx); + dss_id = nfs4_ff_layout_calc_dss_id( + FF_LAYOUT_LSEG(hdr->lseg)->stripe_unit, + mirror->dss_count, + hdr->args.offset); + + nfs4_ff_layout_stat_io_start_write( + hdr->inode, + mirror, + dss_id, + hdr->args.count, + task->tk_start); } static void ff_layout_write_record_layoutstats_done(struct rpc_task *task, struct nfs_pgio_header *hdr) { + struct nfs4_ff_layout_mirror *mirror; + u32 dss_id; + if (!test_and_clear_bit(NFS_IOHDR_STAT, &hdr->flags)) return; - nfs4_ff_layout_stat_io_end_write(task, - FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx), - hdr->args.count, hdr->res.count, - hdr->res.verf->committed); + + mirror = FF_LAYOUT_COMP(hdr->lseg, hdr->pgio_mirror_idx); + dss_id = nfs4_ff_layout_calc_dss_id( + FF_LAYOUT_LSEG(hdr->lseg)->stripe_unit, + mirror->dss_count, + hdr->args.offset); + + nfs4_ff_layout_stat_io_end_write( + task, + mirror, + dss_id, + hdr->args.count, + hdr->res.count, + hdr->res.verf->committed); set_bit(NFS_LSEG_LAYOUTRETURN, &hdr->lseg->pls_flags); } @@ -1737,10 +1989,16 @@ static void ff_layout_write_release(void *data) static void ff_layout_commit_record_layoutstats_start(struct rpc_task *task, struct nfs_commit_data *cdata) { + u32 idx, dss_id; + if (test_and_set_bit(NFS_IOHDR_STAT, &cdata->flags)) return; + + idx = calc_mirror_idx_from_commit(cdata->lseg, cdata->ds_commit_index); + dss_id = calc_dss_id_from_commit(cdata->lseg, cdata->ds_commit_index); nfs4_ff_layout_stat_io_start_write(cdata->inode, - FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index), + FF_LAYOUT_COMP(cdata->lseg, idx), + dss_id, 0, task->tk_start); } @@ -1749,6 +2007,7 @@ static void ff_layout_commit_record_layoutstats_done(struct rpc_task *task, { struct nfs_page *req; __u64 count = 0; + u32 idx, dss_id; if (!test_and_clear_bit(NFS_IOHDR_STAT, &cdata->flags)) return; @@ -1757,8 +2016,12 @@ static void ff_layout_commit_record_layoutstats_done(struct rpc_task *task, list_for_each_entry(req, &cdata->pages, wb_list) count += req->wb_bytes; } + + idx = calc_mirror_idx_from_commit(cdata->lseg, cdata->ds_commit_index); + dss_id = calc_dss_id_from_commit(cdata->lseg, cdata->ds_commit_index); nfs4_ff_layout_stat_io_end_write(task, - FF_LAYOUT_COMP(cdata->lseg, cdata->ds_commit_index), + FF_LAYOUT_COMP(cdata->lseg, idx), + dss_id, count, count, NFS_FILE_SYNC); set_bit(NFS_LSEG_LAYOUTRETURN, &cdata->lseg->pls_flags); } @@ -1872,6 +2135,7 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr) u32 idx = hdr->pgio_mirror_idx; int vers; struct nfs_fh *fh; + u32 dss_id; bool 
ds_fatal_error = false; dprintk("--> %s ino %lu pgbase %u req %zu@%llu\n", @@ -1879,22 +2143,26 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr) hdr->args.pgbase, (size_t)hdr->args.count, offset); mirror = FF_LAYOUT_COMP(lseg, idx); - ds = nfs4_ff_layout_prepare_ds(lseg, mirror, false); + dss_id = nfs4_ff_layout_calc_dss_id( + FF_LAYOUT_LSEG(lseg)->stripe_unit, + mirror->dss_count, + offset); + ds = nfs4_ff_layout_prepare_ds(lseg, mirror, dss_id, false); if (IS_ERR(ds)) { ds_fatal_error = nfs_error_is_fatal(PTR_ERR(ds)); goto out_failed; } ds_clnt = nfs4_ff_find_or_create_ds_client(mirror, ds->ds_clp, - hdr->inode); + hdr->inode, dss_id); if (IS_ERR(ds_clnt)) goto out_failed; - ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, hdr->cred); + ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, hdr->cred, dss_id); if (!ds_cred) goto out_failed; - vers = nfs4_ff_layout_ds_version(mirror); + vers = nfs4_ff_layout_ds_version(mirror, dss_id); dprintk("%s USE DS: %s cl_count %d vers %d\n", __func__, ds->ds_remotestr, refcount_read(&ds->ds_clp->cl_count), vers); @@ -1902,11 +2170,11 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr) hdr->pgio_done_cb = ff_layout_read_done_cb; refcount_inc(&ds->ds_clp->cl_count); hdr->ds_clp = ds->ds_clp; - fh = nfs4_ff_layout_select_ds_fh(mirror); + fh = nfs4_ff_layout_select_ds_fh(mirror, dss_id); if (fh) hdr->args.fh = fh; - nfs4_ff_layout_select_ds_stateid(mirror, &hdr->args.stateid); + nfs4_ff_layout_select_ds_stateid(mirror, dss_id, &hdr->args.stateid); /* * Note that if we ever decide to split across DSes, @@ -1916,7 +2184,8 @@ ff_layout_read_pagelist(struct nfs_pgio_header *hdr) hdr->mds_offset = offset; /* Start IO accounting for local read */ - localio = ff_local_open_fh(lseg, idx, ds->ds_clp, ds_cred, fh, FMODE_READ); + localio = ff_local_open_fh(lseg, idx, dss_id, ds->ds_clp, ds_cred, fh, + FMODE_READ); if (localio) { hdr->task.tk_start = ktime_get(); ff_layout_read_record_layoutstats_start(&hdr->task, hdr); @@ -1953,25 +2222,30 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync) int vers; struct nfs_fh *fh; u32 idx = hdr->pgio_mirror_idx; + u32 dss_id; bool ds_fatal_error = false; mirror = FF_LAYOUT_COMP(lseg, idx); - ds = nfs4_ff_layout_prepare_ds(lseg, mirror, true); + dss_id = nfs4_ff_layout_calc_dss_id( + FF_LAYOUT_LSEG(lseg)->stripe_unit, + mirror->dss_count, + offset); + ds = nfs4_ff_layout_prepare_ds(lseg, mirror, dss_id, true); if (IS_ERR(ds)) { ds_fatal_error = nfs_error_is_fatal(PTR_ERR(ds)); goto out_failed; } ds_clnt = nfs4_ff_find_or_create_ds_client(mirror, ds->ds_clp, - hdr->inode); + hdr->inode, dss_id); if (IS_ERR(ds_clnt)) goto out_failed; - ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, hdr->cred); + ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, hdr->cred, dss_id); if (!ds_cred) goto out_failed; - vers = nfs4_ff_layout_ds_version(mirror); + vers = nfs4_ff_layout_ds_version(mirror, dss_id); dprintk("%s ino %lu sync %d req %zu@%llu DS: %s cl_count %d vers %d\n", __func__, hdr->inode->i_ino, sync, (size_t) hdr->args.count, @@ -1981,12 +2255,12 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync) hdr->pgio_done_cb = ff_layout_write_done_cb; refcount_inc(&ds->ds_clp->cl_count); hdr->ds_clp = ds->ds_clp; - hdr->ds_commit_idx = idx; - fh = nfs4_ff_layout_select_ds_fh(mirror); + hdr->ds_commit_idx = calc_commit_idx(lseg, idx, dss_id); + fh = nfs4_ff_layout_select_ds_fh(mirror, dss_id); if (fh) hdr->args.fh = fh; - nfs4_ff_layout_select_ds_stateid(mirror, 
&hdr->args.stateid); + nfs4_ff_layout_select_ds_stateid(mirror, dss_id, &hdr->args.stateid); /* * Note that if we ever decide to split across DSes, @@ -1995,7 +2269,7 @@ ff_layout_write_pagelist(struct nfs_pgio_header *hdr, int sync) hdr->args.offset = offset; /* Start IO accounting for local write */ - localio = ff_local_open_fh(lseg, idx, ds->ds_clp, ds_cred, fh, + localio = ff_local_open_fh(lseg, idx, dss_id, ds->ds_clp, ds_cred, fh, FMODE_READ|FMODE_WRITE); if (localio) { hdr->task.tk_start = ktime_get(); @@ -2019,20 +2293,15 @@ out_failed: return PNFS_NOT_ATTEMPTED; } -static u32 calc_ds_index_from_commit(struct pnfs_layout_segment *lseg, u32 i) -{ - return i; -} - static struct nfs_fh * -select_ds_fh_from_commit(struct pnfs_layout_segment *lseg, u32 i) +select_ds_fh_from_commit(struct pnfs_layout_segment *lseg, u32 i, u32 dss_id) { struct nfs4_ff_layout_segment *flseg = FF_LAYOUT_LSEG(lseg); /* FIXME: Assume that there is only one NFS version available * for the DS. */ - return &flseg->mirror_array[i]->fh_versions[0]; + return &flseg->mirror_array[i]->dss[dss_id].fh_versions[0]; } static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how) @@ -2043,7 +2312,7 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how) struct nfsd_file *localio; struct nfs4_ff_layout_mirror *mirror; const struct cred *ds_cred; - u32 idx; + u32 idx, dss_id; int vers, ret; struct nfs_fh *fh; @@ -2051,22 +2320,23 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how) test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags))) goto out_err; - idx = calc_ds_index_from_commit(lseg, data->ds_commit_index); + idx = calc_mirror_idx_from_commit(lseg, data->ds_commit_index); mirror = FF_LAYOUT_COMP(lseg, idx); - ds = nfs4_ff_layout_prepare_ds(lseg, mirror, true); + dss_id = calc_dss_id_from_commit(lseg, data->ds_commit_index); + ds = nfs4_ff_layout_prepare_ds(lseg, mirror, dss_id, true); if (IS_ERR(ds)) goto out_err; ds_clnt = nfs4_ff_find_or_create_ds_client(mirror, ds->ds_clp, - data->inode); + data->inode, dss_id); if (IS_ERR(ds_clnt)) goto out_err; - ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, data->cred); + ds_cred = ff_layout_get_ds_cred(mirror, &lseg->pls_range, data->cred, dss_id); if (!ds_cred) goto out_err; - vers = nfs4_ff_layout_ds_version(mirror); + vers = nfs4_ff_layout_ds_version(mirror, dss_id); dprintk("%s ino %lu, how %d cl_count %d vers %d\n", __func__, data->inode->i_ino, how, refcount_read(&ds->ds_clp->cl_count), @@ -2075,12 +2345,12 @@ static int ff_layout_initiate_commit(struct nfs_commit_data *data, int how) data->cred = ds_cred; refcount_inc(&ds->ds_clp->cl_count); data->ds_clp = ds->ds_clp; - fh = select_ds_fh_from_commit(lseg, data->ds_commit_index); + fh = select_ds_fh_from_commit(lseg, idx, dss_id); if (fh) data->args.fh = fh; /* Start IO accounting for local commit */ - localio = ff_local_open_fh(lseg, idx, ds->ds_clp, ds_cred, fh, + localio = ff_local_open_fh(lseg, idx, dss_id, ds->ds_clp, ds_cred, fh, FMODE_READ|FMODE_WRITE); if (localio) { data->task.tk_start = ktime_get(); @@ -2144,25 +2414,28 @@ static void ff_layout_cancel_io(struct pnfs_layout_segment *lseg) struct nfs4_pnfs_ds *ds; struct nfs_client *ds_clp; struct rpc_clnt *clnt; - u32 idx; + u32 idx, dss_id; for (idx = 0; idx < flseg->mirror_array_cnt; idx++) { mirror = flseg->mirror_array[idx]; - mirror_ds = mirror->mirror_ds; - if (IS_ERR_OR_NULL(mirror_ds)) - continue; - ds = mirror->mirror_ds->ds; - if (!ds) - continue; - ds_clp = ds->ds_clp; - if 
(!ds_clp) - continue; - clnt = ds_clp->cl_rpcclient; - if (!clnt) - continue; - if (!rpc_cancel_tasks(clnt, -EAGAIN, ff_layout_match_io, lseg)) - continue; - rpc_clnt_disconnect(clnt); + for (dss_id = 0; dss_id < mirror->dss_count; dss_id++) { + mirror_ds = mirror->dss[dss_id].mirror_ds; + if (IS_ERR_OR_NULL(mirror_ds)) + continue; + ds = mirror->dss[dss_id].mirror_ds->ds; + if (!ds) + continue; + ds_clp = ds->ds_clp; + if (!ds_clp) + continue; + clnt = ds_clp->cl_rpcclient; + if (!clnt) + continue; + if (!rpc_cancel_tasks(clnt, -EAGAIN, + ff_layout_match_io, lseg)) + continue; + rpc_clnt_disconnect(clnt); + } } } @@ -2184,8 +2457,9 @@ ff_layout_setup_ds_info(struct pnfs_ds_commit_info *fl_cinfo, struct nfs4_ff_layout_segment *flseg = FF_LAYOUT_LSEG(lseg); struct inode *inode = lseg->pls_layout->plh_inode; struct pnfs_commit_array *array, *new; + u32 size = flseg->mirror_array_cnt * flseg->mirror_array[0]->dss_count; - new = pnfs_alloc_commit_array(flseg->mirror_array_cnt, + new = pnfs_alloc_commit_array(size, nfs_io_gfp_mask()); if (new) { spin_lock(&inode->i_lock); @@ -2549,11 +2823,11 @@ ff_layout_encode_io_latency(struct xdr_stream *xdr, static void ff_layout_encode_ff_layoutupdate(struct xdr_stream *xdr, const struct nfs42_layoutstat_devinfo *devinfo, - struct nfs4_ff_layout_mirror *mirror) + struct nfs4_ff_layout_ds_stripe *dss_info) { struct nfs4_pnfs_ds_addr *da; - struct nfs4_pnfs_ds *ds = mirror->mirror_ds->ds; - struct nfs_fh *fh = &mirror->fh_versions[0]; + struct nfs4_pnfs_ds *ds = dss_info->mirror_ds->ds; + struct nfs_fh *fh = &dss_info->fh_versions[0]; __be32 *p; da = list_first_entry(&ds->ds_addrs, struct nfs4_pnfs_ds_addr, da_node); @@ -2565,13 +2839,17 @@ ff_layout_encode_ff_layoutupdate(struct xdr_stream *xdr, p = xdr_reserve_space(xdr, 4 + fh->size); xdr_encode_opaque(p, fh->data, fh->size); /* ff_io_latency4 read */ - spin_lock(&mirror->lock); - ff_layout_encode_io_latency(xdr, &mirror->read_stat.io_stat); + spin_lock(&dss_info->mirror->lock); + ff_layout_encode_io_latency(xdr, + &dss_info->read_stat.io_stat); /* ff_io_latency4 write */ - ff_layout_encode_io_latency(xdr, &mirror->write_stat.io_stat); - spin_unlock(&mirror->lock); + ff_layout_encode_io_latency(xdr, + &dss_info->write_stat.io_stat); + spin_unlock(&dss_info->mirror->lock); /* nfstime4 */ - ff_layout_encode_nfstime(xdr, ktime_sub(ktime_get(), mirror->start_time)); + ff_layout_encode_nfstime(xdr, + ktime_sub(ktime_get(), + dss_info->start_time)); /* bool */ p = xdr_reserve_space(xdr, 4); *p = cpu_to_be32(false); @@ -2595,7 +2873,8 @@ ff_layout_encode_layoutstats(struct xdr_stream *xdr, const void *args, static void ff_layout_free_layoutstats(struct nfs4_xdr_opaque_data *opaque) { - struct nfs4_ff_layout_mirror *mirror = opaque->data; + struct nfs4_ff_layout_ds_stripe *dss_info = opaque->data; + struct nfs4_ff_layout_mirror *mirror = dss_info->mirror; ff_layout_put_mirror(mirror); } @@ -2612,37 +2891,47 @@ ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr *lo, { struct nfs4_flexfile_layout *ff_layout = FF_LAYOUT_FROM_HDR(lo); struct nfs4_ff_layout_mirror *mirror; + struct nfs4_ff_layout_ds_stripe *dss_info; struct nfs4_deviceid_node *dev; - int i = 0; + int i = 0, dss_id; list_for_each_entry(mirror, &ff_layout->mirrors, mirrors) { - if (i >= dev_limit) - break; - if (IS_ERR_OR_NULL(mirror->mirror_ds)) - continue; - if (!test_and_clear_bit(NFS4_FF_MIRROR_STAT_AVAIL, - &mirror->flags) && - type != NFS4_FF_OP_LAYOUTRETURN) - continue; - /* mirror refcount put in cleanup_layoutstats */ - if 
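/* Not from the patch: a sketch of how the commit bucket index appears to
 * flatten (mirror idx, dss_id) pairs.  calc_commit_idx() and the
 * calc_*_from_commit() helpers are defined earlier in this file, outside
 * the hunks shown here, but sizing the commit array as
 * mirror_array_cnt * dss_count suggests a row-major mapping:
 *
 *	commit_idx = idx * dss_count + dss_id;
 *	idx    = commit_idx / dss_count;	// calc_mirror_idx_from_commit
 *	dss_id = commit_idx % dss_count;	// calc_dss_id_from_commit
 */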
(!refcount_inc_not_zero(&mirror->ref)) - continue; - dev = &mirror->mirror_ds->id_node; - memcpy(&devinfo->dev_id, &dev->deviceid, NFS4_DEVICEID4_SIZE); - devinfo->offset = 0; - devinfo->length = NFS4_MAX_UINT64; - spin_lock(&mirror->lock); - devinfo->read_count = mirror->read_stat.io_stat.ops_completed; - devinfo->read_bytes = mirror->read_stat.io_stat.bytes_completed; - devinfo->write_count = mirror->write_stat.io_stat.ops_completed; - devinfo->write_bytes = mirror->write_stat.io_stat.bytes_completed; - spin_unlock(&mirror->lock); - devinfo->layout_type = LAYOUT_FLEX_FILES; - devinfo->ld_private.ops = &layoutstat_ops; - devinfo->ld_private.data = mirror; - - devinfo++; - i++; + for (dss_id = 0; dss_id < mirror->dss_count; ++dss_id) { + dss_info = &mirror->dss[dss_id]; + if (i >= dev_limit) + break; + if (IS_ERR_OR_NULL(dss_info->mirror_ds)) + continue; + if (!test_and_clear_bit(NFS4_FF_MIRROR_STAT_AVAIL, + &mirror->flags) && + type != NFS4_FF_OP_LAYOUTRETURN) + continue; + /* mirror refcount put in cleanup_layoutstats */ + if (!refcount_inc_not_zero(&mirror->ref)) + continue; + dev = &dss_info->mirror_ds->id_node; + memcpy(&devinfo->dev_id, + &dev->deviceid, + NFS4_DEVICEID4_SIZE); + devinfo->offset = 0; + devinfo->length = NFS4_MAX_UINT64; + spin_lock(&mirror->lock); + devinfo->read_count = + dss_info->read_stat.io_stat.ops_completed; + devinfo->read_bytes = + dss_info->read_stat.io_stat.bytes_completed; + devinfo->write_count = + dss_info->write_stat.io_stat.ops_completed; + devinfo->write_bytes = + dss_info->write_stat.io_stat.bytes_completed; + spin_unlock(&mirror->lock); + devinfo->layout_type = LAYOUT_FLEX_FILES; + devinfo->ld_private.ops = &layoutstat_ops; + devinfo->ld_private.data = &mirror->dss[dss_id]; + + devinfo++; + i++; + } } return i; } diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h index 095df09017a5..17a008c8e97c 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.h +++ b/fs/nfs/flexfilelayout/flexfilelayout.h @@ -21,6 +21,8 @@ * due to network error etc. 
*/ #define NFS4_FLEXFILE_LAYOUT_MAX_MIRROR_CNT 4096 +#define NFS4_FLEXFILE_LAYOUT_MAX_STRIPE_CNT 4096 + /* LAYOUTSTATS report interval in ms */ #define FF_LAYOUTSTATS_REPORT_INTERVAL (60000L) #define FF_LAYOUTSTATS_MAXDEV 4 @@ -71,12 +73,12 @@ struct nfs4_ff_layoutstat { struct nfs4_ff_busy_timer busy_timer; }; -struct nfs4_ff_layout_mirror { - struct pnfs_layout_hdr *layout; - struct list_head mirrors; - u32 ds_count; - u32 efficiency; +struct nfs4_ff_layout_mirror; + +struct nfs4_ff_layout_ds_stripe { + struct nfs4_ff_layout_mirror *mirror; struct nfs4_deviceid devid; + u32 efficiency; struct nfs4_ff_layout_ds *mirror_ds; u32 fh_versions_cnt; struct nfs_fh *fh_versions; @@ -84,12 +86,19 @@ struct nfs4_ff_layout_mirror { const struct cred __rcu *ro_cred; const struct cred __rcu *rw_cred; struct nfs_file_localio nfl; - refcount_t ref; - spinlock_t lock; - unsigned long flags; struct nfs4_ff_layoutstat read_stat; struct nfs4_ff_layoutstat write_stat; ktime_t start_time; +}; + +struct nfs4_ff_layout_mirror { + struct pnfs_layout_hdr *layout; + struct list_head mirrors; + u32 dss_count; + struct nfs4_ff_layout_ds_stripe *dss; + refcount_t ref; + spinlock_t lock; + unsigned long flags; u32 report_interval; }; @@ -150,12 +159,12 @@ FF_LAYOUT_COMP(struct pnfs_layout_segment *lseg, u32 idx) } static inline struct nfs4_deviceid_node * -FF_LAYOUT_DEVID_NODE(struct pnfs_layout_segment *lseg, u32 idx) +FF_LAYOUT_DEVID_NODE(struct pnfs_layout_segment *lseg, u32 idx, u32 dss_id) { struct nfs4_ff_layout_mirror *mirror = FF_LAYOUT_COMP(lseg, idx); if (mirror != NULL) { - struct nfs4_ff_layout_ds *mirror_ds = mirror->mirror_ds; + struct nfs4_ff_layout_ds *mirror_ds = mirror->dss[dss_id].mirror_ds; if (!IS_ERR_OR_NULL(mirror_ds)) return &mirror_ds->id_node; @@ -182,9 +191,22 @@ ff_layout_no_read_on_rw(struct pnfs_layout_segment *lseg) } static inline int -nfs4_ff_layout_ds_version(const struct nfs4_ff_layout_mirror *mirror) +nfs4_ff_layout_ds_version(const struct nfs4_ff_layout_mirror *mirror, u32 dss_id) +{ + return mirror->dss[dss_id].mirror_ds->ds_versions[0].version; +} + +static inline u32 +nfs4_ff_layout_calc_dss_id(const u64 stripe_unit, const u32 dss_count, const loff_t offset) { - return mirror->mirror_ds->ds_versions[0].version; + u64 tmp = offset; + + if (dss_count == 1 || stripe_unit == 0) + return 0; + + do_div(tmp, stripe_unit); + + return do_div(tmp, dss_count); } struct nfs4_ff_layout_ds * @@ -193,9 +215,9 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, void nfs4_ff_layout_put_deviceid(struct nfs4_ff_layout_ds *mirror_ds); void nfs4_ff_layout_free_deviceid(struct nfs4_ff_layout_ds *mirror_ds); int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo, - struct nfs4_ff_layout_mirror *mirror, u64 offset, - u64 length, int status, enum nfs_opnum4 opnum, - gfp_t gfp_flags); + struct nfs4_ff_layout_mirror *mirror, + u32 dss_id, u64 offset, u64 length, int status, + enum nfs_opnum4 opnum, gfp_t gfp_flags); void ff_layout_send_layouterror(struct pnfs_layout_segment *lseg); int ff_layout_encode_ds_ioerr(struct xdr_stream *xdr, const struct list_head *head); void ff_layout_free_ds_ioerr(struct list_head *head); @@ -204,23 +226,27 @@ unsigned int ff_layout_fetch_ds_ioerr(struct pnfs_layout_hdr *lo, struct list_head *head, unsigned int maxnum); struct nfs_fh * -nfs4_ff_layout_select_ds_fh(struct nfs4_ff_layout_mirror *mirror); +nfs4_ff_layout_select_ds_fh(struct nfs4_ff_layout_mirror *mirror, u32 dss_id); void nfs4_ff_layout_select_ds_stateid(const struct 
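/* Illustrative walk-through of nfs4_ff_layout_calc_dss_id() above (numbers
 * are hypothetical, not from the patch): with stripe_unit = 1 MiB and
 * dss_count = 4, an I/O at offset 6.5 MiB lands in stripe unit 6, and
 * 6 % 4 = 2, so dss_id = 2.  do_div() divides in place and returns the
 * remainder, hence the two-step sequence:
 *
 *	u64 tmp = 6815744;	// offset (6.5 MiB)
 *	do_div(tmp, 1048576);	// tmp = 6, the stripe-unit index
 *	do_div(tmp, 4);		// returns 6 % 4 = 2, the dss_id
 *
 * dss_count == 1 or stripe_unit == 0 short-circuits to stripe 0.
 */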
nfs4_ff_layout_mirror *mirror, - nfs4_stateid *stateid); + u32 dss_id, + nfs4_stateid *stateid); struct nfs4_pnfs_ds * nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, struct nfs4_ff_layout_mirror *mirror, + u32 dss_id, bool fail_return); struct rpc_clnt * nfs4_ff_find_or_create_ds_client(struct nfs4_ff_layout_mirror *mirror, struct nfs_client *ds_clp, - struct inode *inode); + struct inode *inode, + u32 dss_id); const struct cred *ff_layout_get_ds_cred(struct nfs4_ff_layout_mirror *mirror, const struct pnfs_layout_range *range, - const struct cred *mdscred); + const struct cred *mdscred, + u32 dss_id); bool ff_layout_avoid_mds_available_ds(struct pnfs_layout_segment *lseg); bool ff_layout_avoid_read_on_rw(struct pnfs_layout_segment *lseg); diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c index 30365ec782bb..c55ea8fa3bfa 100644 --- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c +++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c @@ -44,7 +44,7 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, { struct xdr_stream stream; struct xdr_buf buf; - struct page *scratch; + struct folio *scratch; struct list_head dsaddrs; struct nfs4_pnfs_ds_addr *da; struct nfs4_ff_layout_ds *new_ds = NULL; @@ -56,7 +56,7 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, int i, ret = -ENOMEM; /* set up xdr stream */ - scratch = alloc_page(gfp_flags); + scratch = folio_alloc(gfp_flags, 0); if (!scratch) goto out_err; @@ -70,7 +70,7 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, INIT_LIST_HEAD(&dsaddrs); xdr_init_decode_pages(&stream, &buf, pdev->pages, pdev->pglen); - xdr_set_scratch_page(&stream, scratch); + xdr_set_scratch_folio(&stream, scratch); /* multipath count */ p = xdr_inline_decode(&stream, 4); @@ -163,7 +163,7 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, kfree(da); } - __free_page(scratch); + folio_put(scratch); return new_ds; out_err_drain_dsaddrs: @@ -177,7 +177,7 @@ out_err_drain_dsaddrs: kfree(ds_versions); out_scratch: - __free_page(scratch); + folio_put(scratch); out_err: kfree(new_ds); @@ -250,16 +250,16 @@ ff_layout_add_ds_error_locked(struct nfs4_flexfile_layout *flo, } int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo, - struct nfs4_ff_layout_mirror *mirror, u64 offset, - u64 length, int status, enum nfs_opnum4 opnum, - gfp_t gfp_flags) + struct nfs4_ff_layout_mirror *mirror, + u32 dss_id, u64 offset, u64 length, int status, + enum nfs_opnum4 opnum, gfp_t gfp_flags) { struct nfs4_ff_layout_ds_err *dserr; if (status == 0) return 0; - if (IS_ERR_OR_NULL(mirror->mirror_ds)) + if (IS_ERR_OR_NULL(mirror->dss[dss_id].mirror_ds)) return -EINVAL; dserr = kmalloc(sizeof(*dserr), gfp_flags); @@ -271,8 +271,8 @@ int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo, dserr->length = length; dserr->status = status; dserr->opnum = opnum; - nfs4_stateid_copy(&dserr->stateid, &mirror->stateid); - memcpy(&dserr->deviceid, &mirror->mirror_ds->id_node.deviceid, + nfs4_stateid_copy(&dserr->stateid, &mirror->dss[dss_id].stateid); + memcpy(&dserr->deviceid, &mirror->dss[dss_id].mirror_ds->id_node.deviceid, NFS4_DEVICEID4_SIZE); spin_lock(&flo->generic_hdr.plh_inode->i_lock); @@ -282,14 +282,14 @@ int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo, } static const struct cred * -ff_layout_get_mirror_cred(struct nfs4_ff_layout_mirror *mirror, u32 iomode) +ff_layout_get_mirror_cred(struct 
nfs4_ff_layout_mirror *mirror, u32 iomode, u32 dss_id) { const struct cred *cred, __rcu **pcred; if (iomode == IOMODE_READ) - pcred = &mirror->ro_cred; + pcred = &mirror->dss[dss_id].ro_cred; else - pcred = &mirror->rw_cred; + pcred = &mirror->dss[dss_id].rw_cred; rcu_read_lock(); do { @@ -304,43 +304,45 @@ ff_layout_get_mirror_cred(struct nfs4_ff_layout_mirror *mirror, u32 iomode) } struct nfs_fh * -nfs4_ff_layout_select_ds_fh(struct nfs4_ff_layout_mirror *mirror) +nfs4_ff_layout_select_ds_fh(struct nfs4_ff_layout_mirror *mirror, u32 dss_id) { /* FIXME: For now assume there is only 1 version available for the DS */ - return &mirror->fh_versions[0]; + return &mirror->dss[dss_id].fh_versions[0]; } void nfs4_ff_layout_select_ds_stateid(const struct nfs4_ff_layout_mirror *mirror, - nfs4_stateid *stateid) + u32 dss_id, + nfs4_stateid *stateid) { - if (nfs4_ff_layout_ds_version(mirror) == 4) - nfs4_stateid_copy(stateid, &mirror->stateid); + if (nfs4_ff_layout_ds_version(mirror, dss_id) == 4) + nfs4_stateid_copy(stateid, &mirror->dss[dss_id].stateid); } static bool ff_layout_init_mirror_ds(struct pnfs_layout_hdr *lo, - struct nfs4_ff_layout_mirror *mirror) + struct nfs4_ff_layout_mirror *mirror, + u32 dss_id) { if (mirror == NULL) goto outerr; - if (mirror->mirror_ds == NULL) { + if (mirror->dss[dss_id].mirror_ds == NULL) { struct nfs4_deviceid_node *node; struct nfs4_ff_layout_ds *mirror_ds = ERR_PTR(-ENODEV); node = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode), - &mirror->devid, lo->plh_lc_cred, + &mirror->dss[dss_id].devid, lo->plh_lc_cred, GFP_KERNEL); if (node) mirror_ds = FF_LAYOUT_MIRROR_DS(node); /* check for race with another call to this function */ - if (cmpxchg(&mirror->mirror_ds, NULL, mirror_ds) && + if (cmpxchg(&mirror->dss[dss_id].mirror_ds, NULL, mirror_ds) && mirror_ds != ERR_PTR(-ENODEV)) nfs4_put_deviceid_node(node); } - if (IS_ERR(mirror->mirror_ds)) + if (IS_ERR(mirror->dss[dss_id].mirror_ds)) goto outerr; return true; @@ -352,6 +354,7 @@ outerr: * nfs4_ff_layout_prepare_ds - prepare a DS connection for an RPC call * @lseg: the layout segment we're operating on * @mirror: layout mirror describing the DS to use + * @dss_id: DS stripe id to select stripe to use * @fail_return: return layout on connect failure? * * Try to prepare a DS connection to accept an RPC call. This involves @@ -368,6 +371,7 @@ outerr: struct nfs4_pnfs_ds * nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, struct nfs4_ff_layout_mirror *mirror, + u32 dss_id, bool fail_return) { struct nfs4_pnfs_ds *ds; @@ -376,10 +380,10 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, unsigned int max_payload; int status = -EAGAIN; - if (!ff_layout_init_mirror_ds(lseg->pls_layout, mirror)) + if (!ff_layout_init_mirror_ds(lseg->pls_layout, mirror, dss_id)) goto noconnect; - ds = mirror->mirror_ds->ds; + ds = mirror->dss[dss_id].mirror_ds->ds; if (READ_ONCE(ds->ds_clp)) goto out; /* matching smp_wmb() in _nfs4_pnfs_v3/4_ds_connect */ @@ -388,10 +392,10 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, /* FIXME: For now we assume the server sent only one version of NFS * to use for the DS. 
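 * (That assumption holds throughout this patch: dss_id selects the
 * stripe, while every version lookup still hardcodes ds_versions[0].)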
*/ - status = nfs4_pnfs_ds_connect(s, ds, &mirror->mirror_ds->id_node, + status = nfs4_pnfs_ds_connect(s, ds, &mirror->dss[dss_id].mirror_ds->id_node, dataserver_timeo, dataserver_retrans, - mirror->mirror_ds->ds_versions[0].version, - mirror->mirror_ds->ds_versions[0].minor_version); + mirror->dss[dss_id].mirror_ds->ds_versions[0].version, + mirror->dss[dss_id].mirror_ds->ds_versions[0].minor_version); /* connect success, check rsize/wsize limit */ if (!status) { @@ -404,15 +408,15 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, max_payload = nfs_block_size(rpc_max_payload(ds->ds_clp->cl_rpcclient), NULL); - if (mirror->mirror_ds->ds_versions[0].rsize > max_payload) - mirror->mirror_ds->ds_versions[0].rsize = max_payload; - if (mirror->mirror_ds->ds_versions[0].wsize > max_payload) - mirror->mirror_ds->ds_versions[0].wsize = max_payload; + if (mirror->dss[dss_id].mirror_ds->ds_versions[0].rsize > max_payload) + mirror->dss[dss_id].mirror_ds->ds_versions[0].rsize = max_payload; + if (mirror->dss[dss_id].mirror_ds->ds_versions[0].wsize > max_payload) + mirror->dss[dss_id].mirror_ds->ds_versions[0].wsize = max_payload; goto out; } noconnect: ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout), - mirror, lseg->pls_range.offset, + mirror, dss_id, lseg->pls_range.offset, lseg->pls_range.length, NFS4ERR_NXIO, OP_ILLEGAL, GFP_NOIO); ff_layout_send_layouterror(lseg); @@ -426,12 +430,13 @@ out: const struct cred * ff_layout_get_ds_cred(struct nfs4_ff_layout_mirror *mirror, const struct pnfs_layout_range *range, - const struct cred *mdscred) + const struct cred *mdscred, + u32 dss_id) { const struct cred *cred; - if (mirror && !mirror->mirror_ds->ds_versions[0].tightly_coupled) { - cred = ff_layout_get_mirror_cred(mirror, range->iomode); + if (mirror && !mirror->dss[dss_id].mirror_ds->ds_versions[0].tightly_coupled) { + cred = ff_layout_get_mirror_cred(mirror, range->iomode, dss_id); if (!cred) cred = get_cred(mdscred); } else { @@ -445,15 +450,17 @@ ff_layout_get_ds_cred(struct nfs4_ff_layout_mirror *mirror, * @mirror: pointer to the mirror * @ds_clp: nfs_client for the DS * @inode: pointer to inode + * @dss_id: DS stripe id * * Find or create a DS rpc client with the MDS server rpc client auth flavor * in the nfs_client cl_ds_clients list.
*/ struct rpc_clnt * nfs4_ff_find_or_create_ds_client(struct nfs4_ff_layout_mirror *mirror, - struct nfs_client *ds_clp, struct inode *inode) + struct nfs_client *ds_clp, struct inode *inode, + u32 dss_id) { - switch (mirror->mirror_ds->ds_versions[0].version) { + switch (mirror->dss[dss_id].mirror_ds->ds_versions[0].version) { case 3: /* For NFSv3 DS, flavor is set when creating DS connections */ return ds_clp->cl_rpcclient; @@ -559,16 +566,18 @@ static bool ff_read_layout_has_available_ds(struct pnfs_layout_segment *lseg) { struct nfs4_ff_layout_mirror *mirror; struct nfs4_deviceid_node *devid; - u32 idx; + u32 idx, dss_id; for (idx = 0; idx < FF_LAYOUT_MIRROR_COUNT(lseg); idx++) { mirror = FF_LAYOUT_COMP(lseg, idx); - if (mirror) { - if (!mirror->mirror_ds) + if (!mirror) + continue; + for (dss_id = 0; dss_id < mirror->dss_count; dss_id++) { + if (!mirror->dss[dss_id].mirror_ds) return true; - if (IS_ERR(mirror->mirror_ds)) + if (IS_ERR(mirror->dss[dss_id].mirror_ds)) continue; - devid = &mirror->mirror_ds->id_node; + devid = &mirror->dss[dss_id].mirror_ds->id_node; if (!nfs4_test_deviceid_unavailable(devid)) return true; } @@ -581,17 +590,21 @@ static bool ff_rw_layout_has_available_ds(struct pnfs_layout_segment *lseg) { struct nfs4_ff_layout_mirror *mirror; struct nfs4_deviceid_node *devid; - u32 idx; + u32 idx, dss_id; for (idx = 0; idx < FF_LAYOUT_MIRROR_COUNT(lseg); idx++) { mirror = FF_LAYOUT_COMP(lseg, idx); - if (!mirror || IS_ERR(mirror->mirror_ds)) - return false; - if (!mirror->mirror_ds) - continue; - devid = &mirror->mirror_ds->id_node; - if (nfs4_test_deviceid_unavailable(devid)) + if (!mirror) return false; + for (dss_id = 0; dss_id < mirror->dss_count; dss_id++) { + if (IS_ERR(mirror->dss[dss_id].mirror_ds)) + return false; + if (!mirror->dss[dss_id].mirror_ds) + continue; + devid = &mirror->dss[dss_id].mirror_ds->id_node; + if (nfs4_test_deviceid_unavailable(devid)) + return false; + } } return FF_LAYOUT_MIRROR_COUNT(lseg) != 0; diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c index 9e94d18448ff..b4679b7161b0 100644 --- a/fs/nfs/fs_context.c +++ b/fs/nfs/fs_context.c @@ -1269,8 +1269,7 @@ static int nfs23_parse_monolithic(struct fs_context *fc, int ret; data->context[NFS_MAX_CONTEXT_LEN] = '\0'; - ret = vfs_parse_fs_string(fc, "context", - data->context, strlen(data->context)); + ret = vfs_parse_fs_string(fc, "context", data->context); if (ret < 0) return ret; #else diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 9bdaf7f38bed..18b57c7c2f97 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -1073,6 +1073,21 @@ out_no_revalidate: if (S_ISDIR(inode->i_mode)) stat->blksize = NFS_SERVER(inode)->dtsize; stat->btime = NFS_I(inode)->btime; + + /* Special handling for STATX_DIOALIGN and STATX_DIO_READ_ALIGN + * - NFS doesn't have DIO alignment constraints, avoid getting + * these DIO attrs from remote and just respond with most + * accommodating limits (so client will issue supported DIO). + * - this is unintuitive, but the most coarse-grained + * dio_offset_align is the most accommodating. 
+ */ + if ((request_mask & (STATX_DIOALIGN | STATX_DIO_READ_ALIGN)) && + S_ISREG(inode->i_mode)) { + stat->result_mask |= STATX_DIOALIGN | STATX_DIO_READ_ALIGN; + stat->dio_mem_align = 4; /* 4-byte alignment */ + stat->dio_offset_align = PAGE_SIZE; + stat->dio_read_offset_align = stat->dio_offset_align; + } out: trace_nfs_getattr_exit(inode, err); return err; diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index c0a44f389f8f..2ecd38e1d17a 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -456,6 +456,16 @@ extern int nfs_wait_bit_killable(struct wait_bit_key *key, int mode); #if IS_ENABLED(CONFIG_NFS_LOCALIO) /* localio.c */ +struct nfs_local_dio { + u32 mem_align; + u32 offset_align; + loff_t middle_offset; + loff_t end_offset; + ssize_t start_len; /* Length for misaligned first extent */ + ssize_t middle_len; /* Length for DIO-aligned middle extent */ + ssize_t end_len; /* Length for misaligned last extent */ +}; + extern void nfs_local_probe_async(struct nfs_client *); extern void nfs_local_probe_async_work(struct work_struct *); extern struct nfsd_file *nfs_local_open_fh(struct nfs_client *, diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c index 97abf62f109d..2c0455e91571 100644 --- a/fs/nfs/localio.c +++ b/fs/nfs/localio.c @@ -30,6 +30,8 @@ #define NFSDBG_FACILITY NFSDBG_VFS +#define NFSLOCAL_MAX_IOS 3 + struct nfs_local_kiocb { struct kiocb kiocb; struct bio_vec *bvec; @@ -37,6 +39,14 @@ struct nfs_local_kiocb { struct work_struct work; void (*aio_complete_work)(struct work_struct *); struct nfsd_file *localio; + /* Begin mostly DIO-specific members */ + size_t end_len; + short int end_iter_index; + short int n_iters; + bool iter_is_dio_aligned[NFSLOCAL_MAX_IOS]; + loff_t offset[NFSLOCAL_MAX_IOS] ____cacheline_aligned; + struct iov_iter iters[NFSLOCAL_MAX_IOS]; + /* End mostly DIO-specific members */ }; struct nfs_local_fsync_ctx { @@ -49,11 +59,6 @@ struct nfs_local_fsync_ctx { static bool localio_enabled __read_mostly = true; module_param(localio_enabled, bool, 0644); -static bool localio_O_DIRECT_semantics __read_mostly = false; -module_param(localio_O_DIRECT_semantics, bool, 0644); -MODULE_PARM_DESC(localio_O_DIRECT_semantics, - "LOCALIO will use O_DIRECT semantics to filesystem."); - static inline bool nfs_client_is_local(const struct nfs_client *clp) { return !!rcu_access_pointer(clp->cl_uuid.net); @@ -231,13 +236,13 @@ __nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred, struct nfsd_file __rcu **pnf, const fmode_t mode) { + int status = 0; struct nfsd_file *localio; localio = nfs_open_local_fh(&clp->cl_uuid, clp->cl_rpcclient, cred, fh, nfl, pnf, mode); if (IS_ERR(localio)) { - int status = PTR_ERR(localio); - trace_nfs_local_open_fh(fh, mode, status); + status = PTR_ERR(localio); switch (status) { case -ENOMEM: case -ENXIO: @@ -247,6 +252,7 @@ __nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred, nfs_local_probe(clp); } } + trace_nfs_local_open_fh(fh, mode, status); return localio; } @@ -281,23 +287,6 @@ nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred, } EXPORT_SYMBOL_GPL(nfs_local_open_fh); -static struct bio_vec * -nfs_bvec_alloc_and_import_pagevec(struct page **pagevec, - unsigned int npages, gfp_t flags) -{ - struct bio_vec *bvec, *p; - - bvec = kmalloc_array(npages, sizeof(*bvec), flags); - if (bvec != NULL) { - for (p = bvec; npages > 0; p++, pagevec++, npages--) { - p->bv_page = *pagevec; - p->bv_len = PAGE_SIZE; - p->bv_offset = 0; - } - } - return bvec; -} - static void nfs_local_iocb_free(struct 
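/* A worked example of the start/middle/end split described by struct
 * nfs_local_dio above and computed in nfs_is_local_dio_possible() below
 * (hypothetical numbers, assuming offset_align = 4096): a 10000-byte
 * write at offset 1000 gives orig_end = 11000 and
 *
 *	start_len  = round_up(1000, 4096) - 1000     = 3096	(buffered)
 *	middle_len = round_down(11000, 4096) - 4096  = 4096	(DIO-eligible)
 *	end_len    = 11000 - round_down(11000, 4096) = 2808	(buffered)
 *
 * The three lengths always sum to the original length, and only the
 * aligned middle extent can be issued with IOCB_DIRECT.
 */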
nfs_local_kiocb *iocb) { @@ -311,40 +300,191 @@ nfs_local_iocb_alloc(struct nfs_pgio_header *hdr, { struct nfs_local_kiocb *iocb; - iocb = kmalloc(sizeof(*iocb), flags); + iocb = kzalloc(sizeof(*iocb), flags); if (iocb == NULL) return NULL; - iocb->bvec = nfs_bvec_alloc_and_import_pagevec(hdr->page_array.pagevec, - hdr->page_array.npages, flags); + + iocb->bvec = kmalloc_array(hdr->page_array.npages, + sizeof(struct bio_vec), flags); if (iocb->bvec == NULL) { kfree(iocb); return NULL; } - if (localio_O_DIRECT_semantics && - test_bit(NFS_IOHDR_ODIRECT, &hdr->flags)) { - iocb->kiocb.ki_filp = file; - iocb->kiocb.ki_flags = IOCB_DIRECT; - } else - init_sync_kiocb(&iocb->kiocb, file); + init_sync_kiocb(&iocb->kiocb, file); - iocb->kiocb.ki_pos = hdr->args.offset; iocb->hdr = hdr; iocb->kiocb.ki_flags &= ~IOCB_APPEND; iocb->aio_complete_work = NULL; + iocb->end_iter_index = -1; + return iocb; } -static void -nfs_local_iter_init(struct iov_iter *i, struct nfs_local_kiocb *iocb, int dir) +static bool +nfs_is_local_dio_possible(struct nfs_local_kiocb *iocb, int rw, + size_t len, struct nfs_local_dio *local_dio) +{ + struct nfs_pgio_header *hdr = iocb->hdr; + loff_t offset = hdr->args.offset; + u32 nf_dio_mem_align, nf_dio_offset_align, nf_dio_read_offset_align; + loff_t start_end, orig_end, middle_end; + + nfs_to->nfsd_file_dio_alignment(iocb->localio, &nf_dio_mem_align, + &nf_dio_offset_align, &nf_dio_read_offset_align); + if (rw == ITER_DEST) + nf_dio_offset_align = nf_dio_read_offset_align; + + if (unlikely(!nf_dio_mem_align || !nf_dio_offset_align)) + return false; + if (unlikely(nf_dio_offset_align > PAGE_SIZE)) + return false; + if (unlikely(len < nf_dio_offset_align)) + return false; + + local_dio->mem_align = nf_dio_mem_align; + local_dio->offset_align = nf_dio_offset_align; + + start_end = round_up(offset, nf_dio_offset_align); + orig_end = offset + len; + middle_end = round_down(orig_end, nf_dio_offset_align); + + local_dio->middle_offset = start_end; + local_dio->end_offset = middle_end; + + local_dio->start_len = start_end - offset; + local_dio->middle_len = middle_end - start_end; + local_dio->end_len = orig_end - middle_end; + + if (rw == ITER_DEST) + trace_nfs_local_dio_read(hdr->inode, offset, len, local_dio); + else + trace_nfs_local_dio_write(hdr->inode, offset, len, local_dio); + return true; +} + +static bool nfs_iov_iter_aligned_bvec(const struct iov_iter *i, + unsigned int addr_mask, unsigned int len_mask) +{ + const struct bio_vec *bvec = i->bvec; + size_t skip = i->iov_offset; + size_t size = i->count; + + if (size & len_mask) + return false; + do { + size_t len = bvec->bv_len; + + if (len > size) + len = size; + if ((unsigned long)(bvec->bv_offset + skip) & addr_mask) + return false; + bvec++; + size -= len; + skip = 0; + } while (size); + + return true; +} + +/* + * Setup as many as 3 iov_iter based on extents described by @local_dio. + * Returns the number of iov_iter that were setup. + */ +static int +nfs_local_iters_setup_dio(struct nfs_local_kiocb *iocb, int rw, + unsigned int nvecs, size_t len, + struct nfs_local_dio *local_dio) +{ + int n_iters = 0; + struct iov_iter *iters = iocb->iters; + + /* Setup misaligned start? */ + if (local_dio->start_len) { + iov_iter_bvec(&iters[n_iters], rw, iocb->bvec, nvecs, len); + iters[n_iters].count = local_dio->start_len; + iocb->offset[n_iters] = iocb->hdr->args.offset; + iocb->iter_is_dio_aligned[n_iters] = false; + ++n_iters; + } + + /* Setup misaligned end? 
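+ * (an end extent exists when offset + len is not offset_align-aligned;
+ *  see nfs_is_local_dio_possible() above)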
+ * If so, the end is purposely setup to be issued using buffered IO + * before the middle (which will use DIO, if DIO-aligned, with AIO). + * This creates problems if/when the end results in a partial write. + * So must save index and length of end to handle this corner case. + */ + if (local_dio->end_len) { + iov_iter_bvec(&iters[n_iters], rw, iocb->bvec, nvecs, len); + iocb->offset[n_iters] = local_dio->end_offset; + iov_iter_advance(&iters[n_iters], + local_dio->start_len + local_dio->middle_len); + iocb->iter_is_dio_aligned[n_iters] = false; + /* Save index and length of end */ + iocb->end_iter_index = n_iters; + iocb->end_len = local_dio->end_len; + ++n_iters; + } + + /* Setup DIO-aligned middle to be issued last, to allow for + * DIO with AIO completion (see nfs_local_call_{read,write}). + */ + iov_iter_bvec(&iters[n_iters], rw, iocb->bvec, nvecs, len); + if (local_dio->start_len) + iov_iter_advance(&iters[n_iters], local_dio->start_len); + iters[n_iters].count -= local_dio->end_len; + iocb->offset[n_iters] = local_dio->middle_offset; + + iocb->iter_is_dio_aligned[n_iters] = + nfs_iov_iter_aligned_bvec(&iters[n_iters], + local_dio->mem_align-1, local_dio->offset_align-1); + + if (unlikely(!iocb->iter_is_dio_aligned[n_iters])) { + trace_nfs_local_dio_misaligned(iocb->hdr->inode, + iocb->hdr->args.offset, len, local_dio); + return 0; /* no DIO-aligned IO possible */ + } + ++n_iters; + + iocb->n_iters = n_iters; + return n_iters; +} + +static noinline_for_stack void +nfs_local_iters_init(struct nfs_local_kiocb *iocb, int rw) { struct nfs_pgio_header *hdr = iocb->hdr; + struct page **pagevec = hdr->page_array.pagevec; + unsigned long v, total; + unsigned int base; + size_t len; + + v = 0; + total = hdr->args.count; + base = hdr->args.pgbase; + while (total && v < hdr->page_array.npages) { + len = min_t(size_t, total, PAGE_SIZE - base); + bvec_set_page(&iocb->bvec[v], *pagevec, len, base); + total -= len; + ++pagevec; + ++v; + base = 0; + } + len = hdr->args.count - total; + + if (test_bit(NFS_IOHDR_ODIRECT, &hdr->flags)) { + struct nfs_local_dio local_dio; + + if (nfs_is_local_dio_possible(iocb, rw, len, &local_dio) && + nfs_local_iters_setup_dio(iocb, rw, v, len, &local_dio) != 0) + return; /* is DIO-aligned */ + } - iov_iter_bvec(i, dir, iocb->bvec, hdr->page_array.npages, - hdr->args.count + hdr->args.pgbase); - if (hdr->args.pgbase != 0) - iov_iter_advance(i, hdr->args.pgbase); + /* Use buffered IO */ + iocb->offset[0] = hdr->args.offset; + iov_iter_bvec(&iocb->iters[0], rw, iocb->bvec, v, len); + iocb->n_iters = 1; } static void @@ -367,10 +507,12 @@ nfs_local_pgio_init(struct nfs_pgio_header *hdr, static void nfs_local_pgio_done(struct nfs_pgio_header *hdr, long status) { + /* Must handle partial completions */ if (status >= 0) { - hdr->res.count = status; - hdr->res.op_status = NFS4_OK; - hdr->task.tk_status = 0; + hdr->res.count += status; + /* @hdr was initialized to 0 (zeroed during allocation) */ + if (hdr->task.tk_status == 0) + hdr->res.op_status = NFS4_OK; } else { hdr->res.op_status = nfs_localio_errno_to_nfs4_stat(status); hdr->task.tk_status = status; @@ -378,12 +520,18 @@ nfs_local_pgio_done(struct nfs_pgio_header *hdr, long status) } static void +nfs_local_iocb_release(struct nfs_local_kiocb *iocb) +{ + nfs_local_file_put(iocb->localio); + nfs_local_iocb_free(iocb); +} + +static void nfs_local_pgio_release(struct nfs_local_kiocb *iocb) { struct nfs_pgio_header *hdr = iocb->hdr; - nfs_local_file_put(iocb->localio); - nfs_local_iocb_free(iocb); + 
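/* All of the iov_iters set up above alias one bvec array; only the byte
 * window differs.  A minimal standalone sketch of that carving technique
 * (hypothetical sizes, not values from the patch):
 *
 *	struct iov_iter it;
 *
 *	iov_iter_bvec(&it, ITER_SOURCE, bvec, nvecs, 10000);
 *	iov_iter_advance(&it, 3096);	// skip the misaligned start
 *	it.count -= 2808;		// trim the misaligned end
 *	// 'it' now spans bytes 3096..7191: the DIO-aligned middle
 */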
nfs_local_iocb_release(iocb); nfs_local_hdr_release(hdr, hdr->task.tk_ops); } @@ -405,7 +553,10 @@ nfs_local_read_done(struct nfs_local_kiocb *iocb, long status) struct nfs_pgio_header *hdr = iocb->hdr; struct file *filp = iocb->kiocb.ki_filp; - nfs_local_pgio_done(hdr, status); + if ((iocb->kiocb.ki_flags & IOCB_DIRECT) && status == -EINVAL) { + /* Underlying FS will return -EINVAL if misaligned DIO is attempted. */ + pr_info_ratelimited("nfs: Unexpected direct I/O read alignment failure\n"); + } /* * Must clear replen otherwise NFSv3 data corruption will occur @@ -434,6 +585,7 @@ static void nfs_local_read_aio_complete(struct kiocb *kiocb, long ret) struct nfs_local_kiocb *iocb = container_of(kiocb, struct nfs_local_kiocb, kiocb); + nfs_local_pgio_done(iocb->hdr, ret); nfs_local_read_done(iocb, ret); nfs_local_pgio_aio_complete(iocb); /* Calls nfs_local_read_aio_complete_work */ } @@ -444,14 +596,25 @@ static void nfs_local_call_read(struct work_struct *work) container_of(work, struct nfs_local_kiocb, work); struct file *filp = iocb->kiocb.ki_filp; const struct cred *save_cred; - struct iov_iter iter; ssize_t status; save_cred = override_creds(filp->f_cred); - nfs_local_iter_init(&iter, iocb, READ); + for (int i = 0; i < iocb->n_iters ; i++) { + if (iocb->iter_is_dio_aligned[i]) { + iocb->kiocb.ki_flags |= IOCB_DIRECT; + iocb->kiocb.ki_complete = nfs_local_read_aio_complete; + iocb->aio_complete_work = nfs_local_read_aio_complete_work; + } - status = filp->f_op->read_iter(&iocb->kiocb, &iter); + iocb->kiocb.ki_pos = iocb->offset[i]; + status = filp->f_op->read_iter(&iocb->kiocb, &iocb->iters[i]); + if (status != -EIOCBQUEUED) { + nfs_local_pgio_done(iocb->hdr, status); + if (iocb->hdr->task.tk_status) + break; + } + } revert_creds(save_cred); @@ -462,33 +625,17 @@ static void nfs_local_call_read(struct work_struct *work) } static int -nfs_do_local_read(struct nfs_pgio_header *hdr, - struct nfsd_file *localio, +nfs_local_do_read(struct nfs_local_kiocb *iocb, const struct rpc_call_ops *call_ops) { - struct nfs_local_kiocb *iocb; - struct file *file = nfs_to->nfsd_file_file(localio); - - /* Don't support filesystems without read_iter */ - if (!file->f_op->read_iter) - return -EAGAIN; + struct nfs_pgio_header *hdr = iocb->hdr; dprintk("%s: vfs_read count=%u pos=%llu\n", __func__, hdr->args.count, hdr->args.offset); - iocb = nfs_local_iocb_alloc(hdr, file, GFP_KERNEL); - if (iocb == NULL) - return -ENOMEM; - iocb->localio = localio; - nfs_local_pgio_init(hdr, call_ops); hdr->res.eof = false; - if (iocb->kiocb.ki_flags & IOCB_DIRECT) { - iocb->kiocb.ki_complete = nfs_local_read_aio_complete; - iocb->aio_complete_work = nfs_local_read_aio_complete_work; - } - INIT_WORK(&iocb->work, nfs_local_call_read); queue_work(nfslocaliod_workqueue, &iocb->work); @@ -529,7 +676,7 @@ nfs_set_local_verifier(struct inode *inode, } /* Factored out from fs/nfsd/vfs.h:fh_getattr() */ -static int __vfs_getattr(struct path *p, struct kstat *stat, int version) +static int __vfs_getattr(const struct path *p, struct kstat *stat, int version) { u32 request_mask = STATX_BASIC_STATS; @@ -597,7 +744,13 @@ nfs_local_write_done(struct nfs_local_kiocb *iocb, long status) dprintk("%s: wrote %ld bytes.\n", __func__, status > 0 ? status : 0); + if ((iocb->kiocb.ki_flags & IOCB_DIRECT) && status == -EINVAL) { + /* Underlying FS will return -EINVAL if misaligned DIO is attempted. 
*/ + pr_info_ratelimited("nfs: Unexpected direct I/O write alignment failure\n"); + } + /* Handle short writes as if they are ENOSPC */ + status = hdr->res.count; if (status > 0 && status < hdr->args.count) { hdr->mds_offset += status; hdr->args.offset += status; @@ -605,11 +758,11 @@ nfs_local_write_done(struct nfs_local_kiocb *iocb, long status) hdr->args.count -= status; nfs_set_pgio_error(hdr, -ENOSPC, hdr->args.offset); status = -ENOSPC; + /* record -ENOSPC in terms of nfs_local_pgio_done */ + nfs_local_pgio_done(hdr, status); } - if (status < 0) + if (hdr->task.tk_status < 0) nfs_reset_boot_verifier(inode); - - nfs_local_pgio_done(hdr, status); } static void nfs_local_write_aio_complete_work(struct work_struct *work) @@ -626,6 +779,7 @@ static void nfs_local_write_aio_complete(struct kiocb *kiocb, long ret) struct nfs_local_kiocb *iocb = container_of(kiocb, struct nfs_local_kiocb, kiocb); + nfs_local_pgio_done(iocb->hdr, ret); nfs_local_write_done(iocb, ret); nfs_local_pgio_aio_complete(iocb); /* Calls nfs_local_write_aio_complete_work */ } @@ -637,16 +791,53 @@ static void nfs_local_call_write(struct work_struct *work) struct file *filp = iocb->kiocb.ki_filp; unsigned long old_flags = current->flags; const struct cred *save_cred; - struct iov_iter iter; ssize_t status; current->flags |= PF_LOCAL_THROTTLE | PF_MEMALLOC_NOIO; save_cred = override_creds(filp->f_cred); - nfs_local_iter_init(&iter, iocb, WRITE); - file_start_write(filp); - status = filp->f_op->write_iter(&iocb->kiocb, &iter); + for (int i = 0; i < iocb->n_iters ; i++) { + if (iocb->iter_is_dio_aligned[i]) { + iocb->kiocb.ki_flags |= IOCB_DIRECT; + iocb->kiocb.ki_complete = nfs_local_write_aio_complete; + iocb->aio_complete_work = nfs_local_write_aio_complete_work; + } +retry: + iocb->kiocb.ki_pos = iocb->offset[i]; + status = filp->f_op->write_iter(&iocb->kiocb, &iocb->iters[i]); + if (status != -EIOCBQUEUED) { + if (unlikely(status >= 0 && status < iocb->iters[i].count)) { + /* partial write */ + if (i == iocb->end_iter_index) { + /* Must not account partial end, otherwise, due + * to end being issued before middle: the partial + * write accounting in nfs_local_write_done() + * would incorrectly advance hdr->args.offset + */ + status = 0; + } else { + /* Partial write at start or buffered middle, + * exit early. + */ + nfs_local_pgio_done(iocb->hdr, status); + break; + } + } else if (unlikely(status == -ENOTBLK && + (iocb->kiocb.ki_flags & IOCB_DIRECT))) { + /* VFS will return -ENOTBLK if DIO WRITE fails to + * invalidate the page cache. Retry using buffered IO. + */ + iocb->kiocb.ki_flags &= ~IOCB_DIRECT; + iocb->kiocb.ki_complete = NULL; + iocb->aio_complete_work = NULL; + goto retry; + } + nfs_local_pgio_done(iocb->hdr, status); + if (iocb->hdr->task.tk_status) + break; + } + } file_end_write(filp); revert_creds(save_cred); @@ -660,26 +851,15 @@ static void nfs_local_call_write(struct work_struct *work) } static int -nfs_do_local_write(struct nfs_pgio_header *hdr, - struct nfsd_file *localio, +nfs_local_do_write(struct nfs_local_kiocb *iocb, const struct rpc_call_ops *call_ops) { - struct nfs_local_kiocb *iocb; - struct file *file = nfs_to->nfsd_file_file(localio); - - /* Don't support filesystems without write_iter */ - if (!file->f_op->write_iter) - return -EAGAIN; + struct nfs_pgio_header *hdr = iocb->hdr; dprintk("%s: vfs_write count=%u pos=%llu %s\n", __func__, hdr->args.count, hdr->args.offset, (hdr->args.stable == NFS_UNSTABLE) ? 
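/* Reviewer's note, not patch text: nfs_local_pgio_done() accumulates
 * hdr->res.count across the (up to three) sub-I/Os, so
 * nfs_local_write_done() reloads the running total via
 * "status = hdr->res.count" before the short-write check below; the
 * ENOSPC handling therefore sees total bytes written, not just the last
 * sub-I/O's return value.
 */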
"unstable" : "stable"); - iocb = nfs_local_iocb_alloc(hdr, file, GFP_NOIO); - if (iocb == NULL) - return -ENOMEM; - iocb->localio = localio; - switch (hdr->args.stable) { default: break; @@ -694,43 +874,74 @@ nfs_do_local_write(struct nfs_pgio_header *hdr, nfs_set_local_verifier(hdr->inode, hdr->res.verf, hdr->args.stable); - if (iocb->kiocb.ki_flags & IOCB_DIRECT) { - iocb->kiocb.ki_complete = nfs_local_write_aio_complete; - iocb->aio_complete_work = nfs_local_write_aio_complete_work; - } - INIT_WORK(&iocb->work, nfs_local_call_write); queue_work(nfslocaliod_workqueue, &iocb->work); return 0; } +static struct nfs_local_kiocb * +nfs_local_iocb_init(struct nfs_pgio_header *hdr, struct nfsd_file *localio) +{ + struct file *file = nfs_to->nfsd_file_file(localio); + struct nfs_local_kiocb *iocb; + gfp_t gfp_mask; + int rw; + + if (hdr->rw_mode & FMODE_READ) { + if (!file->f_op->read_iter) + return ERR_PTR(-EOPNOTSUPP); + gfp_mask = GFP_KERNEL; + rw = ITER_DEST; + } else { + if (!file->f_op->write_iter) + return ERR_PTR(-EOPNOTSUPP); + gfp_mask = GFP_NOIO; + rw = ITER_SOURCE; + } + + iocb = nfs_local_iocb_alloc(hdr, file, gfp_mask); + if (iocb == NULL) + return ERR_PTR(-ENOMEM); + iocb->hdr = hdr; + iocb->localio = localio; + + nfs_local_iters_init(iocb, rw); + + return iocb; +} + int nfs_local_doio(struct nfs_client *clp, struct nfsd_file *localio, struct nfs_pgio_header *hdr, const struct rpc_call_ops *call_ops) { + struct nfs_local_kiocb *iocb; int status = 0; if (!hdr->args.count) return 0; + iocb = nfs_local_iocb_init(hdr, localio); + if (IS_ERR(iocb)) + return PTR_ERR(iocb); + switch (hdr->rw_mode) { case FMODE_READ: - status = nfs_do_local_read(hdr, localio, call_ops); + status = nfs_local_do_read(iocb, call_ops); break; case FMODE_WRITE: - status = nfs_do_local_write(hdr, localio, call_ops); + status = nfs_local_do_write(iocb, call_ops); break; default: dprintk("%s: invalid mode: %d\n", __func__, hdr->rw_mode); - status = -EINVAL; + status = -EOPNOTSUPP; } if (status != 0) { if (status == -EAGAIN) nfs_localio_disable_client(clp); - nfs_local_file_put(localio); + nfs_local_iocb_release(iocb); hdr->task.tk_status = status; nfs_local_hdr_release(hdr, call_ops); } diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index f9a3a1fbf44c..5a4d193da1a9 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -290,7 +290,8 @@ int nfs_do_submount(struct fs_context *fc) nfs_errorf(fc, "NFS: Couldn't determine submount pathname"); ret = PTR_ERR(p); } else { - ret = vfs_parse_fs_string(fc, "source", p, buffer + 4096 - p); + ret = vfs_parse_fs_qstr(fc, "source", + &QSTR_LEN(p, buffer + 4096 - p)); if (!ret) ret = vfs_get_tree(fc); } diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c index 6e75c6c2d234..9eff09158518 100644 --- a/fs/nfs/nfs2xdr.c +++ b/fs/nfs/nfs2xdr.c @@ -23,8 +23,8 @@ #include <linux/nfs2.h> #include <linux/nfs_fs.h> #include <linux/nfs_common.h> -#include "nfstrace.h" #include "internal.h" +#include "nfstrace.h" #define NFSDBG_FACILITY NFSDBG_XDR diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c index 4ae01c10b7e2..e17d72908412 100644 --- a/fs/nfs/nfs3xdr.c +++ b/fs/nfs/nfs3xdr.c @@ -23,8 +23,8 @@ #include <linux/nfsacl.h> #include <linux/nfs_common.h> -#include "nfstrace.h" #include "internal.h" +#include "nfstrace.h" #define NFSDBG_FACILITY NFSDBG_XDR diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 6a0b5871ba3b..d537fb0c230e 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -1514,7 +1514,7 @@ static ssize_t _nfs42_proc_listxattrs(struct inode *inode, void 
*buf, ret = -ENOMEM; - res.scratch = alloc_page(GFP_KERNEL); + res.scratch = folio_alloc(GFP_KERNEL, 0); if (!res.scratch) goto out; @@ -1552,7 +1552,7 @@ out_free_pages: } kfree(pages); out_free_scratch: - __free_page(res.scratch); + folio_put(res.scratch); out: return ret; diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index 4cc915d5741d..e10d83ba835e 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -1781,7 +1781,7 @@ static int nfs4_xdr_dec_listxattrs(struct rpc_rqst *rqstp, struct compound_hdr hdr; int status; - xdr_set_scratch_page(xdr, res->scratch); + xdr_set_scratch_folio(xdr, res->scratch); status = decode_compound_hdr(xdr, &hdr); if (status) diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c index 6fddf43d729c..5998d6bd8a4f 100644 --- a/fs/nfs/nfs4client.c +++ b/fs/nfs/nfs4client.c @@ -222,6 +222,7 @@ struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *cl_init) clp->cl_state = 1 << NFS4CLNT_LEASE_EXPIRED; clp->cl_mvops = nfs_v4_minor_ops[cl_init->minorversion]; clp->cl_mig_gen = 1; + clp->cl_last_renewal = jiffies; #if IS_ENABLED(CONFIG_NFS_V4_1) init_waitqueue_head(&clp->cl_lock_waitq); #endif diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index c9a0d1e420c6..7f43e890d356 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -456,4 +456,5 @@ const struct file_operations nfs4_file_operations = { #else .llseek = nfs_file_llseek, #endif + .fop_flags = FOP_DONTCACHE, }; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index ce61253efd45..411776718494 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -391,7 +391,9 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent *p++ = htonl(attrs); /* bitmap */ *p++ = htonl(12); /* attribute buffer length */ *p++ = htonl(NF4DIR); + spin_lock(&dentry->d_lock); p = xdr_encode_hyper(p, NFS_FILEID(d_inode(dentry->d_parent))); + spin_unlock(&dentry->d_lock); readdir->pgbase = (char *)p - (char *)start; readdir->count -= readdir->pgbase; @@ -3634,6 +3636,7 @@ struct nfs4_closedata { } lr; struct nfs_fattr fattr; unsigned long timestamp; + unsigned short retrans; }; static void nfs4_free_closedata(void *data) @@ -3662,6 +3665,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data) .state = state, .inode = calldata->inode, .stateid = &calldata->arg.stateid, + .retrans = calldata->retrans, }; if (!nfs4_sequence_done(task, &calldata->res.seq_res)) @@ -3709,6 +3713,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data) default: task->tk_status = nfs4_async_handle_exception(task, server, task->tk_status, &exception); + calldata->retrans = exception.retrans; if (exception.retry) goto out_restart; } @@ -5591,9 +5596,11 @@ static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_pgio_header *hdr) .inode = hdr->inode, .state = hdr->args.context->state, .stateid = &hdr->args.stateid, + .retrans = hdr->retrans, }; task->tk_status = nfs4_async_handle_exception(task, server, task->tk_status, &exception); + hdr->retrans = exception.retrans; if (exception.retry) { rpc_restart_call_prepare(task); return -EAGAIN; @@ -5707,10 +5714,12 @@ static int nfs4_write_done_cb(struct rpc_task *task, .inode = hdr->inode, .state = hdr->args.context->state, .stateid = &hdr->args.stateid, + .retrans = hdr->retrans, }; task->tk_status = nfs4_async_handle_exception(task, NFS_SERVER(inode), task->tk_status, &exception); + hdr->retrans = exception.retrans; if (exception.retry) { rpc_restart_call_prepare(task); return -EAGAIN; @@ -6160,7 +6169,7 @@ static ssize_t 
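/* Conversion note: folio_alloc(gfp, 0) allocates an order-0 folio, the
 * folio-API counterpart of alloc_page(), and folio_put() is the matching
 * release, replacing __free_page(); that is all the mechanical
 * scratch-buffer swaps in this series amount to.
 */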
__nfs4_get_acl_uncached(struct inode *inode, void *buf, } /* for decoding across pages */ - res.acl_scratch = alloc_page(GFP_KERNEL); + res.acl_scratch = folio_alloc(GFP_KERNEL, 0); if (!res.acl_scratch) goto out_free; @@ -6196,7 +6205,7 @@ out_free: while (--i >= 0) __free_page(pages[i]); if (res.acl_scratch) - __free_page(res.acl_scratch); + folio_put(res.acl_scratch); kfree(pages); return ret; } @@ -6724,6 +6733,7 @@ struct nfs4_delegreturndata { struct nfs_fh fh; nfs4_stateid stateid; unsigned long timestamp; + unsigned short retrans; struct { struct nfs4_layoutreturn_args arg; struct nfs4_layoutreturn_res res; @@ -6744,6 +6754,7 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) .inode = data->inode, .stateid = &data->stateid, .task_is_privileged = data->args.seq_args.sa_privileged, + .retrans = data->retrans, }; if (!nfs4_sequence_done(task, &data->res.seq_res)) @@ -6815,6 +6826,7 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) task->tk_status = nfs4_async_handle_exception(task, data->res.server, task->tk_status, &exception); + data->retrans = exception.retrans; if (exception.retry) goto out_restart; } @@ -7091,6 +7103,7 @@ struct nfs4_unlockdata { struct file_lock fl; struct nfs_server *server; unsigned long timestamp; + unsigned short retrans; }; static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl, @@ -7145,6 +7158,7 @@ static void nfs4_locku_done(struct rpc_task *task, void *data) struct nfs4_exception exception = { .inode = calldata->lsp->ls_state->inode, .stateid = &calldata->arg.stateid, + .retrans = calldata->retrans, }; if (!nfs4_sequence_done(task, &calldata->res.seq_res)) @@ -7178,6 +7192,7 @@ static void nfs4_locku_done(struct rpc_task *task, void *data) task->tk_status = nfs4_async_handle_exception(task, calldata->server, task->tk_status, &exception); + calldata->retrans = exception.retrans; if (exception.retry) rpc_restart_call_prepare(task); } @@ -7872,10 +7887,10 @@ int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state, return err; do { err = _nfs4_do_setlk(state, F_SETLK, fl, NFS_LOCK_NEW); - if (err != -NFS4ERR_DELAY) + if (err != -NFS4ERR_DELAY && err != -NFS4ERR_GRACE) break; ssleep(1); - } while (err == -NFS4ERR_DELAY); + } while (err == -NFS4ERR_DELAY || err == -NFS4ERR_GRACE); return nfs4_handle_delegation_recall_error(server, state, stateid, fl, err); } @@ -9442,7 +9457,7 @@ static int nfs4_verify_back_channel_attrs(struct nfs41_create_session_args *args goto out; if (rcvd->max_rqst_sz > sent->max_rqst_sz) return -EINVAL; - if (rcvd->max_resp_sz < sent->max_resp_sz) + if (rcvd->max_resp_sz > sent->max_resp_sz) return -EINVAL; if (rcvd->max_resp_sz_cached > sent->max_resp_sz_cached) return -EINVAL; diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 7612e977e80b..01179f7de322 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -2744,6 +2744,9 @@ out_error: case -ENETUNREACH: nfs_mark_client_ready(clp, -EIO); break; + case -EINVAL: + nfs_mark_client_ready(clp, status); + break; default: ssleep(1); break; diff --git a/fs/nfs/nfs4super.c b/fs/nfs/nfs4super.c index b29a26923ce0..5ec9c83f1ef0 100644 --- a/fs/nfs/nfs4super.c +++ b/fs/nfs/nfs4super.c @@ -149,21 +149,9 @@ static int do_nfs4_mount(struct nfs_server *server, struct fs_context *root_fc; struct vfsmount *root_mnt; struct dentry *dentry; - size_t len; + char *source; int ret; - struct fs_parameter param = { - .key = "source", - .type = fs_value_is_string, - .dirfd = -1, - }; - - struct
fs_parameter param_fsc = { - .key = "fsc", - .type = fs_value_is_string, - .dirfd = -1, - }; - if (IS_ERR(server)) return PTR_ERR(server); @@ -181,15 +169,7 @@ static int do_nfs4_mount(struct nfs_server *server, root_ctx->server = server; if (ctx->fscache_uniq) { - len = strlen(ctx->fscache_uniq); - param_fsc.size = len; - param_fsc.string = kmemdup_nul(ctx->fscache_uniq, len, GFP_KERNEL); - if (param_fsc.string == NULL) { - put_fs_context(root_fc); - return -ENOMEM; - } - ret = vfs_parse_fs_param(root_fc, &param_fsc); - kfree(param_fsc.string); + ret = vfs_parse_fs_string(root_fc, "fsc", ctx->fscache_uniq); if (ret < 0) { put_fs_context(root_fc); return ret; @@ -197,20 +177,18 @@ static int do_nfs4_mount(struct nfs_server *server, } /* We leave export_path unset as it's not used to find the root. */ - len = strlen(hostname) + 5; - param.string = kmalloc(len, GFP_KERNEL); - if (param.string == NULL) { - put_fs_context(root_fc); - return -ENOMEM; - } - /* Does hostname need to be enclosed in brackets? */ if (strchr(hostname, ':')) - param.size = snprintf(param.string, len, "[%s]:/", hostname); + source = kasprintf(GFP_KERNEL, "[%s]:/", hostname); else - param.size = snprintf(param.string, len, "%s:/", hostname); - ret = vfs_parse_fs_param(root_fc, &param); - kfree(param.string); + source = kasprintf(GFP_KERNEL, "%s:/", hostname); + + if (!source) { + put_fs_context(root_fc); + return -ENOMEM; + } + ret = vfs_parse_fs_string(root_fc, "source", source); + kfree(source); if (ret < 0) { put_fs_context(root_fc); return ret; diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 49ff98571fa5..1d0e6c10f921 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -4930,7 +4930,7 @@ static int decode_attr_pnfstype(struct xdr_stream *xdr, uint32_t *bitmap, } /* - * The prefered block size for layout directed io + * The preferred block size for layout directed io */ static int decode_attr_layout_blksize(struct xdr_stream *xdr, uint32_t *bitmap, uint32_t *res) @@ -6585,7 +6585,7 @@ nfs4_xdr_dec_getacl(struct rpc_rqst *rqstp, struct xdr_stream *xdr, int status; if (res->acl_scratch != NULL) - xdr_set_scratch_page(xdr, res->acl_scratch); + xdr_set_scratch_folio(xdr, res->acl_scratch); status = decode_compound_hdr(xdr, &hdr); if (status) goto out; diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h index 627115179795..6ce55e8e6b67 100644 --- a/fs/nfs/nfstrace.h +++ b/fs/nfs/nfstrace.h @@ -45,6 +45,23 @@ { BIT(NFS_INO_LAYOUTSTATS), "LAYOUTSTATS" }, \ { BIT(NFS_INO_ODIRECT), "ODIRECT" }) +#define nfs_show_wb_flags(v) \ + __print_flags(v, "|", \ + { BIT(PG_BUSY), "BUSY" }, \ + { BIT(PG_MAPPED), "MAPPED" }, \ + { BIT(PG_FOLIO), "FOLIO" }, \ + { BIT(PG_CLEAN), "CLEAN" }, \ + { BIT(PG_COMMIT_TO_DS), "COMMIT_TO_DS" }, \ + { BIT(PG_INODE_REF), "INODE_REF" }, \ + { BIT(PG_HEADLOCK), "HEADLOCK" }, \ + { BIT(PG_TEARDOWN), "TEARDOWN" }, \ + { BIT(PG_UNLOCKPAGE), "UNLOCKPAGE" }, \ + { BIT(PG_UPTODATE), "UPTODATE" }, \ + { BIT(PG_WB_END), "WB_END" }, \ + { BIT(PG_REMOVE), "REMOVE" }, \ + { BIT(PG_CONTENDED1), "CONTENDED1" }, \ + { BIT(PG_CONTENDED2), "CONTENDED2" }) + DECLARE_EVENT_CLASS(nfs_inode_event, TP_PROTO( const struct inode *inode @@ -967,7 +984,7 @@ DECLARE_EVENT_CLASS(nfs_folio_event, __entry->fileid = nfsi->fileid; __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); __entry->version = inode_peek_iversion_raw(inode); - __entry->offset = offset, + __entry->offset = offset; __entry->count = count; ), @@ -1017,8 +1034,8 @@ DECLARE_EVENT_CLASS(nfs_folio_event_done, __entry->fileid = nfsi->fileid; __entry->fhandle =
nfs_fhandle_hash(&nfsi->fh); __entry->version = inode_peek_iversion_raw(inode); - __entry->offset = offset, - __entry->count = count, + __entry->offset = offset; + __entry->count = count; __entry->ret = ret; ), @@ -1051,6 +1068,73 @@ DEFINE_NFS_FOLIO_EVENT_DONE(nfs_writeback_folio_done); DEFINE_NFS_FOLIO_EVENT(nfs_invalidate_folio); DEFINE_NFS_FOLIO_EVENT_DONE(nfs_launder_folio_done); +DEFINE_NFS_FOLIO_EVENT(nfs_try_to_update_request); +DEFINE_NFS_FOLIO_EVENT_DONE(nfs_try_to_update_request_done); + +DEFINE_NFS_FOLIO_EVENT(nfs_update_folio); +DEFINE_NFS_FOLIO_EVENT_DONE(nfs_update_folio_done); + +DEFINE_NFS_FOLIO_EVENT(nfs_write_begin); +DEFINE_NFS_FOLIO_EVENT_DONE(nfs_write_begin_done); + +DEFINE_NFS_FOLIO_EVENT(nfs_write_end); +DEFINE_NFS_FOLIO_EVENT_DONE(nfs_write_end_done); + +DEFINE_NFS_FOLIO_EVENT(nfs_writepages); +DEFINE_NFS_FOLIO_EVENT_DONE(nfs_writepages_done); + +DECLARE_EVENT_CLASS(nfs_kiocb_event, + TP_PROTO( + const struct kiocb *iocb, + const struct iov_iter *iter + ), + + TP_ARGS(iocb, iter), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(u64, version) + __field(loff_t, offset) + __field(size_t, count) + __field(int, flags) + ), + + TP_fast_assign( + const struct inode *inode = file_inode(iocb->ki_filp); + const struct nfs_inode *nfsi = NFS_I(inode); + + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = nfsi->fileid; + __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); + __entry->version = inode_peek_iversion_raw(inode); + __entry->offset = iocb->ki_pos; + __entry->count = iov_iter_count(iter); + __entry->flags = iocb->ki_flags; + ), + + TP_printk( + "fileid=%02x:%02x:%llu fhandle=0x%08x version=%llu offset=%lld count=%zu ki_flags=%s", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, __entry->version, + __entry->offset, __entry->count, + __print_flags(__entry->flags, "|", TRACE_IOCB_STRINGS) + ) +); + +#define DEFINE_NFS_KIOCB_EVENT(name) \ + DEFINE_EVENT(nfs_kiocb_event, name, \ + TP_PROTO( \ + const struct kiocb *iocb, \ + const struct iov_iter *iter \ + ), \ + TP_ARGS(iocb, iter)) + +DEFINE_NFS_KIOCB_EVENT(nfs_file_read); +DEFINE_NFS_KIOCB_EVENT(nfs_file_write); + TRACE_EVENT(nfs_aop_readahead, TP_PROTO( const struct inode *inode, @@ -1398,6 +1482,55 @@ TRACE_EVENT(nfs_writeback_done, ) ); +DECLARE_EVENT_CLASS(nfs_page_class, + TP_PROTO( + const struct nfs_page *req + ), + + TP_ARGS(req), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u32, fhandle) + __field(u64, fileid) + __field(const struct nfs_page *__private, req) + __field(loff_t, offset) + __field(unsigned int, count) + __field(unsigned long, flags) + ), + + TP_fast_assign( + const struct inode *inode = folio_inode(req->wb_folio); + const struct nfs_inode *nfsi = NFS_I(inode); + + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = nfsi->fileid; + __entry->fhandle = nfs_fhandle_hash(&nfsi->fh); + __entry->req = req; + __entry->offset = req_offset(req); + __entry->count = req->wb_bytes; + __entry->flags = req->wb_flags; + ), + + TP_printk( + "fileid=%02x:%02x:%llu fhandle=0x%08x req=%p offset=%lld count=%u flags=%s", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, __entry->fhandle, + __entry->req, __entry->offset, __entry->count, + nfs_show_wb_flags(__entry->flags) + ) +); + +#define DEFINE_NFS_PAGE_EVENT(name) \ + DEFINE_EVENT(nfs_page_class, name, \ + TP_PROTO( \ + const struct nfs_page *req \ + ), \ + TP_ARGS(req)) + +DEFINE_NFS_PAGE_EVENT(nfs_writepage_setup); 
+DEFINE_NFS_PAGE_EVENT(nfs_do_writepage); + DECLARE_EVENT_CLASS(nfs_page_error_class, TP_PROTO( const struct inode *inode, @@ -1599,6 +1732,76 @@ DEFINE_NFS_DIRECT_REQ_EVENT(nfs_direct_write_completion); DEFINE_NFS_DIRECT_REQ_EVENT(nfs_direct_write_schedule_iovec); DEFINE_NFS_DIRECT_REQ_EVENT(nfs_direct_write_reschedule_io); +#if IS_ENABLED(CONFIG_NFS_LOCALIO) + +DECLARE_EVENT_CLASS(nfs_local_dio_class, + TP_PROTO( + const struct inode *inode, + loff_t offset, + ssize_t count, + const struct nfs_local_dio *local_dio + ), + TP_ARGS(inode, offset, count, local_dio), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(u64, fileid) + __field(u32, fhandle) + __field(loff_t, offset) + __field(ssize_t, count) + __field(u32, mem_align) + __field(u32, offset_align) + __field(loff_t, start) + __field(ssize_t, start_len) + __field(loff_t, middle) + __field(ssize_t, middle_len) + __field(loff_t, end) + __field(ssize_t, end_len) + ), + TP_fast_assign( + const struct nfs_inode *nfsi = NFS_I(inode); + const struct nfs_fh *fh = &nfsi->fh; + + __entry->dev = inode->i_sb->s_dev; + __entry->fileid = nfsi->fileid; + __entry->fhandle = nfs_fhandle_hash(fh); + __entry->offset = offset; + __entry->count = count; + __entry->mem_align = local_dio->mem_align; + __entry->offset_align = local_dio->offset_align; + __entry->start = offset; + __entry->start_len = local_dio->start_len; + __entry->middle = local_dio->middle_offset; + __entry->middle_len = local_dio->middle_len; + __entry->end = local_dio->end_offset; + __entry->end_len = local_dio->end_len; + ), + TP_printk("fileid=%02x:%02x:%llu fhandle=0x%08x " + "offset=%lld count=%zd " + "mem_align=%u offset_align=%u " + "start=%llu+%zd middle=%llu+%zd end=%llu+%zd", + MAJOR(__entry->dev), MINOR(__entry->dev), + (unsigned long long)__entry->fileid, + __entry->fhandle, __entry->offset, __entry->count, + __entry->mem_align, __entry->offset_align, + __entry->start, __entry->start_len, + __entry->middle, __entry->middle_len, + __entry->end, __entry->end_len) +) + +#define DEFINE_NFS_LOCAL_DIO_EVENT(name) \ +DEFINE_EVENT(nfs_local_dio_class, nfs_local_dio_##name, \ + TP_PROTO(const struct inode *inode, \ + loff_t offset, \ + ssize_t count, \ + const struct nfs_local_dio *local_dio),\ + TP_ARGS(inode, offset, count, local_dio)) + +DEFINE_NFS_LOCAL_DIO_EVENT(read); +DEFINE_NFS_LOCAL_DIO_EVENT(write); +DEFINE_NFS_LOCAL_DIO_EVENT(misaligned); + +#endif /* CONFIG_NFS_LOCALIO */ + TRACE_EVENT(nfs_fh_to_dentry, TP_PROTO( const struct super_block *sb, @@ -1713,10 +1916,10 @@ TRACE_EVENT(nfs_local_open_fh, ), TP_printk( - "error=%d fhandle=0x%08x mode=%s", - __entry->error, + "fhandle=0x%08x mode=%s result=%d", __entry->fhandle, - show_fs_fmode_flags(__entry->fmode) + show_fs_fmode_flags(__entry->fmode), + __entry->error ) ); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 647c53d1418a..336c510f3750 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -296,7 +296,7 @@ static void nfs_folio_end_writeback(struct folio *folio) { struct nfs_server *nfss = NFS_SERVER(folio->mapping->host); - folio_end_writeback(folio); + folio_end_writeback_no_dropbehind(folio); if (atomic_long_dec_return(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) { nfss->write_congested = 0; @@ -593,6 +593,7 @@ static int nfs_do_writepage(struct folio *folio, struct writeback_control *wbc, if (IS_ERR(req)) return PTR_ERR(req); + trace_nfs_do_writepage(req); nfs_folio_set_writeback(folio); WARN_ON_ONCE(test_bit(PG_CLEAN, &req->wb_flags)); @@ -656,12 +657,14 @@ int nfs_writepages(struct address_space *mapping, 
struct writeback_control *wbc) int priority = 0; int err; + trace_nfs_writepages(inode, wbc->range_start, wbc->range_end - wbc->range_start); + /* Wait with writeback until write congestion eases */ if (wbc->sync_mode == WB_SYNC_NONE && nfss->write_congested) { err = wait_event_killable(nfss->write_congestion_wait, nfss->write_congested == 0); if (err) - return err; + goto out_err; } nfs_inc_stats(inode, NFSIOS_VFSWRITEPAGES); @@ -692,10 +695,10 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) } while (err < 0 && !nfs_error_is_fatal(err)); nfs_io_completion_put(ioc); - if (err < 0) - goto out_err; - return 0; + if (err > 0) + err = 0; out_err: + trace_nfs_writepages_done(inode, wbc->range_start, wbc->range_end - wbc->range_start, err); return err; } @@ -745,6 +748,8 @@ static void nfs_inode_remove_request(struct nfs_page *req) clear_bit(PG_MAPPED, &req->wb_head->wb_flags); } spin_unlock(&mapping->i_private_lock); + + folio_end_dropbehind(folio); } nfs_page_group_unlock(req); @@ -926,7 +931,7 @@ static void nfs_write_completion(struct nfs_pgio_header *hdr) req->wb_nio = 0; memcpy(&req->wb_verf, &hdr->verf.verifier, sizeof(req->wb_verf)); nfs_mark_request_commit(req, hdr->lseg, &cinfo, - hdr->pgio_mirror_idx); + hdr->ds_commit_idx); goto next; } remove_req: @@ -1017,11 +1022,12 @@ static struct nfs_page *nfs_try_to_update_request(struct folio *folio, unsigned int end; int error; + trace_nfs_try_to_update_request(folio_inode(folio), offset, bytes); end = offset + bytes; req = nfs_lock_and_join_requests(folio); if (IS_ERR_OR_NULL(req)) - return req; + goto out; rqend = req->wb_offset + req->wb_bytes; /* @@ -1043,6 +1049,9 @@ static struct nfs_page *nfs_try_to_update_request(struct folio *folio, else req->wb_bytes = rqend - req->wb_offset; req->wb_nio = 0; +out: + trace_nfs_try_to_update_request_done(folio_inode(folio), offset, bytes, + PTR_ERR_OR_ZERO(req)); return req; out_flushme: /* @@ -1053,6 +1062,7 @@ out_flushme: nfs_mark_request_dirty(req); nfs_unlock_and_release_request(req); error = nfs_wb_folio(folio->mapping->host, folio); + trace_nfs_try_to_update_request_done(folio_inode(folio), offset, bytes, error); return (error < 0) ? 
ERR_PTR(error) : NULL; } @@ -1090,6 +1100,7 @@ static int nfs_writepage_setup(struct nfs_open_context *ctx, req = nfs_setup_write_request(ctx, folio, offset, count); if (IS_ERR(req)) return PTR_ERR(req); + trace_nfs_writepage_setup(req); /* Update file length */ nfs_grow_file(folio, offset, count); nfs_mark_uptodate(req); @@ -1290,6 +1301,8 @@ int nfs_update_folio(struct file *file, struct folio *folio, nfs_inc_stats(inode, NFSIOS_VFSUPDATEPAGE); + trace_nfs_update_folio(inode, offset, count); + dprintk("NFS: nfs_update_folio(%pD2 %d@%lld)\n", file, count, (long long)(folio_pos(folio) + offset)); @@ -1309,6 +1322,7 @@ int nfs_update_folio(struct file *file, struct folio *folio, if (status < 0) nfs_set_pageerror(mapping); out: + trace_nfs_update_folio_done(inode, offset, count, status); dprintk("NFS: nfs_update_folio returns %d (isize %lld)\n", status, (long long)i_size_read(inode)); return status; @@ -1521,7 +1535,8 @@ static int nfs_writeback_done(struct rpc_task *task, /* Deal with the suid/sgid bit corner case */ if (nfs_should_remove_suid(inode)) { spin_lock(&inode->i_lock); - nfs_set_cache_invalid(inode, NFS_INO_INVALID_MODE); + nfs_set_cache_invalid(inode, NFS_INO_INVALID_MODE + | NFS_INO_REVAL_FORCED); spin_unlock(&inode->i_lock); } return 0; @@ -1806,7 +1821,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) nfs_mapping_set_error(folio, status); nfs_inode_remove_request(req); } - dprintk_cont(", error = %d\n", status); + dprintk(", error = %d\n", status); goto next; } @@ -1816,11 +1831,11 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) /* We have a match */ if (folio) nfs_inode_remove_request(req); - dprintk_cont(" OK\n"); + dprintk(" OK\n"); goto next; } /* We have a mismatch. Write the page again */ - dprintk_cont(" mismatch\n"); + dprintk(" mismatch\n"); nfs_mark_request_dirty(req); atomic_long_inc(&NFS_I(data->inode)->redirtied_pages); next: diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index 879e0b104d1c..e134dce45e35 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -5,6 +5,7 @@ config NFSD depends on FILE_LOCKING depends on FSNOTIFY select CRC32 + select CRYPTO_LIB_SHA256 if NFSD_V4 select LOCKD select SUNRPC select EXPORTFS @@ -77,7 +78,6 @@ config NFSD_V4 select FS_POSIX_ACL select RPCSEC_GSS_KRB5 select CRYPTO - select CRYPTO_LIB_SHA256 select CRYPTO_MD5 select GRACE_PERIOD select NFS_V4_2_SSC_HELPER if NFS_V4_2 diff --git a/fs/nfsd/blocklayout.c b/fs/nfsd/blocklayout.c index 19078a043e85..fde5539cf6a6 100644 --- a/fs/nfsd/blocklayout.c +++ b/fs/nfsd/blocklayout.c @@ -18,8 +18,8 @@ static __be32 -nfsd4_block_proc_layoutget(struct inode *inode, const struct svc_fh *fhp, - struct nfsd4_layoutget *args) +nfsd4_block_proc_layoutget(struct svc_rqst *rqstp, struct inode *inode, + const struct svc_fh *fhp, struct nfsd4_layoutget *args) { struct nfsd4_layout_seg *seg = &args->lg_seg; struct super_block *sb = inode->i_sb; @@ -29,6 +29,9 @@ nfsd4_block_proc_layoutget(struct inode *inode, const struct svc_fh *fhp, u32 device_generation = 0; int error; + if (locks_in_grace(SVC_NET(rqstp))) + return nfserr_grace; + if (seg->offset & (block_size - 1)) { dprintk("pnfsd: I/O misaligned\n"); goto out_layoutunavailable; @@ -118,7 +121,6 @@ nfsd4_block_commit_blocks(struct inode *inode, struct nfsd4_layoutcommit *lcp, struct iomap *iomaps, int nr_iomaps) { struct timespec64 mtime = inode_get_mtime(inode); - loff_t new_size = lcp->lc_last_wr + 1; struct iattr iattr = { .ia_valid = 0 }; int error; @@ -128,9 +130,9 @@ 
nfsd4_block_commit_blocks(struct inode *inode, struct nfsd4_layoutcommit *lcp, iattr.ia_valid |= ATTR_ATIME | ATTR_CTIME | ATTR_MTIME; iattr.ia_atime = iattr.ia_ctime = iattr.ia_mtime = lcp->lc_mtime; - if (new_size > i_size_read(inode)) { + if (lcp->lc_size_chg) { iattr.ia_valid |= ATTR_SIZE; - iattr.ia_size = new_size; + iattr.ia_size = lcp->lc_newsize; } error = inode->i_sb->s_export_op->commit_blocks(inode, iomaps, @@ -173,16 +175,18 @@ nfsd4_block_proc_getdeviceinfo(struct super_block *sb, } static __be32 -nfsd4_block_proc_layoutcommit(struct inode *inode, +nfsd4_block_proc_layoutcommit(struct inode *inode, struct svc_rqst *rqstp, struct nfsd4_layoutcommit *lcp) { struct iomap *iomaps; int nr_iomaps; __be32 nfserr; - nfserr = nfsd4_block_decode_layoutupdate(lcp->lc_up_layout, - lcp->lc_up_len, &iomaps, &nr_iomaps, - i_blocksize(inode)); + rqstp->rq_arg = lcp->lc_up_layout; + svcxdr_init_decode(rqstp); + + nfserr = nfsd4_block_decode_layoutupdate(&rqstp->rq_arg_stream, + &iomaps, &nr_iomaps, i_blocksize(inode)); if (nfserr != nfs_ok) return nfserr; @@ -313,16 +317,18 @@ nfsd4_scsi_proc_getdeviceinfo(struct super_block *sb, return nfserrno(nfsd4_block_get_device_info_scsi(sb, clp, gdp)); } static __be32 -nfsd4_scsi_proc_layoutcommit(struct inode *inode, +nfsd4_scsi_proc_layoutcommit(struct inode *inode, struct svc_rqst *rqstp, struct nfsd4_layoutcommit *lcp) { struct iomap *iomaps; int nr_iomaps; __be32 nfserr; - nfserr = nfsd4_scsi_decode_layoutupdate(lcp->lc_up_layout, - lcp->lc_up_len, &iomaps, &nr_iomaps, - i_blocksize(inode)); + rqstp->rq_arg = lcp->lc_up_layout; + svcxdr_init_decode(rqstp); + + nfserr = nfsd4_scsi_decode_layoutupdate(&rqstp->rq_arg_stream, + &iomaps, &nr_iomaps, i_blocksize(inode)); if (nfserr != nfs_ok) return nfserr; diff --git a/fs/nfsd/blocklayoutxdr.c b/fs/nfsd/blocklayoutxdr.c index bcf21fde9120..e50afe340737 100644 --- a/fs/nfsd/blocklayoutxdr.c +++ b/fs/nfsd/blocklayoutxdr.c @@ -29,8 +29,7 @@ nfsd4_block_encode_layoutget(struct xdr_stream *xdr, *p++ = cpu_to_be32(len); *p++ = cpu_to_be32(1); /* we always return a single extent */ - p = xdr_encode_opaque_fixed(p, &b->vol_id, - sizeof(struct nfsd4_deviceid)); + p = svcxdr_encode_deviceid4(p, &b->vol_id); p = xdr_encode_hyper(p, b->foff); p = xdr_encode_hyper(p, b->len); p = xdr_encode_hyper(p, b->soff); @@ -114,8 +113,7 @@ nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr, /** * nfsd4_block_decode_layoutupdate - decode the block layout extent array - * @p: pointer to the xdr data - * @len: number of bytes to decode + * @xdr: subbuf set to the encoded array * @iomapp: pointer to store the decoded extent array * @nr_iomapsp: pointer to store the number of extents * @block_size: alignment of extent offset and length @@ -128,25 +126,24 @@ nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr, * * Return values: * %nfs_ok: Successful decoding, @iomapp and @nr_iomapsp are valid - * %nfserr_bad_xdr: The encoded array in @p is invalid + * %nfserr_bad_xdr: The encoded array in @xdr is invalid * %nfserr_inval: An unaligned extent found * %nfserr_delay: Failed to allocate memory for @iomapp */ __be32 -nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, +nfsd4_block_decode_layoutupdate(struct xdr_stream *xdr, struct iomap **iomapp, int *nr_iomapsp, u32 block_size) { struct iomap *iomaps; - u32 nr_iomaps, i; + u32 nr_iomaps, expected, len, i; + __be32 nfserr; - if (len < sizeof(u32)) - return nfserr_bad_xdr; - len -= sizeof(u32); - if (len % PNFS_BLOCK_EXTENT_SIZE) + if 
(xdr_stream_decode_u32(xdr, &nr_iomaps)) return nfserr_bad_xdr; - nr_iomaps = be32_to_cpup(p++); - if (nr_iomaps != len / PNFS_BLOCK_EXTENT_SIZE) + len = sizeof(__be32) + xdr_stream_remaining(xdr); + expected = sizeof(__be32) + nr_iomaps * PNFS_BLOCK_EXTENT_SIZE; + if (len != expected) return nfserr_bad_xdr; iomaps = kcalloc(nr_iomaps, sizeof(*iomaps), GFP_KERNEL); @@ -156,23 +153,44 @@ nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, for (i = 0; i < nr_iomaps; i++) { struct pnfs_block_extent bex; - memcpy(&bex.vol_id, p, sizeof(struct nfsd4_deviceid)); - p += XDR_QUADLEN(sizeof(struct nfsd4_deviceid)); + if (nfsd4_decode_deviceid4(xdr, &bex.vol_id)) { + nfserr = nfserr_bad_xdr; + goto fail; + } - p = xdr_decode_hyper(p, &bex.foff); + if (xdr_stream_decode_u64(xdr, &bex.foff)) { + nfserr = nfserr_bad_xdr; + goto fail; + } if (bex.foff & (block_size - 1)) { + nfserr = nfserr_inval; + goto fail; + } + + if (xdr_stream_decode_u64(xdr, &bex.len)) { + nfserr = nfserr_bad_xdr; goto fail; } - p = xdr_decode_hyper(p, &bex.len); if (bex.len & (block_size - 1)) { + nfserr = nfserr_inval; + goto fail; + } + + if (xdr_stream_decode_u64(xdr, &bex.soff)) { + nfserr = nfserr_bad_xdr; goto fail; } - p = xdr_decode_hyper(p, &bex.soff); if (bex.soff & (block_size - 1)) { + nfserr = nfserr_inval; + goto fail; + } + + if (xdr_stream_decode_u32(xdr, &bex.es)) { + nfserr = nfserr_bad_xdr; goto fail; } - bex.es = be32_to_cpup(p++); if (bex.es != PNFS_BLOCK_READWRITE_DATA) { + nfserr = nfserr_inval; goto fail; } @@ -185,13 +203,12 @@ nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, return nfs_ok; fail: kfree(iomaps); - return nfserr_inval; + return nfserr; } /** * nfsd4_scsi_decode_layoutupdate - decode the scsi layout extent array - * @p: pointer to the xdr data - * @len: number of bytes to decode + * @xdr: subbuf set to the encoded array * @iomapp: pointer to store the decoded extent array * @nr_iomapsp: pointer to store the number of extents * @block_size: alignment of extent offset and length @@ -203,21 +220,22 @@ fail: * * Return values: * %nfs_ok: Successful decoding, @iomapp and @nr_iomapsp are valid - * %nfserr_bad_xdr: The encoded array in @p is invalid + * %nfserr_bad_xdr: The encoded array in @xdr is invalid * %nfserr_inval: An unaligned extent found * %nfserr_delay: Failed to allocate memory for @iomapp */ __be32 -nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, +nfsd4_scsi_decode_layoutupdate(struct xdr_stream *xdr, struct iomap **iomapp, int *nr_iomapsp, u32 block_size) { struct iomap *iomaps; - u32 nr_iomaps, expected, i; + u32 nr_iomaps, expected, len, i; + __be32 nfserr; - if (len < sizeof(u32)) + if (xdr_stream_decode_u32(xdr, &nr_iomaps)) return nfserr_bad_xdr; - nr_iomaps = be32_to_cpup(p++); + len = sizeof(__be32) + xdr_stream_remaining(xdr); expected = sizeof(__be32) + nr_iomaps * PNFS_SCSI_RANGE_SIZE; if (len != expected) return nfserr_bad_xdr; @@ -229,14 +247,22 @@ nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, for (i = 0; i < nr_iomaps; i++) { u64 val; - p = xdr_decode_hyper(p, &val); + if (xdr_stream_decode_u64(xdr, &val)) { + nfserr = nfserr_bad_xdr; + goto fail; + } if (val & (block_size - 1)) { + nfserr = nfserr_inval; goto fail; } iomaps[i].offset = val; - p = xdr_decode_hyper(p, &val); + if (xdr_stream_decode_u64(xdr, &val)) { + nfserr = nfserr_bad_xdr; + goto fail; + } if (val & (block_size - 1)) { + nfserr = nfserr_inval; goto fail; } iomaps[i].length = val; @@ -247,5 +273,5 @@ 
nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, struct iomap **iomapp, return nfs_ok; fail: kfree(iomaps); - return nfserr_inval; + return nfserr; } diff --git a/fs/nfsd/blocklayoutxdr.h b/fs/nfsd/blocklayoutxdr.h index 15b3569f3d9a..7d25ef689671 100644 --- a/fs/nfsd/blocklayoutxdr.h +++ b/fs/nfsd/blocklayoutxdr.h @@ -54,9 +54,9 @@ __be32 nfsd4_block_encode_getdeviceinfo(struct xdr_stream *xdr, const struct nfsd4_getdeviceinfo *gdp); __be32 nfsd4_block_encode_layoutget(struct xdr_stream *xdr, const struct nfsd4_layoutget *lgp); -__be32 nfsd4_block_decode_layoutupdate(__be32 *p, u32 len, +__be32 nfsd4_block_decode_layoutupdate(struct xdr_stream *xdr, struct iomap **iomapp, int *nr_iomapsp, u32 block_size); -__be32 nfsd4_scsi_decode_layoutupdate(__be32 *p, u32 len, +__be32 nfsd4_scsi_decode_layoutupdate(struct xdr_stream *xdr, struct iomap **iomapp, int *nr_iomapsp, u32 block_size); #endif /* _NFSD_BLOCKLAYOUTXDR_H */ diff --git a/fs/nfsd/debugfs.c b/fs/nfsd/debugfs.c index 84b0c8b559dc..ed2b9e066206 100644 --- a/fs/nfsd/debugfs.c +++ b/fs/nfsd/debugfs.c @@ -26,12 +26,99 @@ static int nfsd_dsr_get(void *data, u64 *val) static int nfsd_dsr_set(void *data, u64 val) { - nfsd_disable_splice_read = (val > 0) ? true : false; + nfsd_disable_splice_read = (val > 0); + if (!nfsd_disable_splice_read) { + /* + * Must use buffered I/O if splice_read is enabled. + */ + nfsd_io_cache_read = NFSD_IO_BUFFERED; + } return 0; } DEFINE_DEBUGFS_ATTRIBUTE(nfsd_dsr_fops, nfsd_dsr_get, nfsd_dsr_set, "%llu\n"); +/* + * /sys/kernel/debug/nfsd/io_cache_read + * + * Contents: + * %0: NFS READ will use buffered IO + * %1: NFS READ will use dontcache (buffered IO w/ dropbehind) + * + * This setting takes immediate effect for all NFS versions, + * all exports, and in all NFSD net namespaces. + */ + +static int nfsd_io_cache_read_get(void *data, u64 *val) +{ + *val = nfsd_io_cache_read; + return 0; +} + +static int nfsd_io_cache_read_set(void *data, u64 val) +{ + int ret = 0; + + switch (val) { + case NFSD_IO_BUFFERED: + nfsd_io_cache_read = NFSD_IO_BUFFERED; + break; + case NFSD_IO_DONTCACHE: + /* + * Must disable splice_read when enabling + * NFSD_IO_DONTCACHE. + */ + nfsd_disable_splice_read = true; + nfsd_io_cache_read = val; + break; + default: + ret = -EINVAL; + break; + } + + return ret; +} + +DEFINE_DEBUGFS_ATTRIBUTE(nfsd_io_cache_read_fops, nfsd_io_cache_read_get, + nfsd_io_cache_read_set, "%llu\n"); + +/* + * /sys/kernel/debug/nfsd/io_cache_write + * + * Contents: + * %0: NFS WRITE will use buffered IO + * %1: NFS WRITE will use dontcache (buffered IO w/ dropbehind) + * + * This setting takes immediate effect for all NFS versions, + * all exports, and in all NFSD net namespaces. 
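A minimal userspace sketch of driving the two debugfs knobs documented above. The knob paths and the 0/1 values come from these comments and from nfsd_debugfs_init() below; the helper name and the assumption that debugfs is mounted at /sys/kernel/debug are illustrative, not part of the patch.

#include <fcntl.h>
#include <stdio.h>
#include <unistd.h>

/* Hypothetical helper: writes 0 (NFSD_IO_BUFFERED) or 1 (NFSD_IO_DONTCACHE)
 * to an NFSD io-cache knob such as /sys/kernel/debug/nfsd/io_cache_read.
 * Assumes debugfs is mounted at /sys/kernel/debug and the caller is root. */
static int nfsd_set_io_cache(const char *knob, unsigned int mode)
{
	char path[128];
	int fd, ret = 0;

	snprintf(path, sizeof(path), "/sys/kernel/debug/nfsd/%s", knob);
	fd = open(path, O_WRONLY);
	if (fd < 0)
		return -1;
	if (dprintf(fd, "%u\n", mode) < 0)
		ret = -1;
	close(fd);
	return ret;
}

int main(void)
{
	/* Select dontcache reads; per nfsd_io_cache_read_set() above, the
	 * kernel side then also forces disable-splice-read on. */
	return nfsd_set_io_cache("io_cache_read", 1);
}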
+ */ + +static int nfsd_io_cache_write_get(void *data, u64 *val) +{ + *val = nfsd_io_cache_write; + return 0; +} + +static int nfsd_io_cache_write_set(void *data, u64 val) +{ + int ret = 0; + + switch (val) { + case NFSD_IO_BUFFERED: + case NFSD_IO_DONTCACHE: + nfsd_io_cache_write = val; + break; + default: + ret = -EINVAL; + break; + } + + return ret; +} + +DEFINE_DEBUGFS_ATTRIBUTE(nfsd_io_cache_write_fops, nfsd_io_cache_write_get, + nfsd_io_cache_write_set, "%llu\n"); + void nfsd_debugfs_exit(void) { debugfs_remove_recursive(nfsd_top_dir); @@ -44,4 +131,10 @@ void nfsd_debugfs_init(void) debugfs_create_file("disable-splice-read", S_IWUSR | S_IRUGO, nfsd_top_dir, NULL, &nfsd_dsr_fops); + + debugfs_create_file("io_cache_read", 0644, nfsd_top_dir, NULL, + &nfsd_io_cache_read_fops); + + debugfs_create_file("io_cache_write", 0644, nfsd_top_dir, NULL, + &nfsd_io_cache_write_fops); } diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index cadfc2bae60e..9d55512d0cc9 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -402,7 +402,7 @@ static struct svc_export *svc_export_update(struct svc_export *new, struct svc_export *old); static struct svc_export *svc_export_lookup(struct svc_export *); -static int check_export(struct path *path, int *flags, unsigned char *uuid) +static int check_export(const struct path *path, int *flags, unsigned char *uuid) { struct inode *inode = d_inode(path->dentry); @@ -1082,50 +1082,62 @@ static struct svc_export *exp_find(struct cache_detail *cd, } /** - * check_nfsd_access - check if access to export is allowed. + * check_xprtsec_policy - check if access to export is allowed by the + * xprtsec policy * @exp: svc_export that is being accessed. - * @rqstp: svc_rqst attempting to access @exp (will be NULL for LOCALIO). - * @may_bypass_gss: reduce strictness of authorization check + * @rqstp: svc_rqst attempting to access @exp. + * + * Helper function for check_nfsd_access(). Note that callers should be + * using check_nfsd_access() instead of calling this function directly. The + * one exception is __fh_verify() since it has logic that may result in one + * or both of the helpers being skipped. * * Return values: * %nfs_ok if access is granted, or * %nfserr_wrongsec if access is denied */ -__be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp, - bool may_bypass_gss) +__be32 check_xprtsec_policy(struct svc_export *exp, struct svc_rqst *rqstp) { - struct exp_flavor_info *f, *end = exp->ex_flavors + exp->ex_nflavors; - struct svc_xprt *xprt; - - /* - * If rqstp is NULL, this is a LOCALIO request which will only - * ever use a filehandle/credential pair for which access has - * been affirmed (by ACCESS or OPEN NFS requests) over the - * wire. So there is no need for further checks here. 
- */ - if (!rqstp) - return nfs_ok; - - xprt = rqstp->rq_xprt; + struct svc_xprt *xprt = rqstp->rq_xprt; if (exp->ex_xprtsec_modes & NFSEXP_XPRTSEC_NONE) { if (!test_bit(XPT_TLS_SESSION, &xprt->xpt_flags)) - goto ok; + return nfs_ok; } if (exp->ex_xprtsec_modes & NFSEXP_XPRTSEC_TLS) { if (test_bit(XPT_TLS_SESSION, &xprt->xpt_flags) && !test_bit(XPT_PEER_AUTH, &xprt->xpt_flags)) - goto ok; + return nfs_ok; } if (exp->ex_xprtsec_modes & NFSEXP_XPRTSEC_MTLS) { if (test_bit(XPT_TLS_SESSION, &xprt->xpt_flags) && test_bit(XPT_PEER_AUTH, &xprt->xpt_flags)) - goto ok; + return nfs_ok; } - if (!may_bypass_gss) - goto denied; + return nfserr_wrongsec; +} + +/** + * check_security_flavor - check if access to export is allowed by the + * security flavor + * @exp: svc_export that is being accessed. + * @rqstp: svc_rqst attempting to access @exp. + * @may_bypass_gss: reduce strictness of authorization check + * + * Helper function for check_nfsd_access(). Note that callers should be + * using check_nfsd_access() instead of calling this function directly. The + * one exception is __fh_verify() since it has logic that may result in one + * or both of the helpers being skipped. + * + * Return values: + * %nfs_ok if access is granted, or + * %nfserr_wrongsec if access is denied + */ +__be32 check_security_flavor(struct svc_export *exp, struct svc_rqst *rqstp, + bool may_bypass_gss) +{ + struct exp_flavor_info *f, *end = exp->ex_flavors + exp->ex_nflavors; -ok: /* legacy gss-only clients are always OK: */ if (exp->ex_client == rqstp->rq_gssclient) return nfs_ok; @@ -1167,10 +1179,30 @@ ok: } } -denied: return nfserr_wrongsec; } +/** + * check_nfsd_access - check if access to export is allowed. + * @exp: svc_export that is being accessed. + * @rqstp: svc_rqst attempting to access @exp. + * @may_bypass_gss: reduce strictness of authorization check + * + * Return values: + * %nfs_ok if access is granted, or + * %nfserr_wrongsec if access is denied + */ +__be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp, + bool may_bypass_gss) +{ + __be32 status; + + status = check_xprtsec_policy(exp, rqstp); + if (status != nfs_ok) + return status; + return check_security_flavor(exp, rqstp, may_bypass_gss); +} + /* * Uses rq_client and rq_gssclient to find an export; uses rq_client (an * auth_unix client) if it's available and has secinfo information; @@ -1181,7 +1213,7 @@ denied: * use exp_get_by_name() or exp_find(). 
*/ struct svc_export * -rqst_exp_get_by_name(struct svc_rqst *rqstp, struct path *path) +rqst_exp_get_by_name(struct svc_rqst *rqstp, const struct path *path) { struct svc_export *gssexp, *exp = ERR_PTR(-ENOENT); struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); diff --git a/fs/nfsd/export.h b/fs/nfsd/export.h index b9c0adb3ce09..d2b09cd76145 100644 --- a/fs/nfsd/export.h +++ b/fs/nfsd/export.h @@ -101,6 +101,9 @@ struct svc_expkey { struct svc_cred; int nfsexp_flags(struct svc_cred *cred, struct svc_export *exp); +__be32 check_xprtsec_policy(struct svc_export *exp, struct svc_rqst *rqstp); +__be32 check_security_flavor(struct svc_export *exp, struct svc_rqst *rqstp, + bool may_bypass_gss); __be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp, bool may_bypass_gss); @@ -111,7 +114,7 @@ int nfsd_export_init(struct net *); void nfsd_export_shutdown(struct net *); void nfsd_export_flush(struct net *); struct svc_export * rqst_exp_get_by_name(struct svc_rqst *, - struct path *); + const struct path *); struct svc_export * rqst_exp_parent(struct svc_rqst *, struct path *); struct svc_export * rqst_find_fsidzero_export(struct svc_rqst *); diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index 85ca663c052c..a238b6725008 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -231,6 +231,9 @@ nfsd_file_alloc(struct net *net, struct inode *inode, unsigned char need, refcount_set(&nf->nf_ref, 1); nf->nf_may = need; nf->nf_mark = NULL; + nf->nf_dio_mem_align = 0; + nf->nf_dio_offset_align = 0; + nf->nf_dio_read_offset_align = 0; return nf; } @@ -392,27 +395,6 @@ nfsd_file_put_local(struct nfsd_file __rcu **pnf) } /** - * nfsd_file_get_local - get nfsd_file reference and reference to net - * @nf: nfsd_file of which to put the reference - * - * Get reference to both the nfsd_file and nf->nf_net. - */ -struct nfsd_file * -nfsd_file_get_local(struct nfsd_file *nf) -{ - struct net *net = nf->nf_net; - - if (nfsd_net_try_get(net)) { - nf = nfsd_file_get(nf); - if (!nf) - nfsd_net_put(net); - } else { - nf = NULL; - } - return nf; -} - -/** * nfsd_file_file - get the backing file of an nfsd_file * @nf: nfsd_file of which to access the backing file. 
* @@ -1070,6 +1052,35 @@ nfsd_file_is_cached(struct inode *inode) } static __be32 +nfsd_file_get_dio_attrs(const struct svc_fh *fhp, struct nfsd_file *nf) +{ + struct inode *inode = file_inode(nf->nf_file); + struct kstat stat; + __be32 status; + + /* Currently only need to get DIO alignment info for regular files */ + if (!S_ISREG(inode->i_mode)) + return nfs_ok; + + status = fh_getattr(fhp, &stat); + if (status != nfs_ok) + return status; + + trace_nfsd_file_get_dio_attrs(inode, &stat); + + if (stat.result_mask & STATX_DIOALIGN) { + nf->nf_dio_mem_align = stat.dio_mem_align; + nf->nf_dio_offset_align = stat.dio_offset_align; + } + if (stat.result_mask & STATX_DIO_READ_ALIGN) + nf->nf_dio_read_offset_align = stat.dio_read_offset_align; + else + nf->nf_dio_read_offset_align = nf->nf_dio_offset_align; + + return nfs_ok; +} + +static __be32 nfsd_file_do_acquire(struct svc_rqst *rqstp, struct net *net, struct svc_cred *cred, struct auth_domain *client, @@ -1187,6 +1198,8 @@ open_file: } status = nfserrno(ret); trace_nfsd_file_open(nf, status); + if (status == nfs_ok) + status = nfsd_file_get_dio_attrs(fhp, nf); } } else status = nfserr_jukebox; diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h index 722b26c71e45..e3d6ca2b6030 100644 --- a/fs/nfsd/filecache.h +++ b/fs/nfsd/filecache.h @@ -54,6 +54,10 @@ struct nfsd_file { struct list_head nf_gc; struct rcu_head nf_rcu; ktime_t nf_birthtime; + + u32 nf_dio_mem_align; + u32 nf_dio_offset_align; + u32 nf_dio_read_offset_align; }; int nfsd_file_cache_init(void); @@ -63,7 +67,6 @@ int nfsd_file_cache_start_net(struct net *net); void nfsd_file_cache_shutdown_net(struct net *net); void nfsd_file_put(struct nfsd_file *nf); struct net *nfsd_file_put_local(struct nfsd_file __rcu **nf); -struct nfsd_file *nfsd_file_get_local(struct nfsd_file *nf); struct nfsd_file *nfsd_file_get(struct nfsd_file *nf); struct file *nfsd_file_file(struct nfsd_file *nf); void nfsd_file_close_inode_sync(struct inode *inode); diff --git a/fs/nfsd/flexfilelayout.c b/fs/nfsd/flexfilelayout.c index 3ca5304440ff..0f1a35400cd5 100644 --- a/fs/nfsd/flexfilelayout.c +++ b/fs/nfsd/flexfilelayout.c @@ -20,8 +20,8 @@ #define NFSDDBG_FACILITY NFSDDBG_PNFS static __be32 -nfsd4_ff_proc_layoutget(struct inode *inode, const struct svc_fh *fhp, - struct nfsd4_layoutget *args) +nfsd4_ff_proc_layoutget(struct svc_rqst *rqstp, struct inode *inode, + const struct svc_fh *fhp, struct nfsd4_layoutget *args) { struct nfsd4_layout_seg *seg = &args->lg_seg; u32 device_generation = 0; @@ -125,6 +125,13 @@ nfsd4_ff_proc_getdeviceinfo(struct super_block *sb, struct svc_rqst *rqstp, return 0; } +static __be32 +nfsd4_ff_proc_layoutcommit(struct inode *inode, struct svc_rqst *rqstp, + struct nfsd4_layoutcommit *lcp) +{ + return nfs_ok; +} + const struct nfsd4_layout_ops ff_layout_ops = { .notify_types = NOTIFY_DEVICEID4_DELETE | NOTIFY_DEVICEID4_CHANGE, @@ -133,4 +140,5 @@ const struct nfsd4_layout_ops ff_layout_ops = { .encode_getdeviceinfo = nfsd4_ff_encode_getdeviceinfo, .proc_layoutget = nfsd4_ff_proc_layoutget, .encode_layoutget = nfsd4_ff_encode_layoutget, + .proc_layoutcommit = nfsd4_ff_proc_layoutcommit, }; diff --git a/fs/nfsd/flexfilelayoutxdr.c b/fs/nfsd/flexfilelayoutxdr.c index aeb71c10ff1b..f9f7e38cba13 100644 --- a/fs/nfsd/flexfilelayoutxdr.c +++ b/fs/nfsd/flexfilelayoutxdr.c @@ -54,8 +54,7 @@ nfsd4_ff_encode_layoutget(struct xdr_stream *xdr, *p++ = cpu_to_be32(1); /* single mirror */ *p++ = cpu_to_be32(1); /* single data server */ - p = xdr_encode_opaque_fixed(p, &fl->deviceid, - 
sizeof(struct nfsd4_deviceid)); + p = svcxdr_encode_deviceid4(p, &fl->deviceid); *p++ = cpu_to_be32(1); /* efficiency */ diff --git a/fs/nfsd/localio.c b/fs/nfsd/localio.c index cb237f1b902a..be710d809a3b 100644 --- a/fs/nfsd/localio.c +++ b/fs/nfsd/localio.c @@ -117,13 +117,23 @@ nfsd_open_local_fh(struct net *net, struct auth_domain *dom, return localio; } +static void nfsd_file_dio_alignment(struct nfsd_file *nf, + u32 *nf_dio_mem_align, + u32 *nf_dio_offset_align, + u32 *nf_dio_read_offset_align) +{ + *nf_dio_mem_align = nf->nf_dio_mem_align; + *nf_dio_offset_align = nf->nf_dio_offset_align; + *nf_dio_read_offset_align = nf->nf_dio_read_offset_align; +} + static const struct nfsd_localio_operations nfsd_localio_ops = { .nfsd_net_try_get = nfsd_net_try_get, .nfsd_net_put = nfsd_net_put, .nfsd_open_local_fh = nfsd_open_local_fh, .nfsd_file_put_local = nfsd_file_put_local, - .nfsd_file_get_local = nfsd_file_get_local, .nfsd_file_file = nfsd_file_file, + .nfsd_file_dio_alignment = nfsd_file_dio_alignment, }; void nfsd_localio_ops_init(void) diff --git a/fs/nfsd/lockd.c b/fs/nfsd/lockd.c index edc9f75dc75c..c774ce9aa296 100644 --- a/fs/nfsd/lockd.c +++ b/fs/nfsd/lockd.c @@ -57,7 +57,20 @@ nlm_fopen(struct svc_rqst *rqstp, struct nfs_fh *f, struct file **filp, switch (nfserr) { case nfs_ok: return 0; - case nfserr_dropit: + case nfserr_jukebox: + /* this error can indicate the presence of a delegation + * that conflicts with an NLM lock request. Options are: + * (1) For now, drop this request and make the client + * retry. When the delegation is returned, the client's + * lock retry will complete. + * (2) NLM4_DENIED as per "spec" signals to the client + * that the lock is unavailable now, but the client can retry. + * The Linux client implementation does not: it treats + * NLM4_DENIED the same as NLM4_FAILED and errors the request. + * (3) For the future, treat this as a blocked lock and try + * to call back when the delegation is returned, but we might + * not have a proper lock request to block on. + */ return nlm_drop_reply; case nfserr_stale: return nlm_stale_fh; diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c index aea905fcaf87..683bd1130afe 100644 --- a/fs/nfsd/nfs4layouts.c +++ b/fs/nfsd/nfs4layouts.c @@ -120,7 +120,6 @@ nfsd4_set_deviceid(struct nfsd4_deviceid *id, const struct svc_fh *fhp, id->fsid_idx = fhp->fh_export->ex_devid_map->idx; id->generation = device_generation; - id->pad = 0; return 0; } diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 71b428efcbb5..e466cf52d7d7 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1133,6 +1133,35 @@ nfsd4_secinfo_no_name_release(union nfsd4_op_u *u) exp_put(u->secinfo_no_name.sin_exp); } +/* + * Validate that the requested timestamps are within the acceptable range. If + * a timestamp appears to be in the future, it will be clamped to + * current_time().
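The clamping rule described here is the one nfsd4_vet_deleg_time() implements in fs/nfsd/nfs4state.c later in this patch. A standalone C model of just that rule, assuming only POSIX struct timespec (the helper names are illustrative):

#include <stdbool.h>
#include <stdio.h>
#include <time.h>

static int ts_cmp(const struct timespec *a, const struct timespec *b)
{
	if (a->tv_sec != b->tv_sec)
		return a->tv_sec < b->tv_sec ? -1 : 1;
	if (a->tv_nsec != b->tv_nsec)
		return a->tv_nsec < b->tv_nsec ? -1 : 1;
	return 0;
}

/* Userspace model of nfsd4_vet_deleg_time(): ignore timestamps that do
 * not move forward, clamp timestamps from the future to "now", and report
 * whether the inode timestamp should be updated. */
static bool vet_deleg_time(struct timespec *req, const struct timespec *orig,
			   const struct timespec *now)
{
	if (ts_cmp(req, orig) <= 0)
		return false;
	if (ts_cmp(req, now) > 0)
		*req = *now;
	return true;
}

int main(void)
{
	struct timespec orig = { 100, 0 }, now = { 200, 0 };
	struct timespec req = { 250, 0 };	/* in the future: gets clamped */

	if (vet_deleg_time(&req, &orig, &now))
		printf("update to %ld.%09ld\n", (long)req.tv_sec, req.tv_nsec);
	return 0;
}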
+ */ +static void +vet_deleg_attrs(struct nfsd4_setattr *setattr, struct nfs4_delegation *dp) +{ + struct timespec64 now = current_time(dp->dl_stid.sc_file->fi_inode); + struct iattr *iattr = &setattr->sa_iattr; + + if ((setattr->sa_bmval[2] & FATTR4_WORD2_TIME_DELEG_ACCESS) && + !nfsd4_vet_deleg_time(&iattr->ia_atime, &dp->dl_atime, &now)) + iattr->ia_valid &= ~(ATTR_ATIME | ATTR_ATIME_SET); + + if (setattr->sa_bmval[2] & FATTR4_WORD2_TIME_DELEG_MODIFY) { + if (nfsd4_vet_deleg_time(&iattr->ia_mtime, &dp->dl_mtime, &now)) { + iattr->ia_ctime = iattr->ia_mtime; + if (nfsd4_vet_deleg_time(&iattr->ia_ctime, &dp->dl_ctime, &now)) + dp->dl_setattr = true; + else + iattr->ia_valid &= ~(ATTR_CTIME | ATTR_CTIME_SET); + } else { + iattr->ia_valid &= ~(ATTR_CTIME | ATTR_CTIME_SET | + ATTR_MTIME | ATTR_MTIME_SET); + } + } +} + static __be32 nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) @@ -1170,8 +1199,10 @@ nfsd4_setattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfs4_delegation *dp = delegstateid(st); /* Only for *_ATTRS_DELEG flavors */ - if (deleg_attrs_deleg(dp->dl_type)) + if (deleg_attrs_deleg(dp->dl_type)) { + vet_deleg_attrs(setattr, dp); status = nfs_ok; + } } } if (st) @@ -1209,12 +1240,26 @@ out: return status; } +static void nfsd4_file_mark_deleg_written(struct nfs4_file *fi) +{ + spin_lock(&fi->fi_lock); + if (!list_empty(&fi->fi_delegations)) { + struct nfs4_delegation *dp = list_first_entry(&fi->fi_delegations, + struct nfs4_delegation, dl_perfile); + + if (dp->dl_type == OPEN_DELEGATE_WRITE_ATTRS_DELEG) + dp->dl_written = true; + } + spin_unlock(&fi->fi_lock); +} + static __be32 nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { struct nfsd4_write *write = &u->write; stateid_t *stateid = &write->wr_stateid; + struct nfs4_stid *stid = NULL; struct nfsd_file *nf = NULL; __be32 status = nfs_ok; unsigned long cnt; @@ -1227,10 +1272,15 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, trace_nfsd_write_start(rqstp, &cstate->current_fh, write->wr_offset, cnt); status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh, - stateid, WR_STATE, &nf, NULL); + stateid, WR_STATE, &nf, &stid); if (status) return status; + if (stid) { + nfsd4_file_mark_deleg_written(stid->sc_file); + nfs4_put_stid(stid); + } + write->wr_how_written = write->wr_stable_how; status = nfsd_vfs_write(rqstp, &cstate->current_fh, nf, write->wr_offset, &write->wr_payload, @@ -1469,7 +1519,7 @@ try_again: return 0; } if (work) { - strscpy(work->nsui_ipaddr, ipaddr, sizeof(work->nsui_ipaddr) - 1); + strscpy(work->nsui_ipaddr, ipaddr, sizeof(work->nsui_ipaddr)); refcount_set(&work->nsui_refcnt, 2); work->nsui_busy = true; list_add_tail(&work->nsui_list, &nn->nfsd_ssc_mount_list); @@ -2447,7 +2497,7 @@ nfsd4_layoutget(struct svc_rqst *rqstp, if (atomic_read(&ls->ls_stid.sc_file->fi_lo_recalls)) goto out_put_stid; - nfserr = ops->proc_layoutget(d_inode(current_fh->fh_dentry), + nfserr = ops->proc_layoutget(rqstp, d_inode(current_fh->fh_dentry), current_fh, lgp); if (nfserr) goto out_put_stid; @@ -2471,11 +2521,11 @@ static __be32 nfsd4_layoutcommit(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, union nfsd4_op_u *u) { + struct net *net = SVC_NET(rqstp); struct nfsd4_layoutcommit *lcp = &u->layoutcommit; const struct nfsd4_layout_seg *seg = &lcp->lc_seg; struct svc_fh *current_fh = &cstate->current_fh; const struct nfsd4_layout_ops *ops; - loff_t new_size = 
lcp->lc_last_wr + 1; struct inode *inode; struct nfs4_layout_stateid *ls; __be32 nfserr; @@ -2491,43 +2541,50 @@ nfsd4_layoutcommit(struct svc_rqst *rqstp, goto out; inode = d_inode(current_fh->fh_dentry); - nfserr = nfserr_inval; - if (new_size <= seg->offset) { - dprintk("pnfsd: last write before layout segment\n"); - goto out; + lcp->lc_size_chg = false; + if (lcp->lc_newoffset) { + loff_t new_size = lcp->lc_last_wr + 1; + + nfserr = nfserr_inval; + if (new_size <= seg->offset) + goto out; + if (new_size > seg->offset + seg->length) + goto out; + + if (new_size > i_size_read(inode)) { + lcp->lc_size_chg = true; + lcp->lc_newsize = new_size; + } } - if (new_size > seg->offset + seg->length) { - dprintk("pnfsd: last write beyond layout segment\n"); + + nfserr = nfserr_grace; + if (locks_in_grace(net) && !lcp->lc_reclaim) goto out; - } - if (!lcp->lc_newoffset && new_size > i_size_read(inode)) { - dprintk("pnfsd: layoutcommit beyond EOF\n"); + nfserr = nfserr_no_grace; + if (!locks_in_grace(net) && lcp->lc_reclaim) goto out; - } - nfserr = nfsd4_preprocess_layout_stateid(rqstp, cstate, &lcp->lc_sid, - false, lcp->lc_layout_type, - &ls); - if (nfserr) { - trace_nfsd_layout_commit_lookup_fail(&lcp->lc_sid); - /* fixup error code as per RFC5661 */ - if (nfserr == nfserr_bad_stateid) - nfserr = nfserr_badlayout; - goto out; + if (!lcp->lc_reclaim) { + nfserr = nfsd4_preprocess_layout_stateid(rqstp, cstate, + &lcp->lc_sid, false, lcp->lc_layout_type, &ls); + if (nfserr) { + trace_nfsd_layout_commit_lookup_fail(&lcp->lc_sid); + /* fixup error code as per RFC5661 */ + if (nfserr == nfserr_bad_stateid) + nfserr = nfserr_badlayout; + goto out; + } + + /* LAYOUTCOMMIT does not require any serialization */ + mutex_unlock(&ls->ls_mutex); } - /* LAYOUTCOMMIT does not require any serialization */ - mutex_unlock(&ls->ls_mutex); + nfserr = ops->proc_layoutcommit(inode, rqstp, lcp); - if (new_size > i_size_read(inode)) { - lcp->lc_size_chg = true; - lcp->lc_newsize = new_size; - } else { - lcp->lc_size_chg = false; + if (!lcp->lc_reclaim) { + nfsd4_file_mark_deleg_written(ls->ls_stid.sc_file); + nfs4_put_stid(&ls->ls_stid); } - - nfserr = ops->proc_layoutcommit(inode, lcp); - nfs4_put_stid(&ls->ls_stid); out: return nfserr; } diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 2231192ec33f..e2b9472e5c78 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -92,24 +92,10 @@ nfs4_reset_creds(const struct cred *original) put_cred(revert_creds(original)); } -static void -md5_to_hex(char *out, char *md5) -{ - int i; - - for (i=0; i<16; i++) { - unsigned char c = md5[i]; - - *out++ = '0' + ((c&0xf0)>>4) + (c>=0xa0)*('a'-'9'-1); - *out++ = '0' + (c&0x0f) + ((c&0x0f)>=0x0a)*('a'-'9'-1); - } - *out = '\0'; -} - static int -nfs4_make_rec_clidname(char *dname, const struct xdr_netobj *clname) +nfs4_make_rec_clidname(char dname[HEXDIR_LEN], const struct xdr_netobj *clname) { - struct xdr_netobj cksum; + u8 digest[MD5_DIGEST_SIZE]; struct crypto_shash *tfm; int status; @@ -121,23 +107,16 @@ nfs4_make_rec_clidname(char *dname, const struct xdr_netobj *clname) goto out_no_tfm; } - cksum.len = crypto_shash_digestsize(tfm); - cksum.data = kmalloc(cksum.len, GFP_KERNEL); - if (cksum.data == NULL) { - status = -ENOMEM; - goto out; - } - status = crypto_shash_tfm_digest(tfm, clname->data, clname->len, - cksum.data); + digest); if (status) goto out; - md5_to_hex(dname, cksum.data); + static_assert(HEXDIR_LEN == 2 * MD5_DIGEST_SIZE + 1); + sprintf(dname, "%*phN", MD5_DIGEST_SIZE, digest); status = 0; 
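The "%*phN" specifier used above is the kernel's printk extension for dumping a buffer as lowercase hex with no separators. A plain-C sketch of the equivalent 32-character recovery-directory name (the helper name is illustrative, not from the patch):

#include <stdio.h>

#define MD5_DIGEST_SIZE 16
#define HEXDIR_LEN (2 * MD5_DIGEST_SIZE + 1)

/* Equivalent of sprintf(dname, "%*phN", MD5_DIGEST_SIZE, digest):
 * 32 lowercase hex characters plus the terminating NUL. */
static void digest_to_hexdir(char dname[HEXDIR_LEN],
			     const unsigned char digest[MD5_DIGEST_SIZE])
{
	for (int i = 0; i < MD5_DIGEST_SIZE; i++)
		sprintf(dname + 2 * i, "%02x", digest[i]);
}

int main(void)
{
	unsigned char digest[MD5_DIGEST_SIZE] = { 0xde, 0xad, 0xbe, 0xef };
	char dname[HEXDIR_LEN];

	digest_to_hexdir(dname, digest);
	printf("%s\n", dname);	/* always exactly 32 hex characters */
	return 0;
}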
out: - kfree(cksum.data); crypto_free_shash(tfm); out_no_tfm: return status; diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 88c347957da5..81fa7cc6c77b 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1222,6 +1222,42 @@ static void put_deleg_file(struct nfs4_file *fp) nfs4_file_put_access(fp, NFS4_SHARE_ACCESS_READ); } +static void nfsd4_finalize_deleg_timestamps(struct nfs4_delegation *dp, struct file *f) +{ + struct iattr ia = { .ia_valid = ATTR_ATIME | ATTR_CTIME | ATTR_MTIME }; + struct inode *inode = file_inode(f); + int ret; + + /* don't do anything if FMODE_NOCMTIME isn't set */ + if ((READ_ONCE(f->f_mode) & FMODE_NOCMTIME) == 0) + return; + + spin_lock(&f->f_lock); + f->f_mode &= ~FMODE_NOCMTIME; + spin_unlock(&f->f_lock); + + /* was it never written? */ + if (!dp->dl_written) + return; + + /* did it get a setattr for the timestamps at some point? */ + if (dp->dl_setattr) + return; + + /* Stamp everything to "now" */ + inode_lock(inode); + ret = notify_change(&nop_mnt_idmap, f->f_path.dentry, &ia, NULL); + inode_unlock(inode); + if (ret) { + struct inode *inode = file_inode(f); + + pr_notice_ratelimited("Unable to update timestamps on inode %02x:%02x:%lu: %d\n", + MAJOR(inode->i_sb->s_dev), + MINOR(inode->i_sb->s_dev), + inode->i_ino, ret); + } +} + static void nfs4_unlock_deleg_lease(struct nfs4_delegation *dp) { struct nfs4_file *fp = dp->dl_stid.sc_file; @@ -1229,6 +1265,7 @@ static void nfs4_unlock_deleg_lease(struct nfs4_delegation *dp) WARN_ON_ONCE(!fp->fi_delegees); + nfsd4_finalize_deleg_timestamps(dp, nf->nf_file); kernel_setlease(nf->nf_file, F_UNLCK, NULL, (void **)&dp); put_deleg_file(fp); } @@ -6157,7 +6194,8 @@ nfs4_delegation_stat(struct nfs4_delegation *dp, struct svc_fh *currentfh, path.dentry = file_dentry(nf->nf_file); rc = vfs_getattr(&path, stat, - (STATX_MODE | STATX_SIZE | STATX_CTIME | STATX_CHANGE_COOKIE), + STATX_MODE | STATX_SIZE | STATX_ATIME | + STATX_MTIME | STATX_CTIME | STATX_CHANGE_COOKIE, AT_STATX_SYNC_AS_STAT); nfsd_file_put(nf); @@ -6264,6 +6302,8 @@ nfs4_open_delegation(struct svc_rqst *rqstp, struct nfsd4_open *open, memcpy(&open->op_delegate_stateid, &dp->dl_stid.sc_stateid, sizeof(dp->dl_stid.sc_stateid)); if (open->op_share_access & NFS4_SHARE_ACCESS_WRITE) { + struct file *f = dp->dl_stid.sc_file->fi_deleg_file->nf_file; + if (!nfsd4_add_rdaccess_to_wrdeleg(rqstp, open, fh, stp) || !nfs4_delegation_stat(dp, currentfh, &stat)) { nfs4_put_stid(&dp->dl_stid); @@ -6274,10 +6314,17 @@ nfs4_open_delegation(struct svc_rqst *rqstp, struct nfsd4_open *open, OPEN_DELEGATE_WRITE; dp->dl_cb_fattr.ncf_cur_fsize = stat.size; dp->dl_cb_fattr.ncf_initial_cinfo = nfsd4_change_attribute(&stat); + dp->dl_atime = stat.atime; + dp->dl_ctime = stat.ctime; + dp->dl_mtime = stat.mtime; + spin_lock(&f->f_lock); + f->f_mode |= FMODE_NOCMTIME; + spin_unlock(&f->f_lock); trace_nfsd_deleg_write(&dp->dl_stid.sc_stateid); } else { - open->op_delegate_type = deleg_ts ? OPEN_DELEGATE_READ_ATTRS_DELEG : - OPEN_DELEGATE_READ; + open->op_delegate_type = deleg_ts && nfs4_delegation_stat(dp, currentfh, &stat) ? 
+ OPEN_DELEGATE_READ_ATTRS_DELEG : OPEN_DELEGATE_READ; + dp->dl_atime = stat.atime; trace_nfsd_deleg_read(&dp->dl_stid.sc_stateid); } nfs4_put_stid(&dp->dl_stid); @@ -9130,25 +9177,25 @@ nfsd4_get_writestateid(struct nfsd4_compound_state *cstate, } /** - * set_cb_time - vet and set the timespec for a cb_getattr update - * @cb: timestamp from the CB_GETATTR response + * nfsd4_vet_deleg_time - vet and set the timespec for a delegated timestamp update + * @req: timestamp from the client * @orig: original timestamp in the inode * @now: current time * - * Given a timestamp in a CB_GETATTR response, check it against the + * Given a timestamp from the client response, check it against the * current timestamp in the inode and the current time. Returns true * if the inode's timestamp needs to be updated, and false otherwise. - * @cb may also be changed if the timestamp needs to be clamped. + * @req may also be changed if the timestamp needs to be clamped. */ -static bool set_cb_time(struct timespec64 *cb, const struct timespec64 *orig, - const struct timespec64 *now) +bool nfsd4_vet_deleg_time(struct timespec64 *req, const struct timespec64 *orig, + const struct timespec64 *now) { /* * "When the time presented is before the original time, then the * update is ignored." Also no need to update if there is no change. */ - if (timespec64_compare(cb, orig) <= 0) + if (timespec64_compare(req, orig) <= 0) return false; /* @@ -9156,10 +9203,8 @@ static bool set_cb_time(struct timespec64 *cb, const struct timespec64 *orig, * clamp the new time to the current time, or it may * return NFS4ERR_DELAY to the client, allowing it to retry." */ - if (timespec64_compare(cb, now) > 0) { - /* clamp it */ - *cb = *now; - } + if (timespec64_compare(req, now) > 0) + *req = *now; return true; } @@ -9167,28 +9212,27 @@ static bool set_cb_time(struct timespec64 *cb, const struct timespec64 *orig, static int cb_getattr_update_times(struct dentry *dentry, struct nfs4_delegation *dp) { struct inode *inode = d_inode(dentry); - struct timespec64 now = current_time(inode); struct nfs4_cb_fattr *ncf = &dp->dl_cb_fattr; struct iattr attrs = { }; int ret; if (deleg_attrs_deleg(dp->dl_type)) { - struct timespec64 atime = inode_get_atime(inode); - struct timespec64 mtime = inode_get_mtime(inode); + struct timespec64 now = current_time(inode); attrs.ia_atime = ncf->ncf_cb_atime; attrs.ia_mtime = ncf->ncf_cb_mtime; - if (set_cb_time(&attrs.ia_atime, &atime, &now)) + if (nfsd4_vet_deleg_time(&attrs.ia_atime, &dp->dl_atime, &now)) attrs.ia_valid |= ATTR_ATIME | ATTR_ATIME_SET; - if (set_cb_time(&attrs.ia_mtime, &mtime, &now)) { - attrs.ia_valid |= ATTR_CTIME | ATTR_MTIME | ATTR_MTIME_SET; + if (nfsd4_vet_deleg_time(&attrs.ia_mtime, &dp->dl_mtime, &now)) { + attrs.ia_valid |= ATTR_MTIME | ATTR_MTIME_SET; attrs.ia_ctime = attrs.ia_mtime; + if (nfsd4_vet_deleg_time(&attrs.ia_ctime, &dp->dl_ctime, &now)) + attrs.ia_valid |= ATTR_CTIME | ATTR_CTIME_SET; } } else { attrs.ia_valid |= ATTR_MTIME | ATTR_CTIME; - attrs.ia_mtime = attrs.ia_ctime = now; } if (!attrs.ia_valid) diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index ea91bad4eee2..c0a3c6a7c8bb 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -538,8 +538,9 @@ nfsd4_decode_fattr4(struct nfsd4_compoundargs *argp, u32 *bmval, u32 bmlen, iattr->ia_mtime.tv_sec = modify.seconds; iattr->ia_mtime.tv_nsec = modify.nseconds; iattr->ia_ctime.tv_sec = modify.seconds; - iattr->ia_ctime.tv_nsec = modify.seconds; - iattr->ia_valid |= ATTR_CTIME | ATTR_MTIME | ATTR_MTIME_SET | ATTR_DELEG; + 
iattr->ia_ctime.tv_nsec = modify.nseconds; + iattr->ia_valid |= ATTR_CTIME | ATTR_CTIME_SET | + ATTR_MTIME | ATTR_MTIME_SET | ATTR_DELEG; } /* request sanity: did attrlist4 contain the expected number of words? */ @@ -587,23 +588,13 @@ nfsd4_decode_state_owner4(struct nfsd4_compoundargs *argp, } #ifdef CONFIG_NFSD_PNFS -static __be32 -nfsd4_decode_deviceid4(struct nfsd4_compoundargs *argp, - struct nfsd4_deviceid *devid) -{ - __be32 *p; - - p = xdr_inline_decode(argp->xdr, NFS4_DEVICEID4_SIZE); - if (!p) - return nfserr_bad_xdr; - memcpy(devid, p, sizeof(*devid)); - return nfs_ok; -} static __be32 nfsd4_decode_layoutupdate4(struct nfsd4_compoundargs *argp, struct nfsd4_layoutcommit *lcp) { + u32 len; + if (xdr_stream_decode_u32(argp->xdr, &lcp->lc_layout_type) < 0) return nfserr_bad_xdr; if (lcp->lc_layout_type < LAYOUT_NFSV4_1_FILES) @@ -611,13 +602,10 @@ nfsd4_decode_layoutupdate4(struct nfsd4_compoundargs *argp, if (lcp->lc_layout_type >= LAYOUT_TYPE_MAX) return nfserr_bad_xdr; - if (xdr_stream_decode_u32(argp->xdr, &lcp->lc_up_len) < 0) + if (xdr_stream_decode_u32(argp->xdr, &len) < 0) + return nfserr_bad_xdr; + if (!xdr_stream_subsegment(argp->xdr, &lcp->lc_up_layout, len)) return nfserr_bad_xdr; - if (lcp->lc_up_len > 0) { - lcp->lc_up_layout = xdr_inline_decode(argp->xdr, lcp->lc_up_len); - if (!lcp->lc_up_layout) - return nfserr_bad_xdr; - } return nfs_ok; } @@ -1783,7 +1771,7 @@ nfsd4_decode_getdeviceinfo(struct nfsd4_compoundargs *argp, __be32 status; memset(gdev, 0, sizeof(*gdev)); - status = nfsd4_decode_deviceid4(argp, &gdev->gd_devid); + status = nfsd4_decode_deviceid4(argp->xdr, &gdev->gd_devid); if (status) return status; if (xdr_stream_decode_u32(argp->xdr, &gdev->gd_layout_type) < 0) @@ -1814,7 +1802,7 @@ nfsd4_decode_layoutcommit(struct nfsd4_compoundargs *argp, status = nfsd4_decode_stateid4(argp, &lcp->lc_sid); if (status) return status; - if (xdr_stream_decode_u32(argp->xdr, &lcp->lc_newoffset) < 0) + if (xdr_stream_decode_bool(argp->xdr, &lcp->lc_newoffset) < 0) return nfserr_bad_xdr; if (lcp->lc_newoffset) { if (xdr_stream_decode_u64(argp->xdr, &lcp->lc_last_wr) < 0) diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index ba9d326b3de6..ab13ee9c7fd8 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -27,7 +27,7 @@ * cache size, the idea being that when the cache is at its maximum number * of entries, then this should be the average number of entries per bucket. */ -#define TARGET_BUCKET_SIZE 64 +#define TARGET_BUCKET_SIZE 8 struct nfsd_drc_bucket { struct rb_root rb_head; @@ -237,10 +237,6 @@ void nfsd_reply_cache_shutdown(struct nfsd_net *nn) } -/* - * Move cache entry to end of LRU list, and queue the cleaner to run if it's - * not already scheduled. - */ static void lru_put_end(struct nfsd_drc_bucket *b, struct nfsd_cacherep *rp) { @@ -272,13 +268,6 @@ nfsd_prune_bucket_locked(struct nfsd_net *nn, struct nfsd_drc_bucket *b, /* The bucket LRU is ordered oldest-first. */ list_for_each_entry_safe(rp, tmp, &b->lru_head, c_lru) { - /* - * Don't free entries attached to calls that are still - * in-progress, but do keep scanning the list. 
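For context on the TARGET_BUCKET_SIZE change above: the DRC hash table is sized so that a full cache averages TARGET_BUCKET_SIZE entries per bucket, so dropping the target from 64 to 8 grows the table roughly eightfold for a given max_drc_entries. A sketch of that arithmetic; the power-of-two rounding is modelled on the kernel's roundup_pow_of_two() and is an assumption about the sizing code, which this hunk does not show:

#include <stdio.h>

#define TARGET_BUCKET_SIZE 8

/* Userspace stand-in for the kernel's roundup_pow_of_two(). */
static unsigned int round_up_pow2(unsigned int n)
{
	unsigned int p = 1;

	while (p < n)
		p <<= 1;
	return p;
}

int main(void)
{
	unsigned int max_drc_entries = 10240;	/* example cache limit */

	printf("DRC buckets: %u\n",
	       round_up_pow2(max_drc_entries / TARGET_BUCKET_SIZE));
	return 0;
}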
- */ - if (rp->c_state == RC_INPROG) - continue; - if (atomic_read(&nn->num_drc_entries) <= nn->max_drc_entries && time_before(expiry, rp->c_timestamp)) break; @@ -453,8 +442,6 @@ out: nn->longest_chain_cachesize, atomic_read(&nn->num_drc_entries)); } - - lru_put_end(b, ret); return ret; } diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index bc6b776fc657..2b79129703d5 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1103,89 +1103,48 @@ static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size) * populating the filesystem. */ -/* Basically copying rpc_get_inode. */ static struct inode *nfsd_get_inode(struct super_block *sb, umode_t mode) { struct inode *inode = new_inode(sb); - if (!inode) - return NULL; - /* Following advice from simple_fill_super documentation: */ - inode->i_ino = iunique(sb, NFSD_MaxReserved); - inode->i_mode = mode; - simple_inode_init_ts(inode); - switch (mode & S_IFMT) { - case S_IFDIR: - inode->i_fop = &simple_dir_operations; - inode->i_op = &simple_dir_inode_operations; - inc_nlink(inode); - break; - case S_IFLNK: - inode->i_op = &simple_symlink_inode_operations; - break; - default: - break; + if (inode) { + /* Following advice from simple_fill_super documentation: */ + inode->i_ino = iunique(sb, NFSD_MaxReserved); + inode->i_mode = mode; + simple_inode_init_ts(inode); } return inode; } -static int __nfsd_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode, struct nfsdfs_client *ncl) +static struct dentry *nfsd_mkdir(struct dentry *parent, struct nfsdfs_client *ncl, char *name) { + struct inode *dir = parent->d_inode; + struct dentry *dentry; struct inode *inode; - inode = nfsd_get_inode(dir->i_sb, mode); + inode = nfsd_get_inode(parent->d_sb, S_IFDIR | 0600); if (!inode) - return -ENOMEM; + return ERR_PTR(-ENOMEM); + + dentry = simple_start_creating(parent, name); + if (IS_ERR(dentry)) { + iput(inode); + return dentry; + } + inode->i_fop = &simple_dir_operations; + inode->i_op = &simple_dir_inode_operations; + inc_nlink(inode); if (ncl) { inode->i_private = ncl; kref_get(&ncl->cl_ref); } - d_add(dentry, inode); + d_instantiate(dentry, inode); inc_nlink(dir); fsnotify_mkdir(dir, dentry); - return 0; -} - -static struct dentry *nfsd_mkdir(struct dentry *parent, struct nfsdfs_client *ncl, char *name) -{ - struct inode *dir = parent->d_inode; - struct dentry *dentry; - int ret = -ENOMEM; - - inode_lock(dir); - dentry = d_alloc_name(parent, name); - if (!dentry) - goto out_err; - ret = __nfsd_mkdir(d_inode(parent), dentry, S_IFDIR | 0600, ncl); - if (ret) - goto out_err; -out: inode_unlock(dir); return dentry; -out_err: - dput(dentry); - dentry = ERR_PTR(ret); - goto out; } #if IS_ENABLED(CONFIG_SUNRPC_GSS) -static int __nfsd_symlink(struct inode *dir, struct dentry *dentry, - umode_t mode, const char *content) -{ - struct inode *inode; - - inode = nfsd_get_inode(dir->i_sb, mode); - if (!inode) - return -ENOMEM; - - inode->i_link = (char *)content; - inode->i_size = strlen(content); - - d_add(dentry, inode); - inc_nlink(dir); - fsnotify_create(dir, dentry); - return 0; -} - /* * @content is assumed to be a NUL-terminated string that lives * longer than the symlink itself. 
@@ -1194,17 +1153,25 @@ static void _nfsd_symlink(struct dentry *parent, const char *name, const char *content) { struct inode *dir = parent->d_inode; + struct inode *inode; struct dentry *dentry; - int ret; - inode_lock(dir); - dentry = d_alloc_name(parent, name); - if (!dentry) - goto out; - ret = __nfsd_symlink(d_inode(parent), dentry, S_IFLNK | 0777, content); - if (ret) - dput(dentry); -out: + inode = nfsd_get_inode(dir->i_sb, S_IFLNK | 0777); + if (!inode) + return; + + dentry = simple_start_creating(parent, name); + if (IS_ERR(dentry)) { + iput(inode); + return; + } + + inode->i_op = &simple_symlink_inode_operations; + inode->i_link = (char *)content; + inode->i_size = strlen(content); + + d_instantiate(dentry, inode); + fsnotify_create(dir, dentry); inode_unlock(dir); } #else @@ -1240,40 +1207,34 @@ struct nfsdfs_client *get_nfsdfs_client(struct inode *inode) /* XXX: cut'n'paste from simple_fill_super; figure out if we could share * code instead. */ -static int nfsdfs_create_files(struct dentry *root, +static int nfsdfs_create_files(struct dentry *root, const struct tree_descr *files, struct nfsdfs_client *ncl, struct dentry **fdentries) { struct inode *dir = d_inode(root); - struct inode *inode; struct dentry *dentry; - int i; - inode_lock(dir); - for (i = 0; files->name && files->name[0]; i++, files++) { - dentry = d_alloc_name(root, files->name); - if (!dentry) - goto out; - inode = nfsd_get_inode(d_inode(root)->i_sb, - S_IFREG | files->mode); - if (!inode) { - dput(dentry); - goto out; + for (int i = 0; files->name && files->name[0]; i++, files++) { + struct inode *inode = nfsd_get_inode(root->d_sb, + S_IFREG | files->mode); + if (!inode) + return -ENOMEM; + dentry = simple_start_creating(root, files->name); + if (IS_ERR(dentry)) { + iput(inode); + return PTR_ERR(dentry); } kref_get(&ncl->cl_ref); inode->i_fop = files->ops; inode->i_private = ncl; - d_add(dentry, inode); + d_instantiate(dentry, inode); fsnotify_create(dir, dentry); if (fdentries) fdentries[i] = dentry; + inode_unlock(dir); } - inode_unlock(dir); return 0; -out: - inode_unlock(dir); - return -ENOMEM; } /* on success, returns positive number unique to that client. */ @@ -1993,7 +1954,7 @@ int nfsd_nl_listener_set_doit(struct sk_buff *skb, struct genl_info *info) * remaining listeners and recreate the list. */ if (delete) - svc_xprt_destroy_all(serv, net); + svc_xprt_destroy_all(serv, net, false); /* walk list of addrs again, open any that still don't exist */ nlmsg_for_each_attr_type(attr, NFSD_A_SERVER_SOCK_ADDR, info->nlhdr, diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 1cd0bed57bc2..ea87b42894dd 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -153,6 +153,15 @@ static inline void nfsd_debugfs_exit(void) {} extern bool nfsd_disable_splice_read __read_mostly; +enum { + /* Any new NFSD_IO enum value must be added at the end */ + NFSD_IO_BUFFERED, + NFSD_IO_DONTCACHE, +}; + +extern u64 nfsd_io_cache_read __read_mostly; +extern u64 nfsd_io_cache_write __read_mostly; + extern int nfsd_max_blksize; static inline int nfsd_v4client(struct svc_rqst *rq) @@ -335,14 +344,8 @@ void nfsd_lockd_shutdown(void); * cannot conflict with any existing be32 nfserr value. */ enum { - NFSERR_DROPIT = NFS4ERR_FIRST_FREE, -/* if a request fails due to kmalloc failure, it gets dropped. 
- * Client should resend eventually - */ -#define nfserr_dropit cpu_to_be32(NFSERR_DROPIT) - /* end-of-file indicator in readdir */ - NFSERR_EOF, + NFSERR_EOF = NFS4ERR_FIRST_FREE, #define nfserr_eof cpu_to_be32(NFSERR_EOF) /* replay detected */ diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index 74cf1f4de174..3eb724ec9566 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -364,10 +364,30 @@ __fh_verify(struct svc_rqst *rqstp, if (error) goto out; + /* + * If rqstp is NULL, this is a LOCALIO request which will only + * ever use a filehandle/credential pair for which access has + * been affirmed (by ACCESS or OPEN NFS requests) over the + * wire. Skip both the xprtsec policy and the security flavor + * checks. + */ + if (!rqstp) + goto check_permissions; + if ((access & NFSD_MAY_NLM) && (exp->ex_flags & NFSEXP_NOAUTHNLM)) /* NLM is allowed to fully bypass authentication */ goto out; + /* + * NLM is allowed to bypass the xprtsec policy check because lockd + * doesn't support xprtsec. + */ + if (!(access & NFSD_MAY_NLM)) { + error = check_xprtsec_policy(exp, rqstp); + if (error) + goto out; + } + if (access & NFSD_MAY_BYPASS_GSS) may_bypass_gss = true; /* @@ -379,13 +399,13 @@ __fh_verify(struct svc_rqst *rqstp, && exp->ex_path.dentry == dentry) may_bypass_gss = true; - error = check_nfsd_access(exp, rqstp, may_bypass_gss); + error = check_security_flavor(exp, rqstp, may_bypass_gss); if (error) goto out; - /* During LOCALIO call to fh_verify will be called with a NULL rqstp */ - if (rqstp) - svc_xprt_set_valid(rqstp->rq_xprt); + svc_xprt_set_valid(rqstp->rq_xprt); + +check_permissions: /* Finally, check access permissions. */ error = nfsd_permission(cred, exp, dentry, access); out: @@ -663,6 +683,33 @@ out_negative: } /** + * fh_getattr - Retrieve attributes on a local file + * @fhp: File handle of target file + * @stat: Caller-supplied kstat buffer to be filled in + * + * Returns nfs_ok on success, otherwise an NFS status code is + * returned. + */ +__be32 fh_getattr(const struct svc_fh *fhp, struct kstat *stat) +{ + struct path p = { + .mnt = fhp->fh_export->ex_path.mnt, + .dentry = fhp->fh_dentry, + }; + struct inode *inode = d_inode(p.dentry); + u32 request_mask = STATX_BASIC_STATS; + + if (S_ISREG(inode->i_mode)) + request_mask |= (STATX_DIOALIGN | STATX_DIO_READ_ALIGN); + + if (fhp->fh_maxsize == NFS4_FHSIZE) + request_mask |= (STATX_BTIME | STATX_CHANGE_COOKIE); + + return nfserrno(vfs_getattr(&p, stat, request_mask, + AT_STATX_SYNC_AS_STAT)); +} + +/** * fh_fill_pre_attrs - Fill in pre-op attributes * @fhp: file handle to be updated * diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h index 1cf979722521..5ef7191f8ad8 100644 --- a/fs/nfsd/nfsfh.h +++ b/fs/nfsd/nfsfh.h @@ -14,6 +14,8 @@ #include <linux/exportfs.h> #include <linux/nfs4.h> +#include "export.h" + /* * The file handle starts with a sequence of four-byte words. 
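fh_getattr() above requests STATX_DIOALIGN and STATX_DIO_READ_ALIGN so that nfsd_file_get_dio_attrs() can record direct-I/O alignment for regular files. The same attributes are visible from userspace via statx(2); a hedged sketch, assuming glibc 2.28+ and kernel headers that define STATX_DIOALIGN (the newer STATX_DIO_READ_ALIGN field is omitted because older headers lack it):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>

int main(int argc, char **argv)
{
	struct statx stx;

	if (argc != 2) {
		fprintf(stderr, "usage: %s <file>\n", argv[0]);
		return 1;
	}
	if (statx(AT_FDCWD, argv[1], 0, STATX_DIOALIGN, &stx) != 0) {
		perror("statx");
		return 1;
	}
	if (stx.stx_mask & STATX_DIOALIGN)
		printf("dio_mem_align=%u dio_offset_align=%u\n",
		       stx.stx_dio_mem_align, stx.stx_dio_offset_align);
	else
		printf("STATX_DIOALIGN not supported for this file\n");
	return 0;
}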
* The first word contains a version number (1) and three descriptor bytes @@ -220,6 +222,7 @@ extern char * SVCFH_fmt(struct svc_fh *fhp); __be32 fh_verify(struct svc_rqst *, struct svc_fh *, umode_t, int); __be32 fh_verify_local(struct net *, struct svc_cred *, struct auth_domain *, struct svc_fh *, umode_t, int); +__be32 fh_getattr(const struct svc_fh *fhp, struct kstat *stat); __be32 fh_compose(struct svc_fh *, struct svc_export *, struct dentry *, struct svc_fh *); __be32 fh_update(struct svc_fh *); void fh_put(struct svc_fh *); @@ -272,6 +275,41 @@ static inline bool fh_fsid_match(const struct knfsd_fh *fh1, } /** + * fh_want_write - Get write access to an export + * @fhp: File handle of file to be written + * + * Caller must invoke fh_drop_write() when its write operation + * is complete. + * + * Returns 0 if the file handle's export can be written to. Otherwise + * the export is not prepared for updates, and the returned negative + * errno value reflects the reason for the failure. + */ +static inline int fh_want_write(struct svc_fh *fhp) +{ + int ret; + + if (fhp->fh_want_write) + return 0; + ret = mnt_want_write(fhp->fh_export->ex_path.mnt); + if (!ret) + fhp->fh_want_write = true; + return ret; +} + +/** + * fh_drop_write - Release write access on an export + * @fhp: File handle of file on which fh_want_write() was previously called + */ +static inline void fh_drop_write(struct svc_fh *fhp) +{ + if (fhp->fh_want_write) { + fhp->fh_want_write = false; + mnt_drop_write(fhp->fh_export->ex_path.mnt); + } +} + +/** * knfsd_fh_hash - calculate the crc32 hash for the filehandle * @fh - pointer to filehandle * diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 82b0111ac469..7057ddd7a0a8 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -535,16 +535,13 @@ void nfsd_destroy_serv(struct net *net) #endif } - svc_xprt_destroy_all(serv, net); - /* * write_ports can create the server without actually starting - * any threads--if we get shut down before any threads are + * any threads. If we get shut down before any threads are * started, then nfsd_destroy_serv will be run before any of this * other initialization has been done except the rpcb information. 
*/ - svc_rpcb_cleanup(serv, net); - + svc_xprt_destroy_all(serv, net, true); nfsd_shutdown_net(net); svc_destroy(&serv); } diff --git a/fs/nfsd/pnfs.h b/fs/nfsd/pnfs.h index 925817f66917..db9af780438b 100644 --- a/fs/nfsd/pnfs.h +++ b/fs/nfsd/pnfs.h @@ -29,12 +29,13 @@ struct nfsd4_layout_ops { __be32 (*encode_getdeviceinfo)(struct xdr_stream *xdr, const struct nfsd4_getdeviceinfo *gdevp); - __be32 (*proc_layoutget)(struct inode *, const struct svc_fh *fhp, - struct nfsd4_layoutget *lgp); + __be32 (*proc_layoutget)(struct svc_rqst *rqstp, struct inode *inode, + const struct svc_fh *fhp, struct nfsd4_layoutget *lgp); __be32 (*encode_layoutget)(struct xdr_stream *xdr, const struct nfsd4_layoutget *lgp); __be32 (*proc_layoutcommit)(struct inode *inode, + struct svc_rqst *rqstp, struct nfsd4_layoutcommit *lcp); void (*fence_client)(struct nfs4_layout_stateid *ls, diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 8adc2550129e..1e736f402426 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -35,6 +35,7 @@ #ifndef _NFSD4_STATE_H #define _NFSD4_STATE_H +#include <crypto/md5.h> #include <linux/idr.h> #include <linux/refcount.h> #include <linux/sunrpc/svc_xprt.h> @@ -217,13 +218,20 @@ struct nfs4_delegation { struct nfs4_clnt_odstate *dl_clnt_odstate; time64_t dl_time; u32 dl_type; -/* For recall: */ + /* For recall: */ int dl_retries; struct nfsd4_callback dl_recall; bool dl_recalled; + bool dl_written; + bool dl_setattr; /* for CB_GETATTR */ struct nfs4_cb_fattr dl_cb_fattr; + + /* For delegated timestamps */ + struct timespec64 dl_atime; + struct timespec64 dl_mtime; + struct timespec64 dl_ctime; }; static inline bool deleg_is_read(u32 dl_type) @@ -242,6 +250,9 @@ static inline bool deleg_attrs_deleg(u32 dl_type) dl_type == OPEN_DELEGATE_WRITE_ATTRS_DELEG; } +bool nfsd4_vet_deleg_time(struct timespec64 *cb, const struct timespec64 *orig, + const struct timespec64 *now); + #define cb_to_delegation(cb) \ container_of(cb, struct nfs4_delegation, dl_recall) @@ -381,7 +392,8 @@ struct nfsd4_sessionid { u32 reserved; }; -#define HEXDIR_LEN 33 /* hex version of 16 byte md5 of cl_name plus '\0' */ +/* Length of MD5 digest as hex, plus terminating '\0' */ +#define HEXDIR_LEN (2 * MD5_DIGEST_SIZE + 1) /* * State Meaning Where set diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index a664fdf1161e..6e2c8e2aab10 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -1133,6 +1133,33 @@ TRACE_EVENT(nfsd_file_alloc, ) ); +TRACE_EVENT(nfsd_file_get_dio_attrs, + TP_PROTO( + const struct inode *inode, + const struct kstat *stat + ), + TP_ARGS(inode, stat), + TP_STRUCT__entry( + __field(const void *, inode) + __field(unsigned long, mask) + __field(u32, mem_align) + __field(u32, offset_align) + __field(u32, read_offset_align) + ), + TP_fast_assign( + __entry->inode = inode; + __entry->mask = stat->result_mask; + __entry->mem_align = stat->dio_mem_align; + __entry->offset_align = stat->dio_offset_align; + __entry->read_offset_align = stat->dio_read_offset_align; + ), + TP_printk("inode=%p flags=%s mem_align=%u offset_align=%u read_offset_align=%u", + __entry->inode, show_statx_mask(__entry->mask), + __entry->mem_align, __entry->offset_align, + __entry->read_offset_align + ) +); + TRACE_EVENT(nfsd_file_acquire, TP_PROTO( const struct svc_rqst *rqstp, diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index aa4a95713a48..9cb20d4aeab1 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -49,6 +49,8 @@ #define NFSDDBG_FACILITY NFSDDBG_FILEOP bool nfsd_disable_splice_read __read_mostly; +u64 nfsd_io_cache_read __read_mostly = 
NFSD_IO_BUFFERED; +u64 nfsd_io_cache_write __read_mostly = NFSD_IO_BUFFERED; /** * nfserrno - Map Linux errnos to NFS errnos @@ -467,7 +469,7 @@ static int __nfsd_setattr(struct dentry *dentry, struct iattr *iap) return 0; } - if (!iap->ia_valid) + if ((iap->ia_valid & ~ATTR_DELEG) == 0) return 0; /* @@ -1099,6 +1101,16 @@ __be32 nfsd_iter_read(struct svc_rqst *rqstp, struct svc_fh *fhp, size_t len; init_sync_kiocb(&kiocb, file); + + switch (nfsd_io_cache_read) { + case NFSD_IO_BUFFERED: + break; + case NFSD_IO_DONTCACHE: + if (file->f_op->fop_flags & FOP_DONTCACHE) + kiocb.ki_flags = IOCB_DONTCACHE; + break; + } + kiocb.ki_pos = offset; v = 0; @@ -1224,6 +1236,15 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, since = READ_ONCE(file->f_wb_err); if (verf) nfsd_copy_write_verifier(verf, nn); + + switch (nfsd_io_cache_write) { + case NFSD_IO_BUFFERED: + break; + case NFSD_IO_DONTCACHE: + if (file->f_op->fop_flags & FOP_DONTCACHE) + kiocb.ki_flags |= IOCB_DONTCACHE; + break; + } host_err = vfs_iocb_iter_write(file, &kiocb, &iter); if (host_err < 0) { commit_reset_write_verifier(nn, rqstp, host_err); diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index eff04959606f..0c0292611c6d 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -160,37 +160,4 @@ __be32 nfsd_permission(struct svc_cred *cred, struct svc_export *exp, void nfsd_filp_close(struct file *fp); -static inline int fh_want_write(struct svc_fh *fh) -{ - int ret; - - if (fh->fh_want_write) - return 0; - ret = mnt_want_write(fh->fh_export->ex_path.mnt); - if (!ret) - fh->fh_want_write = true; - return ret; -} - -static inline void fh_drop_write(struct svc_fh *fh) -{ - if (fh->fh_want_write) { - fh->fh_want_write = false; - mnt_drop_write(fh->fh_export->ex_path.mnt); - } -} - -static inline __be32 fh_getattr(const struct svc_fh *fh, struct kstat *stat) -{ - u32 request_mask = STATX_BASIC_STATS; - struct path p = {.mnt = fh->fh_export->ex_path.mnt, - .dentry = fh->fh_dentry}; - - if (fh->fh_maxsize == NFS4_FHSIZE) - request_mask |= (STATX_BTIME | STATX_CHANGE_COOKIE); - - return nfserrno(vfs_getattr(&p, stat, request_mask, - AT_STATX_SYNC_AS_STAT)); -} - #endif /* LINUX_NFSD_VFS_H */ diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index a23bc56051ca..d4b48602b2b0 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -595,9 +595,43 @@ struct nfsd4_reclaim_complete { struct nfsd4_deviceid { u64 fsid_idx; u32 generation; - u32 pad; }; +static inline __be32 * +svcxdr_encode_deviceid4(__be32 *p, const struct nfsd4_deviceid *devid) +{ + __be64 *q = (__be64 *)p; + + *q = (__force __be64)devid->fsid_idx; + p += 2; + *p++ = (__force __be32)devid->generation; + *p++ = xdr_zero; + return p; +} + +static inline __be32 * +svcxdr_decode_deviceid4(__be32 *p, struct nfsd4_deviceid *devid) +{ + __be64 *q = (__be64 *)p; + + devid->fsid_idx = (__force u64)(*q); + p += 2; + devid->generation = (__force u32)(*p++); + p++; /* NFSD does not use the remaining octets */ + return p; +} + +static inline __be32 +nfsd4_decode_deviceid4(struct xdr_stream *xdr, struct nfsd4_deviceid *devid) +{ + __be32 *p = xdr_inline_decode(xdr, NFS4_DEVICEID4_SIZE); + + if (unlikely(!p)) + return nfserr_bad_xdr; + svcxdr_decode_deviceid4(p, devid); + return nfs_ok; +} + struct nfsd4_layout_seg { u32 iomode; u64 offset; @@ -630,8 +664,7 @@ struct nfsd4_layoutcommit { u64 lc_last_wr; /* request */ struct timespec64 lc_mtime; /* request */ u32 lc_layout_type; /* request */ - u32 lc_up_len; /* layout length */ - void *lc_up_layout; /* decoded by callback */ + struct xdr_buf 
lc_up_layout; /* decoded by callback */ bool lc_size_chg; /* response */ u64 lc_newsize; /* response */ }; diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h index b78308975082..39e60218df7c 100644 --- a/fs/notify/fanotify/fanotify.h +++ b/fs/notify/fanotify/fanotify.h @@ -441,7 +441,9 @@ struct fanotify_perm_event { size_t count; u32 response; /* userspace answer to the event */ unsigned short state; /* state of the event */ + unsigned short watchdog_cnt; /* already scanned by watchdog? */ int fd; /* fd we passed to userspace for this event */ + pid_t recv_pid; /* pid of task receiving the event */ union { struct fanotify_response_info_header hdr; struct fanotify_response_info_audit_rule audit_rule; diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index b192ee068a7a..1dadda82cae5 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -50,6 +50,7 @@ /* configurable via /proc/sys/fs/fanotify/ */ static int fanotify_max_queued_events __read_mostly; +static int perm_group_timeout __read_mostly; #ifdef CONFIG_SYSCTL @@ -85,6 +86,14 @@ static const struct ctl_table fanotify_table[] = { .proc_handler = proc_dointvec_minmax, .extra1 = SYSCTL_ZERO }, + { + .procname = "watchdog_timeout", + .data = &perm_group_timeout, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, }; static void __init fanotify_sysctls_init(void) @@ -95,6 +104,91 @@ static void __init fanotify_sysctls_init(void) #define fanotify_sysctls_init() do { } while (0) #endif /* CONFIG_SYSCTL */ +static LIST_HEAD(perm_group_list); +static DEFINE_SPINLOCK(perm_group_lock); +static void perm_group_watchdog(struct work_struct *work); +static DECLARE_DELAYED_WORK(perm_group_work, perm_group_watchdog); + +static void perm_group_watchdog_schedule(void) +{ + schedule_delayed_work(&perm_group_work, secs_to_jiffies(perm_group_timeout)); +} + +static void perm_group_watchdog(struct work_struct *work) +{ + struct fsnotify_group *group; + struct fanotify_perm_event *event; + struct task_struct *task; + pid_t failed_pid = 0; + + guard(spinlock)(&perm_group_lock); + if (list_empty(&perm_group_list)) + return; + + list_for_each_entry(group, &perm_group_list, + fanotify_data.perm_grp_list) { + /* + * Ok to test without lock, racing with an addition is + * fine, will deal with it next round + */ + if (list_empty(&group->fanotify_data.access_list)) + continue; + + spin_lock(&group->notification_lock); + list_for_each_entry(event, &group->fanotify_data.access_list, + fae.fse.list) { + if (likely(event->watchdog_cnt == 0)) { + event->watchdog_cnt = 1; + } else if (event->watchdog_cnt == 1) { + /* Report on event only once */ + event->watchdog_cnt = 2; + + /* Do not report same pid repeatedly */ + if (event->recv_pid == failed_pid) + continue; + + failed_pid = event->recv_pid; + rcu_read_lock(); + task = find_task_by_pid_ns(event->recv_pid, + &init_pid_ns); + pr_warn_ratelimited( + "PID %u (%s) failed to respond to fanotify queue for more than %d seconds\n", + event->recv_pid, + task ? task->comm : NULL, + perm_group_timeout); + rcu_read_unlock(); + } + } + spin_unlock(&group->notification_lock); + } + perm_group_watchdog_schedule(); +} + +static void fanotify_perm_watchdog_group_remove(struct fsnotify_group *group) +{ + if (!list_empty(&group->fanotify_data.perm_grp_list)) { + /* Perm event watchdog can no longer scan this group. 
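The watchdog only runs while the new sysctl is nonzero; given the fanotify_table registration above it should surface as /proc/sys/fs/fanotify/watchdog_timeout (path assumed from the existing fanotify sysctl directory). A userspace sketch of the misbehavior it reports: a listener that dequeues a permission event but never writes a response, which the watchdog names in the ratelimited warning once the timeout elapses.

/* Requires CAP_SYS_ADMIN; another process must open a file in /tmp. */
#include <fcntl.h>
#include <sys/fanotify.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	int fan = fanotify_init(FAN_CLASS_CONTENT, O_RDONLY);

	if (fan < 0)
		return 1;
	if (fanotify_mark(fan, FAN_MARK_ADD,
			  FAN_OPEN_PERM | FAN_EVENT_ON_CHILD, AT_FDCWD, "/tmp"))
		return 1;
	(void)read(fan, buf, sizeof(buf));	/* dequeue one event... */
	pause();				/* ...but never answer it */
	return 0;
}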
*/ + spin_lock(&perm_group_lock); + list_del_init(&group->fanotify_data.perm_grp_list); + spin_unlock(&perm_group_lock); + } +} + +static void fanotify_perm_watchdog_group_add(struct fsnotify_group *group) +{ + if (!perm_group_timeout) + return; + + spin_lock(&perm_group_lock); + if (list_empty(&group->fanotify_data.perm_grp_list)) { + /* Add to perm_group_list for monitoring by watchdog. */ + if (list_empty(&perm_group_list)) + perm_group_watchdog_schedule(); + list_add_tail(&group->fanotify_data.perm_grp_list, &perm_group_list); + } + spin_unlock(&perm_group_lock); +} + /* * All flags that may be specified in parameter event_f_flags of fanotify_init. * @@ -953,6 +1047,7 @@ static ssize_t fanotify_read(struct file *file, char __user *buf, spin_lock(&group->notification_lock); list_add_tail(&event->fse.list, &group->fanotify_data.access_list); + FANOTIFY_PERM(event)->recv_pid = current->pid; spin_unlock(&group->notification_lock); } } @@ -1012,6 +1107,8 @@ static int fanotify_release(struct inode *ignored, struct file *file) */ fsnotify_group_stop_queueing(group); + fanotify_perm_watchdog_group_remove(group); + /* * Process all permission events on access_list and notification queue * and simulate reply from userspace. @@ -1465,6 +1562,10 @@ out: fsnotify_group_unlock(group); fsnotify_put_mark(fsn_mark); + + if (!ret && (mask & FANOTIFY_PERM_EVENTS)) + fanotify_perm_watchdog_group_add(group); + return ret; } @@ -1625,6 +1726,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) group->fanotify_data.f_flags = event_f_flags; init_waitqueue_head(&group->fanotify_data.access_waitq); INIT_LIST_HEAD(&group->fanotify_data.access_list); + INIT_LIST_HEAD(&group->fanotify_data.perm_grp_list); switch (class) { case FAN_CLASS_NOTIF: group->priority = FSNOTIFY_PRIO_NORMAL; @@ -1999,7 +2101,10 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, user_ns = path.mnt->mnt_sb->s_user_ns; obj = path.mnt->mnt_sb; } else if (obj_type == FSNOTIFY_OBJ_TYPE_MNTNS) { + ret = -EINVAL; mntns = mnt_ns_from_dentry(path.dentry); + if (!mntns) + goto path_put_and_out; user_ns = mntns->user_ns; obj = mntns; } diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c index 1161eabf11ee..9cc7eb863643 100644 --- a/fs/notify/fdinfo.c +++ b/fs/notify/fdinfo.c @@ -17,6 +17,7 @@ #include "fanotify/fanotify.h" #include "fdinfo.h" #include "fsnotify.h" +#include "../internal.h" #if defined(CONFIG_PROC_FS) @@ -46,7 +47,12 @@ static void show_mark_fhandle(struct seq_file *m, struct inode *inode) size = f->handle_bytes >> 2; + if (!super_trylock_shared(inode->i_sb)) + return; + ret = exportfs_encode_fid(inode, (struct fid *)f->f_handle, &size); + up_read(&inode->i_sb->s_umount); + if ((ret == FILEID_INVALID) || (ret < 0)) return; diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index cd7d11b0eb08..7c326ec2e8a8 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -10,7 +10,7 @@ * Copyright 2006 Hewlett-Packard Development Company, L.P. 
* * Copyright (C) 2009 Eric Paris <Red Hat Inc> - * inotify was largely rewriten to make use of the fsnotify infrastructure + * inotify was largely rewritten to make use of the fsnotify infrastructure */ #include <linux/dcache.h> /* d_unlinked */ diff --git a/fs/nsfs.c b/fs/nsfs.c index e7fd8a790aaa..79b026a36fb6 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -490,7 +490,9 @@ static struct dentry *nsfs_fh_to_dentry(struct super_block *sb, struct fid *fh, VFS_WARN_ON_ONCE(ns->ns_id != fid->ns_id); VFS_WARN_ON_ONCE(ns->ns_type != fid->ns_type); - VFS_WARN_ON_ONCE(ns->inum != fid->ns_inum); + + if (ns->inum != fid->ns_inum) + return NULL; if (!__ns_ref_get(ns)) return NULL; @@ -571,7 +573,7 @@ static int nsfs_export_permission(struct handle_to_path_ctx *ctx, return 0; } -static struct file *nsfs_export_open(struct path *path, unsigned int oflags) +static struct file *nsfs_export_open(const struct path *path, unsigned int oflags) { return file_open_root(path, "", oflags, 0); } diff --git a/fs/ntfs3/bitmap.c b/fs/ntfs3/bitmap.c index 04107b950717..65d05e6a0566 100644 --- a/fs/ntfs3/bitmap.c +++ b/fs/ntfs3/bitmap.c @@ -1371,6 +1371,7 @@ int wnd_extend(struct wnd_bitmap *wnd, size_t new_bits) mark_buffer_dirty(bh); unlock_buffer(bh); /* err = sync_dirty_buffer(bh); */ + put_bh(bh); b0 = 0; bits -= op; diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c index c1ece707b195..4c90ec2fa2ea 100644 --- a/fs/ntfs3/file.c +++ b/fs/ntfs3/file.c @@ -49,6 +49,30 @@ static int ntfs_ioctl_fitrim(struct ntfs_sb_info *sbi, unsigned long arg) return 0; } +static int ntfs_ioctl_get_volume_label(struct ntfs_sb_info *sbi, u8 __user *buf) +{ + if (copy_to_user(buf, sbi->volume.label, FSLABEL_MAX)) + return -EFAULT; + + return 0; +} + +static int ntfs_ioctl_set_volume_label(struct ntfs_sb_info *sbi, u8 __user *buf) +{ + u8 user[FSLABEL_MAX] = {0}; + int len; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(user, buf, FSLABEL_MAX)) + return -EFAULT; + + len = strnlen(user, FSLABEL_MAX); + + return ntfs_set_label(sbi, user, len); +} + /* * ntfs_ioctl - file_operations::unlocked_ioctl */ @@ -64,6 +88,10 @@ long ntfs_ioctl(struct file *filp, u32 cmd, unsigned long arg) switch (cmd) { case FITRIM: return ntfs_ioctl_fitrim(sbi, arg); + case FS_IOC_GETFSLABEL: + return ntfs_ioctl_get_volume_label(sbi, (u8 __user *)arg); + case FS_IOC_SETFSLABEL: + return ntfs_ioctl_set_volume_label(sbi, (u8 __user *)arg); } return -ENOTTY; /* Inappropriate ioctl for device. */ } diff --git a/fs/ntfs3/index.c b/fs/ntfs3/index.c index 1bf2a6593dec..6d1bf890929d 100644 --- a/fs/ntfs3/index.c +++ b/fs/ntfs3/index.c @@ -1508,6 +1508,16 @@ static int indx_add_allocate(struct ntfs_index *indx, struct ntfs_inode *ni, bmp_size = bmp_size_v = le32_to_cpu(bmp->res.data_size); } + /* + * Index blocks exist, but $BITMAP has zero valid bits. + * This implies an on-disk corruption and must be rejected. + */ + if (in->name == I30_NAME && + unlikely(bmp_size_v == 0 && indx->alloc_run.count)) { + err = -EINVAL; + goto out1; + } + bit = bmp_size << 3; } diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c index 48b4f73a93ee..3959f23c487a 100644 --- a/fs/ntfs3/inode.c +++ b/fs/ntfs3/inode.c @@ -471,6 +471,7 @@ end_enum: fname->home.seq == cpu_to_le16(MFT_REC_EXTEND)) { /* Records in $Extend are not a files or general directories. 
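The two ntfs3 handlers above reuse the generic FS_IOC_GETFSLABEL/FS_IOC_SETFSLABEL interface from <linux/fs.h> (fixed FSLABEL_MAX-byte buffers), so label tooling that already works on ext4 or btrfs works unchanged on ntfs3. Userspace sketch (mount point path is illustrative; setting the label needs CAP_SYS_ADMIN):

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/fs.h>

int main(void)
{
	char label[FSLABEL_MAX] = "backup";
	int fd = open("/mnt/ntfs", O_RDONLY);

	if (fd < 0)
		return 1;
	if (ioctl(fd, FS_IOC_SETFSLABEL, label))
		perror("FS_IOC_SETFSLABEL");
	if (!ioctl(fd, FS_IOC_GETFSLABEL, label))
		printf("label: %.*s\n", FSLABEL_MAX, label);
	close(fd);
	return 0;
}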
*/ inode->i_op = &ntfs_file_inode_operations; + mode = S_IFREG; } else { err = -EINVAL; goto out; diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h index 1296e6fcc779..630128716ea7 100644 --- a/fs/ntfs3/ntfs_fs.h +++ b/fs/ntfs3/ntfs_fs.h @@ -280,7 +280,7 @@ struct ntfs_sb_info { __le16 flags; // Cached current VOLUME_INFO::flags, VOLUME_FLAG_DIRTY. u8 major_ver; u8 minor_ver; - char label[256]; + char label[FSLABEL_MAX]; bool real_dirty; // Real fs state. } volume; diff --git a/fs/ntfs3/run.c b/fs/ntfs3/run.c index 6e86d66197ef..88550085f745 100644 --- a/fs/ntfs3/run.c +++ b/fs/ntfs3/run.c @@ -9,6 +9,7 @@ #include <linux/blkdev.h> #include <linux/fs.h> #include <linux/log2.h> +#include <linux/overflow.h> #include "debug.h" #include "ntfs.h" @@ -982,14 +983,18 @@ int run_unpack(struct runs_tree *run, struct ntfs_sb_info *sbi, CLST ino, if (!dlcn) return -EINVAL; - lcn = prev_lcn + dlcn; + + if (check_add_overflow(prev_lcn, dlcn, &lcn)) + return -EINVAL; prev_lcn = lcn; } else { /* The size of 'dlcn' can't be > 8. */ return -EINVAL; } - next_vcn = vcn64 + len; + if (check_add_overflow(vcn64, len, &next_vcn)) + return -EINVAL; + /* Check boundary. */ if (next_vcn > evcn + 1) return -EINVAL; @@ -1153,7 +1158,8 @@ int run_get_highest_vcn(CLST vcn, const u8 *run_buf, u64 *highest_vcn) return -EINVAL; run_buf += size_size + offset_size; - vcn64 += len; + if (check_add_overflow(vcn64, len, &vcn64)) + return -EINVAL; #ifndef CONFIG_NTFS3_64BIT_CLUSTER if (vcn64 > 0x100000000ull) diff --git a/fs/open.c b/fs/open.c index 9655158c3885..3d64372ecc67 100644 --- a/fs/open.c +++ b/fs/open.c @@ -1022,8 +1022,8 @@ cleanup_all: put_file_access(f); cleanup_file: path_put(&f->f_path); - f->f_path.mnt = NULL; - f->f_path.dentry = NULL; + f->__f_path.mnt = NULL; + f->__f_path.dentry = NULL; f->f_inode = NULL; return error; } @@ -1050,7 +1050,7 @@ int finish_open(struct file *file, struct dentry *dentry, { BUG_ON(file->f_mode & FMODE_OPENED); /* once it's opened, it's opened */ - file->f_path.dentry = dentry; + file->__f_path.dentry = dentry; return do_dentry_open(file, open); } EXPORT_SYMBOL(finish_open); @@ -1059,19 +1059,21 @@ EXPORT_SYMBOL(finish_open); * finish_no_open - finish ->atomic_open() without opening the file * * @file: file pointer - * @dentry: dentry or NULL (as returned from ->lookup()) + * @dentry: dentry, ERR_PTR(-E...) or NULL (as returned from ->lookup()) * - * This can be used to set the result of a successful lookup in ->atomic_open(). + * This can be used to set the result of a lookup in ->atomic_open(). * * NB: unlike finish_open() this function does consume the dentry reference and * the caller need not dput() it. * - * Returns "0" which must be the return value of ->atomic_open() after having - * called this function. + * Returns 0 or -E..., which must be the return value of ->atomic_open() after + * having called this function. 
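With the relaxed contract documented above, an ->atomic_open() implementation no longer has to special-case a failed lookup before handing over its result. A sketch of the calling pattern this enables (hypothetical filesystem; foofs_lookup() stands in for the fs's own lookup helper and may return a dentry, NULL, or an ERR_PTR):

static int foofs_atomic_open(struct inode *dir, struct dentry *dentry,
			     struct file *file, unsigned int open_flags,
			     umode_t mode)
{
	struct dentry *res;

	if (open_flags & O_CREAT)
		return -EINVAL;	/* creation path omitted from this sketch */

	res = foofs_lookup(dir, dentry, 0);
	/* consumes res in all three cases: dentry, NULL, or ERR_PTR */
	return finish_no_open(file, res);
}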
*/ int finish_no_open(struct file *file, struct dentry *dentry) { - file->f_path.dentry = dentry; + if (IS_ERR(dentry)) + return PTR_ERR(dentry); + file->__f_path.dentry = dentry; return 0; } EXPORT_SYMBOL(finish_no_open); @@ -1091,7 +1093,7 @@ int vfs_open(const struct path *path, struct file *file) { int ret; - file->f_path = *path; + file->__f_path = *path; ret = do_dentry_open(file, NULL); if (!ret) { /* diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c index 82395fe2b956..bec5475de094 100644 --- a/fs/orangefs/namei.c +++ b/fs/orangefs/namei.c @@ -38,8 +38,7 @@ static int orangefs_create(struct mnt_idmap *idmap, new_op->upcall.req.create.parent_refn = parent->refn; - fill_default_sys_attrs(new_op->upcall.req.create.attributes, - ORANGEFS_TYPE_METAFILE, mode); + fill_default_sys_attrs(new_op->upcall.req.create.attributes, mode); strscpy(new_op->upcall.req.create.d_name, dentry->d_name.name); @@ -240,9 +239,7 @@ static int orangefs_symlink(struct mnt_idmap *idmap, new_op->upcall.req.sym.parent_refn = parent->refn; - fill_default_sys_attrs(new_op->upcall.req.sym.attributes, - ORANGEFS_TYPE_SYMLINK, - mode); + fill_default_sys_attrs(new_op->upcall.req.sym.attributes, mode); strscpy(new_op->upcall.req.sym.entry_name, dentry->d_name.name); strscpy(new_op->upcall.req.sym.target, symname); @@ -316,8 +313,7 @@ static struct dentry *orangefs_mkdir(struct mnt_idmap *idmap, struct inode *dir, new_op->upcall.req.mkdir.parent_refn = parent->refn; - fill_default_sys_attrs(new_op->upcall.req.mkdir.attributes, - ORANGEFS_TYPE_DIRECTORY, mode); + fill_default_sys_attrs(new_op->upcall.req.mkdir.attributes, mode); strscpy(new_op->upcall.req.mkdir.d_name, dentry->d_name.name); diff --git a/fs/orangefs/orangefs-debugfs.c b/fs/orangefs/orangefs-debugfs.c index 1c375fb65018..79267b3419f2 100644 --- a/fs/orangefs/orangefs-debugfs.c +++ b/fs/orangefs/orangefs-debugfs.c @@ -440,14 +440,13 @@ static ssize_t orangefs_debug_write(struct file *file, count = ORANGEFS_MAX_DEBUG_STRING_LEN; } - buf = kzalloc(ORANGEFS_MAX_DEBUG_STRING_LEN, GFP_KERNEL); - if (!buf) - goto out; - - if (copy_from_user(buf, ubuf, count - 1)) { + buf = memdup_user_nul(ubuf, count - 1); + if (IS_ERR(buf)) { gossip_debug(GOSSIP_DEBUGFS_DEBUG, - "%s: copy_from_user failed!\n", + "%s: memdup_user_nul failed!\n", __func__); + rc = PTR_ERR(buf); + buf = NULL; goto out; } diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h index 3e153c2f6b82..29c6da43e396 100644 --- a/fs/orangefs/orangefs-kernel.h +++ b/fs/orangefs/orangefs-kernel.h @@ -462,7 +462,7 @@ int service_operation(struct orangefs_kernel_op_s *op, ((ORANGEFS_SB(inode->i_sb)->flags & ORANGEFS_OPT_INTR) ? 
\ ORANGEFS_OP_INTERRUPTIBLE : 0) -#define fill_default_sys_attrs(sys_attr, type, mode) \ +#define fill_default_sys_attrs(sys_attr, mode) \ do { \ sys_attr.owner = from_kuid(&init_user_ns, current_fsuid()); \ sys_attr.group = from_kgid(&init_user_ns, current_fsgid()); \ diff --git a/fs/orangefs/xattr.c b/fs/orangefs/xattr.c index 74ef75586f38..eee3c5ed1bbb 100644 --- a/fs/orangefs/xattr.c +++ b/fs/orangefs/xattr.c @@ -54,7 +54,9 @@ static inline int convert_to_internal_xattr_flags(int setxattr_flags) static unsigned int xattr_key(const char *key) { unsigned int i = 0; - while (key) + if (!key) + return 0; + while (*key) i += *key++; return i % 16; } @@ -175,8 +177,8 @@ ssize_t orangefs_inode_getxattr(struct inode *inode, const char *name, cx->length = -1; cx->timeout = jiffies + orangefs_getattr_timeout_msecs*HZ/1000; - hash_add(orangefs_inode->xattr_cache, &cx->node, - xattr_key(cx->key)); + hlist_add_head( &cx->node, + &orangefs_inode->xattr_cache[xattr_key(cx->key)]); } } goto out_release_op; @@ -229,8 +231,8 @@ ssize_t orangefs_inode_getxattr(struct inode *inode, const char *name, memcpy(cx->val, buffer, length); cx->length = length; cx->timeout = jiffies + HZ; - hash_add(orangefs_inode->xattr_cache, &cx->node, - xattr_key(cx->key)); + hlist_add_head(&cx->node, + &orangefs_inode->xattr_cache[xattr_key(cx->key)]); } } diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 27396fe63f6d..604a82acd164 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -178,7 +178,7 @@ static int ovl_copy_fileattr(struct inode *inode, const struct path *old, err = ovl_real_fileattr_get(old, &oldfa); if (err) { /* Ntfs-3g returns -EINVAL for "no fileattr support" */ - if (err == -EOPNOTSUPP || err == -EINVAL) + if (err == -ENOTTY || err == -EINVAL) return 0; pr_warn("failed to retrieve lower fileattr (%pd2, err=%i)\n", old->dentry, err); @@ -242,7 +242,7 @@ static int ovl_verify_area(loff_t pos, loff_t pos2, loff_t len, loff_t totlen) return 0; } -static int ovl_sync_file(struct path *path) +static int ovl_sync_file(const struct path *path) { struct file *new_file; int err; @@ -670,7 +670,7 @@ static int ovl_copy_up_metadata(struct ovl_copy_up_ctx *c, struct dentry *temp) if (err) return err; - if (inode->i_flags & OVL_COPY_I_FLAGS_MASK && + if (inode->i_flags & OVL_FATTR_I_FLAGS_MASK && (S_ISREG(c->stat.mode) || S_ISDIR(c->stat.mode))) { /* * Copy the fileattr inode flags that are the source of already diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index dbd63a74df4b..a5e9ddf3023b 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -187,6 +187,13 @@ struct dentry *ovl_create_real(struct ovl_fs *ofs, struct dentry *parent, /* mkdir is special... */ newdentry = ovl_do_mkdir(ofs, dir, newdentry, attr->mode); err = PTR_ERR_OR_ZERO(newdentry); + /* expect to inherit casefolding from workdir/upperdir */ + if (!err && ofs->casefold != ovl_dentry_casefolded(newdentry)) { + pr_warn_ratelimited("wrong inherited casefold (%pd2)\n", + newdentry); + dput(newdentry); + err = -EINVAL; + } break; case S_IFCHR: @@ -205,12 +212,32 @@ struct dentry *ovl_create_real(struct ovl_fs *ofs, struct dentry *parent, err = -EPERM; } } - if (!err && WARN_ON(!newdentry->d_inode)) { + if (err) + goto out; + + if (WARN_ON(!newdentry->d_inode)) { /* * Not quite sure if non-instantiated dentry is legal or not. * VFS doesn't seem to care so check and warn here. */ err = -EIO; + } else if (d_unhashed(newdentry)) { + struct dentry *d; + /* + * Some filesystems (i.e. 
casefolded) may return an unhashed + * negative dentry from the ovl_lookup_upper() call before + * ovl_create_real(). + * In that case, lookup again after making the newdentry + * positive, so ovl_create_upper() always returns a hashed + * positive dentry. + */ + d = ovl_lookup_upper(ofs, newdentry->d_name.name, parent, + newdentry->d_name.len); + dput(newdentry); + if (IS_ERR_OR_NULL(d)) + err = d ? PTR_ERR(d) : -ENOENT; + else + return d; } out: if (err) { diff --git a/fs/overlayfs/file.c b/fs/overlayfs/file.c index f5b8877d5fe2..7ab2c9daffd0 100644 --- a/fs/overlayfs/file.c +++ b/fs/overlayfs/file.c @@ -120,7 +120,7 @@ static bool ovl_is_real_file(const struct file *realfile, } static struct file *ovl_real_file_path(const struct file *file, - struct path *realpath) + const struct path *realpath) { struct ovl_file *of = file->private_data; struct file *realfile = of->realfile; @@ -369,11 +369,6 @@ static ssize_t ovl_write_iter(struct kiocb *iocb, struct iov_iter *iter) if (!ovl_should_sync(OVL_FS(inode->i_sb))) ifl &= ~(IOCB_DSYNC | IOCB_SYNC); - /* - * Overlayfs doesn't support deferred completions, don't copy - * this property in case it is set by the issuer. - */ - ifl &= ~IOCB_DIO_CALLER_COMP; ret = backing_file_write_iter(realfile, iter, iocb, ifl, &ctx); out_unlock: diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index ecb9f2019395..e11f310ce092 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -720,7 +720,10 @@ int ovl_real_fileattr_get(const struct path *realpath, struct file_kattr *fa) if (err) return err; - return vfs_fileattr_get(realpath->dentry, fa); + err = vfs_fileattr_get(realpath->dentry, fa); + if (err == -ENOIOCTLCMD) + err = -ENOTTY; + return err; } int ovl_fileattr_get(struct dentry *dentry, struct file_kattr *fa) @@ -1277,6 +1280,7 @@ struct inode *ovl_get_inode(struct super_block *sb, } ovl_fill_inode(inode, realinode->i_mode, realinode->i_rdev); ovl_inode_init(inode, oip, ino, fsid); + WARN_ON_ONCE(!!IS_CASEFOLDED(inode) != ofs->casefold); if (upperdentry && ovl_is_impuredir(sb, upperdentry)) ovl_set_flag(OVL_IMPURE, inode); diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index 76d6248b625e..e93bcc5727bc 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -239,13 +239,14 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d, char val; /* - * We allow filesystems that are case-folding capable but deny composing - * ovl stack from case-folded directories. If someone has enabled case - * folding on a directory on underlying layer, the warranty of the ovl - * stack is voided. + * We allow filesystems that are case-folding capable as long as the + * layers are consistently enabled in the stack, enabled for every dir + * or disabled in all dirs. If someone has modified case folding on a + * directory on underlying layer, the warranty of the ovl stack is + * voided. 
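The casefold consistency checks in this function compare the mount-wide ofs->casefold flag against a per-dentry predicate. A sketch of that predicate's assumed semantics (the in-tree ovl_dentry_casefolded() may differ in detail; the essence is IS_CASEFOLDED() on the dentry's inode):

static inline bool ovl_dentry_casefolded_sketch(struct dentry *dentry)
{
	struct inode *inode = d_inode(dentry);

	/* a negative dentry has no inode and cannot be casefolded */
	return inode && IS_CASEFOLDED(inode);
}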
*/ - if (ovl_dentry_casefolded(base)) { - warn = "case folded parent"; + if (ofs->casefold != ovl_dentry_casefolded(base)) { + warn = "parent wrong casefold"; err = -ESTALE; goto out_warn; } @@ -259,8 +260,8 @@ static int ovl_lookup_single(struct dentry *base, struct ovl_lookup_data *d, goto out_err; } - if (ovl_dentry_casefolded(this)) { - warn = "case folded child"; + if (ofs->casefold != ovl_dentry_casefolded(this)) { + warn = "child wrong casefold"; err = -EREMOTE; goto out_warn; } diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 4f84abaa0d68..c8fd5951fc5e 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -562,11 +562,11 @@ int ovl_set_metacopy_xattr(struct ovl_fs *ofs, struct dentry *d, struct ovl_metacopy *metacopy); bool ovl_is_metacopy_dentry(struct dentry *dentry); char *ovl_get_redirect_xattr(struct ovl_fs *ofs, const struct path *path, int padding); -int ovl_ensure_verity_loaded(struct path *path); +int ovl_ensure_verity_loaded(const struct path *path); int ovl_validate_verity(struct ovl_fs *ofs, - struct path *metapath, - struct path *datapath); -int ovl_get_verity_digest(struct ovl_fs *ofs, struct path *src, + const struct path *metapath, + const struct path *datapath); +int ovl_get_verity_digest(struct ovl_fs *ofs, const struct path *src, struct ovl_metacopy *metacopy); int ovl_sync_status(struct ovl_fs *ofs); @@ -820,10 +820,12 @@ struct inode *ovl_get_inode(struct super_block *sb, struct ovl_inode_params *oip); void ovl_copyattr(struct inode *to); +/* vfs fileattr flags read from overlay.protattr xattr to ovl inode */ +#define OVL_PROT_I_FLAGS_MASK (S_APPEND | S_IMMUTABLE) +/* vfs fileattr flags copied from real to ovl inode */ +#define OVL_FATTR_I_FLAGS_MASK (OVL_PROT_I_FLAGS_MASK | S_SYNC | S_NOATIME) /* vfs inode flags copied from real to ovl inode */ -#define OVL_COPY_I_FLAGS_MASK (S_SYNC | S_NOATIME | S_APPEND | S_IMMUTABLE) -/* vfs inode flags read from overlay.protattr xattr to ovl inode */ -#define OVL_PROT_I_FLAGS_MASK (S_APPEND | S_IMMUTABLE) +#define OVL_COPY_I_FLAGS_MASK (OVL_FATTR_I_FLAGS_MASK | S_CASEFOLD) /* * fileattr flags copied from lower to upper inode on copy up. diff --git a/fs/overlayfs/ovl_entry.h b/fs/overlayfs/ovl_entry.h index 4c1bae935ced..1d4828dbcf7a 100644 --- a/fs/overlayfs/ovl_entry.h +++ b/fs/overlayfs/ovl_entry.h @@ -91,6 +91,7 @@ struct ovl_fs { struct mutex whiteout_lock; /* r/o snapshot of upperdir sb's only taken on volatile mounts */ errseq_t errseq; + bool casefold; }; /* Number of lower layers, not including data-only layers */ diff --git a/fs/overlayfs/params.c b/fs/overlayfs/params.c index f4e7fff909ac..63b7346c5ee1 100644 --- a/fs/overlayfs/params.c +++ b/fs/overlayfs/params.c @@ -276,17 +276,26 @@ static int ovl_mount_dir(const char *name, struct path *path) static int ovl_mount_dir_check(struct fs_context *fc, const struct path *path, enum ovl_opt layer, const char *name, bool upper) { + bool is_casefolded = ovl_dentry_casefolded(path->dentry); struct ovl_fs_context *ctx = fc->fs_private; + struct ovl_fs *ofs = fc->s_fs_info; if (!d_is_dir(path->dentry)) return invalfc(fc, "%s is not a directory", name); /* * Allow filesystems that are case-folding capable but deny composing - * ovl stack from case-folded directories. + * ovl stack from inconsistent case-folded directories. 
*/ - if (ovl_dentry_casefolded(path->dentry)) - return invalfc(fc, "case-insensitive directory on %s not supported", name); + if (!ctx->casefold_set) { + ofs->casefold = is_casefolded; + ctx->casefold_set = true; + } + + if (ofs->casefold != is_casefolded) { + return invalfc(fc, "case-%ssensitive directory on %s is inconsistent", + is_casefolded ? "in" : "", name); + } if (ovl_dentry_weird(path->dentry)) return invalfc(fc, "filesystem on %s not supported", name); diff --git a/fs/overlayfs/params.h b/fs/overlayfs/params.h index c96d93982021..ffd53cdd8482 100644 --- a/fs/overlayfs/params.h +++ b/fs/overlayfs/params.h @@ -33,6 +33,7 @@ struct ovl_fs_context { struct ovl_opt_set set; struct ovl_fs_context_layer *lower; char *lowerdir_all; /* user provided lowerdir string */ + bool casefold_set; }; int ovl_init_fs_context(struct fs_context *fc); diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 15cb06fa0c9a..1e9792cc557b 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -27,6 +27,8 @@ struct ovl_cache_entry { bool is_upper; bool is_whiteout; bool check_xwhiteout; + const char *c_name; + int c_len; char name[]; }; @@ -45,6 +47,7 @@ struct ovl_readdir_data { struct list_head *list; struct list_head middle; struct ovl_cache_entry *first_maybe_whiteout; + struct unicode_map *map; int count; int err; bool is_upper; @@ -66,6 +69,31 @@ static struct ovl_cache_entry *ovl_cache_entry_from_node(struct rb_node *n) return rb_entry(n, struct ovl_cache_entry, node); } +static int ovl_casefold(struct ovl_readdir_data *rdd, const char *str, int len, + char **dst) +{ + const struct qstr qstr = { .name = str, .len = len }; + char *cf_name; + int cf_len; + + if (!IS_ENABLED(CONFIG_UNICODE) || !rdd->map || is_dot_dotdot(str, len)) + return 0; + + cf_name = kmalloc(NAME_MAX, GFP_KERNEL); + if (!cf_name) { + rdd->err = -ENOMEM; + return -ENOMEM; + } + + cf_len = utf8_casefold(rdd->map, &qstr, cf_name, NAME_MAX); + if (cf_len > 0) + *dst = cf_name; + else + kfree(cf_name); + + return cf_len; +} + static bool ovl_cache_entry_find_link(const char *name, int len, struct rb_node ***link, struct rb_node **parent) @@ -79,10 +107,10 @@ static bool ovl_cache_entry_find_link(const char *name, int len, *parent = *newp; tmp = ovl_cache_entry_from_node(*newp); - cmp = strncmp(name, tmp->name, len); + cmp = strncmp(name, tmp->c_name, len); if (cmp > 0) newp = &tmp->node.rb_right; - else if (cmp < 0 || len < tmp->len) + else if (cmp < 0 || len < tmp->c_len) newp = &tmp->node.rb_left; else found = true; @@ -101,10 +129,10 @@ static struct ovl_cache_entry *ovl_cache_entry_find(struct rb_root *root, while (node) { struct ovl_cache_entry *p = ovl_cache_entry_from_node(node); - cmp = strncmp(name, p->name, len); + cmp = strncmp(name, p->c_name, len); if (cmp > 0) node = p->node.rb_right; - else if (cmp < 0 || len < p->len) + else if (cmp < 0 || len < p->c_len) node = p->node.rb_left; else return p; @@ -145,6 +173,7 @@ static bool ovl_calc_d_ino(struct ovl_readdir_data *rdd, static struct ovl_cache_entry *ovl_cache_entry_new(struct ovl_readdir_data *rdd, const char *name, int len, + const char *c_name, int c_len, u64 ino, unsigned int d_type) { struct ovl_cache_entry *p; @@ -167,6 +196,14 @@ static struct ovl_cache_entry *ovl_cache_entry_new(struct ovl_readdir_data *rdd, /* Defer check for overlay.whiteout to ovl_iterate() */ p->check_xwhiteout = rdd->in_xwhiteouts_dir && d_type == DT_REG; + if (c_name && c_name != name) { + p->c_name = c_name; + p->c_len = c_len; + } else { + p->c_name = p->name; + 
p->c_len = len; + } + if (d_type == DT_CHR) { p->next_maybe_whiteout = rdd->first_maybe_whiteout; rdd->first_maybe_whiteout = p; @@ -174,48 +211,62 @@ static struct ovl_cache_entry *ovl_cache_entry_new(struct ovl_readdir_data *rdd, return p; } -static bool ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd, - const char *name, int len, u64 ino, +/* Return 0 for found, 1 for added, <0 for error */ +static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd, + const char *name, int len, + const char *c_name, int c_len, + u64 ino, unsigned int d_type) { struct rb_node **newp = &rdd->root->rb_node; struct rb_node *parent = NULL; struct ovl_cache_entry *p; - if (ovl_cache_entry_find_link(name, len, &newp, &parent)) - return true; + if (ovl_cache_entry_find_link(c_name, c_len, &newp, &parent)) + return 0; - p = ovl_cache_entry_new(rdd, name, len, ino, d_type); + p = ovl_cache_entry_new(rdd, name, len, c_name, c_len, ino, d_type); if (p == NULL) { rdd->err = -ENOMEM; - return false; + return -ENOMEM; } list_add_tail(&p->l_node, rdd->list); rb_link_node(&p->node, parent, newp); rb_insert_color(&p->node, rdd->root); - return true; + return 1; } -static bool ovl_fill_lowest(struct ovl_readdir_data *rdd, +/* Return 0 for found, 1 for added, <0 for error */ +static int ovl_fill_lowest(struct ovl_readdir_data *rdd, const char *name, int namelen, + const char *c_name, int c_len, loff_t offset, u64 ino, unsigned int d_type) { struct ovl_cache_entry *p; - p = ovl_cache_entry_find(rdd->root, name, namelen); + p = ovl_cache_entry_find(rdd->root, c_name, c_len); if (p) { list_move_tail(&p->l_node, &rdd->middle); + return 0; } else { - p = ovl_cache_entry_new(rdd, name, namelen, ino, d_type); + p = ovl_cache_entry_new(rdd, name, namelen, c_name, c_len, + ino, d_type); if (p == NULL) rdd->err = -ENOMEM; else list_add_tail(&p->l_node, &rdd->middle); } - return rdd->err == 0; + return rdd->err ?: 1; +} + +static void ovl_cache_entry_free(struct ovl_cache_entry *p) +{ + if (p->c_name != p->name) + kfree(p->c_name); + kfree(p); } void ovl_cache_free(struct list_head *list) @@ -224,7 +275,7 @@ void ovl_cache_free(struct list_head *list) struct ovl_cache_entry *n; list_for_each_entry_safe(p, n, list, l_node) - kfree(p); + ovl_cache_entry_free(p); INIT_LIST_HEAD(list); } @@ -260,12 +311,39 @@ static bool ovl_fill_merge(struct dir_context *ctx, const char *name, { struct ovl_readdir_data *rdd = container_of(ctx, struct ovl_readdir_data, ctx); + struct ovl_fs *ofs = OVL_FS(rdd->dentry->d_sb); + const char *c_name = NULL; + char *cf_name = NULL; + int c_len = 0, ret; + + if (ofs->casefold) + c_len = ovl_casefold(rdd, name, namelen, &cf_name); + + if (rdd->err) + return false; + + if (c_len <= 0) { + c_name = name; + c_len = namelen; + } else { + c_name = cf_name; + } rdd->count++; if (!rdd->is_lowest) - return ovl_cache_entry_add_rb(rdd, name, namelen, ino, d_type); + ret = ovl_cache_entry_add_rb(rdd, name, namelen, c_name, c_len, ino, d_type); else - return ovl_fill_lowest(rdd, name, namelen, offset, ino, d_type); + ret = ovl_fill_lowest(rdd, name, namelen, c_name, c_len, offset, ino, d_type); + + /* + * If ret == 1, that means that c_name is being used as part of struct + * ovl_cache_entry and will be freed at ovl_cache_free(). Otherwise, + * c_name was found in the rb-tree so we can free it here. 
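Keying the merge tree on c_name is what lets two layers' different case spellings of one name collapse into a single merged entry. An illustration of the comparison the cache effectively performs (assumes CONFIG_UNICODE and a loaded utf8 map, as in ovl_casefold() above):

static bool ovl_names_collide(struct unicode_map *map,
			      const char *a, int alen,
			      const char *b, int blen)
{
	const struct qstr qa = QSTR_INIT(a, alen);
	const struct qstr qb = QSTR_INIT(b, blen);
	unsigned char cf_a[NAME_MAX], cf_b[NAME_MAX];
	ssize_t la = utf8_casefold(map, &qa, cf_a, NAME_MAX);
	ssize_t lb = utf8_casefold(map, &qb, cf_b, NAME_MAX);

	/* e.g. "Foo" in one layer and "foo" in another fold to one key */
	return la > 0 && la == lb && !memcmp(cf_a, cf_b, la);
}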
+ */ + if (ret != 1 && c_name != name) + kfree(c_name); + + return ret >= 0; } static int ovl_check_whiteouts(const struct path *path, struct ovl_readdir_data *rdd) @@ -357,12 +435,18 @@ static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list, .list = list, .root = root, .is_lowest = false, + .map = NULL, }; int idx, next; const struct ovl_layer *layer; + struct ovl_fs *ofs = OVL_FS(dentry->d_sb); for (idx = 0; idx != -1; idx = next) { next = ovl_path_next(idx, dentry, &realpath, &layer); + + if (ofs->casefold) + rdd.map = sb_encoding(realpath.dentry->d_sb); + rdd.is_upper = ovl_dentry_upper(dentry) == realpath.dentry; rdd.in_xwhiteouts_dir = layer->has_xwhiteouts && ovl_dentry_has_xwhiteouts(dentry); @@ -555,7 +639,7 @@ static bool ovl_fill_plain(struct dir_context *ctx, const char *name, container_of(ctx, struct ovl_readdir_data, ctx); rdd->count++; - p = ovl_cache_entry_new(rdd, name, namelen, ino, d_type); + p = ovl_cache_entry_new(rdd, name, namelen, NULL, 0, ino, d_type); if (p == NULL) { rdd->err = -ENOMEM; return false; @@ -595,7 +679,7 @@ static int ovl_dir_read_impure(const struct path *path, struct list_head *list, } if (p->ino == p->real_ino) { list_del(&p->l_node); - kfree(p); + ovl_cache_entry_free(p); } else { struct rb_node **newp = &root->rb_node; struct rb_node *parent = NULL; @@ -1023,7 +1107,7 @@ int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list) del_entry: list_del(&p->l_node); - kfree(p); + ovl_cache_entry_free(p); } return err; diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index bd3d7ba8fb95..43ee4c7296a7 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -161,6 +161,16 @@ static const struct dentry_operations ovl_dentry_operations = { .d_weak_revalidate = ovl_dentry_weak_revalidate, }; +#if IS_ENABLED(CONFIG_UNICODE) +static const struct dentry_operations ovl_dentry_ci_operations = { + .d_real = ovl_d_real, + .d_revalidate = ovl_dentry_revalidate, + .d_weak_revalidate = ovl_dentry_weak_revalidate, + .d_hash = generic_ci_d_hash, + .d_compare = generic_ci_d_compare, +}; +#endif + static struct kmem_cache *ovl_inode_cachep; static struct inode *ovl_alloc_inode(struct super_block *sb) @@ -394,7 +404,7 @@ static int ovl_check_namelen(const struct path *path, struct ovl_fs *ofs, return err; } -static int ovl_lower_dir(const char *name, struct path *path, +static int ovl_lower_dir(const char *name, const struct path *path, struct ovl_fs *ofs, int *stack_depth) { int fh_type; @@ -991,6 +1001,25 @@ static int ovl_get_data_fsid(struct ovl_fs *ofs) return ofs->numfs; } +/* + * Set the ovl sb encoding as the same one used by the first layer + */ +static int ovl_set_encoding(struct super_block *sb, struct super_block *fs_sb) +{ + if (!sb_has_encoding(fs_sb)) + return 0; + +#if IS_ENABLED(CONFIG_UNICODE) + if (sb_has_strict_encoding(fs_sb)) { + pr_err("strict encoding not supported\n"); + return -EINVAL; + } + + sb->s_encoding = fs_sb->s_encoding; + sb->s_encoding_flags = fs_sb->s_encoding_flags; +#endif + return 0; +} static int ovl_get_layers(struct super_block *sb, struct ovl_fs *ofs, struct ovl_fs_context *ctx, struct ovl_layer *layers) @@ -1024,6 +1053,12 @@ static int ovl_get_layers(struct super_block *sb, struct ovl_fs *ofs, if (ovl_upper_mnt(ofs)) { ofs->fs[0].sb = ovl_upper_mnt(ofs)->mnt_sb; ofs->fs[0].is_lower = false; + + if (ofs->casefold) { + err = ovl_set_encoding(sb, ofs->fs[0].sb); + if (err) + return err; + } } nr_merged_lower = ctx->nr - ctx->nr_data; @@ -1083,6 +1118,19 @@ static int 
ovl_get_layers(struct super_block *sb, struct ovl_fs *ofs, l->name = NULL; ofs->numlayer++; ofs->fs[fsid].is_lower = true; + + if (ofs->casefold) { + if (!ovl_upper_mnt(ofs) && !sb_has_encoding(sb)) { + err = ovl_set_encoding(sb, ofs->fs[fsid].sb); + if (err) + return err; + } + + if (!sb_same_encoding(sb, mnt->mnt_sb)) { + pr_err("all layers must have the same encoding\n"); + return -EINVAL; + } + } } /* @@ -1300,6 +1348,7 @@ static struct dentry *ovl_get_root(struct super_block *sb, ovl_dentry_set_flag(OVL_E_CONNECTED, root); ovl_set_upperdata(d_inode(root)); ovl_inode_init(d_inode(root), &oip, ino, fsid); + WARN_ON(!!IS_CASEFOLDED(d_inode(root)) != ofs->casefold); ovl_dentry_init_flags(root, upperdentry, oe, DCACHE_OP_WEAK_REVALIDATE); /* root keeps a reference of upperdentry */ dget(upperdentry); @@ -1307,6 +1356,19 @@ static struct dentry *ovl_get_root(struct super_block *sb, return root; } +static void ovl_set_d_op(struct super_block *sb) +{ +#if IS_ENABLED(CONFIG_UNICODE) + struct ovl_fs *ofs = sb->s_fs_info; + + if (ofs->casefold) { + set_default_d_op(sb, &ovl_dentry_ci_operations); + return; + } +#endif + set_default_d_op(sb, &ovl_dentry_operations); +} + int ovl_fill_super(struct super_block *sb, struct fs_context *fc) { struct ovl_fs *ofs = sb->s_fs_info; @@ -1322,7 +1384,7 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc) if (WARN_ON(fc->user_ns != current_user_ns())) goto out_err; - set_default_d_op(sb, &ovl_dentry_operations); + ovl_set_d_op(sb); err = -ENOMEM; if (!ofs->creator_cred) diff --git a/fs/overlayfs/util.c b/fs/overlayfs/util.c index 41033bac96cb..f76672f2e686 100644 --- a/fs/overlayfs/util.c +++ b/fs/overlayfs/util.c @@ -210,11 +210,11 @@ bool ovl_dentry_weird(struct dentry *dentry) return true; /* - * Allow filesystems that are case-folding capable but deny composing - * ovl stack from case-folded directories. 
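ovl_set_encoding() and the per-layer checks above lean on the sb_has_encoding()/sb_same_encoding() superblock helpers; the semantics assumed here (a sketch, the fs.h definitions may differ in detail) are plain pointer equality of the layers' unicode maps:

static inline bool sb_same_encoding_sketch(const struct super_block *sb1,
					   const struct super_block *sb2)
{
#if IS_ENABLED(CONFIG_UNICODE)
	/* one shared unicode map implies identical encoding and version */
	return sb1->s_encoding == sb2->s_encoding;
#else
	return true;
#endif
}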
+ * Exceptionally for layers with casefold, we accept that they have + * their own hash and compare operations */ if (sb_has_encoding(dentry->d_sb)) - return IS_CASEFOLDED(d_inode(dentry)); + return false; return dentry->d_flags & (DCACHE_OP_HASH | DCACHE_OP_COMPARE); } @@ -1381,7 +1381,7 @@ err_free: } /* Call with mounter creds as it may open the file */ -int ovl_ensure_verity_loaded(struct path *datapath) +int ovl_ensure_verity_loaded(const struct path *datapath) { struct inode *inode = d_inode(datapath->dentry); struct file *filp; @@ -1401,8 +1401,8 @@ int ovl_ensure_verity_loaded(struct path *datapath) } int ovl_validate_verity(struct ovl_fs *ofs, - struct path *metapath, - struct path *datapath) + const struct path *metapath, + const struct path *datapath) { struct ovl_metacopy metacopy_data; u8 actual_digest[FS_VERITY_MAX_DIGEST_SIZE]; @@ -1455,7 +1455,7 @@ int ovl_validate_verity(struct ovl_fs *ofs, return 0; } -int ovl_get_verity_digest(struct ovl_fs *ofs, struct path *src, +int ovl_get_verity_digest(struct ovl_fs *ofs, const struct path *src, struct ovl_metacopy *metacopy) { int err, digest_size; diff --git a/fs/pidfs.c b/fs/pidfs.c index 44a95cd27377..0ef5b47d796a 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -850,7 +850,7 @@ static int pidfs_export_permission(struct handle_to_path_ctx *ctx, return 0; } -static struct file *pidfs_export_open(struct path *path, unsigned int oflags) +static struct file *pidfs_export_open(const struct path *path, unsigned int oflags) { /* * Clear O_LARGEFILE as open_by_handle_at() forces it and raise diff --git a/fs/pnode.c b/fs/pnode.c index 6f7d02f3fa98..5d91c3e58d2a 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -29,6 +29,7 @@ static inline struct mount *next_slave(struct mount *p) return hlist_entry(p->mnt_slave.next, struct mount, mnt_slave); } +/* locks: namespace_shared && is_mounted(mnt) */ static struct mount *get_peer_under_root(struct mount *mnt, struct mnt_namespace *ns, const struct path *root) @@ -50,7 +51,7 @@ static struct mount *get_peer_under_root(struct mount *mnt, * Get ID of closest dominating peer group having a representative * under the given root. 
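The propagation rework below (trace_transfers()/bulk_make_private()) converts a whole set of mounts to private in one pass; for orientation, the userspace-visible form of a single such conversion is the classic recursive-private remount, using only long-standing mount(2) flags:

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* equivalent of `mount --make-rprivate /` */
	if (mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, NULL)) {
		perror("make-rprivate");
		return 1;
	}
	return 0;
}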
* - * Caller must hold namespace_sem + * locks: namespace_shared */ int get_dominating_id(struct mount *mnt, const struct path *root) { @@ -70,19 +71,6 @@ static inline bool will_be_unmounted(struct mount *m) return m->mnt.mnt_flags & MNT_UMOUNT; } -static struct mount *propagation_source(struct mount *mnt) -{ - do { - struct mount *m; - for (m = next_peer(mnt); m != mnt; m = next_peer(m)) { - if (!will_be_unmounted(m)) - return m; - } - mnt = mnt->mnt_master; - } while (mnt && will_be_unmounted(mnt)); - return mnt; -} - static void transfer_propagation(struct mount *mnt, struct mount *to) { struct hlist_node *p = NULL, *n; @@ -111,11 +99,10 @@ void change_mnt_propagation(struct mount *mnt, int type) return; } if (IS_MNT_SHARED(mnt)) { - if (type == MS_SLAVE || !hlist_empty(&mnt->mnt_slave_list)) - m = propagation_source(mnt); if (list_empty(&mnt->mnt_share)) { mnt_release_group_id(mnt); } else { + m = next_peer(mnt); list_del_init(&mnt->mnt_share); mnt->mnt_group_id = 0; } @@ -136,6 +123,57 @@ void change_mnt_propagation(struct mount *mnt, int type) } } +static struct mount *trace_transfers(struct mount *m) +{ + while (1) { + struct mount *next = next_peer(m); + + if (next != m) { + list_del_init(&m->mnt_share); + m->mnt_group_id = 0; + m->mnt_master = next; + } else { + if (IS_MNT_SHARED(m)) + mnt_release_group_id(m); + next = m->mnt_master; + } + hlist_del_init(&m->mnt_slave); + CLEAR_MNT_SHARED(m); + SET_MNT_MARK(m); + + if (!next || !will_be_unmounted(next)) + return next; + if (IS_MNT_MARKED(next)) + return next->mnt_master; + m = next; + } +} + +static void set_destinations(struct mount *m, struct mount *master) +{ + struct mount *next; + + while ((next = m->mnt_master) != master) { + m->mnt_master = master; + m = next; + } +} + +void bulk_make_private(struct list_head *set) +{ + struct mount *m; + + list_for_each_entry(m, set, mnt_list) + if (!IS_MNT_MARKED(m)) + set_destinations(m, trace_transfers(m)); + + list_for_each_entry(m, set, mnt_list) { + transfer_propagation(m, m->mnt_master); + m->mnt_master = NULL; + CLEAR_MNT_MARK(m); + } +} + static struct mount *__propagation_next(struct mount *m, struct mount *origin) { @@ -304,9 +342,8 @@ int propagate_mnt(struct mount *dest_mnt, struct mountpoint *dest_mp, err = PTR_ERR(this); break; } - read_seqlock_excl(&mount_lock); - mnt_set_mountpoint(n, dest_mp, this); - read_sequnlock_excl(&mount_lock); + scoped_guard(mount_locked_reader) + mnt_set_mountpoint(n, dest_mp, this); if (n->mnt_master) SET_MNT_MARK(n->mnt_master); copy = this; diff --git a/fs/pnode.h b/fs/pnode.h index 00ab153e3e9d..b029db225f33 100644 --- a/fs/pnode.h +++ b/fs/pnode.h @@ -42,6 +42,7 @@ static inline bool peers(const struct mount *m1, const struct mount *m2) } void change_mnt_propagation(struct mount *, int); +void bulk_make_private(struct list_head *); int propagate_mnt(struct mount *, struct mountpoint *, struct mount *, struct hlist_head *); void propagate_umount(struct list_head *); diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index afa15a214538..6c4a6ee1fa2b 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -162,6 +162,9 @@ static struct quota_module_name module_names[] = INIT_QUOTA_MODULE_NAMES; /* SLAB cache for dquot structures */ static struct kmem_cache *dquot_cachep; +/* workqueue for work quota_release_work*/ +static struct workqueue_struct *quota_unbound_wq; + void register_quota_format(struct quota_format_type *fmt) { spin_lock(&dq_list_lock); @@ -881,7 +884,7 @@ void dqput(struct dquot *dquot) put_releasing_dquots(dquot); 
atomic_dec(&dquot->dq_count); spin_unlock(&dq_list_lock); - queue_delayed_work(system_dfl_wq, &quota_release_work, 1); + queue_delayed_work(quota_unbound_wq, &quota_release_work, 1); } EXPORT_SYMBOL(dqput); @@ -3041,6 +3044,11 @@ static int __init dquot_init(void) shrinker_register(dqcache_shrinker); + quota_unbound_wq = alloc_workqueue("quota_events_unbound", + WQ_UNBOUND | WQ_MEM_RECLAIM, WQ_MAX_ACTIVE); + if (!quota_unbound_wq) + panic("Cannot create quota_unbound_wq\n"); + return 0; } fs_initcall(dquot_init); diff --git a/fs/smb/client/Kconfig b/fs/smb/client/Kconfig index 9f05f94e265a..17bd368574e9 100644 --- a/fs/smb/client/Kconfig +++ b/fs/smb/client/Kconfig @@ -5,16 +5,16 @@ config CIFS select NLS select NLS_UCS2_UTILS select CRYPTO - select CRYPTO_MD5 - select CRYPTO_SHA256 - select CRYPTO_SHA512 select CRYPTO_CMAC - select CRYPTO_HMAC select CRYPTO_AEAD2 select CRYPTO_CCM select CRYPTO_GCM select CRYPTO_ECB select CRYPTO_AES + select CRYPTO_LIB_ARC4 + select CRYPTO_LIB_MD5 + select CRYPTO_LIB_SHA256 + select CRYPTO_LIB_SHA512 select KEYS select DNS_RESOLVER select ASN1 diff --git a/fs/smb/client/cached_dir.c b/fs/smb/client/cached_dir.c index b69daeb1301b..b8ac7b7faf61 100644 --- a/fs/smb/client/cached_dir.c +++ b/fs/smb/client/cached_dir.c @@ -36,9 +36,8 @@ static struct cached_fid *find_or_create_cached_dir(struct cached_fids *cfids, * fully cached or it may be in the process of * being deleted due to a lease break. */ - if (!cfid->time || !cfid->has_lease) { + if (!is_valid_cached_dir(cfid)) return NULL; - } kref_get(&cfid->refcount); return cfid; } @@ -194,7 +193,7 @@ replay_again: * Otherwise, it is either a new entry or laundromat worker removed it * from @cfids->entries. Caller will put last reference if the latter. */ - if (cfid->has_lease && cfid->time) { + if (is_valid_cached_dir(cfid)) { cfid->last_access_time = jiffies; spin_unlock(&cfids->cfid_list_lock); *ret_cfid = cfid; @@ -233,7 +232,7 @@ replay_again: list_for_each_entry(parent_cfid, &cfids->entries, entry) { if (parent_cfid->dentry == dentry->d_parent) { cifs_dbg(FYI, "found a parent cached file handle\n"); - if (parent_cfid->has_lease && parent_cfid->time) { + if (is_valid_cached_dir(parent_cfid)) { lease_flags |= SMB2_LEASE_FLAG_PARENT_LEASE_KEY_SET_LE; memcpy(pfid->parent_lease_key, @@ -417,12 +416,18 @@ int open_cached_dir_by_dentry(struct cifs_tcon *tcon, if (cfids == NULL) return -EOPNOTSUPP; + if (!dentry) + return -ENOENT; + spin_lock(&cfids->cfid_list_lock); list_for_each_entry(cfid, &cfids->entries, entry) { - if (dentry && cfid->dentry == dentry) { + if (cfid->dentry == dentry) { + if (!is_valid_cached_dir(cfid)) + break; cifs_dbg(FYI, "found a cached file handle by dentry\n"); kref_get(&cfid->refcount); *ret_cfid = cfid; + cfid->last_access_time = jiffies; spin_unlock(&cfids->cfid_list_lock); return 0; } @@ -522,10 +527,9 @@ void close_all_cached_dirs(struct cifs_sb_info *cifs_sb) spin_unlock(&cifs_sb->tlink_tree_lock); goto done; } - spin_lock(&cfid->fid_lock); + tmp_list->dentry = cfid->dentry; cfid->dentry = NULL; - spin_unlock(&cfid->fid_lock); list_add_tail(&tmp_list->entry, &entry); } @@ -558,8 +562,8 @@ void invalidate_all_cached_dirs(struct cifs_tcon *tcon) /* * Mark all the cfids as closed, and move them to the cfids->dying list. - * They'll be cleaned up later by cfids_invalidation_worker. Take - * a reference to each cfid during this process. + * They'll be cleaned up by laundromat. Take a reference to each cfid + * during this process. 
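With the dedicated invalidation worker gone, invalidate_all_cached_dirs() reuses the laundromat through the standard delayed-work kick-and-drain idiom, restated here for clarity (the same two calls appear in the hunk that follows):

static void kick_and_drain_laundromat(struct cached_fids *cfids)
{
	/* run the scan immediately, overriding the usual timeout delay */
	mod_delayed_work(cfid_put_wq, &cfids->laundromat_work, 0);
	/* wait for that pass to complete before returning */
	flush_delayed_work(&cfids->laundromat_work);
}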
*/ spin_lock(&cfids->cfid_list_lock); list_for_each_entry_safe(cfid, q, &cfids->entries, entry) { @@ -576,12 +580,11 @@ void invalidate_all_cached_dirs(struct cifs_tcon *tcon) } else kref_get(&cfid->refcount); } - /* - * Queue dropping of the dentries once locks have been dropped - */ - if (!list_empty(&cfids->dying)) - queue_work(cfid_put_wq, &cfids->invalidation_work); spin_unlock(&cfids->cfid_list_lock); + + /* run laundromat unconditionally now as there might have been previously queued work */ + mod_delayed_work(cfid_put_wq, &cfids->laundromat_work, 0); + flush_delayed_work(&cfids->laundromat_work); } static void @@ -608,14 +611,9 @@ static void cached_dir_put_work(struct work_struct *work) { struct cached_fid *cfid = container_of(work, struct cached_fid, put_work); - struct dentry *dentry; - - spin_lock(&cfid->fid_lock); - dentry = cfid->dentry; + dput(cfid->dentry); cfid->dentry = NULL; - spin_unlock(&cfid->fid_lock); - dput(dentry); queue_work(serverclose_wq, &cfid->close_work); } @@ -673,7 +671,6 @@ static struct cached_fid *init_cached_dir(const char *path) INIT_LIST_HEAD(&cfid->entry); INIT_LIST_HEAD(&cfid->dirents.entries); mutex_init(&cfid->dirents.de_mutex); - spin_lock_init(&cfid->fid_lock); kref_init(&cfid->refcount); return cfid; } @@ -697,40 +694,38 @@ static void free_cached_dir(struct cached_fid *cfid) kfree(dirent); } + /* adjust tcon-level counters and reset per-dir accounting */ + if (cfid->cfids) { + if (cfid->dirents.entries_count) + atomic_long_sub((long)cfid->dirents.entries_count, + &cfid->cfids->total_dirents_entries); + if (cfid->dirents.bytes_used) { + atomic64_sub((long long)cfid->dirents.bytes_used, + &cfid->cfids->total_dirents_bytes); + atomic64_sub((long long)cfid->dirents.bytes_used, + &cifs_dircache_bytes_used); + } + } + cfid->dirents.entries_count = 0; + cfid->dirents.bytes_used = 0; + kfree(cfid->path); cfid->path = NULL; kfree(cfid); } -static void cfids_invalidation_worker(struct work_struct *work) -{ - struct cached_fids *cfids = container_of(work, struct cached_fids, - invalidation_work); - struct cached_fid *cfid, *q; - LIST_HEAD(entry); - - spin_lock(&cfids->cfid_list_lock); - /* move cfids->dying to the local list */ - list_cut_before(&entry, &cfids->dying, &cfids->dying); - spin_unlock(&cfids->cfid_list_lock); - - list_for_each_entry_safe(cfid, q, &entry, entry) { - list_del(&cfid->entry); - /* Drop the ref-count acquired in invalidate_all_cached_dirs */ - kref_put(&cfid->refcount, smb2_close_cached_fid); - } -} - static void cfids_laundromat_worker(struct work_struct *work) { struct cached_fids *cfids; struct cached_fid *cfid, *q; - struct dentry *dentry; LIST_HEAD(entry); cfids = container_of(work, struct cached_fids, laundromat_work.work); spin_lock(&cfids->cfid_list_lock); + /* move cfids->dying to the local list */ + list_cut_before(&entry, &cfids->dying, &cfids->dying); + list_for_each_entry_safe(cfid, q, &cfids->entries, entry) { if (cfid->last_access_time && time_after(jiffies, cfid->last_access_time + HZ * dir_cache_timeout)) { @@ -752,12 +747,9 @@ static void cfids_laundromat_worker(struct work_struct *work) list_for_each_entry_safe(cfid, q, &entry, entry) { list_del(&cfid->entry); - spin_lock(&cfid->fid_lock); - dentry = cfid->dentry; + dput(cfid->dentry); cfid->dentry = NULL; - spin_unlock(&cfid->fid_lock); - dput(dentry); if (cfid->is_open) { spin_lock(&cifs_tcp_ses_lock); ++cfid->tcon->tc_count; @@ -787,11 +779,13 @@ struct cached_fids *init_cached_dirs(void) INIT_LIST_HEAD(&cfids->entries); INIT_LIST_HEAD(&cfids->dying); - 
INIT_WORK(&cfids->invalidation_work, cfids_invalidation_worker); INIT_DELAYED_WORK(&cfids->laundromat_work, cfids_laundromat_worker); queue_delayed_work(cfid_put_wq, &cfids->laundromat_work, dir_cache_timeout * HZ); + atomic_long_set(&cfids->total_dirents_entries, 0); + atomic64_set(&cfids->total_dirents_bytes, 0); + return cfids; } @@ -808,7 +802,6 @@ void free_cached_dirs(struct cached_fids *cfids) return; cancel_delayed_work_sync(&cfids->laundromat_work); - cancel_work_sync(&cfids->invalidation_work); spin_lock(&cfids->cfid_list_lock); list_for_each_entry_safe(cfid, q, &cfids->entries, entry) { diff --git a/fs/smb/client/cached_dir.h b/fs/smb/client/cached_dir.h index 46b5a2fdf15b..1e383db7c337 100644 --- a/fs/smb/client/cached_dir.h +++ b/fs/smb/client/cached_dir.h @@ -27,6 +27,9 @@ struct cached_dirents { struct mutex de_mutex; loff_t pos; /* Expected ctx->pos */ struct list_head entries; + /* accounting for cached entries in this directory */ + unsigned long entries_count; + unsigned long bytes_used; }; struct cached_fid { @@ -41,7 +44,6 @@ struct cached_fid { unsigned long last_access_time; /* jiffies of when last accessed */ struct kref refcount; struct cifs_fid fid; - spinlock_t fid_lock; struct cifs_tcon *tcon; struct dentry *dentry; struct work_struct put_work; @@ -60,10 +62,21 @@ struct cached_fids { int num_entries; struct list_head entries; struct list_head dying; - struct work_struct invalidation_work; struct delayed_work laundromat_work; + /* aggregate accounting for all cached dirents under this tcon */ + atomic_long_t total_dirents_entries; + atomic64_t total_dirents_bytes; }; +/* Module-wide directory cache accounting (defined in cifsfs.c) */ +extern atomic64_t cifs_dircache_bytes_used; /* bytes across all mounts */ + +static inline bool +is_valid_cached_dir(struct cached_fid *cfid) +{ + return cfid->time && cfid->has_lease; +} + extern struct cached_fids *init_cached_dirs(void); extern void free_cached_dirs(struct cached_fids *cfids); extern int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, diff --git a/fs/smb/client/cifs_debug.c b/fs/smb/client/cifs_debug.c index 35c4d27d2cc0..1fb71d2d31b5 100644 --- a/fs/smb/client/cifs_debug.c +++ b/fs/smb/client/cifs_debug.c @@ -240,14 +240,18 @@ static int cifs_debug_files_proc_show(struct seq_file *m, void *v) struct cifs_ses *ses; struct cifs_tcon *tcon; struct cifsFileInfo *cfile; + struct inode *inode; + struct cifsInodeInfo *cinode; + char lease[4]; + int n; seq_puts(m, "# Version:1\n"); seq_puts(m, "# Format:\n"); seq_puts(m, "# <tree id> <ses id> <persistent fid> <flags> <count> <pid> <uid>"); #ifdef CONFIG_CIFS_DEBUG2 - seq_printf(m, " <filename> <mid>\n"); + seq_puts(m, " <filename> <lease> <mid>\n"); #else - seq_printf(m, " <filename>\n"); + seq_puts(m, " <filename> <lease>\n"); #endif /* CIFS_DEBUG2 */ spin_lock(&cifs_tcp_ses_lock); list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) { @@ -267,11 +271,30 @@ static int cifs_debug_files_proc_show(struct seq_file *m, void *v) cfile->pid, from_kuid(&init_user_ns, cfile->uid), cfile->dentry); + + /* Append lease/oplock caching state as RHW letters */ + inode = d_inode(cfile->dentry); + n = 0; + if (inode) { + cinode = CIFS_I(inode); + if (CIFS_CACHE_READ(cinode)) + lease[n++] = 'R'; + if (CIFS_CACHE_HANDLE(cinode)) + lease[n++] = 'H'; + if (CIFS_CACHE_WRITE(cinode)) + lease[n++] = 'W'; + } + lease[n] = '\0'; + seq_puts(m, " "); + if (n) + seq_printf(m, "%s", lease); + else + seq_puts(m, "NONE"); + #ifdef CONFIG_CIFS_DEBUG2 - seq_printf(m, " %llu\n", 
cfile->fid.mid); -#else + seq_printf(m, " %llu", cfile->fid.mid); +#endif /* CONFIG_CIFS_DEBUG2 */ seq_printf(m, "\n"); -#endif /* CIFS_DEBUG2 */ } spin_unlock(&tcon->open_file_lock); } @@ -308,7 +331,10 @@ static int cifs_debug_dirs_proc_show(struct seq_file *m, void *v) if (!cfids) continue; spin_lock(&cfids->cfid_list_lock); /* check lock ordering */ - seq_printf(m, "Num entries: %d\n", cfids->num_entries); + seq_printf(m, "Num entries: %d, cached_dirents: %lu entries, %llu bytes\n", + cfids->num_entries, + (unsigned long)atomic_long_read(&cfids->total_dirents_entries), + (unsigned long long)atomic64_read(&cfids->total_dirents_bytes)); list_for_each_entry(cfid, &cfids->entries, entry) { seq_printf(m, "0x%x 0x%llx 0x%llx %s", tcon->tid, @@ -319,6 +345,9 @@ static int cifs_debug_dirs_proc_show(struct seq_file *m, void *v) seq_printf(m, "\tvalid file info"); if (cfid->dirents.is_valid) seq_printf(m, ", valid dirents"); + if (!list_empty(&cfid->dirents.entries)) + seq_printf(m, ", dirents: %lu entries, %lu bytes", + cfid->dirents.entries_count, cfid->dirents.bytes_used); seq_printf(m, "\n"); } spin_unlock(&cfids->cfid_list_lock); diff --git a/fs/smb/client/cifs_spnego.c b/fs/smb/client/cifs_spnego.c index 43b86fa4d695..9891f55bac1e 100644 --- a/fs/smb/client/cifs_spnego.c +++ b/fs/smb/client/cifs_spnego.c @@ -24,20 +24,14 @@ static const struct cred *spnego_cred; static int cifs_spnego_key_instantiate(struct key *key, struct key_preparsed_payload *prep) { - char *payload; - int ret; + char *payload = kmemdup(prep->data, prep->datalen, GFP_KERNEL); - ret = -ENOMEM; - payload = kmemdup(prep->data, prep->datalen, GFP_KERNEL); if (!payload) - goto error; + return -ENOMEM; /* attach the data */ key->payload.data[0] = payload; - ret = 0; - -error: - return ret; + return 0; } static void diff --git a/fs/smb/client/cifs_swn.c b/fs/smb/client/cifs_swn.c index 7233c6a7e6d7..68a1f87c446d 100644 --- a/fs/smb/client/cifs_swn.c +++ b/fs/smb/client/cifs_swn.c @@ -82,10 +82,8 @@ static int cifs_swn_send_register_message(struct cifs_swn_reg *swnreg) int ret; skb = genlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); - if (skb == NULL) { - ret = -ENOMEM; - goto fail; - } + if (!skb) + return -ENOMEM; hdr = genlmsg_put(skb, 0, 0, &cifs_genl_family, 0, CIFS_GENL_CMD_SWN_REGISTER); if (hdr == NULL) { @@ -172,7 +170,6 @@ static int cifs_swn_send_register_message(struct cifs_swn_reg *swnreg) nlmsg_fail: genlmsg_cancel(skb, hdr); nlmsg_free(skb); -fail: return ret; } @@ -313,17 +310,15 @@ static struct cifs_swn_reg *cifs_get_swn_reg(struct cifs_tcon *tcon) reg = cifs_find_swn_reg(tcon); if (!IS_ERR(reg)) { kref_get(&reg->ref_count); - mutex_unlock(&cifs_swnreg_idr_mutex); - return reg; + goto unlock; } else if (PTR_ERR(reg) != -EEXIST) { - mutex_unlock(&cifs_swnreg_idr_mutex); - return reg; + goto unlock; } reg = kmalloc(sizeof(struct cifs_swn_reg), GFP_ATOMIC); if (reg == NULL) { - mutex_unlock(&cifs_swnreg_idr_mutex); - return ERR_PTR(-ENOMEM); + ret = -ENOMEM; + goto fail_unlock; } kref_init(&reg->ref_count); @@ -354,7 +349,7 @@ static struct cifs_swn_reg *cifs_get_swn_reg(struct cifs_tcon *tcon) reg->ip_notify = (tcon->capabilities & SMB2_SHARE_CAP_SCALEOUT); reg->tcon = tcon; - +unlock: mutex_unlock(&cifs_swnreg_idr_mutex); return reg; @@ -365,6 +360,7 @@ fail_idr: idr_remove(&cifs_swnreg_idr, reg->id); fail: kfree(reg); +fail_unlock: mutex_unlock(&cifs_swnreg_idr_mutex); return ERR_PTR(ret); } diff --git a/fs/smb/client/cifsacl.c b/fs/smb/client/cifsacl.c index 63b3b1290bed..ce2ebc213a1d 100644 ---
a/fs/smb/client/cifsacl.c +++ b/fs/smb/client/cifsacl.c @@ -339,7 +339,6 @@ int sid_to_id(struct cifs_sb_info *cifs_sb, struct smb_sid *psid, struct cifs_fattr *fattr, uint sidtype) { - int rc = 0; struct key *sidkey; char *sidstr; const struct cred *saved_cred; @@ -446,12 +445,12 @@ out_revert_creds: * fails then we just fall back to using the ctx->linux_uid/linux_gid. */ got_valid_id: - rc = 0; if (sidtype == SIDOWNER) fattr->cf_uid = fuid; else fattr->cf_gid = fgid; - return rc; + + return 0; } int diff --git a/fs/smb/client/cifsencrypt.c b/fs/smb/client/cifsencrypt.c index 3cc686246908..801824825ecf 100644 --- a/fs/smb/client/cifsencrypt.c +++ b/fs/smb/client/cifsencrypt.c @@ -22,16 +22,45 @@ #include <linux/highmem.h> #include <linux/fips.h> #include <linux/iov_iter.h> -#include "../common/arc4.h" #include <crypto/aead.h> +#include <crypto/arc4.h> +#include <crypto/md5.h> +#include <crypto/sha2.h> -static size_t cifs_shash_step(void *iter_base, size_t progress, size_t len, - void *priv, void *priv2) +static int cifs_sig_update(struct cifs_calc_sig_ctx *ctx, + const u8 *data, size_t len) { - struct shash_desc *shash = priv; + if (ctx->md5) { + md5_update(ctx->md5, data, len); + return 0; + } + if (ctx->hmac) { + hmac_sha256_update(ctx->hmac, data, len); + return 0; + } + return crypto_shash_update(ctx->shash, data, len); +} + +static int cifs_sig_final(struct cifs_calc_sig_ctx *ctx, u8 *out) +{ + if (ctx->md5) { + md5_final(ctx->md5, out); + return 0; + } + if (ctx->hmac) { + hmac_sha256_final(ctx->hmac, out); + return 0; + } + return crypto_shash_final(ctx->shash, out); +} + +static size_t cifs_sig_step(void *iter_base, size_t progress, size_t len, + void *priv, void *priv2) +{ + struct cifs_calc_sig_ctx *ctx = priv; int ret, *pret = priv2; - ret = crypto_shash_update(shash, iter_base, len); + ret = cifs_sig_update(ctx, iter_base, len); if (ret < 0) { *pret = ret; return len; @@ -42,21 +71,20 @@ static size_t cifs_shash_step(void *iter_base, size_t progress, size_t len, /* * Pass the data from an iterator into a hash. 
*/ -static int cifs_shash_iter(const struct iov_iter *iter, size_t maxsize, - struct shash_desc *shash) +static int cifs_sig_iter(const struct iov_iter *iter, size_t maxsize, + struct cifs_calc_sig_ctx *ctx) { struct iov_iter tmp_iter = *iter; int err = -EIO; - if (iterate_and_advance_kernel(&tmp_iter, maxsize, shash, &err, - cifs_shash_step) != maxsize) + if (iterate_and_advance_kernel(&tmp_iter, maxsize, ctx, &err, + cifs_sig_step) != maxsize) return err; return 0; } -int __cifs_calc_signature(struct smb_rqst *rqst, - struct TCP_Server_Info *server, char *signature, - struct shash_desc *shash) +int __cifs_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server, + char *signature, struct cifs_calc_sig_ctx *ctx) { int i; ssize_t rc; @@ -82,8 +110,7 @@ int __cifs_calc_signature(struct smb_rqst *rqst, return -EIO; } - rc = crypto_shash_update(shash, - iov[i].iov_base, iov[i].iov_len); + rc = cifs_sig_update(ctx, iov[i].iov_base, iov[i].iov_len); if (rc) { cifs_dbg(VFS, "%s: Could not update with payload\n", __func__); @@ -91,11 +118,11 @@ int __cifs_calc_signature(struct smb_rqst *rqst, } } - rc = cifs_shash_iter(&rqst->rq_iter, iov_iter_count(&rqst->rq_iter), shash); + rc = cifs_sig_iter(&rqst->rq_iter, iov_iter_count(&rqst->rq_iter), ctx); if (rc < 0) return rc; - rc = crypto_shash_final(shash, signature); + rc = cifs_sig_final(ctx, signature); if (rc) cifs_dbg(VFS, "%s: Could not generate hash\n", __func__); @@ -112,29 +139,22 @@ int __cifs_calc_signature(struct smb_rqst *rqst, static int cifs_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server, char *signature) { - int rc; + struct md5_ctx ctx; if (!rqst->rq_iov || !signature || !server) return -EINVAL; - - rc = cifs_alloc_hash("md5", &server->secmech.md5); - if (rc) - return -1; - - rc = crypto_shash_init(server->secmech.md5); - if (rc) { - cifs_dbg(VFS, "%s: Could not init md5\n", __func__); - return rc; + if (fips_enabled) { + cifs_dbg(VFS, + "MD5 signature support is disabled due to FIPS\n"); + return -EOPNOTSUPP; } - rc = crypto_shash_update(server->secmech.md5, - server->session_key.response, server->session_key.len); - if (rc) { - cifs_dbg(VFS, "%s: Could not update with response\n", __func__); - return rc; - } + md5_init(&ctx); + md5_update(&ctx, server->session_key.response, server->session_key.len); - return __cifs_calc_signature(rqst, server, signature, server->secmech.md5); + return __cifs_calc_signature( + rqst, server, signature, + &(struct cifs_calc_sig_ctx){ .md5 = &ctx }); } /* must be called with server->srv_mutex held */ @@ -405,11 +425,11 @@ static __le64 find_timestamp(struct cifs_ses *ses) } static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, - const struct nls_table *nls_cp, struct shash_desc *hmacmd5) + const struct nls_table *nls_cp) { - int rc = 0; int len; char nt_hash[CIFS_NTHASH_SIZE]; + struct hmac_md5_ctx hmac_ctx; __le16 *user; wchar_t *domain; wchar_t *server; @@ -417,17 +437,7 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, /* calculate md4 hash of password */ E_md4hash(ses->password, nt_hash, nls_cp); - rc = crypto_shash_setkey(hmacmd5->tfm, nt_hash, CIFS_NTHASH_SIZE); - if (rc) { - cifs_dbg(VFS, "%s: Could not set NT hash as a key, rc=%d\n", __func__, rc); - return rc; - } - - rc = crypto_shash_init(hmacmd5); - if (rc) { - cifs_dbg(VFS, "%s: Could not init HMAC-MD5, rc=%d\n", __func__, rc); - return rc; - } + hmac_md5_init_usingrawkey(&hmac_ctx, nt_hash, CIFS_NTHASH_SIZE); /* convert ses->user_name to unicode */ len = ses->user_name 
? strlen(ses->user_name) : 0; @@ -442,12 +452,8 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, *(u16 *)user = 0; } - rc = crypto_shash_update(hmacmd5, (char *)user, 2 * len); + hmac_md5_update(&hmac_ctx, (const u8 *)user, 2 * len); kfree(user); - if (rc) { - cifs_dbg(VFS, "%s: Could not update with user, rc=%d\n", __func__, rc); - return rc; - } /* convert ses->domainName to unicode and uppercase */ if (ses->domainName) { @@ -459,12 +465,8 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, len = cifs_strtoUTF16((__le16 *)domain, ses->domainName, len, nls_cp); - rc = crypto_shash_update(hmacmd5, (char *)domain, 2 * len); + hmac_md5_update(&hmac_ctx, (const u8 *)domain, 2 * len); kfree(domain); - if (rc) { - cifs_dbg(VFS, "%s: Could not update with domain, rc=%d\n", __func__, rc); - return rc; - } } else { /* We use ses->ip_addr if no domain name available */ len = strlen(ses->ip_addr); @@ -474,25 +476,16 @@ static int calc_ntlmv2_hash(struct cifs_ses *ses, char *ntlmv2_hash, return -ENOMEM; len = cifs_strtoUTF16((__le16 *)server, ses->ip_addr, len, nls_cp); - rc = crypto_shash_update(hmacmd5, (char *)server, 2 * len); + hmac_md5_update(&hmac_ctx, (const u8 *)server, 2 * len); kfree(server); - if (rc) { - cifs_dbg(VFS, "%s: Could not update with server, rc=%d\n", __func__, rc); - return rc; - } } - rc = crypto_shash_final(hmacmd5, ntlmv2_hash); - if (rc) - cifs_dbg(VFS, "%s: Could not generate MD5 hash, rc=%d\n", __func__, rc); - - return rc; + hmac_md5_final(&hmac_ctx, ntlmv2_hash); + return 0; } -static int -CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash, struct shash_desc *hmacmd5) +static void CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash) { - int rc; struct ntlmv2_resp *ntlmv2 = (struct ntlmv2_resp *) (ses->auth_key.response + CIFS_SESS_KEY_SIZE); unsigned int hash_len; @@ -501,35 +494,15 @@ CalcNTLMv2_response(const struct cifs_ses *ses, char *ntlmv2_hash, struct shash_ hash_len = ses->auth_key.len - (CIFS_SESS_KEY_SIZE + offsetof(struct ntlmv2_resp, challenge.key[0])); - rc = crypto_shash_setkey(hmacmd5->tfm, ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE); - if (rc) { - cifs_dbg(VFS, "%s: Could not set NTLMv2 hash as a key, rc=%d\n", __func__, rc); - return rc; - } - - rc = crypto_shash_init(hmacmd5); - if (rc) { - cifs_dbg(VFS, "%s: Could not init HMAC-MD5, rc=%d\n", __func__, rc); - return rc; - } - if (ses->server->negflavor == CIFS_NEGFLAVOR_EXTENDED) memcpy(ntlmv2->challenge.key, ses->ntlmssp->cryptkey, CIFS_SERVER_CHALLENGE_SIZE); else memcpy(ntlmv2->challenge.key, ses->server->cryptkey, CIFS_SERVER_CHALLENGE_SIZE); - rc = crypto_shash_update(hmacmd5, ntlmv2->challenge.key, hash_len); - if (rc) { - cifs_dbg(VFS, "%s: Could not update with response, rc=%d\n", __func__, rc); - return rc; - } - - /* Note that the MD5 digest over writes anon.challenge_key.key */ - rc = crypto_shash_final(hmacmd5, ntlmv2->ntlmv2_hash); - if (rc) - cifs_dbg(VFS, "%s: Could not generate MD5 hash, rc=%d\n", __func__, rc); - - return rc; + /* Note that the HMAC-MD5 value overwrites ntlmv2->challenge.key */ + hmac_md5_usingrawkey(ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE, + ntlmv2->challenge.key, hash_len, + ntlmv2->ntlmv2_hash); } /* @@ -586,7 +559,6 @@ out: int setup_ntlmv2_rsp(struct cifs_ses *ses, const struct nls_table *nls_cp) { - struct shash_desc *hmacmd5 = NULL; unsigned char *tiblob = NULL; /* target info blob */ struct ntlmv2_resp *ntlmv2; char ntlmv2_hash[16]; @@ -657,51 +629,29 @@ setup_ntlmv2_rsp(struct cifs_ses *ses, 
const struct nls_table *nls_cp) ntlmv2->client_chal = cc; ntlmv2->reserved2 = 0; - rc = cifs_alloc_hash("hmac(md5)", &hmacmd5); - if (rc) { - cifs_dbg(VFS, "Could not allocate HMAC-MD5, rc=%d\n", rc); + if (fips_enabled) { + cifs_dbg(VFS, "NTLMv2 support is disabled due to FIPS\n"); + rc = -EOPNOTSUPP; goto unlock; } /* calculate ntlmv2_hash */ - rc = calc_ntlmv2_hash(ses, ntlmv2_hash, nls_cp, hmacmd5); + rc = calc_ntlmv2_hash(ses, ntlmv2_hash, nls_cp); if (rc) { cifs_dbg(VFS, "Could not get NTLMv2 hash, rc=%d\n", rc); goto unlock; } /* calculate first part of the client response (CR1) */ - rc = CalcNTLMv2_response(ses, ntlmv2_hash, hmacmd5); - if (rc) { - cifs_dbg(VFS, "Could not calculate CR1, rc=%d\n", rc); - goto unlock; - } + CalcNTLMv2_response(ses, ntlmv2_hash); /* now calculate the session key for NTLMv2 */ - rc = crypto_shash_setkey(hmacmd5->tfm, ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE); - if (rc) { - cifs_dbg(VFS, "%s: Could not set NTLMv2 hash as a key, rc=%d\n", __func__, rc); - goto unlock; - } - - rc = crypto_shash_init(hmacmd5); - if (rc) { - cifs_dbg(VFS, "%s: Could not init HMAC-MD5, rc=%d\n", __func__, rc); - goto unlock; - } - - rc = crypto_shash_update(hmacmd5, ntlmv2->ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE); - if (rc) { - cifs_dbg(VFS, "%s: Could not update with response, rc=%d\n", __func__, rc); - goto unlock; - } - - rc = crypto_shash_final(hmacmd5, ses->auth_key.response); - if (rc) - cifs_dbg(VFS, "%s: Could not generate MD5 hash, rc=%d\n", __func__, rc); + hmac_md5_usingrawkey(ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE, + ntlmv2->ntlmv2_hash, CIFS_HMAC_MD5_HASH_SIZE, + ses->auth_key.response); + rc = 0; unlock: cifs_server_unlock(ses->server); - cifs_free_hash(&hmacmd5); setup_ntlmv2_rsp_ret: kfree_sensitive(tiblob); @@ -725,9 +675,9 @@ calc_seckey(struct cifs_ses *ses) return -ENOMEM; } - cifs_arc4_setkey(ctx_arc4, ses->auth_key.response, CIFS_SESS_KEY_SIZE); - cifs_arc4_crypt(ctx_arc4, ses->ntlmssp->ciphertext, sec_key, - CIFS_CPHTXT_SIZE); + arc4_setkey(ctx_arc4, ses->auth_key.response, CIFS_SESS_KEY_SIZE); + arc4_crypt(ctx_arc4, ses->ntlmssp->ciphertext, sec_key, + CIFS_CPHTXT_SIZE); /* make secondary_key/nonce as session key */ memcpy(ses->auth_key.response, sec_key, CIFS_SESS_KEY_SIZE); @@ -743,9 +693,6 @@ void cifs_crypto_secmech_release(struct TCP_Server_Info *server) { cifs_free_hash(&server->secmech.aes_cmac); - cifs_free_hash(&server->secmech.hmacsha256); - cifs_free_hash(&server->secmech.md5); - cifs_free_hash(&server->secmech.sha512); if (server->secmech.enc) { crypto_free_aead(server->secmech.enc); diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c index dcb39d1b5958..4f959f1e08d2 100644 --- a/fs/smb/client/cifsfs.c +++ b/fs/smb/client/cifsfs.c @@ -121,6 +121,46 @@ unsigned int dir_cache_timeout = 30; module_param(dir_cache_timeout, uint, 0644); MODULE_PARM_DESC(dir_cache_timeout, "Number of seconds to cache directory contents for which we have a lease. Default: 30 " "Range: 1 to 65000 seconds, 0 to disable caching dir contents"); +/* Module-wide total cached dirents (in bytes) across all tcons */ +atomic64_t cifs_dircache_bytes_used = ATOMIC64_INIT(0); + +/* + * Write-only module parameter to drop all cached directory entries across + * all CIFS mounts. Echo a non-zero value to trigger. 
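+ * + * For example (assuming the client is built as the "cifs" module): + * echo 1 > /sys/module/cifs/parameters/drop_dir_cache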
+ */ +static void cifs_drop_all_dir_caches(void) +{ + struct TCP_Server_Info *server; + struct cifs_ses *ses; + struct cifs_tcon *tcon; + + spin_lock(&cifs_tcp_ses_lock); + list_for_each_entry(server, &cifs_tcp_ses_list, tcp_ses_list) { + list_for_each_entry(ses, &server->smb_ses_list, smb_ses_list) { + if (cifs_ses_exiting(ses)) + continue; + list_for_each_entry(tcon, &ses->tcon_list, tcon_list) + invalidate_all_cached_dirs(tcon); + } + } + spin_unlock(&cifs_tcp_ses_lock); +} + +static int cifs_param_set_drop_dir_cache(const char *val, const struct kernel_param *kp) +{ + bool bv; + int rc = kstrtobool(val, &bv); + + if (rc) + return rc; + if (bv) + cifs_drop_all_dir_caches(); + return 0; +} + +module_param_call(drop_dir_cache, cifs_param_set_drop_dir_cache, NULL, NULL, 0200); +MODULE_PARM_DESC(drop_dir_cache, "Write 1 to drop all cached directory entries across all CIFS mounts"); + #ifdef CONFIG_CIFS_STATS2 unsigned int slow_rsp_threshold = 1; module_param(slow_rsp_threshold, uint, 0644); @@ -352,11 +392,27 @@ static long cifs_fallocate(struct file *file, int mode, loff_t off, loff_t len) struct cifs_sb_info *cifs_sb = CIFS_FILE_SB(file); struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); struct TCP_Server_Info *server = tcon->ses->server; + struct inode *inode = file_inode(file); + int rc; + + if (!server->ops->fallocate) + return -EOPNOTSUPP; - if (server->ops->fallocate) - return server->ops->fallocate(file, tcon, mode, off, len); + rc = inode_lock_killable(inode); + if (rc) + return rc; + + netfs_wait_for_outstanding_io(inode); - return -EOPNOTSUPP; + rc = file_modified(file); + if (rc) + goto out_unlock; + + rc = server->ops->fallocate(file, tcon, mode, off, len); + +out_unlock: + inode_unlock(inode); + return rc; } static int cifs_permission(struct mnt_idmap *idmap, @@ -2083,13 +2139,9 @@ MODULE_DESCRIPTION "also older servers complying with the SNIA CIFS Specification)"); MODULE_VERSION(CIFS_VERSION); MODULE_SOFTDEP("ecb"); -MODULE_SOFTDEP("hmac"); -MODULE_SOFTDEP("md5"); MODULE_SOFTDEP("nls"); MODULE_SOFTDEP("aes"); MODULE_SOFTDEP("cmac"); -MODULE_SOFTDEP("sha256"); -MODULE_SOFTDEP("sha512"); MODULE_SOFTDEP("aead2"); MODULE_SOFTDEP("ccm"); MODULE_SOFTDEP("gcm"); diff --git a/fs/smb/client/cifsfs.h b/fs/smb/client/cifsfs.h index 3ce7c614ccc0..e9534258d1ef 100644 --- a/fs/smb/client/cifsfs.h +++ b/fs/smb/client/cifsfs.h @@ -145,6 +145,6 @@ extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ /* when changing internal version - update following two lines at same time */ -#define SMB3_PRODUCT_BUILD 56 -#define CIFS_VERSION "2.56" +#define SMB3_PRODUCT_BUILD 57 +#define CIFS_VERSION "2.57" #endif /* _CIFSFS_H */ diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index 3ac254e123dc..16a00a61fd2c 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -24,6 +24,7 @@ #include "cifsacl.h" #include <crypto/internal/hash.h> #include <uapi/linux/cifs/cifs_mount.h> +#include "../common/cifsglob.h" #include "../common/smb2pdu.h" #include "smb2pdu.h" #include <linux/filelock.h> @@ -221,9 +222,6 @@ struct session_key { /* crypto hashing related structure/fields, not specific to a sec mech */ struct cifs_secmech { - struct shash_desc *md5; /* md5 hash function, for CIFS/SMB1 signatures */ - struct shash_desc *hmacsha256; /* hmac-sha256 hash function, for SMB2 signatures */ - struct shash_desc *sha512; /* sha512 hash function, for SMB3.1.1 preauth hash */ struct shash_desc *aes_cmac; /* block-cipher based MAC function, for 
SMB3 signatures */ struct crypto_aead *enc; /* smb3 encryption AEAD TFM (AES-CCM and AES-GCM) */ @@ -702,12 +700,6 @@ get_rfc1002_length(void *buf) return be32_to_cpu(*((__be32 *)buf)) & 0xffffff; } -static inline void -inc_rfc1001_len(void *buf, int count) -{ - be32_add_cpu((__be32 *)buf, count); -} - struct TCP_Server_Info { struct list_head tcp_ses_list; struct list_head smb_ses_list; @@ -1021,8 +1013,6 @@ compare_mid(__u16 mid, const struct smb_hdr *smb) #define CIFS_MAX_RFC1002_WSIZE ((1<<17) - 1 - sizeof(WRITE_REQ) + 4) #define CIFS_MAX_RFC1002_RSIZE ((1<<17) - 1 - sizeof(READ_RSP) + 4) -#define CIFS_DEFAULT_IOSIZE (1024 * 1024) - /* * Windows only supports a max of 60kb reads and 65535 byte writes. Default to * those values when posix extensions aren't in force. In actuality here, we @@ -1566,6 +1556,11 @@ struct cifsFileInfo *cifsFileInfo_get(struct cifsFileInfo *cifs_file); void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, bool wait_oplock_hdlr, bool offload); void cifsFileInfo_put(struct cifsFileInfo *cifs_file); +int cifs_file_flush(const unsigned int xid, struct inode *inode, + struct cifsFileInfo *cfile); +int cifs_file_set_size(const unsigned int xid, struct dentry *dentry, + const char *full_path, struct cifsFileInfo *open_file, + loff_t size); #define CIFS_CACHE_READ_FLG 1 #define CIFS_CACHE_HANDLE_FLG 2 @@ -2143,30 +2138,20 @@ extern mempool_t cifs_io_request_pool; extern mempool_t cifs_io_subrequest_pool; /* Operations for different SMB versions */ -#define SMB1_VERSION_STRING "1.0" -#define SMB20_VERSION_STRING "2.0" #ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY extern struct smb_version_operations smb1_operations; extern struct smb_version_values smb1_values; extern struct smb_version_operations smb20_operations; extern struct smb_version_values smb20_values; #endif /* CIFS_ALLOW_INSECURE_LEGACY */ -#define SMB21_VERSION_STRING "2.1" extern struct smb_version_operations smb21_operations; extern struct smb_version_values smb21_values; -#define SMBDEFAULT_VERSION_STRING "default" extern struct smb_version_values smbdefault_values; -#define SMB3ANY_VERSION_STRING "3" extern struct smb_version_values smb3any_values; -#define SMB30_VERSION_STRING "3.0" extern struct smb_version_operations smb30_operations; extern struct smb_version_values smb30_values; -#define SMB302_VERSION_STRING "3.02" -#define ALT_SMB302_VERSION_STRING "3.0.2" /*extern struct smb_version_operations smb302_operations;*/ /* not needed yet */ extern struct smb_version_values smb302_values; -#define SMB311_VERSION_STRING "3.1.1" -#define ALT_SMB311_VERSION_STRING "3.11" extern struct smb_version_operations smb311_operations; extern struct smb_version_values smb311_values; diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h index e8fba98690ce..4976be2c47c1 100644 --- a/fs/smb/client/cifsproto.h +++ b/fs/smb/client/cifsproto.h @@ -632,9 +632,13 @@ int cifs_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const unsigned char *path, char *pbuf, unsigned int *pbytes_written); -int __cifs_calc_signature(struct smb_rqst *rqst, - struct TCP_Server_Info *server, char *signature, - struct shash_desc *shash); +struct cifs_calc_sig_ctx { + struct md5_ctx *md5; + struct hmac_sha256_ctx *hmac; + struct shash_desc *shash; +}; +int __cifs_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server, + char *signature, struct cifs_calc_sig_ctx *ctx); enum securityEnum cifs_select_sectype(struct TCP_Server_Info *, enum securityEnum); diff --git 
a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c index d20766f664c4..2881efcbe09a 100644 --- a/fs/smb/client/cifssmb.c +++ b/fs/smb/client/cifssmb.c @@ -1163,7 +1163,7 @@ OldOpenRetry: cpu_to_le64(le32_to_cpu(pSMBr->EndOfFile)); pfile_info->EndOfFile = pfile_info->AllocationSize; pfile_info->NumberOfLinks = cpu_to_le32(1); - pfile_info->DeletePending = 0; + pfile_info->DeletePending = 0; /* successful open = not delete pending */ } } @@ -1288,7 +1288,7 @@ openRetry: buf->AllocationSize = rsp->AllocationSize; buf->EndOfFile = rsp->EndOfFile; buf->NumberOfLinks = cpu_to_le32(1); - buf->DeletePending = 0; + buf->DeletePending = 0; /* successful open = not delete pending */ } cifs_buf_release(req); diff --git a/fs/smb/client/dir.c b/fs/smb/client/dir.c index 5223edf6d11a..da5597dbf5b9 100644 --- a/fs/smb/client/dir.c +++ b/fs/smb/client/dir.c @@ -200,8 +200,8 @@ static int cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned full_path = build_path_from_dentry(direntry, page); if (IS_ERR(full_path)) { - free_dentry_path(page); - return PTR_ERR(full_path); + rc = PTR_ERR(full_path); + goto out; } /* If we're caching, we need to be able to fill in around partial writes. */ @@ -322,13 +322,14 @@ retry_open: list_for_each_entry(parent_cfid, &tcon->cfids->entries, entry) { if (parent_cfid->dentry == direntry->d_parent) { cifs_dbg(FYI, "found a parent cached file handle\n"); - if (parent_cfid->has_lease && parent_cfid->time) { + if (is_valid_cached_dir(parent_cfid)) { lease_flags |= SMB2_LEASE_FLAG_PARENT_LEASE_KEY_SET_LE; memcpy(fid->parent_lease_key, parent_cfid->fid.lease_key, SMB2_LEASE_KEY_SIZE); parent_cfid->dirents.is_valid = false; + parent_cfid->dirents.is_failed = true; } break; } @@ -484,8 +485,6 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry, * in network traffic in the other paths. */ if (!(oflags & O_CREAT)) { - struct dentry *res; - /* * Check for hashed negative dentry. We have already revalidated * the dentry and it is fine. No need to perform another lookup. 
@@ -493,11 +492,7 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry, if (!d_in_lookup(direntry)) return -ENOENT; - res = cifs_lookup(inode, direntry, 0); - if (IS_ERR(res)) - return PTR_ERR(res); - - return finish_no_open(file, res); + return finish_no_open(file, cifs_lookup(inode, direntry, 0)); } xid = get_xid(); @@ -683,6 +678,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, const char *full_path; void *page; int retry_count = 0; + struct dentry *de; xid = get_xid(); @@ -694,16 +690,15 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, cifs_sb = CIFS_SB(parent_dir_inode->i_sb); tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) { - free_xid(xid); - return ERR_CAST(tlink); + de = ERR_CAST(tlink); + goto free_xid; } pTcon = tlink_tcon(tlink); rc = check_name(direntry, pTcon); if (unlikely(rc)) { - cifs_put_tlink(tlink); - free_xid(xid); - return ERR_PTR(rc); + de = ERR_PTR(rc); + goto put_tlink; } /* can not grab the rename sem here since it would @@ -712,16 +707,38 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, page = alloc_dentry_path(); full_path = build_path_from_dentry(direntry, page); if (IS_ERR(full_path)) { - cifs_put_tlink(tlink); - free_xid(xid); - free_dentry_path(page); - return ERR_CAST(full_path); + de = ERR_CAST(full_path); + goto free_dentry_path; } if (d_really_is_positive(direntry)) { cifs_dbg(FYI, "non-NULL inode in lookup\n"); } else { + struct cached_fid *cfid = NULL; + cifs_dbg(FYI, "NULL inode in lookup\n"); + + /* + * We can only rely on negative dentries having the same + * spelling as the cached dirent if case insensitivity is + * forced on mount. + * + * XXX: if servers correctly announce Case Sensitivity Search + * on GetInfo of FileFSAttributeInformation, then we can take + * correct action even if case insensitive is not forced on + * mount. 
+ */ + if (pTcon->nocase && !open_cached_dir_by_dentry(pTcon, direntry->d_parent, &cfid)) { + /* + * dentry is negative and parent is fully cached: + * we can assume file does not exist + */ + if (cfid->dirents.is_valid) { + close_cached_dir(cfid); + goto out; + } + close_cached_dir(cfid); + } } cifs_dbg(FYI, "Full path: %s inode = 0x%p\n", full_path, d_inode(direntry)); @@ -755,24 +772,29 @@ again: } newInode = ERR_PTR(rc); } + +out: + de = d_splice_alias(newInode, direntry); +free_dentry_path: free_dentry_path(page); +put_tlink: cifs_put_tlink(tlink); +free_xid: free_xid(xid); - return d_splice_alias(newInode, direntry); + return de; } static int cifs_d_revalidate(struct inode *dir, const struct qstr *name, struct dentry *direntry, unsigned int flags) { - struct inode *inode; - int rc; - if (flags & LOOKUP_RCU) return -ECHILD; if (d_really_is_positive(direntry)) { - inode = d_inode(direntry); + int rc; + struct inode *inode = d_inode(direntry); + if ((flags & LOOKUP_REVAL) && !CIFS_CACHE_READ(CIFS_I(inode))) CIFS_I(inode)->time = 0; /* force reval */ @@ -812,6 +834,22 @@ cifs_d_revalidate(struct inode *dir, const struct qstr *name, return 1; } + } else { + struct cifs_sb_info *cifs_sb = CIFS_SB(dir->i_sb); + struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); + struct cached_fid *cfid; + + if (!open_cached_dir_by_dentry(tcon, direntry->d_parent, &cfid)) { + /* + * dentry is negative and parent is fully cached: + * we can assume file does not exist + */ + if (cfid->dirents.is_valid) { + close_cached_dir(cfid); + return 1; + } + close_cached_dir(cfid); + } } /* diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index a5ed742afa00..474dadeb1593 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -952,6 +952,66 @@ void _cifsFileInfo_put(struct cifsFileInfo *cifs_file, } } +int cifs_file_flush(const unsigned int xid, struct inode *inode, + struct cifsFileInfo *cfile) +{ + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct cifs_tcon *tcon; + int rc; + + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC) + return 0; + + if (cfile && (OPEN_FMODE(cfile->f_flags) & FMODE_WRITE)) { + tcon = tlink_tcon(cfile->tlink); + return tcon->ses->server->ops->flush(xid, tcon, + &cfile->fid); + } + rc = cifs_get_writable_file(CIFS_I(inode), FIND_WR_ANY, &cfile); + if (!rc) { + tcon = tlink_tcon(cfile->tlink); + rc = tcon->ses->server->ops->flush(xid, tcon, &cfile->fid); + cifsFileInfo_put(cfile); + } else if (rc == -EBADF) { + rc = 0; + } + return rc; +} + +static int cifs_do_truncate(const unsigned int xid, struct dentry *dentry) +{ + struct cifsInodeInfo *cinode = CIFS_I(d_inode(dentry)); + struct inode *inode = d_inode(dentry); + struct cifsFileInfo *cfile = NULL; + struct TCP_Server_Info *server; + struct cifs_tcon *tcon; + int rc; + + rc = filemap_write_and_wait(inode->i_mapping); + if (is_interrupt_error(rc)) + return -ERESTARTSYS; + mapping_set_error(inode->i_mapping, rc); + + cfile = find_writable_file(cinode, FIND_WR_FSUID_ONLY); + rc = cifs_file_flush(xid, inode, cfile); + if (!rc) { + if (cfile) { + tcon = tlink_tcon(cfile->tlink); + server = tcon->ses->server; + rc = server->ops->set_file_size(xid, tcon, + cfile, 0, false); + } + if (!rc) { + netfs_resize_file(&cinode->netfs, 0, true); + cifs_setsize(inode, 0); + inode->i_blocks = 0; + } + } + if (cfile) + cifsFileInfo_put(cfile); + return rc; +} + int cifs_open(struct inode *inode, struct file *file) { @@ -1004,6 +1064,12 @@ int cifs_open(struct inode *inode, struct file *file) file->f_op = &cifs_file_direct_ops; } + 
if (file->f_flags & O_TRUNC) { + rc = cifs_do_truncate(xid, file_dentry(file)); + if (rc) + goto out; + } + /* Get the cached handle as SMB2 close is deferred */ if (OPEN_FMODE(file->f_flags) & FMODE_WRITE) { rc = cifs_get_writable_path(tcon, full_path, @@ -2685,13 +2751,10 @@ cifs_get_readable_path(struct cifs_tcon *tcon, const char *name, int cifs_strict_fsync(struct file *file, loff_t start, loff_t end, int datasync) { - unsigned int xid; - int rc = 0; - struct cifs_tcon *tcon; - struct TCP_Server_Info *server; struct cifsFileInfo *smbfile = file->private_data; struct inode *inode = file_inode(file); - struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + unsigned int xid; + int rc; rc = file_write_and_wait_range(file, start, end); if (rc) { @@ -2699,39 +2762,15 @@ int cifs_strict_fsync(struct file *file, loff_t start, loff_t end, return rc; } - xid = get_xid(); - - cifs_dbg(FYI, "Sync file - name: %pD datasync: 0x%x\n", - file, datasync); + cifs_dbg(FYI, "%s: name=%pD datasync=0x%x\n", __func__, file, datasync); if (!CIFS_CACHE_READ(CIFS_I(inode))) { rc = cifs_zap_mapping(inode); - if (rc) { - cifs_dbg(FYI, "rc: %d during invalidate phase\n", rc); - rc = 0; /* don't care about it in fsync */ - } + cifs_dbg(FYI, "%s: invalidate mapping: rc = %d\n", __func__, rc); } - tcon = tlink_tcon(smbfile->tlink); - if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC)) { - server = tcon->ses->server; - if (server->ops->flush == NULL) { - rc = -ENOSYS; - goto strict_fsync_exit; - } - - if ((OPEN_FMODE(smbfile->f_flags) & FMODE_WRITE) == 0) { - smbfile = find_writable_file(CIFS_I(inode), FIND_WR_ANY); - if (smbfile) { - rc = server->ops->flush(xid, tcon, &smbfile->fid); - cifsFileInfo_put(smbfile); - } else - cifs_dbg(FYI, "ignore fsync for file not open for write\n"); - } else - rc = server->ops->flush(xid, tcon, &smbfile->fid); - } - -strict_fsync_exit: + xid = get_xid(); + rc = cifs_file_flush(xid, inode, smbfile); free_xid(xid); return rc; } diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c index 072383899e81..e60927b2a7c8 100644 --- a/fs/smb/client/fs_context.c +++ b/fs/smb/client/fs_context.c @@ -773,16 +773,14 @@ static int smb3_fs_context_parse_monolithic(struct fs_context *fc, } - len = 0; value = strchr(key, '='); if (value) { if (value == key) continue; *value++ = 0; - len = strlen(value); } - ret = vfs_parse_fs_string(fc, key, value, len); + ret = vfs_parse_fs_string(fc, key, value); if (ret < 0) break; } @@ -1820,6 +1818,13 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, goto cifs_parse_mount_err; } + /* + * Multichannel is not meaningful if max_channels is 1. + * Force multichannel to false to ensure consistent configuration. 
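+ * For example, mounting with "multichannel,max_channels=1" behaves + * exactly as if multichannel had not been requested.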
+ */ + if (ctx->multichannel && ctx->max_channels == 1) + ctx->multichannel = false; + return 0; cifs_parse_mount_err: diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index 7e9784080501..098a79b7a959 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -2431,8 +2431,10 @@ cifs_do_rename(const unsigned int xid, struct dentry *from_dentry, tcon = tlink_tcon(tlink); server = tcon->ses->server; - if (!server->ops->rename) - return -ENOSYS; + if (!server->ops->rename) { + rc = -ENOSYS; + goto do_rename_exit; + } /* try path-based rename first */ rc = server->ops->rename(xid, tcon, from_dentry, @@ -2704,7 +2706,7 @@ cifs_dentry_needs_reval(struct dentry *dentry) return true; if (!open_cached_dir_by_dentry(tcon, dentry->d_parent, &cfid)) { - if (cfid->time && cifs_i->time > cfid->time) { + if (cifs_i->time > cfid->time) { close_cached_dir(cfid); return false; } @@ -3007,28 +3009,25 @@ int cifs_fiemap(struct inode *inode, struct fiemap_extent_info *fei, u64 start, void cifs_setsize(struct inode *inode, loff_t offset) { - struct cifsInodeInfo *cifs_i = CIFS_I(inode); - spin_lock(&inode->i_lock); i_size_write(inode, offset); spin_unlock(&inode->i_lock); - - /* Cached inode must be refreshed on truncate */ - cifs_i->time = 0; + inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); truncate_pagecache(inode, offset); + netfs_wait_for_outstanding_io(inode); } -static int -cifs_set_file_size(struct inode *inode, struct iattr *attrs, - unsigned int xid, const char *full_path, struct dentry *dentry) +int cifs_file_set_size(const unsigned int xid, struct dentry *dentry, + const char *full_path, struct cifsFileInfo *open_file, + loff_t size) { - int rc; - struct cifsFileInfo *open_file; - struct cifsInodeInfo *cifsInode = CIFS_I(inode); + struct inode *inode = d_inode(dentry); struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct cifsInodeInfo *cifsInode = CIFS_I(inode); struct tcon_link *tlink = NULL; struct cifs_tcon *tcon = NULL; struct TCP_Server_Info *server; + int rc = -EINVAL; /* * To avoid spurious oplock breaks from server, in the case of @@ -3039,19 +3038,25 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs, * writebehind data than the SMB timeout for the SetPathInfo * request would allow */ - open_file = find_writable_file(cifsInode, FIND_WR_FSUID_ONLY); - if (open_file) { + if (open_file && (OPEN_FMODE(open_file->f_flags) & FMODE_WRITE)) { tcon = tlink_tcon(open_file->tlink); server = tcon->ses->server; - if (server->ops->set_file_size) - rc = server->ops->set_file_size(xid, tcon, open_file, - attrs->ia_size, false); - else - rc = -ENOSYS; - cifsFileInfo_put(open_file); - cifs_dbg(FYI, "SetFSize for attrs rc = %d\n", rc); - } else - rc = -EINVAL; + rc = server->ops->set_file_size(xid, tcon, + open_file, + size, false); + cifs_dbg(FYI, "%s: set_file_size: rc = %d\n", __func__, rc); + } else { + open_file = find_writable_file(cifsInode, FIND_WR_FSUID_ONLY); + if (open_file) { + tcon = tlink_tcon(open_file->tlink); + server = tcon->ses->server; + rc = server->ops->set_file_size(xid, tcon, + open_file, + size, false); + cifs_dbg(FYI, "%s: set_file_size: rc = %d\n", __func__, rc); + cifsFileInfo_put(open_file); + } + } if (!rc) goto set_size_out; @@ -3069,20 +3074,15 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs, * valid, writeable file handle for it was found or because there was * an error setting it by handle. 
*/ - if (server->ops->set_path_size) - rc = server->ops->set_path_size(xid, tcon, full_path, - attrs->ia_size, cifs_sb, false, dentry); - else - rc = -ENOSYS; - cifs_dbg(FYI, "SetEOF by path (setattrs) rc = %d\n", rc); - - if (tlink) - cifs_put_tlink(tlink); + rc = server->ops->set_path_size(xid, tcon, full_path, size, + cifs_sb, false, dentry); + cifs_dbg(FYI, "%s: SetEOF by path (setattrs) rc = %d\n", __func__, rc); + cifs_put_tlink(tlink); set_size_out: if (rc == 0) { - netfs_resize_file(&cifsInode->netfs, attrs->ia_size, true); - cifs_setsize(inode, attrs->ia_size); + netfs_resize_file(&cifsInode->netfs, size, true); + cifs_setsize(inode, size); /* * i_blocks is not related to (i_size / i_blksize), but instead * 512 byte (2**9) size is required for calculating num blocks. @@ -3090,15 +3090,7 @@ set_size_out: * this is best estimate we have for blocks allocated for a file * Number of blocks must be rounded up so size 1 is not 0 blocks */ - inode->i_blocks = (512 - 1 + attrs->ia_size) >> 9; - - /* - * The man page of truncate says if the size changed, - * then the st_ctime and st_mtime fields for the file - * are updated. - */ - attrs->ia_ctime = attrs->ia_mtime = current_time(inode); - attrs->ia_valid |= ATTR_CTIME | ATTR_MTIME; + inode->i_blocks = (512 - 1 + size) >> 9; } return rc; @@ -3118,7 +3110,7 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs) struct tcon_link *tlink; struct cifs_tcon *pTcon; struct cifs_unix_set_info_args *args = NULL; - struct cifsFileInfo *open_file; + struct cifsFileInfo *open_file = NULL; cifs_dbg(FYI, "setattr_unix on file %pd attrs->ia_valid=0x%x\n", direntry, attrs->ia_valid); @@ -3132,6 +3124,9 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs) if (rc < 0) goto out; + if (attrs->ia_valid & ATTR_FILE) + open_file = attrs->ia_file->private_data; + full_path = build_path_from_dentry(direntry, page); if (IS_ERR(full_path)) { rc = PTR_ERR(full_path); @@ -3159,9 +3154,16 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs) rc = 0; if (attrs->ia_valid & ATTR_SIZE) { - rc = cifs_set_file_size(inode, attrs, xid, full_path, direntry); + rc = cifs_file_set_size(xid, direntry, full_path, + open_file, attrs->ia_size); if (rc != 0) goto out; + /* + * Avoid setting timestamps on the server for ftruncate(2) to + * prevent it from disabling automatic timestamp updates as per + * MS-FSA 2.1.4.17. 
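+ * Clearing ATTR_CTIME and ATTR_MTIME below leaves the server free to + * update the timestamps itself when the size is changed.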
+ */ + attrs->ia_valid &= ~(ATTR_CTIME | ATTR_MTIME); } /* skip mode change if it's just for clearing setuid/setgid */ @@ -3206,14 +3208,24 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs) args->ctime = NO_CHANGE_64; args->device = 0; - open_file = find_writable_file(cifsInode, FIND_WR_FSUID_ONLY); - if (open_file) { - u16 nfid = open_file->fid.netfid; - u32 npid = open_file->pid; + rc = -EINVAL; + if (open_file && (OPEN_FMODE(open_file->f_flags) & FMODE_WRITE)) { pTcon = tlink_tcon(open_file->tlink); - rc = CIFSSMBUnixSetFileInfo(xid, pTcon, args, nfid, npid); - cifsFileInfo_put(open_file); + rc = CIFSSMBUnixSetFileInfo(xid, pTcon, args, + open_file->fid.netfid, + open_file->pid); } else { + open_file = find_writable_file(cifsInode, FIND_WR_FSUID_ONLY); + if (open_file) { + pTcon = tlink_tcon(open_file->tlink); + rc = CIFSSMBUnixSetFileInfo(xid, pTcon, args, + open_file->fid.netfid, + open_file->pid); + cifsFileInfo_put(open_file); + } + } + + if (rc) { tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) { rc = PTR_ERR(tlink); @@ -3221,8 +3233,8 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs) } pTcon = tlink_tcon(tlink); rc = CIFSSMBUnixSetPathInfo(xid, pTcon, full_path, args, - cifs_sb->local_nls, - cifs_remap(cifs_sb)); + cifs_sb->local_nls, + cifs_remap(cifs_sb)); cifs_put_tlink(tlink); } @@ -3264,8 +3276,7 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) struct inode *inode = d_inode(direntry); struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct cifsInodeInfo *cifsInode = CIFS_I(inode); - struct cifsFileInfo *wfile; - struct cifs_tcon *tcon; + struct cifsFileInfo *cfile = NULL; const char *full_path; void *page = alloc_dentry_path(); int rc = -EACCES; @@ -3285,6 +3296,9 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) if (rc < 0) goto cifs_setattr_exit; + if (attrs->ia_valid & ATTR_FILE) + cfile = attrs->ia_file->private_data; + full_path = build_path_from_dentry(direntry, page); if (IS_ERR(full_path)) { rc = PTR_ERR(full_path); @@ -3311,25 +3325,23 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) rc = 0; - if ((attrs->ia_valid & ATTR_MTIME) && - !(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC)) { - rc = cifs_get_writable_file(cifsInode, FIND_WR_ANY, &wfile); - if (!rc) { - tcon = tlink_tcon(wfile->tlink); - rc = tcon->ses->server->ops->flush(xid, tcon, &wfile->fid); - cifsFileInfo_put(wfile); - if (rc) - goto cifs_setattr_exit; - } else if (rc != -EBADF) + if (attrs->ia_valid & ATTR_MTIME) { + rc = cifs_file_flush(xid, inode, cfile); + if (rc) goto cifs_setattr_exit; - else - rc = 0; } if (attrs->ia_valid & ATTR_SIZE) { - rc = cifs_set_file_size(inode, attrs, xid, full_path, direntry); + rc = cifs_file_set_size(xid, direntry, full_path, + cfile, attrs->ia_size); if (rc != 0) goto cifs_setattr_exit; + /* + * Avoid setting timestamps on the server for ftruncate(2) to + * prevent it from disabling automatic timestamp updates as per + * MS-FSA 2.1.4.17. + */ + attrs->ia_valid &= ~(ATTR_CTIME | ATTR_MTIME); } if (attrs->ia_valid & ATTR_UID) @@ -3459,6 +3471,13 @@ cifs_setattr(struct mnt_idmap *idmap, struct dentry *direntry, if (unlikely(cifs_forced_shutdown(cifs_sb))) return -EIO; + /* + * Avoid setting [cm]time with O_TRUNC to prevent the server from + * disabling automatic timestamp updates as specified in + * MS-FSA 2.1.4.17. 
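+ * ATTR_OPEN marks attribute changes that come from open(O_TRUNC), + * which cifs_open() now handles through its own truncate path.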
+ */ + if (attrs->ia_valid & ATTR_OPEN) + return 0; do { #ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY diff --git a/fs/smb/client/link.c b/fs/smb/client/link.c index fe80e711cd75..70f3c0c67eeb 100644 --- a/fs/smb/client/link.c +++ b/fs/smb/client/link.c @@ -5,6 +5,7 @@ * Author(s): Steve French (sfrench@us.ibm.com) * */ +#include <crypto/md5.h> #include <linux/fs.h> #include <linux/stat.h> #include <linux/slab.h> @@ -37,23 +38,6 @@ #define CIFS_MF_SYMLINK_MD5_ARGS(md5_hash) md5_hash static int -symlink_hash(unsigned int link_len, const char *link_str, u8 *md5_hash) -{ - int rc; - struct shash_desc *md5 = NULL; - - rc = cifs_alloc_hash("md5", &md5); - if (rc) - return rc; - - rc = crypto_shash_digest(md5, link_str, link_len, md5_hash); - if (rc) - cifs_dbg(VFS, "%s: Could not generate md5 hash\n", __func__); - cifs_free_hash(&md5); - return rc; -} - -static int parse_mf_symlink(const u8 *buf, unsigned int buf_len, unsigned int *_link_len, char **_link_str) { @@ -77,11 +61,7 @@ parse_mf_symlink(const u8 *buf, unsigned int buf_len, unsigned int *_link_len, if (link_len > CIFS_MF_SYMLINK_LINK_MAXLEN) return -EINVAL; - rc = symlink_hash(link_len, link_str, md5_hash); - if (rc) { - cifs_dbg(FYI, "%s: MD5 hash failure: %d\n", __func__, rc); - return rc; - } + md5(link_str, link_len, md5_hash); scnprintf(md5_str2, sizeof(md5_str2), CIFS_MF_SYMLINK_MD5_FORMAT, @@ -103,7 +83,6 @@ parse_mf_symlink(const u8 *buf, unsigned int buf_len, unsigned int *_link_len, static int format_mf_symlink(u8 *buf, unsigned int buf_len, const char *link_str) { - int rc; unsigned int link_len; unsigned int ofs; u8 md5_hash[16]; @@ -116,11 +95,7 @@ format_mf_symlink(u8 *buf, unsigned int buf_len, const char *link_str) if (link_len > CIFS_MF_SYMLINK_LINK_MAXLEN) return -ENAMETOOLONG; - rc = symlink_hash(link_len, link_str, md5_hash); - if (rc) { - cifs_dbg(FYI, "%s: MD5 hash failure: %d\n", __func__, rc); - return rc; - } + md5(link_str, link_len, md5_hash); scnprintf(buf, buf_len, CIFS_MF_SYMLINK_LEN_FORMAT CIFS_MF_SYMLINK_MD5_FORMAT, diff --git a/fs/smb/client/misc.c b/fs/smb/client/misc.c index dda6dece802a..e10123d8cd7d 100644 --- a/fs/smb/client/misc.c +++ b/fs/smb/client/misc.c @@ -916,6 +916,14 @@ parse_dfs_referrals(struct get_dfs_referral_rsp *rsp, u32 rsp_size, char *data_end; struct dfs_referral_level_3 *ref; + if (rsp_size < sizeof(*rsp)) { + cifs_dbg(VFS | ONCE, + "%s: header is malformed (size is %u, must be %zu)\n", + __func__, rsp_size, sizeof(*rsp)); + rc = -EINVAL; + goto parse_DFS_referrals_exit; + } + *num_of_nodes = le16_to_cpu(rsp->NumberOfReferrals); if (*num_of_nodes < 1) { @@ -925,6 +933,15 @@ parse_dfs_referrals(struct get_dfs_referral_rsp *rsp, u32 rsp_size, goto parse_DFS_referrals_exit; } + if (sizeof(*rsp) + *num_of_nodes * sizeof(REFERRAL3) > rsp_size) { + cifs_dbg(VFS | ONCE, + "%s: malformed buffer (size is %u, must be at least %zu)\n", + __func__, rsp_size, + sizeof(*rsp) + *num_of_nodes * sizeof(REFERRAL3)); + rc = -EINVAL; + goto parse_DFS_referrals_exit; + } + ref = (struct dfs_referral_level_3 *) &(rsp->referrals); if (ref->VersionNumber != cpu_to_le16(3)) { cifs_dbg(VFS, "Referrals of V%d version are not supported, should be V3\n", diff --git a/fs/smb/client/readdir.c b/fs/smb/client/readdir.c index 4e5460206397..f0ce26622a14 100644 --- a/fs/smb/client/readdir.c +++ b/fs/smb/client/readdir.c @@ -874,39 +874,42 @@ static void finished_cached_dirents_count(struct cached_dirents *cde, cde->is_valid = 1; } -static void add_cached_dirent(struct cached_dirents *cde, - struct dir_context *ctx, - 
const char *name, int namelen, - struct cifs_fattr *fattr, - struct file *file) +static bool add_cached_dirent(struct cached_dirents *cde, + struct dir_context *ctx, const char *name, + int namelen, struct cifs_fattr *fattr, + struct file *file) { struct cached_dirent *de; if (cde->file != file) - return; + return false; if (cde->is_valid || cde->is_failed) - return; + return false; if (ctx->pos != cde->pos) { cde->is_failed = 1; - return; + return false; } de = kzalloc(sizeof(*de), GFP_ATOMIC); if (de == NULL) { cde->is_failed = 1; - return; + return false; } de->namelen = namelen; de->name = kstrndup(name, namelen, GFP_ATOMIC); if (de->name == NULL) { kfree(de); cde->is_failed = 1; - return; + return false; } de->pos = ctx->pos; memcpy(&de->fattr, fattr, sizeof(struct cifs_fattr)); list_add_tail(&de->entry, &cde->entries); + /* update accounting */ + cde->entries_count++; + cde->bytes_used += sizeof(*de) + (size_t)namelen + 1; + return true; } static bool cifs_dir_emit(struct dir_context *ctx, @@ -915,7 +918,8 @@ static bool cifs_dir_emit(struct dir_context *ctx, struct cached_fid *cfid, struct file *file) { - bool rc; + size_t delta_bytes = 0; + bool rc, added = false; ino_t ino = cifs_uniqueid_to_ino_t(fattr->cf_uniqueid); rc = dir_emit(ctx, name, namelen, ino, fattr->cf_dtype); @@ -923,10 +927,20 @@ static bool cifs_dir_emit(struct dir_context *ctx, return rc; if (cfid) { + /* Cost of this entry */ + delta_bytes = sizeof(struct cached_dirent) + (size_t)namelen + 1; + mutex_lock(&cfid->dirents.de_mutex); - add_cached_dirent(&cfid->dirents, ctx, name, namelen, - fattr, file); + added = add_cached_dirent(&cfid->dirents, ctx, name, namelen, + fattr, file); mutex_unlock(&cfid->dirents.de_mutex); + + if (added) { + /* per-tcon then global for consistency with free path */ + atomic64_add((long long)delta_bytes, &cfid->cfids->total_dirents_bytes); + atomic_long_inc(&cfid->cfids->total_dirents_entries); + atomic64_add((long long)delta_bytes, &cifs_dircache_bytes_used); + } } return rc; diff --git a/fs/smb/client/sess.c b/fs/smb/client/sess.c index 0a8c2fcc9ded..ef3b498b0a02 100644 --- a/fs/smb/client/sess.c +++ b/fs/smb/client/sess.c @@ -584,7 +584,7 @@ cifs_ses_add_channel(struct cifs_ses *ses, * to sign packets before we generate the channel signing key * (we sign with the session key) */ - rc = smb311_crypto_shash_allocate(chan->server); + rc = smb3_crypto_shash_allocate(chan->server); if (rc) { cifs_dbg(VFS, "%s: crypto alloc failed\n", __func__); mutex_unlock(&ses->session_mutex); diff --git a/fs/smb/client/smb1ops.c b/fs/smb/client/smb1ops.c index a02d41d1ce4a..ca8f3dd7ff63 100644 --- a/fs/smb/client/smb1ops.c +++ b/fs/smb/client/smb1ops.c @@ -652,13 +652,71 @@ static int cifs_query_path_info(const unsigned int xid, #ifdef CONFIG_CIFS_XATTR /* + * For non-symlink WSL reparse points it is required to fetch + * EA $LXMOD which contains in its S_DT part the mandatory file type. 
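+ * (The S_DT part is the S_IFMT file-type portion of the WSL mode + * value, e.g. S_IFCHR or S_IFBLK for device nodes.)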
+ */ + if (!rc && data->reparse_point) { + struct smb2_file_full_ea_info *ea; + u32 next = 0; + + ea = (struct smb2_file_full_ea_info *)data->wsl.eas; + do { + ea = (void *)((u8 *)ea + next); + next = le32_to_cpu(ea->next_entry_offset); + } while (next); + if (le16_to_cpu(ea->ea_value_length)) { + ea->next_entry_offset = cpu_to_le32(ALIGN(sizeof(*ea) + + ea->ea_name_length + 1 + + le16_to_cpu(ea->ea_value_length), 4)); + ea = (void *)((u8 *)ea + le32_to_cpu(ea->next_entry_offset)); + } + + rc = CIFSSMBQAllEAs(xid, tcon, full_path, SMB2_WSL_XATTR_MODE, + &ea->ea_data[SMB2_WSL_XATTR_NAME_LEN + 1], + SMB2_WSL_XATTR_MODE_SIZE, cifs_sb); + if (rc == SMB2_WSL_XATTR_MODE_SIZE) { + ea->next_entry_offset = cpu_to_le32(0); + ea->flags = 0; + ea->ea_name_length = SMB2_WSL_XATTR_NAME_LEN; + ea->ea_value_length = cpu_to_le16(SMB2_WSL_XATTR_MODE_SIZE); + memcpy(&ea->ea_data[0], SMB2_WSL_XATTR_MODE, SMB2_WSL_XATTR_NAME_LEN + 1); + data->wsl.eas_len += ALIGN(sizeof(*ea) + SMB2_WSL_XATTR_NAME_LEN + 1 + + SMB2_WSL_XATTR_MODE_SIZE, 4); + rc = 0; + } else if (rc >= 0) { + /* It is an error if EA $LXMOD has wrong size. */ + rc = -EINVAL; + } else { + /* + * In all other cases ignore error if fetching + * of EA $LXMOD failed. It is needed only for + * non-symlink WSL reparse points and wsl_to_fattr() + * handles the case when EA is missing. + */ + rc = 0; + } + } + + /* * For WSL CHR and BLK reparse points it is required to fetch * EA $LXDEV which contains major and minor device numbers. */ if (!rc && data->reparse_point) { struct smb2_file_full_ea_info *ea; + u32 next = 0; ea = (struct smb2_file_full_ea_info *)data->wsl.eas; + do { + ea = (void *)((u8 *)ea + next); + next = le32_to_cpu(ea->next_entry_offset); + } while (next); + if (le16_to_cpu(ea->ea_value_length)) { + ea->next_entry_offset = cpu_to_le32(ALIGN(sizeof(*ea) + + ea->ea_name_length + 1 + + le16_to_cpu(ea->ea_value_length), 4)); + ea = (void *)((u8 *)ea + le32_to_cpu(ea->next_entry_offset)); + } + rc = CIFSSMBQAllEAs(xid, tcon, full_path, SMB2_WSL_XATTR_DEV, &ea->ea_data[SMB2_WSL_XATTR_NAME_LEN + 1], SMB2_WSL_XATTR_DEV_SIZE, cifs_sb); @@ -668,8 +726,8 @@ static int cifs_query_path_info(const unsigned int xid, ea->next_entry_offset = cpu_to_le32(0); ea->flags = 0; ea->ea_name_length = SMB2_WSL_XATTR_NAME_LEN; ea->ea_value_length = cpu_to_le16(SMB2_WSL_XATTR_DEV_SIZE); memcpy(&ea->ea_data[0], SMB2_WSL_XATTR_DEV, SMB2_WSL_XATTR_NAME_LEN + 1); - data->wsl.eas_len = sizeof(*ea) + SMB2_WSL_XATTR_NAME_LEN + 1 + - SMB2_WSL_XATTR_DEV_SIZE; + data->wsl.eas_len += ALIGN(sizeof(*ea) + SMB2_WSL_XATTR_NAME_LEN + 1 + + SMB2_WSL_XATTR_DEV_SIZE, 4); rc = 0; } else if (rc >= 0) { /* It is an error if EA $LXDEV has wrong size.
*/ @@ -818,6 +876,11 @@ cifs_mkdir_setinfo(struct inode *inode, const char *full_path, info.Attributes = cpu_to_le32(dosattrs); rc = CIFSSMBSetPathInfo(xid, tcon, full_path, &info, cifs_sb->local_nls, cifs_sb); + if (rc == -EOPNOTSUPP || rc == -EINVAL) + rc = SMBSetInformation(xid, tcon, full_path, + info.Attributes, + 0 /* do not change write time */, + cifs_sb->local_nls, cifs_sb); if (rc == 0) cifsInode->cifsAttrs = dosattrs; } @@ -974,7 +1037,7 @@ smb_set_file_info(struct inode *inode, const char *full_path, .tcon = tcon, .cifs_sb = cifs_sb, .desired_access = SYNCHRONIZE | FILE_WRITE_ATTRIBUTES, - .create_options = cifs_create_options(cifs_sb, CREATE_NOT_DIR), + .create_options = cifs_create_options(cifs_sb, 0), .disposition = FILE_OPEN, .path = full_path, .fid = &fid, diff --git a/fs/smb/client/smb2inode.c b/fs/smb/client/smb2inode.c index 0985db9f86e5..09e3fc81d7cb 100644 --- a/fs/smb/client/smb2inode.c +++ b/fs/smb/client/smb2inode.c @@ -676,7 +676,7 @@ finished: idata->fi.EndOfFile = create_rsp->EndofFile; if (le32_to_cpu(idata->fi.NumberOfLinks) == 0) idata->fi.NumberOfLinks = cpu_to_le32(1); /* dummy value */ - idata->fi.DeletePending = 0; + idata->fi.DeletePending = 0; /* successful open = not delete pending */ idata->fi.Directory = !!(le32_to_cpu(create_rsp->FileAttributes) & ATTR_DIRECTORY); /* smb2_parse_contexts() fills idata->fi.IndexNumber */ @@ -1382,31 +1382,33 @@ int smb2_set_file_info(struct inode *inode, const char *full_path, FILE_BASIC_INFO *buf, const unsigned int xid) { - struct cifs_open_parms oparms; + struct kvec in_iov = { .iov_base = buf, .iov_len = sizeof(*buf), }; struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct cifsFileInfo *cfile = NULL; + struct cifs_open_parms oparms; struct tcon_link *tlink; struct cifs_tcon *tcon; - struct cifsFileInfo *cfile; - struct kvec in_iov = { .iov_base = buf, .iov_len = sizeof(*buf), }; - int rc; - - if ((buf->CreationTime == 0) && (buf->LastAccessTime == 0) && - (buf->LastWriteTime == 0) && (buf->ChangeTime == 0) && - (buf->Attributes == 0)) - return 0; /* would be a no op, no sense sending this */ + int rc = 0; tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) return PTR_ERR(tlink); tcon = tlink_tcon(tlink); - cifs_get_writable_path(tcon, full_path, FIND_WR_ANY, &cfile); + if ((buf->CreationTime == 0) && (buf->LastAccessTime == 0) && + (buf->LastWriteTime == 0) && (buf->ChangeTime == 0)) { + if (buf->Attributes == 0) + goto out; /* would be a no op, no sense sending this */ + cifs_get_writable_path(tcon, full_path, FIND_WR_ANY, &cfile); + } + oparms = CIFS_OPARMS(cifs_sb, tcon, full_path, FILE_WRITE_ATTRIBUTES, FILE_OPEN, 0, ACL_NO_MODE); rc = smb2_compound_op(xid, tcon, cifs_sb, full_path, &oparms, &in_iov, &(int){SMB2_OP_SET_INFO}, 1, cfile, NULL, NULL, NULL); +out: cifs_put_tlink(tlink); return rc; } diff --git a/fs/smb/client/smb2misc.c b/fs/smb/client/smb2misc.c index 89d933b4a8bc..96bfe4c63ccf 100644 --- a/fs/smb/client/smb2misc.c +++ b/fs/smb/client/smb2misc.c @@ -7,6 +7,7 @@ * Pavel Shilovsky (pshilovsky@samba.org) 2012 * */ +#include <crypto/sha2.h> #include <linux/ctype.h> #include "cifsglob.h" #include "cifsproto.h" @@ -888,13 +889,13 @@ smb2_handle_cancelled_mid(struct mid_q_entry *mid, struct TCP_Server_Info *serve * @iov: array containing the SMB request we will send to the server * @nvec: number of array entries for the iov */ -int +void smb311_update_preauth_hash(struct cifs_ses *ses, struct TCP_Server_Info *server, struct kvec *iov, int nvec) { - int i, rc; + int i; struct smb2_hdr *hdr; - 
struct shash_desc *sha512 = NULL; + struct sha512_ctx sha_ctx; hdr = (struct smb2_hdr *)iov[0].iov_base; /* neg prot are always taken */ @@ -907,52 +908,22 @@ smb311_update_preauth_hash(struct cifs_ses *ses, struct TCP_Server_Info *server, * and we can test it. Preauth requires 3.1.1 for now. */ if (server->dialect != SMB311_PROT_ID) - return 0; + return; if (hdr->Command != SMB2_SESSION_SETUP) - return 0; + return; /* skip last sess setup response */ if ((hdr->Flags & SMB2_FLAGS_SERVER_TO_REDIR) && (hdr->Status == NT_STATUS_OK || (hdr->Status != cpu_to_le32(NT_STATUS_MORE_PROCESSING_REQUIRED)))) - return 0; + return; ok: - rc = smb311_crypto_shash_allocate(server); - if (rc) - return rc; - - sha512 = server->secmech.sha512; - rc = crypto_shash_init(sha512); - if (rc) { - cifs_dbg(VFS, "%s: Could not init sha512 shash\n", __func__); - return rc; - } - - rc = crypto_shash_update(sha512, ses->preauth_sha_hash, - SMB2_PREAUTH_HASH_SIZE); - if (rc) { - cifs_dbg(VFS, "%s: Could not update sha512 shash\n", __func__); - return rc; - } - - for (i = 0; i < nvec; i++) { - rc = crypto_shash_update(sha512, iov[i].iov_base, iov[i].iov_len); - if (rc) { - cifs_dbg(VFS, "%s: Could not update sha512 shash\n", - __func__); - return rc; - } - } - - rc = crypto_shash_final(sha512, ses->preauth_sha_hash); - if (rc) { - cifs_dbg(VFS, "%s: Could not finalize sha512 shash\n", - __func__); - return rc; - } - - return 0; + sha512_init(&sha_ctx); + sha512_update(&sha_ctx, ses->preauth_sha_hash, SMB2_PREAUTH_HASH_SIZE); + for (i = 0; i < nvec; i++) + sha512_update(&sha_ctx, iov[i].iov_base, iov[i].iov_len); + sha512_final(&sha_ctx, ses->preauth_sha_hash); } diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index 4711a23c5b38..95cd484cfbba 100644 --- a/fs/smb/client/smb2ops.c +++ b/fs/smb/client/smb2ops.c @@ -954,11 +954,8 @@ smb2_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon, rc = open_cached_dir(xid, tcon, full_path, cifs_sb, true, &cfid); if (!rc) { - if (cfid->has_lease) { - close_cached_dir(cfid); - return 0; - } close_cached_dir(cfid); + return 0; } utf16_path = cifs_convert_path_to_utf16(full_path, cifs_sb); @@ -1806,140 +1803,226 @@ free_vars: return rc; } +/** + * calc_chunk_count - calculates the number of chunks to be filled in the Chunks[] + * array of struct copychunk_ioctl + * + * @tcon: destination file tcon + * @bytes_left: how many bytes are left to copy + * + * Return: maximum number of chunks with which Chunks[] can be filled. + */ +static inline u32 +calc_chunk_count(struct cifs_tcon *tcon, u64 bytes_left) +{ + u32 max_chunks = READ_ONCE(tcon->max_chunks); + u32 max_bytes_copy = READ_ONCE(tcon->max_bytes_copy); + u32 max_bytes_chunk = READ_ONCE(tcon->max_bytes_chunk); + u64 need; + u32 allowed; + + if (!max_bytes_chunk || !max_bytes_copy || !max_chunks) + return 0; + + /* chunks needed for the remaining bytes */ + need = DIV_ROUND_UP_ULL(bytes_left, max_bytes_chunk); + /* chunks allowed per cc request */ + allowed = DIV_ROUND_UP(max_bytes_copy, max_bytes_chunk); + + return (u32)umin(need, umin(max_chunks, allowed)); +} + +/** + * smb2_copychunk_range - server-side copy of data range + * + * @xid: transaction id + * @src_file: source file + * @dst_file: destination file + * @src_off: source file byte offset + * @len: number of bytes to copy + * @dst_off: destination file byte offset + * + * Obtains a resume key for @src_file and issues FSCTL_SRV_COPYCHUNK_WRITE + * IOCTLs, splitting the request into chunks limited by tcon->max_*. 
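The smb311_update_preauth_hash() rewrite above drops the crypto_shash plumbing for the SHA-512 library interface from <crypto/sha2.h>; the library calls cannot fail, which is why the function now returns void. What it maintains is the SMB 3.1.1 preauth integrity chain: every relevant negotiate/session-setup packet is folded into a running 64-byte digest. The chaining step in isolation, as a sketch (helper name and parameters are illustrative; the real code walks the request iovecs):

#include <linux/types.h>
#include <crypto/sha2.h>

/* Fold one packet into the running preauth hash: H = SHA-512(H || msg). */
static void preauth_fold(u8 hash[SHA512_DIGEST_SIZE],
			 const void *msg, size_t msg_len)
{
	struct sha512_ctx ctx;

	sha512_init(&ctx);
	sha512_update(&ctx, hash, SHA512_DIGEST_SIZE);	/* previous value */
	sha512_update(&ctx, msg, msg_len);		/* this packet */
	sha512_final(&ctx, hash);			/* becomes the new value */
}

Both peers fold the identical packets, so the final digest binds the signing keys to the exact negotiate exchange; any tampering shows up as a session-setup signature mismatch.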
+ * + * Return: @len on success; negative errno on failure. + */ static ssize_t smb2_copychunk_range(const unsigned int xid, - struct cifsFileInfo *srcfile, - struct cifsFileInfo *trgtfile, u64 src_off, - u64 len, u64 dest_off) + struct cifsFileInfo *src_file, + struct cifsFileInfo *dst_file, + u64 src_off, + u64 len, + u64 dst_off) { - int rc; - unsigned int ret_data_len; - struct copychunk_ioctl *pcchunk; - struct copychunk_ioctl_rsp *retbuf = NULL; + int rc = 0; + unsigned int ret_data_len = 0; + struct copychunk_ioctl *cc_req = NULL; + struct copychunk_ioctl_rsp *cc_rsp = NULL; struct cifs_tcon *tcon; - int chunks_copied = 0; - bool chunk_sizes_updated = false; - ssize_t bytes_written, total_bytes_written = 0; + struct copychunk *chunk; + u32 chunks, chunk_count, chunk_bytes; + u32 copy_bytes, copy_bytes_left; + u32 chunks_written, bytes_written; + u64 total_bytes_left = len; + u64 src_off_prev, dst_off_prev; + u32 retries = 0; + + tcon = tlink_tcon(dst_file->tlink); + + trace_smb3_copychunk_enter(xid, src_file->fid.volatile_fid, + dst_file->fid.volatile_fid, tcon->tid, + tcon->ses->Suid, src_off, dst_off, len); + +retry: + chunk_count = calc_chunk_count(tcon, total_bytes_left); + if (!chunk_count) { + rc = -EOPNOTSUPP; + goto out; + } - pcchunk = kmalloc(sizeof(struct copychunk_ioctl), GFP_KERNEL); - if (pcchunk == NULL) - return -ENOMEM; + cc_req = kzalloc(struct_size(cc_req, Chunks, chunk_count), GFP_KERNEL); + if (!cc_req) { + rc = -ENOMEM; + goto out; + } - cifs_dbg(FYI, "%s: about to call request res key\n", __func__); /* Request a key from the server to identify the source of the copy */ - rc = SMB2_request_res_key(xid, tlink_tcon(srcfile->tlink), - srcfile->fid.persistent_fid, - srcfile->fid.volatile_fid, pcchunk); + rc = SMB2_request_res_key(xid, + tlink_tcon(src_file->tlink), + src_file->fid.persistent_fid, + src_file->fid.volatile_fid, + cc_req); - /* Note: request_res_key sets res_key null only if rc !=0 */ + /* Note: request_res_key sets res_key null only if rc != 0 */ if (rc) - goto cchunk_out; + goto out; + + while (total_bytes_left > 0) { - /* For now array only one chunk long, will make more flexible later */ - pcchunk->ChunkCount = cpu_to_le32(1); - pcchunk->Reserved = 0; - pcchunk->Reserved2 = 0; + /* Store previous offsets to allow rewind */ + src_off_prev = src_off; + dst_off_prev = dst_off; - tcon = tlink_tcon(trgtfile->tlink); + chunks = 0; + copy_bytes = 0; + copy_bytes_left = umin(total_bytes_left, tcon->max_bytes_copy); + while (copy_bytes_left > 0 && chunks < chunk_count) { + chunk = &cc_req->Chunks[chunks++]; - trace_smb3_copychunk_enter(xid, srcfile->fid.volatile_fid, - trgtfile->fid.volatile_fid, tcon->tid, - tcon->ses->Suid, src_off, dest_off, len); + chunk->SourceOffset = cpu_to_le64(src_off); + chunk->TargetOffset = cpu_to_le64(dst_off); - while (len > 0) { - pcchunk->SourceOffset = cpu_to_le64(src_off); - pcchunk->TargetOffset = cpu_to_le64(dest_off); - pcchunk->Length = - cpu_to_le32(min_t(u64, len, tcon->max_bytes_chunk)); + chunk_bytes = umin(copy_bytes_left, tcon->max_bytes_chunk); + + chunk->Length = cpu_to_le32(chunk_bytes); + /* Buffer is zeroed, no need to set chunk->Reserved = 0 */ + + src_off += chunk_bytes; + dst_off += chunk_bytes; + + copy_bytes_left -= chunk_bytes; + copy_bytes += chunk_bytes; + } + + cc_req->ChunkCount = cpu_to_le32(chunks); + /* Buffer is zeroed, no need to set cc_req->Reserved = 0 */ /* Request server copy to target from src identified by key */ - kfree(retbuf); - retbuf = NULL; - rc = SMB2_ioctl(xid, tcon, 
trgtfile->fid.persistent_fid, - trgtfile->fid.volatile_fid, FSCTL_SRV_COPYCHUNK_WRITE, - (char *)pcchunk, sizeof(struct copychunk_ioctl), - CIFSMaxBufSize, (char **)&retbuf, &ret_data_len); + kfree(cc_rsp); + cc_rsp = NULL; + rc = SMB2_ioctl(xid, tcon, dst_file->fid.persistent_fid, + dst_file->fid.volatile_fid, FSCTL_SRV_COPYCHUNK_WRITE, + (char *)cc_req, struct_size(cc_req, Chunks, chunks), + CIFSMaxBufSize, (char **)&cc_rsp, &ret_data_len); + + if (rc && rc != -EINVAL) + goto out; + + if (unlikely(ret_data_len != sizeof(*cc_rsp))) { + cifs_tcon_dbg(VFS, "Copychunk invalid response: size %u/%zu\n", + ret_data_len, sizeof(*cc_rsp)); + rc = -EIO; + goto out; + } + + bytes_written = le32_to_cpu(cc_rsp->TotalBytesWritten); + chunks_written = le32_to_cpu(cc_rsp->ChunksWritten); + chunk_bytes = le32_to_cpu(cc_rsp->ChunkBytesWritten); + if (rc == 0) { - if (ret_data_len != - sizeof(struct copychunk_ioctl_rsp)) { - cifs_tcon_dbg(VFS, "Invalid cchunk response size\n"); - rc = -EIO; - goto cchunk_out; - } - if (retbuf->TotalBytesWritten == 0) { - cifs_dbg(FYI, "no bytes copied\n"); + /* Check if server claimed to write more than we asked */ + if (unlikely(!bytes_written || bytes_written > copy_bytes || + !chunks_written || chunks_written > chunks)) { + cifs_tcon_dbg(VFS, "Copychunk invalid response: bytes written %u/%u, chunks written %u/%u\n", + bytes_written, copy_bytes, chunks_written, chunks); rc = -EIO; - goto cchunk_out; - } - /* - * Check if server claimed to write more than we asked - */ - if (le32_to_cpu(retbuf->TotalBytesWritten) > - le32_to_cpu(pcchunk->Length)) { - cifs_tcon_dbg(VFS, "Invalid copy chunk response\n"); - rc = -EIO; - goto cchunk_out; + goto out; } - if (le32_to_cpu(retbuf->ChunksWritten) != 1) { - cifs_tcon_dbg(VFS, "Invalid num chunks written\n"); - rc = -EIO; - goto cchunk_out; + + /* Partial write: rewind */ + if (bytes_written < copy_bytes) { + u32 delta = copy_bytes - bytes_written; + + src_off -= delta; + dst_off -= delta; } - chunks_copied++; - - bytes_written = le32_to_cpu(retbuf->TotalBytesWritten); - src_off += bytes_written; - dest_off += bytes_written; - len -= bytes_written; - total_bytes_written += bytes_written; - - cifs_dbg(FYI, "Chunks %d PartialChunk %d Total %zu\n", - le32_to_cpu(retbuf->ChunksWritten), - le32_to_cpu(retbuf->ChunkBytesWritten), - bytes_written); - trace_smb3_copychunk_done(xid, srcfile->fid.volatile_fid, - trgtfile->fid.volatile_fid, tcon->tid, - tcon->ses->Suid, src_off, dest_off, len); - } else if (rc == -EINVAL) { - if (ret_data_len != sizeof(struct copychunk_ioctl_rsp)) - goto cchunk_out; - - cifs_dbg(FYI, "MaxChunks %d BytesChunk %d MaxCopy %d\n", - le32_to_cpu(retbuf->ChunksWritten), - le32_to_cpu(retbuf->ChunkBytesWritten), - le32_to_cpu(retbuf->TotalBytesWritten)); - /* - * Check if this is the first request using these sizes, - * (ie check if copy succeed once with original sizes - * and check if the server gave us different sizes after - * we already updated max sizes on previous request). 
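Together with calc_chunk_count() above, the rewritten loop packs as many chunks into each FSCTL_SRV_COPYCHUNK_WRITE as the cached tcon limits allow, instead of the old one-chunk-per-ioctl behaviour. As a worked example, take the limits Windows servers conventionally advertise (256 chunks, 1 MiB per chunk, 16 MiB per request; illustrative numbers, not values from this patch): a 48 MiB copy needs 48 chunks, only 16 of which fit under the 16 MiB request cap, so the copy finishes in three ioctls of sixteen 1 MiB chunks. The same arithmetic as a compilable userspace sketch:

#include <stdint.h>
#include <stdio.h>

/* Mirror of the kernel's calc_chunk_count() bounds; illustrative only. */
static uint32_t calc_chunk_count(uint64_t bytes_left, uint32_t max_chunks,
				 uint32_t max_bytes_copy,
				 uint32_t max_bytes_chunk)
{
	/* chunks needed for the remaining bytes */
	uint64_t need = (bytes_left + max_bytes_chunk - 1) / max_bytes_chunk;
	/* chunks allowed per request */
	uint32_t allowed = (max_bytes_copy + max_bytes_chunk - 1) / max_bytes_chunk;
	uint32_t cap = max_chunks < allowed ? max_chunks : allowed;

	return need < cap ? (uint32_t)need : cap;
}

int main(void)
{
	uint32_t n = calc_chunk_count(48ull << 20, 256, 16u << 20, 1u << 20);

	printf("%u chunks per request\n", n);	/* prints 16 */
	return 0;
}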
- * if not then why is the server returning an error now - */ - if ((chunks_copied != 0) || chunk_sizes_updated) - goto cchunk_out; - - /* Check that server is not asking us to grow size */ - if (le32_to_cpu(retbuf->ChunkBytesWritten) < - tcon->max_bytes_chunk) - tcon->max_bytes_chunk = - le32_to_cpu(retbuf->ChunkBytesWritten); - else - goto cchunk_out; /* server gave us bogus size */ + total_bytes_left -= bytes_written; + continue; + } - /* No need to change MaxChunks since already set to 1 */ - chunk_sizes_updated = true; - } else - goto cchunk_out; + /* + * The server may be asking us to reduce our limits here. + * + * Note: as per MS-SMB2 2.2.32.1, the values returned + * in cc_rsp are not guaranteed to be lower than what + * existed before, so only update a limit when it shrank. + */ + if (bytes_written < tcon->max_bytes_copy) { + cifs_tcon_dbg(FYI, "Copychunk MaxBytesCopy updated: %u -> %u\n", + tcon->max_bytes_copy, bytes_written); + tcon->max_bytes_copy = bytes_written; + } + + if (chunks_written < tcon->max_chunks) { + cifs_tcon_dbg(FYI, "Copychunk MaxChunks updated: %u -> %u\n", + tcon->max_chunks, chunks_written); + tcon->max_chunks = chunks_written; + } + + if (chunk_bytes < tcon->max_bytes_chunk) { + cifs_tcon_dbg(FYI, "Copychunk MaxBytesChunk updated: %u -> %u\n", + tcon->max_bytes_chunk, chunk_bytes); + tcon->max_bytes_chunk = chunk_bytes; + } + + /* rewind to the saved offsets and retry */ + if (retries++ < 2) { + src_off = src_off_prev; + dst_off = dst_off_prev; + kfree(cc_req); + cc_req = NULL; + goto retry; + } + + break; } -cchunk_out: - kfree(pcchunk); - kfree(retbuf); - if (rc) +out: + kfree(cc_req); + kfree(cc_rsp); + if (rc) { + trace_smb3_copychunk_err(xid, src_file->fid.volatile_fid, + dst_file->fid.volatile_fid, tcon->tid, + tcon->ses->Suid, src_off, dst_off, len, rc); return rc; - else - return total_bytes_written; + } else { + trace_smb3_copychunk_done(xid, src_file->fid.volatile_fid, + dst_file->fid.volatile_fid, tcon->tid, + tcon->ses->Suid, src_off, dst_off, len); + return len; + } } static int @@ -3129,8 +3212,7 @@ get_smb2_acl_by_path(struct cifs_sb_info *cifs_sb, utf16_path = cifs_convert_path_to_utf16(path, cifs_sb); if (!utf16_path) { rc = -ENOMEM; - free_xid(xid); - return ERR_PTR(rc); + goto put_tlink; } oparms = (struct cifs_open_parms) { @@ -3162,6 +3244,7 @@ get_smb2_acl_by_path(struct cifs_sb_info *cifs_sb, SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid); } +put_tlink: cifs_put_tlink(tlink); free_xid(xid); @@ -3202,8 +3285,7 @@ set_smb2_acl(struct smb_ntsd *pnntsd, __u32 acllen, utf16_path = cifs_convert_path_to_utf16(path, cifs_sb); if (!utf16_path) { rc = -ENOMEM; - free_xid(xid); - return rc; + goto put_tlink; } oparms = (struct cifs_open_parms) { @@ -3224,6 +3306,7 @@ set_smb2_acl(struct smb_ntsd *pnntsd, __u32 acllen, SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid); } +put_tlink: cifs_put_tlink(tlink); free_xid(xid); return rc; @@ -3284,7 +3367,6 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon, trace_smb3_zero_enter(xid, cfile->fid.persistent_fid, tcon->tid, ses->Suid, offset, len); - inode_lock(inode); filemap_invalidate_lock(inode->i_mapping); i_size = i_size_read(inode); @@ -3302,6 +3384,7 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon, * first, otherwise the data may be inconsistent with the server. 
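For the renegotiation path above: when the server fails a copychunk request with STATUS_INVALID_PARAMETER, the response body still carries its actual maximums in the same three fields (MS-SMB2 2.2.32.1), so the new code clamps the cached tcon limits downward and retries from the saved offsets at most twice. The clamp step in isolation (hypothetical helper name; the response fields are the ones read above, as it would sit in smb2ops.c):

/* Sketch: shrink the cached per-tcon copychunk limits from a response.
 * Values are only ever lowered, never raised. */
static void clamp_copychunk_limits(struct cifs_tcon *tcon,
				   const struct copychunk_ioctl_rsp *cc_rsp)
{
	u32 v;

	v = le32_to_cpu(cc_rsp->TotalBytesWritten);
	if (v < tcon->max_bytes_copy)
		tcon->max_bytes_copy = v;

	v = le32_to_cpu(cc_rsp->ChunksWritten);
	if (v < tcon->max_chunks)
		tcon->max_chunks = v;

	v = le32_to_cpu(cc_rsp->ChunkBytesWritten);
	if (v < tcon->max_bytes_chunk)
		tcon->max_bytes_chunk = v;
}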
*/ truncate_pagecache_range(inode, offset, offset + len - 1); + netfs_wait_for_outstanding_io(inode); /* if file not oplocked can't be sure whether asking to extend size */ rc = -EOPNOTSUPP; @@ -3330,7 +3413,6 @@ static long smb3_zero_range(struct file *file, struct cifs_tcon *tcon, zero_range_exit: filemap_invalidate_unlock(inode->i_mapping); - inode_unlock(inode); free_xid(xid); if (rc) trace_smb3_zero_err(xid, cfile->fid.persistent_fid, tcon->tid, @@ -3354,7 +3436,6 @@ static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon, xid = get_xid(); - inode_lock(inode); /* Need to make file sparse, if not already, before freeing range. */ /* Consider adding equivalent for compressed since it could also work */ if (!smb2_set_sparse(xid, tcon, cfile, inode, set_sparse)) { @@ -3368,6 +3449,7 @@ static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon, * caches first, otherwise the data may be inconsistent with the server. */ truncate_pagecache_range(inode, offset, offset + len - 1); + netfs_wait_for_outstanding_io(inode); cifs_dbg(FYI, "Offset %lld len %lld\n", offset, len); @@ -3402,7 +3484,6 @@ static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon, unlock: filemap_invalidate_unlock(inode->i_mapping); out: - inode_unlock(inode); free_xid(xid); return rc; } @@ -3666,8 +3747,6 @@ static long smb3_collapse_range(struct file *file, struct cifs_tcon *tcon, xid = get_xid(); - inode_lock(inode); - old_eof = i_size_read(inode); if ((off >= old_eof) || off + len >= old_eof) { @@ -3682,6 +3761,7 @@ static long smb3_collapse_range(struct file *file, struct cifs_tcon *tcon, truncate_pagecache_range(inode, off, old_eof); ictx->zero_point = old_eof; + netfs_wait_for_outstanding_io(inode); rc = smb2_copychunk_range(xid, cfile, cfile, off + len, old_eof - off - len, off); @@ -3702,8 +3782,7 @@ static long smb3_collapse_range(struct file *file, struct cifs_tcon *tcon, fscache_resize_cookie(cifs_inode_cookie(inode), new_eof); out_2: filemap_invalidate_unlock(inode->i_mapping); - out: - inode_unlock(inode); +out: free_xid(xid); return rc; } @@ -3720,8 +3799,6 @@ static long smb3_insert_range(struct file *file, struct cifs_tcon *tcon, xid = get_xid(); - inode_lock(inode); - old_eof = i_size_read(inode); if (off >= old_eof) { rc = -EINVAL; @@ -3736,6 +3813,7 @@ static long smb3_insert_range(struct file *file, struct cifs_tcon *tcon, if (rc < 0) goto out_2; truncate_pagecache_range(inode, off, old_eof); + netfs_wait_for_outstanding_io(inode); rc = SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid, cfile->fid.volatile_fid, cfile->pid, new_eof); @@ -3758,8 +3836,7 @@ static long smb3_insert_range(struct file *file, struct cifs_tcon *tcon, rc = 0; out_2: filemap_invalidate_unlock(inode->i_mapping); - out: - inode_unlock(inode); +out: free_xid(xid); return rc; } @@ -4219,7 +4296,7 @@ fill_transform_hdr(struct smb2_transform_hdr *tr_hdr, unsigned int orig_len, static void *smb2_aead_req_alloc(struct crypto_aead *tfm, const struct smb_rqst *rqst, int num_rqst, const u8 *sig, u8 **iv, struct aead_request **req, struct sg_table *sgt, - unsigned int *num_sgs, size_t *sensitive_size) + unsigned int *num_sgs) { unsigned int req_size = sizeof(**req) + crypto_aead_reqsize(tfm); unsigned int iv_size = crypto_aead_ivsize(tfm); @@ -4236,9 +4313,8 @@ static void *smb2_aead_req_alloc(struct crypto_aead *tfm, const struct smb_rqst len += req_size; len = ALIGN(len, __alignof__(struct scatterlist)); len += array_size(*num_sgs, sizeof(struct scatterlist)); - *sensitive_size = len; - p = 
kvzalloc(len, GFP_NOFS); + p = kzalloc(len, GFP_NOFS); if (!p) return ERR_PTR(-ENOMEM); @@ -4252,16 +4328,14 @@ static void *smb2_aead_req_alloc(struct crypto_aead *tfm, const struct smb_rqst static void *smb2_get_aead_req(struct crypto_aead *tfm, struct smb_rqst *rqst, int num_rqst, const u8 *sig, u8 **iv, - struct aead_request **req, struct scatterlist **sgl, - size_t *sensitive_size) + struct aead_request **req, struct scatterlist **sgl) { struct sg_table sgtable = {}; unsigned int skip, num_sgs, i, j; ssize_t rc; void *p; - p = smb2_aead_req_alloc(tfm, rqst, num_rqst, sig, iv, req, &sgtable, - &num_sgs, sensitive_size); + p = smb2_aead_req_alloc(tfm, rqst, num_rqst, sig, iv, req, &sgtable, &num_sgs); if (IS_ERR(p)) return ERR_CAST(p); @@ -4350,7 +4424,6 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst, DECLARE_CRYPTO_WAIT(wait); unsigned int crypt_len = le32_to_cpu(tr_hdr->OriginalMessageSize); void *creq; - size_t sensitive_size; rc = smb2_get_enc_key(server, le64_to_cpu(tr_hdr->SessionId), enc, key); if (rc) { @@ -4376,8 +4449,7 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst, return rc; } - creq = smb2_get_aead_req(tfm, rqst, num_rqst, sign, &iv, &req, &sg, - &sensitive_size); + creq = smb2_get_aead_req(tfm, rqst, num_rqst, sign, &iv, &req, &sg); if (IS_ERR(creq)) return PTR_ERR(creq); @@ -4407,7 +4479,7 @@ crypt_message(struct TCP_Server_Info *server, int num_rqst, if (!rc && enc) memcpy(&tr_hdr->Signature, sign, SMB2_SIGNATURE_SIZE); - kvfree_sensitive(creq, sensitive_size); + kfree_sensitive(creq); return rc; } @@ -4658,7 +4730,7 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, unsigned int pad_len; struct cifs_io_subrequest *rdata = mid->callback_data; struct smb2_hdr *shdr = (struct smb2_hdr *)buf; - int length; + size_t copied; bool use_rdma_mr = false; if (shdr->Command != SMB2_READ) { @@ -4771,10 +4843,10 @@ handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid, } else if (buf_len >= data_offset + data_len) { /* read response payload is in buf */ WARN_ONCE(buffer, "read data can be either in buf or in buffer"); - length = copy_to_iter(buf + data_offset, data_len, &rdata->subreq.io_iter); - if (length < 0) - return length; - rdata->got_bytes = data_len; + copied = copy_to_iter(buf + data_offset, data_len, &rdata->subreq.io_iter); + if (copied == 0) + return -EIO; + rdata->got_bytes = copied; } else { /* read response payload cannot be in both buf and pages */ WARN_ONCE(1, "buf can not contain only a part of read data"); diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index 1c63d2c9cc9c..b0739a2661bf 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -240,8 +240,8 @@ smb2_reconnect(__le16 smb2_command, struct cifs_tcon *tcon, */ if (smb2_command != SMB2_TREE_DISCONNECT) { spin_unlock(&tcon->tc_lock); - cifs_dbg(FYI, "can not send cmd %d while umounting\n", - smb2_command); + cifs_tcon_dbg(FYI, "can not send cmd %d while umounting\n", + smb2_command); return -ENODEV; } } @@ -296,9 +296,9 @@ again: return 0; } spin_unlock(&ses->chan_lock); - cifs_dbg(FYI, "sess reconnect mask: 0x%lx, tcon reconnect: %d", - tcon->ses->chans_need_reconnect, - tcon->need_reconnect); + cifs_tcon_dbg(FYI, "sess reconnect mask: 0x%lx, tcon reconnect: %d\n", + tcon->ses->chans_need_reconnect, + tcon->need_reconnect); mutex_lock(&ses->session_mutex); /* @@ -392,11 +392,11 @@ skip_sess_setup: rc = cifs_tree_connect(0, tcon); - cifs_dbg(FYI, "reconnect tcon rc = %d\n", rc); + cifs_tcon_dbg(FYI, 
"reconnect tcon rc = %d\n", rc); if (rc) { /* If sess reconnected but tcon didn't, something strange ... */ mutex_unlock(&ses->session_mutex); - cifs_dbg(VFS, "reconnect tcon failed rc = %d\n", rc); + cifs_tcon_dbg(VFS, "reconnect tcon failed rc = %d\n", rc); goto out; } @@ -442,8 +442,8 @@ skip_sess_setup: from_reconnect); goto skip_add_channels; } else if (rc) - cifs_dbg(FYI, "%s: failed to query server interfaces: %d\n", - __func__, rc); + cifs_tcon_dbg(FYI, "%s: failed to query server interfaces: %d\n", + __func__, rc); if (ses->chan_max > ses->chan_count && ses->iface_count && @@ -3277,7 +3277,7 @@ replay_again: buf->EndOfFile = rsp->EndofFile; buf->Attributes = rsp->FileAttributes; buf->NumberOfLinks = cpu_to_le32(1); - buf->DeletePending = 0; + buf->DeletePending = 0; /* successful open = not delete pending */ } diff --git a/fs/smb/client/smb2pdu.h b/fs/smb/client/smb2pdu.h index 3c09a58dfd07..101024f8f725 100644 --- a/fs/smb/client/smb2pdu.h +++ b/fs/smb/client/smb2pdu.h @@ -201,16 +201,20 @@ struct resume_key_req { char Context[]; /* ignored, Windows sets to 4 bytes of zero */ } __packed; + +struct copychunk { + __le64 SourceOffset; + __le64 TargetOffset; + __le32 Length; + __le32 Reserved; +} __packed; + /* this goes in the ioctl buffer when doing a copychunk request */ struct copychunk_ioctl { char SourceKey[COPY_CHUNK_RES_KEY_SIZE]; - __le32 ChunkCount; /* we are only sending 1 */ + __le32 ChunkCount; __le32 Reserved; - /* array will only be one chunk long for us */ - __le64 SourceOffset; - __le64 TargetOffset; - __le32 Length; /* how many bytes to copy */ - __u32 Reserved2; + struct copychunk Chunks[]; } __packed; struct copychunk_ioctl_rsp { diff --git a/fs/smb/client/smb2proto.h b/fs/smb/client/smb2proto.h index b3f1398c9f79..6eb86d134abc 100644 --- a/fs/smb/client/smb2proto.h +++ b/fs/smb/client/smb2proto.h @@ -295,10 +295,10 @@ extern int smb2_validate_and_copy_iov(unsigned int offset, extern void smb2_copy_fs_info_to_kstatfs( struct smb2_fs_full_size_info *pfs_inf, struct kstatfs *kst); -extern int smb311_crypto_shash_allocate(struct TCP_Server_Info *server); -extern int smb311_update_preauth_hash(struct cifs_ses *ses, - struct TCP_Server_Info *server, - struct kvec *iov, int nvec); +extern int smb3_crypto_shash_allocate(struct TCP_Server_Info *server); +extern void smb311_update_preauth_hash(struct cifs_ses *ses, + struct TCP_Server_Info *server, + struct kvec *iov, int nvec); extern int smb2_query_info_compound(const unsigned int xid, struct cifs_tcon *tcon, const char *path, u32 desired_access, diff --git a/fs/smb/client/smb2transport.c b/fs/smb/client/smb2transport.c index bc0e92eb2b64..ad6068e17a2a 100644 --- a/fs/smb/client/smb2transport.c +++ b/fs/smb/client/smb2transport.c @@ -19,6 +19,7 @@ #include <linux/mempool.h> #include <linux/highmem.h> #include <crypto/aead.h> +#include <crypto/sha2.h> #include "cifsglob.h" #include "cifsproto.h" #include "smb2proto.h" @@ -26,53 +27,14 @@ #include "../common/smb2status.h" #include "smb2glob.h" -static int -smb3_crypto_shash_allocate(struct TCP_Server_Info *server) -{ - struct cifs_secmech *p = &server->secmech; - int rc; - - rc = cifs_alloc_hash("hmac(sha256)", &p->hmacsha256); - if (rc) - goto err; - - rc = cifs_alloc_hash("cmac(aes)", &p->aes_cmac); - if (rc) - goto err; - - return 0; -err: - cifs_free_hash(&p->hmacsha256); - return rc; -} - int -smb311_crypto_shash_allocate(struct TCP_Server_Info *server) +smb3_crypto_shash_allocate(struct TCP_Server_Info *server) { struct cifs_secmech *p = &server->secmech; - int rc = 
0; - - rc = cifs_alloc_hash("hmac(sha256)", &p->hmacsha256); - if (rc) - return rc; - - rc = cifs_alloc_hash("cmac(aes)", &p->aes_cmac); - if (rc) - goto err; - - rc = cifs_alloc_hash("sha512", &p->sha512); - if (rc) - goto err; - - return 0; -err: - cifs_free_hash(&p->aes_cmac); - cifs_free_hash(&p->hmacsha256); - return rc; + return cifs_alloc_hash("cmac(aes)", &p->aes_cmac); } - static int smb3_get_sign_key(__u64 ses_id, struct TCP_Server_Info *server, u8 *key) { @@ -240,11 +202,6 @@ smb2_find_smb_tcon(struct TCP_Server_Info *server, __u64 ses_id, __u32 tid) return NULL; } tcon = smb2_find_smb_sess_tcon_unlocked(ses, tid); - if (!tcon) { - spin_unlock(&cifs_tcp_ses_lock); - cifs_put_smb_ses(ses); - return NULL; - } spin_unlock(&cifs_tcp_ses_lock); /* tcon already has a ref to ses, so we don't need ses anymore */ cifs_put_smb_ses(ses); @@ -258,10 +215,9 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server, { int rc; unsigned char smb2_signature[SMB2_HMACSHA256_SIZE]; - unsigned char *sigptr = smb2_signature; struct kvec *iov = rqst->rq_iov; struct smb2_hdr *shdr = (struct smb2_hdr *)iov[0].iov_base; - struct shash_desc *shash = NULL; + struct hmac_sha256_ctx hmac_ctx; struct smb_rqst drqst; __u64 sid = le64_to_cpu(shdr->SessionId); u8 key[SMB2_NTLMV2_SESSKEY_SIZE]; @@ -276,30 +232,7 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server, memset(smb2_signature, 0x0, SMB2_HMACSHA256_SIZE); memset(shdr->Signature, 0x0, SMB2_SIGNATURE_SIZE); - if (allocate_crypto) { - rc = cifs_alloc_hash("hmac(sha256)", &shash); - if (rc) { - cifs_server_dbg(VFS, - "%s: sha256 alloc failed\n", __func__); - goto out; - } - } else { - shash = server->secmech.hmacsha256; - } - - rc = crypto_shash_setkey(shash->tfm, key, sizeof(key)); - if (rc) { - cifs_server_dbg(VFS, - "%s: Could not update with response\n", - __func__); - goto out; - } - - rc = crypto_shash_init(shash); - if (rc) { - cifs_server_dbg(VFS, "%s: Could not init sha256", __func__); - goto out; - } + hmac_sha256_init_usingrawkey(&hmac_ctx, key, sizeof(key)); /* * For SMB2+, __cifs_calc_signature() expects to sign only the actual @@ -310,25 +243,17 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server, */ drqst = *rqst; if (drqst.rq_nvec >= 2 && iov[0].iov_len == 4) { - rc = crypto_shash_update(shash, iov[0].iov_base, - iov[0].iov_len); - if (rc) { - cifs_server_dbg(VFS, - "%s: Could not update with payload\n", - __func__); - goto out; - } + hmac_sha256_update(&hmac_ctx, iov[0].iov_base, iov[0].iov_len); drqst.rq_iov++; drqst.rq_nvec--; } - rc = __cifs_calc_signature(&drqst, server, sigptr, shash); + rc = __cifs_calc_signature( + &drqst, server, smb2_signature, + &(struct cifs_calc_sig_ctx){ .hmac = &hmac_ctx }); if (!rc) - memcpy(shdr->Signature, sigptr, SMB2_SIGNATURE_SIZE); + memcpy(shdr->Signature, smb2_signature, SMB2_SIGNATURE_SIZE); -out: - if (allocate_crypto) - cifs_free_hash(&shash); return rc; } @@ -341,8 +266,8 @@ static int generate_key(struct cifs_ses *ses, struct kvec label, __u8 L256[4] = {0, 0, 1, 0}; int rc = 0; unsigned char prfhash[SMB2_HMACSHA256_SIZE]; - unsigned char *hashptr = prfhash; struct TCP_Server_Info *server = ses->server; + struct hmac_sha256_ctx hmac_ctx; memset(prfhash, 0x0, SMB2_HMACSHA256_SIZE); memset(key, 0x0, key_size); @@ -350,67 +275,26 @@ static int generate_key(struct cifs_ses *ses, struct kvec label, rc = smb3_crypto_shash_allocate(server); if (rc) { cifs_server_dbg(VFS, "%s: crypto alloc failed\n", __func__); - goto smb3signkey_ret; - 
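The smb2_calc_signature() conversion above is the same pattern as the preauth-hash change: the allocate/setkey/init/update/final shash sequence collapses into the synchronous HMAC-SHA256 context API from <crypto/sha2.h>, which has no failure paths. The whole signing operation then has this shape (sketch; the iovec walk stands in for __cifs_calc_signature(), and only the first 16 bytes of the 32-byte MAC are copied into the SMB2 header):

#include <linux/uio.h>
#include <crypto/sha2.h>

/* Sign a request held in iovecs with the per-session key. */
static void smb2_sign_iovecs(const u8 *key, size_t key_len,
			     const struct kvec *iov, int nvec,
			     u8 mac[SHA256_DIGEST_SIZE])
{
	struct hmac_sha256_ctx ctx;
	int i;

	hmac_sha256_init_usingrawkey(&ctx, key, key_len);
	for (i = 0; i < nvec; i++)
		hmac_sha256_update(&ctx, iov[i].iov_base, iov[i].iov_len);
	hmac_sha256_final(&ctx, mac);
}

generate_key() just below gets the same treatment: the SP800-108 counter-mode KDF becomes a single init/update/final pass over i || label || 0x00 || context || L.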
} - - rc = crypto_shash_setkey(server->secmech.hmacsha256->tfm, - ses->auth_key.response, SMB2_NTLMV2_SESSKEY_SIZE); - if (rc) { - cifs_server_dbg(VFS, "%s: Could not set with session key\n", __func__); - goto smb3signkey_ret; - } - - rc = crypto_shash_init(server->secmech.hmacsha256); - if (rc) { - cifs_server_dbg(VFS, "%s: Could not init sign hmac\n", __func__); - goto smb3signkey_ret; - } - - rc = crypto_shash_update(server->secmech.hmacsha256, i, 4); - if (rc) { - cifs_server_dbg(VFS, "%s: Could not update with n\n", __func__); - goto smb3signkey_ret; - } - - rc = crypto_shash_update(server->secmech.hmacsha256, label.iov_base, label.iov_len); - if (rc) { - cifs_server_dbg(VFS, "%s: Could not update with label\n", __func__); - goto smb3signkey_ret; - } - - rc = crypto_shash_update(server->secmech.hmacsha256, &zero, 1); - if (rc) { - cifs_server_dbg(VFS, "%s: Could not update with zero\n", __func__); - goto smb3signkey_ret; + return rc; } - rc = crypto_shash_update(server->secmech.hmacsha256, context.iov_base, context.iov_len); - if (rc) { - cifs_server_dbg(VFS, "%s: Could not update with context\n", __func__); - goto smb3signkey_ret; - } + hmac_sha256_init_usingrawkey(&hmac_ctx, ses->auth_key.response, + SMB2_NTLMV2_SESSKEY_SIZE); + hmac_sha256_update(&hmac_ctx, i, 4); + hmac_sha256_update(&hmac_ctx, label.iov_base, label.iov_len); + hmac_sha256_update(&hmac_ctx, &zero, 1); + hmac_sha256_update(&hmac_ctx, context.iov_base, context.iov_len); if ((server->cipher_type == SMB2_ENCRYPTION_AES256_CCM) || (server->cipher_type == SMB2_ENCRYPTION_AES256_GCM)) { - rc = crypto_shash_update(server->secmech.hmacsha256, L256, 4); + hmac_sha256_update(&hmac_ctx, L256, 4); } else { - rc = crypto_shash_update(server->secmech.hmacsha256, L128, 4); - } - if (rc) { - cifs_server_dbg(VFS, "%s: Could not update with L\n", __func__); - goto smb3signkey_ret; - } - - rc = crypto_shash_final(server->secmech.hmacsha256, hashptr); - if (rc) { - cifs_server_dbg(VFS, "%s: Could not generate sha256 hash\n", __func__); - goto smb3signkey_ret; + hmac_sha256_update(&hmac_ctx, L128, 4); } + hmac_sha256_final(&hmac_ctx, prfhash); - memcpy(key, hashptr, key_size); - -smb3signkey_ret: - return rc; + memcpy(key, prfhash, key_size); + return 0; } struct derivation { @@ -587,7 +471,6 @@ smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server, { int rc; unsigned char smb3_signature[SMB2_CMACAES_SIZE]; - unsigned char *sigptr = smb3_signature; struct kvec *iov = rqst->rq_iov; struct smb2_hdr *shdr = (struct smb2_hdr *)iov[0].iov_base; struct shash_desc *shash = NULL; @@ -648,9 +531,11 @@ smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server, drqst.rq_nvec--; } - rc = __cifs_calc_signature(&drqst, server, sigptr, shash); + rc = __cifs_calc_signature( + &drqst, server, smb3_signature, + &(struct cifs_calc_sig_ctx){ .shash = shash }); if (!rc) - memcpy(shdr->Signature, sigptr, SMB2_SIGNATURE_SIZE); + memcpy(shdr->Signature, smb3_signature, SMB2_SIGNATURE_SIZE); out: if (allocate_crypto) diff --git a/fs/smb/client/smbdirect.c b/fs/smb/client/smbdirect.c index 316f398c70f4..49e2df3ad1f0 100644 --- a/fs/smb/client/smbdirect.c +++ b/fs/smb/client/smbdirect.c @@ -1575,12 +1575,12 @@ void smbd_destroy(struct TCP_Server_Info *server) disable_work_sync(&sc->disconnect_work); log_rdma_event(INFO, "destroying rdma session\n"); - if (sc->status < SMBDIRECT_SOCKET_DISCONNECTING) { + if (sc->status < SMBDIRECT_SOCKET_DISCONNECTING) smbd_disconnect_rdma_work(&sc->disconnect_work); + if (sc->status < 
SMBDIRECT_SOCKET_DISCONNECTED) { log_rdma_event(INFO, "wait for transport being disconnected\n"); - wait_event_interruptible( - sc->status_wait, - sc->status == SMBDIRECT_SOCKET_DISCONNECTED); + wait_event(sc->status_wait, sc->status == SMBDIRECT_SOCKET_DISCONNECTED); + log_rdma_event(INFO, "waited for transport being disconnected\n"); } /* @@ -1624,19 +1624,7 @@ void smbd_destroy(struct TCP_Server_Info *server) log_rdma_event(INFO, "free receive buffers\n"); destroy_receive_buffers(sc); - /* - * For performance reasons, memory registration and deregistration - * are not locked by srv_mutex. It is possible some processes are - * blocked on transport srv_mutex while holding memory registration. - * Release the transport srv_mutex to allow them to hit the failure - * path when sending data, and then release memory registrations. - */ log_rdma_event(INFO, "freeing mr list\n"); - while (atomic_read(&sc->mr_io.used.count)) { - cifs_server_unlock(server); - msleep(1000); - cifs_server_lock(server); - } destroy_mr_list(sc); ib_free_cq(sc->ib.send_cq); @@ -2352,18 +2340,84 @@ static void smbd_mr_recovery_work(struct work_struct *work) } } +static void smbd_mr_disable_locked(struct smbdirect_mr_io *mr) +{ + struct smbdirect_socket *sc = mr->socket; + + lockdep_assert_held(&mr->mutex); + + if (mr->state == SMBDIRECT_MR_DISABLED) + return; + + if (mr->mr) + ib_dereg_mr(mr->mr); + if (mr->sgt.nents) + ib_dma_unmap_sg(sc->ib.dev, mr->sgt.sgl, mr->sgt.nents, mr->dir); + kfree(mr->sgt.sgl); + + mr->mr = NULL; + mr->sgt.sgl = NULL; + mr->sgt.nents = 0; + + mr->state = SMBDIRECT_MR_DISABLED; +} + +static void smbd_mr_free_locked(struct kref *kref) +{ + struct smbdirect_mr_io *mr = + container_of(kref, struct smbdirect_mr_io, kref); + + lockdep_assert_held(&mr->mutex); + + /* + * smbd_mr_disable_locked() should already be called! + */ + if (WARN_ON_ONCE(mr->state != SMBDIRECT_MR_DISABLED)) + smbd_mr_disable_locked(mr); + + mutex_unlock(&mr->mutex); + mutex_destroy(&mr->mutex); + kfree(mr); +} + static void destroy_mr_list(struct smbdirect_socket *sc) { struct smbdirect_mr_io *mr, *tmp; + LIST_HEAD(all_list); + unsigned long flags; disable_work_sync(&sc->mr_io.recovery_work); - list_for_each_entry_safe(mr, tmp, &sc->mr_io.all.list, list) { - if (mr->state == SMBDIRECT_MR_INVALIDATED) - ib_dma_unmap_sg(sc->ib.dev, mr->sgt.sgl, - mr->sgt.nents, mr->dir); - ib_dereg_mr(mr->mr); - kfree(mr->sgt.sgl); - kfree(mr); + + spin_lock_irqsave(&sc->mr_io.all.lock, flags); + list_splice_tail_init(&sc->mr_io.all.list, &all_list); + spin_unlock_irqrestore(&sc->mr_io.all.lock, flags); + + list_for_each_entry_safe(mr, tmp, &all_list, list) { + mutex_lock(&mr->mutex); + + smbd_mr_disable_locked(mr); + list_del(&mr->list); + mr->socket = NULL; + + /* + * No kref_put_mutex() as it's already locked. + * + * If smbd_mr_free_locked() is called + * and the mutex is unlocked and mr is gone, + * in that case kref_put() returned 1. + * + * If kref_put() returned 0 we know that + * smbd_mr_free_locked() didn't + * run. Not by us nor by anyone else, as we + * still hold the mutex, so we need to unlock. + * + * If the mr is still registered it will + * be dangling (detached from the connection + * waiting for smbd_deregister_mr() to be + * called in order to free the memory. 
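destroy_mr_list() above is the first of three call sites (the others being smbd_register_mr()'s error path and smbd_deregister_mr() further down) that must drop an MR reference while already holding mr->mutex, which rules out kref_put_mutex(). The convention the long comments describe, distilled into a sketch (obj stands in for smbdirect_mr_io):

#include <linux/kref.h>
#include <linux/mutex.h>
#include <linux/slab.h>

struct obj {
	struct kref kref;
	struct mutex mutex;
};

/* Release callback: runs with obj->mutex held and inherits it. */
static void obj_free_locked(struct kref *kref)
{
	struct obj *o = container_of(kref, struct obj, kref);

	mutex_unlock(&o->mutex);
	mutex_destroy(&o->mutex);
	kfree(o);
}

static void obj_put_locked(struct obj *o)
{
	lockdep_assert_held(&o->mutex);
	/* kref_put() returns 1 if obj_free_locked() ran (mutex gone);
	 * 0 means we still own both the object and its mutex. */
	if (!kref_put(&o->kref, obj_free_locked))
		mutex_unlock(&o->mutex);
}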
+ */ + if (!kref_put(&mr->kref, smbd_mr_free_locked)) + mutex_unlock(&mr->mutex); } } @@ -2377,10 +2431,9 @@ static void destroy_mr_list(struct smbdirect_socket *sc) static int allocate_mr_list(struct smbdirect_socket *sc) { struct smbdirect_socket_parameters *sp = &sc->parameters; - int i; - struct smbdirect_mr_io *smbdirect_mr, *tmp; - - INIT_WORK(&sc->mr_io.recovery_work, smbd_mr_recovery_work); + struct smbdirect_mr_io *mr; + int ret; + u32 i; if (sp->responder_resources == 0) { log_rdma_mr(ERR, "responder_resources negotiated as 0\n"); @@ -2389,42 +2442,52 @@ static int allocate_mr_list(struct smbdirect_socket *sc) /* Allocate more MRs (2x) than hardware responder_resources */ for (i = 0; i < sp->responder_resources * 2; i++) { - smbdirect_mr = kzalloc(sizeof(*smbdirect_mr), GFP_KERNEL); - if (!smbdirect_mr) - goto cleanup_entries; - smbdirect_mr->mr = ib_alloc_mr(sc->ib.pd, sc->mr_io.type, - sp->max_frmr_depth); - if (IS_ERR(smbdirect_mr->mr)) { + mr = kzalloc(sizeof(*mr), GFP_KERNEL); + if (!mr) { + ret = -ENOMEM; + goto kzalloc_mr_failed; + } + + kref_init(&mr->kref); + mutex_init(&mr->mutex); + + mr->mr = ib_alloc_mr(sc->ib.pd, + sc->mr_io.type, + sp->max_frmr_depth); + if (IS_ERR(mr->mr)) { + ret = PTR_ERR(mr->mr); log_rdma_mr(ERR, "ib_alloc_mr failed mr_type=%x max_frmr_depth=%x\n", sc->mr_io.type, sp->max_frmr_depth); - goto out; + goto ib_alloc_mr_failed; } - smbdirect_mr->sgt.sgl = kcalloc(sp->max_frmr_depth, - sizeof(struct scatterlist), - GFP_KERNEL); - if (!smbdirect_mr->sgt.sgl) { + + mr->sgt.sgl = kcalloc(sp->max_frmr_depth, + sizeof(struct scatterlist), + GFP_KERNEL); + if (!mr->sgt.sgl) { + ret = -ENOMEM; log_rdma_mr(ERR, "failed to allocate sgl\n"); - ib_dereg_mr(smbdirect_mr->mr); - goto out; + goto kcalloc_sgl_failed; } - smbdirect_mr->state = SMBDIRECT_MR_READY; - smbdirect_mr->socket = sc; + mr->state = SMBDIRECT_MR_READY; + mr->socket = sc; - list_add_tail(&smbdirect_mr->list, &sc->mr_io.all.list); + list_add_tail(&mr->list, &sc->mr_io.all.list); atomic_inc(&sc->mr_io.ready.count); } + + INIT_WORK(&sc->mr_io.recovery_work, smbd_mr_recovery_work); + return 0; -out: - kfree(smbdirect_mr); -cleanup_entries: - list_for_each_entry_safe(smbdirect_mr, tmp, &sc->mr_io.all.list, list) { - list_del(&smbdirect_mr->list); - ib_dereg_mr(smbdirect_mr->mr); - kfree(smbdirect_mr->sgt.sgl); - kfree(smbdirect_mr); - } - return -ENOMEM; +kcalloc_sgl_failed: + ib_dereg_mr(mr->mr); +ib_alloc_mr_failed: + mutex_destroy(&mr->mutex); + kfree(mr); +kzalloc_mr_failed: + destroy_mr_list(sc); + return ret; } /* @@ -2458,6 +2521,7 @@ again: list_for_each_entry(ret, &sc->mr_io.all.list, list) { if (ret->state == SMBDIRECT_MR_READY) { ret->state = SMBDIRECT_MR_REGISTERED; + kref_get(&ret->kref); spin_unlock_irqrestore(&sc->mr_io.all.lock, flags); atomic_dec(&sc->mr_io.ready.count); atomic_inc(&sc->mr_io.used.count); @@ -2504,9 +2568,8 @@ struct smbdirect_mr_io *smbd_register_mr(struct smbd_connection *info, { struct smbdirect_socket *sc = &info->socket; struct smbdirect_socket_parameters *sp = &sc->parameters; - struct smbdirect_mr_io *smbdirect_mr; + struct smbdirect_mr_io *mr; int rc, num_pages; - enum dma_data_direction dir; struct ib_reg_wr *reg_wr; num_pages = iov_iter_npages(iter, sp->max_frmr_depth + 1); @@ -2517,49 +2580,47 @@ struct smbdirect_mr_io *smbd_register_mr(struct smbd_connection *info, return NULL; } - smbdirect_mr = get_mr(sc); - if (!smbdirect_mr) { + mr = get_mr(sc); + if (!mr) { log_rdma_mr(ERR, "get_mr returning NULL\n"); return NULL; } - dir = writing ? 
DMA_FROM_DEVICE : DMA_TO_DEVICE; - smbdirect_mr->dir = dir; - smbdirect_mr->need_invalidate = need_invalidate; - smbdirect_mr->sgt.nents = 0; - smbdirect_mr->sgt.orig_nents = 0; + mutex_lock(&mr->mutex); + + mr->dir = writing ? DMA_FROM_DEVICE : DMA_TO_DEVICE; + mr->need_invalidate = need_invalidate; + mr->sgt.nents = 0; + mr->sgt.orig_nents = 0; log_rdma_mr(INFO, "num_pages=0x%x count=0x%zx depth=%u\n", num_pages, iov_iter_count(iter), sp->max_frmr_depth); - smbd_iter_to_mr(iter, &smbdirect_mr->sgt, sp->max_frmr_depth); + smbd_iter_to_mr(iter, &mr->sgt, sp->max_frmr_depth); - rc = ib_dma_map_sg(sc->ib.dev, smbdirect_mr->sgt.sgl, - smbdirect_mr->sgt.nents, dir); + rc = ib_dma_map_sg(sc->ib.dev, mr->sgt.sgl, mr->sgt.nents, mr->dir); if (!rc) { log_rdma_mr(ERR, "ib_dma_map_sg num_pages=%x dir=%x rc=%x\n", - num_pages, dir, rc); + num_pages, mr->dir, rc); goto dma_map_error; } - rc = ib_map_mr_sg(smbdirect_mr->mr, smbdirect_mr->sgt.sgl, - smbdirect_mr->sgt.nents, NULL, PAGE_SIZE); - if (rc != smbdirect_mr->sgt.nents) { + rc = ib_map_mr_sg(mr->mr, mr->sgt.sgl, mr->sgt.nents, NULL, PAGE_SIZE); + if (rc != mr->sgt.nents) { log_rdma_mr(ERR, - "ib_map_mr_sg failed rc = %d nents = %x\n", - rc, smbdirect_mr->sgt.nents); + "ib_map_mr_sg failed rc = %d nents = %x\n", + rc, mr->sgt.nents); goto map_mr_error; } - ib_update_fast_reg_key(smbdirect_mr->mr, - ib_inc_rkey(smbdirect_mr->mr->rkey)); - reg_wr = &smbdirect_mr->wr; + ib_update_fast_reg_key(mr->mr, ib_inc_rkey(mr->mr->rkey)); + reg_wr = &mr->wr; reg_wr->wr.opcode = IB_WR_REG_MR; - smbdirect_mr->cqe.done = register_mr_done; - reg_wr->wr.wr_cqe = &smbdirect_mr->cqe; + mr->cqe.done = register_mr_done; + reg_wr->wr.wr_cqe = &mr->cqe; reg_wr->wr.num_sge = 0; reg_wr->wr.send_flags = IB_SEND_SIGNALED; - reg_wr->mr = smbdirect_mr->mr; - reg_wr->key = smbdirect_mr->mr->rkey; + reg_wr->mr = mr->mr; + reg_wr->key = mr->mr->rkey; reg_wr->access = writing ? IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE : IB_ACCESS_REMOTE_READ; @@ -2570,24 +2631,51 @@ struct smbdirect_mr_io *smbd_register_mr(struct smbd_connection *info, * on the next ib_post_send when we actually send I/O to remote peer */ rc = ib_post_send(sc->ib.qp, ®_wr->wr, NULL); - if (!rc) - return smbdirect_mr; + if (!rc) { + /* + * get_mr() gave us a reference + * via kref_get(&mr->kref), we keep that and let + * the caller use smbd_deregister_mr() + * to remove it again. + */ + mutex_unlock(&mr->mutex); + return mr; + } log_rdma_mr(ERR, "ib_post_send failed rc=%x reg_wr->key=%x\n", rc, reg_wr->key); /* If all failed, attempt to recover this MR by setting it SMBDIRECT_MR_ERROR*/ map_mr_error: - ib_dma_unmap_sg(sc->ib.dev, smbdirect_mr->sgt.sgl, - smbdirect_mr->sgt.nents, smbdirect_mr->dir); + ib_dma_unmap_sg(sc->ib.dev, mr->sgt.sgl, mr->sgt.nents, mr->dir); dma_map_error: - smbdirect_mr->state = SMBDIRECT_MR_ERROR; + mr->sgt.nents = 0; + mr->state = SMBDIRECT_MR_ERROR; if (atomic_dec_and_test(&sc->mr_io.used.count)) wake_up(&sc->mr_io.cleanup.wait_queue); smbd_disconnect_rdma_connection(sc); + /* + * get_mr() gave us a reference + * via kref_get(&mr->kref), we need to remove it again + * on error. + * + * No kref_put_mutex() as it's already locked. + * + * If smbd_mr_free_locked() is called + * and the mutex is unlocked and mr is gone, + * in that case kref_put() returned 1. + * + * If kref_put() returned 0 we know that + * smbd_mr_free_locked() didn't + * run. Not by us nor by anyone else, as we + * still hold the mutex, so we need to unlock. 
+ */ + if (!kref_put(&mr->kref, smbd_mr_free_locked)) + mutex_unlock(&mr->mutex); + return NULL; } @@ -2612,44 +2700,55 @@ static void local_inv_done(struct ib_cq *cq, struct ib_wc *wc) * and we have to locally invalidate the buffer to prevent data is being * modified by remote peer after upper layer consumes it */ -int smbd_deregister_mr(struct smbdirect_mr_io *smbdirect_mr) +void smbd_deregister_mr(struct smbdirect_mr_io *mr) { - struct ib_send_wr *wr; - struct smbdirect_socket *sc = smbdirect_mr->socket; - int rc = 0; + struct smbdirect_socket *sc = mr->socket; + + mutex_lock(&mr->mutex); + if (mr->state == SMBDIRECT_MR_DISABLED) + goto put_kref; + + if (sc->status != SMBDIRECT_SOCKET_CONNECTED) { + smbd_mr_disable_locked(mr); + goto put_kref; + } + + if (mr->need_invalidate) { + struct ib_send_wr *wr = &mr->inv_wr; + int rc; - if (smbdirect_mr->need_invalidate) { /* Need to finish local invalidation before returning */ - wr = &smbdirect_mr->inv_wr; wr->opcode = IB_WR_LOCAL_INV; - smbdirect_mr->cqe.done = local_inv_done; - wr->wr_cqe = &smbdirect_mr->cqe; + mr->cqe.done = local_inv_done; + wr->wr_cqe = &mr->cqe; wr->num_sge = 0; - wr->ex.invalidate_rkey = smbdirect_mr->mr->rkey; + wr->ex.invalidate_rkey = mr->mr->rkey; wr->send_flags = IB_SEND_SIGNALED; - init_completion(&smbdirect_mr->invalidate_done); + init_completion(&mr->invalidate_done); rc = ib_post_send(sc->ib.qp, wr, NULL); if (rc) { log_rdma_mr(ERR, "ib_post_send failed rc=%x\n", rc); + smbd_mr_disable_locked(mr); smbd_disconnect_rdma_connection(sc); goto done; } - wait_for_completion(&smbdirect_mr->invalidate_done); - smbdirect_mr->need_invalidate = false; + wait_for_completion(&mr->invalidate_done); + mr->need_invalidate = false; } else /* * For remote invalidation, just set it to SMBDIRECT_MR_INVALIDATED * and defer to mr_recovery_work to recover the MR for next use */ - smbdirect_mr->state = SMBDIRECT_MR_INVALIDATED; + mr->state = SMBDIRECT_MR_INVALIDATED; - if (smbdirect_mr->state == SMBDIRECT_MR_INVALIDATED) { - ib_dma_unmap_sg( - sc->ib.dev, smbdirect_mr->sgt.sgl, - smbdirect_mr->sgt.nents, - smbdirect_mr->dir); - smbdirect_mr->state = SMBDIRECT_MR_READY; + if (mr->sgt.nents) { + ib_dma_unmap_sg(sc->ib.dev, mr->sgt.sgl, mr->sgt.nents, mr->dir); + mr->sgt.nents = 0; + } + + if (mr->state == SMBDIRECT_MR_INVALIDATED) { + mr->state = SMBDIRECT_MR_READY; if (atomic_inc_return(&sc->mr_io.ready.count) == 1) wake_up(&sc->mr_io.ready.wait_queue); } else @@ -2663,7 +2762,23 @@ done: if (atomic_dec_and_test(&sc->mr_io.used.count)) wake_up(&sc->mr_io.cleanup.wait_queue); - return rc; +put_kref: + /* + * No kref_put_mutex() as it's already locked. + * + * If smbd_mr_free_locked() is called + * and the mutex is unlocked and mr is gone, + * in that case kref_put() returned 1. + * + * If kref_put() returned 0 we know that + * smbd_mr_free_locked() didn't + * run. Not by us nor by anyone else, as we + * still hold the mutex, so we need to unlock + * and keep the mr in SMBDIRECT_MR_READY or + * SMBDIRECT_MR_ERROR state. 
+ */ + if (!kref_put(&mr->kref, smbd_mr_free_locked)) + mutex_unlock(&mr->mutex); } static bool smb_set_sge(struct smb_extract_to_rdma *rdma, diff --git a/fs/smb/client/smbdirect.h b/fs/smb/client/smbdirect.h index d67ac5ddaff4..577d37dbeb8a 100644 --- a/fs/smb/client/smbdirect.h +++ b/fs/smb/client/smbdirect.h @@ -60,7 +60,7 @@ int smbd_send(struct TCP_Server_Info *server, struct smbdirect_mr_io *smbd_register_mr( struct smbd_connection *info, struct iov_iter *iter, bool writing, bool need_invalidate); -int smbd_deregister_mr(struct smbdirect_mr_io *mr); +void smbd_deregister_mr(struct smbdirect_mr_io *mr); #else #define cifs_rdma_enabled(server) 0 diff --git a/fs/smb/client/trace.h b/fs/smb/client/trace.h index fd650e2afc76..28e00c34df1c 100644 --- a/fs/smb/client/trace.h +++ b/fs/smb/client/trace.h @@ -266,7 +266,7 @@ DEFINE_EVENT(smb3_copy_range_err_class, smb3_##name, \ TP_ARGS(xid, src_fid, target_fid, tid, sesid, src_offset, target_offset, len, rc)) DEFINE_SMB3_COPY_RANGE_ERR_EVENT(clone_err); -/* TODO: Add SMB3_COPY_RANGE_ERR_EVENT(copychunk_err) */ +DEFINE_SMB3_COPY_RANGE_ERR_EVENT(copychunk_err); DECLARE_EVENT_CLASS(smb3_copy_range_done_class, TP_PROTO(unsigned int xid, diff --git a/fs/smb/client/transport.c b/fs/smb/client/transport.c index a61ba7f3fb86..051cd9dbba13 100644 --- a/fs/smb/client/transport.c +++ b/fs/smb/client/transport.c @@ -22,6 +22,7 @@ #include <linux/mempool.h> #include <linux/sched/signal.h> #include <linux/task_io_accounting_ops.h> +#include <linux/task_work.h> #include "cifspdu.h" #include "cifsglob.h" #include "cifsproto.h" @@ -173,9 +174,16 @@ smb_send_kvec(struct TCP_Server_Info *server, struct msghdr *smb_msg, * send a packet. In most cases if we fail to send * after the retries we will kill the socket and * reconnect which may clear the network problem. + * + * Even if regular signals are masked, EINTR might be + * propagated from sk_stream_wait_memory() to here when + * TIF_NOTIFY_SIGNAL is used for task work. For example, + * certain io_uring completions will use that. Treat + * having EINTR with pending task work the same as EAGAIN + * to avoid unnecessary reconnects. */ rc = sock_sendmsg(ssocket, smb_msg); - if (rc == -EAGAIN) { + if (rc == -EAGAIN || unlikely(rc == -EINTR && task_work_pending(current))) { retries++; if (retries >= 14 || (!server->noblocksnd && (retries > 2))) { @@ -323,8 +331,7 @@ int __smb_send_rqst(struct TCP_Server_Info *server, int num_rqst, break; total_len += sent; } - -} + } unmask: sigprocmask(SIG_SETMASK, &oldmask, NULL); diff --git a/fs/smb/client/xattr.c b/fs/smb/client/xattr.c index b88fa04f5792..029910d56c22 100644 --- a/fs/smb/client/xattr.c +++ b/fs/smb/client/xattr.c @@ -178,7 +178,6 @@ static int cifs_xattr_set(const struct xattr_handler *handler, memcpy(pacl, value, size); if (pTcon->ses->server->ops->set_acl) { int aclflags = 0; - rc = 0; switch (handler->flags) { case XATTR_CIFS_NTSD_FULL: diff --git a/fs/smb/common/Makefile b/fs/smb/common/Makefile index c66dbbc1469c..9e0730a385fb 100644 --- a/fs/smb/common/Makefile +++ b/fs/smb/common/Makefile @@ -3,5 +3,4 @@ # Makefile for Linux filesystem routines that are shared by client and server. 
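Context for the transport.c hunk above: __smb_send_rqst() masks regular signals around the send, so an -EINTR out of sock_sendmsg() normally means TIF_NOTIFY_SIGNAL task work (io_uring completions, for example) rather than a delivered signal, and killing the socket over it would only force a needless reconnect. The added classification, isolated into a hypothetical helper:

#include <linux/types.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/task_work.h>

/* Treat EINTR caused by pending task work like EAGAIN: retry the send. */
static bool send_should_retry(int rc)
{
	return rc == -EAGAIN ||
	       unlikely(rc == -EINTR && task_work_pending(current));
}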
# -obj-$(CONFIG_SMBFS) += cifs_arc4.o obj-$(CONFIG_SMBFS) += cifs_md4.o diff --git a/fs/smb/common/arc4.h b/fs/smb/common/arc4.h deleted file mode 100644 index 12e71ec033a1..000000000000 --- a/fs/smb/common/arc4.h +++ /dev/null @@ -1,23 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0+ */ -/* - * Common values for ARC4 Cipher Algorithm - */ - -#ifndef _CRYPTO_ARC4_H -#define _CRYPTO_ARC4_H - -#include <linux/types.h> - -#define ARC4_MIN_KEY_SIZE 1 -#define ARC4_MAX_KEY_SIZE 256 -#define ARC4_BLOCK_SIZE 1 - -struct arc4_ctx { - u32 S[256]; - u32 x, y; -}; - -int cifs_arc4_setkey(struct arc4_ctx *ctx, const u8 *in_key, unsigned int key_len); -void cifs_arc4_crypt(struct arc4_ctx *ctx, u8 *out, const u8 *in, unsigned int len); - -#endif /* _CRYPTO_ARC4_H */ diff --git a/fs/smb/common/cifs_arc4.c b/fs/smb/common/cifs_arc4.c deleted file mode 100644 index df360ca47826..000000000000 --- a/fs/smb/common/cifs_arc4.c +++ /dev/null @@ -1,75 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * Cryptographic API - * - * ARC4 Cipher Algorithm - * - * Jon Oberheide <jon@oberheide.org> - */ - -#include <linux/module.h> -#include "arc4.h" - -MODULE_DESCRIPTION("ARC4 Cipher Algorithm"); -MODULE_LICENSE("GPL"); - -int cifs_arc4_setkey(struct arc4_ctx *ctx, const u8 *in_key, unsigned int key_len) -{ - int i, j = 0, k = 0; - - ctx->x = 1; - ctx->y = 0; - - for (i = 0; i < 256; i++) - ctx->S[i] = i; - - for (i = 0; i < 256; i++) { - u32 a = ctx->S[i]; - - j = (j + in_key[k] + a) & 0xff; - ctx->S[i] = ctx->S[j]; - ctx->S[j] = a; - if (++k >= key_len) - k = 0; - } - - return 0; -} -EXPORT_SYMBOL_GPL(cifs_arc4_setkey); - -void cifs_arc4_crypt(struct arc4_ctx *ctx, u8 *out, const u8 *in, unsigned int len) -{ - u32 *const S = ctx->S; - u32 x, y, a, b; - u32 ty, ta, tb; - - if (len == 0) - return; - - x = ctx->x; - y = ctx->y; - - a = S[x]; - y = (y + a) & 0xff; - b = S[y]; - - do { - S[y] = a; - a = (a + b) & 0xff; - S[x] = b; - x = (x + 1) & 0xff; - ta = S[x]; - ty = (y + ta) & 0xff; - tb = S[ty]; - *out++ = *in++ ^ S[a]; - if (--len == 0) - break; - y = ty; - a = ta; - b = tb; - } while (true); - - ctx->x = x; - ctx->y = y; -} -EXPORT_SYMBOL_GPL(cifs_arc4_crypt); diff --git a/fs/smb/common/cifsglob.h b/fs/smb/common/cifsglob.h new file mode 100644 index 000000000000..00fd215e3eb5 --- /dev/null +++ b/fs/smb/common/cifsglob.h @@ -0,0 +1,30 @@ +/* SPDX-License-Identifier: LGPL-2.1 */ +/* + * + * Copyright (C) International Business Machines Corp., 2002,2008 + * Author(s): Steve French (sfrench@us.ibm.com) + * Jeremy Allison (jra@samba.org) + * + */ +#ifndef _COMMON_CIFS_GLOB_H +#define _COMMON_CIFS_GLOB_H + +static inline void inc_rfc1001_len(void *buf, int count) +{ + be32_add_cpu((__be32 *)buf, count); +} + +#define SMB1_VERSION_STRING "1.0" +#define SMB20_VERSION_STRING "2.0" +#define SMB21_VERSION_STRING "2.1" +#define SMBDEFAULT_VERSION_STRING "default" +#define SMB3ANY_VERSION_STRING "3" +#define SMB30_VERSION_STRING "3.0" +#define SMB302_VERSION_STRING "3.02" +#define ALT_SMB302_VERSION_STRING "3.0.2" +#define SMB311_VERSION_STRING "3.1.1" +#define ALT_SMB311_VERSION_STRING "3.11" + +#define CIFS_DEFAULT_IOSIZE (1024 * 1024) + +#endif /* _COMMON_CIFS_GLOB_H */ diff --git a/fs/smb/common/smbdirect/smbdirect_socket.h b/fs/smb/common/smbdirect/smbdirect_socket.h index db22a1d0546b..361db7f9f623 100644 --- a/fs/smb/common/smbdirect/smbdirect_socket.h +++ b/fs/smb/common/smbdirect/smbdirect_socket.h @@ -437,13 +437,22 @@ enum smbdirect_mr_state { SMBDIRECT_MR_READY, SMBDIRECT_MR_REGISTERED, 
SMBDIRECT_MR_INVALIDATED, - SMBDIRECT_MR_ERROR + SMBDIRECT_MR_ERROR, + SMBDIRECT_MR_DISABLED }; struct smbdirect_mr_io { struct smbdirect_socket *socket; struct ib_cqe cqe; + /* + * We can have up to two references: + * 1. by the connection + * 2. by the registration + */ + struct kref kref; + struct mutex mutex; + struct list_head list; enum smbdirect_mr_state state; diff --git a/fs/smb/server/Kconfig b/fs/smb/server/Kconfig index 4a23a5e7e8fe..098cac98d31e 100644 --- a/fs/smb/server/Kconfig +++ b/fs/smb/server/Kconfig @@ -10,6 +10,7 @@ config SMB_SERVER select CRYPTO_MD5 select CRYPTO_HMAC select CRYPTO_ECB + select CRYPTO_LIB_ARC4 select CRYPTO_LIB_DES select CRYPTO_LIB_SHA256 select CRYPTO_SHA256 diff --git a/fs/smb/server/auth.c b/fs/smb/server/auth.c index d99871c21451..b4020bb55a26 100644 --- a/fs/smb/server/auth.c +++ b/fs/smb/server/auth.c @@ -20,6 +20,7 @@ #include "glob.h" #include <linux/fips.h> +#include <crypto/arc4.h> #include <crypto/des.h> #include "server.h" @@ -29,7 +30,6 @@ #include "mgmt/user_config.h" #include "crypto_ctx.h" #include "transport_ipc.h" -#include "../common/arc4.h" /* * Fixed format data defining GSS header and fixed string @@ -365,10 +365,9 @@ int ksmbd_decode_ntlmssp_auth_blob(struct authenticate_message *authblob, if (!ctx_arc4) return -ENOMEM; - cifs_arc4_setkey(ctx_arc4, sess->sess_key, - SMB2_NTLMV2_SESSKEY_SIZE); - cifs_arc4_crypt(ctx_arc4, sess->sess_key, - (char *)authblob + sess_key_off, sess_key_len); + arc4_setkey(ctx_arc4, sess->sess_key, SMB2_NTLMV2_SESSKEY_SIZE); + arc4_crypt(ctx_arc4, sess->sess_key, + (char *)authblob + sess_key_off, sess_key_len); kfree_sensitive(ctx_arc4); } diff --git a/fs/smb/server/connection.c b/fs/smb/server/connection.c index 91a934411134..b6b4f1286b9c 100644 --- a/fs/smb/server/connection.c +++ b/fs/smb/server/connection.c @@ -19,7 +19,7 @@ static DEFINE_MUTEX(init_lock); static struct ksmbd_conn_ops default_conn_ops; -LIST_HEAD(conn_list); +DEFINE_HASHTABLE(conn_list, CONN_HASH_BITS); DECLARE_RWSEM(conn_list_lock); /** @@ -33,7 +33,7 @@ DECLARE_RWSEM(conn_list_lock); void ksmbd_conn_free(struct ksmbd_conn *conn) { down_write(&conn_list_lock); - list_del(&conn->conns_list); + hash_del(&conn->hlist); up_write(&conn_list_lock); xa_destroy(&conn->sessions); @@ -77,7 +77,6 @@ struct ksmbd_conn *ksmbd_conn_alloc(void) init_waitqueue_head(&conn->req_running_q); init_waitqueue_head(&conn->r_count_q); - INIT_LIST_HEAD(&conn->conns_list); INIT_LIST_HEAD(&conn->requests); INIT_LIST_HEAD(&conn->async_requests); spin_lock_init(&conn->request_lock); @@ -90,19 +89,17 @@ struct ksmbd_conn *ksmbd_conn_alloc(void) init_rwsem(&conn->session_lock); - down_write(&conn_list_lock); - list_add(&conn->conns_list, &conn_list); - up_write(&conn_list_lock); return conn; } bool ksmbd_conn_lookup_dialect(struct ksmbd_conn *c) { struct ksmbd_conn *t; + int bkt; bool ret = false; down_read(&conn_list_lock); - list_for_each_entry(t, &conn_list, conns_list) { + hash_for_each(conn_list, bkt, t, hlist) { if (memcmp(t->ClientGUID, c->ClientGUID, SMB2_CLIENT_GUID_SIZE)) continue; @@ -163,9 +160,10 @@ void ksmbd_conn_unlock(struct ksmbd_conn *conn) void ksmbd_all_conn_set_status(u64 sess_id, u32 status) { struct ksmbd_conn *conn; + int bkt; down_read(&conn_list_lock); - list_for_each_entry(conn, &conn_list, conns_list) { + hash_for_each(conn_list, bkt, conn, hlist) { if (conn->binding || xa_load(&conn->sessions, sess_id)) WRITE_ONCE(conn->status, status); } @@ -181,14 +179,14 @@ int ksmbd_conn_wait_idle_sess_id(struct ksmbd_conn *curr_conn, u64 
sess_id) { struct ksmbd_conn *conn; int rc, retry_count = 0, max_timeout = 120; - int rcount = 1; + int rcount = 1, bkt; retry_idle: if (retry_count >= max_timeout) return -EIO; down_read(&conn_list_lock); - list_for_each_entry(conn, &conn_list, conns_list) { + hash_for_each(conn_list, bkt, conn, hlist) { if (conn->binding || xa_load(&conn->sessions, sess_id)) { if (conn == curr_conn) rcount = 2; @@ -480,10 +478,11 @@ static void stop_sessions(void) { struct ksmbd_conn *conn; struct ksmbd_transport *t; + int bkt; again: down_read(&conn_list_lock); - list_for_each_entry(conn, &conn_list, conns_list) { + hash_for_each(conn_list, bkt, conn, hlist) { t = conn->transport; ksmbd_conn_set_exiting(conn); if (t->ops->shutdown) { @@ -494,7 +493,7 @@ again: } up_read(&conn_list_lock); - if (!list_empty(&conn_list)) { + if (!hash_empty(conn_list)) { msleep(100); goto again; } diff --git a/fs/smb/server/connection.h b/fs/smb/server/connection.h index 07b43634262a..7f9bcd9817b5 100644 --- a/fs/smb/server/connection.h +++ b/fs/smb/server/connection.h @@ -54,11 +54,12 @@ struct ksmbd_conn { u8 inet6_addr[16]; #endif }; + unsigned int inet_hash; char *request_buf; struct ksmbd_transport *transport; struct nls_table *local_nls; struct unicode_map *um; - struct list_head conns_list; + struct hlist_node hlist; struct rw_semaphore session_lock; /* smb session 1 per user */ struct xarray sessions; @@ -153,7 +154,8 @@ struct ksmbd_transport { #define KSMBD_TCP_SEND_TIMEOUT (5 * HZ) #define KSMBD_TCP_PEER_SOCKADDR(c) ((struct sockaddr *)&((c)->peer_addr)) -extern struct list_head conn_list; +#define CONN_HASH_BITS 12 +extern DECLARE_HASHTABLE(conn_list, CONN_HASH_BITS); extern struct rw_semaphore conn_list_lock; bool ksmbd_conn_alive(struct ksmbd_conn *conn); diff --git a/fs/smb/server/ksmbd_netlink.h b/fs/smb/server/ksmbd_netlink.h index 3f07a612c05b..8ccd57fd904b 100644 --- a/fs/smb/server/ksmbd_netlink.h +++ b/fs/smb/server/ksmbd_netlink.h @@ -112,10 +112,11 @@ struct ksmbd_startup_request { __u32 smbd_max_io_size; /* smbd read write size */ __u32 max_connections; /* Number of maximum simultaneous connections */ __s8 bind_interfaces_only; - __s8 reserved[503]; /* Reserved room */ + __u32 max_ip_connections; /* Number of maximum connections per ip address */ + __s8 reserved[499]; /* Reserved room */ __u32 ifc_list_sz; /* interfaces list size */ __s8 ____payload[]; -}; +} __packed; #define KSMBD_STARTUP_CONFIG_INTERFACES(s) ((s)->____payload) diff --git a/fs/smb/server/mgmt/share_config.c b/fs/smb/server/mgmt/share_config.c index d3d5f99bdd34..c9b1108d6e96 100644 --- a/fs/smb/server/mgmt/share_config.c +++ b/fs/smb/server/mgmt/share_config.c @@ -19,7 +19,7 @@ #include "../transport_ipc.h" #include "../misc.h" -#define SHARE_HASH_BITS 3 +#define SHARE_HASH_BITS 12 static DEFINE_HASHTABLE(shares_table, SHARE_HASH_BITS); static DECLARE_RWSEM(shares_table_lock); diff --git a/fs/smb/server/mgmt/user_session.c b/fs/smb/server/mgmt/user_session.c index 9dec4c2940bc..1c181ef99929 100644 --- a/fs/smb/server/mgmt/user_session.c +++ b/fs/smb/server/mgmt/user_session.c @@ -18,7 +18,7 @@ static DEFINE_IDA(session_ida); -#define SESSION_HASH_BITS 3 +#define SESSION_HASH_BITS 12 static DEFINE_HASHTABLE(sessions_table, SESSION_HASH_BITS); static DECLARE_RWSEM(sessions_table_lock); @@ -104,29 +104,32 @@ int ksmbd_session_rpc_open(struct ksmbd_session *sess, char *rpc_name) if (!entry) return -ENOMEM; - down_read(&sess->rpc_lock); entry->method = method; entry->id = id = ksmbd_ipc_id_alloc(); if (id < 0) goto free_entry; + + 
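The conversion above swaps the global conn_list linked list for a fixed hashtable with 2^12 buckets and widens the share/session tables from 2^3 to 2^12 buckets; whole-table scans become hash_for_each(). The insertion side is not in these hunks, but presumably keys on the new conn->inet_hash field, roughly as sketched here (conn_list_lock is the existing rwsem):

#include <linux/hashtable.h>

/* Sketch of the hashtable idiom the conversion relies on; the real
 * hash_add() call is outside the hunks shown above. */
static void conn_hash_insert(struct ksmbd_conn *conn)
{
	down_write(&conn_list_lock);
	hash_add(conn_list, &conn->hlist, conn->inet_hash);
	up_write(&conn_list_lock);
}

static int conn_count_all(void)
{
	struct ksmbd_conn *conn;
	int bkt, n = 0;

	down_read(&conn_list_lock);
	hash_for_each(conn_list, bkt, conn, hlist)	/* visits every bucket */
		n++;
	up_read(&conn_list_lock);
	return n;
}

Keyed lookups, such as counting connections from one address, can then use hash_for_each_possible() on a single bucket, which is where the 4096-bucket table pays off over the old list walk.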
down_write(&sess->rpc_lock); old = xa_store(&sess->rpc_handle_list, id, entry, KSMBD_DEFAULT_GFP); - if (xa_is_err(old)) + if (xa_is_err(old)) { + up_write(&sess->rpc_lock); goto free_id; + } resp = ksmbd_rpc_open(sess, id); - if (!resp) - goto erase_xa; + if (!resp) { + xa_erase(&sess->rpc_handle_list, entry->id); + up_write(&sess->rpc_lock); + goto free_id; + } - up_read(&sess->rpc_lock); + up_write(&sess->rpc_lock); kvfree(resp); return id; -erase_xa: - xa_erase(&sess->rpc_handle_list, entry->id); free_id: ksmbd_rpc_id_free(entry->id); free_entry: kfree(entry); - up_read(&sess->rpc_lock); return -EINVAL; } @@ -145,7 +148,9 @@ int ksmbd_session_rpc_method(struct ksmbd_session *sess, int id) { struct ksmbd_session_rpc *entry; + lockdep_assert_held(&sess->rpc_lock); entry = xa_load(&sess->rpc_handle_list, id); + return entry ? entry->method : 0; } diff --git a/fs/smb/server/server.h b/fs/smb/server/server.h index 995555febe7d..b8a7317be86b 100644 --- a/fs/smb/server/server.h +++ b/fs/smb/server/server.h @@ -43,6 +43,7 @@ struct ksmbd_server_config { unsigned int auth_mechs; unsigned int max_connections; unsigned int max_inflight_req; + unsigned int max_ip_connections; char *conf[SERVER_CONF_WORK_GROUP + 1]; struct task_struct *dh_task; diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 0c069eff80b7..f901ae18e68a 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -1806,6 +1806,7 @@ int smb2_sess_setup(struct ksmbd_work *work) if (ksmbd_conn_need_reconnect(conn)) { rc = -EFAULT; + ksmbd_user_session_put(sess); sess = NULL; goto out_err; } @@ -4625,8 +4626,15 @@ static int smb2_get_info_file_pipe(struct ksmbd_session *sess, * pipe without opening it, checking error condition here */ id = req->VolatileFileId; - if (!ksmbd_session_rpc_method(sess, id)) + + lockdep_assert_not_held(&sess->rpc_lock); + + down_read(&sess->rpc_lock); + if (!ksmbd_session_rpc_method(sess, id)) { + up_read(&sess->rpc_lock); return -ENOENT; + } + up_read(&sess->rpc_lock); ksmbd_debug(SMB, "FileInfoClass %u, FileId 0x%llx\n", req->FileInfoClass, req->VolatileFileId); @@ -5629,7 +5637,8 @@ static int smb2_get_info_filesystem(struct ksmbd_work *work, if (!work->tcon->posix_extensions) { pr_err("client doesn't negotiate with SMB3.1.1 POSIX Extensions\n"); - rc = -EOPNOTSUPP; + path_put(&path); + return -EOPNOTSUPP; } else { info = (struct filesystem_posix_info *)(rsp->Buffer); info->OptimalTransferSize = cpu_to_le32(stfs.f_bsize); @@ -6823,6 +6832,7 @@ int smb2_read(struct ksmbd_work *work) nbytes = ksmbd_vfs_read(work, fp, length, &offset, aux_payload_buf); if (nbytes < 0) { + kvfree(aux_payload_buf); err = nbytes; goto out; } @@ -7361,7 +7371,7 @@ int smb2_lock(struct ksmbd_work *work) int nolock = 0; LIST_HEAD(lock_list); LIST_HEAD(rollback_list); - int prior_lock = 0; + int prior_lock = 0, bkt; WORK_BUFFERS(work, req, rsp); @@ -7471,7 +7481,7 @@ int smb2_lock(struct ksmbd_work *work) nolock = 1; /* check locks in connection list */ down_read(&conn_list_lock); - list_for_each_entry(conn, &conn_list, conns_list) { + hash_for_each(conn_list, bkt, conn, hlist) { spin_lock(&conn->llist_lock); list_for_each_entry_safe(cmp_lock, tmp2, &conn->lock_list, clist) { if (file_inode(cmp_lock->fl->c.flc_file) != diff --git a/fs/smb/server/smb_common.h b/fs/smb/server/smb_common.h index d742ba754348..863716207a0d 100644 --- a/fs/smb/server/smb_common.h +++ b/fs/smb/server/smb_common.h @@ -10,6 +10,7 @@ #include "glob.h" #include "nterr.h" +#include "../common/cifsglob.h" #include 
"../common/smb2pdu.h" #include "smb2pdu.h" @@ -26,16 +27,8 @@ #define SMB311_PROT 6 #define BAD_PROT 0xFFFF -#define SMB1_VERSION_STRING "1.0" -#define SMB20_VERSION_STRING "2.0" -#define SMB21_VERSION_STRING "2.1" -#define SMB30_VERSION_STRING "3.0" -#define SMB302_VERSION_STRING "3.02" -#define SMB311_VERSION_STRING "3.1.1" - #define SMB_ECHO_INTERVAL (60 * HZ) -#define CIFS_DEFAULT_IOSIZE (64 * 1024) #define MAX_CIFS_SMALL_BUFFER_SIZE 448 /* big enough for most */ #define MAX_STREAM_PROT_LEN 0x00FFFFFF @@ -464,9 +457,4 @@ static inline unsigned int get_rfc1002_len(void *buf) { return be32_to_cpu(*((__be32 *)buf)) & 0xffffff; } - -static inline void inc_rfc1001_len(void *buf, int count) -{ - be32_add_cpu((__be32 *)buf, count); -} #endif /* __SMB_COMMON_H__ */ diff --git a/fs/smb/server/transport_ipc.c b/fs/smb/server/transport_ipc.c index 2a3e2b0ce557..46f87fd1ce1c 100644 --- a/fs/smb/server/transport_ipc.c +++ b/fs/smb/server/transport_ipc.c @@ -335,6 +335,9 @@ static int ipc_server_config_on_startup(struct ksmbd_startup_request *req) if (req->max_connections) server_conf.max_connections = req->max_connections; + if (req->max_ip_connections) + server_conf.max_ip_connections = req->max_ip_connections; + ret = ksmbd_set_netbios_name(req->netbios_name); ret |= ksmbd_set_server_string(req->server_string); ret |= ksmbd_set_work_group(req->work_group); @@ -822,6 +825,9 @@ struct ksmbd_rpc_command *ksmbd_rpc_write(struct ksmbd_session *sess, int handle if (!msg) return NULL; + lockdep_assert_not_held(&sess->rpc_lock); + + down_read(&sess->rpc_lock); msg->type = KSMBD_EVENT_RPC_REQUEST; req = (struct ksmbd_rpc_command *)msg->payload; req->handle = handle; @@ -830,6 +836,7 @@ struct ksmbd_rpc_command *ksmbd_rpc_write(struct ksmbd_session *sess, int handle req->flags |= KSMBD_RPC_WRITE_METHOD; req->payload_sz = payload_sz; memcpy(req->payload, payload, payload_sz); + up_read(&sess->rpc_lock); resp = ipc_msg_send_request(msg, req->handle); ipc_msg_free(msg); @@ -846,6 +853,9 @@ struct ksmbd_rpc_command *ksmbd_rpc_read(struct ksmbd_session *sess, int handle) if (!msg) return NULL; + lockdep_assert_not_held(&sess->rpc_lock); + + down_read(&sess->rpc_lock); msg->type = KSMBD_EVENT_RPC_REQUEST; req = (struct ksmbd_rpc_command *)msg->payload; req->handle = handle; @@ -853,6 +863,7 @@ struct ksmbd_rpc_command *ksmbd_rpc_read(struct ksmbd_session *sess, int handle) req->flags |= rpc_context_flags(sess); req->flags |= KSMBD_RPC_READ_METHOD; req->payload_sz = 0; + up_read(&sess->rpc_lock); resp = ipc_msg_send_request(msg, req->handle); ipc_msg_free(msg); @@ -873,6 +884,9 @@ struct ksmbd_rpc_command *ksmbd_rpc_ioctl(struct ksmbd_session *sess, int handle if (!msg) return NULL; + lockdep_assert_not_held(&sess->rpc_lock); + + down_read(&sess->rpc_lock); msg->type = KSMBD_EVENT_RPC_REQUEST; req = (struct ksmbd_rpc_command *)msg->payload; req->handle = handle; @@ -881,6 +895,7 @@ struct ksmbd_rpc_command *ksmbd_rpc_ioctl(struct ksmbd_session *sess, int handle req->flags |= KSMBD_RPC_IOCTL_METHOD; req->payload_sz = payload_sz; memcpy(req->payload, payload, payload_sz); + up_read(&sess->rpc_lock); resp = ipc_msg_send_request(msg, req->handle); ipc_msg_free(msg); diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index 9e644a0daf1c..a201c5871a77 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -425,6 +425,11 @@ static struct smb_direct_transport *alloc_transport(struct rdma_cm_id *cm_id) conn = ksmbd_conn_alloc(); if (!conn) goto err; + + 
down_write(&conn_list_lock); + hash_add(conn_list, &conn->hlist, 0); + up_write(&conn_list_lock); + conn->transport = KSMBD_TRANS(t); KSMBD_TRANS(t)->conn = conn; KSMBD_TRANS(t)->ops = &ksmbd_smb_direct_transport_ops; @@ -1569,18 +1574,14 @@ static int smb_direct_rdma_xmit(struct smb_direct_transport *t, get_buf_page_count(desc_buf, desc_buf_len), msg->sg_list, SG_CHUNK_SIZE); if (ret) { - kfree(msg); ret = -ENOMEM; - goto out; + goto free_msg; } ret = get_sg_list(desc_buf, desc_buf_len, msg->sgt.sgl, msg->sgt.orig_nents); - if (ret < 0) { - sg_free_table_chained(&msg->sgt, SG_CHUNK_SIZE); - kfree(msg); - goto out; - } + if (ret < 0) + goto free_table; ret = rdma_rw_ctx_init(&msg->rdma_ctx, sc->ib.qp, sc->ib.qp->port, msg->sgt.sgl, @@ -1591,9 +1592,7 @@ static int smb_direct_rdma_xmit(struct smb_direct_transport *t, is_read ? DMA_FROM_DEVICE : DMA_TO_DEVICE); if (ret < 0) { pr_err("failed to init rdma_rw_ctx: %d\n", ret); - sg_free_table_chained(&msg->sgt, SG_CHUNK_SIZE); - kfree(msg); - goto out; + goto free_table; } list_add_tail(&msg->list, &msg_list); @@ -1625,6 +1624,12 @@ out: atomic_add(credits_needed, &sc->rw_io.credits.count); wake_up(&sc->rw_io.credits.wait_queue); return ret; + +free_table: + sg_free_table_chained(&msg->sgt, SG_CHUNK_SIZE); +free_msg: + kfree(msg); + goto out; } static int smb_direct_rdma_write(struct ksmbd_transport *t, diff --git a/fs/smb/server/transport_tcp.c b/fs/smb/server/transport_tcp.c index 4337df97987d..7a1e3dcc2cde 100644 --- a/fs/smb/server/transport_tcp.c +++ b/fs/smb/server/transport_tcp.c @@ -86,13 +86,21 @@ static struct tcp_transport *alloc_transport(struct socket *client_sk) } #if IS_ENABLED(CONFIG_IPV6) - if (client_sk->sk->sk_family == AF_INET6) + if (client_sk->sk->sk_family == AF_INET6) { memcpy(&conn->inet6_addr, &client_sk->sk->sk_v6_daddr, 16); - else + conn->inet_hash = ipv6_addr_hash(&client_sk->sk->sk_v6_daddr); + } else { conn->inet_addr = inet_sk(client_sk->sk)->inet_daddr; + conn->inet_hash = ipv4_addr_hash(inet_sk(client_sk->sk)->inet_daddr); + } #else conn->inet_addr = inet_sk(client_sk->sk)->inet_daddr; + conn->inet_hash = ipv4_addr_hash(inet_sk(client_sk->sk)->inet_daddr); #endif + down_write(&conn_list_lock); + hash_add(conn_list, &conn->hlist, conn->inet_hash); + up_write(&conn_list_lock); + conn->transport = KSMBD_TRANS(t); KSMBD_TRANS(t)->conn = conn; KSMBD_TRANS(t)->ops = &ksmbd_tcp_transport_ops; @@ -170,17 +178,6 @@ static struct kvec *get_conn_iovec(struct tcp_transport *t, unsigned int nr_segs return new_iov; } -static unsigned short ksmbd_tcp_get_port(const struct sockaddr *sa) -{ - switch (sa->sa_family) { - case AF_INET: - return ntohs(((struct sockaddr_in *)sa)->sin_port); - case AF_INET6: - return ntohs(((struct sockaddr_in6 *)sa)->sin6_port); - } - return 0; -} - /** * ksmbd_tcp_new_connection() - create a new tcp session on mount * @client_sk: socket associated with new connection @@ -192,7 +189,6 @@ static unsigned short ksmbd_tcp_get_port(const struct sockaddr *sa) */ static int ksmbd_tcp_new_connection(struct socket *client_sk) { - struct sockaddr *csin; int rc = 0; struct tcp_transport *t; struct task_struct *handler; @@ -203,27 +199,26 @@ static int ksmbd_tcp_new_connection(struct socket *client_sk) return -ENOMEM; } - csin = KSMBD_TCP_PEER_SOCKADDR(KSMBD_TRANS(t)->conn); - if (kernel_getpeername(client_sk, csin) < 0) { - pr_err("client ip resolution failed\n"); - rc = -EINVAL; - goto out_error; - } - +#if IS_ENABLED(CONFIG_IPV6) + if (client_sk->sk->sk_family == AF_INET6) + handler = 
kthread_run(ksmbd_conn_handler_loop, + KSMBD_TRANS(t)->conn, "ksmbd:%pI6c", + &KSMBD_TRANS(t)->conn->inet6_addr); + else + handler = kthread_run(ksmbd_conn_handler_loop, + KSMBD_TRANS(t)->conn, "ksmbd:%pI4", + &KSMBD_TRANS(t)->conn->inet_addr); +#else handler = kthread_run(ksmbd_conn_handler_loop, - KSMBD_TRANS(t)->conn, - "ksmbd:%u", - ksmbd_tcp_get_port(csin)); + KSMBD_TRANS(t)->conn, "ksmbd:%pI4", + &KSMBD_TRANS(t)->conn->inet_addr); +#endif if (IS_ERR(handler)) { pr_err("cannot start conn thread\n"); rc = PTR_ERR(handler); free_transport(t); } return rc; - -out_error: - free_transport(t); - return rc; } /** @@ -237,7 +232,8 @@ static int ksmbd_kthread_fn(void *p) struct socket *client_sk = NULL; struct interface *iface = (struct interface *)p; struct ksmbd_conn *conn; - int ret; + int ret, inet_hash; + unsigned int max_ip_conns; while (!kthread_should_stop()) { mutex_lock(&iface->sock_release_lock); @@ -255,34 +251,49 @@ static int ksmbd_kthread_fn(void *p) continue; } + if (!server_conf.max_ip_connections) + goto skip_max_ip_conns_limit; + /* * Limits repeated connections from clients with the same IP. */ +#if IS_ENABLED(CONFIG_IPV6) + if (client_sk->sk->sk_family == AF_INET6) + inet_hash = ipv6_addr_hash(&client_sk->sk->sk_v6_daddr); + else + inet_hash = ipv4_addr_hash(inet_sk(client_sk->sk)->inet_daddr); +#else + inet_hash = ipv4_addr_hash(inet_sk(client_sk->sk)->inet_daddr); +#endif + + max_ip_conns = 0; down_read(&conn_list_lock); - list_for_each_entry(conn, &conn_list, conns_list) + hash_for_each_possible(conn_list, conn, hlist, inet_hash) { #if IS_ENABLED(CONFIG_IPV6) if (client_sk->sk->sk_family == AF_INET6) { if (memcmp(&client_sk->sk->sk_v6_daddr, - &conn->inet6_addr, 16) == 0) { - ret = -EAGAIN; - break; - } + &conn->inet6_addr, 16) == 0) + max_ip_conns++; } else if (inet_sk(client_sk->sk)->inet_daddr == - conn->inet_addr) { - ret = -EAGAIN; - break; - } + conn->inet_addr) + max_ip_conns++; #else if (inet_sk(client_sk->sk)->inet_daddr == - conn->inet_addr) { + conn->inet_addr) + max_ip_conns++; +#endif + if (server_conf.max_ip_connections <= max_ip_conns) { + pr_info_ratelimited("Maximum IP connections exceeded (%u/%u)\n", + max_ip_conns, server_conf.max_ip_connections); ret = -EAGAIN; break; } -#endif + } up_read(&conn_list_lock); if (ret == -EAGAIN) continue; +skip_max_ip_conns_limit: if (server_conf.max_connections && atomic_inc_return(&active_num_conn) >= server_conf.max_connections) { pr_info_ratelimited("Limit the maximum number of connections(%u)\n", @@ -468,12 +479,13 @@ static int create_socket(struct interface *iface) struct socket *ksmbd_socket; bool ipv4 = false; - ret = sock_create(PF_INET6, SOCK_STREAM, IPPROTO_TCP, &ksmbd_socket); + ret = sock_create_kern(current->nsproxy->net_ns, PF_INET6, SOCK_STREAM, + IPPROTO_TCP, &ksmbd_socket); if (ret) { if (ret != -EAFNOSUPPORT) pr_err("Can't create socket for ipv6, fallback to ipv4: %d\n", ret); - ret = sock_create(PF_INET, SOCK_STREAM, IPPROTO_TCP, - &ksmbd_socket); + ret = sock_create_kern(current->nsproxy->net_ns, PF_INET, + SOCK_STREAM, IPPROTO_TCP, &ksmbd_socket); if (ret) { pr_err("Can't create socket for ipv4: %d\n", ret); goto out_clear; diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c index 1cfa688904b2..891ed2dc2b73 100644 --- a/fs/smb/server/vfs.c +++ b/fs/smb/server/vfs.c @@ -20,6 +20,7 @@ #include <linux/sched/xacct.h> #include <linux/crc32c.h> #include <linux/namei.h> +#include <linux/splice.h> #include "glob.h" #include "oplock.h" @@ -72,7 +73,7 @@ static int ksmbd_vfs_path_lookup(struct 
ksmbd_share_config *share_conf, { struct qstr last; struct filename *filename __free(putname) = NULL; - struct path *root_share_path = &share_conf->vfs_path; + const struct path *root_share_path = &share_conf->vfs_path; int err, type; struct dentry *d; @@ -1305,7 +1306,7 @@ int ksmbd_vfs_kern_path_locked(struct ksmbd_work *work, char *filepath, caseless, true); } -void ksmbd_vfs_kern_path_unlock(struct path *path) +void ksmbd_vfs_kern_path_unlock(const struct path *path) { /* While lock is still held, ->d_parent is safe */ inode_unlock(d_inode(path->dentry->d_parent)); @@ -1829,8 +1830,19 @@ int ksmbd_vfs_copy_file_ranges(struct ksmbd_work *work, if (src_off + len > src_file_size) return -E2BIG; - ret = vfs_copy_file_range(src_fp->filp, src_off, - dst_fp->filp, dst_off, len, 0); + /* + * vfs_copy_file_range does not allow overlapped copying + * within the same file. + */ + if (file_inode(src_fp->filp) == file_inode(dst_fp->filp) && + dst_off + len > src_off && + dst_off < src_off + len) + ret = do_splice_direct(src_fp->filp, &src_off, + dst_fp->filp, &dst_off, + min_t(size_t, len, MAX_RW_COUNT), 0); + else + ret = vfs_copy_file_range(src_fp->filp, src_off, + dst_fp->filp, dst_off, len, 0); if (ret == -EOPNOTSUPP || ret == -EXDEV) ret = vfs_copy_file_range(src_fp->filp, src_off, dst_fp->filp, dst_off, len, @@ -1855,7 +1867,7 @@ void ksmbd_vfs_posix_lock_unblock(struct file_lock *flock) } int ksmbd_vfs_set_init_posix_acl(struct mnt_idmap *idmap, - struct path *path) + const struct path *path) { struct posix_acl_state acl_state; struct posix_acl *acls; @@ -1908,7 +1920,7 @@ int ksmbd_vfs_set_init_posix_acl(struct mnt_idmap *idmap, } int ksmbd_vfs_inherit_posix_acl(struct mnt_idmap *idmap, - struct path *path, struct inode *parent_inode) + const struct path *path, struct inode *parent_inode) { struct posix_acl *acls; struct posix_acl_entry *pace; diff --git a/fs/smb/server/vfs.h b/fs/smb/server/vfs.h index d47472f3e30b..df6421b4590b 100644 --- a/fs/smb/server/vfs.h +++ b/fs/smb/server/vfs.h @@ -123,7 +123,7 @@ int ksmbd_vfs_kern_path(struct ksmbd_work *work, char *name, int ksmbd_vfs_kern_path_locked(struct ksmbd_work *work, char *name, unsigned int flags, struct path *path, bool caseless); -void ksmbd_vfs_kern_path_unlock(struct path *path); +void ksmbd_vfs_kern_path_unlock(const struct path *path); struct dentry *ksmbd_vfs_kern_path_create(struct ksmbd_work *work, const char *name, unsigned int flags, @@ -164,8 +164,8 @@ int ksmbd_vfs_get_dos_attrib_xattr(struct mnt_idmap *idmap, struct dentry *dentry, struct xattr_dos_attrib *da); int ksmbd_vfs_set_init_posix_acl(struct mnt_idmap *idmap, - struct path *path); + const struct path *path); int ksmbd_vfs_inherit_posix_acl(struct mnt_idmap *idmap, - struct path *path, + const struct path *path, struct inode *parent_inode); #endif /* __KSMBD_VFS_H__ */ diff --git a/fs/stat.c b/fs/stat.c index f95c1dc3eaa4..6c79661e1b96 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -293,7 +293,7 @@ static int statx_lookup_flags(int flags) return lookup_flags; } -static int vfs_statx_path(struct path *path, int flags, struct kstat *stat, +static int vfs_statx_path(const struct path *path, int flags, struct kstat *stat, u32 request_mask) { int error = vfs_getattr(path, stat, request_mask, flags); diff --git a/fs/super.c b/fs/super.c index f4fa0e93c463..5bab94fb7e03 100644 --- a/fs/super.c +++ b/fs/super.c @@ -323,7 +323,6 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags, if (!s) return NULL; - INIT_LIST_HEAD(&s->s_mounts); s->s_user_ns 
= get_user_ns(user_ns); init_rwsem(&s->s_umount); lockdep_set_class(&s->s_umount, &type->s_umount_key); @@ -408,7 +407,7 @@ static void __put_super(struct super_block *s) list_del_init(&s->s_list); WARN_ON(s->s_dentry_lru.node); WARN_ON(s->s_inode_lru.node); - WARN_ON(!list_empty(&s->s_mounts)); + WARN_ON(s->s_mounts); call_rcu(&s->rcu, destroy_super_rcu); } } diff --git a/fs/udf/inode.c b/fs/udf/inode.c index f24aa98e6869..a79d73f28aa7 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -2272,6 +2272,9 @@ int udf_current_aext(struct inode *inode, struct extent_position *epos, if (check_add_overflow(sizeof(struct allocExtDesc), le32_to_cpu(header->lengthAllocDescs), &alen)) return -1; + + if (alen > epos->bh->b_size) + return -1; } switch (iinfo->i_alloc_type) { diff --git a/fs/vboxsf/dir.c b/fs/vboxsf/dir.c index 770e29ec3557..42bedc4ec7af 100644 --- a/fs/vboxsf/dir.c +++ b/fs/vboxsf/dir.c @@ -315,46 +315,39 @@ static int vboxsf_dir_atomic_open(struct inode *parent, struct dentry *dentry, { struct vboxsf_sbi *sbi = VBOXSF_SBI(parent->i_sb); struct vboxsf_handle *sf_handle; - struct dentry *res = NULL; u64 handle; int err; if (d_in_lookup(dentry)) { - res = vboxsf_dir_lookup(parent, dentry, 0); - if (IS_ERR(res)) - return PTR_ERR(res); - - if (res) - dentry = res; + struct dentry *res = vboxsf_dir_lookup(parent, dentry, 0); + if (res || d_really_is_positive(dentry)) + return finish_no_open(file, res); } /* Only creates */ - if (!(flags & O_CREAT) || d_really_is_positive(dentry)) - return finish_no_open(file, res); + if (!(flags & O_CREAT)) + return finish_no_open(file, NULL); err = vboxsf_dir_create(parent, dentry, mode, false, flags & O_EXCL, &handle); if (err) - goto out; + return err; sf_handle = vboxsf_create_sf_handle(d_inode(dentry), handle, SHFL_CF_ACCESS_READWRITE); if (IS_ERR(sf_handle)) { vboxsf_close(sbi->root, handle); - err = PTR_ERR(sf_handle); - goto out; + return PTR_ERR(sf_handle); } err = finish_open(file, dentry, generic_file_open); if (err) { /* This also closes the handle passed to vboxsf_create_sf_handle() */ vboxsf_release_sf_handle(d_inode(dentry), sf_handle); - goto out; + return err; } file->private_data = sf_handle; file->f_mode |= FMODE_CREATED; -out: - dput(res); - return err; + return 0; } static int vboxsf_dir_unlink(struct inode *parent, struct dentry *dentry) diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c index fd3a5922f6c3..90e2ad8ee5f4 100644 --- a/fs/zonefs/file.c +++ b/fs/zonefs/file.c @@ -85,7 +85,7 @@ static int zonefs_write_iomap_begin(struct inode *inode, loff_t offset, /* * For conventional zones, all blocks are always mapped. For sequential * zones, all blocks after always mapped below the inode size (zone - * write pointer) and unwriten beyond. + * write pointer) and unwritten beyond. */ mutex_lock(&zi->i_truncate_mutex); iomap->bdev = inode->i_sb->s_bdev; diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index 4dc7f967c861..70be0b3dda49 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -268,7 +268,7 @@ static void zonefs_handle_io_error(struct inode *inode, struct blk_zone *zone, * Check the zone condition: if the zone is not "bad" (offline or * read-only), read errors are simply signaled to the IO issuer as long * as there is no inconsistency between the inode size and the amount of - * data writen in the zone (data_size). + * data written in the zone (data_size). 
*/ data_size = zonefs_check_zone_condition(sb, z, zone); isize = i_size_read(inode); @@ -282,7 +282,7 @@ static void zonefs_handle_io_error(struct inode *inode, struct blk_zone *zone, * For the latter case, the cause may be a write IO error or an external * action on the device. Two error patterns exist: * 1) The inode size is lower than the amount of data in the zone: - * a write operation partially failed and data was writen at the end + * a write operation partially failed and data was written at the end * of the file. This can happen in the case of a large direct IO * needing several BIOs and/or write requests to be processed. * 2) The inode size is larger than the amount of data in the zone: |
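
The overlap guard added to ksmbd_vfs_copy_file_ranges() above is the standard half-open interval test. A minimal standalone userspace sketch of the same predicate (illustrative only, not part of the patch; ranges_overlap is a hypothetical helper name):

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/*
 * [src_off, src_off + len) and [dst_off, dst_off + len) overlap iff each
 * range starts before the other one ends; this is the condition the patch
 * checks before deciding between do_splice_direct() and
 * vfs_copy_file_range().
 */
static bool ranges_overlap(uint64_t src_off, uint64_t dst_off, uint64_t len)
{
	return dst_off + len > src_off && dst_off < src_off + len;
}

int main(void)
{
	assert(ranges_overlap(0, 4096, 8192));  /* tail of src covers head of dst */
	assert(!ranges_overlap(0, 4096, 4096)); /* adjacent ranges do not overlap */
	assert(ranges_overlap(4096, 0, 8192));  /* dst before src still overlaps */
	return 0;
}

vfs_copy_file_range() rejects overlapping ranges on the same inode, so the patch routes that case through do_splice_direct(), which copies through the page cache and, as written, is capped at MAX_RW_COUNT per call.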