diff options
Diffstat (limited to 'fs/ceph')
-rw-r--r-- | fs/ceph/crypto.c | 2 | ||||
-rw-r--r-- | fs/ceph/dir.c | 3 | ||||
-rw-r--r-- | fs/ceph/file.c | 30 | ||||
-rw-r--r-- | fs/ceph/inode.c | 12 | ||||
-rw-r--r-- | fs/ceph/io.c | 100 | ||||
-rw-r--r-- | fs/ceph/io.h | 8 | ||||
-rw-r--r-- | fs/ceph/ioctl.c | 17 | ||||
-rw-r--r-- | fs/ceph/locks.c | 5 | ||||
-rw-r--r-- | fs/ceph/mds_client.c | 24 | ||||
-rw-r--r-- | fs/ceph/mdsmap.c | 14 | ||||
-rw-r--r-- | fs/ceph/super.c | 18 | ||||
-rw-r--r-- | fs/ceph/super.h | 18 |
12 files changed, 177 insertions, 74 deletions
diff --git a/fs/ceph/crypto.c b/fs/ceph/crypto.c index cab722619207..7026e794813c 100644 --- a/fs/ceph/crypto.c +++ b/fs/ceph/crypto.c @@ -133,6 +133,8 @@ static const union fscrypt_policy *ceph_get_dummy_policy(struct super_block *sb) } static struct fscrypt_operations ceph_fscrypt_ops = { + .inode_info_offs = (int)offsetof(struct ceph_inode_info, i_crypt_info) - + (int)offsetof(struct ceph_inode_info, netfs.inode), .needs_bounce_pages = 1, .get_context = ceph_crypt_get_context, .set_context = ceph_crypt_set_context, diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 32973c62c1a2..d18c0eaef9b7 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -1260,8 +1260,7 @@ static void ceph_async_unlink_cb(struct ceph_mds_client *mdsc, spin_unlock(&fsc->async_unlink_conflict_lock); spin_lock(&dentry->d_lock); - di->flags &= ~CEPH_DENTRY_ASYNC_UNLINK; - wake_up_bit(&di->flags, CEPH_DENTRY_ASYNC_UNLINK_BIT); + clear_and_wake_up_bit(CEPH_DENTRY_ASYNC_UNLINK_BIT, &di->flags); spin_unlock(&dentry->d_lock); synchronize_rcu(); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 978acd3d4b32..99b30f784ee2 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -579,8 +579,7 @@ static void wake_async_create_waiters(struct inode *inode, spin_lock(&ci->i_ceph_lock); if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) { - ci->i_ceph_flags &= ~CEPH_I_ASYNC_CREATE; - wake_up_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT); + clear_and_wake_up_bit(CEPH_ASYNC_CREATE_BIT, &ci->i_ceph_flags); if (ci->i_ceph_flags & CEPH_I_ASYNC_CHECK_CAPS) { ci->i_ceph_flags &= ~CEPH_I_ASYNC_CHECK_CAPS; @@ -762,8 +761,7 @@ static int ceph_finish_async_create(struct inode *dir, struct inode *inode, } spin_lock(&dentry->d_lock); - di->flags &= ~CEPH_DENTRY_ASYNC_CREATE; - wake_up_bit(&di->flags, CEPH_DENTRY_ASYNC_CREATE_BIT); + clear_and_wake_up_bit(CEPH_DENTRY_ASYNC_CREATE_BIT, &di->flags); spin_unlock(&dentry->d_lock); return ret; @@ -2121,10 +2119,10 @@ again: if (ceph_inode_is_shutdown(inode)) return -ESTALE; - if (direct_lock) - ceph_start_io_direct(inode); - else - ceph_start_io_read(inode); + ret = direct_lock ? ceph_start_io_direct(inode) : + ceph_start_io_read(inode); + if (ret) + return ret; if (!(fi->flags & CEPH_F_SYNC) && !direct_lock) want |= CEPH_CAP_FILE_CACHE; @@ -2277,7 +2275,9 @@ static ssize_t ceph_splice_read(struct file *in, loff_t *ppos, (fi->flags & CEPH_F_SYNC)) return copy_splice_read(in, ppos, pipe, len, flags); - ceph_start_io_read(inode); + ret = ceph_start_io_read(inode); + if (ret) + return ret; want = CEPH_CAP_FILE_CACHE; if (fi->fmode & CEPH_FILE_MODE_LAZY) @@ -2356,10 +2356,10 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) direct_lock = true; retry_snap: - if (direct_lock) - ceph_start_io_direct(inode); - else - ceph_start_io_write(inode); + err = direct_lock ? ceph_start_io_direct(inode) : + ceph_start_io_write(inode); + if (err) + goto out_unlocked; if (iocb->ki_flags & IOCB_APPEND) { err = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false); @@ -2878,7 +2878,7 @@ static ssize_t ceph_do_objects_copy(struct ceph_inode_info *src_ci, u64 *src_off struct ceph_object_id src_oid, dst_oid; struct ceph_osd_client *osdc; struct ceph_osd_request *req; - size_t bytes = 0; + ssize_t bytes = 0; u64 src_objnum, src_objoff, dst_objnum, dst_objoff; u32 src_objlen, dst_objlen; u32 object_size = src_ci->i_layout.object_size; @@ -2928,7 +2928,7 @@ static ssize_t ceph_do_objects_copy(struct ceph_inode_info *src_ci, u64 *src_off "OSDs don't support copy-from2; disabling copy offload\n"); } doutc(cl, "returned %d\n", ret); - if (!bytes) + if (bytes <= 0) bytes = ret; goto out; } diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index f67025465de0..a6e260d9e420 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -711,6 +711,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ci->i_work_mask = 0; memset(&ci->i_btime, '\0', sizeof(ci->i_btime)); #ifdef CONFIG_FS_ENCRYPTION + ci->i_crypt_info = NULL; ci->fscrypt_auth = NULL; ci->fscrypt_auth_len = 0; #endif @@ -1793,6 +1794,11 @@ retry_lookup: goto done; } + if (unlikely(!in)) { + err = -EINVAL; + goto done; + } + /* attach proper inode */ if (d_really_is_negative(dn)) { ceph_dir_clear_ordered(dir); @@ -1828,6 +1834,12 @@ retry_lookup: doutc(cl, " linking snapped dir %p to dn %p\n", in, req->r_dentry); ceph_dir_clear_ordered(dir); + + if (unlikely(!in)) { + err = -EINVAL; + goto done; + } + ihold(in); err = splice_dentry(&req->r_dentry, in); if (err < 0) diff --git a/fs/ceph/io.c b/fs/ceph/io.c index c456509b31c3..2d10f49c93a9 100644 --- a/fs/ceph/io.c +++ b/fs/ceph/io.c @@ -21,14 +21,23 @@ /* Call with exclusively locked inode->i_rwsem */ static void ceph_block_o_direct(struct ceph_inode_info *ci, struct inode *inode) { + bool is_odirect; + lockdep_assert_held_write(&inode->i_rwsem); - if (READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT) { - spin_lock(&ci->i_ceph_lock); - ci->i_ceph_flags &= ~CEPH_I_ODIRECT; - spin_unlock(&ci->i_ceph_lock); - inode_dio_wait(inode); + spin_lock(&ci->i_ceph_lock); + /* ensure that bit state is consistent */ + smp_mb__before_atomic(); + is_odirect = READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT; + if (is_odirect) { + clear_bit(CEPH_I_ODIRECT_BIT, &ci->i_ceph_flags); + /* ensure modified bit is visible */ + smp_mb__after_atomic(); } + spin_unlock(&ci->i_ceph_lock); + + if (is_odirect) + inode_dio_wait(inode); } /** @@ -47,20 +56,35 @@ static void ceph_block_o_direct(struct ceph_inode_info *ci, struct inode *inode) * Note that buffered writes and truncates both take a write lock on * inode->i_rwsem, meaning that those are serialised w.r.t. the reads. */ -void -ceph_start_io_read(struct inode *inode) +int ceph_start_io_read(struct inode *inode) { struct ceph_inode_info *ci = ceph_inode(inode); + bool is_odirect; + int err; /* Be an optimist! */ - down_read(&inode->i_rwsem); - if (!(READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT)) - return; + err = down_read_killable(&inode->i_rwsem); + if (err) + return err; + + spin_lock(&ci->i_ceph_lock); + /* ensure that bit state is consistent */ + smp_mb__before_atomic(); + is_odirect = READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT; + spin_unlock(&ci->i_ceph_lock); + if (!is_odirect) + return 0; up_read(&inode->i_rwsem); + /* Slow path.... */ - down_write(&inode->i_rwsem); + err = down_write_killable(&inode->i_rwsem); + if (err) + return err; + ceph_block_o_direct(ci, inode); downgrade_write(&inode->i_rwsem); + + return 0; } /** @@ -83,11 +107,12 @@ ceph_end_io_read(struct inode *inode) * Declare that a buffered write operation is about to start, and ensure * that we block all direct I/O. */ -void -ceph_start_io_write(struct inode *inode) +int ceph_start_io_write(struct inode *inode) { - down_write(&inode->i_rwsem); - ceph_block_o_direct(ceph_inode(inode), inode); + int err = down_write_killable(&inode->i_rwsem); + if (!err) + ceph_block_o_direct(ceph_inode(inode), inode); + return err; } /** @@ -106,12 +131,22 @@ ceph_end_io_write(struct inode *inode) /* Call with exclusively locked inode->i_rwsem */ static void ceph_block_buffered(struct ceph_inode_info *ci, struct inode *inode) { + bool is_odirect; + lockdep_assert_held_write(&inode->i_rwsem); - if (!(READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT)) { - spin_lock(&ci->i_ceph_lock); - ci->i_ceph_flags |= CEPH_I_ODIRECT; - spin_unlock(&ci->i_ceph_lock); + spin_lock(&ci->i_ceph_lock); + /* ensure that bit state is consistent */ + smp_mb__before_atomic(); + is_odirect = READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT; + if (!is_odirect) { + set_bit(CEPH_I_ODIRECT_BIT, &ci->i_ceph_flags); + /* ensure modified bit is visible */ + smp_mb__after_atomic(); + } + spin_unlock(&ci->i_ceph_lock); + + if (!is_odirect) { /* FIXME: unmap_mapping_range? */ filemap_write_and_wait(inode->i_mapping); } @@ -133,20 +168,35 @@ static void ceph_block_buffered(struct ceph_inode_info *ci, struct inode *inode) * Note that buffered writes and truncates both take a write lock on * inode->i_rwsem, meaning that those are serialised w.r.t. O_DIRECT. */ -void -ceph_start_io_direct(struct inode *inode) +int ceph_start_io_direct(struct inode *inode) { struct ceph_inode_info *ci = ceph_inode(inode); + bool is_odirect; + int err; /* Be an optimist! */ - down_read(&inode->i_rwsem); - if (READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT) - return; + err = down_read_killable(&inode->i_rwsem); + if (err) + return err; + + spin_lock(&ci->i_ceph_lock); + /* ensure that bit state is consistent */ + smp_mb__before_atomic(); + is_odirect = READ_ONCE(ci->i_ceph_flags) & CEPH_I_ODIRECT; + spin_unlock(&ci->i_ceph_lock); + if (is_odirect) + return 0; up_read(&inode->i_rwsem); + /* Slow path.... */ - down_write(&inode->i_rwsem); + err = down_write_killable(&inode->i_rwsem); + if (err) + return err; + ceph_block_buffered(ci, inode); downgrade_write(&inode->i_rwsem); + + return 0; } /** diff --git a/fs/ceph/io.h b/fs/ceph/io.h index fa594cd77348..79029825e8b8 100644 --- a/fs/ceph/io.h +++ b/fs/ceph/io.h @@ -2,11 +2,13 @@ #ifndef _FS_CEPH_IO_H #define _FS_CEPH_IO_H -void ceph_start_io_read(struct inode *inode); +#include <linux/compiler_attributes.h> + +int __must_check ceph_start_io_read(struct inode *inode); void ceph_end_io_read(struct inode *inode); -void ceph_start_io_write(struct inode *inode); +int __must_check ceph_start_io_write(struct inode *inode); void ceph_end_io_write(struct inode *inode); -void ceph_start_io_direct(struct inode *inode); +int __must_check ceph_start_io_direct(struct inode *inode); void ceph_end_io_direct(struct inode *inode); #endif /* FS_CEPH_IO_H */ diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index e861de3c79b9..15cde055f3da 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -246,21 +246,28 @@ static long ceph_ioctl_lazyio(struct file *file) struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_client *mdsc = ceph_inode_to_fs_client(inode)->mdsc; struct ceph_client *cl = mdsc->fsc->client; + bool is_file_already_lazy = false; + spin_lock(&ci->i_ceph_lock); if ((fi->fmode & CEPH_FILE_MODE_LAZY) == 0) { - spin_lock(&ci->i_ceph_lock); fi->fmode |= CEPH_FILE_MODE_LAZY; ci->i_nr_by_mode[ffs(CEPH_FILE_MODE_LAZY)]++; __ceph_touch_fmode(ci, mdsc, fi->fmode); - spin_unlock(&ci->i_ceph_lock); + } else { + is_file_already_lazy = true; + } + spin_unlock(&ci->i_ceph_lock); + + if (is_file_already_lazy) { + doutc(cl, "file %p %p %llx.%llx already lazy\n", file, inode, + ceph_vinop(inode)); + } else { doutc(cl, "file %p %p %llx.%llx marked lazy\n", file, inode, ceph_vinop(inode)); ceph_check_caps(ci, 0); - } else { - doutc(cl, "file %p %p %llx.%llx already lazy\n", file, inode, - ceph_vinop(inode)); } + return 0; } diff --git a/fs/ceph/locks.c b/fs/ceph/locks.c index ebf4ac0055dd..dd764f9c64b9 100644 --- a/fs/ceph/locks.c +++ b/fs/ceph/locks.c @@ -221,7 +221,10 @@ static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc, if (err && err != -ERESTARTSYS) return err; - wait_for_completion_killable(&req->r_safe_completion); + err = wait_for_completion_killable(&req->r_safe_completion); + if (err) + return err; + return 0; } diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 3bc72b47fe4d..1740047aef0f 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -979,14 +979,15 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc, if (mds >= mdsc->max_sessions) { int newmax = 1 << get_count_order(mds + 1); struct ceph_mds_session **sa; + size_t ptr_size = sizeof(struct ceph_mds_session *); doutc(cl, "realloc to %d\n", newmax); - sa = kcalloc(newmax, sizeof(void *), GFP_NOFS); + sa = kcalloc(newmax, ptr_size, GFP_NOFS); if (!sa) goto fail_realloc; if (mdsc->sessions) { memcpy(sa, mdsc->sessions, - mdsc->max_sessions * sizeof(void *)); + mdsc->max_sessions * ptr_size); kfree(mdsc->sessions); } mdsc->sessions = sa; @@ -2221,7 +2222,7 @@ static int trim_caps_cb(struct inode *inode, int mds, void *arg) int count; dput(dentry); d_prune_aliases(inode); - count = atomic_read(&inode->i_count); + count = icount_read(inode); if (count == 1) (*remaining)--; doutc(cl, "%p %llx.%llx cap %p pruned, count now %d\n", @@ -2532,6 +2533,7 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req, struct ceph_mount_options *opt = req->r_mdsc->fsc->mount_options; size_t size = sizeof(struct ceph_mds_reply_dir_entry); unsigned int num_entries; + u64 bytes_count; int order; spin_lock(&ci->i_ceph_lock); @@ -2540,7 +2542,11 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req, num_entries = max(num_entries, 1U); num_entries = min(num_entries, opt->max_readdir); - order = get_order(size * num_entries); + bytes_count = (u64)size * num_entries; + if (unlikely(bytes_count > ULONG_MAX)) + bytes_count = ULONG_MAX; + + order = get_order((unsigned long)bytes_count); while (order >= 0) { rinfo->dir_entries = (void*)__get_free_pages(GFP_KERNEL | __GFP_NOWARN | @@ -2550,7 +2556,7 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req, break; order--; } - if (!rinfo->dir_entries) + if (!rinfo->dir_entries || unlikely(order < 0)) return -ENOMEM; num_entries = (PAGE_SIZE << order) / size; @@ -5649,11 +5655,19 @@ static int ceph_mds_auth_match(struct ceph_mds_client *mdsc, u32 caller_uid = from_kuid(&init_user_ns, cred->fsuid); u32 caller_gid = from_kgid(&init_user_ns, cred->fsgid); struct ceph_client *cl = mdsc->fsc->client; + const char *fs_name = mdsc->fsc->mount_options->mds_namespace; const char *spath = mdsc->fsc->mount_options->server_path; bool gid_matched = false; u32 gid, tlen, len; int i, j; + doutc(cl, "fsname check fs_name=%s match.fs_name=%s\n", + fs_name, auth->match.fs_name ? auth->match.fs_name : ""); + if (auth->match.fs_name && strcmp(auth->match.fs_name, fs_name)) { + /* fsname mismatch, try next one */ + return 0; + } + doutc(cl, "match.uid %lld\n", auth->match.uid); if (auth->match.uid != MDS_AUTH_UID_ANY) { if (auth->match.uid != caller_uid) diff --git a/fs/ceph/mdsmap.c b/fs/ceph/mdsmap.c index 8109aba66e02..2c7b151a7c95 100644 --- a/fs/ceph/mdsmap.c +++ b/fs/ceph/mdsmap.c @@ -353,10 +353,22 @@ struct ceph_mdsmap *ceph_mdsmap_decode(struct ceph_mds_client *mdsc, void **p, __decode_and_drop_type(p, end, u8, bad_ext); } if (mdsmap_ev >= 8) { + u32 fsname_len; /* enabled */ ceph_decode_8_safe(p, end, m->m_enabled, bad_ext); /* fs_name */ - ceph_decode_skip_string(p, end, bad_ext); + ceph_decode_32_safe(p, end, fsname_len, bad_ext); + + /* validate fsname against mds_namespace */ + if (!namespace_equals(mdsc->fsc->mount_options, *p, + fsname_len)) { + pr_warn_client(cl, "fsname %*pE doesn't match mds_namespace %s\n", + (int)fsname_len, (char *)*p, + mdsc->fsc->mount_options->mds_namespace); + goto bad; + } + /* skip fsname after validation */ + ceph_decode_skip_n(p, end, fsname_len, bad); } /* damaged */ if (mdsmap_ev >= 9) { diff --git a/fs/ceph/super.c b/fs/ceph/super.c index c3eb651862c5..ad0cf177e75a 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -246,20 +246,6 @@ static void canonicalize_path(char *path) path[j] = '\0'; } -/* - * Check if the mds namespace in ceph_mount_options matches - * the passed in namespace string. First time match (when - * ->mds_namespace is NULL) is treated specially, since - * ->mds_namespace needs to be initialized by the caller. - */ -static int namespace_equals(struct ceph_mount_options *fsopt, - const char *namespace, size_t len) -{ - return !(fsopt->mds_namespace && - (strlen(fsopt->mds_namespace) != len || - strncmp(fsopt->mds_namespace, namespace, len))); -} - static int ceph_parse_old_source(const char *dev_name, const char *dev_name_end, struct fs_context *fc) { @@ -862,7 +848,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, fsc->inode_wq = alloc_workqueue("ceph-inode", WQ_UNBOUND, 0); if (!fsc->inode_wq) goto fail_client; - fsc->cap_wq = alloc_workqueue("ceph-cap", 0, 1); + fsc->cap_wq = alloc_workqueue("ceph-cap", WQ_PERCPU, 1); if (!fsc->cap_wq) goto fail_inode_wq; @@ -1042,7 +1028,7 @@ static const struct super_operations ceph_super_ops = { .alloc_inode = ceph_alloc_inode, .free_inode = ceph_free_inode, .write_inode = ceph_write_inode, - .drop_inode = generic_delete_inode, + .drop_inode = inode_just_drop, .evict_inode = ceph_evict_inode, .sync_fs = ceph_sync_fs, .put_super = ceph_put_super, diff --git a/fs/ceph/super.h b/fs/ceph/super.h index cf176aab0f82..a1f781c46b41 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -104,6 +104,20 @@ struct ceph_mount_options { struct fscrypt_dummy_policy dummy_enc_policy; }; +/* + * Check if the mds namespace in ceph_mount_options matches + * the passed in namespace string. First time match (when + * ->mds_namespace is NULL) is treated specially, since + * ->mds_namespace needs to be initialized by the caller. + */ +static inline int namespace_equals(struct ceph_mount_options *fsopt, + const char *namespace, size_t len) +{ + return !(fsopt->mds_namespace && + (strlen(fsopt->mds_namespace) != len || + strncmp(fsopt->mds_namespace, namespace, len))); +} + /* mount state */ enum { CEPH_MOUNT_MOUNTING, @@ -463,6 +477,7 @@ struct ceph_inode_info { unsigned long i_work_mask; #ifdef CONFIG_FS_ENCRYPTION + struct fscrypt_inode_info *i_crypt_info; u32 fscrypt_auth_len; u32 fscrypt_file_len; u8 *fscrypt_auth; @@ -638,7 +653,8 @@ static inline struct inode *ceph_find_inode(struct super_block *sb, #define CEPH_I_FLUSH_SNAPS (1 << 8) /* need flush snapss */ #define CEPH_I_ERROR_WRITE (1 << 9) /* have seen write errors */ #define CEPH_I_ERROR_FILELOCK (1 << 10) /* have seen file lock errors */ -#define CEPH_I_ODIRECT (1 << 11) /* inode in direct I/O mode */ +#define CEPH_I_ODIRECT_BIT (11) /* inode in direct I/O mode */ +#define CEPH_I_ODIRECT (1 << CEPH_I_ODIRECT_BIT) #define CEPH_ASYNC_CREATE_BIT (12) /* async create in flight for this */ #define CEPH_I_ASYNC_CREATE (1 << CEPH_ASYNC_CREATE_BIT) #define CEPH_I_SHUTDOWN (1 << 13) /* inode is no longer usable */ |