diff options
| author | Takashi Iwai <tiwai@suse.de> | 2014-10-27 12:52:29 +0100 |
|---|---|---|
| committer | Takashi Iwai <tiwai@suse.de> | 2014-10-27 12:52:29 +0100 |
| commit | d5432503bfb49f3425bad0b850714ffd8b533cfc (patch) | |
| tree | 97926e7eea0b6d8b0e8bf98748dd9c0ef4c718ad /fs | |
| parent | 6a98e34b58919c52e8c8beec991759999af342da (diff) | |
| parent | c0d018bd5b1aabad59dffdec568e189359d93a14 (diff) | |
Merge tag 'asoc-v3.18-rc2' of git://git.kernel.org/pub/scm/linux/kernel/git/broonie/sound into for-linus
ASoC: Fixes for v3.18
A few small driver fixes for v3.18 plus the removal of the s6000 support
since the relevant chip is no longer supported in mainline.
Diffstat (limited to 'fs')
416 files changed, 17587 insertions, 11209 deletions
diff --git a/fs/9p/fid.c b/fs/9p/fid.c index d51ec9fafcc8..47db55aee7f2 100644 --- a/fs/9p/fid.c +++ b/fs/9p/fid.c @@ -65,8 +65,8 @@ static struct p9_fid *v9fs_fid_find(struct dentry *dentry, kuid_t uid, int any) { struct p9_fid *fid, *ret; - p9_debug(P9_DEBUG_VFS, " dentry: %s (%p) uid %d any %d\n", - dentry->d_name.name, dentry, from_kuid(&init_user_ns, uid), + p9_debug(P9_DEBUG_VFS, " dentry: %pd (%p) uid %d any %d\n", + dentry, dentry, from_kuid(&init_user_ns, uid), any); ret = NULL; /* we'll recheck under lock if there's anything to look in */ diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index cc1cfae726b3..eb14e055ea83 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -266,8 +266,8 @@ v9fs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, loff_t pos) * Now that we do caching with cache mode enabled, We need * to support direct IO */ - p9_debug(P9_DEBUG_VFS, "v9fs_direct_IO: v9fs_direct_IO (%s) off/no(%lld/%lu) EINVAL\n", - iocb->ki_filp->f_path.dentry->d_name.name, + p9_debug(P9_DEBUG_VFS, "v9fs_direct_IO: v9fs_direct_IO (%pD) off/no(%lld/%lu) EINVAL\n", + iocb->ki_filp, (long long)pos, iter->nr_segs); return -EINVAL; diff --git a/fs/9p/vfs_dentry.c b/fs/9p/vfs_dentry.c index b03dd23feda8..a345b2d659cc 100644 --- a/fs/9p/vfs_dentry.c +++ b/fs/9p/vfs_dentry.c @@ -49,8 +49,8 @@ */ static int v9fs_cached_dentry_delete(const struct dentry *dentry) { - p9_debug(P9_DEBUG_VFS, " dentry: %s (%p)\n", - dentry->d_name.name, dentry); + p9_debug(P9_DEBUG_VFS, " dentry: %pd (%p)\n", + dentry, dentry); /* Don't cache negative dentries */ if (!dentry->d_inode) @@ -67,8 +67,8 @@ static int v9fs_cached_dentry_delete(const struct dentry *dentry) static void v9fs_dentry_release(struct dentry *dentry) { struct hlist_node *p, *n; - p9_debug(P9_DEBUG_VFS, " dentry: %s (%p)\n", - dentry->d_name.name, dentry); + p9_debug(P9_DEBUG_VFS, " dentry: %pd (%p)\n", + dentry, dentry); hlist_for_each_safe(p, n, (struct hlist_head *)&dentry->d_fsdata) p9_client_clunk(hlist_entry(p, struct p9_fid, dlist)); dentry->d_fsdata = NULL; diff --git a/fs/9p/vfs_dir.c b/fs/9p/vfs_dir.c index 0b3bfa303dda..4f1151088ebe 100644 --- a/fs/9p/vfs_dir.c +++ b/fs/9p/vfs_dir.c @@ -116,7 +116,7 @@ static int v9fs_dir_readdir(struct file *file, struct dir_context *ctx) int reclen = 0; struct p9_rdir *rdir; - p9_debug(P9_DEBUG_VFS, "name %s\n", file->f_path.dentry->d_name.name); + p9_debug(P9_DEBUG_VFS, "name %pD\n", file); fid = file->private_data; buflen = fid->clnt->msize - P9_IOHDRSZ; @@ -172,7 +172,7 @@ static int v9fs_dir_readdir_dotl(struct file *file, struct dir_context *ctx) struct p9_rdir *rdir; struct p9_dirent curdirent; - p9_debug(P9_DEBUG_VFS, "name %s\n", file->f_path.dentry->d_name.name); + p9_debug(P9_DEBUG_VFS, "name %pD\n", file); fid = file->private_data; buflen = fid->clnt->msize - P9_READDIRHDRSZ; diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 520c11c2dcca..5594505e6e73 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -301,8 +301,8 @@ static int v9fs_file_lock_dotl(struct file *filp, int cmd, struct file_lock *fl) struct inode *inode = file_inode(filp); int ret = -ENOLCK; - p9_debug(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %s\n", - filp, cmd, fl, filp->f_path.dentry->d_name.name); + p9_debug(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %pD\n", + filp, cmd, fl, filp); /* No mandatory locks */ if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK) @@ -337,8 +337,8 @@ static int v9fs_file_flock_dotl(struct file *filp, int cmd, struct inode *inode = file_inode(filp); int ret = -ENOLCK; - p9_debug(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %s\n", - filp, cmd, fl, filp->f_path.dentry->d_name.name); + p9_debug(P9_DEBUG_VFS, "filp: %p cmd:%d lock: %p name: %pD\n", + filp, cmd, fl, filp); /* No mandatory locks */ if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK) diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 7fa4f7a7653d..296482fc77a9 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -648,7 +648,7 @@ v9fs_create(struct v9fs_session_info *v9ses, struct inode *dir, struct p9_fid *dfid, *ofid, *fid; struct inode *inode; - p9_debug(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name); + p9_debug(P9_DEBUG_VFS, "name %pd\n", dentry); err = 0; ofid = NULL; @@ -755,7 +755,7 @@ static int v9fs_vfs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode struct p9_fid *fid; struct v9fs_session_info *v9ses; - p9_debug(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name); + p9_debug(P9_DEBUG_VFS, "name %pd\n", dentry); err = 0; v9ses = v9fs_inode2v9ses(dir); perm = unixmode2p9mode(v9ses, mode | S_IFDIR); @@ -791,8 +791,8 @@ struct dentry *v9fs_vfs_lookup(struct inode *dir, struct dentry *dentry, struct inode *inode; char *name; - p9_debug(P9_DEBUG_VFS, "dir: %p dentry: (%s) %p flags: %x\n", - dir, dentry->d_name.name, dentry, flags); + p9_debug(P9_DEBUG_VFS, "dir: %p dentry: (%pd) %p flags: %x\n", + dir, dentry, dentry, flags); if (dentry->d_name.len > NAME_MAX) return ERR_PTR(-ENAMETOOLONG); @@ -1239,7 +1239,7 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen) struct p9_fid *fid; struct p9_wstat *st; - p9_debug(P9_DEBUG_VFS, " %s\n", dentry->d_name.name); + p9_debug(P9_DEBUG_VFS, " %pd\n", dentry); retval = -EPERM; v9ses = v9fs_dentry2v9ses(dentry); fid = v9fs_fid_lookup(dentry); @@ -1262,8 +1262,8 @@ static int v9fs_readlink(struct dentry *dentry, char *buffer, int buflen) retval = min(strlen(st->extension)+1, (size_t)buflen); memcpy(buffer, st->extension, retval); - p9_debug(P9_DEBUG_VFS, "%s -> %s (%.*s)\n", - dentry->d_name.name, st->extension, buflen, buffer); + p9_debug(P9_DEBUG_VFS, "%pd -> %s (%.*s)\n", + dentry, st->extension, buflen, buffer); done: p9stat_free(st); @@ -1283,7 +1283,7 @@ static void *v9fs_vfs_follow_link(struct dentry *dentry, struct nameidata *nd) int len = 0; char *link = __getname(); - p9_debug(P9_DEBUG_VFS, "%s\n", dentry->d_name.name); + p9_debug(P9_DEBUG_VFS, "%pd\n", dentry); if (!link) link = ERR_PTR(-ENOMEM); @@ -1314,8 +1314,8 @@ v9fs_vfs_put_link(struct dentry *dentry, struct nameidata *nd, void *p) { char *s = nd_get_link(nd); - p9_debug(P9_DEBUG_VFS, " %s %s\n", - dentry->d_name.name, IS_ERR(s) ? "<error>" : s); + p9_debug(P9_DEBUG_VFS, " %pd %s\n", + dentry, IS_ERR(s) ? "<error>" : s); if (!IS_ERR(s)) __putname(s); } @@ -1364,8 +1364,8 @@ static int v9fs_vfs_mkspecial(struct inode *dir, struct dentry *dentry, static int v9fs_vfs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) { - p9_debug(P9_DEBUG_VFS, " %lu,%s,%s\n", - dir->i_ino, dentry->d_name.name, symname); + p9_debug(P9_DEBUG_VFS, " %lu,%pd,%s\n", + dir->i_ino, dentry, symname); return v9fs_vfs_mkspecial(dir, dentry, P9_DMSYMLINK, symname); } @@ -1386,8 +1386,8 @@ v9fs_vfs_link(struct dentry *old_dentry, struct inode *dir, char *name; struct p9_fid *oldfid; - p9_debug(P9_DEBUG_VFS, " %lu,%s,%s\n", - dir->i_ino, dentry->d_name.name, old_dentry->d_name.name); + p9_debug(P9_DEBUG_VFS, " %lu,%pd,%pd\n", + dir->i_ino, dentry, old_dentry); oldfid = v9fs_fid_clone(old_dentry); if (IS_ERR(oldfid)) @@ -1428,8 +1428,8 @@ v9fs_vfs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rde char *name; u32 perm; - p9_debug(P9_DEBUG_VFS, " %lu,%s mode: %hx MAJOR: %u MINOR: %u\n", - dir->i_ino, dentry->d_name.name, mode, + p9_debug(P9_DEBUG_VFS, " %lu,%pd mode: %hx MAJOR: %u MINOR: %u\n", + dir->i_ino, dentry, mode, MAJOR(rdev), MINOR(rdev)); if (!new_valid_dev(rdev)) diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 1fa85aae24df..02b64f4e576a 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -393,7 +393,7 @@ static int v9fs_vfs_mkdir_dotl(struct inode *dir, struct dentry *dir_dentry; struct posix_acl *dacl = NULL, *pacl = NULL; - p9_debug(P9_DEBUG_VFS, "name %s\n", dentry->d_name.name); + p9_debug(P9_DEBUG_VFS, "name %pd\n", dentry); err = 0; v9ses = v9fs_inode2v9ses(dir); @@ -767,8 +767,8 @@ v9fs_vfs_link_dotl(struct dentry *old_dentry, struct inode *dir, struct p9_fid *dfid, *oldfid; struct v9fs_session_info *v9ses; - p9_debug(P9_DEBUG_VFS, "dir ino: %lu, old_name: %s, new_name: %s\n", - dir->i_ino, old_dentry->d_name.name, dentry->d_name.name); + p9_debug(P9_DEBUG_VFS, "dir ino: %lu, old_name: %pd, new_name: %pd\n", + dir->i_ino, old_dentry, dentry); v9ses = v9fs_inode2v9ses(dir); dir_dentry = dentry->d_parent; @@ -917,7 +917,7 @@ v9fs_vfs_follow_link_dotl(struct dentry *dentry, struct nameidata *nd) char *link = __getname(); char *target; - p9_debug(P9_DEBUG_VFS, "%s\n", dentry->d_name.name); + p9_debug(P9_DEBUG_VFS, "%pd\n", dentry); if (!link) { link = ERR_PTR(-ENOMEM); diff --git a/fs/Kconfig b/fs/Kconfig index 312393f32948..664991afe0c0 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -67,6 +67,7 @@ source "fs/quota/Kconfig" source "fs/autofs4/Kconfig" source "fs/fuse/Kconfig" +source "fs/overlayfs/Kconfig" menu "Caches" @@ -233,9 +234,13 @@ if NETWORK_FILESYSTEMS source "fs/nfs/Kconfig" source "fs/nfsd/Kconfig" +config GRACE_PERIOD + tristate + config LOCKD tristate depends on FILE_LOCKING + select GRACE_PERIOD config LOCKD_V4 bool @@ -249,7 +254,7 @@ config NFS_ACL_SUPPORT config NFS_COMMON bool - depends on NFSD || NFS_FS + depends on NFSD || NFS_FS || LOCKD default y source "net/sunrpc/Kconfig" diff --git a/fs/Makefile b/fs/Makefile index 90c88529892b..34a1b9dea6dd 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -104,6 +104,7 @@ obj-$(CONFIG_QNX6FS_FS) += qnx6/ obj-$(CONFIG_AUTOFS4_FS) += autofs4/ obj-$(CONFIG_ADFS_FS) += adfs/ obj-$(CONFIG_FUSE_FS) += fuse/ +obj-$(CONFIG_OVERLAYFS_FS) += overlayfs/ obj-$(CONFIG_UDF_FS) += udf/ obj-$(CONFIG_SUN_OPENPROMFS) += openpromfs/ obj-$(CONFIG_OMFS_FS) += omfs/ diff --git a/fs/affs/amigaffs.c b/fs/affs/amigaffs.c index 406b29836b19..abc853968fed 100644 --- a/fs/affs/amigaffs.c +++ b/fs/affs/amigaffs.c @@ -10,8 +10,6 @@ #include "affs.h" -extern struct timezone sys_tz; - static char ErrorBuffer[256]; /* diff --git a/fs/affs/file.c b/fs/affs/file.c index a7fe57d2cd9a..1ed590aafecf 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -584,11 +584,14 @@ affs_extent_file_ofs(struct inode *inode, u32 newsize) bh->b_state &= ~(1UL << BH_New); mark_buffer_dirty_inode(bh, inode); if (prev_bh) { - u32 tmp = be32_to_cpu(AFFS_DATA_HEAD(prev_bh)->next); - if (tmp) - affs_warning(sb, "extent_file_ofs", "next block already set for %d (%d)", bidx, tmp); + u32 tmp_next = be32_to_cpu(AFFS_DATA_HEAD(prev_bh)->next); + + if (tmp_next) + affs_warning(sb, "extent_file_ofs", + "next block already set for %d (%d)", + bidx, tmp_next); AFFS_DATA_HEAD(prev_bh)->next = cpu_to_be32(bh->b_blocknr); - affs_adjust_checksum(prev_bh, bh->b_blocknr - tmp); + affs_adjust_checksum(prev_bh, bh->b_blocknr - tmp_next); mark_buffer_dirty_inode(prev_bh, inode); affs_brelse(prev_bh); } @@ -727,11 +730,14 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping, AFFS_DATA_HEAD(bh)->next = 0; bh->b_state &= ~(1UL << BH_New); if (prev_bh) { - u32 tmp = be32_to_cpu(AFFS_DATA_HEAD(prev_bh)->next); - if (tmp) - affs_warning(sb, "commit_write_ofs", "next block already set for %d (%d)", bidx, tmp); + u32 tmp_next = be32_to_cpu(AFFS_DATA_HEAD(prev_bh)->next); + + if (tmp_next) + affs_warning(sb, "commit_write_ofs", + "next block already set for %d (%d)", + bidx, tmp_next); AFFS_DATA_HEAD(prev_bh)->next = cpu_to_be32(bh->b_blocknr); - affs_adjust_checksum(prev_bh, bh->b_blocknr - tmp); + affs_adjust_checksum(prev_bh, bh->b_blocknr - tmp_next); mark_buffer_dirty_inode(prev_bh, inode); } } @@ -758,11 +764,14 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping, AFFS_DATA_HEAD(bh)->next = 0; bh->b_state &= ~(1UL << BH_New); if (prev_bh) { - u32 tmp = be32_to_cpu(AFFS_DATA_HEAD(prev_bh)->next); - if (tmp) - affs_warning(sb, "commit_write_ofs", "next block already set for %d (%d)", bidx, tmp); + u32 tmp_next = be32_to_cpu(AFFS_DATA_HEAD(prev_bh)->next); + + if (tmp_next) + affs_warning(sb, "commit_write_ofs", + "next block already set for %d (%d)", + bidx, tmp_next); AFFS_DATA_HEAD(prev_bh)->next = cpu_to_be32(bh->b_blocknr); - affs_adjust_checksum(prev_bh, bh->b_blocknr - tmp); + affs_adjust_checksum(prev_bh, bh->b_blocknr - tmp_next); mark_buffer_dirty_inode(prev_bh, inode); } } else if (be32_to_cpu(AFFS_DATA_HEAD(bh)->size) < tmp) @@ -842,12 +851,12 @@ affs_truncate(struct inode *inode) struct address_space *mapping = inode->i_mapping; struct page *page; void *fsdata; - loff_t size = inode->i_size; + loff_t isize = inode->i_size; int res; - res = mapping->a_ops->write_begin(NULL, mapping, size, 0, 0, &page, &fsdata); + res = mapping->a_ops->write_begin(NULL, mapping, isize, 0, 0, &page, &fsdata); if (!res) - res = mapping->a_ops->write_end(NULL, mapping, size, 0, 0, page, fsdata); + res = mapping->a_ops->write_end(NULL, mapping, isize, 0, 0, page, fsdata); else inode->i_size = AFFS_I(inode)->mmu_private; mark_inode_dirty(inode); diff --git a/fs/affs/inode.c b/fs/affs/inode.c index bec2d1a0c91c..e217c511459b 100644 --- a/fs/affs/inode.c +++ b/fs/affs/inode.c @@ -14,13 +14,11 @@ #include "affs.h" extern const struct inode_operations affs_symlink_inode_operations; -extern struct timezone sys_tz; struct inode *affs_iget(struct super_block *sb, unsigned long ino) { struct affs_sb_info *sbi = AFFS_SB(sb); struct buffer_head *bh; - struct affs_head *head; struct affs_tail *tail; struct inode *inode; u32 block; @@ -49,7 +47,6 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino) goto bad_inode; } - head = AFFS_HEAD(bh); tail = AFFS_TAIL(sb, bh); prot = be32_to_cpu(tail->protect); diff --git a/fs/affs/super.c b/fs/affs/super.c index 51f1a95bff73..f754ab68a840 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -20,8 +20,6 @@ #include <linux/writeback.h> #include "affs.h" -extern struct timezone sys_tz; - static int affs_statfs(struct dentry *dentry, struct kstatfs *buf); static int affs_remount (struct super_block *sb, int *flags, char *data); @@ -308,7 +306,6 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent) u32 chksum; int num_bm; int i, j; - s32 key; kuid_t uid; kgid_t gid; int reserved; @@ -367,7 +364,7 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent) i = j = blocksize; size = size / (blocksize / 512); } - for (blocksize = i, key = 0; blocksize <= j; blocksize <<= 1, size >>= 1) { + for (blocksize = i; blocksize <= j; blocksize <<= 1, size >>= 1) { sbi->s_root_block = root_block; if (root_block < 0) sbi->s_root_block = (reserved + size - 1) / 2; @@ -399,7 +396,6 @@ static int affs_fill_super(struct super_block *sb, void *data, int silent) be32_to_cpu(AFFS_ROOT_TAIL(sb, root_bh)->stype) == ST_ROOT) { sbi->s_hashsize = blocksize / 4 - 56; sbi->s_root_block += num_bm; - key = 1; goto got_root; } affs_brelse(root_bh); diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 529300327f45..a1645b88fe8a 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -669,7 +669,6 @@ static int afs_d_revalidate(struct dentry *dentry, unsigned int flags) out_valid: dentry->d_fsdata = dir_version; -out_skip: dput(parent); key_put(key); _leave(" = 1 [valid]"); @@ -682,10 +681,6 @@ not_found: spin_unlock(&dentry->d_lock); out_bad: - /* don't unhash if we have submounts */ - if (check_submounts_and_drop(dentry) != 0) - goto out_skip; - _debug("dropping dentry %s/%s", parent->d_name.name, dentry->d_name.name); dput(parent); diff --git a/fs/afs/vlocation.c b/fs/afs/vlocation.c index b6df2e83809f..52976785a32c 100644 --- a/fs/afs/vlocation.c +++ b/fs/afs/vlocation.c @@ -130,7 +130,6 @@ static int afs_vlocation_access_vl_by_id(struct afs_vlocation *vl, /* second+ BUSY - sleep a little bit */ set_current_state(TASK_UNINTERRUPTIBLE); schedule_timeout(1); - __set_current_state(TASK_RUNNING); } continue; } @@ -661,10 +661,10 @@ static struct kioctx *ioctx_alloc(unsigned nr_events) INIT_LIST_HEAD(&ctx->active_reqs); - if (percpu_ref_init(&ctx->users, free_ioctx_users)) + if (percpu_ref_init(&ctx->users, free_ioctx_users, 0, GFP_KERNEL)) goto err; - if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs)) + if (percpu_ref_init(&ctx->reqs, free_ioctx_reqs, 0, GFP_KERNEL)) goto err; ctx->cpu = alloc_percpu(struct kioctx_cpu); diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index 9e359fb20c0a..8e98cf954bab 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -79,6 +79,10 @@ struct autofs_info { }; #define AUTOFS_INF_EXPIRING (1<<0) /* dentry is in the process of expiring */ +#define AUTOFS_INF_NO_RCU (1<<1) /* the dentry is being considered + * for expiry, so RCU_walk is + * not permitted + */ #define AUTOFS_INF_PENDING (1<<2) /* dentry pending mount */ struct autofs_wait_queue { @@ -148,7 +152,7 @@ void autofs4_free_ino(struct autofs_info *); /* Expiration */ int is_autofs4_dentry(struct dentry *); -int autofs4_expire_wait(struct dentry *dentry); +int autofs4_expire_wait(struct dentry *dentry, int rcu_walk); int autofs4_expire_run(struct super_block *, struct vfsmount *, struct autofs_sb_info *, struct autofs_packet_expire __user *); diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index 5b570b6efa28..aaf96cb25452 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -450,7 +450,7 @@ static int autofs_dev_ioctl_requester(struct file *fp, ino = autofs4_dentry_ino(path.dentry); if (ino) { err = 0; - autofs4_expire_wait(path.dentry); + autofs4_expire_wait(path.dentry, 0); spin_lock(&sbi->fs_lock); param->requester.uid = from_kuid_munged(current_user_ns(), ino->uid); param->requester.gid = from_kgid_munged(current_user_ns(), ino->gid); diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index a7be57e39be7..683a5b9ce22a 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -30,12 +30,6 @@ static inline int autofs4_can_expire(struct dentry *dentry, /* Too young to die */ if (!timeout || time_after(ino->last_used + timeout, now)) return 0; - - /* update last_used here :- - - obviously makes sense if it is in use now - - less obviously, prevents rapid-fire expire - attempts if expire fails the first time */ - ino->last_used = now; } return 1; } @@ -255,12 +249,6 @@ static int autofs4_tree_busy(struct vfsmount *mnt, struct autofs_info *ino = autofs4_dentry_ino(p); unsigned int ino_count = atomic_read(&ino->count); - /* - * Clean stale dentries below that have not been - * invalidated after a mount fail during lookup - */ - d_invalidate(p); - /* allow for dget above and top is already dgot */ if (p == top) ino_count += 2; @@ -333,10 +321,19 @@ struct dentry *autofs4_expire_direct(struct super_block *sb, if (ino->flags & AUTOFS_INF_PENDING) goto out; if (!autofs4_direct_busy(mnt, root, timeout, do_now)) { - ino->flags |= AUTOFS_INF_EXPIRING; - init_completion(&ino->expire_complete); + ino->flags |= AUTOFS_INF_NO_RCU; spin_unlock(&sbi->fs_lock); - return root; + synchronize_rcu(); + spin_lock(&sbi->fs_lock); + if (!autofs4_direct_busy(mnt, root, timeout, do_now)) { + ino->flags |= AUTOFS_INF_EXPIRING; + smp_mb(); + ino->flags &= ~AUTOFS_INF_NO_RCU; + init_completion(&ino->expire_complete); + spin_unlock(&sbi->fs_lock); + return root; + } + ino->flags &= ~AUTOFS_INF_NO_RCU; } out: spin_unlock(&sbi->fs_lock); @@ -345,6 +342,89 @@ out: return NULL; } +/* Check if 'dentry' should expire, or return a nearby + * dentry that is suitable. + * If returned dentry is different from arg dentry, + * then a dget() reference was taken, else not. + */ +static struct dentry *should_expire(struct dentry *dentry, + struct vfsmount *mnt, + unsigned long timeout, + int how) +{ + int do_now = how & AUTOFS_EXP_IMMEDIATE; + int exp_leaves = how & AUTOFS_EXP_LEAVES; + struct autofs_info *ino = autofs4_dentry_ino(dentry); + unsigned int ino_count; + + /* No point expiring a pending mount */ + if (ino->flags & AUTOFS_INF_PENDING) + return NULL; + + /* + * Case 1: (i) indirect mount or top level pseudo direct mount + * (autofs-4.1). + * (ii) indirect mount with offset mount, check the "/" + * offset (autofs-5.0+). + */ + if (d_mountpoint(dentry)) { + DPRINTK("checking mountpoint %p %.*s", + dentry, (int)dentry->d_name.len, dentry->d_name.name); + + /* Can we umount this guy */ + if (autofs4_mount_busy(mnt, dentry)) + return NULL; + + /* Can we expire this guy */ + if (autofs4_can_expire(dentry, timeout, do_now)) + return dentry; + return NULL; + } + + if (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode)) { + DPRINTK("checking symlink %p %.*s", + dentry, (int)dentry->d_name.len, dentry->d_name.name); + /* + * A symlink can't be "busy" in the usual sense so + * just check last used for expire timeout. + */ + if (autofs4_can_expire(dentry, timeout, do_now)) + return dentry; + return NULL; + } + + if (simple_empty(dentry)) + return NULL; + + /* Case 2: tree mount, expire iff entire tree is not busy */ + if (!exp_leaves) { + /* Path walk currently on this dentry? */ + ino_count = atomic_read(&ino->count) + 1; + if (d_count(dentry) > ino_count) + return NULL; + + if (!autofs4_tree_busy(mnt, dentry, timeout, do_now)) + return dentry; + /* + * Case 3: pseudo direct mount, expire individual leaves + * (autofs-4.1). + */ + } else { + /* Path walk currently on this dentry? */ + struct dentry *expired; + ino_count = atomic_read(&ino->count) + 1; + if (d_count(dentry) > ino_count) + return NULL; + + expired = autofs4_check_leaves(mnt, dentry, timeout, do_now); + if (expired) { + if (expired == dentry) + dput(dentry); + return expired; + } + } + return NULL; +} /* * Find an eligible tree to time-out * A tree is eligible if :- @@ -359,11 +439,8 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb, unsigned long timeout; struct dentry *root = sb->s_root; struct dentry *dentry; - struct dentry *expired = NULL; - int do_now = how & AUTOFS_EXP_IMMEDIATE; - int exp_leaves = how & AUTOFS_EXP_LEAVES; + struct dentry *expired; struct autofs_info *ino; - unsigned int ino_count; if (!root) return NULL; @@ -375,77 +452,28 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb, while ((dentry = get_next_positive_subdir(dentry, root))) { spin_lock(&sbi->fs_lock); ino = autofs4_dentry_ino(dentry); - /* No point expiring a pending mount */ - if (ino->flags & AUTOFS_INF_PENDING) - goto next; - - /* - * Case 1: (i) indirect mount or top level pseudo direct mount - * (autofs-4.1). - * (ii) indirect mount with offset mount, check the "/" - * offset (autofs-5.0+). - */ - if (d_mountpoint(dentry)) { - DPRINTK("checking mountpoint %p %.*s", - dentry, (int)dentry->d_name.len, dentry->d_name.name); - - /* Can we umount this guy */ - if (autofs4_mount_busy(mnt, dentry)) - goto next; - - /* Can we expire this guy */ - if (autofs4_can_expire(dentry, timeout, do_now)) { - expired = dentry; - goto found; - } - goto next; - } - - if (dentry->d_inode && S_ISLNK(dentry->d_inode->i_mode)) { - DPRINTK("checking symlink %p %.*s", - dentry, (int)dentry->d_name.len, dentry->d_name.name); - /* - * A symlink can't be "busy" in the usual sense so - * just check last used for expire timeout. - */ - if (autofs4_can_expire(dentry, timeout, do_now)) { - expired = dentry; - goto found; - } - goto next; + if (ino->flags & AUTOFS_INF_NO_RCU) + expired = NULL; + else + expired = should_expire(dentry, mnt, timeout, how); + if (!expired) { + spin_unlock(&sbi->fs_lock); + continue; } - - if (simple_empty(dentry)) - goto next; - - /* Case 2: tree mount, expire iff entire tree is not busy */ - if (!exp_leaves) { - /* Path walk currently on this dentry? */ - ino_count = atomic_read(&ino->count) + 1; - if (d_count(dentry) > ino_count) - goto next; - - if (!autofs4_tree_busy(mnt, dentry, timeout, do_now)) { - expired = dentry; - goto found; - } - /* - * Case 3: pseudo direct mount, expire individual leaves - * (autofs-4.1). - */ - } else { - /* Path walk currently on this dentry? */ - ino_count = atomic_read(&ino->count) + 1; - if (d_count(dentry) > ino_count) - goto next; - - expired = autofs4_check_leaves(mnt, dentry, timeout, do_now); - if (expired) { + ino = autofs4_dentry_ino(expired); + ino->flags |= AUTOFS_INF_NO_RCU; + spin_unlock(&sbi->fs_lock); + synchronize_rcu(); + spin_lock(&sbi->fs_lock); + if (should_expire(expired, mnt, timeout, how)) { + if (expired != dentry) dput(dentry); - goto found; - } + goto found; } -next: + + ino->flags &= ~AUTOFS_INF_NO_RCU; + if (expired != dentry) + dput(expired); spin_unlock(&sbi->fs_lock); } return NULL; @@ -453,8 +481,9 @@ next: found: DPRINTK("returning %p %.*s", expired, (int)expired->d_name.len, expired->d_name.name); - ino = autofs4_dentry_ino(expired); ino->flags |= AUTOFS_INF_EXPIRING; + smp_mb(); + ino->flags &= ~AUTOFS_INF_NO_RCU; init_completion(&ino->expire_complete); spin_unlock(&sbi->fs_lock); spin_lock(&sbi->lookup_lock); @@ -467,13 +496,18 @@ found: return expired; } -int autofs4_expire_wait(struct dentry *dentry) +int autofs4_expire_wait(struct dentry *dentry, int rcu_walk) { struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); struct autofs_info *ino = autofs4_dentry_ino(dentry); int status; /* Block on any pending expire */ + if (!(ino->flags & (AUTOFS_INF_EXPIRING | AUTOFS_INF_NO_RCU))) + return 0; + if (rcu_walk) + return -ECHILD; + spin_lock(&sbi->fs_lock); if (ino->flags & AUTOFS_INF_EXPIRING) { spin_unlock(&sbi->fs_lock); @@ -525,6 +559,8 @@ int autofs4_expire_run(struct super_block *sb, spin_lock(&sbi->fs_lock); ino = autofs4_dentry_ino(dentry); + /* avoid rapid-fire expire attempts if expiry fails */ + ino->last_used = now; ino->flags &= ~AUTOFS_INF_EXPIRING; complete_all(&ino->expire_complete); spin_unlock(&sbi->fs_lock); @@ -551,6 +587,8 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt, ret = autofs4_wait(sbi, dentry, NFY_EXPIRE); spin_lock(&sbi->fs_lock); + /* avoid rapid-fire expire attempts if expiry fails */ + ino->last_used = now; ino->flags &= ~AUTOFS_INF_EXPIRING; complete_all(&ino->expire_complete); spin_unlock(&sbi->fs_lock); diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index cdb25ebccc4c..d76d083f2f06 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -210,7 +210,8 @@ next: return NULL; } -static struct dentry *autofs4_lookup_expiring(struct dentry *dentry) +static struct dentry *autofs4_lookup_expiring(struct dentry *dentry, + bool rcu_walk) { struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); struct dentry *parent = dentry->d_parent; @@ -229,6 +230,11 @@ static struct dentry *autofs4_lookup_expiring(struct dentry *dentry) struct dentry *expiring; struct qstr *qstr; + if (rcu_walk) { + spin_unlock(&sbi->lookup_lock); + return ERR_PTR(-ECHILD); + } + ino = list_entry(p, struct autofs_info, expiring); expiring = ino->dentry; @@ -264,13 +270,15 @@ next: return NULL; } -static int autofs4_mount_wait(struct dentry *dentry) +static int autofs4_mount_wait(struct dentry *dentry, bool rcu_walk) { struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); struct autofs_info *ino = autofs4_dentry_ino(dentry); int status = 0; if (ino->flags & AUTOFS_INF_PENDING) { + if (rcu_walk) + return -ECHILD; DPRINTK("waiting for mount name=%.*s", dentry->d_name.len, dentry->d_name.name); status = autofs4_wait(sbi, dentry, NFY_MOUNT); @@ -280,20 +288,22 @@ static int autofs4_mount_wait(struct dentry *dentry) return status; } -static int do_expire_wait(struct dentry *dentry) +static int do_expire_wait(struct dentry *dentry, bool rcu_walk) { struct dentry *expiring; - expiring = autofs4_lookup_expiring(dentry); + expiring = autofs4_lookup_expiring(dentry, rcu_walk); + if (IS_ERR(expiring)) + return PTR_ERR(expiring); if (!expiring) - return autofs4_expire_wait(dentry); + return autofs4_expire_wait(dentry, rcu_walk); else { /* * If we are racing with expire the request might not * be quite complete, but the directory has been removed * so it must have been successful, just wait for it. */ - autofs4_expire_wait(expiring); + autofs4_expire_wait(expiring, 0); autofs4_del_expiring(expiring); dput(expiring); } @@ -345,7 +355,7 @@ static struct vfsmount *autofs4_d_automount(struct path *path) * and the directory was removed, so just go ahead and try * the mount. */ - status = do_expire_wait(dentry); + status = do_expire_wait(dentry, 0); if (status && status != -EAGAIN) return NULL; @@ -353,7 +363,7 @@ static struct vfsmount *autofs4_d_automount(struct path *path) spin_lock(&sbi->fs_lock); if (ino->flags & AUTOFS_INF_PENDING) { spin_unlock(&sbi->fs_lock); - status = autofs4_mount_wait(dentry); + status = autofs4_mount_wait(dentry, 0); if (status) return ERR_PTR(status); goto done; @@ -394,7 +404,7 @@ static struct vfsmount *autofs4_d_automount(struct path *path) } ino->flags |= AUTOFS_INF_PENDING; spin_unlock(&sbi->fs_lock); - status = autofs4_mount_wait(dentry); + status = autofs4_mount_wait(dentry, 0); spin_lock(&sbi->fs_lock); ino->flags &= ~AUTOFS_INF_PENDING; if (status) { @@ -423,28 +433,46 @@ static int autofs4_d_manage(struct dentry *dentry, bool rcu_walk) /* The daemon never waits. */ if (autofs4_oz_mode(sbi)) { - if (rcu_walk) - return 0; if (!d_mountpoint(dentry)) return -EISDIR; return 0; } - /* We need to sleep, so we need pathwalk to be in ref-mode */ - if (rcu_walk) - return -ECHILD; - /* Wait for pending expires */ - do_expire_wait(dentry); + if (do_expire_wait(dentry, rcu_walk) == -ECHILD) + return -ECHILD; /* * This dentry may be under construction so wait on mount * completion. */ - status = autofs4_mount_wait(dentry); + status = autofs4_mount_wait(dentry, rcu_walk); if (status) return status; + if (rcu_walk) { + /* We don't need fs_lock in rcu_walk mode, + * just testing 'AUTOFS_INFO_NO_RCU' is enough. + * simple_empty() takes a spinlock, so leave it + * to last. + * We only return -EISDIR when certain this isn't + * a mount-trap. + */ + struct inode *inode; + if (ino->flags & (AUTOFS_INF_EXPIRING | AUTOFS_INF_NO_RCU)) + return 0; + if (d_mountpoint(dentry)) + return 0; + inode = ACCESS_ONCE(dentry->d_inode); + if (inode && S_ISLNK(inode->i_mode)) + return -EISDIR; + if (list_empty(&dentry->d_subdirs)) + return 0; + if (!simple_empty(dentry)) + return -EISDIR; + return 0; + } + spin_lock(&sbi->fs_lock); /* * If the dentry has been selected for expire while we slept diff --git a/fs/befs/btree.c b/fs/befs/btree.c index 9c7faa8a9288..0826e91dacda 100644 --- a/fs/befs/btree.c +++ b/fs/befs/btree.c @@ -78,11 +78,11 @@ /* * In memory structure of each btree node */ -typedef struct { +struct befs_btree_node { befs_host_btree_nodehead head; /* head of node converted to cpu byteorder */ struct buffer_head *bh; befs_btree_nodehead *od_node; /* on disk node */ -} befs_btree_node; +}; /* local constants */ static const befs_off_t befs_bt_inval = 0xffffffffffffffffULL; @@ -90,27 +90,30 @@ static const befs_off_t befs_bt_inval = 0xffffffffffffffffULL; /* local functions */ static int befs_btree_seekleaf(struct super_block *sb, befs_data_stream * ds, befs_btree_super * bt_super, - befs_btree_node * this_node, + struct befs_btree_node *this_node, befs_off_t * node_off); static int befs_bt_read_super(struct super_block *sb, befs_data_stream * ds, befs_btree_super * sup); static int befs_bt_read_node(struct super_block *sb, befs_data_stream * ds, - befs_btree_node * node, befs_off_t node_off); + struct befs_btree_node *node, + befs_off_t node_off); -static int befs_leafnode(befs_btree_node * node); +static int befs_leafnode(struct befs_btree_node *node); -static fs16 *befs_bt_keylen_index(befs_btree_node * node); +static fs16 *befs_bt_keylen_index(struct befs_btree_node *node); -static fs64 *befs_bt_valarray(befs_btree_node * node); +static fs64 *befs_bt_valarray(struct befs_btree_node *node); -static char *befs_bt_keydata(befs_btree_node * node); +static char *befs_bt_keydata(struct befs_btree_node *node); -static int befs_find_key(struct super_block *sb, befs_btree_node * node, +static int befs_find_key(struct super_block *sb, + struct befs_btree_node *node, const char *findkey, befs_off_t * value); -static char *befs_bt_get_key(struct super_block *sb, befs_btree_node * node, +static char *befs_bt_get_key(struct super_block *sb, + struct befs_btree_node *node, int index, u16 * keylen); static int befs_compare_strings(const void *key1, int keylen1, @@ -191,7 +194,7 @@ befs_bt_read_super(struct super_block *sb, befs_data_stream * ds, static int befs_bt_read_node(struct super_block *sb, befs_data_stream * ds, - befs_btree_node * node, befs_off_t node_off) + struct befs_btree_node *node, befs_off_t node_off) { uint off = 0; @@ -247,7 +250,7 @@ int befs_btree_find(struct super_block *sb, befs_data_stream * ds, const char *key, befs_off_t * value) { - befs_btree_node *this_node = NULL; + struct befs_btree_node *this_node = NULL; befs_btree_super bt_super; befs_off_t node_off; int res; @@ -260,11 +263,11 @@ befs_btree_find(struct super_block *sb, befs_data_stream * ds, goto error; } - this_node = kmalloc(sizeof (befs_btree_node), + this_node = kmalloc(sizeof(struct befs_btree_node), GFP_NOFS); if (!this_node) { befs_error(sb, "befs_btree_find() failed to allocate %zu " - "bytes of memory", sizeof (befs_btree_node)); + "bytes of memory", sizeof(struct befs_btree_node)); goto error; } @@ -333,7 +336,7 @@ befs_btree_find(struct super_block *sb, befs_data_stream * ds, * Use binary search instead of a linear. */ static int -befs_find_key(struct super_block *sb, befs_btree_node * node, +befs_find_key(struct super_block *sb, struct befs_btree_node *node, const char *findkey, befs_off_t * value) { int first, last, mid; @@ -417,7 +420,7 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds, loff_t key_no, size_t bufsize, char *keybuf, size_t * keysize, befs_off_t * value) { - befs_btree_node *this_node; + struct befs_btree_node *this_node; befs_btree_super bt_super; befs_off_t node_off = 0; int cur_key; @@ -436,9 +439,10 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds, goto error; } - if ((this_node = kmalloc(sizeof (befs_btree_node), GFP_NOFS)) == NULL) { + this_node = kmalloc(sizeof(struct befs_btree_node), GFP_NOFS); + if (this_node == NULL) { befs_error(sb, "befs_btree_read() failed to allocate %zu " - "bytes of memory", sizeof (befs_btree_node)); + "bytes of memory", sizeof(struct befs_btree_node)); goto error; } @@ -545,7 +549,8 @@ befs_btree_read(struct super_block *sb, befs_data_stream * ds, */ static int befs_btree_seekleaf(struct super_block *sb, befs_data_stream * ds, - befs_btree_super * bt_super, befs_btree_node * this_node, + befs_btree_super *bt_super, + struct befs_btree_node *this_node, befs_off_t * node_off) { @@ -600,7 +605,7 @@ befs_btree_seekleaf(struct super_block *sb, befs_data_stream * ds, * Return 1 if leaf, 0 if interior */ static int -befs_leafnode(befs_btree_node * node) +befs_leafnode(struct befs_btree_node *node) { /* all interior nodes (and only interior nodes) have an overflow node */ if (node->head.overflow == befs_bt_inval) @@ -623,7 +628,7 @@ befs_leafnode(befs_btree_node * node) * Except that rounding up to 8 works, and rounding up to 4 doesn't. */ static fs16 * -befs_bt_keylen_index(befs_btree_node * node) +befs_bt_keylen_index(struct befs_btree_node *node) { const int keylen_align = 8; unsigned long int off = @@ -644,7 +649,7 @@ befs_bt_keylen_index(befs_btree_node * node) * of the node pointed to by the node header */ static fs64 * -befs_bt_valarray(befs_btree_node * node) +befs_bt_valarray(struct befs_btree_node *node) { void *keylen_index_start = (void *) befs_bt_keylen_index(node); size_t keylen_index_size = node->head.all_key_count * sizeof (fs16); @@ -660,7 +665,7 @@ befs_bt_valarray(befs_btree_node * node) * of the node pointed to by the node header */ static char * -befs_bt_keydata(befs_btree_node * node) +befs_bt_keydata(struct befs_btree_node *node) { return (char *) ((void *) node->od_node + sizeof (befs_btree_nodehead)); } @@ -676,7 +681,7 @@ befs_bt_keydata(befs_btree_node * node) * Returns NULL on failure (bad input) and sets *@keylen = 0 */ static char * -befs_bt_get_key(struct super_block *sb, befs_btree_node * node, +befs_bt_get_key(struct super_block *sb, struct befs_btree_node *node, int index, u16 * keylen) { int prev_key_end; diff --git a/fs/binfmt_aout.c b/fs/binfmt_aout.c index ca0ba15a7306..929dec08c348 100644 --- a/fs/binfmt_aout.c +++ b/fs/binfmt_aout.c @@ -256,11 +256,8 @@ static int load_aout_binary(struct linux_binprm * bprm) (current->mm->start_brk = N_BSSADDR(ex)); retval = setup_arg_pages(bprm, STACK_TOP, EXSTACK_DEFAULT); - if (retval < 0) { - /* Someone check-me: is this error path enough? */ - send_sig(SIGKILL, current, 0); + if (retval < 0) return retval; - } install_exec_creds(bprm); @@ -278,17 +275,13 @@ static int load_aout_binary(struct linux_binprm * bprm) map_size = ex.a_text+ex.a_data; #endif error = vm_brk(text_addr & PAGE_MASK, map_size); - if (error != (text_addr & PAGE_MASK)) { - send_sig(SIGKILL, current, 0); + if (error != (text_addr & PAGE_MASK)) return error; - } error = read_code(bprm->file, text_addr, pos, ex.a_text+ex.a_data); - if ((signed long)error < 0) { - send_sig(SIGKILL, current, 0); + if ((signed long)error < 0) return error; - } } else { if ((ex.a_text & 0xfff || ex.a_data & 0xfff) && (N_MAGIC(ex) != NMAGIC) && printk_ratelimit()) @@ -315,28 +308,22 @@ static int load_aout_binary(struct linux_binprm * bprm) MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE, fd_offset); - if (error != N_TXTADDR(ex)) { - send_sig(SIGKILL, current, 0); + if (error != N_TXTADDR(ex)) return error; - } error = vm_mmap(bprm->file, N_DATADDR(ex), ex.a_data, PROT_READ | PROT_WRITE | PROT_EXEC, MAP_FIXED | MAP_PRIVATE | MAP_DENYWRITE | MAP_EXECUTABLE, fd_offset + ex.a_text); - if (error != N_DATADDR(ex)) { - send_sig(SIGKILL, current, 0); + if (error != N_DATADDR(ex)) return error; - } } beyond_if: set_binfmt(&aout_format); retval = set_brk(current->mm->start_brk, current->mm->brk); - if (retval < 0) { - send_sig(SIGKILL, current, 0); + if (retval < 0) return retval; - } current->mm->start_stack = (unsigned long) create_aout_tables((char __user *) bprm->p, bprm); diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 3892c1a23241..d8fc0605b9d2 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -738,10 +738,8 @@ static int load_elf_binary(struct linux_binprm *bprm) change some of these later */ retval = setup_arg_pages(bprm, randomize_stack_top(STACK_TOP), executable_stack); - if (retval < 0) { - send_sig(SIGKILL, current, 0); + if (retval < 0) goto out_free_dentry; - } current->mm->start_stack = bprm->p; @@ -763,10 +761,8 @@ static int load_elf_binary(struct linux_binprm *bprm) and clear the area. */ retval = set_brk(elf_bss + load_bias, elf_brk + load_bias); - if (retval) { - send_sig(SIGKILL, current, 0); + if (retval) goto out_free_dentry; - } nbyte = ELF_PAGEOFFSET(elf_bss); if (nbyte) { nbyte = ELF_MIN_ALIGN - nbyte; @@ -820,7 +816,6 @@ static int load_elf_binary(struct linux_binprm *bprm) error = elf_map(bprm->file, load_bias + vaddr, elf_ppnt, elf_prot, elf_flags, 0); if (BAD_ADDR(error)) { - send_sig(SIGKILL, current, 0); retval = IS_ERR((void *)error) ? PTR_ERR((void*)error) : -EINVAL; goto out_free_dentry; @@ -851,7 +846,6 @@ static int load_elf_binary(struct linux_binprm *bprm) elf_ppnt->p_memsz > TASK_SIZE || TASK_SIZE - elf_ppnt->p_memsz < k) { /* set_brk can never work. Avoid overflows. */ - send_sig(SIGKILL, current, 0); retval = -EINVAL; goto out_free_dentry; } @@ -883,12 +877,9 @@ static int load_elf_binary(struct linux_binprm *bprm) * up getting placed where the bss needs to go. */ retval = set_brk(elf_bss, elf_brk); - if (retval) { - send_sig(SIGKILL, current, 0); + if (retval) goto out_free_dentry; - } if (likely(elf_bss != elf_brk) && unlikely(padzero(elf_bss))) { - send_sig(SIGSEGV, current, 0); retval = -EFAULT; /* Nobody gets to see this, but.. */ goto out_free_dentry; } @@ -909,7 +900,6 @@ static int load_elf_binary(struct linux_binprm *bprm) elf_entry += loc->interp_elf_ex.e_entry; } if (BAD_ADDR(elf_entry)) { - force_sig(SIGSEGV, current); retval = IS_ERR((void *)elf_entry) ? (int)elf_entry : -EINVAL; goto out_free_dentry; @@ -922,7 +912,6 @@ static int load_elf_binary(struct linux_binprm *bprm) } else { elf_entry = loc->elf_ex.e_entry; if (BAD_ADDR(elf_entry)) { - force_sig(SIGSEGV, current); retval = -EINVAL; goto out_free_dentry; } @@ -934,19 +923,15 @@ static int load_elf_binary(struct linux_binprm *bprm) #ifdef ARCH_HAS_SETUP_ADDITIONAL_PAGES retval = arch_setup_additional_pages(bprm, !!elf_interpreter); - if (retval < 0) { - send_sig(SIGKILL, current, 0); + if (retval < 0) goto out; - } #endif /* ARCH_HAS_SETUP_ADDITIONAL_PAGES */ install_exec_creds(bprm); retval = create_elf_tables(bprm, &loc->elf_ex, load_addr, interp_load_addr); - if (retval < 0) { - send_sig(SIGKILL, current, 0); + if (retval < 0) goto out; - } /* N.B. passed_fileno might not be initialized? */ current->mm->end_code = end_code; current->mm->start_code = start_code; diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index fe2a643ee005..d3634bfb7fe1 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -317,8 +317,8 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm) goto error; /* there's now no turning back... the old userspace image is dead, - * defunct, deceased, etc. after this point we have to exit via - * error_kill */ + * defunct, deceased, etc. + */ set_personality(PER_LINUX_FDPIC); if (elf_read_implies_exec(&exec_params.hdr, executable_stack)) current->personality |= READ_IMPLIES_EXEC; @@ -343,24 +343,22 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm) retval = setup_arg_pages(bprm, current->mm->start_stack, executable_stack); - if (retval < 0) { - send_sig(SIGKILL, current, 0); - goto error_kill; - } + if (retval < 0) + goto error; #endif /* load the executable and interpreter into memory */ retval = elf_fdpic_map_file(&exec_params, bprm->file, current->mm, "executable"); if (retval < 0) - goto error_kill; + goto error; if (interpreter_name) { retval = elf_fdpic_map_file(&interp_params, interpreter, current->mm, "interpreter"); if (retval < 0) { printk(KERN_ERR "Unable to load interpreter\n"); - goto error_kill; + goto error; } allow_write_access(interpreter); @@ -397,7 +395,7 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm) if (IS_ERR_VALUE(current->mm->start_brk)) { retval = current->mm->start_brk; current->mm->start_brk = 0; - goto error_kill; + goto error; } current->mm->brk = current->mm->start_brk; @@ -410,7 +408,7 @@ static int load_elf_fdpic_binary(struct linux_binprm *bprm) install_exec_creds(bprm); if (create_elf_fdpic_tables(bprm, current->mm, &exec_params, &interp_params) < 0) - goto error_kill; + goto error; kdebug("- start_code %lx", current->mm->start_code); kdebug("- end_code %lx", current->mm->end_code); @@ -449,12 +447,6 @@ error: kfree(interp_params.phdrs); kfree(interp_params.loadmap); return retval; - - /* unrecoverable error - kill the process */ -error_kill: - send_sig(SIGSEGV, current, 0); - goto error; - } /*****************************************************************************/ diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index b60500300dd7..fd8beb9657a2 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -62,7 +62,22 @@ static struct file_system_type bm_fs_type; static struct vfsmount *bm_mnt; static int entry_count; -/* +/* + * Max length of the register string. Determined by: + * - 7 delimiters + * - name: ~50 bytes + * - type: 1 byte + * - offset: 3 bytes (has to be smaller than BINPRM_BUF_SIZE) + * - magic: 128 bytes (512 in escaped form) + * - mask: 128 bytes (512 in escaped form) + * - interp: ~50 bytes + * - flags: 5 bytes + * Round that up a bit, and then back off to hold the internal data + * (like struct Node). + */ +#define MAX_REGISTER_LENGTH 1920 + +/* * Check if we support the binfmt * if we do, return the node, else NULL * locking is done in load_misc_binary @@ -279,7 +294,7 @@ static Node *create_entry(const char __user *buffer, size_t count) /* some sanity checks */ err = -EINVAL; - if ((count < 11) || (count > 256)) + if ((count < 11) || (count > MAX_REGISTER_LENGTH)) goto out; err = -ENOMEM; @@ -396,12 +411,12 @@ static int parse_command(const char __user *buffer, size_t count) { char s[4]; - if (!count) - return 0; if (count > 3) return -EINVAL; if (copy_from_user(s, buffer, count)) return -EFAULT; + if (!count) + return 0; if (s[count-1] == '\n') count--; if (count == 1 && s[0] == '0') diff --git a/fs/block_dev.c b/fs/block_dev.c index 6d7274619bf9..cc9d4114cda0 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -50,32 +50,22 @@ inline struct block_device *I_BDEV(struct inode *inode) EXPORT_SYMBOL(I_BDEV); /* - * Move the inode from its current bdi to a new bdi. If the inode is dirty we - * need to move it onto the dirty list of @dst so that the inode is always on - * the right list. + * Move the inode from its current bdi to a new bdi. Make sure the inode + * is clean before moving so that it doesn't linger on the old bdi. */ static void bdev_inode_switch_bdi(struct inode *inode, struct backing_dev_info *dst) { - struct backing_dev_info *old = inode->i_data.backing_dev_info; - bool wakeup_bdi = false; - - if (unlikely(dst == old)) /* deadlock avoidance */ - return; - bdi_lock_two(&old->wb, &dst->wb); - spin_lock(&inode->i_lock); - inode->i_data.backing_dev_info = dst; - if (inode->i_state & I_DIRTY) { - if (bdi_cap_writeback_dirty(dst) && !wb_has_dirty_io(&dst->wb)) - wakeup_bdi = true; - list_move(&inode->i_wb_list, &dst->wb.b_dirty); + while (true) { + spin_lock(&inode->i_lock); + if (!(inode->i_state & I_DIRTY)) { + inode->i_data.backing_dev_info = dst; + spin_unlock(&inode->i_lock); + return; + } + spin_unlock(&inode->i_lock); + WARN_ON_ONCE(write_inode_now(inode, true)); } - spin_unlock(&inode->i_lock); - spin_unlock(&old->wb.list_lock); - spin_unlock(&dst->wb.list_lock); - - if (wakeup_bdi) - bdi_wakeup_thread_delayed(dst); } /* Kill _all_ buffers and pagecache , dirty or not.. */ @@ -304,6 +294,12 @@ static int blkdev_readpage(struct file * file, struct page * page) return block_read_full_page(page, blkdev_get_block); } +static int blkdev_readpages(struct file *file, struct address_space *mapping, + struct list_head *pages, unsigned nr_pages) +{ + return mpage_readpages(mapping, pages, nr_pages, blkdev_get_block); +} + static int blkdev_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) @@ -1173,8 +1169,6 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode, int for_part) if (!ret) { bd_set_size(bdev,(loff_t)get_capacity(disk)<<9); bdi = blk_get_backing_dev_info(bdev); - if (bdi == NULL) - bdi = &default_backing_dev_info; bdev_inode_switch_bdi(bdev->bd_inode, bdi); } @@ -1622,6 +1616,7 @@ static int blkdev_releasepage(struct page *page, gfp_t wait) static const struct address_space_operations def_blk_aops = { .readpage = blkdev_readpage, + .readpages = blkdev_readpages, .writepage = blkdev_writepage, .write_begin = blkdev_write_begin, .write_end = blkdev_write_end, diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index fbd76ded9a34..4dabeb893b7c 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -74,6 +74,7 @@ BTRFS_WORK_HELPER(endio_helper); BTRFS_WORK_HELPER(endio_meta_helper); BTRFS_WORK_HELPER(endio_meta_write_helper); BTRFS_WORK_HELPER(endio_raid56_helper); +BTRFS_WORK_HELPER(endio_repair_helper); BTRFS_WORK_HELPER(rmw_helper); BTRFS_WORK_HELPER(endio_write_helper); BTRFS_WORK_HELPER(freespace_write_helper); @@ -91,7 +92,7 @@ __btrfs_alloc_workqueue(const char *name, int flags, int max_active, { struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS); - if (unlikely(!ret)) + if (!ret) return NULL; ret->max_active = max_active; @@ -115,7 +116,7 @@ __btrfs_alloc_workqueue(const char *name, int flags, int max_active, ret->normal_wq = alloc_workqueue("%s-%s", flags, ret->max_active, "btrfs", name); - if (unlikely(!ret->normal_wq)) { + if (!ret->normal_wq) { kfree(ret); return NULL; } @@ -137,12 +138,12 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name, { struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_NOFS); - if (unlikely(!ret)) + if (!ret) return NULL; ret->normal = __btrfs_alloc_workqueue(name, flags & ~WQ_HIGHPRI, max_active, thresh); - if (unlikely(!ret->normal)) { + if (!ret->normal) { kfree(ret); return NULL; } @@ -150,7 +151,7 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(const char *name, if (flags & WQ_HIGHPRI) { ret->high = __btrfs_alloc_workqueue(name, flags, max_active, thresh); - if (unlikely(!ret->high)) { + if (!ret->high) { __btrfs_destroy_workqueue(ret->normal); kfree(ret); return NULL; diff --git a/fs/btrfs/async-thread.h b/fs/btrfs/async-thread.h index e9e31c94758f..e386c29ef1f6 100644 --- a/fs/btrfs/async-thread.h +++ b/fs/btrfs/async-thread.h @@ -53,6 +53,7 @@ BTRFS_WORK_HELPER_PROTO(endio_helper); BTRFS_WORK_HELPER_PROTO(endio_meta_helper); BTRFS_WORK_HELPER_PROTO(endio_meta_write_helper); BTRFS_WORK_HELPER_PROTO(endio_raid56_helper); +BTRFS_WORK_HELPER_PROTO(endio_repair_helper); BTRFS_WORK_HELPER_PROTO(rmw_helper); BTRFS_WORK_HELPER_PROTO(endio_write_helper); BTRFS_WORK_HELPER_PROTO(freespace_write_helper); diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 54a201dac7f9..2d3e32ebfd15 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -25,6 +25,9 @@ #include "delayed-ref.h" #include "locking.h" +/* Just an arbitrary number so we can be sure this happened */ +#define BACKREF_FOUND_SHARED 6 + struct extent_inode_elem { u64 inum; u64 offset; @@ -377,7 +380,8 @@ out: static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, struct btrfs_path *path, u64 time_seq, struct list_head *head, - const u64 *extent_item_pos, u64 total_refs) + const u64 *extent_item_pos, u64 total_refs, + u64 root_objectid) { int err; int ret = 0; @@ -402,6 +406,10 @@ static int __resolve_indirect_refs(struct btrfs_fs_info *fs_info, continue; if (ref->count == 0) continue; + if (root_objectid && ref->root_id != root_objectid) { + ret = BACKREF_FOUND_SHARED; + goto out; + } err = __resolve_indirect_ref(fs_info, path, time_seq, ref, parents, extent_item_pos, total_refs); @@ -482,7 +490,7 @@ static int __add_missing_keys(struct btrfs_fs_info *fs_info, continue; BUG_ON(!ref->wanted_disk_byte); eb = read_tree_block(fs_info->tree_root, ref->wanted_disk_byte, - fs_info->tree_root->leafsize, 0); + 0); if (!eb || !extent_buffer_uptodate(eb)) { free_extent_buffer(eb); return -EIO; @@ -561,7 +569,8 @@ static void __merge_refs(struct list_head *head, int mode) * smaller or equal that seq to the list */ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, - struct list_head *prefs, u64 *total_refs) + struct list_head *prefs, u64 *total_refs, + u64 inum) { struct btrfs_delayed_extent_op *extent_op = head->extent_op; struct rb_node *n = &head->node.rb_node; @@ -625,6 +634,16 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, key.objectid = ref->objectid; key.type = BTRFS_EXTENT_DATA_KEY; key.offset = ref->offset; + + /* + * Found a inum that doesn't match our known inum, we + * know it's shared. + */ + if (inum && ref->objectid != inum) { + ret = BACKREF_FOUND_SHARED; + break; + } + ret = __add_prelim_ref(prefs, ref->root, &key, 0, 0, node->bytenr, node->ref_mod * sgn, GFP_ATOMIC); @@ -659,7 +678,7 @@ static int __add_delayed_refs(struct btrfs_delayed_ref_head *head, u64 seq, static int __add_inline_refs(struct btrfs_fs_info *fs_info, struct btrfs_path *path, u64 bytenr, int *info_level, struct list_head *prefs, - u64 *total_refs) + u64 *total_refs, u64 inum) { int ret = 0; int slot; @@ -744,6 +763,12 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info, dref); key.type = BTRFS_EXTENT_DATA_KEY; key.offset = btrfs_extent_data_ref_offset(leaf, dref); + + if (inum && key.objectid != inum) { + ret = BACKREF_FOUND_SHARED; + break; + } + root = btrfs_extent_data_ref_root(leaf, dref); ret = __add_prelim_ref(prefs, root, &key, 0, 0, bytenr, count, GFP_NOFS); @@ -765,7 +790,7 @@ static int __add_inline_refs(struct btrfs_fs_info *fs_info, */ static int __add_keyed_refs(struct btrfs_fs_info *fs_info, struct btrfs_path *path, u64 bytenr, - int info_level, struct list_head *prefs) + int info_level, struct list_head *prefs, u64 inum) { struct btrfs_root *extent_root = fs_info->extent_root; int ret; @@ -827,6 +852,12 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info, dref); key.type = BTRFS_EXTENT_DATA_KEY; key.offset = btrfs_extent_data_ref_offset(leaf, dref); + + if (inum && key.objectid != inum) { + ret = BACKREF_FOUND_SHARED; + break; + } + root = btrfs_extent_data_ref_root(leaf, dref); ret = __add_prelim_ref(prefs, root, &key, 0, 0, bytenr, count, GFP_NOFS); @@ -854,7 +885,8 @@ static int __add_keyed_refs(struct btrfs_fs_info *fs_info, static int find_parent_nodes(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info, u64 bytenr, u64 time_seq, struct ulist *refs, - struct ulist *roots, const u64 *extent_item_pos) + struct ulist *roots, const u64 *extent_item_pos, + u64 root_objectid, u64 inum) { struct btrfs_key key; struct btrfs_path *path; @@ -929,7 +961,8 @@ again: } spin_unlock(&delayed_refs->lock); ret = __add_delayed_refs(head, time_seq, - &prefs_delayed, &total_refs); + &prefs_delayed, &total_refs, + inum); mutex_unlock(&head->mutex); if (ret) goto out; @@ -951,11 +984,11 @@ again: key.type == BTRFS_METADATA_ITEM_KEY)) { ret = __add_inline_refs(fs_info, path, bytenr, &info_level, &prefs, - &total_refs); + &total_refs, inum); if (ret) goto out; ret = __add_keyed_refs(fs_info, path, bytenr, - info_level, &prefs); + info_level, &prefs, inum); if (ret) goto out; } @@ -971,7 +1004,8 @@ again: __merge_refs(&prefs, 1); ret = __resolve_indirect_refs(fs_info, path, time_seq, &prefs, - extent_item_pos, total_refs); + extent_item_pos, total_refs, + root_objectid); if (ret) goto out; @@ -981,6 +1015,11 @@ again: ref = list_first_entry(&prefs, struct __prelim_ref, list); WARN_ON(ref->count < 0); if (roots && ref->count && ref->root_id && ref->parent == 0) { + if (root_objectid && ref->root_id != root_objectid) { + ret = BACKREF_FOUND_SHARED; + goto out; + } + /* no parent == root of tree */ ret = ulist_add(roots, ref->root_id, 0, GFP_NOFS); if (ret < 0) @@ -989,12 +1028,10 @@ again: if (ref->count && ref->parent) { if (extent_item_pos && !ref->inode_list && ref->level == 0) { - u32 bsz; struct extent_buffer *eb; - bsz = btrfs_level_size(fs_info->extent_root, - ref->level); + eb = read_tree_block(fs_info->extent_root, - ref->parent, bsz, 0); + ref->parent, 0); if (!eb || !extent_buffer_uptodate(eb)) { free_extent_buffer(eb); ret = -EIO; @@ -1087,7 +1124,7 @@ static int btrfs_find_all_leafs(struct btrfs_trans_handle *trans, return -ENOMEM; ret = find_parent_nodes(trans, fs_info, bytenr, - time_seq, *leafs, NULL, extent_item_pos); + time_seq, *leafs, NULL, extent_item_pos, 0, 0); if (ret < 0 && ret != -ENOENT) { free_leaf_list(*leafs); return ret; @@ -1130,7 +1167,7 @@ static int __btrfs_find_all_roots(struct btrfs_trans_handle *trans, ULIST_ITER_INIT(&uiter); while (1) { ret = find_parent_nodes(trans, fs_info, bytenr, - time_seq, tmp, *roots, NULL); + time_seq, tmp, *roots, NULL, 0, 0); if (ret < 0 && ret != -ENOENT) { ulist_free(tmp); ulist_free(*roots); @@ -1161,6 +1198,54 @@ int btrfs_find_all_roots(struct btrfs_trans_handle *trans, return ret; } +int btrfs_check_shared(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 root_objectid, + u64 inum, u64 bytenr) +{ + struct ulist *tmp = NULL; + struct ulist *roots = NULL; + struct ulist_iterator uiter; + struct ulist_node *node; + struct seq_list elem = {}; + int ret = 0; + + tmp = ulist_alloc(GFP_NOFS); + roots = ulist_alloc(GFP_NOFS); + if (!tmp || !roots) { + ulist_free(tmp); + ulist_free(roots); + return -ENOMEM; + } + + if (trans) + btrfs_get_tree_mod_seq(fs_info, &elem); + else + down_read(&fs_info->commit_root_sem); + ULIST_ITER_INIT(&uiter); + while (1) { + ret = find_parent_nodes(trans, fs_info, bytenr, elem.seq, tmp, + roots, NULL, root_objectid, inum); + if (ret == BACKREF_FOUND_SHARED) { + ret = 1; + break; + } + if (ret < 0 && ret != -ENOENT) + break; + node = ulist_next(tmp, &uiter); + if (!node) + break; + bytenr = node->val; + cond_resched(); + } + if (trans) + btrfs_put_tree_mod_seq(fs_info, &elem); + else + up_read(&fs_info->commit_root_sem); + ulist_free(tmp); + ulist_free(roots); + return ret; +} + /* * this makes the path point to (inum INODE_ITEM ioff) */ @@ -1193,7 +1278,7 @@ int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid, unsigned long ptr; key.objectid = inode_objectid; - btrfs_set_key_type(&key, BTRFS_INODE_EXTREF_KEY); + key.type = BTRFS_INODE_EXTREF_KEY; key.offset = start_off; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); @@ -1233,7 +1318,7 @@ int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid, ret = -ENOENT; if (found_key.objectid != inode_objectid) break; - if (btrfs_key_type(&found_key) != BTRFS_INODE_EXTREF_KEY) + if (found_key.type != BTRFS_INODE_EXTREF_KEY) break; ret = 0; @@ -1366,7 +1451,7 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, } btrfs_item_key_to_cpu(path->nodes[0], found_key, path->slots[0]); if (found_key->type == BTRFS_METADATA_ITEM_KEY) - size = fs_info->extent_root->leafsize; + size = fs_info->extent_root->nodesize; else if (found_key->type == BTRFS_EXTENT_ITEM_KEY) size = found_key->offset; diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index 86fc20fec282..2a1ac6bfc724 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -71,6 +71,9 @@ int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid, u64 start_off, struct btrfs_path *path, struct btrfs_inode_extref **ret_extref, u64 *found_off); +int btrfs_check_shared(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 root_objectid, + u64 inum, u64 bytenr); int __init btrfs_prelim_ref_init(void); void btrfs_prelim_ref_exit(void); diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 56b8522d5767..4aadadcfab20 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -44,6 +44,17 @@ #define BTRFS_INODE_IN_DELALLOC_LIST 9 #define BTRFS_INODE_READDIO_NEED_LOCK 10 #define BTRFS_INODE_HAS_PROPS 11 +/* + * The following 3 bits are meant only for the btree inode. + * When any of them is set, it means an error happened while writing an + * extent buffer belonging to: + * 1) a non-log btree + * 2) a log btree and first log sub-transaction + * 3) a log btree and second log sub-transaction + */ +#define BTRFS_INODE_BTREE_ERR 12 +#define BTRFS_INODE_BTREE_LOG1_ERR 13 +#define BTRFS_INODE_BTREE_LOG2_ERR 14 /* in memory btrfs inode */ struct btrfs_inode { @@ -121,6 +132,12 @@ struct btrfs_inode { u64 delalloc_bytes; /* + * total number of bytes pending defrag, used by stat to check whether + * it needs COW. + */ + u64 defrag_bytes; + + /* * the size of the file stored in the metadata on disk. data=ordered * means the in-memory i_size might be larger than the size on disk * because not all the blocks are written yet. @@ -248,8 +265,11 @@ static inline int btrfs_inode_in_log(struct inode *inode, u64 generation) return 0; } +#define BTRFS_DIO_ORIG_BIO_SUBMITTED 0x1 + struct btrfs_dio_private { struct inode *inode; + unsigned long flags; u64 logical_offset; u64 disk_bytenr; u64 bytes; @@ -266,7 +286,12 @@ struct btrfs_dio_private { /* dio_bio came from fs/direct-io.c */ struct bio *dio_bio; - u8 csum[0]; + + /* + * The original bio may be splited to several sub-bios, this is + * done during endio of sub-bios + */ + int (*subio_endio)(struct inode *, struct btrfs_io_bio *, int); }; /* diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index ce92ae30250f..cb7f3fe9c9f6 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -807,7 +807,7 @@ static int btrfsic_process_superblock_dev_mirror( /* super block bytenr is always the unmapped device bytenr */ dev_bytenr = btrfs_sb_offset(superblock_mirror_num); - if (dev_bytenr + BTRFS_SUPER_INFO_SIZE > device->total_bytes) + if (dev_bytenr + BTRFS_SUPER_INFO_SIZE > device->commit_total_bytes) return -1; bh = __bread(superblock_bdev, dev_bytenr / 4096, BTRFS_SUPER_INFO_SIZE); @@ -820,7 +820,6 @@ static int btrfsic_process_superblock_dev_mirror( btrfs_super_magic(super_tmp) != BTRFS_MAGIC || memcmp(device->uuid, super_tmp->dev_item.uuid, BTRFS_UUID_SIZE) || btrfs_super_nodesize(super_tmp) != state->metablock_size || - btrfs_super_leafsize(super_tmp) != state->metablock_size || btrfs_super_sectorsize(super_tmp) != state->datablock_size) { brelse(bh); return 0; @@ -1252,8 +1251,7 @@ static void btrfsic_read_from_block_data( while (len > 0) { cur = min(len, ((size_t)PAGE_CACHE_SIZE - offset_in_page)); - BUG_ON(i >= (block_ctx->len + PAGE_CACHE_SIZE - 1) >> - PAGE_CACHE_SHIFT); + BUG_ON(i >= DIV_ROUND_UP(block_ctx->len, PAGE_CACHE_SIZE)); kaddr = block_ctx->datav[i]; memcpy(dst, kaddr + offset_in_page, cur); @@ -3120,24 +3118,12 @@ int btrfsic_mount(struct btrfs_root *root, struct list_head *dev_head = &fs_devices->devices; struct btrfs_device *device; - if (root->nodesize != root->leafsize) { - printk(KERN_INFO - "btrfsic: cannot handle nodesize %d != leafsize %d!\n", - root->nodesize, root->leafsize); - return -1; - } if (root->nodesize & ((u64)PAGE_CACHE_SIZE - 1)) { printk(KERN_INFO "btrfsic: cannot handle nodesize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n", root->nodesize, PAGE_CACHE_SIZE); return -1; } - if (root->leafsize & ((u64)PAGE_CACHE_SIZE - 1)) { - printk(KERN_INFO - "btrfsic: cannot handle leafsize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n", - root->leafsize, PAGE_CACHE_SIZE); - return -1; - } if (root->sectorsize & ((u64)PAGE_CACHE_SIZE - 1)) { printk(KERN_INFO "btrfsic: cannot handle sectorsize %d not being a multiple of PAGE_CACHE_SIZE %ld!\n", diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 1daea0b47187..d3220d31d3cb 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -91,8 +91,7 @@ static inline int compressed_bio_size(struct btrfs_root *root, u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); return sizeof(struct compressed_bio) + - ((disk_size + root->sectorsize - 1) / root->sectorsize) * - csum_size; + (DIV_ROUND_UP(disk_size, root->sectorsize)) * csum_size; } static struct bio *compressed_bio_alloc(struct block_device *bdev, @@ -389,7 +388,8 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, * freed before we're done setting it up */ atomic_inc(&cb->pending_bios); - ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); + ret = btrfs_bio_wq_end_io(root->fs_info, bio, + BTRFS_WQ_ENDIO_DATA); BUG_ON(ret); /* -ENOMEM */ if (!skip_sum) { @@ -420,7 +420,7 @@ int btrfs_submit_compressed_write(struct inode *inode, u64 start, } bio_get(bio); - ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); + ret = btrfs_bio_wq_end_io(root->fs_info, bio, BTRFS_WQ_ENDIO_DATA); BUG_ON(ret); /* -ENOMEM */ if (!skip_sum) { @@ -615,8 +615,7 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, cb->compress_type = extent_compress_type(bio_flags); cb->orig_bio = bio; - nr_pages = (compressed_len + PAGE_CACHE_SIZE - 1) / - PAGE_CACHE_SIZE; + nr_pages = DIV_ROUND_UP(compressed_len, PAGE_CACHE_SIZE); cb->compressed_pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); if (!cb->compressed_pages) @@ -670,7 +669,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, PAGE_CACHE_SIZE) { bio_get(comp_bio); - ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0); + ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, + BTRFS_WQ_ENDIO_DATA); BUG_ON(ret); /* -ENOMEM */ /* @@ -686,8 +686,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, comp_bio, sums); BUG_ON(ret); /* -ENOMEM */ } - sums += (comp_bio->bi_iter.bi_size + - root->sectorsize - 1) / root->sectorsize; + sums += DIV_ROUND_UP(comp_bio->bi_iter.bi_size, + root->sectorsize); ret = btrfs_map_bio(root, READ, comp_bio, mirror_num, 0); @@ -708,7 +708,8 @@ int btrfs_submit_compressed_read(struct inode *inode, struct bio *bio, } bio_get(comp_bio); - ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, 0); + ret = btrfs_bio_wq_end_io(root->fs_info, comp_bio, + BTRFS_WQ_ENDIO_DATA); BUG_ON(ret); /* -ENOMEM */ if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 44ee5d2e52a4..19bc6162fb8e 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -258,9 +258,8 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, else btrfs_node_key(buf, &disk_key, 0); - cow = btrfs_alloc_free_block(trans, root, buf->len, 0, - new_root_objectid, &disk_key, level, - buf->start, 0); + cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid, + &disk_key, level, buf->start, 0); if (IS_ERR(cow)) return PTR_ERR(cow); @@ -1133,9 +1132,9 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, } else parent_start = 0; - cow = btrfs_alloc_free_block(trans, root, buf->len, parent_start, - root->root_key.objectid, &disk_key, - level, search_start, empty_size); + cow = btrfs_alloc_tree_block(trans, root, parent_start, + root->root_key.objectid, &disk_key, level, + search_start, empty_size); if (IS_ERR(cow)) return PTR_ERR(cow); @@ -1425,7 +1424,6 @@ get_old_root(struct btrfs_root *root, u64 time_seq) struct tree_mod_root *old_root = NULL; u64 old_generation = 0; u64 logical; - u32 blocksize; eb_root = btrfs_read_lock_root_node(root); tm = __tree_mod_log_oldest_root(root->fs_info, eb_root, time_seq); @@ -1444,8 +1442,7 @@ get_old_root(struct btrfs_root *root, u64 time_seq) if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) { btrfs_tree_read_unlock(eb_root); free_extent_buffer(eb_root); - blocksize = btrfs_level_size(root, old_root->level); - old = read_tree_block(root, logical, blocksize, 0); + old = read_tree_block(root, logical, 0); if (WARN_ON(!old || !extent_buffer_uptodate(old))) { free_extent_buffer(old); btrfs_warn(root->fs_info, @@ -1506,10 +1503,9 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf) { -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS - if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) + if (btrfs_test_is_dummy_root(root)) return 0; -#endif + /* ensure we can see the force_cow */ smp_rmb(); @@ -1651,7 +1647,7 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, WARN_ON(trans->transid != root->fs_info->generation); parent_nritems = btrfs_header_nritems(parent); - blocksize = btrfs_level_size(root, parent_level - 1); + blocksize = root->nodesize; end_slot = parent_nritems; if (parent_nritems == 1) @@ -1685,15 +1681,14 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, continue; } - cur = btrfs_find_tree_block(root, blocknr, blocksize); + cur = btrfs_find_tree_block(root, blocknr); if (cur) uptodate = btrfs_buffer_uptodate(cur, gen, 0); else uptodate = 0; if (!cur || !uptodate) { if (!cur) { - cur = read_tree_block(root, blocknr, - blocksize, gen); + cur = read_tree_block(root, blocknr, gen); if (!cur || !extent_buffer_uptodate(cur)) { free_extent_buffer(cur); return -EIO; @@ -1872,7 +1867,6 @@ static noinline struct extent_buffer *read_node_slot(struct btrfs_root *root, BUG_ON(level == 0); eb = read_tree_block(root, btrfs_node_blockptr(parent, slot), - btrfs_level_size(root, level - 1), btrfs_node_ptr_generation(parent, slot)); if (eb && !extent_buffer_uptodate(eb)) { free_extent_buffer(eb); @@ -2267,8 +2261,8 @@ static void reada_for_search(struct btrfs_root *root, node = path->nodes[level]; search = btrfs_node_blockptr(node, slot); - blocksize = btrfs_level_size(root, level - 1); - eb = btrfs_find_tree_block(root, search, blocksize); + blocksize = root->nodesize; + eb = btrfs_find_tree_block(root, search); if (eb) { free_extent_buffer(eb); return; @@ -2298,7 +2292,7 @@ static void reada_for_search(struct btrfs_root *root, if ((search <= target && target - search <= 65536) || (search > target && search - target <= 65536)) { gen = btrfs_node_ptr_generation(node, nr); - readahead_tree_block(root, search, blocksize, gen); + readahead_tree_block(root, search, blocksize); nread += blocksize; } nscan++; @@ -2325,12 +2319,12 @@ static noinline void reada_for_balance(struct btrfs_root *root, nritems = btrfs_header_nritems(parent); slot = path->slots[level + 1]; - blocksize = btrfs_level_size(root, level); + blocksize = root->nodesize; if (slot > 0) { block1 = btrfs_node_blockptr(parent, slot - 1); gen = btrfs_node_ptr_generation(parent, slot - 1); - eb = btrfs_find_tree_block(root, block1, blocksize); + eb = btrfs_find_tree_block(root, block1); /* * if we get -eagain from btrfs_buffer_uptodate, we * don't want to return eagain here. That will loop @@ -2343,16 +2337,16 @@ static noinline void reada_for_balance(struct btrfs_root *root, if (slot + 1 < nritems) { block2 = btrfs_node_blockptr(parent, slot + 1); gen = btrfs_node_ptr_generation(parent, slot + 1); - eb = btrfs_find_tree_block(root, block2, blocksize); + eb = btrfs_find_tree_block(root, block2); if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0) block2 = 0; free_extent_buffer(eb); } if (block1) - readahead_tree_block(root, block1, blocksize, 0); + readahead_tree_block(root, block1, blocksize); if (block2) - readahead_tree_block(root, block2, blocksize, 0); + readahead_tree_block(root, block2, blocksize); } @@ -2454,16 +2448,14 @@ read_block_for_search(struct btrfs_trans_handle *trans, { u64 blocknr; u64 gen; - u32 blocksize; struct extent_buffer *b = *eb_ret; struct extent_buffer *tmp; int ret; blocknr = btrfs_node_blockptr(b, slot); gen = btrfs_node_ptr_generation(b, slot); - blocksize = btrfs_level_size(root, level - 1); - tmp = btrfs_find_tree_block(root, blocknr, blocksize); + tmp = btrfs_find_tree_block(root, blocknr); if (tmp) { /* first we do an atomic uptodate check */ if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) { @@ -2507,7 +2499,7 @@ read_block_for_search(struct btrfs_trans_handle *trans, btrfs_release_path(p); ret = -EAGAIN; - tmp = read_tree_block(root, blocknr, blocksize, 0); + tmp = read_tree_block(root, blocknr, 0); if (tmp) { /* * If the read above didn't mark this buffer up to date, @@ -2792,8 +2784,6 @@ again: if (!should_cow_block(trans, root, b)) goto cow_done; - btrfs_set_path_blocking(p); - /* * must have write locks on this node and the * parent @@ -2807,6 +2797,7 @@ again: goto again; } + btrfs_set_path_blocking(p); err = btrfs_cow_block(trans, root, b, p->nodes[level + 1], p->slots[level + 1], &b); @@ -3362,9 +3353,8 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, else btrfs_node_key(lower, &lower_key, 0); - c = btrfs_alloc_free_block(trans, root, root->nodesize, 0, - root->root_key.objectid, &lower_key, - level, root->node->start, 0); + c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid, + &lower_key, level, root->node->start, 0); if (IS_ERR(c)) return PTR_ERR(c); @@ -3502,9 +3492,8 @@ static noinline int split_node(struct btrfs_trans_handle *trans, mid = (c_nritems + 1) / 2; btrfs_node_key(c, &disk_key, mid); - split = btrfs_alloc_free_block(trans, root, root->nodesize, 0, - root->root_key.objectid, - &disk_key, level, c->start, 0); + split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid, + &disk_key, level, c->start, 0); if (IS_ERR(split)) return PTR_ERR(split); @@ -4282,13 +4271,12 @@ again: else btrfs_item_key(l, &disk_key, mid); - right = btrfs_alloc_free_block(trans, root, root->leafsize, 0, - root->root_key.objectid, - &disk_key, 0, l->start, 0); + right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid, + &disk_key, 0, l->start, 0); if (IS_ERR(right)) return PTR_ERR(right); - root_add_used(root, root->leafsize); + root_add_used(root, root->nodesize); memset_extent_buffer(right, 0, 0, sizeof(struct btrfs_header)); btrfs_set_header_bytenr(right, right->start); @@ -4626,8 +4614,7 @@ void btrfs_truncate_item(struct btrfs_root *root, struct btrfs_path *path, ptr = btrfs_item_ptr_offset(leaf, slot); memmove_extent_buffer(leaf, ptr, (unsigned long)fi, - offsetof(struct btrfs_file_extent_item, - disk_bytenr)); + BTRFS_FILE_EXTENT_INLINE_DATA_START); } } @@ -4738,6 +4725,12 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, int slot; struct btrfs_map_token token; + if (path->slots[0] == 0) { + btrfs_cpu_key_to_disk(&disk_key, cpu_key); + fixup_low_keys(root, path, &disk_key, 1); + } + btrfs_unlock_up_safe(path, 1); + btrfs_init_map_token(&token); leaf = path->nodes[0]; @@ -4798,12 +4791,6 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, } btrfs_set_header_nritems(leaf, nritems + nr); - - if (slot == 0) { - btrfs_cpu_key_to_disk(&disk_key, cpu_key); - fixup_low_keys(root, path, &disk_key, 1); - } - btrfs_unlock_up_safe(path, 1); btrfs_mark_buffer_dirty(leaf); if (btrfs_leaf_free_space(root, leaf) < 0) { @@ -5145,8 +5132,9 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, u32 nritems; int level; int ret = 1; + int keep_locks = path->keep_locks; - WARN_ON(!path->keep_locks); + path->keep_locks = 1; again: cur = btrfs_read_lock_root_node(root); level = btrfs_header_level(cur); @@ -5210,7 +5198,6 @@ find_next_key: path->slots[level] = slot; if (level == path->lowest_level) { ret = 0; - unlock_up(path, level, 1, 0, NULL); goto out; } btrfs_set_path_blocking(path); @@ -5225,9 +5212,12 @@ find_next_key: btrfs_clear_path_blocking(path, NULL, 0); } out: - if (ret == 0) + path->keep_locks = keep_locks; + if (ret == 0) { + btrfs_unlock_up_safe(path, path->lowest_level + 1); + btrfs_set_path_blocking(path); memcpy(min_key, &found_key, sizeof(found_key)); - btrfs_set_path_blocking(path); + } return ret; } @@ -5375,7 +5365,7 @@ int btrfs_compare_trees(struct btrfs_root *left_root, goto out; } - tmp_buf = kmalloc(left_root->leafsize, GFP_NOFS); + tmp_buf = kmalloc(left_root->nodesize, GFP_NOFS); if (!tmp_buf) { ret = -ENOMEM; goto out; @@ -5520,18 +5510,18 @@ int btrfs_compare_trees(struct btrfs_root *left_root, goto out; advance_right = ADVANCE; } else { - enum btrfs_compare_tree_result cmp; + enum btrfs_compare_tree_result result; WARN_ON(!extent_buffer_uptodate(left_path->nodes[0])); ret = tree_compare_item(left_root, left_path, right_path, tmp_buf); if (ret) - cmp = BTRFS_COMPARE_TREE_CHANGED; + result = BTRFS_COMPARE_TREE_CHANGED; else - cmp = BTRFS_COMPARE_TREE_SAME; + result = BTRFS_COMPARE_TREE_SAME; ret = changed_cb(left_root, right_root, left_path, right_path, - &left_key, cmp, ctx); + &left_key, result, ctx); if (ret < 0) goto out; advance_left = ADVANCE; diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 8e29b614fe93..d557264ee974 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -34,6 +34,7 @@ #include <linux/pagemap.h> #include <linux/btrfs.h> #include <linux/workqueue.h> +#include <linux/security.h> #include "extent_io.h" #include "extent_map.h" #include "async-thread.h" @@ -62,13 +63,6 @@ struct btrfs_ordered_sum; #define BTRFS_COMPAT_EXTENT_TREE_V0 -/* - * files bigger than this get some pre-flushing when they are added - * to the ordered operations list. That way we limit the total - * work done by the commit - */ -#define BTRFS_ORDERED_OPERATIONS_FLUSH_LIMIT (8 * 1024 * 1024) - /* holds pointers to all of the tree roots */ #define BTRFS_ROOT_TREE_OBJECTID 1ULL @@ -391,10 +385,12 @@ struct btrfs_header { sizeof(struct btrfs_header)) / \ sizeof(struct btrfs_key_ptr)) #define __BTRFS_LEAF_DATA_SIZE(bs) ((bs) - sizeof(struct btrfs_header)) -#define BTRFS_LEAF_DATA_SIZE(r) (__BTRFS_LEAF_DATA_SIZE(r->leafsize)) +#define BTRFS_LEAF_DATA_SIZE(r) (__BTRFS_LEAF_DATA_SIZE(r->nodesize)) +#define BTRFS_FILE_EXTENT_INLINE_DATA_START \ + (offsetof(struct btrfs_file_extent_item, disk_bytenr)) #define BTRFS_MAX_INLINE_DATA_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \ sizeof(struct btrfs_item) - \ - sizeof(struct btrfs_file_extent_item)) + BTRFS_FILE_EXTENT_INLINE_DATA_START) #define BTRFS_MAX_XATTR_SIZE(r) (BTRFS_LEAF_DATA_SIZE(r) - \ sizeof(struct btrfs_item) -\ sizeof(struct btrfs_dir_item)) @@ -474,7 +470,7 @@ struct btrfs_super_block { __le64 num_devices; __le32 sectorsize; __le32 nodesize; - __le32 leafsize; + __le32 __unused_leafsize; __le32 stripesize; __le32 sys_chunk_array_size; __le64 chunk_root_generation; @@ -903,6 +899,8 @@ struct btrfs_file_extent_item { /* * disk space consumed by the extent, checksum blocks are included * in these numbers + * + * At this offset in the structure, the inline extent data start. */ __le64 disk_bytenr; __le64 disk_num_bytes; @@ -1305,8 +1303,8 @@ struct btrfs_block_group_cache { */ struct list_head cluster_list; - /* For delayed block group creation */ - struct list_head new_bg_list; + /* For delayed block group creation or deletion of empty block groups */ + struct list_head bg_list; }; /* delayed seq elem */ @@ -1545,6 +1543,7 @@ struct btrfs_fs_info { struct btrfs_workqueue *endio_workers; struct btrfs_workqueue *endio_meta_workers; struct btrfs_workqueue *endio_raid56_workers; + struct btrfs_workqueue *endio_repair_workers; struct btrfs_workqueue *rmw_workers; struct btrfs_workqueue *endio_meta_write_workers; struct btrfs_workqueue *endio_write_workers; @@ -1574,6 +1573,7 @@ struct btrfs_fs_info { int do_barriers; int closing; int log_root_recovering; + int open; u64 total_pinned; @@ -1723,6 +1723,12 @@ struct btrfs_fs_info { /* Used to reclaim the metadata space in the background. */ struct work_struct async_reclaim_work; + + spinlock_t unused_bgs_lock; + struct list_head unused_bgs; + + /* For btrfs to record security options */ + struct security_mnt_opts security_opts; }; struct btrfs_subvolume_writers { @@ -1776,12 +1782,12 @@ struct btrfs_root { /* free ino cache stuff */ struct btrfs_free_space_ctl *free_ino_ctl; - enum btrfs_caching_type cached; - spinlock_t cache_lock; - wait_queue_head_t cache_wait; + enum btrfs_caching_type ino_cache_state; + spinlock_t ino_cache_lock; + wait_queue_head_t ino_cache_wait; struct btrfs_free_space_ctl *free_ino_pinned; - u64 cache_progress; - struct inode *cache_inode; + u64 ino_cache_progress; + struct inode *ino_cache_inode; struct mutex log_mutex; wait_queue_head_t log_writer_wait; @@ -1806,18 +1812,14 @@ struct btrfs_root { /* node allocations are done in nodesize units */ u32 nodesize; - /* leaf allocations are done in leafsize units */ - u32 leafsize; - u32 stripesize; u32 type; u64 highest_objectid; -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + /* only used with CONFIG_BTRFS_FS_RUN_SANITY_TESTS is enabled */ u64 alloc_bytenr; -#endif u64 defrag_trans_start; struct btrfs_key defrag_progress; @@ -2094,6 +2096,7 @@ struct btrfs_ioctl_defrag_range_args { #define BTRFS_MOUNT_CHANGE_INODE_CACHE (1 << 24) #define BTRFS_DEFAULT_COMMIT_INTERVAL (30) +#define BTRFS_DEFAULT_MAX_INLINE (8192) #define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt) #define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt) @@ -2995,8 +2998,6 @@ BTRFS_SETGET_STACK_FUNCS(super_sectorsize, struct btrfs_super_block, sectorsize, 32); BTRFS_SETGET_STACK_FUNCS(super_nodesize, struct btrfs_super_block, nodesize, 32); -BTRFS_SETGET_STACK_FUNCS(super_leafsize, struct btrfs_super_block, - leafsize, 32); BTRFS_SETGET_STACK_FUNCS(super_stripesize, struct btrfs_super_block, stripesize, 32); BTRFS_SETGET_STACK_FUNCS(super_root_dir, struct btrfs_super_block, @@ -3049,14 +3050,12 @@ BTRFS_SETGET_STACK_FUNCS(stack_file_extent_compression, static inline unsigned long btrfs_file_extent_inline_start(struct btrfs_file_extent_item *e) { - unsigned long offset = (unsigned long)e; - offset += offsetof(struct btrfs_file_extent_item, disk_bytenr); - return offset; + return (unsigned long)e + BTRFS_FILE_EXTENT_INLINE_DATA_START; } static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize) { - return offsetof(struct btrfs_file_extent_item, disk_bytenr) + datasize; + return BTRFS_FILE_EXTENT_INLINE_DATA_START + datasize; } BTRFS_SETGET_FUNCS(file_extent_disk_bytenr, struct btrfs_file_extent_item, @@ -3086,9 +3085,7 @@ BTRFS_SETGET_FUNCS(file_extent_other_encoding, struct btrfs_file_extent_item, static inline u32 btrfs_file_extent_inline_item_len(struct extent_buffer *eb, struct btrfs_item *e) { - unsigned long offset; - offset = offsetof(struct btrfs_file_extent_item, disk_bytenr); - return btrfs_item_size(eb, e) - offset; + return btrfs_item_size(eb, e) - BTRFS_FILE_EXTENT_INLINE_DATA_START; } /* this returns the number of file bytes represented by the inline item. @@ -3232,13 +3229,6 @@ static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb) return sb->s_fs_info; } -static inline u32 btrfs_level_size(struct btrfs_root *root, int level) -{ - if (level == 0) - return root->leafsize; - return root->nodesize; -} - /* helper function to cast into the data area of the leaf. */ #define btrfs_item_ptr(leaf, slot, type) \ ((type *)(btrfs_leaf_data(leaf) + \ @@ -3263,7 +3253,7 @@ static inline gfp_t btrfs_alloc_write_mask(struct address_space *mapping) static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root, unsigned num_items) { - return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) * + return (root->nodesize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) * 2 * num_items; } @@ -3274,8 +3264,7 @@ static inline u64 btrfs_calc_trans_metadata_size(struct btrfs_root *root, static inline u64 btrfs_calc_trunc_metadata_size(struct btrfs_root *root, unsigned num_items) { - return (root->leafsize + root->nodesize * (BTRFS_MAX_LEVEL - 1)) * - num_items; + return root->nodesize * BTRFS_MAX_LEVEL * num_items; } int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans, @@ -3305,9 +3294,9 @@ struct btrfs_block_group_cache *btrfs_lookup_block_group( u64 bytenr); void btrfs_put_block_group(struct btrfs_block_group_cache *cache); int get_block_group_index(struct btrfs_block_group_cache *cache); -struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u32 blocksize, - u64 parent, u64 root_objectid, +struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 parent, + u64 root_objectid, struct btrfs_disk_key *key, int level, u64 hint, u64 empty_size); void btrfs_free_tree_block(struct btrfs_trans_handle *trans, @@ -3363,6 +3352,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 size); int btrfs_remove_block_group(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 group_start); +void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info); void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, struct btrfs_root *root); u64 btrfs_get_alloc_profile(struct btrfs_root *root, int data); @@ -3604,6 +3594,7 @@ static inline void free_fs_info(struct btrfs_fs_info *fs_info) kfree(fs_info->uuid_root); kfree(fs_info->super_copy); kfree(fs_info->super_for_commit); + security_free_mnt_opts(&fs_info->security_opts); kfree(fs_info); } @@ -3739,8 +3730,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, struct bio *bio, u32 *dst); int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode, - struct btrfs_dio_private *dip, struct bio *bio, - u64 logical_offset); + struct bio *bio, u64 logical_offset); int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 objectid, u64 pos, @@ -4141,8 +4131,15 @@ static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info) /* Sanity test specific functions */ #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS void btrfs_test_destroy_inode(struct inode *inode); -int btrfs_verify_qgroup_counts(struct btrfs_fs_info *fs_info, u64 qgroupid, - u64 rfer, u64 excl); #endif +static inline int btrfs_test_is_dummy_root(struct btrfs_root *root) +{ +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS + if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) + return 1; +#endif + return 0; +} + #endif diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index a2e90f855d7d..054577bddaf2 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -1042,7 +1042,7 @@ static int __btrfs_update_delayed_inode(struct btrfs_trans_handle *trans, int ret; key.objectid = node->inode_id; - btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); + key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; if (test_bit(BTRFS_DELAYED_NODE_DEL_IREF, &node->flags)) @@ -1099,7 +1099,7 @@ err_out: search: btrfs_release_path(path); - btrfs_set_key_type(&key, BTRFS_INODE_EXTREF_KEY); + key.type = BTRFS_INODE_EXTREF_KEY; key.offset = -1; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret < 0) @@ -1473,7 +1473,7 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans, } delayed_item->key.objectid = btrfs_ino(dir); - btrfs_set_key_type(&delayed_item->key, BTRFS_DIR_INDEX_KEY); + delayed_item->key.type = BTRFS_DIR_INDEX_KEY; delayed_item->key.offset = index; dir_item = (struct btrfs_dir_item *)delayed_item->data; @@ -1542,7 +1542,7 @@ int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans, return PTR_ERR(node); item_key.objectid = btrfs_ino(dir); - btrfs_set_key_type(&item_key, BTRFS_DIR_INDEX_KEY); + item_key.type = BTRFS_DIR_INDEX_KEY; item_key.offset = index; ret = btrfs_delete_delayed_insertion_item(root, node, &item_key); diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index eea26e1b2fda..6f662b34ba0e 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -168,8 +168,12 @@ no_valid_dev_replace_entry_found: dev_replace->srcdev->total_bytes; dev_replace->tgtdev->disk_total_bytes = dev_replace->srcdev->disk_total_bytes; + dev_replace->tgtdev->commit_total_bytes = + dev_replace->srcdev->commit_total_bytes; dev_replace->tgtdev->bytes_used = dev_replace->srcdev->bytes_used; + dev_replace->tgtdev->commit_bytes_used = + dev_replace->srcdev->commit_bytes_used; } dev_replace->tgtdev->is_tgtdev_for_dev_replace = 1; btrfs_init_dev_replace_tgtdev_for_resume(fs_info, @@ -329,30 +333,34 @@ int btrfs_dev_replace_start(struct btrfs_root *root, args->start.tgtdev_name[0] == '\0') return -EINVAL; - mutex_lock(&fs_info->volume_mutex); - ret = btrfs_init_dev_replace_tgtdev(root, args->start.tgtdev_name, - &tgt_device); - if (ret) { - btrfs_err(fs_info, "target device %s is invalid!", - args->start.tgtdev_name); - mutex_unlock(&fs_info->volume_mutex); - return -EINVAL; + /* + * Here we commit the transaction to make sure commit_total_bytes + * of all the devices are updated. + */ + trans = btrfs_attach_transaction(root); + if (!IS_ERR(trans)) { + ret = btrfs_commit_transaction(trans, root); + if (ret) + return ret; + } else if (PTR_ERR(trans) != -ENOENT) { + return PTR_ERR(trans); } + /* the disk copy procedure reuses the scrub code */ + mutex_lock(&fs_info->volume_mutex); ret = btrfs_dev_replace_find_srcdev(root, args->start.srcdevid, args->start.srcdev_name, &src_device); - mutex_unlock(&fs_info->volume_mutex); if (ret) { - ret = -EINVAL; - goto leave_no_lock; + mutex_unlock(&fs_info->volume_mutex); + return ret; } - if (tgt_device->total_bytes < src_device->total_bytes) { - btrfs_err(fs_info, "target device is smaller than source device!"); - ret = -EINVAL; - goto leave_no_lock; - } + ret = btrfs_init_dev_replace_tgtdev(root, args->start.tgtdev_name, + src_device, &tgt_device); + mutex_unlock(&fs_info->volume_mutex); + if (ret) + return ret; btrfs_dev_replace_lock(dev_replace); switch (dev_replace->replace_state) { @@ -380,10 +388,6 @@ int btrfs_dev_replace_start(struct btrfs_root *root, src_device->devid, rcu_str_deref(tgt_device->name)); - tgt_device->total_bytes = src_device->total_bytes; - tgt_device->disk_total_bytes = src_device->disk_total_bytes; - tgt_device->bytes_used = src_device->bytes_used; - /* * from now on, the writes to the srcdev are all duplicated to * go to the tgtdev as well (refer to btrfs_map_block()). @@ -414,7 +418,7 @@ int btrfs_dev_replace_start(struct btrfs_root *root, /* the disk copy procedure reuses the scrub code */ ret = btrfs_scrub_dev(fs_info, src_device->devid, 0, - src_device->total_bytes, + btrfs_device_get_total_bytes(src_device), &dev_replace->scrub_progress, 0, 1); ret = btrfs_dev_replace_finishing(root->fs_info, ret); @@ -426,9 +430,7 @@ leave: dev_replace->srcdev = NULL; dev_replace->tgtdev = NULL; btrfs_dev_replace_unlock(dev_replace); -leave_no_lock: - if (tgt_device) - btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); + btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); return ret; } @@ -507,9 +509,10 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, ret = btrfs_commit_transaction(trans, root); WARN_ON(ret); + mutex_lock(&uuid_mutex); /* keep away write_all_supers() during the finishing procedure */ - mutex_lock(&root->fs_info->chunk_mutex); mutex_lock(&root->fs_info->fs_devices->device_list_mutex); + mutex_lock(&root->fs_info->chunk_mutex); btrfs_dev_replace_lock(dev_replace); dev_replace->replace_state = scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED @@ -532,8 +535,9 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, src_device->devid, rcu_str_deref(tgt_device->name), scrub_ret); btrfs_dev_replace_unlock(dev_replace); - mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); mutex_unlock(&root->fs_info->chunk_mutex); + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + mutex_unlock(&uuid_mutex); if (tgt_device) btrfs_destroy_dev_replace_tgtdev(fs_info, tgt_device); mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); @@ -542,7 +546,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, } printk_in_rcu(KERN_INFO - "BTRFS: dev_replace from %s (devid %llu) to %s) finished\n", + "BTRFS: dev_replace from %s (devid %llu) to %s finished\n", src_device->missing ? "<missing disk>" : rcu_str_deref(src_device->name), src_device->devid, @@ -550,23 +554,29 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, tgt_device->is_tgtdev_for_dev_replace = 0; tgt_device->devid = src_device->devid; src_device->devid = BTRFS_DEV_REPLACE_DEVID; - tgt_device->bytes_used = src_device->bytes_used; memcpy(uuid_tmp, tgt_device->uuid, sizeof(uuid_tmp)); memcpy(tgt_device->uuid, src_device->uuid, sizeof(tgt_device->uuid)); memcpy(src_device->uuid, uuid_tmp, sizeof(src_device->uuid)); - tgt_device->total_bytes = src_device->total_bytes; - tgt_device->disk_total_bytes = src_device->disk_total_bytes; - tgt_device->bytes_used = src_device->bytes_used; + btrfs_device_set_total_bytes(tgt_device, src_device->total_bytes); + btrfs_device_set_disk_total_bytes(tgt_device, + src_device->disk_total_bytes); + btrfs_device_set_bytes_used(tgt_device, src_device->bytes_used); + ASSERT(list_empty(&src_device->resized_list)); + tgt_device->commit_total_bytes = src_device->commit_total_bytes; + tgt_device->commit_bytes_used = src_device->bytes_used; if (fs_info->sb->s_bdev == src_device->bdev) fs_info->sb->s_bdev = tgt_device->bdev; if (fs_info->fs_devices->latest_bdev == src_device->bdev) fs_info->fs_devices->latest_bdev = tgt_device->bdev; list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list); + fs_info->fs_devices->rw_devices++; /* replace the sysfs entry */ btrfs_kobj_rm_device(fs_info, src_device); btrfs_kobj_add_device(fs_info, tgt_device); + btrfs_dev_replace_unlock(dev_replace); + btrfs_rm_dev_replace_blocked(fs_info); btrfs_rm_dev_replace_srcdev(fs_info, src_device); @@ -580,9 +590,9 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, * superblock is scratched out so that it is no longer marked to * belong to this filesystem. */ - btrfs_dev_replace_unlock(dev_replace); - mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); mutex_unlock(&root->fs_info->chunk_mutex); + mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); + mutex_unlock(&uuid_mutex); /* write back the superblocks */ trans = btrfs_start_transaction(root, 0); @@ -643,6 +653,7 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_dev_replace_args *args) { struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; + struct btrfs_device *srcdev; btrfs_dev_replace_lock(dev_replace); /* even if !dev_replace_is_valid, the values are good enough for @@ -665,8 +676,9 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, break; case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: + srcdev = dev_replace->srcdev; args->status.progress_1000 = div64_u64(dev_replace->cursor_left, - div64_u64(dev_replace->srcdev->total_bytes, 1000)); + div64_u64(btrfs_device_get_total_bytes(srcdev), 1000)); break; } btrfs_dev_replace_unlock(dev_replace); @@ -825,7 +837,7 @@ static int btrfs_dev_replace_continue_on_mount(struct btrfs_fs_info *fs_info) ret = btrfs_scrub_dev(fs_info, dev_replace->srcdev->devid, dev_replace->committed_cursor_left, - dev_replace->srcdev->total_bytes, + btrfs_device_get_total_bytes(dev_replace->srcdev), &dev_replace->scrub_progress, 0, 1); ret = btrfs_dev_replace_finishing(fs_info, ret); WARN_ON(ret); diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index a0691df5dcea..fc8df866e919 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -86,7 +86,7 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans, BUG_ON(name_len + data_len > BTRFS_MAX_XATTR_SIZE(root)); key.objectid = objectid; - btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY); + key.type = BTRFS_XATTR_ITEM_KEY; key.offset = btrfs_name_hash(name, name_len); data_size = sizeof(*dir_item) + name_len + data_len; @@ -137,7 +137,7 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, struct btrfs_root u32 data_size; key.objectid = btrfs_ino(dir); - btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY); + key.type = BTRFS_DIR_ITEM_KEY; key.offset = btrfs_name_hash(name, name_len); path = btrfs_alloc_path(); @@ -204,7 +204,7 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans, int cow = mod != 0; key.objectid = dir; - btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY); + key.type = BTRFS_DIR_ITEM_KEY; key.offset = btrfs_name_hash(name, name_len); @@ -234,7 +234,7 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, return -ENOMEM; key.objectid = dir; - btrfs_set_key_type(&key, BTRFS_DIR_ITEM_KEY); + key.type = BTRFS_DIR_ITEM_KEY; key.offset = btrfs_name_hash(name, name_len); ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); @@ -297,7 +297,7 @@ btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans, int cow = mod != 0; key.objectid = dir; - btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY); + key.type = BTRFS_DIR_INDEX_KEY; key.offset = objectid; ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); @@ -367,7 +367,7 @@ struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans, int cow = mod != 0; key.objectid = dir; - btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY); + key.type = BTRFS_XATTR_ITEM_KEY; key.offset = btrfs_name_hash(name, name_len); ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow); if (ret < 0) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index a1d36e62179c..1ad0f47ac850 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -72,21 +72,41 @@ static int btrfs_cleanup_transaction(struct btrfs_root *root); static void btrfs_error_commit_super(struct btrfs_root *root); /* - * end_io_wq structs are used to do processing in task context when an IO is - * complete. This is used during reads to verify checksums, and it is used + * btrfs_end_io_wq structs are used to do processing in task context when an IO + * is complete. This is used during reads to verify checksums, and it is used * by writes to insert metadata for new file extents after IO is complete. */ -struct end_io_wq { +struct btrfs_end_io_wq { struct bio *bio; bio_end_io_t *end_io; void *private; struct btrfs_fs_info *info; int error; - int metadata; + enum btrfs_wq_endio_type metadata; struct list_head list; struct btrfs_work work; }; +static struct kmem_cache *btrfs_end_io_wq_cache; + +int __init btrfs_end_io_wq_init(void) +{ + btrfs_end_io_wq_cache = kmem_cache_create("btrfs_end_io_wq", + sizeof(struct btrfs_end_io_wq), + 0, + SLAB_RECLAIM_ACCOUNT | SLAB_MEM_SPREAD, + NULL); + if (!btrfs_end_io_wq_cache) + return -ENOMEM; + return 0; +} + +void btrfs_end_io_wq_exit(void) +{ + if (btrfs_end_io_wq_cache) + kmem_cache_destroy(btrfs_end_io_wq_cache); +} + /* * async submit bios are used to offload expensive checksumming * onto the worker threads. They checksum file and metadata bios @@ -327,8 +347,7 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, { struct extent_state *cached_state = NULL; int ret; - bool need_lock = (current->journal_info == - (void *)BTRFS_SEND_TRANS_STUB); + bool need_lock = (current->journal_info == BTRFS_SEND_TRANS_STUB); if (!parent_transid || btrfs_header_generation(eb) == parent_transid) return 0; @@ -348,9 +367,9 @@ static int verify_parent_transid(struct extent_io_tree *io_tree, ret = 0; goto out; } - printk_ratelimited("parent transid verify failed on %llu wanted %llu " - "found %llu\n", - eb->start, parent_transid, btrfs_header_generation(eb)); + printk_ratelimited(KERN_INFO "BTRFS (device %s): parent transid verify failed on %llu wanted %llu found %llu\n", + eb->fs_info->sb->s_id, eb->start, + parent_transid, btrfs_header_generation(eb)); ret = 1; /* @@ -607,22 +626,22 @@ static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, goto err; eb->read_mirror = mirror; - if (test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) { + if (test_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags)) { ret = -EIO; goto err; } found_start = btrfs_header_bytenr(eb); if (found_start != eb->start) { - printk_ratelimited(KERN_INFO "BTRFS: bad tree block start " + printk_ratelimited(KERN_INFO "BTRFS (device %s): bad tree block start " "%llu %llu\n", - found_start, eb->start); + eb->fs_info->sb->s_id, found_start, eb->start); ret = -EIO; goto err; } if (check_tree_block_fsid(root, eb)) { - printk_ratelimited(KERN_INFO "BTRFS: bad fsid on block %llu\n", - eb->start); + printk_ratelimited(KERN_INFO "BTRFS (device %s): bad fsid on block %llu\n", + eb->fs_info->sb->s_id, eb->start); ret = -EIO; goto err; } @@ -680,7 +699,7 @@ static int btree_io_failed_hook(struct page *page, int failed_mirror) struct btrfs_root *root = BTRFS_I(page->mapping->host)->root; eb = (struct extent_buffer *)page->private; - set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); + set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); eb->read_mirror = failed_mirror; atomic_dec(&eb->io_pages); if (test_and_clear_bit(EXTENT_BUFFER_READAHEAD, &eb->bflags)) @@ -690,7 +709,7 @@ static int btree_io_failed_hook(struct page *page, int failed_mirror) static void end_workqueue_bio(struct bio *bio, int err) { - struct end_io_wq *end_io_wq = bio->bi_private; + struct btrfs_end_io_wq *end_io_wq = bio->bi_private; struct btrfs_fs_info *fs_info; struct btrfs_workqueue *wq; btrfs_work_func_t func; @@ -713,7 +732,11 @@ static void end_workqueue_bio(struct bio *bio, int err) func = btrfs_endio_write_helper; } } else { - if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) { + if (unlikely(end_io_wq->metadata == + BTRFS_WQ_ENDIO_DIO_REPAIR)) { + wq = fs_info->endio_repair_workers; + func = btrfs_endio_repair_helper; + } else if (end_io_wq->metadata == BTRFS_WQ_ENDIO_RAID56) { wq = fs_info->endio_raid56_workers; func = btrfs_endio_raid56_helper; } else if (end_io_wq->metadata) { @@ -729,19 +752,12 @@ static void end_workqueue_bio(struct bio *bio, int err) btrfs_queue_work(wq, &end_io_wq->work); } -/* - * For the metadata arg you want - * - * 0 - if data - * 1 - if normal metadta - * 2 - if writing to the free space cache area - * 3 - raid parity work - */ int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, - int metadata) + enum btrfs_wq_endio_type metadata) { - struct end_io_wq *end_io_wq; - end_io_wq = kmalloc(sizeof(*end_io_wq), GFP_NOFS); + struct btrfs_end_io_wq *end_io_wq; + + end_io_wq = kmem_cache_alloc(btrfs_end_io_wq_cache, GFP_NOFS); if (!end_io_wq) return -ENOMEM; @@ -925,7 +941,7 @@ static int btree_submit_bio_hook(struct inode *inode, int rw, struct bio *bio, * can happen in the async kernel threads */ ret = btrfs_bio_wq_end_io(BTRFS_I(inode)->root->fs_info, - bio, 1); + bio, BTRFS_WQ_ENDIO_METADATA); if (ret) goto out_w_error; ret = btrfs_map_bio(BTRFS_I(inode)->root, rw, bio, @@ -1057,20 +1073,17 @@ static const struct address_space_operations btree_aops = { .set_page_dirty = btree_set_page_dirty, }; -int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, - u64 parent_transid) +void readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize) { struct extent_buffer *buf = NULL; struct inode *btree_inode = root->fs_info->btree_inode; - int ret = 0; buf = btrfs_find_create_tree_block(root, bytenr, blocksize); if (!buf) - return 0; + return; read_extent_buffer_pages(&BTRFS_I(btree_inode)->io_tree, buf, 0, WAIT_NONE, btree_get_extent, 0); free_extent_buffer(buf); - return ret; } int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize, @@ -1106,7 +1119,7 @@ int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize, } struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, - u64 bytenr, u32 blocksize) + u64 bytenr) { return find_extent_buffer(root->fs_info, bytenr); } @@ -1114,11 +1127,9 @@ struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize) { -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS - if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) + if (btrfs_test_is_dummy_root(root)) return alloc_test_extent_buffer(root->fs_info, bytenr, blocksize); -#endif return alloc_extent_buffer(root->fs_info, bytenr, blocksize); } @@ -1136,12 +1147,12 @@ int btrfs_wait_tree_block_writeback(struct extent_buffer *buf) } struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, - u32 blocksize, u64 parent_transid) + u64 parent_transid) { struct extent_buffer *buf = NULL; int ret; - buf = btrfs_find_create_tree_block(root, bytenr, blocksize); + buf = btrfs_find_create_tree_block(root, bytenr, root->nodesize); if (!buf) return NULL; @@ -1183,7 +1194,7 @@ static struct btrfs_subvolume_writers *btrfs_alloc_subvolume_writers(void) if (!writers) return ERR_PTR(-ENOMEM); - ret = percpu_counter_init(&writers->counter, 0); + ret = percpu_counter_init(&writers->counter, 0, GFP_KERNEL); if (ret < 0) { kfree(writers); return ERR_PTR(ret); @@ -1200,16 +1211,14 @@ btrfs_free_subvolume_writers(struct btrfs_subvolume_writers *writers) kfree(writers); } -static void __setup_root(u32 nodesize, u32 leafsize, u32 sectorsize, - u32 stripesize, struct btrfs_root *root, - struct btrfs_fs_info *fs_info, +static void __setup_root(u32 nodesize, u32 sectorsize, u32 stripesize, + struct btrfs_root *root, struct btrfs_fs_info *fs_info, u64 objectid) { root->node = NULL; root->commit_root = NULL; root->sectorsize = sectorsize; root->nodesize = nodesize; - root->leafsize = leafsize; root->stripesize = stripesize; root->state = 0; root->orphan_cleanup_state = 0; @@ -1295,7 +1304,7 @@ struct btrfs_root *btrfs_alloc_dummy_root(void) root = btrfs_alloc_root(NULL); if (!root) return ERR_PTR(-ENOMEM); - __setup_root(4096, 4096, 4096, 4096, root, NULL, 1); + __setup_root(4096, 4096, 4096, root, NULL, 1); set_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state); root->alloc_bytenr = 0; @@ -1318,15 +1327,13 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, if (!root) return ERR_PTR(-ENOMEM); - __setup_root(tree_root->nodesize, tree_root->leafsize, - tree_root->sectorsize, tree_root->stripesize, - root, fs_info, objectid); + __setup_root(tree_root->nodesize, tree_root->sectorsize, + tree_root->stripesize, root, fs_info, objectid); root->root_key.objectid = objectid; root->root_key.type = BTRFS_ROOT_ITEM_KEY; root->root_key.offset = 0; - leaf = btrfs_alloc_free_block(trans, root, root->leafsize, - 0, objectid, NULL, 0, 0, 0); + leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0); if (IS_ERR(leaf)) { ret = PTR_ERR(leaf); leaf = NULL; @@ -1396,9 +1403,9 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, if (!root) return ERR_PTR(-ENOMEM); - __setup_root(tree_root->nodesize, tree_root->leafsize, - tree_root->sectorsize, tree_root->stripesize, - root, fs_info, BTRFS_TREE_LOG_OBJECTID); + __setup_root(tree_root->nodesize, tree_root->sectorsize, + tree_root->stripesize, root, fs_info, + BTRFS_TREE_LOG_OBJECTID); root->root_key.objectid = BTRFS_TREE_LOG_OBJECTID; root->root_key.type = BTRFS_ROOT_ITEM_KEY; @@ -1413,9 +1420,8 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, * updated (along with back refs to the log tree). */ - leaf = btrfs_alloc_free_block(trans, root, root->leafsize, 0, - BTRFS_TREE_LOG_OBJECTID, NULL, - 0, 0, 0); + leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID, + NULL, 0, 0, 0); if (IS_ERR(leaf)) { kfree(root); return ERR_CAST(leaf); @@ -1465,7 +1471,7 @@ int btrfs_add_log_tree(struct btrfs_trans_handle *trans, btrfs_set_stack_inode_generation(inode_item, 1); btrfs_set_stack_inode_size(inode_item, 3); btrfs_set_stack_inode_nlink(inode_item, 1); - btrfs_set_stack_inode_nbytes(inode_item, root->leafsize); + btrfs_set_stack_inode_nbytes(inode_item, root->nodesize); btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755); btrfs_set_root_node(&log_root->root_item, log_root->node); @@ -1485,7 +1491,6 @@ static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, struct btrfs_fs_info *fs_info = tree_root->fs_info; struct btrfs_path *path; u64 generation; - u32 blocksize; int ret; path = btrfs_alloc_path(); @@ -1498,9 +1503,8 @@ static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, goto alloc_fail; } - __setup_root(tree_root->nodesize, tree_root->leafsize, - tree_root->sectorsize, tree_root->stripesize, - root, fs_info, key->objectid); + __setup_root(tree_root->nodesize, tree_root->sectorsize, + tree_root->stripesize, root, fs_info, key->objectid); ret = btrfs_find_root(tree_root, key, path, &root->root_item, &root->root_key); @@ -1511,9 +1515,8 @@ static struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, } generation = btrfs_root_generation(&root->root_item); - blocksize = btrfs_level_size(root, btrfs_root_level(&root->root_item)); root->node = read_tree_block(root, btrfs_root_bytenr(&root->root_item), - blocksize, generation); + generation); if (!root->node) { ret = -ENOMEM; goto find_fail; @@ -1573,8 +1576,8 @@ int btrfs_init_fs_root(struct btrfs_root *root) root->subv_writers = writers; btrfs_init_free_ino_ctl(root); - spin_lock_init(&root->cache_lock); - init_waitqueue_head(&root->cache_wait); + spin_lock_init(&root->ino_cache_lock); + init_waitqueue_head(&root->ino_cache_wait); ret = get_anon_bdev(&root->anon_dev); if (ret) @@ -1699,7 +1702,7 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits) if (!device->bdev) continue; bdi = blk_get_backing_dev_info(device->bdev); - if (bdi && bdi_congested(bdi, bdi_bits)) { + if (bdi_congested(bdi, bdi_bits)) { ret = 1; break; } @@ -1708,10 +1711,6 @@ static int btrfs_congested_fn(void *congested_data, int bdi_bits) return ret; } -/* - * If this fails, caller must call bdi_destroy() to get rid of the - * bdi again. - */ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi) { int err; @@ -1734,16 +1733,16 @@ static int setup_bdi(struct btrfs_fs_info *info, struct backing_dev_info *bdi) static void end_workqueue_fn(struct btrfs_work *work) { struct bio *bio; - struct end_io_wq *end_io_wq; + struct btrfs_end_io_wq *end_io_wq; int error; - end_io_wq = container_of(work, struct end_io_wq, work); + end_io_wq = container_of(work, struct btrfs_end_io_wq, work); bio = end_io_wq->bio; error = end_io_wq->error; bio->bi_private = end_io_wq->private; bio->bi_end_io = end_io_wq->end_io; - kfree(end_io_wq); + kmem_cache_free(btrfs_end_io_wq_cache, end_io_wq); bio_endio_nodec(bio, error); } @@ -1772,6 +1771,7 @@ static int cleaner_kthread(void *arg) } btrfs_run_delayed_iputs(root); + btrfs_delete_unused_bgs(root->fs_info); again = btrfs_clean_one_deleted_snapshot(root); mutex_unlock(&root->fs_info->cleaner_mutex); @@ -2063,6 +2063,7 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info) btrfs_destroy_workqueue(fs_info->endio_workers); btrfs_destroy_workqueue(fs_info->endio_meta_workers); btrfs_destroy_workqueue(fs_info->endio_raid56_workers); + btrfs_destroy_workqueue(fs_info->endio_repair_workers); btrfs_destroy_workqueue(fs_info->rmw_workers); btrfs_destroy_workqueue(fs_info->endio_meta_write_workers); btrfs_destroy_workqueue(fs_info->endio_write_workers); @@ -2143,8 +2144,6 @@ int open_ctree(struct super_block *sb, { u32 sectorsize; u32 nodesize; - u32 leafsize; - u32 blocksize; u32 stripesize; u64 generation; u64 features; @@ -2188,7 +2187,7 @@ int open_ctree(struct super_block *sb, goto fail_srcu; } - ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0); + ret = percpu_counter_init(&fs_info->dirty_metadata_bytes, 0, GFP_KERNEL); if (ret) { err = ret; goto fail_bdi; @@ -2196,13 +2195,13 @@ int open_ctree(struct super_block *sb, fs_info->dirty_metadata_batch = PAGE_CACHE_SIZE * (1 + ilog2(nr_cpu_ids)); - ret = percpu_counter_init(&fs_info->delalloc_bytes, 0); + ret = percpu_counter_init(&fs_info->delalloc_bytes, 0, GFP_KERNEL); if (ret) { err = ret; goto fail_dirty_metadata_bytes; } - ret = percpu_counter_init(&fs_info->bio_counter, 0); + ret = percpu_counter_init(&fs_info->bio_counter, 0, GFP_KERNEL); if (ret) { err = ret; goto fail_delalloc_bytes; @@ -2233,6 +2232,7 @@ int open_ctree(struct super_block *sb, spin_lock_init(&fs_info->super_lock); spin_lock_init(&fs_info->qgroup_op_lock); spin_lock_init(&fs_info->buffer_lock); + spin_lock_init(&fs_info->unused_bgs_lock); rwlock_init(&fs_info->tree_mod_log_lock); mutex_init(&fs_info->reloc_mutex); mutex_init(&fs_info->delalloc_root_mutex); @@ -2242,6 +2242,7 @@ int open_ctree(struct super_block *sb, INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); INIT_LIST_HEAD(&fs_info->space_info); INIT_LIST_HEAD(&fs_info->tree_mod_seq_list); + INIT_LIST_HEAD(&fs_info->unused_bgs); btrfs_mapping_init(&fs_info->mapping_tree); btrfs_init_block_rsv(&fs_info->global_block_rsv, BTRFS_BLOCK_RSV_GLOBAL); @@ -2260,7 +2261,7 @@ int open_ctree(struct super_block *sb, atomic_set(&fs_info->qgroup_op_seq, 0); atomic64_set(&fs_info->tree_mod_seq, 0); fs_info->sb = sb; - fs_info->max_inline = 8192 * 1024; + fs_info->max_inline = BTRFS_DEFAULT_MAX_INLINE; fs_info->metadata_ratio = 0; fs_info->defrag_inodes = RB_ROOT; fs_info->free_chunk_space = 0; @@ -2389,7 +2390,7 @@ int open_ctree(struct super_block *sb, goto fail_alloc; } - __setup_root(4096, 4096, 4096, 4096, tree_root, + __setup_root(4096, 4096, 4096, tree_root, fs_info, BTRFS_ROOT_TREE_OBJECTID); invalidate_bdev(fs_devices->latest_bdev); @@ -2469,19 +2470,22 @@ int open_ctree(struct super_block *sb, goto fail_alloc; } - if (btrfs_super_leafsize(disk_super) != + /* + * Leafsize and nodesize were always equal, this is only a sanity check. + */ + if (le32_to_cpu(disk_super->__unused_leafsize) != btrfs_super_nodesize(disk_super)) { printk(KERN_ERR "BTRFS: couldn't mount because metadata " "blocksizes don't match. node %d leaf %d\n", btrfs_super_nodesize(disk_super), - btrfs_super_leafsize(disk_super)); + le32_to_cpu(disk_super->__unused_leafsize)); err = -EINVAL; goto fail_alloc; } - if (btrfs_super_leafsize(disk_super) > BTRFS_MAX_METADATA_BLOCKSIZE) { + if (btrfs_super_nodesize(disk_super) > BTRFS_MAX_METADATA_BLOCKSIZE) { printk(KERN_ERR "BTRFS: couldn't mount because metadata " "blocksize (%d) was too large\n", - btrfs_super_leafsize(disk_super)); + btrfs_super_nodesize(disk_super)); err = -EINVAL; goto fail_alloc; } @@ -2498,17 +2502,16 @@ int open_ctree(struct super_block *sb, * flag our filesystem as having big metadata blocks if * they are bigger than the page size */ - if (btrfs_super_leafsize(disk_super) > PAGE_CACHE_SIZE) { + if (btrfs_super_nodesize(disk_super) > PAGE_CACHE_SIZE) { if (!(features & BTRFS_FEATURE_INCOMPAT_BIG_METADATA)) printk(KERN_INFO "BTRFS: flagging fs with big metadata feature\n"); features |= BTRFS_FEATURE_INCOMPAT_BIG_METADATA; } nodesize = btrfs_super_nodesize(disk_super); - leafsize = btrfs_super_leafsize(disk_super); sectorsize = btrfs_super_sectorsize(disk_super); stripesize = btrfs_super_stripesize(disk_super); - fs_info->dirty_metadata_batch = leafsize * (1 + ilog2(nr_cpu_ids)); + fs_info->dirty_metadata_batch = nodesize * (1 + ilog2(nr_cpu_ids)); fs_info->delalloc_batch = sectorsize * 512 * (1 + ilog2(nr_cpu_ids)); /* @@ -2516,7 +2519,7 @@ int open_ctree(struct super_block *sb, * extent buffers for the same range. It leads to corruptions */ if ((features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) && - (sectorsize != leafsize)) { + (sectorsize != nodesize)) { printk(KERN_WARNING "BTRFS: unequal leaf/node/sector sizes " "are not allowed for mixed block groups on %s\n", sb->s_id); @@ -2579,6 +2582,8 @@ int open_ctree(struct super_block *sb, btrfs_alloc_workqueue("endio-meta-write", flags, max_active, 2); fs_info->endio_raid56_workers = btrfs_alloc_workqueue("endio-raid56", flags, max_active, 4); + fs_info->endio_repair_workers = + btrfs_alloc_workqueue("endio-repair", flags, 1, 0); fs_info->rmw_workers = btrfs_alloc_workqueue("rmw", flags, max_active, 2); fs_info->endio_write_workers = @@ -2600,11 +2605,12 @@ int open_ctree(struct super_block *sb, fs_info->submit_workers && fs_info->flush_workers && fs_info->endio_workers && fs_info->endio_meta_workers && fs_info->endio_meta_write_workers && + fs_info->endio_repair_workers && fs_info->endio_write_workers && fs_info->endio_raid56_workers && fs_info->endio_freespace_worker && fs_info->rmw_workers && fs_info->caching_workers && fs_info->readahead_workers && fs_info->fixup_workers && fs_info->delayed_workers && - fs_info->fixup_workers && fs_info->extent_workers && + fs_info->extent_workers && fs_info->qgroup_rescan_workers)) { err = -ENOMEM; goto fail_sb_buffer; @@ -2615,7 +2621,6 @@ int open_ctree(struct super_block *sb, 4 * 1024 * 1024 / PAGE_CACHE_SIZE); tree_root->nodesize = nodesize; - tree_root->leafsize = leafsize; tree_root->sectorsize = sectorsize; tree_root->stripesize = stripesize; @@ -2642,16 +2647,14 @@ int open_ctree(struct super_block *sb, goto fail_sb_buffer; } - blocksize = btrfs_level_size(tree_root, - btrfs_super_chunk_root_level(disk_super)); generation = btrfs_super_chunk_root_generation(disk_super); - __setup_root(nodesize, leafsize, sectorsize, stripesize, - chunk_root, fs_info, BTRFS_CHUNK_TREE_OBJECTID); + __setup_root(nodesize, sectorsize, stripesize, chunk_root, + fs_info, BTRFS_CHUNK_TREE_OBJECTID); chunk_root->node = read_tree_block(chunk_root, btrfs_super_chunk_root(disk_super), - blocksize, generation); + generation); if (!chunk_root->node || !test_bit(EXTENT_BUFFER_UPTODATE, &chunk_root->node->bflags)) { printk(KERN_WARNING "BTRFS: failed to read chunk root on %s\n", @@ -2684,13 +2687,11 @@ int open_ctree(struct super_block *sb, } retry_root_backup: - blocksize = btrfs_level_size(tree_root, - btrfs_super_root_level(disk_super)); generation = btrfs_super_generation(disk_super); tree_root->node = read_tree_block(tree_root, btrfs_super_root(disk_super), - blocksize, generation); + generation); if (!tree_root->node || !test_bit(EXTENT_BUFFER_UPTODATE, &tree_root->node->bflags)) { printk(KERN_WARNING "BTRFS: failed to read tree root on %s\n", @@ -2859,9 +2860,6 @@ retry_root_backup: err = -EIO; goto fail_qgroup; } - blocksize = - btrfs_level_size(tree_root, - btrfs_super_log_root_level(disk_super)); log_tree_root = btrfs_alloc_root(fs_info); if (!log_tree_root) { @@ -2869,11 +2867,10 @@ retry_root_backup: goto fail_qgroup; } - __setup_root(nodesize, leafsize, sectorsize, stripesize, + __setup_root(nodesize, sectorsize, stripesize, log_tree_root, fs_info, BTRFS_TREE_LOG_OBJECTID); log_tree_root->node = read_tree_block(tree_root, bytenr, - blocksize, generation + 1); if (!log_tree_root->node || !extent_buffer_uptodate(log_tree_root->node)) { @@ -2980,6 +2977,8 @@ retry_root_backup: fs_info->update_uuid_tree_gen = 1; } + fs_info->open = 1; + return 0; fail_qgroup: @@ -3139,7 +3138,8 @@ static int write_dev_supers(struct btrfs_device *device, for (i = 0; i < max_mirrors; i++) { bytenr = btrfs_sb_offset(i); - if (bytenr + BTRFS_SUPER_INFO_SIZE >= device->total_bytes) + if (bytenr + BTRFS_SUPER_INFO_SIZE >= + device->commit_total_bytes) break; if (wait) { @@ -3456,8 +3456,9 @@ static int write_all_supers(struct btrfs_root *root, int max_mirrors) btrfs_set_stack_device_type(dev_item, dev->type); btrfs_set_stack_device_id(dev_item, dev->devid); btrfs_set_stack_device_total_bytes(dev_item, - dev->disk_total_bytes); - btrfs_set_stack_device_bytes_used(dev_item, dev->bytes_used); + dev->commit_total_bytes); + btrfs_set_stack_device_bytes_used(dev_item, + dev->commit_bytes_used); btrfs_set_stack_device_io_align(dev_item, dev->io_align); btrfs_set_stack_device_io_width(dev_item, dev->io_width); btrfs_set_stack_device_sector_size(dev_item, dev->sector_size); @@ -3532,7 +3533,7 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, static void free_fs_root(struct btrfs_root *root) { - iput(root->cache_inode); + iput(root->ino_cache_inode); WARN_ON(!RB_EMPTY_ROOT(&root->inode_tree)); btrfs_free_block_rsv(root, root->orphan_block_rsv); root->orphan_block_rsv = NULL; @@ -3623,7 +3624,7 @@ int btrfs_commit_super(struct btrfs_root *root) return btrfs_commit_transaction(trans, root); } -int close_ctree(struct btrfs_root *root) +void close_ctree(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; int ret; @@ -3689,6 +3690,7 @@ int close_ctree(struct btrfs_root *root) invalidate_inode_pages2(fs_info->btree_inode->i_mapping); btrfs_stop_all_workers(fs_info); + fs_info->open = 0; free_root_pointers(fs_info, 1); iput(fs_info->btree_inode); @@ -3711,8 +3713,6 @@ int close_ctree(struct btrfs_root *root) btrfs_free_block_rsv(root, root->orphan_block_rsv); root->orphan_block_rsv = NULL; - - return 0; } int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, @@ -3814,10 +3814,73 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid) static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, int read_only) { + struct btrfs_super_block *sb = fs_info->super_copy; + int ret = 0; + + if (sb->root_level > BTRFS_MAX_LEVEL) { + printk(KERN_ERR "BTRFS: tree_root level too big: %d > %d\n", + sb->root_level, BTRFS_MAX_LEVEL); + ret = -EINVAL; + } + if (sb->chunk_root_level > BTRFS_MAX_LEVEL) { + printk(KERN_ERR "BTRFS: chunk_root level too big: %d > %d\n", + sb->chunk_root_level, BTRFS_MAX_LEVEL); + ret = -EINVAL; + } + if (sb->log_root_level > BTRFS_MAX_LEVEL) { + printk(KERN_ERR "BTRFS: log_root level too big: %d > %d\n", + sb->log_root_level, BTRFS_MAX_LEVEL); + ret = -EINVAL; + } + /* - * Placeholder for checks + * The common minimum, we don't know if we can trust the nodesize/sectorsize + * items yet, they'll be verified later. Issue just a warning. */ - return 0; + if (!IS_ALIGNED(sb->root, 4096)) + printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n", + sb->root); + if (!IS_ALIGNED(sb->chunk_root, 4096)) + printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n", + sb->chunk_root); + if (!IS_ALIGNED(sb->log_root, 4096)) + printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n", + sb->log_root); + + if (memcmp(fs_info->fsid, sb->dev_item.fsid, BTRFS_UUID_SIZE) != 0) { + printk(KERN_ERR "BTRFS: dev_item UUID does not match fsid: %pU != %pU\n", + fs_info->fsid, sb->dev_item.fsid); + ret = -EINVAL; + } + + /* + * Hint to catch really bogus numbers, bitflips or so, more exact checks are + * done later + */ + if (sb->num_devices > (1UL << 31)) + printk(KERN_WARNING "BTRFS: suspicious number of devices: %llu\n", + sb->num_devices); + + if (sb->bytenr != BTRFS_SUPER_INFO_OFFSET) { + printk(KERN_ERR "BTRFS: super offset mismatch %llu != %u\n", + sb->bytenr, BTRFS_SUPER_INFO_OFFSET); + ret = -EINVAL; + } + + /* + * The generation is a global counter, we'll trust it more than the others + * but it's still possible that it's the one that's wrong. + */ + if (sb->generation < sb->chunk_root_generation) + printk(KERN_WARNING + "BTRFS: suspicious: generation < chunk_root_generation: %llu < %llu\n", + sb->generation, sb->chunk_root_generation); + if (sb->generation < sb->cache_generation && sb->cache_generation != (u64)-1) + printk(KERN_WARNING + "BTRFS: suspicious: generation < cache_generation: %llu < %llu\n", + sb->generation, sb->cache_generation); + + return ret; } static void btrfs_error_commit_super(struct btrfs_root *root) @@ -4009,9 +4072,8 @@ static int btrfs_destroy_marked_extents(struct btrfs_root *root, clear_extent_bits(dirty_pages, start, end, mark, GFP_NOFS); while (start <= end) { - eb = btrfs_find_tree_block(root, start, - root->leafsize); - start += root->leafsize; + eb = btrfs_find_tree_block(root, start); + start += root->nodesize; if (!eb) continue; wait_on_extent_buffer_writeback(eb); diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 23ce3ceba0a9..414651821fb3 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -25,11 +25,12 @@ #define BTRFS_SUPER_MIRROR_MAX 3 #define BTRFS_SUPER_MIRROR_SHIFT 12 -enum { +enum btrfs_wq_endio_type { BTRFS_WQ_ENDIO_DATA = 0, BTRFS_WQ_ENDIO_METADATA = 1, BTRFS_WQ_ENDIO_FREE_SPACE = 2, BTRFS_WQ_ENDIO_RAID56 = 3, + BTRFS_WQ_ENDIO_DIO_REPAIR = 4, }; static inline u64 btrfs_sb_offset(int mirror) @@ -44,9 +45,8 @@ struct btrfs_device; struct btrfs_fs_devices; struct extent_buffer *read_tree_block(struct btrfs_root *root, u64 bytenr, - u32 blocksize, u64 parent_transid); -int readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize, - u64 parent_transid); + u64 parent_transid); +void readahead_tree_block(struct btrfs_root *root, u64 bytenr, u32 blocksize); int reada_tree_block_flagged(struct btrfs_root *root, u64 bytenr, u32 blocksize, int mirror_num, struct extent_buffer **eb); struct extent_buffer *btrfs_find_create_tree_block(struct btrfs_root *root, @@ -56,13 +56,13 @@ void clean_tree_block(struct btrfs_trans_handle *trans, int open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_devices, char *options); -int close_ctree(struct btrfs_root *root); +void close_ctree(struct btrfs_root *root); int write_ctree_super(struct btrfs_trans_handle *trans, struct btrfs_root *root, int max_mirrors); struct buffer_head *btrfs_read_dev_super(struct block_device *bdev); int btrfs_commit_super(struct btrfs_root *root); struct extent_buffer *btrfs_find_tree_block(struct btrfs_root *root, - u64 bytenr, u32 blocksize); + u64 bytenr); struct btrfs_root *btrfs_read_fs_root(struct btrfs_root *tree_root, struct btrfs_key *location); int btrfs_init_fs_root(struct btrfs_root *root); @@ -119,7 +119,7 @@ int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid); u32 btrfs_csum_data(char *data, u32 seed, size_t len); void btrfs_csum_final(u32 crc, char *result); int btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio, - int metadata); + enum btrfs_wq_endio_type metadata); int btrfs_wq_submit_bio(struct btrfs_fs_info *fs_info, struct inode *inode, int rw, struct bio *bio, int mirror_num, unsigned long bio_flags, u64 bio_offset, @@ -141,6 +141,8 @@ int btree_lock_page_hook(struct page *page, void *data, void (*flush_fn)(void *)); int btrfs_calc_num_tolerated_disk_barrier_failures( struct btrfs_fs_info *fs_info); +int __init btrfs_end_io_wq_init(void); +void btrfs_end_io_wq_exit(void); #ifdef CONFIG_DEBUG_LOCK_ALLOC void btrfs_init_lockdep(void); diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index 41422a3de8ed..37d164540c3a 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -70,7 +70,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, return ERR_PTR(-ESTALE); key.objectid = root_objectid; - btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); + key.type = BTRFS_ROOT_ITEM_KEY; key.offset = (u64)-1; index = srcu_read_lock(&fs_info->subvol_srcu); @@ -82,7 +82,7 @@ static struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, } key.objectid = objectid; - btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); + key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; inode = btrfs_iget(sb, &key, root, NULL); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 3efe1c3877bf..d56589571012 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -491,7 +491,7 @@ next: key.objectid); if (key.type == BTRFS_METADATA_ITEM_KEY) last = key.objectid + - fs_info->tree_root->leafsize; + fs_info->tree_root->nodesize; else last = key.objectid + key.offset; @@ -765,7 +765,7 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, * different */ if (metadata && !btrfs_fs_incompat(root->fs_info, SKINNY_METADATA)) { - offset = root->leafsize; + offset = root->nodesize; metadata = 0; } @@ -799,13 +799,13 @@ again: path->slots[0]); if (key.objectid == bytenr && key.type == BTRFS_EXTENT_ITEM_KEY && - key.offset == root->leafsize) + key.offset == root->nodesize) ret = 0; } if (ret) { key.objectid = bytenr; key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = root->leafsize; + key.offset = root->nodesize; btrfs_release_path(path); goto again; } @@ -2651,7 +2651,7 @@ int btrfs_check_space_for_delayed_refs(struct btrfs_trans_handle *trans, num_bytes = btrfs_calc_trans_metadata_size(root, 1); num_heads = heads_to_leaves(root, num_heads); if (num_heads > 1) - num_bytes += (num_heads - 1) * root->leafsize; + num_bytes += (num_heads - 1) * root->nodesize; num_bytes <<= 1; global_rsv = &root->fs_info->global_block_rsv; @@ -3073,10 +3073,10 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, int (*process_func)(struct btrfs_trans_handle *, struct btrfs_root *, u64, u64, u64, u64, u64, u64, int); -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS - if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) + + if (btrfs_test_is_dummy_root(root)) return 0; -#endif + ref_root = btrfs_header_owner(buf); nritems = btrfs_header_nritems(buf); level = btrfs_header_level(buf); @@ -3097,7 +3097,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, for (i = 0; i < nritems; i++) { if (level == 0) { btrfs_item_key_to_cpu(buf, &key, i); - if (btrfs_key_type(&key) != BTRFS_EXTENT_DATA_KEY) + if (key.type != BTRFS_EXTENT_DATA_KEY) continue; fi = btrfs_item_ptr(buf, i, struct btrfs_file_extent_item); @@ -3117,7 +3117,7 @@ static int __btrfs_mod_ref(struct btrfs_trans_handle *trans, goto fail; } else { bytenr = btrfs_node_blockptr(buf, i); - num_bytes = btrfs_level_size(root, level - 1); + num_bytes = root->nodesize; ret = process_func(trans, root, bytenr, num_bytes, parent, ref_root, level - 1, 0, 1); @@ -3494,7 +3494,7 @@ static int update_space_info(struct btrfs_fs_info *info, u64 flags, if (!found) return -ENOMEM; - ret = percpu_counter_init(&found->total_bytes_pinned, 0); + ret = percpu_counter_init(&found->total_bytes_pinned, 0, GFP_KERNEL); if (ret) { kfree(found); return ret; @@ -4343,11 +4343,21 @@ static inline int need_do_async_reclaim(struct btrfs_space_info *space_info, } static int btrfs_need_do_async_reclaim(struct btrfs_space_info *space_info, - struct btrfs_fs_info *fs_info) + struct btrfs_fs_info *fs_info, + int flush_state) { u64 used; spin_lock(&space_info->lock); + /* + * We run out of space and have not got any free space via flush_space, + * so don't bother doing async reclaim. + */ + if (flush_state > COMMIT_TRANS && space_info->full) { + spin_unlock(&space_info->lock); + return 0; + } + used = space_info->bytes_used + space_info->bytes_reserved + space_info->bytes_pinned + space_info->bytes_readonly + space_info->bytes_may_use; @@ -4380,11 +4390,12 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) flush_space(fs_info->fs_root, space_info, to_reclaim, to_reclaim, flush_state); flush_state++; - if (!btrfs_need_do_async_reclaim(space_info, fs_info)) + if (!btrfs_need_do_async_reclaim(space_info, fs_info, + flush_state)) return; } while (flush_state <= COMMIT_TRANS); - if (btrfs_need_do_async_reclaim(space_info, fs_info)) + if (btrfs_need_do_async_reclaim(space_info, fs_info, flush_state)) queue_work(system_unbound_wq, work); } @@ -4502,7 +4513,13 @@ again: space_info->flush = 1; } else if (!ret && space_info->flags & BTRFS_BLOCK_GROUP_METADATA) { used += orig_bytes; - if (need_do_async_reclaim(space_info, root->fs_info, used) && + /* + * We will do the space reservation dance during log replay, + * which means we won't have fs_info->fs_root set, so don't do + * the async reclaim as we will panic. + */ + if (!root->fs_info->log_root_recovering && + need_do_async_reclaim(space_info, root->fs_info, used) && !work_busy(&root->fs_info->async_reclaim_work)) queue_work(system_unbound_wq, &root->fs_info->async_reclaim_work); @@ -4839,7 +4856,7 @@ static u64 calc_global_metadata_size(struct btrfs_fs_info *fs_info) if (num_bytes * 3 > meta_used) num_bytes = div64_u64(meta_used, 3); - return ALIGN(num_bytes, fs_info->extent_root->leafsize << 10); + return ALIGN(num_bytes, fs_info->extent_root->nodesize << 10); } static void update_global_block_rsv(struct btrfs_fs_info *fs_info) @@ -4988,7 +5005,7 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, if (root->fs_info->quota_enabled) { /* One for parent inode, two for dir entries */ - num_bytes = 3 * root->leafsize; + num_bytes = 3 * root->nodesize; ret = btrfs_qgroup_reserve(root, num_bytes); if (ret) return ret; @@ -5176,7 +5193,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) if (root->fs_info->quota_enabled) { ret = btrfs_qgroup_reserve(root, num_bytes + - nr_extents * root->leafsize); + nr_extents * root->nodesize); if (ret) goto out_fail; } @@ -5185,7 +5202,7 @@ int btrfs_delalloc_reserve_metadata(struct inode *inode, u64 num_bytes) if (unlikely(ret)) { if (root->fs_info->quota_enabled) btrfs_qgroup_free(root, num_bytes + - nr_extents * root->leafsize); + nr_extents * root->nodesize); goto out_fail; } @@ -5301,7 +5318,7 @@ void btrfs_delalloc_release_metadata(struct inode *inode, u64 num_bytes) btrfs_ino(inode), to_free, 0); if (root->fs_info->quota_enabled) { btrfs_qgroup_free(root, num_bytes + - dropped * root->leafsize); + dropped * root->nodesize); } btrfs_block_rsv_release(root, &root->fs_info->delalloc_block_rsv, @@ -5422,6 +5439,20 @@ static int update_block_group(struct btrfs_root *root, spin_unlock(&cache->space_info->lock); } else { old_val -= num_bytes; + + /* + * No longer have used bytes in this block group, queue + * it for deletion. + */ + if (old_val == 0) { + spin_lock(&info->unused_bgs_lock); + if (list_empty(&cache->bg_list)) { + btrfs_get_block_group(cache); + list_add_tail(&cache->bg_list, + &info->unused_bgs); + } + spin_unlock(&info->unused_bgs_lock); + } btrfs_set_block_group_used(&cache->item, old_val); cache->pinned += num_bytes; cache->space_info->bytes_pinned += num_bytes; @@ -6233,10 +6264,9 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, int ret; struct btrfs_fs_info *fs_info = root->fs_info; -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS - if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) + if (btrfs_test_is_dummy_root(root)) return 0; -#endif + add_pinned_bytes(root->fs_info, num_bytes, owner, root_objectid); /* @@ -6263,14 +6293,6 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, return ret; } -static u64 stripe_align(struct btrfs_root *root, - struct btrfs_block_group_cache *cache, - u64 val, u64 num_bytes) -{ - u64 ret = ALIGN(val, root->stripesize); - return ret; -} - /* * when we wait for progress in the block group caching, its because * our allocation attempt failed at least once. So, we must sleep @@ -6464,7 +6486,7 @@ static noinline int find_free_extent(struct btrfs_root *orig_root, bool have_caching_bg = false; WARN_ON(num_bytes < root->sectorsize); - btrfs_set_key_type(ins, BTRFS_EXTENT_ITEM_KEY); + ins->type = BTRFS_EXTENT_ITEM_KEY; ins->objectid = 0; ins->offset = 0; @@ -6751,8 +6773,7 @@ unclustered_alloc: goto loop; } checks: - search_start = stripe_align(root, block_group, - offset, num_bytes); + search_start = ALIGN(offset, root->stripesize); /* move on to the next group */ if (search_start + num_bytes > @@ -7077,7 +7098,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); if (!path) { btrfs_free_and_pin_reserved_extent(root, ins->objectid, - root->leafsize); + root->nodesize); return -ENOMEM; } @@ -7086,7 +7107,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, ins, size); if (ret) { btrfs_free_and_pin_reserved_extent(root, ins->objectid, - root->leafsize); + root->nodesize); btrfs_free_path(path); return ret; } @@ -7101,7 +7122,7 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, if (skinny_metadata) { iref = (struct btrfs_extent_inline_ref *)(extent_item + 1); - num_bytes = root->leafsize; + num_bytes = root->nodesize; } else { block_info = (struct btrfs_tree_block_info *)(extent_item + 1); btrfs_set_tree_block_key(leaf, block_info, key); @@ -7131,14 +7152,14 @@ static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans, return ret; } - ret = update_block_group(root, ins->objectid, root->leafsize, 1); + ret = update_block_group(root, ins->objectid, root->nodesize, 1); if (ret) { /* -ENOENT, logic error */ btrfs_err(fs_info, "update block group failed for %llu %llu", ins->objectid, ins->offset); BUG(); } - trace_btrfs_reserved_extent_alloc(root, ins->objectid, root->leafsize); + trace_btrfs_reserved_extent_alloc(root, ins->objectid, root->nodesize); return ret; } @@ -7213,17 +7234,19 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, btrfs_set_buffer_uptodate(buf); if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { + buf->log_index = root->log_transid % 2; /* * we allow two log transactions at a time, use different * EXENT bit to differentiate dirty pages. */ - if (root->log_transid % 2 == 0) + if (buf->log_index == 0) set_extent_dirty(&root->dirty_log_pages, buf->start, buf->start + buf->len - 1, GFP_NOFS); else set_extent_new(&root->dirty_log_pages, buf->start, buf->start + buf->len - 1, GFP_NOFS); } else { + buf->log_index = -1; set_extent_dirty(&trans->transaction->dirty_pages, buf->start, buf->start + buf->len - 1, GFP_NOFS); } @@ -7300,8 +7323,8 @@ static void unuse_block_rsv(struct btrfs_fs_info *fs_info, * * returns the tree buffer or NULL. */ -struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, - struct btrfs_root *root, u32 blocksize, +struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 parent, u64 root_objectid, struct btrfs_disk_key *key, int level, u64 hint, u64 empty_size) @@ -7311,18 +7334,18 @@ struct extent_buffer *btrfs_alloc_free_block(struct btrfs_trans_handle *trans, struct extent_buffer *buf; u64 flags = 0; int ret; + u32 blocksize = root->nodesize; bool skinny_metadata = btrfs_fs_incompat(root->fs_info, SKINNY_METADATA); -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS - if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) { + if (btrfs_test_is_dummy_root(root)) { buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr, blocksize, level); if (!IS_ERR(buf)) root->alloc_bytenr += blocksize; return buf; } -#endif + block_rsv = use_block_rsv(trans, root, blocksize); if (IS_ERR(block_rsv)) return ERR_CAST(block_rsv); @@ -7417,7 +7440,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans, eb = path->nodes[wc->level]; nritems = btrfs_header_nritems(eb); - blocksize = btrfs_level_size(root, wc->level - 1); + blocksize = root->nodesize; for (slot = path->slots[wc->level]; slot < nritems; slot++) { if (nread >= wc->reada_count) @@ -7464,10 +7487,7 @@ static noinline void reada_walk_down(struct btrfs_trans_handle *trans, continue; } reada: - ret = readahead_tree_block(root, bytenr, blocksize, - generation); - if (ret) - break; + readahead_tree_block(root, bytenr, blocksize); nread++; } wc->reada_slot = slot; @@ -7626,7 +7646,6 @@ walk_down: level = root_level; while (level >= 0) { if (path->nodes[level] == NULL) { - int child_bsize = root->nodesize; int parent_slot; u64 child_gen; u64 child_bytenr; @@ -7638,8 +7657,7 @@ walk_down: child_bytenr = btrfs_node_blockptr(eb, parent_slot); child_gen = btrfs_node_ptr_generation(eb, parent_slot); - eb = read_tree_block(root, child_bytenr, child_bsize, - child_gen); + eb = read_tree_block(root, child_bytenr, child_gen); if (!eb || !extent_buffer_uptodate(eb)) { ret = -EIO; goto out; @@ -7655,7 +7673,7 @@ walk_down: ret = btrfs_qgroup_record_ref(trans, root->fs_info, root->objectid, child_bytenr, - child_bsize, + root->nodesize, BTRFS_QGROUP_OPER_SUB_SUBTREE, 0); if (ret) @@ -7806,9 +7824,9 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, } bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]); - blocksize = btrfs_level_size(root, level - 1); + blocksize = root->nodesize; - next = btrfs_find_tree_block(root, bytenr, blocksize); + next = btrfs_find_tree_block(root, bytenr); if (!next) { next = btrfs_find_create_tree_block(root, bytenr, blocksize); if (!next) @@ -7870,7 +7888,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans, if (!next) { if (reada && level == 1) reada_walk_down(trans, root, wc, path); - next = read_tree_block(root, bytenr, blocksize, generation); + next = read_tree_block(root, bytenr, generation); if (!next || !extent_buffer_uptodate(next)) { free_extent_buffer(next); return -EIO; @@ -8853,6 +8871,16 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) } up_write(&info->commit_root_sem); + spin_lock(&info->unused_bgs_lock); + while (!list_empty(&info->unused_bgs)) { + block_group = list_first_entry(&info->unused_bgs, + struct btrfs_block_group_cache, + bg_list); + list_del_init(&block_group->bg_list); + btrfs_put_block_group(block_group); + } + spin_unlock(&info->unused_bgs_lock); + spin_lock(&info->block_group_cache_lock); while ((n = rb_last(&info->block_group_cache_tree)) != NULL) { block_group = rb_entry(n, struct btrfs_block_group_cache, @@ -8987,7 +9015,7 @@ btrfs_create_block_group_cache(struct btrfs_root *root, u64 start, u64 size) init_rwsem(&cache->data_rwsem); INIT_LIST_HEAD(&cache->list); INIT_LIST_HEAD(&cache->cluster_list); - INIT_LIST_HEAD(&cache->new_bg_list); + INIT_LIST_HEAD(&cache->bg_list); btrfs_init_free_space_ctl(cache); return cache; @@ -9009,7 +9037,7 @@ int btrfs_read_block_groups(struct btrfs_root *root) root = info->extent_root; key.objectid = 0; key.offset = 0; - btrfs_set_key_type(&key, BTRFS_BLOCK_GROUP_ITEM_KEY); + key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -9128,8 +9156,18 @@ int btrfs_read_block_groups(struct btrfs_root *root) __link_block_group(space_info, cache); set_avail_alloc_bits(root->fs_info, cache->flags); - if (btrfs_chunk_readonly(root, cache->key.objectid)) + if (btrfs_chunk_readonly(root, cache->key.objectid)) { set_block_group_ro(cache, 1); + } else if (btrfs_block_group_used(&cache->item) == 0) { + spin_lock(&info->unused_bgs_lock); + /* Should always be true but just in case. */ + if (list_empty(&cache->bg_list)) { + btrfs_get_block_group(cache); + list_add_tail(&cache->bg_list, + &info->unused_bgs); + } + spin_unlock(&info->unused_bgs_lock); + } } list_for_each_entry_rcu(space_info, &root->fs_info->space_info, list) { @@ -9170,10 +9208,8 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans, struct btrfs_key key; int ret = 0; - list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, - new_bg_list) { - list_del_init(&block_group->new_bg_list); - + list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) { + list_del_init(&block_group->bg_list); if (ret) continue; @@ -9259,7 +9295,7 @@ int btrfs_make_block_group(struct btrfs_trans_handle *trans, __link_block_group(cache->space_info, cache); - list_add_tail(&cache->new_bg_list, &trans->new_bgs); + list_add_tail(&cache->bg_list, &trans->new_bgs); set_avail_alloc_bits(extent_root->fs_info, type); @@ -9413,8 +9449,6 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, memcpy(&key, &block_group->key, sizeof(key)); - btrfs_clear_space_info_full(root->fs_info); - btrfs_put_block_group(block_group); btrfs_put_block_group(block_group); @@ -9430,6 +9464,101 @@ out: return ret; } +/* + * Process the unused_bgs list and remove any that don't have any allocated + * space inside of them. + */ +void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) +{ + struct btrfs_block_group_cache *block_group; + struct btrfs_space_info *space_info; + struct btrfs_root *root = fs_info->extent_root; + struct btrfs_trans_handle *trans; + int ret = 0; + + if (!fs_info->open) + return; + + spin_lock(&fs_info->unused_bgs_lock); + while (!list_empty(&fs_info->unused_bgs)) { + u64 start, end; + + block_group = list_first_entry(&fs_info->unused_bgs, + struct btrfs_block_group_cache, + bg_list); + space_info = block_group->space_info; + list_del_init(&block_group->bg_list); + if (ret || btrfs_mixed_space_info(space_info)) { + btrfs_put_block_group(block_group); + continue; + } + spin_unlock(&fs_info->unused_bgs_lock); + + /* Don't want to race with allocators so take the groups_sem */ + down_write(&space_info->groups_sem); + spin_lock(&block_group->lock); + if (block_group->reserved || + btrfs_block_group_used(&block_group->item) || + block_group->ro) { + /* + * We want to bail if we made new allocations or have + * outstanding allocations in this block group. We do + * the ro check in case balance is currently acting on + * this block group. + */ + spin_unlock(&block_group->lock); + up_write(&space_info->groups_sem); + goto next; + } + spin_unlock(&block_group->lock); + + /* We don't want to force the issue, only flip if it's ok. */ + ret = set_block_group_ro(block_group, 0); + up_write(&space_info->groups_sem); + if (ret < 0) { + ret = 0; + goto next; + } + + /* + * Want to do this before we do anything else so we can recover + * properly if we fail to join the transaction. + */ + trans = btrfs_join_transaction(root); + if (IS_ERR(trans)) { + btrfs_set_block_group_rw(root, block_group); + ret = PTR_ERR(trans); + goto next; + } + + /* + * We could have pending pinned extents for this block group, + * just delete them, we don't care about them anymore. + */ + start = block_group->key.objectid; + end = start + block_group->key.offset - 1; + clear_extent_bits(&fs_info->freed_extents[0], start, end, + EXTENT_DIRTY, GFP_NOFS); + clear_extent_bits(&fs_info->freed_extents[1], start, end, + EXTENT_DIRTY, GFP_NOFS); + + /* Reset pinned so btrfs_put_block_group doesn't complain */ + block_group->pinned = 0; + + /* + * Btrfs_remove_chunk will abort the transaction if things go + * horribly wrong. + */ + ret = btrfs_remove_chunk(trans, root, + block_group->key.objectid); + btrfs_end_transaction(trans, root); +next: + btrfs_put_block_group(block_group); + spin_lock(&fs_info->unused_bgs_lock); + } + spin_unlock(&fs_info->unused_bgs_lock); +} + int btrfs_init_space_info(struct btrfs_fs_info *fs_info) { struct btrfs_space_info *space_info; @@ -9561,7 +9690,7 @@ void btrfs_end_nocow_write(struct btrfs_root *root) int btrfs_start_nocow_write(struct btrfs_root *root) { - if (unlikely(atomic_read(&root->will_be_snapshoted))) + if (atomic_read(&root->will_be_snapshoted)) return 0; percpu_counter_inc(&root->subv_writers->counter); @@ -9569,7 +9698,7 @@ int btrfs_start_nocow_write(struct btrfs_root *root) * Make sure counter is updated before we check for snapshot creation. */ smp_mb(); - if (unlikely(atomic_read(&root->will_be_snapshoted))) { + if (atomic_read(&root->will_be_snapshoted)) { btrfs_end_nocow_write(root); return 0; } diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index af0359dcf337..bf3f424e0013 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -25,6 +25,11 @@ static struct kmem_cache *extent_state_cache; static struct kmem_cache *extent_buffer_cache; static struct bio_set *btrfs_bioset; +static inline bool extent_state_in_tree(const struct extent_state *state) +{ + return !RB_EMPTY_NODE(&state->rb_node); +} + #ifdef CONFIG_BTRFS_DEBUG static LIST_HEAD(buffers); static LIST_HEAD(states); @@ -59,9 +64,9 @@ void btrfs_leak_debug_check(void) while (!list_empty(&states)) { state = list_entry(states.next, struct extent_state, leak_list); - printk(KERN_ERR "BTRFS: state leak: start %llu end %llu " - "state %lu in tree %p refs %d\n", - state->start, state->end, state->state, state->tree, + pr_err("BTRFS: state leak: start %llu end %llu state %lu in tree %d refs %d\n", + state->start, state->end, state->state, + extent_state_in_tree(state), atomic_read(&state->refs)); list_del(&state->leak_list); kmem_cache_free(extent_state_cache, state); @@ -209,7 +214,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask) return state; state->state = 0; state->private = 0; - state->tree = NULL; + RB_CLEAR_NODE(&state->rb_node); btrfs_leak_debug_add(&state->leak_list, &states); atomic_set(&state->refs, 1); init_waitqueue_head(&state->wq); @@ -222,7 +227,7 @@ void free_extent_state(struct extent_state *state) if (!state) return; if (atomic_dec_and_test(&state->refs)) { - WARN_ON(state->tree); + WARN_ON(extent_state_in_tree(state)); btrfs_leak_debug_del(&state->leak_list); trace_free_extent_state(state, _RET_IP_); kmem_cache_free(extent_state_cache, state); @@ -371,8 +376,8 @@ static void merge_state(struct extent_io_tree *tree, other->state == state->state) { merge_cb(tree, state, other); state->start = other->start; - other->tree = NULL; rb_erase(&other->rb_node, &tree->state); + RB_CLEAR_NODE(&other->rb_node); free_extent_state(other); } } @@ -383,8 +388,8 @@ static void merge_state(struct extent_io_tree *tree, other->state == state->state) { merge_cb(tree, state, other); state->end = other->end; - other->tree = NULL; rb_erase(&other->rb_node, &tree->state); + RB_CLEAR_NODE(&other->rb_node); free_extent_state(other); } } @@ -442,7 +447,6 @@ static int insert_state(struct extent_io_tree *tree, found->start, found->end, start, end); return -EEXIST; } - state->tree = tree; merge_state(tree, state); return 0; } @@ -486,7 +490,6 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig, free_extent_state(prealloc); return -EEXIST; } - prealloc->tree = tree; return 0; } @@ -524,9 +527,9 @@ static struct extent_state *clear_state_bit(struct extent_io_tree *tree, wake_up(&state->wq); if (state->state == 0) { next = next_state(state); - if (state->tree) { + if (extent_state_in_tree(state)) { rb_erase(&state->rb_node, &tree->state); - state->tree = NULL; + RB_CLEAR_NODE(&state->rb_node); free_extent_state(state); } else { WARN_ON(1); @@ -606,8 +609,8 @@ again: cached_state = NULL; } - if (cached && cached->tree && cached->start <= start && - cached->end > start) { + if (cached && extent_state_in_tree(cached) && + cached->start <= start && cached->end > start) { if (clear) atomic_dec(&cached->refs); state = cached; @@ -843,7 +846,7 @@ again: if (cached_state && *cached_state) { state = *cached_state; if (state->start <= start && state->end > start && - state->tree) { + extent_state_in_tree(state)) { node = &state->rb_node; goto hit_next; } @@ -1069,7 +1072,7 @@ again: if (cached_state && *cached_state) { state = *cached_state; if (state->start <= start && state->end > start && - state->tree) { + extent_state_in_tree(state)) { node = &state->rb_node; goto hit_next; } @@ -1459,7 +1462,7 @@ int find_first_extent_bit(struct extent_io_tree *tree, u64 start, spin_lock(&tree->lock); if (cached_state && *cached_state) { state = *cached_state; - if (state->end == start - 1 && state->tree) { + if (state->end == start - 1 && extent_state_in_tree(state)) { n = rb_next(&state->rb_node); while (n) { state = rb_entry(n, struct extent_state, @@ -1905,7 +1908,7 @@ int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, int bitset = 0; spin_lock(&tree->lock); - if (cached && cached->tree && cached->start <= start && + if (cached && extent_state_in_tree(cached) && cached->start <= start && cached->end > start) node = &cached->rb_node; else @@ -1959,27 +1962,7 @@ static void check_page_uptodate(struct extent_io_tree *tree, struct page *page) SetPageUptodate(page); } -/* - * When IO fails, either with EIO or csum verification fails, we - * try other mirrors that might have a good copy of the data. This - * io_failure_record is used to record state as we go through all the - * mirrors. If another mirror has good data, the page is set up to date - * and things continue. If a good mirror can't be found, the original - * bio end_io callback is called to indicate things have failed. - */ -struct io_failure_record { - struct page *page; - u64 start; - u64 len; - u64 logical; - unsigned long bio_flags; - int this_mirror; - int failed_mirror; - int in_validation; -}; - -static int free_io_failure(struct inode *inode, struct io_failure_record *rec, - int did_repair) +int free_io_failure(struct inode *inode, struct io_failure_record *rec) { int ret; int err = 0; @@ -2012,10 +1995,10 @@ static int free_io_failure(struct inode *inode, struct io_failure_record *rec, * currently, there can be no more than two copies of every data bit. thus, * exactly one rewrite is required. */ -int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start, - u64 length, u64 logical, struct page *page, - int mirror_num) +int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical, + struct page *page, unsigned int pg_offset, int mirror_num) { + struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; struct bio *bio; struct btrfs_device *dev; u64 map_length = 0; @@ -2053,7 +2036,7 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start, return -EIO; } bio->bi_bdev = dev->bdev; - bio_add_page(bio, page, length, start - page_offset(page)); + bio_add_page(bio, page, length, pg_offset); if (btrfsic_submit_bio_wait(WRITE_SYNC, bio)) { /* try to remap that extent elsewhere? */ @@ -2063,10 +2046,9 @@ int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start, } printk_ratelimited_in_rcu(KERN_INFO - "BTRFS: read error corrected: ino %lu off %llu " - "(dev %s sector %llu)\n", page->mapping->host->i_ino, - start, rcu_str_deref(dev->name), sector); - + "BTRFS: read error corrected: ino %llu off %llu (dev %s sector %llu)\n", + btrfs_ino(inode), start, + rcu_str_deref(dev->name), sector); bio_put(bio); return 0; } @@ -2082,9 +2064,11 @@ int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb, return -EROFS; for (i = 0; i < num_pages; i++) { - struct page *p = extent_buffer_page(eb, i); - ret = repair_io_failure(root->fs_info, start, PAGE_CACHE_SIZE, - start, p, mirror_num); + struct page *p = eb->pages[i]; + + ret = repair_io_failure(root->fs_info->btree_inode, start, + PAGE_CACHE_SIZE, start, p, + start - page_offset(p), mirror_num); if (ret) break; start += PAGE_CACHE_SIZE; @@ -2097,16 +2081,15 @@ int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb, * each time an IO finishes, we do a fast check in the IO failure tree * to see if we need to process or clean up an io_failure_record */ -static int clean_io_failure(u64 start, struct page *page) +int clean_io_failure(struct inode *inode, u64 start, struct page *page, + unsigned int pg_offset) { u64 private; u64 private_failure; struct io_failure_record *failrec; - struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; struct extent_state *state; int num_copies; - int did_repair = 0; int ret; private = 0; @@ -2127,7 +2110,6 @@ static int clean_io_failure(u64 start, struct page *page) /* there was no real error, just free the record */ pr_debug("clean_io_failure: freeing dummy error at %llu\n", failrec->start); - did_repair = 1; goto out; } if (fs_info->sb->s_flags & MS_RDONLY) @@ -2144,55 +2126,70 @@ static int clean_io_failure(u64 start, struct page *page) num_copies = btrfs_num_copies(fs_info, failrec->logical, failrec->len); if (num_copies > 1) { - ret = repair_io_failure(fs_info, start, failrec->len, - failrec->logical, page, - failrec->failed_mirror); - did_repair = !ret; + repair_io_failure(inode, start, failrec->len, + failrec->logical, page, + pg_offset, failrec->failed_mirror); } - ret = 0; } out: - if (!ret) - ret = free_io_failure(inode, failrec, did_repair); + free_io_failure(inode, failrec); - return ret; + return 0; } /* - * this is a generic handler for readpage errors (default - * readpage_io_failed_hook). if other copies exist, read those and write back - * good data to the failed position. does not investigate in remapping the - * failed extent elsewhere, hoping the device will be smart enough to do this as - * needed + * Can be called when + * - hold extent lock + * - under ordered extent + * - the inode is freeing */ +void btrfs_free_io_failure_record(struct inode *inode, u64 start, u64 end) +{ + struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; + struct io_failure_record *failrec; + struct extent_state *state, *next; -static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, - struct page *page, u64 start, u64 end, - int failed_mirror) + if (RB_EMPTY_ROOT(&failure_tree->state)) + return; + + spin_lock(&failure_tree->lock); + state = find_first_extent_bit_state(failure_tree, start, EXTENT_DIRTY); + while (state) { + if (state->start > end) + break; + + ASSERT(state->end <= end); + + next = next_state(state); + + failrec = (struct io_failure_record *)state->private; + free_extent_state(state); + kfree(failrec); + + state = next; + } + spin_unlock(&failure_tree->lock); +} + +int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end, + struct io_failure_record **failrec_ret) { - struct io_failure_record *failrec = NULL; + struct io_failure_record *failrec; u64 private; struct extent_map *em; - struct inode *inode = page->mapping->host; struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree; struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree; - struct bio *bio; - struct btrfs_io_bio *btrfs_failed_bio; - struct btrfs_io_bio *btrfs_bio; - int num_copies; int ret; - int read_mode; u64 logical; - BUG_ON(failed_bio->bi_rw & REQ_WRITE); - ret = get_state_private(failure_tree, start, &private); if (ret) { failrec = kzalloc(sizeof(*failrec), GFP_NOFS); if (!failrec) return -ENOMEM; + failrec->start = start; failrec->len = end - start + 1; failrec->this_mirror = 0; @@ -2212,11 +2209,11 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, em = NULL; } read_unlock(&em_tree->lock); - if (!em) { kfree(failrec); return -EIO; } + logical = start - em->start; logical = em->block_start + logical; if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { @@ -2225,8 +2222,10 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, extent_set_compress_type(&failrec->bio_flags, em->compress_type); } - pr_debug("bio_readpage_error: (new) logical=%llu, start=%llu, " - "len=%llu\n", logical, start, failrec->len); + + pr_debug("Get IO Failure Record: (new) logical=%llu, start=%llu, len=%llu\n", + logical, start, failrec->len); + failrec->logical = logical; free_extent_map(em); @@ -2246,8 +2245,7 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, } } else { failrec = (struct io_failure_record *)(unsigned long)private; - pr_debug("bio_readpage_error: (found) logical=%llu, " - "start=%llu, len=%llu, validation=%d\n", + pr_debug("Get IO Failure Record: (found) logical=%llu, start=%llu, len=%llu, validation=%d\n", failrec->logical, failrec->start, failrec->len, failrec->in_validation); /* @@ -2256,6 +2254,17 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, * clean_io_failure() clean all those errors at once. */ } + + *failrec_ret = failrec; + + return 0; +} + +int btrfs_check_repairable(struct inode *inode, struct bio *failed_bio, + struct io_failure_record *failrec, int failed_mirror) +{ + int num_copies; + num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info, failrec->logical, failrec->len); if (num_copies == 1) { @@ -2264,10 +2273,9 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, * all the retry and error correction code that follows. no * matter what the error is, it is very likely to persist. */ - pr_debug("bio_readpage_error: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d\n", + pr_debug("Check Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d\n", num_copies, failrec->this_mirror, failed_mirror); - free_io_failure(inode, failrec, 0); - return -EIO; + return 0; } /* @@ -2287,7 +2295,6 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, BUG_ON(failrec->in_validation); failrec->in_validation = 1; failrec->this_mirror = failed_mirror; - read_mode = READ_SYNC | REQ_FAILFAST_DEV; } else { /* * we're ready to fulfill a) and b) alongside. get a good copy @@ -2303,25 +2310,36 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, failrec->this_mirror++; if (failrec->this_mirror == failed_mirror) failrec->this_mirror++; - read_mode = READ_SYNC; } if (failrec->this_mirror > num_copies) { - pr_debug("bio_readpage_error: (fail) num_copies=%d, next_mirror %d, failed_mirror %d\n", + pr_debug("Check Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d\n", num_copies, failrec->this_mirror, failed_mirror); - free_io_failure(inode, failrec, 0); - return -EIO; + return 0; } + return 1; +} + + +struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio, + struct io_failure_record *failrec, + struct page *page, int pg_offset, int icsum, + bio_end_io_t *endio_func, void *data) +{ + struct bio *bio; + struct btrfs_io_bio *btrfs_failed_bio; + struct btrfs_io_bio *btrfs_bio; + bio = btrfs_io_bio_alloc(GFP_NOFS, 1); - if (!bio) { - free_io_failure(inode, failrec, 0); - return -EIO; - } - bio->bi_end_io = failed_bio->bi_end_io; + if (!bio) + return NULL; + + bio->bi_end_io = endio_func; bio->bi_iter.bi_sector = failrec->logical >> 9; bio->bi_bdev = BTRFS_I(inode)->root->fs_info->fs_devices->latest_bdev; bio->bi_iter.bi_size = 0; + bio->bi_private = data; btrfs_failed_bio = btrfs_io_bio(failed_bio); if (btrfs_failed_bio->csum) { @@ -2330,21 +2348,73 @@ static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, btrfs_bio = btrfs_io_bio(bio); btrfs_bio->csum = btrfs_bio->csum_inline; - phy_offset >>= inode->i_sb->s_blocksize_bits; - phy_offset *= csum_size; - memcpy(btrfs_bio->csum, btrfs_failed_bio->csum + phy_offset, + icsum *= csum_size; + memcpy(btrfs_bio->csum, btrfs_failed_bio->csum + icsum, csum_size); } - bio_add_page(bio, page, failrec->len, start - page_offset(page)); + bio_add_page(bio, page, failrec->len, pg_offset); + + return bio; +} + +/* + * this is a generic handler for readpage errors (default + * readpage_io_failed_hook). if other copies exist, read those and write back + * good data to the failed position. does not investigate in remapping the + * failed extent elsewhere, hoping the device will be smart enough to do this as + * needed + */ + +static int bio_readpage_error(struct bio *failed_bio, u64 phy_offset, + struct page *page, u64 start, u64 end, + int failed_mirror) +{ + struct io_failure_record *failrec; + struct inode *inode = page->mapping->host; + struct extent_io_tree *tree = &BTRFS_I(inode)->io_tree; + struct bio *bio; + int read_mode; + int ret; + + BUG_ON(failed_bio->bi_rw & REQ_WRITE); + + ret = btrfs_get_io_failure_record(inode, start, end, &failrec); + if (ret) + return ret; + + ret = btrfs_check_repairable(inode, failed_bio, failrec, failed_mirror); + if (!ret) { + free_io_failure(inode, failrec); + return -EIO; + } + + if (failed_bio->bi_vcnt > 1) + read_mode = READ_SYNC | REQ_FAILFAST_DEV; + else + read_mode = READ_SYNC; + + phy_offset >>= inode->i_sb->s_blocksize_bits; + bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page, + start - page_offset(page), + (int)phy_offset, failed_bio->bi_end_io, + NULL); + if (!bio) { + free_io_failure(inode, failrec); + return -EIO; + } - pr_debug("bio_readpage_error: submitting new read[%#x] to " - "this_mirror=%d, num_copies=%d, in_validation=%d\n", read_mode, - failrec->this_mirror, num_copies, failrec->in_validation); + pr_debug("Repair Read Error: submitting new read[%#x] to this_mirror=%d, in_validation=%d\n", + read_mode, failrec->this_mirror, failrec->in_validation); ret = tree->ops->submit_bio_hook(inode, read_mode, bio, failrec->this_mirror, failrec->bio_flags, 0); + if (ret) { + free_io_failure(inode, failrec); + bio_put(bio); + } + return ret; } @@ -2469,7 +2539,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err) struct inode *inode = page->mapping->host; pr_debug("end_bio_extent_readpage: bi_sector=%llu, err=%d, " - "mirror=%lu\n", (u64)bio->bi_iter.bi_sector, err, + "mirror=%u\n", (u64)bio->bi_iter.bi_sector, err, io_bio->mirror_num); tree = &BTRFS_I(inode)->io_tree; @@ -2503,7 +2573,7 @@ static void end_bio_extent_readpage(struct bio *bio, int err) if (ret) uptodate = 0; else - clean_io_failure(start, page); + clean_io_failure(inode, start, page, 0); } if (likely(uptodate)) @@ -2540,12 +2610,12 @@ readpage_ok: if (likely(uptodate)) { loff_t i_size = i_size_read(inode); pgoff_t end_index = i_size >> PAGE_CACHE_SHIFT; - unsigned offset; + unsigned off; /* Zero out the end if this page straddles i_size */ - offset = i_size & (PAGE_CACHE_SIZE-1); - if (page->index == end_index && offset) - zero_user_segment(page, offset, PAGE_CACHE_SIZE); + off = i_size & (PAGE_CACHE_SIZE-1); + if (page->index == end_index && off) + zero_user_segment(page, off, PAGE_CACHE_SIZE); SetPageUptodate(page); } else { ClearPageUptodate(page); @@ -2618,9 +2688,18 @@ btrfs_bio_alloc(struct block_device *bdev, u64 first_sector, int nr_vecs, struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask) { - return bio_clone_bioset(bio, gfp_mask, btrfs_bioset); -} + struct btrfs_io_bio *btrfs_bio; + struct bio *new; + new = bio_clone_bioset(bio, gfp_mask, btrfs_bioset); + if (new) { + btrfs_bio = btrfs_io_bio(new); + btrfs_bio->csum = NULL; + btrfs_bio->csum_allocated = NULL; + btrfs_bio->end_io = NULL; + } + return new; +} /* this also allocates from the btrfs_bioset */ struct bio *btrfs_io_bio_alloc(gfp_t gfp_mask, unsigned int nr_iovecs) @@ -3501,7 +3580,7 @@ lock_extent_buffer_for_io(struct extent_buffer *eb, num_pages = num_extent_pages(eb->start, eb->len); for (i = 0; i < num_pages; i++) { - struct page *p = extent_buffer_page(eb, i); + struct page *p = eb->pages[i]; if (!trylock_page(p)) { if (!flush) { @@ -3522,6 +3601,68 @@ static void end_extent_buffer_writeback(struct extent_buffer *eb) wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK); } +static void set_btree_ioerr(struct page *page) +{ + struct extent_buffer *eb = (struct extent_buffer *)page->private; + struct btrfs_inode *btree_ino = BTRFS_I(eb->fs_info->btree_inode); + + SetPageError(page); + if (test_and_set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) + return; + + /* + * If writeback for a btree extent that doesn't belong to a log tree + * failed, increment the counter transaction->eb_write_errors. + * We do this because while the transaction is running and before it's + * committing (when we call filemap_fdata[write|wait]_range against + * the btree inode), we might have + * btree_inode->i_mapping->a_ops->writepages() called by the VM - if it + * returns an error or an error happens during writeback, when we're + * committing the transaction we wouldn't know about it, since the pages + * can be no longer dirty nor marked anymore for writeback (if a + * subsequent modification to the extent buffer didn't happen before the + * transaction commit), which makes filemap_fdata[write|wait]_range not + * able to find the pages tagged with SetPageError at transaction + * commit time. So if this happens we must abort the transaction, + * otherwise we commit a super block with btree roots that point to + * btree nodes/leafs whose content on disk is invalid - either garbage + * or the content of some node/leaf from a past generation that got + * cowed or deleted and is no longer valid. + * + * Note: setting AS_EIO/AS_ENOSPC in the btree inode's i_mapping would + * not be enough - we need to distinguish between log tree extents vs + * non-log tree extents, and the next filemap_fdatawait_range() call + * will catch and clear such errors in the mapping - and that call might + * be from a log sync and not from a transaction commit. Also, checking + * for the eb flag EXTENT_BUFFER_WRITE_ERR at transaction commit time is + * not done and would not be reliable - the eb might have been released + * from memory and reading it back again means that flag would not be + * set (since it's a runtime flag, not persisted on disk). + * + * Using the flags below in the btree inode also makes us achieve the + * goal of AS_EIO/AS_ENOSPC when writepages() returns success, started + * writeback for all dirty pages and before filemap_fdatawait_range() + * is called, the writeback for all dirty pages had already finished + * with errors - because we were not using AS_EIO/AS_ENOSPC, + * filemap_fdatawait_range() would return success, as it could not know + * that writeback errors happened (the pages were no longer tagged for + * writeback). + */ + switch (eb->log_index) { + case -1: + set_bit(BTRFS_INODE_BTREE_ERR, &btree_ino->runtime_flags); + break; + case 0: + set_bit(BTRFS_INODE_BTREE_LOG1_ERR, &btree_ino->runtime_flags); + break; + case 1: + set_bit(BTRFS_INODE_BTREE_LOG2_ERR, &btree_ino->runtime_flags); + break; + default: + BUG(); /* unexpected, logic error */ + } +} + static void end_bio_extent_buffer_writepage(struct bio *bio, int err) { struct bio_vec *bvec; @@ -3535,10 +3676,9 @@ static void end_bio_extent_buffer_writepage(struct bio *bio, int err) BUG_ON(!eb); done = atomic_dec_and_test(&eb->io_pages); - if (err || test_bit(EXTENT_BUFFER_IOERR, &eb->bflags)) { - set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); + if (err || test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) { ClearPageUptodate(page); - SetPageError(page); + set_btree_ioerr(page); } end_page_writeback(page); @@ -3565,14 +3705,14 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, int rw = (epd->sync_io ? WRITE_SYNC : WRITE) | REQ_META; int ret = 0; - clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags); + clear_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags); num_pages = num_extent_pages(eb->start, eb->len); atomic_set(&eb->io_pages, num_pages); if (btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID) bio_flags = EXTENT_BIO_TREE_LOG; for (i = 0; i < num_pages; i++) { - struct page *p = extent_buffer_page(eb, i); + struct page *p = eb->pages[i]; clear_page_dirty_for_io(p); set_page_writeback(p); @@ -3582,8 +3722,8 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, 0, epd->bio_flags, bio_flags); epd->bio_flags = bio_flags; if (ret) { - set_bit(EXTENT_BUFFER_IOERR, &eb->bflags); - SetPageError(p); + set_btree_ioerr(p); + end_page_writeback(p); if (atomic_sub_and_test(num_pages - i, &eb->io_pages)) end_extent_buffer_writeback(eb); ret = -EIO; @@ -3596,7 +3736,8 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, if (unlikely(ret)) { for (; i < num_pages; i++) { - struct page *p = extent_buffer_page(eb, i); + struct page *p = eb->pages[i]; + clear_page_dirty_for_io(p); unlock_page(p); } } @@ -4166,19 +4307,6 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode, return NULL; } -static noinline int count_ext_ref(u64 inum, u64 offset, u64 root_id, void *ctx) -{ - unsigned long cnt = *((unsigned long *)ctx); - - cnt++; - *((unsigned long *)ctx) = cnt; - - /* Now we're sure that the extent is shared. */ - if (cnt > 1) - return 1; - return 0; -} - int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len, get_extent_t *get_extent) { @@ -4195,6 +4323,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, struct extent_map *em = NULL; struct extent_state *cached_state = NULL; struct btrfs_path *path; + struct btrfs_root *root = BTRFS_I(inode)->root; int end = 0; u64 em_start = 0; u64 em_len = 0; @@ -4215,8 +4344,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, * lookup the last file extent. We're not using i_size here * because there might be preallocation past i_size */ - ret = btrfs_lookup_file_extent(NULL, BTRFS_I(inode)->root, - path, btrfs_ino(inode), -1, 0); + ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), -1, + 0); if (ret < 0) { btrfs_free_path(path); return ret; @@ -4224,7 +4353,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, WARN_ON(!ret); path->slots[0]--; btrfs_item_key_to_cpu(path->nodes[0], &found_key, path->slots[0]); - found_type = btrfs_key_type(&found_key); + found_type = found_key.type; /* No extents, but there might be delalloc bits */ if (found_key.objectid != btrfs_ino(inode) || @@ -4309,25 +4438,27 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, } else if (em->block_start == EXTENT_MAP_DELALLOC) { flags |= (FIEMAP_EXTENT_DELALLOC | FIEMAP_EXTENT_UNKNOWN); - } else { - unsigned long ref_cnt = 0; + } else if (fieinfo->fi_extents_max) { + u64 bytenr = em->block_start - + (em->start - em->orig_start); disko = em->block_start + offset_in_extent; /* * As btrfs supports shared space, this information * can be exported to userspace tools via - * flag FIEMAP_EXTENT_SHARED. + * flag FIEMAP_EXTENT_SHARED. If fi_extents_max == 0 + * then we're just getting a count and we can skip the + * lookup stuff. */ - ret = iterate_inodes_from_logical( - em->block_start, - BTRFS_I(inode)->root->fs_info, - path, count_ext_ref, &ref_cnt); - if (ret < 0 && ret != -ENOENT) + ret = btrfs_check_shared(NULL, root->fs_info, + root->objectid, + btrfs_ino(inode), bytenr); + if (ret < 0) goto out_free; - - if (ref_cnt > 1) + if (ret) flags |= FIEMAP_EXTENT_SHARED; + ret = 0; } if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) flags |= FIEMAP_EXTENT_ENCODED; @@ -4381,24 +4512,21 @@ int extent_buffer_under_io(struct extent_buffer *eb) /* * Helper for releasing extent buffer page. */ -static void btrfs_release_extent_buffer_page(struct extent_buffer *eb, - unsigned long start_idx) +static void btrfs_release_extent_buffer_page(struct extent_buffer *eb) { unsigned long index; - unsigned long num_pages; struct page *page; int mapped = !test_bit(EXTENT_BUFFER_DUMMY, &eb->bflags); BUG_ON(extent_buffer_under_io(eb)); - num_pages = num_extent_pages(eb->start, eb->len); - index = start_idx + num_pages; - if (start_idx >= index) + index = num_extent_pages(eb->start, eb->len); + if (index == 0) return; do { index--; - page = extent_buffer_page(eb, index); + page = eb->pages[index]; if (page && mapped) { spin_lock(&page->mapping->private_lock); /* @@ -4429,7 +4557,7 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb, /* One for when we alloced the page */ page_cache_release(page); } - } while (index != start_idx); + } while (index != 0); } /* @@ -4437,7 +4565,7 @@ static void btrfs_release_extent_buffer_page(struct extent_buffer *eb, */ static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) { - btrfs_release_extent_buffer_page(eb, 0); + btrfs_release_extent_buffer_page(eb); __free_extent_buffer(eb); } @@ -4580,7 +4708,8 @@ static void mark_extent_buffer_accessed(struct extent_buffer *eb, num_pages = num_extent_pages(eb->start, eb->len); for (i = 0; i < num_pages; i++) { - struct page *p = extent_buffer_page(eb, i); + struct page *p = eb->pages[i]; + if (p != accessed) mark_page_accessed(p); } @@ -4749,7 +4878,7 @@ again: */ SetPageChecked(eb->pages[0]); for (i = 1; i < num_pages; i++) { - p = extent_buffer_page(eb, i); + p = eb->pages[i]; ClearPageChecked(p); unlock_page(p); } @@ -4794,7 +4923,7 @@ static int release_extent_buffer(struct extent_buffer *eb) } /* Should be safe to release our pages at this point */ - btrfs_release_extent_buffer_page(eb, 0); + btrfs_release_extent_buffer_page(eb); call_rcu(&eb->rcu_head, btrfs_release_extent_buffer_rcu); return 1; } @@ -4860,7 +4989,7 @@ void clear_extent_buffer_dirty(struct extent_buffer *eb) num_pages = num_extent_pages(eb->start, eb->len); for (i = 0; i < num_pages; i++) { - page = extent_buffer_page(eb, i); + page = eb->pages[i]; if (!PageDirty(page)) continue; @@ -4896,7 +5025,7 @@ int set_extent_buffer_dirty(struct extent_buffer *eb) WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)); for (i = 0; i < num_pages; i++) - set_page_dirty(extent_buffer_page(eb, i)); + set_page_dirty(eb->pages[i]); return was_dirty; } @@ -4909,7 +5038,7 @@ int clear_extent_buffer_uptodate(struct extent_buffer *eb) clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); num_pages = num_extent_pages(eb->start, eb->len); for (i = 0; i < num_pages; i++) { - page = extent_buffer_page(eb, i); + page = eb->pages[i]; if (page) ClearPageUptodate(page); } @@ -4925,7 +5054,7 @@ int set_extent_buffer_uptodate(struct extent_buffer *eb) set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); num_pages = num_extent_pages(eb->start, eb->len); for (i = 0; i < num_pages; i++) { - page = extent_buffer_page(eb, i); + page = eb->pages[i]; SetPageUptodate(page); } return 0; @@ -4965,7 +5094,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, num_pages = num_extent_pages(eb->start, eb->len); for (i = start_i; i < num_pages; i++) { - page = extent_buffer_page(eb, i); + page = eb->pages[i]; if (wait == WAIT_NONE) { if (!trylock_page(page)) goto unlock_exit; @@ -4984,11 +5113,11 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, goto unlock_exit; } - clear_bit(EXTENT_BUFFER_IOERR, &eb->bflags); + clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); eb->read_mirror = 0; atomic_set(&eb->io_pages, num_reads); for (i = start_i; i < num_pages; i++) { - page = extent_buffer_page(eb, i); + page = eb->pages[i]; if (!PageUptodate(page)) { ClearPageError(page); err = __extent_read_full_page(tree, page, @@ -5013,7 +5142,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, return ret; for (i = start_i; i < num_pages; i++) { - page = extent_buffer_page(eb, i); + page = eb->pages[i]; wait_on_page_locked(page); if (!PageUptodate(page)) ret = -EIO; @@ -5024,7 +5153,7 @@ int read_extent_buffer_pages(struct extent_io_tree *tree, unlock_exit: i = start_i; while (locked_pages > 0) { - page = extent_buffer_page(eb, i); + page = eb->pages[i]; i++; unlock_page(page); locked_pages--; @@ -5050,7 +5179,7 @@ void read_extent_buffer(struct extent_buffer *eb, void *dstv, offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1); while (len > 0) { - page = extent_buffer_page(eb, i); + page = eb->pages[i]; cur = min(len, (PAGE_CACHE_SIZE - offset)); kaddr = page_address(page); @@ -5082,7 +5211,7 @@ int read_extent_buffer_to_user(struct extent_buffer *eb, void __user *dstv, offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1); while (len > 0) { - page = extent_buffer_page(eb, i); + page = eb->pages[i]; cur = min(len, (PAGE_CACHE_SIZE - offset)); kaddr = page_address(page); @@ -5131,7 +5260,7 @@ int map_private_extent_buffer(struct extent_buffer *eb, unsigned long start, return -EINVAL; } - p = extent_buffer_page(eb, i); + p = eb->pages[i]; kaddr = page_address(p); *map = kaddr + offset; *map_len = PAGE_CACHE_SIZE - offset; @@ -5157,7 +5286,7 @@ int memcmp_extent_buffer(struct extent_buffer *eb, const void *ptrv, offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1); while (len > 0) { - page = extent_buffer_page(eb, i); + page = eb->pages[i]; cur = min(len, (PAGE_CACHE_SIZE - offset)); @@ -5191,7 +5320,7 @@ void write_extent_buffer(struct extent_buffer *eb, const void *srcv, offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1); while (len > 0) { - page = extent_buffer_page(eb, i); + page = eb->pages[i]; WARN_ON(!PageUptodate(page)); cur = min(len, PAGE_CACHE_SIZE - offset); @@ -5221,7 +5350,7 @@ void memset_extent_buffer(struct extent_buffer *eb, char c, offset = (start_offset + start) & (PAGE_CACHE_SIZE - 1); while (len > 0) { - page = extent_buffer_page(eb, i); + page = eb->pages[i]; WARN_ON(!PageUptodate(page)); cur = min(len, PAGE_CACHE_SIZE - offset); @@ -5252,7 +5381,7 @@ void copy_extent_buffer(struct extent_buffer *dst, struct extent_buffer *src, (PAGE_CACHE_SIZE - 1); while (len > 0) { - page = extent_buffer_page(dst, i); + page = dst->pages[i]; WARN_ON(!PageUptodate(page)); cur = min(len, (unsigned long)(PAGE_CACHE_SIZE - offset)); @@ -5330,8 +5459,7 @@ void memcpy_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, cur = min_t(unsigned long, cur, (unsigned long)(PAGE_CACHE_SIZE - dst_off_in_page)); - copy_pages(extent_buffer_page(dst, dst_i), - extent_buffer_page(dst, src_i), + copy_pages(dst->pages[dst_i], dst->pages[src_i], dst_off_in_page, src_off_in_page, cur); src_offset += cur; @@ -5377,8 +5505,7 @@ void memmove_extent_buffer(struct extent_buffer *dst, unsigned long dst_offset, cur = min_t(unsigned long, len, src_off_in_page + 1); cur = min(cur, dst_off_in_page + 1); - copy_pages(extent_buffer_page(dst, dst_i), - extent_buffer_page(dst, src_i), + copy_pages(dst->pages[dst_i], dst->pages[src_i], dst_off_in_page - cur + 1, src_off_in_page - cur + 1, cur); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index ccc264e7bde1..6d4b938be986 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -11,8 +11,6 @@ #define EXTENT_NEW (1 << 4) #define EXTENT_DELALLOC (1 << 5) #define EXTENT_DEFRAG (1 << 6) -#define EXTENT_DEFRAG_DONE (1 << 7) -#define EXTENT_BUFFER_FILLED (1 << 8) #define EXTENT_BOUNDARY (1 << 9) #define EXTENT_NODATASUM (1 << 10) #define EXTENT_DO_ACCOUNTING (1 << 11) @@ -34,16 +32,16 @@ /* these are bit numbers for test/set bit */ #define EXTENT_BUFFER_UPTODATE 0 -#define EXTENT_BUFFER_BLOCKING 1 #define EXTENT_BUFFER_DIRTY 2 #define EXTENT_BUFFER_CORRUPT 3 #define EXTENT_BUFFER_READAHEAD 4 /* this got triggered by readahead */ #define EXTENT_BUFFER_TREE_REF 5 #define EXTENT_BUFFER_STALE 6 #define EXTENT_BUFFER_WRITEBACK 7 -#define EXTENT_BUFFER_IOERR 8 +#define EXTENT_BUFFER_READ_ERR 8 /* read IO error */ #define EXTENT_BUFFER_DUMMY 9 #define EXTENT_BUFFER_IN_TREE 10 +#define EXTENT_BUFFER_WRITE_ERR 11 /* write IO error */ /* these are flags for extent_clear_unlock_delalloc */ #define PAGE_UNLOCK (1 << 0) @@ -57,7 +55,6 @@ * map has page->private set to one. */ #define EXTENT_PAGE_PRIVATE 1 -#define EXTENT_PAGE_PRIVATE_FIRST_PAGE 3 struct extent_state; struct btrfs_root; @@ -108,7 +105,6 @@ struct extent_state { struct rb_node rb_node; /* ADD NEW ELEMENTS AFTER THIS */ - struct extent_io_tree *tree; wait_queue_head_t wq; atomic_t refs; unsigned long state; @@ -126,8 +122,6 @@ struct extent_state { struct extent_buffer { u64 start; unsigned long len; - unsigned long map_start; - unsigned long map_len; unsigned long bflags; struct btrfs_fs_info *fs_info; spinlock_t refs_lock; @@ -144,7 +138,9 @@ struct extent_buffer { atomic_t blocking_readers; atomic_t spinning_readers; atomic_t spinning_writers; - int lock_nested; + short lock_nested; + /* >= 0 if eb belongs to a log tree, -1 otherwise */ + short log_index; /* protects write locks */ rwlock_t lock; @@ -286,12 +282,6 @@ static inline unsigned long num_extent_pages(u64 start, u64 len) (start >> PAGE_CACHE_SHIFT); } -static inline struct page *extent_buffer_page(struct extent_buffer *eb, - unsigned long i) -{ - return eb->pages[i]; -} - static inline void extent_buffer_get(struct extent_buffer *eb) { atomic_inc(&eb->refs); @@ -341,18 +331,50 @@ struct bio *btrfs_bio_clone(struct bio *bio, gfp_t gfp_mask); struct btrfs_fs_info; -int repair_io_failure(struct btrfs_fs_info *fs_info, u64 start, - u64 length, u64 logical, struct page *page, - int mirror_num); +int repair_io_failure(struct inode *inode, u64 start, u64 length, u64 logical, + struct page *page, unsigned int pg_offset, + int mirror_num); +int clean_io_failure(struct inode *inode, u64 start, struct page *page, + unsigned int pg_offset); int end_extent_writepage(struct page *page, int err, u64 start, u64 end); int repair_eb_io_failure(struct btrfs_root *root, struct extent_buffer *eb, int mirror_num); + +/* + * When IO fails, either with EIO or csum verification fails, we + * try other mirrors that might have a good copy of the data. This + * io_failure_record is used to record state as we go through all the + * mirrors. If another mirror has good data, the page is set up to date + * and things continue. If a good mirror can't be found, the original + * bio end_io callback is called to indicate things have failed. + */ +struct io_failure_record { + struct page *page; + u64 start; + u64 len; + u64 logical; + unsigned long bio_flags; + int this_mirror; + int failed_mirror; + int in_validation; +}; + +void btrfs_free_io_failure_record(struct inode *inode, u64 start, u64 end); +int btrfs_get_io_failure_record(struct inode *inode, u64 start, u64 end, + struct io_failure_record **failrec_ret); +int btrfs_check_repairable(struct inode *inode, struct bio *failed_bio, + struct io_failure_record *failrec, int fail_mirror); +struct bio *btrfs_create_repair_bio(struct inode *inode, struct bio *failed_bio, + struct io_failure_record *failrec, + struct page *page, int pg_offset, int icsum, + bio_end_io_t *endio_func, void *data); +int free_io_failure(struct inode *inode, struct io_failure_record *rec); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS noinline u64 find_lock_delalloc_range(struct inode *inode, struct extent_io_tree *tree, struct page *locked_page, u64 *start, u64 *end, u64 max_bytes); +#endif struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, unsigned long len); #endif -#endif diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 54c84daec9b5..783a94355efd 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -55,7 +55,7 @@ int btrfs_insert_file_extent(struct btrfs_trans_handle *trans, return -ENOMEM; file_key.objectid = objectid; file_key.offset = pos; - btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY); + file_key.type = BTRFS_EXTENT_DATA_KEY; path->leave_spinning = 1; ret = btrfs_insert_empty_item(trans, root, path, &file_key, @@ -100,7 +100,7 @@ btrfs_lookup_csum(struct btrfs_trans_handle *trans, file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; file_key.offset = bytenr; - btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY); + file_key.type = BTRFS_EXTENT_CSUM_KEY; ret = btrfs_search_slot(trans, root, &file_key, path, 0, cow); if (ret < 0) goto fail; @@ -111,7 +111,7 @@ btrfs_lookup_csum(struct btrfs_trans_handle *trans, goto fail; path->slots[0]--; btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - if (btrfs_key_type(&found_key) != BTRFS_EXTENT_CSUM_KEY) + if (found_key.type != BTRFS_EXTENT_CSUM_KEY) goto fail; csum_offset = (bytenr - found_key.offset) >> @@ -148,7 +148,7 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, file_key.objectid = objectid; file_key.offset = offset; - btrfs_set_key_type(&file_key, BTRFS_EXTENT_DATA_KEY); + file_key.type = BTRFS_EXTENT_DATA_KEY; ret = btrfs_search_slot(trans, root, &file_key, path, ins_len, cow); return ret; } @@ -299,19 +299,9 @@ int btrfs_lookup_bio_sums(struct btrfs_root *root, struct inode *inode, } int btrfs_lookup_bio_sums_dio(struct btrfs_root *root, struct inode *inode, - struct btrfs_dio_private *dip, struct bio *bio, - u64 offset) + struct bio *bio, u64 offset) { - int len = (bio->bi_iter.bi_sector << 9) - dip->disk_bytenr; - u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); - int ret; - - len >>= inode->i_sb->s_blocksize_bits; - len *= csum_size; - - ret = __btrfs_lookup_bio_sums(root, inode, bio, offset, - (u32 *)(dip->csum + len), 1); - return ret; + return __btrfs_lookup_bio_sums(root, inode, bio, offset, NULL, 1); } int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, @@ -329,8 +319,8 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, u64 csum_end; u16 csum_size = btrfs_super_csum_size(root->fs_info->super_copy); - ASSERT(start == ALIGN(start, root->sectorsize) && - (end + 1) == ALIGN(end + 1, root->sectorsize)); + ASSERT(IS_ALIGNED(start, root->sectorsize) && + IS_ALIGNED(end + 1, root->sectorsize)); path = btrfs_alloc_path(); if (!path) @@ -720,7 +710,7 @@ again: bytenr = sums->bytenr + total_bytes; file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; file_key.offset = bytenr; - btrfs_set_key_type(&file_key, BTRFS_EXTENT_CSUM_KEY); + file_key.type = BTRFS_EXTENT_CSUM_KEY; item = btrfs_lookup_csum(trans, root, path, bytenr, 1); if (!IS_ERR(item)) { @@ -790,7 +780,7 @@ again: csum_offset = (bytenr - found_key.offset) >> root->fs_info->sb->s_blocksize_bits; - if (btrfs_key_type(&found_key) != BTRFS_EXTENT_CSUM_KEY || + if (found_key.type != BTRFS_EXTENT_CSUM_KEY || found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || csum_offset >= MAX_CSUM_ITEMS(root, csum_size)) { goto insert; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index ff1cc0399b9a..a18ceabd99a8 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -299,7 +299,7 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, /* get the inode */ key.objectid = defrag->root; - btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); + key.type = BTRFS_ROOT_ITEM_KEY; key.offset = (u64)-1; index = srcu_read_lock(&fs_info->subvol_srcu); @@ -311,7 +311,7 @@ static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, } key.objectid = defrag->ino; - btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); + key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; inode = btrfs_iget(fs_info->sb, &key, inode_root, NULL); if (IS_ERR(inode)) { @@ -452,7 +452,7 @@ static noinline int btrfs_copy_from_user(loff_t pos, int num_pages, if (unlikely(copied == 0)) break; - if (unlikely(copied < PAGE_CACHE_SIZE - offset)) { + if (copied < PAGE_CACHE_SIZE - offset) { offset += copied; } else { pg++; @@ -1481,9 +1481,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, bool force_page_uptodate = false; bool need_unlock; - nrptrs = min((iov_iter_count(i) + PAGE_CACHE_SIZE - 1) / - PAGE_CACHE_SIZE, PAGE_CACHE_SIZE / - (sizeof(struct page *))); + nrptrs = min(DIV_ROUND_UP(iov_iter_count(i), PAGE_CACHE_SIZE), + PAGE_CACHE_SIZE / (sizeof(struct page *))); nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied); nrptrs = max(nrptrs, 8); pages = kmalloc(nrptrs * sizeof(struct page *), GFP_KERNEL); @@ -1497,8 +1496,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, size_t write_bytes = min(iov_iter_count(i), nrptrs * (size_t)PAGE_CACHE_SIZE - offset); - size_t num_pages = (write_bytes + offset + - PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + size_t num_pages = DIV_ROUND_UP(write_bytes + offset, + PAGE_CACHE_SIZE); size_t reserve_bytes; size_t dirty_pages; size_t copied; @@ -1526,9 +1525,8 @@ static noinline ssize_t __btrfs_buffered_write(struct file *file, * our prealloc extent may be smaller than * write_bytes, so scale down. */ - num_pages = (write_bytes + offset + - PAGE_CACHE_SIZE - 1) >> - PAGE_CACHE_SHIFT; + num_pages = DIV_ROUND_UP(write_bytes + offset, + PAGE_CACHE_SIZE); reserve_bytes = num_pages << PAGE_CACHE_SHIFT; ret = 0; } else { @@ -1590,9 +1588,8 @@ again: dirty_pages = 0; } else { force_page_uptodate = false; - dirty_pages = (copied + offset + - PAGE_CACHE_SIZE - 1) >> - PAGE_CACHE_SHIFT; + dirty_pages = DIV_ROUND_UP(copied + offset, + PAGE_CACHE_SIZE); } /* @@ -1653,7 +1650,7 @@ again: cond_resched(); balance_dirty_pages_ratelimited(inode->i_mapping); - if (dirty_pages < (root->leafsize >> PAGE_CACHE_SHIFT) + 1) + if (dirty_pages < (root->nodesize >> PAGE_CACHE_SHIFT) + 1) btrfs_btree_balance_dirty(root); pos += copied; @@ -1795,7 +1792,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, if (sync) atomic_inc(&BTRFS_I(inode)->sync_writers); - if (unlikely(file->f_flags & O_DIRECT)) { + if (file->f_flags & O_DIRECT) { num_written = __btrfs_direct_write(iocb, from, pos); } else { num_written = __btrfs_buffered_write(file, from, pos); @@ -1852,6 +1849,20 @@ int btrfs_release_file(struct inode *inode, struct file *filp) return 0; } +static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end) +{ + int ret; + + atomic_inc(&BTRFS_I(inode)->sync_writers); + ret = filemap_fdatawrite_range(inode->i_mapping, start, end); + if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, + &BTRFS_I(inode)->runtime_flags)) + ret = filemap_fdatawrite_range(inode->i_mapping, start, end); + atomic_dec(&BTRFS_I(inode)->sync_writers); + + return ret; +} + /* * fsync call for both files and directories. This logs the inode into * the tree log instead of forcing full commits whenever possible. @@ -1881,30 +1892,64 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * multi-task, and make the performance up. See * btrfs_wait_ordered_range for an explanation of the ASYNC check. */ - atomic_inc(&BTRFS_I(inode)->sync_writers); - ret = filemap_fdatawrite_range(inode->i_mapping, start, end); - if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, - &BTRFS_I(inode)->runtime_flags)) - ret = filemap_fdatawrite_range(inode->i_mapping, start, end); - atomic_dec(&BTRFS_I(inode)->sync_writers); + ret = start_ordered_ops(inode, start, end); if (ret) return ret; mutex_lock(&inode->i_mutex); - - /* - * We flush the dirty pages again to avoid some dirty pages in the - * range being left. - */ atomic_inc(&root->log_batch); full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); + /* + * We might have have had more pages made dirty after calling + * start_ordered_ops and before acquiring the inode's i_mutex. + */ if (full_sync) { + /* + * For a full sync, we need to make sure any ordered operations + * start and finish before we start logging the inode, so that + * all extents are persisted and the respective file extent + * items are in the fs/subvol btree. + */ ret = btrfs_wait_ordered_range(inode, start, end - start + 1); - if (ret) { - mutex_unlock(&inode->i_mutex); - goto out; - } + } else { + /* + * Start any new ordered operations before starting to log the + * inode. We will wait for them to finish in btrfs_sync_log(). + * + * Right before acquiring the inode's mutex, we might have new + * writes dirtying pages, which won't immediately start the + * respective ordered operations - that is done through the + * fill_delalloc callbacks invoked from the writepage and + * writepages address space operations. So make sure we start + * all ordered operations before starting to log our inode. Not + * doing this means that while logging the inode, writeback + * could start and invoke writepage/writepages, which would call + * the fill_delalloc callbacks (cow_file_range, + * submit_compressed_extents). These callbacks add first an + * extent map to the modified list of extents and then create + * the respective ordered operation, which means in + * tree-log.c:btrfs_log_inode() we might capture all existing + * ordered operations (with btrfs_get_logged_extents()) before + * the fill_delalloc callback adds its ordered operation, and by + * the time we visit the modified list of extent maps (with + * btrfs_log_changed_extents()), we see and process the extent + * map they created. We then use the extent map to construct a + * file extent item for logging without waiting for the + * respective ordered operation to finish - this file extent + * item points to a disk location that might not have yet been + * written to, containing random data - so after a crash a log + * replay will make our inode have file extent items that point + * to disk locations containing invalid data, as we returned + * success to userspace without waiting for the respective + * ordered operation to finish, because it wasn't captured by + * btrfs_get_logged_extents(). + */ + ret = start_ordered_ops(inode, start, end); + } + if (ret) { + mutex_unlock(&inode->i_mutex); + goto out; } atomic_inc(&root->log_batch); @@ -1984,6 +2029,25 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) */ mutex_unlock(&inode->i_mutex); + /* + * If any of the ordered extents had an error, just return it to user + * space, so that the application knows some writes didn't succeed and + * can take proper action (retry for e.g.). Blindly committing the + * transaction in this case, would fool userspace that everything was + * successful. And we also want to make sure our log doesn't contain + * file extent items pointing to extents that weren't fully written to - + * just like in the non fast fsync path, where we check for the ordered + * operation's error flag before writing to the log tree and return -EIO + * if any of them had this flag set (btrfs_wait_ordered_range) - + * therefore we need to check for errors in the ordered operations, + * which are indicated by ctx.io_err. + */ + if (ctx.io_err) { + btrfs_end_transaction(trans, root); + ret = ctx.io_err; + goto out; + } + if (ret != BTRFS_NO_LOG_SYNC) { if (!ret) { ret = btrfs_sync_log(trans, root, &ctx); @@ -2621,23 +2685,28 @@ static int find_desired_extent(struct inode *inode, loff_t *offset, int whence) struct btrfs_root *root = BTRFS_I(inode)->root; struct extent_map *em = NULL; struct extent_state *cached_state = NULL; - u64 lockstart = *offset; - u64 lockend = i_size_read(inode); - u64 start = *offset; - u64 len = i_size_read(inode); + u64 lockstart; + u64 lockend; + u64 start; + u64 len; int ret = 0; - lockend = max_t(u64, root->sectorsize, lockend); + if (inode->i_size == 0) + return -ENXIO; + + /* + * *offset can be negative, in this case we start finding DATA/HOLE from + * the very start of the file. + */ + start = max_t(loff_t, 0, *offset); + + lockstart = round_down(start, root->sectorsize); + lockend = round_up(i_size_read(inode), root->sectorsize); if (lockend <= lockstart) lockend = lockstart + root->sectorsize; - lockend--; len = lockend - lockstart + 1; - len = max_t(u64, len, root->sectorsize); - if (inode->i_size == 0) - return -ENXIO; - lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, 0, &cached_state); diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 2b0a627cb5f9..33848196550e 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -279,8 +279,7 @@ static int io_ctl_init(struct io_ctl *io_ctl, struct inode *inode, int num_pages; int check_crcs = 0; - num_pages = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> - PAGE_CACHE_SHIFT; + num_pages = DIV_ROUND_UP(i_size_read(inode), PAGE_CACHE_SIZE); if (btrfs_ino(inode) != BTRFS_FREE_INO_OBJECTID) check_crcs = 1; @@ -1998,6 +1997,128 @@ static bool try_merge_free_space(struct btrfs_free_space_ctl *ctl, return merged; } +static bool steal_from_bitmap_to_end(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info, + bool update_stat) +{ + struct btrfs_free_space *bitmap; + unsigned long i; + unsigned long j; + const u64 end = info->offset + info->bytes; + const u64 bitmap_offset = offset_to_bitmap(ctl, end); + u64 bytes; + + bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0); + if (!bitmap) + return false; + + i = offset_to_bit(bitmap->offset, ctl->unit, end); + j = find_next_zero_bit(bitmap->bitmap, BITS_PER_BITMAP, i); + if (j == i) + return false; + bytes = (j - i) * ctl->unit; + info->bytes += bytes; + + if (update_stat) + bitmap_clear_bits(ctl, bitmap, end, bytes); + else + __bitmap_clear_bits(ctl, bitmap, end, bytes); + + if (!bitmap->bytes) + free_bitmap(ctl, bitmap); + + return true; +} + +static bool steal_from_bitmap_to_front(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info, + bool update_stat) +{ + struct btrfs_free_space *bitmap; + u64 bitmap_offset; + unsigned long i; + unsigned long j; + unsigned long prev_j; + u64 bytes; + + bitmap_offset = offset_to_bitmap(ctl, info->offset); + /* If we're on a boundary, try the previous logical bitmap. */ + if (bitmap_offset == info->offset) { + if (info->offset == 0) + return false; + bitmap_offset = offset_to_bitmap(ctl, info->offset - 1); + } + + bitmap = tree_search_offset(ctl, bitmap_offset, 1, 0); + if (!bitmap) + return false; + + i = offset_to_bit(bitmap->offset, ctl->unit, info->offset) - 1; + j = 0; + prev_j = (unsigned long)-1; + for_each_clear_bit_from(j, bitmap->bitmap, BITS_PER_BITMAP) { + if (j > i) + break; + prev_j = j; + } + if (prev_j == i) + return false; + + if (prev_j == (unsigned long)-1) + bytes = (i + 1) * ctl->unit; + else + bytes = (i - prev_j) * ctl->unit; + + info->offset -= bytes; + info->bytes += bytes; + + if (update_stat) + bitmap_clear_bits(ctl, bitmap, info->offset, bytes); + else + __bitmap_clear_bits(ctl, bitmap, info->offset, bytes); + + if (!bitmap->bytes) + free_bitmap(ctl, bitmap); + + return true; +} + +/* + * We prefer always to allocate from extent entries, both for clustered and + * non-clustered allocation requests. So when attempting to add a new extent + * entry, try to see if there's adjacent free space in bitmap entries, and if + * there is, migrate that space from the bitmaps to the extent. + * Like this we get better chances of satisfying space allocation requests + * because we attempt to satisfy them based on a single cache entry, and never + * on 2 or more entries - even if the entries represent a contiguous free space + * region (e.g. 1 extent entry + 1 bitmap entry starting where the extent entry + * ends). + */ +static void steal_from_bitmap(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info, + bool update_stat) +{ + /* + * Only work with disconnected entries, as we can change their offset, + * and must be extent entries. + */ + ASSERT(!info->bitmap); + ASSERT(RB_EMPTY_NODE(&info->offset_index)); + + if (ctl->total_bitmaps > 0) { + bool stole_end; + bool stole_front = false; + + stole_end = steal_from_bitmap_to_end(ctl, info, update_stat); + if (ctl->total_bitmaps > 0) + stole_front = steal_from_bitmap_to_front(ctl, info, + update_stat); + + if (stole_end || stole_front) + try_merge_free_space(ctl, info, update_stat); + } +} + int __btrfs_add_free_space(struct btrfs_free_space_ctl *ctl, u64 offset, u64 bytes) { @@ -2010,6 +2131,7 @@ int __btrfs_add_free_space(struct btrfs_free_space_ctl *ctl, info->offset = offset; info->bytes = bytes; + RB_CLEAR_NODE(&info->offset_index); spin_lock(&ctl->tree_lock); @@ -2029,6 +2151,14 @@ int __btrfs_add_free_space(struct btrfs_free_space_ctl *ctl, goto out; } link: + /* + * Only steal free space from adjacent bitmaps if we're sure we're not + * going to add the new free space to existing bitmap entries - because + * that would mean unnecessary work that would be reverted. Therefore + * attempt to steal space from bitmaps if we're adding an extent entry. + */ + steal_from_bitmap(ctl, info, true); + ret = link_free_space(ctl, info); if (ret) kmem_cache_free(btrfs_free_space_cachep, info); @@ -2205,10 +2335,13 @@ __btrfs_return_cluster_to_free_space( entry = rb_entry(node, struct btrfs_free_space, offset_index); node = rb_next(&entry->offset_index); rb_erase(&entry->offset_index, &cluster->root); + RB_CLEAR_NODE(&entry->offset_index); bitmap = (entry->bitmap != NULL); - if (!bitmap) + if (!bitmap) { try_merge_free_space(ctl, entry, false); + steal_from_bitmap(ctl, entry, false); + } tree_insert_offset(&ctl->free_space_offset, entry->offset, &entry->offset_index, bitmap); } @@ -3033,10 +3166,10 @@ struct inode *lookup_free_ino_inode(struct btrfs_root *root, { struct inode *inode = NULL; - spin_lock(&root->cache_lock); - if (root->cache_inode) - inode = igrab(root->cache_inode); - spin_unlock(&root->cache_lock); + spin_lock(&root->ino_cache_lock); + if (root->ino_cache_inode) + inode = igrab(root->ino_cache_inode); + spin_unlock(&root->ino_cache_lock); if (inode) return inode; @@ -3044,10 +3177,10 @@ struct inode *lookup_free_ino_inode(struct btrfs_root *root, if (IS_ERR(inode)) return inode; - spin_lock(&root->cache_lock); + spin_lock(&root->ino_cache_lock); if (!btrfs_fs_closing(root->fs_info)) - root->cache_inode = igrab(inode); - spin_unlock(&root->cache_lock); + root->ino_cache_inode = igrab(inode); + spin_unlock(&root->ino_cache_lock); return inode; } @@ -3176,6 +3309,7 @@ again: map = NULL; add_new_bitmap(ctl, info, offset); bitmap_info = info; + info = NULL; } bytes_added = add_bytes_to_bitmap(ctl, bitmap_info, offset, bytes); @@ -3186,6 +3320,8 @@ again: if (bytes) goto again; + if (info) + kmem_cache_free(btrfs_free_space_cachep, info); if (map) kfree(map); return 0; @@ -3260,6 +3396,7 @@ have_info: goto have_info; } + ret = 0; goto out; } diff --git a/fs/btrfs/hash.c b/fs/btrfs/hash.c index 85889aa82c62..aae520b2aee5 100644 --- a/fs/btrfs/hash.c +++ b/fs/btrfs/hash.c @@ -20,10 +20,8 @@ static struct crypto_shash *tfm; int __init btrfs_hash_init(void) { tfm = crypto_alloc_shash("crc32c", 0, 0); - if (IS_ERR(tfm)) - return PTR_ERR(tfm); - return 0; + return PTR_ERR_OR_ZERO(tfm); } void btrfs_hash_exit(void) @@ -33,18 +31,16 @@ void btrfs_hash_exit(void) u32 btrfs_crc32c(u32 crc, const void *address, unsigned int length) { - struct { - struct shash_desc shash; - char ctx[crypto_shash_descsize(tfm)]; - } desc; + SHASH_DESC_ON_STACK(shash, tfm); + u32 *ctx = (u32 *)shash_desc_ctx(shash); int err; - desc.shash.tfm = tfm; - desc.shash.flags = 0; - *(u32 *)desc.ctx = crc; + shash->tfm = tfm; + shash->flags = 0; + *ctx = crc; - err = crypto_shash_update(&desc.shash, address, length); + err = crypto_shash_update(shash, address, length); BUG_ON(err); - return *(u32 *)desc.ctx; + return *ctx; } diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 2be38df703c9..8ffa4783cbf4 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -135,7 +135,7 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, u32 item_size; key.objectid = inode_objectid; - btrfs_set_key_type(&key, BTRFS_INODE_EXTREF_KEY); + key.type = BTRFS_INODE_EXTREF_KEY; key.offset = btrfs_extref_hash(ref_objectid, name, name_len); path = btrfs_alloc_path(); @@ -209,7 +209,7 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, key.objectid = inode_objectid; key.offset = ref_objectid; - btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY); + key.type = BTRFS_INODE_REF_KEY; path = btrfs_alloc_path(); if (!path) @@ -337,7 +337,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, key.objectid = inode_objectid; key.offset = ref_objectid; - btrfs_set_key_type(&key, BTRFS_INODE_REF_KEY); + key.type = BTRFS_INODE_REF_KEY; path = btrfs_alloc_path(); if (!path) @@ -400,7 +400,7 @@ int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans, struct btrfs_key key; int ret; key.objectid = objectid; - btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); + key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; ret = btrfs_insert_empty_item(trans, root, path, &key, @@ -420,13 +420,13 @@ int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root struct btrfs_key found_key; ret = btrfs_search_slot(trans, root, location, path, ins_len, cow); - if (ret > 0 && btrfs_key_type(location) == BTRFS_ROOT_ITEM_KEY && + if (ret > 0 && location->type == BTRFS_ROOT_ITEM_KEY && location->offset == (u64)-1 && path->slots[0] != 0) { slot = path->slots[0] - 1; leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &found_key, slot); if (found_key.objectid == location->objectid && - btrfs_key_type(&found_key) == btrfs_key_type(location)) { + found_key.type == location->type) { path->slots[0]--; return 0; } diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 888fbe19079f..83d646bd2e4b 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -87,7 +87,7 @@ again: */ btrfs_item_key_to_cpu(leaf, &key, 0); btrfs_release_path(path); - root->cache_progress = last; + root->ino_cache_progress = last; up_read(&fs_info->commit_root_sem); schedule_timeout(1); goto again; @@ -106,7 +106,7 @@ again: if (last != (u64)-1 && last + 1 != key.objectid) { __btrfs_add_free_space(ctl, last + 1, key.objectid - last - 1); - wake_up(&root->cache_wait); + wake_up(&root->ino_cache_wait); } last = key.objectid; @@ -119,14 +119,14 @@ next: root->highest_objectid - last - 1); } - spin_lock(&root->cache_lock); - root->cached = BTRFS_CACHE_FINISHED; - spin_unlock(&root->cache_lock); + spin_lock(&root->ino_cache_lock); + root->ino_cache_state = BTRFS_CACHE_FINISHED; + spin_unlock(&root->ino_cache_lock); - root->cache_progress = (u64)-1; + root->ino_cache_progress = (u64)-1; btrfs_unpin_free_ino(root); out: - wake_up(&root->cache_wait); + wake_up(&root->ino_cache_wait); up_read(&fs_info->commit_root_sem); btrfs_free_path(path); @@ -144,20 +144,20 @@ static void start_caching(struct btrfs_root *root) if (!btrfs_test_opt(root, INODE_MAP_CACHE)) return; - spin_lock(&root->cache_lock); - if (root->cached != BTRFS_CACHE_NO) { - spin_unlock(&root->cache_lock); + spin_lock(&root->ino_cache_lock); + if (root->ino_cache_state != BTRFS_CACHE_NO) { + spin_unlock(&root->ino_cache_lock); return; } - root->cached = BTRFS_CACHE_STARTED; - spin_unlock(&root->cache_lock); + root->ino_cache_state = BTRFS_CACHE_STARTED; + spin_unlock(&root->ino_cache_lock); ret = load_free_ino_cache(root->fs_info, root); if (ret == 1) { - spin_lock(&root->cache_lock); - root->cached = BTRFS_CACHE_FINISHED; - spin_unlock(&root->cache_lock); + spin_lock(&root->ino_cache_lock); + root->ino_cache_state = BTRFS_CACHE_FINISHED; + spin_unlock(&root->ino_cache_lock); return; } @@ -196,11 +196,11 @@ again: start_caching(root); - wait_event(root->cache_wait, - root->cached == BTRFS_CACHE_FINISHED || + wait_event(root->ino_cache_wait, + root->ino_cache_state == BTRFS_CACHE_FINISHED || root->free_ino_ctl->free_space > 0); - if (root->cached == BTRFS_CACHE_FINISHED && + if (root->ino_cache_state == BTRFS_CACHE_FINISHED && root->free_ino_ctl->free_space == 0) return -ENOSPC; else @@ -214,17 +214,17 @@ void btrfs_return_ino(struct btrfs_root *root, u64 objectid) if (!btrfs_test_opt(root, INODE_MAP_CACHE)) return; again: - if (root->cached == BTRFS_CACHE_FINISHED) { + if (root->ino_cache_state == BTRFS_CACHE_FINISHED) { __btrfs_add_free_space(pinned, objectid, 1); } else { down_write(&root->fs_info->commit_root_sem); - spin_lock(&root->cache_lock); - if (root->cached == BTRFS_CACHE_FINISHED) { - spin_unlock(&root->cache_lock); + spin_lock(&root->ino_cache_lock); + if (root->ino_cache_state == BTRFS_CACHE_FINISHED) { + spin_unlock(&root->ino_cache_lock); up_write(&root->fs_info->commit_root_sem); goto again; } - spin_unlock(&root->cache_lock); + spin_unlock(&root->ino_cache_lock); start_caching(root); @@ -235,10 +235,10 @@ again: } /* - * When a transaction is committed, we'll move those inode numbers which - * are smaller than root->cache_progress from pinned tree to free_ino tree, - * and others will just be dropped, because the commit root we were - * searching has changed. + * When a transaction is committed, we'll move those inode numbers which are + * smaller than root->ino_cache_progress from pinned tree to free_ino tree, and + * others will just be dropped, because the commit root we were searching has + * changed. * * Must be called with root->fs_info->commit_root_sem held */ @@ -261,10 +261,10 @@ void btrfs_unpin_free_ino(struct btrfs_root *root) info = rb_entry(n, struct btrfs_free_space, offset_index); BUG_ON(info->bitmap); /* Logic error */ - if (info->offset > root->cache_progress) + if (info->offset > root->ino_cache_progress) goto free; - else if (info->offset + info->bytes > root->cache_progress) - count = root->cache_progress - info->offset + 1; + else if (info->offset + info->bytes > root->ino_cache_progress) + count = root->ino_cache_progress - info->offset + 1; else count = info->bytes; @@ -462,13 +462,13 @@ again: } } - spin_lock(&root->cache_lock); - if (root->cached != BTRFS_CACHE_FINISHED) { + spin_lock(&root->ino_cache_lock); + if (root->ino_cache_state != BTRFS_CACHE_FINISHED) { ret = -1; - spin_unlock(&root->cache_lock); + spin_unlock(&root->ino_cache_lock); goto out_put; } - spin_unlock(&root->cache_lock); + spin_unlock(&root->ino_cache_lock); spin_lock(&ctl->tree_lock); prealloc = sizeof(struct btrfs_free_space) * ctl->free_extents; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 016c403bfe7e..d23362f4464e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -153,7 +153,7 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, key.objectid = btrfs_ino(inode); key.offset = start; - btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY); + key.type = BTRFS_EXTENT_DATA_KEY; datasize = btrfs_file_extent_calc_inline_size(cur_size); path->leave_spinning = 1; @@ -249,8 +249,8 @@ static noinline int cow_file_range_inline(struct btrfs_root *root, data_len = compressed_size; if (start > 0 || - actual_end >= PAGE_CACHE_SIZE || - data_len >= BTRFS_MAX_INLINE_DATA_SIZE(root) || + actual_end > PAGE_CACHE_SIZE || + data_len > BTRFS_MAX_INLINE_DATA_SIZE(root) || (!compressed_size && (actual_end & (root->sectorsize - 1)) == 0) || end + 1 < isize || @@ -348,6 +348,23 @@ static noinline int add_async_extent(struct async_cow *cow, return 0; } +static inline int inode_need_compress(struct inode *inode) +{ + struct btrfs_root *root = BTRFS_I(inode)->root; + + /* force compress */ + if (btrfs_test_opt(root, FORCE_COMPRESS)) + return 1; + /* bad compression ratios */ + if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) + return 0; + if (btrfs_test_opt(root, COMPRESS) || + BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS || + BTRFS_I(inode)->force_compress) + return 1; + return 0; +} + /* * we create compressed extents in two phases. The first * phase compresses a range of pages that have already been @@ -444,10 +461,7 @@ again: * inode has not been flagged as nocompress. This flag can * change at any time if we discover bad compression ratios. */ - if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) && - (btrfs_test_opt(root, COMPRESS) || - (BTRFS_I(inode)->force_compress) || - (BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS))) { + if (inode_need_compress(inode)) { WARN_ON(pages); pages = kzalloc(sizeof(struct page *) * nr_pages, GFP_NOFS); if (!pages) { @@ -1094,7 +1108,8 @@ static int cow_file_range_async(struct inode *inode, struct page *locked_page, async_cow->locked_page = locked_page; async_cow->start = start; - if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS) + if (BTRFS_I(inode)->flags & BTRFS_INODE_NOCOMPRESS && + !btrfs_test_opt(root, FORCE_COMPRESS)) cur_end = end; else cur_end = min(end, start + 512 * 1024 - 1); @@ -1445,6 +1460,26 @@ error: return ret; } +static inline int need_force_cow(struct inode *inode, u64 start, u64 end) +{ + + if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && + !(BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC)) + return 0; + + /* + * @defrag_bytes is a hint value, no spinlock held here, + * if is not zero, it means the file is defragging. + * Force cow if given extent needs to be defragged. + */ + if (BTRFS_I(inode)->defrag_bytes && + test_range_bit(&BTRFS_I(inode)->io_tree, start, end, + EXTENT_DEFRAG, 0, NULL)) + return 1; + + return 0; +} + /* * extent_io.c call back to do delayed allocation processing */ @@ -1453,17 +1488,15 @@ static int run_delalloc_range(struct inode *inode, struct page *locked_page, unsigned long *nr_written) { int ret; - struct btrfs_root *root = BTRFS_I(inode)->root; + int force_cow = need_force_cow(inode, start, end); - if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) { + if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW && !force_cow) { ret = run_delalloc_nocow(inode, locked_page, start, end, page_started, 1, nr_written); - } else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC) { + } else if (BTRFS_I(inode)->flags & BTRFS_INODE_PREALLOC && !force_cow) { ret = run_delalloc_nocow(inode, locked_page, start, end, page_started, 0, nr_written); - } else if (!btrfs_test_opt(root, COMPRESS) && - !(BTRFS_I(inode)->force_compress) && - !(BTRFS_I(inode)->flags & BTRFS_INODE_COMPRESS)) { + } else if (!inode_need_compress(inode)) { ret = cow_file_range(inode, locked_page, start, end, page_started, nr_written, 1); } else { @@ -1555,6 +1588,8 @@ static void btrfs_set_bit_hook(struct inode *inode, struct extent_state *state, unsigned long *bits) { + if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC)) + WARN_ON(1); /* * set_bit and clear bit hooks normally require _irqsave/restore * but in this case, we are only testing for the DELALLOC @@ -1577,6 +1612,8 @@ static void btrfs_set_bit_hook(struct inode *inode, root->fs_info->delalloc_batch); spin_lock(&BTRFS_I(inode)->lock); BTRFS_I(inode)->delalloc_bytes += len; + if (*bits & EXTENT_DEFRAG) + BTRFS_I(inode)->defrag_bytes += len; if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST, &BTRFS_I(inode)->runtime_flags)) btrfs_add_delalloc_inodes(root, inode); @@ -1591,6 +1628,13 @@ static void btrfs_clear_bit_hook(struct inode *inode, struct extent_state *state, unsigned long *bits) { + u64 len = state->end + 1 - state->start; + + spin_lock(&BTRFS_I(inode)->lock); + if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG)) + BTRFS_I(inode)->defrag_bytes -= len; + spin_unlock(&BTRFS_I(inode)->lock); + /* * set_bit and clear bit hooks normally require _irqsave/restore * but in this case, we are only testing for the DELALLOC @@ -1598,7 +1642,6 @@ static void btrfs_clear_bit_hook(struct inode *inode, */ if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) { struct btrfs_root *root = BTRFS_I(inode)->root; - u64 len = state->end + 1 - state->start; bool do_list = !btrfs_is_free_space_inode(inode); if (*bits & EXTENT_FIRST_DELALLOC) { @@ -2660,6 +2703,10 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) goto out; } + btrfs_free_io_failure_record(inode, ordered_extent->file_offset, + ordered_extent->file_offset + + ordered_extent->len - 1); + if (test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) { truncated = true; logical_len = ordered_extent->truncated_len; @@ -2856,6 +2903,40 @@ static int btrfs_writepage_end_io_hook(struct page *page, u64 start, u64 end, return 0; } +static int __readpage_endio_check(struct inode *inode, + struct btrfs_io_bio *io_bio, + int icsum, struct page *page, + int pgoff, u64 start, size_t len) +{ + char *kaddr; + u32 csum_expected; + u32 csum = ~(u32)0; + static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + + csum_expected = *(((u32 *)io_bio->csum) + icsum); + + kaddr = kmap_atomic(page); + csum = btrfs_csum_data(kaddr + pgoff, csum, len); + btrfs_csum_final(csum, (char *)&csum); + if (csum != csum_expected) + goto zeroit; + + kunmap_atomic(kaddr); + return 0; +zeroit: + if (__ratelimit(&_rs)) + btrfs_info(BTRFS_I(inode)->root->fs_info, + "csum failed ino %llu off %llu csum %u expected csum %u", + btrfs_ino(inode), start, csum, csum_expected); + memset(kaddr + pgoff, 1, len); + flush_dcache_page(page); + kunmap_atomic(kaddr); + if (csum_expected == 0) + return 0; + return -EIO; +} + /* * when reads are done, we need to check csums to verify the data is correct * if there's a match, we allow the bio to finish. If not, the code in @@ -2868,20 +2949,15 @@ static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio, size_t offset = start - page_offset(page); struct inode *inode = page->mapping->host; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; - char *kaddr; struct btrfs_root *root = BTRFS_I(inode)->root; - u32 csum_expected; - u32 csum = ~(u32)0; - static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, - DEFAULT_RATELIMIT_BURST); if (PageChecked(page)) { ClearPageChecked(page); - goto good; + return 0; } if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) - goto good; + return 0; if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID && test_range_bit(io_tree, start, end, EXTENT_NODATASUM, 1, NULL)) { @@ -2891,28 +2967,8 @@ static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio, } phy_offset >>= inode->i_sb->s_blocksize_bits; - csum_expected = *(((u32 *)io_bio->csum) + phy_offset); - - kaddr = kmap_atomic(page); - csum = btrfs_csum_data(kaddr + offset, csum, end - start + 1); - btrfs_csum_final(csum, (char *)&csum); - if (csum != csum_expected) - goto zeroit; - - kunmap_atomic(kaddr); -good: - return 0; - -zeroit: - if (__ratelimit(&_rs)) - btrfs_info(root->fs_info, "csum failed ino %llu off %llu csum %u expected csum %u", - btrfs_ino(page->mapping->host), start, csum, csum_expected); - memset(kaddr + offset, 1, end - start + 1); - flush_dcache_page(page); - kunmap_atomic(kaddr); - if (csum_expected == 0) - return 0; - return -EIO; + return __readpage_endio_check(inode, io_bio, phy_offset, page, offset, + start, (size_t)(end - start + 1)); } struct delayed_iput { @@ -3159,7 +3215,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) path->reada = -1; key.objectid = BTRFS_ORPHAN_OBJECTID; - btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); + key.type = BTRFS_ORPHAN_ITEM_KEY; key.offset = (u64)-1; while (1) { @@ -3186,7 +3242,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) /* make sure the item matches what we want */ if (found_key.objectid != BTRFS_ORPHAN_OBJECTID) break; - if (btrfs_key_type(&found_key) != BTRFS_ORPHAN_ITEM_KEY) + if (found_key.type != BTRFS_ORPHAN_ITEM_KEY) break; /* release the path since we're done with it */ @@ -3662,7 +3718,8 @@ noinline int btrfs_update_inode(struct btrfs_trans_handle *trans, * without delay */ if (!btrfs_is_free_space_inode(inode) - && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID) { + && root->root_key.objectid != BTRFS_DATA_RELOC_TREE_OBJECTID + && !root->fs_info->log_root_recovering) { btrfs_update_root_times(trans, root); ret = btrfs_delayed_update_inode(trans, root, inode); @@ -4085,7 +4142,7 @@ search_again: fi = NULL; leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - found_type = btrfs_key_type(&found_key); + found_type = found_key.type; if (found_key.objectid != ino) break; @@ -4747,6 +4804,8 @@ void btrfs_evict_inode(struct inode *inode) /* do we really want it for ->i_nlink > 0 and zero btrfs_root_refs? */ btrfs_wait_ordered_range(inode, 0, (u64)-1); + btrfs_free_io_failure_record(inode, 0, (u64)-1); + if (root->fs_info->log_root_recovering) { BUG_ON(test_bit(BTRFS_INODE_HAS_ORPHAN_ITEM, &BTRFS_I(inode)->runtime_flags)); @@ -5202,42 +5261,6 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) iput(inode); inode = ERR_PTR(ret); } - /* - * If orphan cleanup did remove any orphans, it means the tree - * was modified and therefore the commit root is not the same as - * the current root anymore. This is a problem, because send - * uses the commit root and therefore can see inode items that - * don't exist in the current root anymore, and for example make - * calls to btrfs_iget, which will do tree lookups based on the - * current root and not on the commit root. Those lookups will - * fail, returning a -ESTALE error, and making send fail with - * that error. So make sure a send does not see any orphans we - * have just removed, and that it will see the same inodes - * regardless of whether a transaction commit happened before - * it started (meaning that the commit root will be the same as - * the current root) or not. - */ - if (sub_root->node != sub_root->commit_root) { - u64 sub_flags = btrfs_root_flags(&sub_root->root_item); - - if (sub_flags & BTRFS_ROOT_SUBVOL_RDONLY) { - struct extent_buffer *eb; - - /* - * Assert we can't have races between dentry - * lookup called through the snapshot creation - * ioctl and the VFS. - */ - ASSERT(mutex_is_locked(&dir->i_mutex)); - - down_write(&root->fs_info->commit_root_sem); - eb = sub_root->commit_root; - sub_root->commit_root = - btrfs_root_node(sub_root); - up_write(&root->fs_info->commit_root_sem); - free_extent_buffer(eb); - } - } } return inode; @@ -5331,7 +5354,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) btrfs_get_delayed_items(inode, &ins_list, &del_list); } - btrfs_set_key_type(&key, key_type); + key.type = key_type; key.offset = ctx->pos; key.objectid = btrfs_ino(inode); @@ -5356,7 +5379,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) if (found_key.objectid != key.objectid) break; - if (btrfs_key_type(&found_key) != key_type) + if (found_key.type != key_type) break; if (found_key.offset < ctx->pos) goto next; @@ -5568,7 +5591,7 @@ static int btrfs_set_inode_index_count(struct inode *inode) int ret; key.objectid = btrfs_ino(inode); - btrfs_set_key_type(&key, BTRFS_DIR_INDEX_KEY); + key.type = BTRFS_DIR_INDEX_KEY; key.offset = (u64)-1; path = btrfs_alloc_path(); @@ -5600,7 +5623,7 @@ static int btrfs_set_inode_index_count(struct inode *inode) btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); if (found_key.objectid != btrfs_ino(inode) || - btrfs_key_type(&found_key) != BTRFS_DIR_INDEX_KEY) { + found_key.type != BTRFS_DIR_INDEX_KEY) { BTRFS_I(inode)->index_cnt = 2; goto out; } @@ -5718,7 +5741,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); key[0].objectid = objectid; - btrfs_set_key_type(&key[0], BTRFS_INODE_ITEM_KEY); + key[0].type = BTRFS_INODE_ITEM_KEY; key[0].offset = 0; sizes[0] = sizeof(struct btrfs_inode_item); @@ -5731,7 +5754,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, * add more hard links than can fit in the ref item. */ key[1].objectid = objectid; - btrfs_set_key_type(&key[1], BTRFS_INODE_REF_KEY); + key[1].type = BTRFS_INODE_REF_KEY; key[1].offset = ref_objectid; sizes[1] = name_len + sizeof(*ref); @@ -5740,7 +5763,7 @@ static struct inode *btrfs_new_inode(struct btrfs_trans_handle *trans, location = &BTRFS_I(inode)->location; location->objectid = objectid; location->offset = 0; - btrfs_set_key_type(location, BTRFS_INODE_ITEM_KEY); + location->type = BTRFS_INODE_ITEM_KEY; ret = btrfs_insert_inode_locked(inode); if (ret < 0) @@ -5832,7 +5855,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, memcpy(&key, &BTRFS_I(inode)->root->root_key, sizeof(key)); } else { key.objectid = ino; - btrfs_set_key_type(&key, BTRFS_INODE_ITEM_KEY); + key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; } @@ -6191,21 +6214,60 @@ out_fail_inode: goto out_fail; } +/* Find next extent map of a given extent map, caller needs to ensure locks */ +static struct extent_map *next_extent_map(struct extent_map *em) +{ + struct rb_node *next; + + next = rb_next(&em->rb_node); + if (!next) + return NULL; + return container_of(next, struct extent_map, rb_node); +} + +static struct extent_map *prev_extent_map(struct extent_map *em) +{ + struct rb_node *prev; + + prev = rb_prev(&em->rb_node); + if (!prev) + return NULL; + return container_of(prev, struct extent_map, rb_node); +} + /* helper for btfs_get_extent. Given an existing extent in the tree, + * the existing extent is the nearest extent to map_start, * and an extent that you want to insert, deal with overlap and insert - * the new extent into the tree. + * the best fitted new extent into the tree. */ static int merge_extent_mapping(struct extent_map_tree *em_tree, struct extent_map *existing, struct extent_map *em, u64 map_start) { + struct extent_map *prev; + struct extent_map *next; + u64 start; + u64 end; u64 start_diff; BUG_ON(map_start < em->start || map_start >= extent_map_end(em)); - start_diff = map_start - em->start; - em->start = map_start; - em->len = existing->start - em->start; + + if (existing->start > map_start) { + next = existing; + prev = prev_extent_map(next); + } else { + prev = existing; + next = next_extent_map(prev); + } + + start = prev ? extent_map_end(prev) : em->start; + start = max_t(u64, start, em->start); + end = next ? next->start : extent_map_end(em); + end = min_t(u64, end, extent_map_end(em)); + start_diff = start - em->start; + em->start = start; + em->len = end - start; if (em->block_start < EXTENT_MAP_LAST_BYTE && !test_bit(EXTENT_FLAG_COMPRESSED, &em->flags)) { em->block_start += start_diff; @@ -6333,7 +6395,7 @@ again: struct btrfs_file_extent_item); /* are we inside the extent that was found? */ btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); - found_type = btrfs_key_type(&found_key); + found_type = found_key.type; if (found_key.objectid != objectid || found_type != BTRFS_EXTENT_DATA_KEY) { /* @@ -6482,25 +6544,21 @@ insert: ret = 0; - existing = lookup_extent_mapping(em_tree, start, len); - if (existing && (existing->start > start || - existing->start + existing->len <= start)) { + existing = search_extent_mapping(em_tree, start, len); + /* + * existing will always be non-NULL, since there must be + * extent causing the -EEXIST. + */ + if (start >= extent_map_end(existing) || + start <= existing->start) { + /* + * The existing extent map is the one nearest to + * the [start, start + len) range which overlaps + */ + err = merge_extent_mapping(em_tree, existing, + em, start); free_extent_map(existing); - existing = NULL; - } - if (!existing) { - existing = lookup_extent_mapping(em_tree, em->start, - em->len); - if (existing) { - err = merge_extent_mapping(em_tree, existing, - em, start); - free_extent_map(existing); - if (err) { - free_extent_map(em); - em = NULL; - } - } else { - err = -EIO; + if (err) { free_extent_map(em); em = NULL; } @@ -7112,8 +7170,10 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, block_start, len, orig_block_len, ram_bytes, type); - if (IS_ERR(em)) + if (IS_ERR(em)) { + ret = PTR_ERR(em); goto unlock_err; + } } ret = btrfs_add_ordered_extent_dio(inode, start, @@ -7188,45 +7248,277 @@ unlock_err: return ret; } -static void btrfs_endio_direct_read(struct bio *bio, int err) +static inline int submit_dio_repair_bio(struct inode *inode, struct bio *bio, + int rw, int mirror_num) { - struct btrfs_dio_private *dip = bio->bi_private; - struct bio_vec *bvec; - struct inode *inode = dip->inode; struct btrfs_root *root = BTRFS_I(inode)->root; - struct bio *dio_bio; - u32 *csums = (u32 *)dip->csum; + int ret; + + BUG_ON(rw & REQ_WRITE); + + bio_get(bio); + + ret = btrfs_bio_wq_end_io(root->fs_info, bio, + BTRFS_WQ_ENDIO_DIO_REPAIR); + if (ret) + goto err; + + ret = btrfs_map_bio(root, rw, bio, mirror_num, 0); +err: + bio_put(bio); + return ret; +} + +static int btrfs_check_dio_repairable(struct inode *inode, + struct bio *failed_bio, + struct io_failure_record *failrec, + int failed_mirror) +{ + int num_copies; + + num_copies = btrfs_num_copies(BTRFS_I(inode)->root->fs_info, + failrec->logical, failrec->len); + if (num_copies == 1) { + /* + * we only have a single copy of the data, so don't bother with + * all the retry and error correction code that follows. no + * matter what the error is, it is very likely to persist. + */ + pr_debug("Check DIO Repairable: cannot repair, num_copies=%d, next_mirror %d, failed_mirror %d\n", + num_copies, failrec->this_mirror, failed_mirror); + return 0; + } + + failrec->failed_mirror = failed_mirror; + failrec->this_mirror++; + if (failrec->this_mirror == failed_mirror) + failrec->this_mirror++; + + if (failrec->this_mirror > num_copies) { + pr_debug("Check DIO Repairable: (fail) num_copies=%d, next_mirror %d, failed_mirror %d\n", + num_copies, failrec->this_mirror, failed_mirror); + return 0; + } + + return 1; +} + +static int dio_read_error(struct inode *inode, struct bio *failed_bio, + struct page *page, u64 start, u64 end, + int failed_mirror, bio_end_io_t *repair_endio, + void *repair_arg) +{ + struct io_failure_record *failrec; + struct bio *bio; + int isector; + int read_mode; + int ret; + + BUG_ON(failed_bio->bi_rw & REQ_WRITE); + + ret = btrfs_get_io_failure_record(inode, start, end, &failrec); + if (ret) + return ret; + + ret = btrfs_check_dio_repairable(inode, failed_bio, failrec, + failed_mirror); + if (!ret) { + free_io_failure(inode, failrec); + return -EIO; + } + + if (failed_bio->bi_vcnt > 1) + read_mode = READ_SYNC | REQ_FAILFAST_DEV; + else + read_mode = READ_SYNC; + + isector = start - btrfs_io_bio(failed_bio)->logical; + isector >>= inode->i_sb->s_blocksize_bits; + bio = btrfs_create_repair_bio(inode, failed_bio, failrec, page, + 0, isector, repair_endio, repair_arg); + if (!bio) { + free_io_failure(inode, failrec); + return -EIO; + } + + btrfs_debug(BTRFS_I(inode)->root->fs_info, + "Repair DIO Read Error: submitting new dio read[%#x] to this_mirror=%d, in_validation=%d\n", + read_mode, failrec->this_mirror, failrec->in_validation); + + ret = submit_dio_repair_bio(inode, bio, read_mode, + failrec->this_mirror); + if (ret) { + free_io_failure(inode, failrec); + bio_put(bio); + } + + return ret; +} + +struct btrfs_retry_complete { + struct completion done; + struct inode *inode; + u64 start; + int uptodate; +}; + +static void btrfs_retry_endio_nocsum(struct bio *bio, int err) +{ + struct btrfs_retry_complete *done = bio->bi_private; + struct bio_vec *bvec; + int i; + + if (err) + goto end; + + done->uptodate = 1; + bio_for_each_segment_all(bvec, bio, i) + clean_io_failure(done->inode, done->start, bvec->bv_page, 0); +end: + complete(&done->done); + bio_put(bio); +} + +static int __btrfs_correct_data_nocsum(struct inode *inode, + struct btrfs_io_bio *io_bio) +{ + struct bio_vec *bvec; + struct btrfs_retry_complete done; u64 start; int i; + int ret; + + start = io_bio->logical; + done.inode = inode; + + bio_for_each_segment_all(bvec, &io_bio->bio, i) { +try_again: + done.uptodate = 0; + done.start = start; + init_completion(&done.done); + + ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, start, + start + bvec->bv_len - 1, + io_bio->mirror_num, + btrfs_retry_endio_nocsum, &done); + if (ret) + return ret; + + wait_for_completion(&done.done); + + if (!done.uptodate) { + /* We might have another mirror, so try again */ + goto try_again; + } + + start += bvec->bv_len; + } - start = dip->logical_offset; + return 0; +} + +static void btrfs_retry_endio(struct bio *bio, int err) +{ + struct btrfs_retry_complete *done = bio->bi_private; + struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); + struct bio_vec *bvec; + int uptodate; + int ret; + int i; + + if (err) + goto end; + + uptodate = 1; bio_for_each_segment_all(bvec, bio, i) { - if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { - struct page *page = bvec->bv_page; - char *kaddr; - u32 csum = ~(u32)0; - unsigned long flags; - - local_irq_save(flags); - kaddr = kmap_atomic(page); - csum = btrfs_csum_data(kaddr + bvec->bv_offset, - csum, bvec->bv_len); - btrfs_csum_final(csum, (char *)&csum); - kunmap_atomic(kaddr); - local_irq_restore(flags); - - flush_dcache_page(bvec->bv_page); - if (csum != csums[i]) { - btrfs_err(root->fs_info, "csum failed ino %llu off %llu csum %u expected csum %u", - btrfs_ino(inode), start, csum, - csums[i]); - err = -EIO; - } + ret = __readpage_endio_check(done->inode, io_bio, i, + bvec->bv_page, 0, + done->start, bvec->bv_len); + if (!ret) + clean_io_failure(done->inode, done->start, + bvec->bv_page, 0); + else + uptodate = 0; + } + + done->uptodate = uptodate; +end: + complete(&done->done); + bio_put(bio); +} + +static int __btrfs_subio_endio_read(struct inode *inode, + struct btrfs_io_bio *io_bio, int err) +{ + struct bio_vec *bvec; + struct btrfs_retry_complete done; + u64 start; + u64 offset = 0; + int i; + int ret; + + err = 0; + start = io_bio->logical; + done.inode = inode; + + bio_for_each_segment_all(bvec, &io_bio->bio, i) { + ret = __readpage_endio_check(inode, io_bio, i, bvec->bv_page, + 0, start, bvec->bv_len); + if (likely(!ret)) + goto next; +try_again: + done.uptodate = 0; + done.start = start; + init_completion(&done.done); + + ret = dio_read_error(inode, &io_bio->bio, bvec->bv_page, start, + start + bvec->bv_len - 1, + io_bio->mirror_num, + btrfs_retry_endio, &done); + if (ret) { + err = ret; + goto next; } + wait_for_completion(&done.done); + + if (!done.uptodate) { + /* We might have another mirror, so try again */ + goto try_again; + } +next: + offset += bvec->bv_len; start += bvec->bv_len; } + return err; +} + +static int btrfs_subio_endio_read(struct inode *inode, + struct btrfs_io_bio *io_bio, int err) +{ + bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; + + if (skip_csum) { + if (unlikely(err)) + return __btrfs_correct_data_nocsum(inode, io_bio); + else + return 0; + } else { + return __btrfs_subio_endio_read(inode, io_bio, err); + } +} + +static void btrfs_endio_direct_read(struct bio *bio, int err) +{ + struct btrfs_dio_private *dip = bio->bi_private; + struct inode *inode = dip->inode; + struct bio *dio_bio; + struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); + + if (dip->flags & BTRFS_DIO_ORIG_BIO_SUBMITTED) + err = btrfs_subio_endio_read(inode, io_bio, err); + unlock_extent(&BTRFS_I(inode)->io_tree, dip->logical_offset, dip->logical_offset + dip->bytes - 1); dio_bio = dip->dio_bio; @@ -7237,6 +7529,9 @@ static void btrfs_endio_direct_read(struct bio *bio, int err) if (err) clear_bit(BIO_UPTODATE, &dio_bio->bi_flags); dio_end_io(dio_bio, err); + + if (io_bio->end_io) + io_bio->end_io(io_bio, err); bio_put(bio); } @@ -7302,12 +7597,17 @@ static void btrfs_end_dio_bio(struct bio *bio, int err) { struct btrfs_dio_private *dip = bio->bi_private; + if (err) + btrfs_warn(BTRFS_I(dip->inode)->root->fs_info, + "direct IO failed ino %llu rw %lu sector %#Lx len %u err no %d", + btrfs_ino(dip->inode), bio->bi_rw, + (unsigned long long)bio->bi_iter.bi_sector, + bio->bi_iter.bi_size, err); + + if (dip->subio_endio) + err = dip->subio_endio(dip->inode, btrfs_io_bio(bio), err); + if (err) { - btrfs_err(BTRFS_I(dip->inode)->root->fs_info, - "direct IO failed ino %llu rw %lu sector %#Lx len %u err no %d", - btrfs_ino(dip->inode), bio->bi_rw, - (unsigned long long)bio->bi_iter.bi_sector, - bio->bi_iter.bi_size, err); dip->errors = 1; /* @@ -7338,6 +7638,38 @@ static struct bio *btrfs_dio_bio_alloc(struct block_device *bdev, return btrfs_bio_alloc(bdev, first_sector, nr_vecs, gfp_flags); } +static inline int btrfs_lookup_and_bind_dio_csum(struct btrfs_root *root, + struct inode *inode, + struct btrfs_dio_private *dip, + struct bio *bio, + u64 file_offset) +{ + struct btrfs_io_bio *io_bio = btrfs_io_bio(bio); + struct btrfs_io_bio *orig_io_bio = btrfs_io_bio(dip->orig_bio); + int ret; + + /* + * We load all the csum data we need when we submit + * the first bio to reduce the csum tree search and + * contention. + */ + if (dip->logical_offset == file_offset) { + ret = btrfs_lookup_bio_sums_dio(root, inode, dip->orig_bio, + file_offset); + if (ret) + return ret; + } + + if (bio == dip->orig_bio) + return 0; + + file_offset -= dip->logical_offset; + file_offset >>= inode->i_sb->s_blocksize_bits; + io_bio->csum = (u8 *)(((u32 *)orig_io_bio->csum) + file_offset); + + return 0; +} + static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, int rw, u64 file_offset, int skip_sum, int async_submit) @@ -7353,7 +7685,8 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, bio_get(bio); if (!write) { - ret = btrfs_bio_wq_end_io(root->fs_info, bio, 0); + ret = btrfs_bio_wq_end_io(root->fs_info, bio, + BTRFS_WQ_ENDIO_DATA); if (ret) goto err; } @@ -7376,13 +7709,12 @@ static inline int __btrfs_submit_dio_bio(struct bio *bio, struct inode *inode, ret = btrfs_csum_one_bio(root, inode, bio, file_offset, 1); if (ret) goto err; - } else if (!skip_sum) { - ret = btrfs_lookup_bio_sums_dio(root, inode, dip, bio, - file_offset); + } else { + ret = btrfs_lookup_and_bind_dio_csum(root, inode, dip, bio, + file_offset); if (ret) goto err; } - map: ret = btrfs_map_bio(root, rw, bio, 0, async_submit); err: @@ -7403,7 +7735,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, u64 submit_len = 0; u64 map_length; int nr_pages = 0; - int ret = 0; + int ret; int async_submit = 0; map_length = orig_bio->bi_iter.bi_size; @@ -7414,6 +7746,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, if (map_length >= orig_bio->bi_iter.bi_size) { bio = orig_bio; + dip->flags |= BTRFS_DIO_ORIG_BIO_SUBMITTED; goto submit; } @@ -7430,12 +7763,13 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, bio->bi_private = dip; bio->bi_end_io = btrfs_end_dio_bio; + btrfs_io_bio(bio)->logical = file_offset; atomic_inc(&dip->pending_bios); while (bvec <= (orig_bio->bi_io_vec + orig_bio->bi_vcnt - 1)) { - if (unlikely(map_length < submit_len + bvec->bv_len || + if (map_length < submit_len + bvec->bv_len || bio_add_page(bio, bvec->bv_page, bvec->bv_len, - bvec->bv_offset) < bvec->bv_len)) { + bvec->bv_offset) < bvec->bv_len) { /* * inc the count before we submit the bio so * we know the end IO handler won't happen before @@ -7464,6 +7798,7 @@ static int btrfs_submit_direct_hook(int rw, struct btrfs_dio_private *dip, goto out_err; bio->bi_private = dip; bio->bi_end_io = btrfs_end_dio_bio; + btrfs_io_bio(bio)->logical = file_offset; map_length = orig_bio->bi_iter.bi_size; ret = btrfs_map_block(root->fs_info, rw, @@ -7507,11 +7842,10 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio, struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_dio_private *dip; struct bio *io_bio; + struct btrfs_io_bio *btrfs_bio; int skip_sum; - int sum_len; int write = rw & REQ_WRITE; int ret = 0; - u16 csum_size; skip_sum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; @@ -7521,16 +7855,7 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio, goto free_ordered; } - if (!skip_sum && !write) { - csum_size = btrfs_super_csum_size(root->fs_info->super_copy); - sum_len = dio_bio->bi_iter.bi_size >> - inode->i_sb->s_blocksize_bits; - sum_len *= csum_size; - } else { - sum_len = 0; - } - - dip = kmalloc(sizeof(*dip) + sum_len, GFP_NOFS); + dip = kzalloc(sizeof(*dip), GFP_NOFS); if (!dip) { ret = -ENOMEM; goto free_io_bio; @@ -7542,20 +7867,25 @@ static void btrfs_submit_direct(int rw, struct bio *dio_bio, dip->bytes = dio_bio->bi_iter.bi_size; dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9; io_bio->bi_private = dip; - dip->errors = 0; dip->orig_bio = io_bio; dip->dio_bio = dio_bio; atomic_set(&dip->pending_bios, 0); + btrfs_bio = btrfs_io_bio(io_bio); + btrfs_bio->logical = file_offset; - if (write) + if (write) { io_bio->bi_end_io = btrfs_endio_direct_write; - else + } else { io_bio->bi_end_io = btrfs_endio_direct_read; + dip->subio_endio = btrfs_subio_endio_read; + } ret = btrfs_submit_direct_hook(rw, dip, skip_sum); if (!ret) return; + if (btrfs_bio->end_io) + btrfs_bio->end_io(btrfs_bio, ret); free_io_bio: bio_put(io_bio); @@ -7652,8 +7982,8 @@ static ssize_t btrfs_direct_IO(int rw, struct kiocb *iocb, ret = btrfs_delalloc_reserve_space(inode, count); if (ret) goto out; - } else if (unlikely(test_bit(BTRFS_INODE_READDIO_NEED_LOCK, - &BTRFS_I(inode)->runtime_flags))) { + } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK, + &BTRFS_I(inode)->runtime_flags)) { inode_dio_done(inode); flags = DIO_LOCKING | DIO_SKIP_HOLES; wakeup = false; @@ -8173,6 +8503,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->last_sub_trans = 0; ei->logged_trans = 0; ei->delalloc_bytes = 0; + ei->defrag_bytes = 0; ei->disk_i_size = 0; ei->flags = 0; ei->csum_bytes = 0; @@ -8231,6 +8562,7 @@ void btrfs_destroy_inode(struct inode *inode) WARN_ON(BTRFS_I(inode)->reserved_extents); WARN_ON(BTRFS_I(inode)->delalloc_bytes); WARN_ON(BTRFS_I(inode)->csum_bytes); + WARN_ON(BTRFS_I(inode)->defrag_bytes); /* * This can happen where we create an inode, but somebody else also @@ -8646,7 +8978,7 @@ static int __start_delalloc_inodes(struct btrfs_root *root, int delay_iput, spin_unlock(&root->delalloc_lock); work = btrfs_alloc_delalloc_work(inode, 0, delay_iput); - if (unlikely(!work)) { + if (!work) { if (delay_iput) btrfs_add_delayed_iput(inode); else @@ -8832,7 +9164,7 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, } key.objectid = btrfs_ino(inode); key.offset = 0; - btrfs_set_key_type(&key, BTRFS_EXTENT_DATA_KEY); + key.type = BTRFS_EXTENT_DATA_KEY; datasize = btrfs_file_extent_calc_inline_size(name_len); err = btrfs_insert_empty_item(trans, root, path, &key, datasize); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 8a8e29878c34..4399f0c3a4ce 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -332,6 +332,9 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) goto out_drop; } else { + ret = btrfs_set_prop(inode, "btrfs.compression", NULL, 0, 0); + if (ret && ret != -ENODATA) + goto out_drop; ip->flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS); } @@ -477,8 +480,7 @@ static noinline int create_subvol(struct inode *dir, if (ret) goto fail; - leaf = btrfs_alloc_free_block(trans, root, root->leafsize, - 0, objectid, NULL, 0, 0, 0); + leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0); if (IS_ERR(leaf)) { ret = PTR_ERR(leaf); goto fail; @@ -503,7 +505,7 @@ static noinline int create_subvol(struct inode *dir, btrfs_set_stack_inode_generation(inode_item, 1); btrfs_set_stack_inode_size(inode_item, 3); btrfs_set_stack_inode_nlink(inode_item, 1); - btrfs_set_stack_inode_nbytes(inode_item, root->leafsize); + btrfs_set_stack_inode_nbytes(inode_item, root->nodesize); btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755); btrfs_set_root_flags(&root_item, 0); @@ -535,7 +537,7 @@ static noinline int create_subvol(struct inode *dir, key.objectid = objectid; key.offset = 0; - btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); + key.type = BTRFS_ROOT_ITEM_KEY; ret = btrfs_insert_root(trans, root->fs_info->tree_root, &key, &root_item); if (ret) @@ -711,6 +713,39 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, if (ret) goto fail; + ret = btrfs_orphan_cleanup(pending_snapshot->snap); + if (ret) + goto fail; + + /* + * If orphan cleanup did remove any orphans, it means the tree was + * modified and therefore the commit root is not the same as the + * current root anymore. This is a problem, because send uses the + * commit root and therefore can see inode items that don't exist + * in the current root anymore, and for example make calls to + * btrfs_iget, which will do tree lookups based on the current root + * and not on the commit root. Those lookups will fail, returning a + * -ESTALE error, and making send fail with that error. So make sure + * a send does not see any orphans we have just removed, and that it + * will see the same inodes regardless of whether a transaction + * commit happened before it started (meaning that the commit root + * will be the same as the current root) or not. + */ + if (readonly && pending_snapshot->snap->node != + pending_snapshot->snap->commit_root) { + trans = btrfs_join_transaction(pending_snapshot->snap); + if (IS_ERR(trans) && PTR_ERR(trans) != -ENOENT) { + ret = PTR_ERR(trans); + goto fail; + } + if (!IS_ERR(trans)) { + ret = btrfs_commit_transaction(trans, + pending_snapshot->snap); + if (ret) + goto fail; + } + } + inode = btrfs_lookup_dentry(dentry->d_parent->d_inode, dentry); if (IS_ERR(inode)) { ret = PTR_ERR(inode); @@ -730,23 +765,6 @@ out: return ret; } -/* copy of check_sticky in fs/namei.c() -* It's inline, so penalty for filesystems that don't use sticky bit is -* minimal. -*/ -static inline int btrfs_check_sticky(struct inode *dir, struct inode *inode) -{ - kuid_t fsuid = current_fsuid(); - - if (!(dir->i_mode & S_ISVTX)) - return 0; - if (uid_eq(inode->i_uid, fsuid)) - return 0; - if (uid_eq(dir->i_uid, fsuid)) - return 0; - return !capable(CAP_FOWNER); -} - /* copy of may_delete in fs/namei.c() * Check whether we can remove a link victim from directory dir, check * whether the type of victim is right. @@ -782,8 +800,7 @@ static int btrfs_may_delete(struct inode *dir, struct dentry *victim, int isdir) return error; if (IS_APPEND(dir)) return -EPERM; - if (btrfs_check_sticky(dir, victim->d_inode)|| - IS_APPEND(victim->d_inode)|| + if (check_sticky(dir, victim->d_inode) || IS_APPEND(victim->d_inode) || IS_IMMUTABLE(victim->d_inode) || IS_SWAPFILE(victim->d_inode)) return -EPERM; if (isdir) { @@ -882,7 +899,7 @@ out_unlock: * file you want to defrag, we return 0 to let you know to skip this * part of the file */ -static int check_defrag_in_cache(struct inode *inode, u64 offset, int thresh) +static int check_defrag_in_cache(struct inode *inode, u64 offset, u32 thresh) { struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; struct extent_map *em = NULL; @@ -917,7 +934,7 @@ static int check_defrag_in_cache(struct inode *inode, u64 offset, int thresh) */ static int find_new_extents(struct btrfs_root *root, struct inode *inode, u64 newer_than, - u64 *off, int thresh) + u64 *off, u32 thresh) { struct btrfs_path *path; struct btrfs_key min_key; @@ -936,12 +953,9 @@ static int find_new_extents(struct btrfs_root *root, min_key.offset = *off; while (1) { - path->keep_locks = 1; ret = btrfs_search_forward(root, &min_key, path, newer_than); if (ret != 0) goto none; - path->keep_locks = 0; - btrfs_unlock_up_safe(path, 1); process_slot: if (min_key.objectid != ino) goto none; @@ -1029,7 +1043,7 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em) return ret; } -static int should_defrag_range(struct inode *inode, u64 start, int thresh, +static int should_defrag_range(struct inode *inode, u64 start, u32 thresh, u64 *last_len, u64 *skip, u64 *defrag_end, int compress) { @@ -1259,7 +1273,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, int ret; int defrag_count = 0; int compress_type = BTRFS_COMPRESS_ZLIB; - int extent_thresh = range->extent_thresh; + u32 extent_thresh = range->extent_thresh; unsigned long max_cluster = (256 * 1024) >> PAGE_CACHE_SHIFT; unsigned long cluster = max_cluster; u64 new_align = ~((u64)128 * 1024 - 1); @@ -1335,8 +1349,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, inode->i_mapping->writeback_index = i; while (i <= last_index && defrag_count < max_to_defrag && - (i < (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> - PAGE_CACHE_SHIFT)) { + (i < DIV_ROUND_UP(i_size_read(inode), PAGE_CACHE_SIZE))) { /* * make sure we stop running if someone unmounts * the FS @@ -1359,7 +1372,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, * the should_defrag function tells us how much to skip * bump our counter by the suggested amount */ - next = (skip + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + next = DIV_ROUND_UP(skip, PAGE_CACHE_SIZE); i = max(i + 1, next); continue; } @@ -1554,7 +1567,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, goto out_free; } - old_size = device->total_bytes; + old_size = btrfs_device_get_total_bytes(device); if (mod < 0) { if (new_size > old_size) { @@ -2089,8 +2102,6 @@ static noinline int search_ioctl(struct inode *inode, key.type = sk->min_type; key.offset = sk->min_offset; - path->keep_locks = 1; - while (1) { ret = btrfs_search_forward(root, &key, path, sk->min_transid); if (ret != 0) { @@ -2423,9 +2434,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, goto out_dput; } - err = d_invalidate(dentry); - if (err) - goto out_unlock; + d_invalidate(dentry); down_write(&root->fs_info->subvol_sem); @@ -2510,7 +2519,6 @@ out_release: btrfs_subvolume_release_metadata(root, &block_rsv, qgroup_reserved); out_up_write: up_write(&root->fs_info->subvol_sem); -out_unlock: if (err) { spin_lock(&dest->root_item_lock); root_flags = btrfs_root_flags(&dest->root_item); @@ -2526,9 +2534,9 @@ out_unlock: ASSERT(dest->send_in_progress == 0); /* the last ref */ - if (dest->cache_inode) { - iput(dest->cache_inode); - dest->cache_inode = NULL; + if (dest->ino_cache_inode) { + iput(dest->ino_cache_inode); + dest->ino_cache_inode = NULL; } } out_dput: @@ -2634,6 +2642,9 @@ static long btrfs_ioctl_add_dev(struct btrfs_root *root, void __user *arg) vol_args->name[BTRFS_PATH_NAME_MAX] = '\0'; ret = btrfs_init_new_device(root, vol_args->name); + if (!ret) + btrfs_info(root->fs_info, "disk added %s",vol_args->name); + kfree(vol_args); out: mutex_unlock(&root->fs_info->volume_mutex); @@ -2673,6 +2684,9 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) mutex_unlock(&root->fs_info->volume_mutex); atomic_set(&root->fs_info->mutually_exclusive_operation_running, 0); + if (!ret) + btrfs_info(root->fs_info, "disk deleted %s",vol_args->name); + out: kfree(vol_args); err_drop: @@ -2737,8 +2751,8 @@ static long btrfs_ioctl_dev_info(struct btrfs_root *root, void __user *arg) } di_args->devid = dev->devid; - di_args->bytes_used = dev->bytes_used; - di_args->total_bytes = dev->total_bytes; + di_args->bytes_used = btrfs_device_get_bytes_used(dev); + di_args->total_bytes = btrfs_device_get_total_bytes(dev); memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid)); if (dev->name) { struct rcu_string *name; @@ -3164,7 +3178,7 @@ static void clone_update_extent_map(struct inode *inode, em->start + em->len - 1, 0); } - if (unlikely(ret)) + if (ret) set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); } @@ -3199,7 +3213,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode, u64 last_dest_end = destoff; ret = -ENOMEM; - buf = vmalloc(btrfs_level_size(root, 0)); + buf = vmalloc(root->nodesize); if (!buf) return ret; @@ -3252,11 +3266,11 @@ process_slot: slot = path->slots[0]; btrfs_item_key_to_cpu(leaf, &key, slot); - if (btrfs_key_type(&key) > BTRFS_EXTENT_DATA_KEY || + if (key.type > BTRFS_EXTENT_DATA_KEY || key.objectid != btrfs_ino(src)) break; - if (btrfs_key_type(&key) == BTRFS_EXTENT_DATA_KEY) { + if (key.type == BTRFS_EXTENT_DATA_KEY) { struct btrfs_file_extent_item *extent; int type; u32 size; @@ -5283,6 +5297,12 @@ long btrfs_ioctl(struct file *file, unsigned int if (ret) return ret; ret = btrfs_sync_fs(file->f_dentry->d_sb, 1); + /* + * The transaction thread may want to do more work, + * namely it pokes the cleaner ktread that will start + * processing uncleaned subvols. + */ + wake_up_process(root->fs_info->transaction_kthread); return ret; } case BTRFS_IOC_START_SYNC: diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index dfad8514f0da..78285f30909e 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -266,8 +266,7 @@ static int lzo_decompress_biovec(struct list_head *ws, char *data_in; unsigned long page_in_index = 0; unsigned long page_out_index = 0; - unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) / - PAGE_CACHE_SIZE; + unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_CACHE_SIZE); unsigned long buf_start; unsigned long buf_offset = 0; unsigned long bytes; diff --git a/fs/btrfs/orphan.c b/fs/btrfs/orphan.c index 65793edb38ca..47767d5b8f0b 100644 --- a/fs/btrfs/orphan.c +++ b/fs/btrfs/orphan.c @@ -27,7 +27,7 @@ int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans, int ret = 0; key.objectid = BTRFS_ORPHAN_OBJECTID; - btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); + key.type = BTRFS_ORPHAN_ITEM_KEY; key.offset = offset; path = btrfs_alloc_path(); @@ -48,7 +48,7 @@ int btrfs_del_orphan_item(struct btrfs_trans_handle *trans, int ret = 0; key.objectid = BTRFS_ORPHAN_OBJECTID; - btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); + key.type = BTRFS_ORPHAN_ITEM_KEY; key.offset = offset; path = btrfs_alloc_path(); diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 9626b4ad3b9a..647ab12fdf5d 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -195,7 +195,7 @@ void btrfs_print_leaf(struct btrfs_root *root, struct extent_buffer *l) for (i = 0 ; i < nr ; i++) { item = btrfs_item_nr(i); btrfs_item_key_to_cpu(l, &key, i); - type = btrfs_key_type(&key); + type = key.type; printk(KERN_INFO "\titem %d key (%llu %u %llu) itemoff %d " "itemsize %d\n", i, key.objectid, type, key.offset, @@ -336,7 +336,6 @@ void btrfs_print_tree(struct btrfs_root *root, struct extent_buffer *c) for (i = 0; i < nr; i++) { struct extent_buffer *next = read_tree_block(root, btrfs_node_blockptr(c, i), - btrfs_level_size(root, level - 1), btrfs_node_ptr_generation(c, i)); if (btrfs_is_leaf(next) && level != 1) diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index ded5c601d916..48b60dbf807f 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -539,10 +539,9 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; struct btrfs_key key; -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS - if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, "a_root->state))) + if (btrfs_test_is_dummy_root(quota_root)) return 0; -#endif + path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -551,9 +550,15 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans, key.type = BTRFS_QGROUP_INFO_KEY; key.offset = qgroupid; + /* + * Avoid a transaction abort by catching -EEXIST here. In that + * case, we proceed by re-initializing the existing structure + * on disk. + */ + ret = btrfs_insert_empty_item(trans, quota_root, path, &key, sizeof(*qgroup_info)); - if (ret) + if (ret && ret != -EEXIST) goto out; leaf = path->nodes[0]; @@ -572,7 +577,7 @@ static int add_qgroup_item(struct btrfs_trans_handle *trans, key.type = BTRFS_QGROUP_LIMIT_KEY; ret = btrfs_insert_empty_item(trans, quota_root, path, &key, sizeof(*qgroup_limit)); - if (ret) + if (ret && ret != -EEXIST) goto out; leaf = path->nodes[0]; @@ -692,10 +697,9 @@ static int update_qgroup_info_item(struct btrfs_trans_handle *trans, int ret; int slot; -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS - if (unlikely(test_bit(BTRFS_ROOT_DUMMY_ROOT, &root->state))) + if (btrfs_test_is_dummy_root(root)) return 0; -#endif + key.objectid = 0; key.type = BTRFS_QGROUP_INFO_KEY; key.offset = qgroup->qgroupid; @@ -1335,6 +1339,8 @@ int btrfs_qgroup_record_ref(struct btrfs_trans_handle *trans, INIT_LIST_HEAD(&oper->elem.list); oper->elem.seq = 0; + trace_btrfs_qgroup_record_ref(oper); + if (type == BTRFS_QGROUP_OPER_SUB_SUBTREE) { /* * If any operation for this bytenr/ref_root combo @@ -2077,6 +2083,8 @@ static int btrfs_qgroup_account(struct btrfs_trans_handle *trans, ASSERT(is_fstree(oper->ref_root)); + trace_btrfs_qgroup_account(oper); + switch (oper->type) { case BTRFS_QGROUP_OPER_ADD_EXCL: case BTRFS_QGROUP_OPER_SUB_EXCL: @@ -2237,7 +2245,6 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, if (srcid) { struct btrfs_root *srcroot; struct btrfs_key srckey; - int srcroot_level; srckey.objectid = srcid; srckey.type = BTRFS_ROOT_ITEM_KEY; @@ -2249,8 +2256,7 @@ int btrfs_qgroup_inherit(struct btrfs_trans_handle *trans, } rcu_read_lock(); - srcroot_level = btrfs_header_level(srcroot->node); - level_size = btrfs_level_size(srcroot, srcroot_level); + level_size = srcroot->nodesize; rcu_read_unlock(); } @@ -2566,7 +2572,7 @@ qgroup_rescan_leaf(struct btrfs_fs_info *fs_info, struct btrfs_path *path, found.type != BTRFS_METADATA_ITEM_KEY) continue; if (found.type == BTRFS_METADATA_ITEM_KEY) - num_bytes = fs_info->extent_root->leafsize; + num_bytes = fs_info->extent_root->nodesize; else num_bytes = found.offset; diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 0a6b6e4bcbb9..6a41631cb959 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -912,7 +912,7 @@ static struct page *page_in_rbio(struct btrfs_raid_bio *rbio, static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes) { unsigned long nr = stripe_len * nr_stripes; - return (nr + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + return DIV_ROUND_UP(nr, PAGE_CACHE_SIZE); } /* @@ -1442,7 +1442,7 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) struct btrfs_bio *bbio = rbio->bbio; struct bio_list bio_list; int ret; - int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE); int pagenr; int stripe; struct bio *bio; @@ -1725,7 +1725,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) int pagenr, stripe; void **pointers; int faila = -1, failb = -1; - int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE); struct page *page; int err; int i; @@ -1940,7 +1940,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) struct btrfs_bio *bbio = rbio->bbio; struct bio_list bio_list; int ret; - int nr_pages = (rbio->stripe_len + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT; + int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE); int pagenr; int stripe; struct bio *bio; diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 20408c6b665a..b63ae20618fb 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -347,7 +347,7 @@ static struct reada_extent *reada_find_extent(struct btrfs_root *root, if (!re) return NULL; - blocksize = btrfs_level_size(root, level); + blocksize = root->nodesize; re->logical = logical; re->blocksize = blocksize; re->top = *top; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 65245a07275b..74257d6436ad 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -736,7 +736,8 @@ again: err = ret; goto out; } - BUG_ON(!ret || !path1->slots[0]); + ASSERT(ret); + ASSERT(path1->slots[0]); path1->slots[0]--; @@ -746,10 +747,10 @@ again: * the backref was added previously when processing * backref of type BTRFS_TREE_BLOCK_REF_KEY */ - BUG_ON(!list_is_singular(&cur->upper)); + ASSERT(list_is_singular(&cur->upper)); edge = list_entry(cur->upper.next, struct backref_edge, list[LOWER]); - BUG_ON(!list_empty(&edge->list[UPPER])); + ASSERT(list_empty(&edge->list[UPPER])); exist = edge->node[UPPER]; /* * add the upper level block to pending list if we need @@ -831,7 +832,7 @@ again: cur->cowonly = 1; } #else - BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY); + ASSERT(key.type != BTRFS_EXTENT_REF_V0_KEY); if (key.type == BTRFS_SHARED_BLOCK_REF_KEY) { #endif if (key.objectid == key.offset) { @@ -840,7 +841,7 @@ again: * backref of this type. */ root = find_reloc_root(rc, cur->bytenr); - BUG_ON(!root); + ASSERT(root); cur->root = root; break; } @@ -868,7 +869,7 @@ again: } else { upper = rb_entry(rb_node, struct backref_node, rb_node); - BUG_ON(!upper->checked); + ASSERT(upper->checked); INIT_LIST_HEAD(&edge->list[UPPER]); } list_add_tail(&edge->list[LOWER], &cur->upper); @@ -892,7 +893,7 @@ again: if (btrfs_root_level(&root->root_item) == cur->level) { /* tree root */ - BUG_ON(btrfs_root_bytenr(&root->root_item) != + ASSERT(btrfs_root_bytenr(&root->root_item) == cur->bytenr); if (should_ignore_root(root)) list_add(&cur->list, &useless); @@ -927,7 +928,7 @@ again: need_check = true; for (; level < BTRFS_MAX_LEVEL; level++) { if (!path2->nodes[level]) { - BUG_ON(btrfs_root_bytenr(&root->root_item) != + ASSERT(btrfs_root_bytenr(&root->root_item) == lower->bytenr); if (should_ignore_root(root)) list_add(&lower->list, &useless); @@ -977,12 +978,15 @@ again: need_check = false; list_add_tail(&edge->list[UPPER], &list); - } else + } else { + if (upper->checked) + need_check = true; INIT_LIST_HEAD(&edge->list[UPPER]); + } } else { upper = rb_entry(rb_node, struct backref_node, rb_node); - BUG_ON(!upper->checked); + ASSERT(upper->checked); INIT_LIST_HEAD(&edge->list[UPPER]); if (!upper->owner) upper->owner = btrfs_header_owner(eb); @@ -1026,7 +1030,7 @@ next: * everything goes well, connect backref nodes and insert backref nodes * into the cache. */ - BUG_ON(!node->checked); + ASSERT(node->checked); cowonly = node->cowonly; if (!cowonly) { rb_node = tree_insert(&cache->rb_root, node->bytenr, @@ -1062,8 +1066,21 @@ next: continue; } - BUG_ON(!upper->checked); - BUG_ON(cowonly != upper->cowonly); + if (!upper->checked) { + /* + * Still want to blow up for developers since this is a + * logic bug. + */ + ASSERT(0); + err = -EINVAL; + goto out; + } + if (cowonly != upper->cowonly) { + ASSERT(0); + err = -EINVAL; + goto out; + } + if (!cowonly) { rb_node = tree_insert(&cache->rb_root, upper->bytenr, &upper->rb_node); @@ -1086,7 +1103,7 @@ next: while (!list_empty(&useless)) { upper = list_entry(useless.next, struct backref_node, list); list_del_init(&upper->list); - BUG_ON(!list_empty(&upper->upper)); + ASSERT(list_empty(&upper->upper)); if (upper == node) node = NULL; if (upper->lowest) { @@ -1119,29 +1136,45 @@ out: if (err) { while (!list_empty(&useless)) { lower = list_entry(useless.next, - struct backref_node, upper); - list_del_init(&lower->upper); + struct backref_node, list); + list_del_init(&lower->list); } - upper = node; - INIT_LIST_HEAD(&list); - while (upper) { - if (RB_EMPTY_NODE(&upper->rb_node)) { - list_splice_tail(&upper->upper, &list); - free_backref_node(cache, upper); - } - - if (list_empty(&list)) - break; - - edge = list_entry(list.next, struct backref_edge, - list[LOWER]); + while (!list_empty(&list)) { + edge = list_first_entry(&list, struct backref_edge, + list[UPPER]); + list_del(&edge->list[UPPER]); list_del(&edge->list[LOWER]); + lower = edge->node[LOWER]; upper = edge->node[UPPER]; free_backref_edge(cache, edge); + + /* + * Lower is no longer linked to any upper backref nodes + * and isn't in the cache, we can free it ourselves. + */ + if (list_empty(&lower->upper) && + RB_EMPTY_NODE(&lower->rb_node)) + list_add(&lower->list, &useless); + + if (!RB_EMPTY_NODE(&upper->rb_node)) + continue; + + /* Add this guy's upper edges to the list to proces */ + list_for_each_entry(edge, &upper->upper, list[LOWER]) + list_add_tail(&edge->list[UPPER], &list); + if (list_empty(&upper->upper)) + list_add(&upper->list, &useless); + } + + while (!list_empty(&useless)) { + lower = list_entry(useless.next, + struct backref_node, list); + list_del_init(&lower->list); + free_backref_node(cache, lower); } return ERR_PTR(err); } - BUG_ON(node && node->detached); + ASSERT(!node || !node->detached); return node; } @@ -1787,7 +1820,7 @@ again: btrfs_node_key_to_cpu(parent, next_key, slot + 1); old_bytenr = btrfs_node_blockptr(parent, slot); - blocksize = btrfs_level_size(dest, level - 1); + blocksize = dest->nodesize; old_ptr_gen = btrfs_node_ptr_generation(parent, slot); if (level <= max_level) { @@ -1813,8 +1846,7 @@ again: break; } - eb = read_tree_block(dest, old_bytenr, blocksize, - old_ptr_gen); + eb = read_tree_block(dest, old_bytenr, old_ptr_gen); if (!eb || !extent_buffer_uptodate(eb)) { ret = (!eb) ? -ENOMEM : -EIO; free_extent_buffer(eb); @@ -1944,7 +1976,6 @@ int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path, u64 bytenr; u64 ptr_gen = 0; u64 last_snapshot; - u32 blocksize; u32 nritems; last_snapshot = btrfs_root_last_snapshot(&root->root_item); @@ -1970,8 +2001,7 @@ int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path, } bytenr = btrfs_node_blockptr(eb, path->slots[i]); - blocksize = btrfs_level_size(root, i - 1); - eb = read_tree_block(root, bytenr, blocksize, ptr_gen); + eb = read_tree_block(root, bytenr, ptr_gen); if (!eb || !extent_buffer_uptodate(eb)) { free_extent_buffer(eb); return -EIO; @@ -2316,7 +2346,7 @@ void free_reloc_roots(struct list_head *list) } static noinline_for_stack -int merge_reloc_roots(struct reloc_control *rc) +void merge_reloc_roots(struct reloc_control *rc) { struct btrfs_root *root; struct btrfs_root *reloc_root; @@ -2397,7 +2427,6 @@ out: } BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root)); - return ret; } static void free_block_list(struct rb_root *blocks) @@ -2544,8 +2573,7 @@ u64 calcu_metadata_size(struct reloc_control *rc, if (next->processed && (reserve || next != node)) break; - num_bytes += btrfs_level_size(rc->extent_root, - next->level); + num_bytes += rc->extent_root->nodesize; if (list_empty(&next->upper)) break; @@ -2679,9 +2707,9 @@ static int do_relocation(struct btrfs_trans_handle *trans, goto next; } - blocksize = btrfs_level_size(root, node->level); + blocksize = root->nodesize; generation = btrfs_node_ptr_generation(upper->eb, slot); - eb = read_tree_block(root, bytenr, blocksize, generation); + eb = read_tree_block(root, bytenr, generation); if (!eb || !extent_buffer_uptodate(eb)) { free_extent_buffer(eb); err = -EIO; @@ -2789,7 +2817,7 @@ static void __mark_block_processed(struct reloc_control *rc, u32 blocksize; if (node->level == 0 || in_block_group(node->bytenr, rc->block_group)) { - blocksize = btrfs_level_size(rc->extent_root, node->level); + blocksize = rc->extent_root->nodesize; mark_block_processed(rc, node->bytenr, blocksize); } node->processed = 1; @@ -2843,7 +2871,7 @@ static int get_tree_block_key(struct reloc_control *rc, BUG_ON(block->key_ready); eb = read_tree_block(rc->extent_root, block->bytenr, - block->key.objectid, block->key.offset); + block->key.offset); if (!eb || !extent_buffer_uptodate(eb)) { free_extent_buffer(eb); return -EIO; @@ -2858,20 +2886,6 @@ static int get_tree_block_key(struct reloc_control *rc, return 0; } -static int reada_tree_block(struct reloc_control *rc, - struct tree_block *block) -{ - BUG_ON(block->key_ready); - if (block->key.type == BTRFS_METADATA_ITEM_KEY) - readahead_tree_block(rc->extent_root, block->bytenr, - block->key.objectid, - rc->extent_root->leafsize); - else - readahead_tree_block(rc->extent_root, block->bytenr, - block->key.objectid, block->key.offset); - return 0; -} - /* * helper function to relocate a tree block */ @@ -2951,7 +2965,8 @@ int relocate_tree_blocks(struct btrfs_trans_handle *trans, while (rb_node) { block = rb_entry(rb_node, struct tree_block, rb_node); if (!block->key_ready) - reada_tree_block(rc, block); + readahead_tree_block(rc->extent_root, block->bytenr, + block->key.objectid); rb_node = rb_next(rb_node); } @@ -3313,7 +3328,7 @@ static int add_tree_block(struct reloc_control *rc, return -ENOMEM; block->bytenr = extent_key->objectid; - block->key.objectid = rc->extent_root->leafsize; + block->key.objectid = rc->extent_root->nodesize; block->key.offset = generation; block->level = level; block->key_ready = 0; @@ -3640,7 +3655,7 @@ int add_data_references(struct reloc_control *rc, struct btrfs_extent_inline_ref *iref; unsigned long ptr; unsigned long end; - u32 blocksize = btrfs_level_size(rc->extent_root, 0); + u32 blocksize = rc->extent_root->nodesize; int ret = 0; int err = 0; @@ -3783,7 +3798,7 @@ next: } if (key.type == BTRFS_METADATA_ITEM_KEY && - key.objectid + rc->extent_root->leafsize <= + key.objectid + rc->extent_root->nodesize <= rc->search_start) { path->slots[0]++; goto next; @@ -3801,7 +3816,7 @@ next: rc->search_start = key.objectid + key.offset; else rc->search_start = key.objectid + - rc->extent_root->leafsize; + rc->extent_root->nodesize; memcpy(extent_key, &key, sizeof(key)); return 0; } @@ -4096,7 +4111,6 @@ static int __insert_orphan_inode(struct btrfs_trans_handle *trans, btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS | BTRFS_INODE_PREALLOC); btrfs_mark_buffer_dirty(leaf); - btrfs_release_path(path); out: btrfs_free_path(path); return ret; diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index f4a41f37be22..efa083113827 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -137,7 +137,6 @@ struct scrub_ctx { int pages_per_rd_bio; u32 sectorsize; u32 nodesize; - u32 leafsize; int is_dev_replace; struct scrub_wr_ctx wr_ctx; @@ -178,17 +177,12 @@ struct scrub_copy_nocow_ctx { struct scrub_warning { struct btrfs_path *path; u64 extent_item_size; - char *scratch_buf; - char *msg_buf; const char *errstr; sector_t sector; u64 logical; struct btrfs_device *dev; - int msg_bufsize; - int scratch_bufsize; }; - static void scrub_pending_bio_inc(struct scrub_ctx *sctx); static void scrub_pending_bio_dec(struct scrub_ctx *sctx); static void scrub_pending_trans_workers_inc(struct scrub_ctx *sctx); @@ -438,7 +432,6 @@ struct scrub_ctx *scrub_setup_ctx(struct btrfs_device *dev, int is_dev_replace) } sctx->first_free = 0; sctx->nodesize = dev->dev_root->nodesize; - sctx->leafsize = dev->dev_root->leafsize; sctx->sectorsize = dev->dev_root->sectorsize; atomic_set(&sctx->bios_in_flight, 0); atomic_set(&sctx->workers_pending, 0); @@ -553,7 +546,6 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) u64 ref_root; u32 item_size; u8 ref_level; - const int bufsize = 4096; int ret; WARN_ON(sblock->page_count < 1); @@ -561,18 +553,13 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) fs_info = sblock->sctx->dev_root->fs_info; path = btrfs_alloc_path(); + if (!path) + return; - swarn.scratch_buf = kmalloc(bufsize, GFP_NOFS); - swarn.msg_buf = kmalloc(bufsize, GFP_NOFS); swarn.sector = (sblock->pagev[0]->physical) >> 9; swarn.logical = sblock->pagev[0]->logical; swarn.errstr = errstr; swarn.dev = NULL; - swarn.msg_bufsize = bufsize; - swarn.scratch_bufsize = bufsize; - - if (!path || !swarn.scratch_buf || !swarn.msg_buf) - goto out; ret = extent_from_logical(fs_info, swarn.logical, path, &found_key, &flags); @@ -613,8 +600,6 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock) out: btrfs_free_path(path); - kfree(swarn.scratch_buf); - kfree(swarn.msg_buf); } static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx) @@ -681,9 +666,9 @@ static int scrub_fixup_readpage(u64 inum, u64 offset, u64 root, void *fixup_ctx) ret = -EIO; goto out; } - fs_info = BTRFS_I(inode)->root->fs_info; - ret = repair_io_failure(fs_info, offset, PAGE_SIZE, + ret = repair_io_failure(inode, offset, PAGE_SIZE, fixup->logical, page, + offset - page_offset(page), fixup->mirror_num); unlock_page(page); corrected = !ret; @@ -1361,6 +1346,16 @@ static void scrub_recheck_block(struct btrfs_fs_info *fs_info, return; } +static inline int scrub_check_fsid(u8 fsid[], + struct scrub_page *spage) +{ + struct btrfs_fs_devices *fs_devices = spage->dev->fs_devices; + int ret; + + ret = memcmp(fsid, fs_devices->fsid, BTRFS_UUID_SIZE); + return !ret; +} + static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, struct scrub_block *sblock, int is_metadata, int have_csum, @@ -1380,7 +1375,7 @@ static void scrub_recheck_block_checksum(struct btrfs_fs_info *fs_info, h = (struct btrfs_header *)mapped_buffer; if (sblock->pagev[0]->logical != btrfs_stack_header_bytenr(h) || - memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE) || + !scrub_check_fsid(h->fsid, sblock->pagev[0]) || memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, BTRFS_UUID_SIZE)) { sblock->header_error = 1; @@ -1751,14 +1746,13 @@ static int scrub_checksum_tree_block(struct scrub_block *sblock) if (sblock->pagev[0]->generation != btrfs_stack_header_generation(h)) ++fail; - if (memcmp(h->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) + if (!scrub_check_fsid(h->fsid, sblock->pagev[0])) ++fail; if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid, BTRFS_UUID_SIZE)) ++fail; - WARN_ON(sctx->nodesize != sctx->leafsize); len = sctx->nodesize - BTRFS_CSUM_SIZE; mapped_size = PAGE_SIZE - BTRFS_CSUM_SIZE; p = ((u8 *)mapped_buffer) + BTRFS_CSUM_SIZE; @@ -1791,8 +1785,6 @@ static int scrub_checksum_super(struct scrub_block *sblock) { struct btrfs_super_block *s; struct scrub_ctx *sctx = sblock->sctx; - struct btrfs_root *root = sctx->dev_root; - struct btrfs_fs_info *fs_info = root->fs_info; u8 calculated_csum[BTRFS_CSUM_SIZE]; u8 on_disk_csum[BTRFS_CSUM_SIZE]; struct page *page; @@ -1817,7 +1809,7 @@ static int scrub_checksum_super(struct scrub_block *sblock) if (sblock->pagev[0]->generation != btrfs_super_generation(s)) ++fail_gen; - if (memcmp(s->fsid, fs_info->fsid, BTRFS_UUID_SIZE)) + if (!scrub_check_fsid(s->fsid, sblock->pagev[0])) ++fail_cor; len = BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE; @@ -2196,7 +2188,6 @@ static int scrub_extent(struct scrub_ctx *sctx, u64 logical, u64 len, sctx->stat.data_bytes_scrubbed += len; spin_unlock(&sctx->stat_lock); } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { - WARN_ON(sctx->nodesize != sctx->leafsize); blocksize = sctx->nodesize; spin_lock(&sctx->stat_lock); sctx->stat.tree_extents_scrubbed++; @@ -2487,7 +2478,7 @@ static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx, btrfs_item_key_to_cpu(l, &key, slot); if (key.type == BTRFS_METADATA_ITEM_KEY) - bytes = root->leafsize; + bytes = root->nodesize; else bytes = key.offset; @@ -2714,7 +2705,7 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, if (found_key.objectid != scrub_dev->devid) break; - if (btrfs_key_type(&found_key) != BTRFS_DEV_EXTENT_KEY) + if (found_key.type != BTRFS_DEV_EXTENT_KEY) break; if (found_key.offset >= end) @@ -2828,11 +2819,16 @@ static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx, if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) return -EIO; - gen = root->fs_info->last_trans_committed; + /* Seed devices of a new filesystem has their own generation. */ + if (scrub_dev->fs_devices != root->fs_info->fs_devices) + gen = scrub_dev->generation; + else + gen = root->fs_info->last_trans_committed; for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) { bytenr = btrfs_sb_offset(i); - if (bytenr + BTRFS_SUPER_INFO_SIZE > scrub_dev->total_bytes) + if (bytenr + BTRFS_SUPER_INFO_SIZE > + scrub_dev->commit_total_bytes) break; ret = scrub_pages(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr, @@ -2910,17 +2906,6 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start, if (btrfs_fs_closing(fs_info)) return -EINVAL; - /* - * check some assumptions - */ - if (fs_info->chunk_root->nodesize != fs_info->chunk_root->leafsize) { - btrfs_err(fs_info, - "scrub: size assumption nodesize == leafsize (%d == %d) fails", - fs_info->chunk_root->nodesize, - fs_info->chunk_root->leafsize); - return -EINVAL; - } - if (fs_info->chunk_root->nodesize > BTRFS_STRIPE_LEN) { /* * in this case scrub is unable to calculate the checksum diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 6528aa662181..874828dd0a86 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -515,7 +515,8 @@ static int write_buf(struct file *filp, const void *buf, u32 len, loff_t *off) set_fs(KERNEL_DS); while (pos < len) { - ret = vfs_write(filp, (char *)buf + pos, len - pos, off); + ret = vfs_write(filp, (__force const char __user *)buf + pos, + len - pos, off); /* TODO handle that correctly */ /*if (ret == -ERESTARTSYS) { continue; @@ -985,11 +986,13 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, int num; u8 type; - if (found_key->type == BTRFS_XATTR_ITEM_KEY) - buf_len = BTRFS_MAX_XATTR_SIZE(root); - else - buf_len = PATH_MAX; - + /* + * Start with a small buffer (1 page). If later we end up needing more + * space, which can happen for xattrs on a fs with a leaf size greater + * then the page size, attempt to increase the buffer. Typically xattr + * values are small. + */ + buf_len = PATH_MAX; buf = kmalloc(buf_len, GFP_NOFS); if (!buf) { ret = -ENOMEM; @@ -1016,7 +1019,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, ret = -ENAMETOOLONG; goto out; } - if (name_len + data_len > buf_len) { + if (name_len + data_len > BTRFS_MAX_XATTR_SIZE(root)) { ret = -E2BIG; goto out; } @@ -1024,12 +1027,34 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, /* * Path too long */ - if (name_len + data_len > buf_len) { + if (name_len + data_len > PATH_MAX) { ret = -ENAMETOOLONG; goto out; } } + if (name_len + data_len > buf_len) { + buf_len = name_len + data_len; + if (is_vmalloc_addr(buf)) { + vfree(buf); + buf = NULL; + } else { + char *tmp = krealloc(buf, buf_len, + GFP_NOFS | __GFP_NOWARN); + + if (!tmp) + kfree(buf); + buf = tmp; + } + if (!buf) { + buf = vmalloc(buf_len); + if (!buf) { + ret = -ENOMEM; + goto out; + } + } + } + read_extent_buffer(eb, buf, (unsigned long)(di + 1), name_len + data_len); @@ -1050,7 +1075,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path, } out: - kfree(buf); + kvfree(buf); return ret; } @@ -3302,7 +3327,7 @@ static int wait_for_parent_move(struct send_ctx *sctx, if (ret < 0 && ret != -ENOENT) { goto out; } else if (ret == -ENOENT) { - ret = 1; + ret = 0; break; } @@ -5703,7 +5728,7 @@ long btrfs_ioctl_send(struct file *mnt_file, void __user *arg_) NULL); sort_clone_roots = 1; - current->journal_info = (void *)BTRFS_SEND_TRANS_STUB; + current->journal_info = BTRFS_SEND_TRANS_STUB; ret = send_subvol(sctx); current->journal_info = NULL; if (ret < 0) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index c4124de4435b..a2b97ef10317 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -60,6 +60,7 @@ #include "backref.h" #include "tests/btrfs-tests.h" +#include "qgroup.h" #define CREATE_TRACE_POINTS #include <trace/events/btrfs.h> @@ -307,13 +308,7 @@ void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function, static void btrfs_put_super(struct super_block *sb) { - (void)close_ctree(btrfs_sb(sb)->tree_root); - /* FIXME: need to fix VFS to return error? */ - /* AV: return it _where_? ->put_super() can be triggered by any number - * of async events, up to and including delivery of SIGKILL to the - * last process that kept it busy. Or segfault in the aforementioned - * process... Whom would you report that to? - */ + close_ctree(btrfs_sb(sb)->tree_root); } enum { @@ -400,7 +395,6 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) int ret = 0; char *compress_type; bool compress_force = false; - bool compress = false; cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy); if (cache_gen) @@ -478,7 +472,6 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) /* Fallthrough */ case Opt_compress: case Opt_compress_type: - compress = true; if (token == Opt_compress || token == Opt_compress_force || strcmp(args[0].from, "zlib") == 0) { @@ -508,11 +501,18 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) btrfs_set_and_info(root, FORCE_COMPRESS, "force %s compression", compress_type); - } else if (compress) { + } else { if (!btrfs_test_opt(root, COMPRESS)) btrfs_info(root->fs_info, "btrfs: use %s compression", compress_type); + /* + * If we remount from compress-force=xxx to + * compress=xxx, we need clear FORCE_COMPRESS + * flag, otherwise, there is no way for users + * to disable forcible compression separately. + */ + btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS); } break; case Opt_ssd: @@ -1014,7 +1014,7 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) seq_puts(seq, ",nodatacow"); if (btrfs_test_opt(root, NOBARRIER)) seq_puts(seq, ",nobarrier"); - if (info->max_inline != 8192 * 1024) + if (info->max_inline != BTRFS_DEFAULT_MAX_INLINE) seq_printf(seq, ",max_inline=%llu", info->max_inline); if (info->alloc_start != 0) seq_printf(seq, ",alloc_start=%llu", info->alloc_start); @@ -1215,6 +1215,56 @@ static struct dentry *mount_subvol(const char *subvol_name, int flags, return root; } +static int parse_security_options(char *orig_opts, + struct security_mnt_opts *sec_opts) +{ + char *secdata = NULL; + int ret = 0; + + secdata = alloc_secdata(); + if (!secdata) + return -ENOMEM; + ret = security_sb_copy_data(orig_opts, secdata); + if (ret) { + free_secdata(secdata); + return ret; + } + ret = security_sb_parse_opts_str(secdata, sec_opts); + free_secdata(secdata); + return ret; +} + +static int setup_security_options(struct btrfs_fs_info *fs_info, + struct super_block *sb, + struct security_mnt_opts *sec_opts) +{ + int ret = 0; + + /* + * Call security_sb_set_mnt_opts() to check whether new sec_opts + * is valid. + */ + ret = security_sb_set_mnt_opts(sb, sec_opts, 0, NULL); + if (ret) + return ret; + +#ifdef CONFIG_SECURITY + if (!fs_info->security_opts.num_mnt_opts) { + /* first time security setup, copy sec_opts to fs_info */ + memcpy(&fs_info->security_opts, sec_opts, sizeof(*sec_opts)); + } else { + /* + * Since SELinux(the only one supports security_mnt_opts) does + * NOT support changing context during remount/mount same sb, + * This must be the same or part of the same security options, + * just free it. + */ + security_free_mnt_opts(sec_opts); + } +#endif + return ret; +} + /* * Find a superblock for the given device / mount point. * @@ -1229,6 +1279,7 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, struct dentry *root; struct btrfs_fs_devices *fs_devices = NULL; struct btrfs_fs_info *fs_info = NULL; + struct security_mnt_opts new_sec_opts; fmode_t mode = FMODE_READ; char *subvol_name = NULL; u64 subvol_objectid = 0; @@ -1251,9 +1302,16 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, return root; } + security_init_mnt_opts(&new_sec_opts); + if (data) { + error = parse_security_options(data, &new_sec_opts); + if (error) + return ERR_PTR(error); + } + error = btrfs_scan_one_device(device_name, mode, fs_type, &fs_devices); if (error) - return ERR_PTR(error); + goto error_sec_opts; /* * Setup a dummy root and fs_info for test/set super. This is because @@ -1262,13 +1320,16 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, * then open_ctree will properly initialize everything later. */ fs_info = kzalloc(sizeof(struct btrfs_fs_info), GFP_NOFS); - if (!fs_info) - return ERR_PTR(-ENOMEM); + if (!fs_info) { + error = -ENOMEM; + goto error_sec_opts; + } fs_info->fs_devices = fs_devices; fs_info->super_copy = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS); fs_info->super_for_commit = kzalloc(BTRFS_SUPER_INFO_SIZE, GFP_NOFS); + security_init_mnt_opts(&fs_info->security_opts); if (!fs_info->super_copy || !fs_info->super_for_commit) { error = -ENOMEM; goto error_fs_info; @@ -1306,8 +1367,19 @@ static struct dentry *btrfs_mount(struct file_system_type *fs_type, int flags, } root = !error ? get_default_root(s, subvol_objectid) : ERR_PTR(error); - if (IS_ERR(root)) + if (IS_ERR(root)) { + deactivate_locked_super(s); + error = PTR_ERR(root); + goto error_sec_opts; + } + + fs_info = btrfs_sb(s); + error = setup_security_options(fs_info, s, &new_sec_opts); + if (error) { + dput(root); deactivate_locked_super(s); + goto error_sec_opts; + } return root; @@ -1315,6 +1387,8 @@ error_close_devices: btrfs_close_devices(fs_devices); error_fs_info: free_fs_info(fs_info); +error_sec_opts: + security_free_mnt_opts(&new_sec_opts); return ERR_PTR(error); } @@ -1396,6 +1470,21 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) sync_filesystem(sb); btrfs_remount_prepare(fs_info); + if (data) { + struct security_mnt_opts new_sec_opts; + + security_init_mnt_opts(&new_sec_opts); + ret = parse_security_options(data, &new_sec_opts); + if (ret) + goto restore; + ret = setup_security_options(fs_info, sb, + &new_sec_opts); + if (ret) { + security_free_mnt_opts(&new_sec_opts); + goto restore; + } + } + ret = btrfs_parse_options(root, data); if (ret) { ret = -EINVAL; @@ -1694,7 +1783,11 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; int ret; - /* holding chunk_muext to avoid allocating new chunks */ + /* + * holding chunk_muext to avoid allocating new chunks, holding + * device_list_mutex to avoid the device being removed + */ + mutex_lock(&fs_info->fs_devices->device_list_mutex); mutex_lock(&fs_info->chunk_mutex); rcu_read_lock(); list_for_each_entry_rcu(found, head, list) { @@ -1735,11 +1828,13 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) ret = btrfs_calc_avail_data_space(fs_info->tree_root, &total_free_data); if (ret) { mutex_unlock(&fs_info->chunk_mutex); + mutex_unlock(&fs_info->fs_devices->device_list_mutex); return ret; } buf->f_bavail += div_u64(total_free_data, factor); buf->f_bavail = buf->f_bavail >> bits; mutex_unlock(&fs_info->chunk_mutex); + mutex_unlock(&fs_info->fs_devices->device_list_mutex); buf->f_type = BTRFS_SUPER_MAGIC; buf->f_bsize = dentry->d_sb->s_blocksize; @@ -1769,7 +1864,7 @@ static struct file_system_type btrfs_fs_type = { .name = "btrfs", .mount = btrfs_mount, .kill_sb = btrfs_kill_super, - .fs_flags = FS_REQUIRES_DEV, + .fs_flags = FS_REQUIRES_DEV | FS_BINARY_MOUNTDATA, }; MODULE_ALIAS_FS("btrfs"); @@ -1993,11 +2088,15 @@ static int __init init_btrfs_fs(void) err = btrfs_prelim_ref_init(); if (err) + goto free_delayed_ref; + + err = btrfs_end_io_wq_init(); + if (err) goto free_prelim_ref; err = btrfs_interface_init(); if (err) - goto free_delayed_ref; + goto free_end_io_wq; btrfs_init_lockdep(); @@ -2015,6 +2114,8 @@ static int __init init_btrfs_fs(void) unregister_ioctl: btrfs_interface_exit(); +free_end_io_wq: + btrfs_end_io_wq_exit(); free_prelim_ref: btrfs_prelim_ref_exit(); free_delayed_ref: diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 12e53556e214..b2e7bb4393f6 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -242,7 +242,7 @@ static ssize_t global_rsv_size_show(struct kobject *kobj, struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; return btrfs_show_u64(&block_rsv->size, &block_rsv->lock, buf); } -BTRFS_ATTR(global_rsv_size, 0444, global_rsv_size_show); +BTRFS_ATTR(global_rsv_size, global_rsv_size_show); static ssize_t global_rsv_reserved_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) @@ -251,7 +251,7 @@ static ssize_t global_rsv_reserved_show(struct kobject *kobj, struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv; return btrfs_show_u64(&block_rsv->reserved, &block_rsv->lock, buf); } -BTRFS_ATTR(global_rsv_reserved, 0444, global_rsv_reserved_show); +BTRFS_ATTR(global_rsv_reserved, global_rsv_reserved_show); #define to_space_info(_kobj) container_of(_kobj, struct btrfs_space_info, kobj) #define to_raid_kobj(_kobj) container_of(_kobj, struct raid_kobject, kobj) @@ -306,7 +306,7 @@ static ssize_t btrfs_space_info_show_##field(struct kobject *kobj, \ struct btrfs_space_info *sinfo = to_space_info(kobj); \ return btrfs_show_u64(&sinfo->field, &sinfo->lock, buf); \ } \ -BTRFS_ATTR(field, 0444, btrfs_space_info_show_##field) +BTRFS_ATTR(field, btrfs_space_info_show_##field) static ssize_t btrfs_space_info_show_total_bytes_pinned(struct kobject *kobj, struct kobj_attribute *a, @@ -325,7 +325,7 @@ SPACE_INFO_ATTR(bytes_reserved); SPACE_INFO_ATTR(bytes_may_use); SPACE_INFO_ATTR(disk_used); SPACE_INFO_ATTR(disk_total); -BTRFS_ATTR(total_bytes_pinned, 0444, btrfs_space_info_show_total_bytes_pinned); +BTRFS_ATTR(total_bytes_pinned, btrfs_space_info_show_total_bytes_pinned); static struct attribute *space_info_attrs[] = { BTRFS_ATTR_PTR(flags), @@ -363,7 +363,8 @@ static ssize_t btrfs_label_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) { struct btrfs_fs_info *fs_info = to_fs_info(kobj); - return snprintf(buf, PAGE_SIZE, "%s\n", fs_info->super_copy->label); + char *label = fs_info->super_copy->label; + return snprintf(buf, PAGE_SIZE, label[0] ? "%s\n" : "%s", label); } static ssize_t btrfs_label_store(struct kobject *kobj, @@ -374,8 +375,18 @@ static ssize_t btrfs_label_store(struct kobject *kobj, struct btrfs_trans_handle *trans; struct btrfs_root *root = fs_info->fs_root; int ret; + size_t p_len; - if (len >= BTRFS_LABEL_SIZE) + if (fs_info->sb->s_flags & MS_RDONLY) + return -EROFS; + + /* + * p_len is the len until the first occurrence of either + * '\n' or '\0' + */ + p_len = strcspn(buf, "\n"); + + if (p_len >= BTRFS_LABEL_SIZE) return -EINVAL; trans = btrfs_start_transaction(root, 0); @@ -383,7 +394,8 @@ static ssize_t btrfs_label_store(struct kobject *kobj, return PTR_ERR(trans); spin_lock(&root->fs_info->super_lock); - strcpy(fs_info->super_copy->label, buf); + memset(fs_info->super_copy->label, 0, BTRFS_LABEL_SIZE); + memcpy(fs_info->super_copy->label, buf, p_len); spin_unlock(&root->fs_info->super_lock); ret = btrfs_commit_transaction(trans, root); @@ -392,14 +404,7 @@ static ssize_t btrfs_label_store(struct kobject *kobj, return ret; } -BTRFS_ATTR_RW(label, 0644, btrfs_label_show, btrfs_label_store); - -static ssize_t btrfs_no_store(struct kobject *kobj, - struct kobj_attribute *a, - const char *buf, size_t len) -{ - return -EPERM; -} +BTRFS_ATTR_RW(label, btrfs_label_show, btrfs_label_store); static ssize_t btrfs_nodesize_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) @@ -409,7 +414,7 @@ static ssize_t btrfs_nodesize_show(struct kobject *kobj, return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->nodesize); } -BTRFS_ATTR_RW(nodesize, 0444, btrfs_nodesize_show, btrfs_no_store); +BTRFS_ATTR(nodesize, btrfs_nodesize_show); static ssize_t btrfs_sectorsize_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) @@ -419,7 +424,7 @@ static ssize_t btrfs_sectorsize_show(struct kobject *kobj, return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->sectorsize); } -BTRFS_ATTR_RW(sectorsize, 0444, btrfs_sectorsize_show, btrfs_no_store); +BTRFS_ATTR(sectorsize, btrfs_sectorsize_show); static ssize_t btrfs_clone_alignment_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) @@ -429,7 +434,7 @@ static ssize_t btrfs_clone_alignment_show(struct kobject *kobj, return snprintf(buf, PAGE_SIZE, "%u\n", fs_info->super_copy->sectorsize); } -BTRFS_ATTR_RW(clone_alignment, 0444, btrfs_clone_alignment_show, btrfs_no_store); +BTRFS_ATTR(clone_alignment, btrfs_clone_alignment_show); static struct attribute *btrfs_attrs[] = { BTRFS_ATTR_PTR(label), diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index ac46df37504c..f7dd298b3cf6 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -20,16 +20,20 @@ enum btrfs_feature_set { .store = _store, \ } -#define BTRFS_ATTR_RW(_name, _mode, _show, _store) \ -static struct kobj_attribute btrfs_attr_##_name = \ - __INIT_KOBJ_ATTR(_name, _mode, _show, _store) -#define BTRFS_ATTR(_name, _mode, _show) \ - BTRFS_ATTR_RW(_name, _mode, _show, NULL) +#define BTRFS_ATTR_RW(_name, _show, _store) \ + static struct kobj_attribute btrfs_attr_##_name = \ + __INIT_KOBJ_ATTR(_name, 0644, _show, _store) + +#define BTRFS_ATTR(_name, _show) \ + static struct kobj_attribute btrfs_attr_##_name = \ + __INIT_KOBJ_ATTR(_name, 0444, _show, NULL) + #define BTRFS_ATTR_PTR(_name) (&btrfs_attr_##_name.attr) #define BTRFS_RAID_ATTR(_name, _show) \ -static struct kobj_attribute btrfs_raid_attr_##_name = \ + static struct kobj_attribute btrfs_raid_attr_##_name = \ __INIT_KOBJ_ATTR(_name, 0444, _show, NULL) + #define BTRFS_RAID_ATTR_PTR(_name) (&btrfs_raid_attr_##_name.attr) diff --git a/fs/btrfs/tests/free-space-tests.c b/fs/btrfs/tests/free-space-tests.c index c8d9ddf84c69..2299bfde39ee 100644 --- a/fs/btrfs/tests/free-space-tests.c +++ b/fs/btrfs/tests/free-space-tests.c @@ -40,11 +40,12 @@ static struct btrfs_block_group_cache *init_test_block_group(void) cache->key.offset = 1024 * 1024 * 1024; cache->key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; cache->sectorsize = 4096; + cache->full_stripe_len = 4096; spin_lock_init(&cache->lock); INIT_LIST_HEAD(&cache->list); INIT_LIST_HEAD(&cache->cluster_list); - INIT_LIST_HEAD(&cache->new_bg_list); + INIT_LIST_HEAD(&cache->bg_list); btrfs_init_free_space_ctl(cache); @@ -364,6 +365,517 @@ static int test_bitmaps_and_extents(struct btrfs_block_group_cache *cache) return 0; } +/* Used by test_steal_space_from_bitmap_to_extent(). */ +static bool test_use_bitmap(struct btrfs_free_space_ctl *ctl, + struct btrfs_free_space *info) +{ + return ctl->free_extents > 0; +} + +/* Used by test_steal_space_from_bitmap_to_extent(). */ +static int +check_num_extents_and_bitmaps(const struct btrfs_block_group_cache *cache, + const int num_extents, + const int num_bitmaps) +{ + if (cache->free_space_ctl->free_extents != num_extents) { + test_msg("Incorrect # of extent entries in the cache: %d, expected %d\n", + cache->free_space_ctl->free_extents, num_extents); + return -EINVAL; + } + if (cache->free_space_ctl->total_bitmaps != num_bitmaps) { + test_msg("Incorrect # of extent entries in the cache: %d, expected %d\n", + cache->free_space_ctl->total_bitmaps, num_bitmaps); + return -EINVAL; + } + return 0; +} + +/* Used by test_steal_space_from_bitmap_to_extent(). */ +static int check_cache_empty(struct btrfs_block_group_cache *cache) +{ + u64 offset; + u64 max_extent_size; + + /* + * Now lets confirm that there's absolutely no free space left to + * allocate. + */ + if (cache->free_space_ctl->free_space != 0) { + test_msg("Cache free space is not 0\n"); + return -EINVAL; + } + + /* And any allocation request, no matter how small, should fail now. */ + offset = btrfs_find_space_for_alloc(cache, 0, 4096, 0, + &max_extent_size); + if (offset != 0) { + test_msg("Space allocation did not fail, returned offset: %llu", + offset); + return -EINVAL; + } + + /* And no extent nor bitmap entries in the cache anymore. */ + return check_num_extents_and_bitmaps(cache, 0, 0); +} + +/* + * Before we were able to steal free space from a bitmap entry to an extent + * entry, we could end up with 2 entries representing a contiguous free space. + * One would be an extent entry and the other a bitmap entry. Since in order + * to allocate space to a caller we use only 1 entry, we couldn't return that + * whole range to the caller if it was requested. This forced the caller to + * either assume ENOSPC or perform several smaller space allocations, which + * wasn't optimal as they could be spread all over the block group while under + * concurrency (extra overhead and fragmentation). + * + * This stealing approach is benefical, since we always prefer to allocate from + * extent entries, both for clustered and non-clustered allocation requests. + */ +static int +test_steal_space_from_bitmap_to_extent(struct btrfs_block_group_cache *cache) +{ + int ret; + u64 offset; + u64 max_extent_size; + + bool (*use_bitmap_op)(struct btrfs_free_space_ctl *, + struct btrfs_free_space *); + + test_msg("Running space stealing from bitmap to extent\n"); + + /* + * For this test, we want to ensure we end up with an extent entry + * immediately adjacent to a bitmap entry, where the bitmap starts + * at an offset where the extent entry ends. We keep adding and + * removing free space to reach into this state, but to get there + * we need to reach a point where marking new free space doesn't + * result in adding new extent entries or merging the new space + * with existing extent entries - the space ends up being marked + * in an existing bitmap that covers the new free space range. + * + * To get there, we need to reach the threshold defined set at + * cache->free_space_ctl->extents_thresh, which currently is + * 256 extents on a x86_64 system at least, and a few other + * conditions (check free_space_cache.c). Instead of making the + * test much longer and complicated, use a "use_bitmap" operation + * that forces use of bitmaps as soon as we have at least 1 + * extent entry. + */ + use_bitmap_op = cache->free_space_ctl->op->use_bitmap; + cache->free_space_ctl->op->use_bitmap = test_use_bitmap; + + /* + * Extent entry covering free space range [128Mb - 256Kb, 128Mb - 128Kb[ + */ + ret = test_add_free_space_entry(cache, 128 * 1024 * 1024 - 256 * 1024, + 128 * 1024, 0); + if (ret) { + test_msg("Couldn't add extent entry %d\n", ret); + return ret; + } + + /* Bitmap entry covering free space range [128Mb + 512Kb, 256Mb[ */ + ret = test_add_free_space_entry(cache, 128 * 1024 * 1024 + 512 * 1024, + 128 * 1024 * 1024 - 512 * 1024, 1); + if (ret) { + test_msg("Couldn't add bitmap entry %d\n", ret); + return ret; + } + + ret = check_num_extents_and_bitmaps(cache, 2, 1); + if (ret) + return ret; + + /* + * Now make only the first 256Kb of the bitmap marked as free, so that + * we end up with only the following ranges marked as free space: + * + * [128Mb - 256Kb, 128Mb - 128Kb[ + * [128Mb + 512Kb, 128Mb + 768Kb[ + */ + ret = btrfs_remove_free_space(cache, + 128 * 1024 * 1024 + 768 * 1024, + 128 * 1024 * 1024 - 768 * 1024); + if (ret) { + test_msg("Failed to free part of bitmap space %d\n", ret); + return ret; + } + + /* Confirm that only those 2 ranges are marked as free. */ + if (!test_check_exists(cache, 128 * 1024 * 1024 - 256 * 1024, + 128 * 1024)) { + test_msg("Free space range missing\n"); + return -ENOENT; + } + if (!test_check_exists(cache, 128 * 1024 * 1024 + 512 * 1024, + 256 * 1024)) { + test_msg("Free space range missing\n"); + return -ENOENT; + } + + /* + * Confirm that the bitmap range [128Mb + 768Kb, 256Mb[ isn't marked + * as free anymore. + */ + if (test_check_exists(cache, 128 * 1024 * 1024 + 768 * 1024, + 128 * 1024 * 1024 - 768 * 1024)) { + test_msg("Bitmap region not removed from space cache\n"); + return -EINVAL; + } + + /* + * Confirm that the region [128Mb + 256Kb, 128Mb + 512Kb[, which is + * covered by the bitmap, isn't marked as free. + */ + if (test_check_exists(cache, 128 * 1024 * 1024 + 256 * 1024, + 256 * 1024)) { + test_msg("Invalid bitmap region marked as free\n"); + return -EINVAL; + } + + /* + * Confirm that the region [128Mb, 128Mb + 256Kb[, which is covered + * by the bitmap too, isn't marked as free either. + */ + if (test_check_exists(cache, 128 * 1024 * 1024, + 256 * 1024)) { + test_msg("Invalid bitmap region marked as free\n"); + return -EINVAL; + } + + /* + * Now lets mark the region [128Mb, 128Mb + 512Kb[ as free too. But, + * lets make sure the free space cache marks it as free in the bitmap, + * and doesn't insert a new extent entry to represent this region. + */ + ret = btrfs_add_free_space(cache, 128 * 1024 * 1024, 512 * 1024); + if (ret) { + test_msg("Error adding free space: %d\n", ret); + return ret; + } + /* Confirm the region is marked as free. */ + if (!test_check_exists(cache, 128 * 1024 * 1024, 512 * 1024)) { + test_msg("Bitmap region not marked as free\n"); + return -ENOENT; + } + + /* + * Confirm that no new extent entries or bitmap entries were added to + * the cache after adding that free space region. + */ + ret = check_num_extents_and_bitmaps(cache, 2, 1); + if (ret) + return ret; + + /* + * Now lets add a small free space region to the right of the previous + * one, which is not contiguous with it and is part of the bitmap too. + * The goal is to test that the bitmap entry space stealing doesn't + * steal this space region. + */ + ret = btrfs_add_free_space(cache, 128 * 1024 * 1024 + 16 * 1024 * 1024, + 4096); + if (ret) { + test_msg("Error adding free space: %d\n", ret); + return ret; + } + + /* + * Confirm that no new extent entries or bitmap entries were added to + * the cache after adding that free space region. + */ + ret = check_num_extents_and_bitmaps(cache, 2, 1); + if (ret) + return ret; + + /* + * Now mark the region [128Mb - 128Kb, 128Mb[ as free too. This will + * expand the range covered by the existing extent entry that represents + * the free space [128Mb - 256Kb, 128Mb - 128Kb[. + */ + ret = btrfs_add_free_space(cache, 128 * 1024 * 1024 - 128 * 1024, + 128 * 1024); + if (ret) { + test_msg("Error adding free space: %d\n", ret); + return ret; + } + /* Confirm the region is marked as free. */ + if (!test_check_exists(cache, 128 * 1024 * 1024 - 128 * 1024, + 128 * 1024)) { + test_msg("Extent region not marked as free\n"); + return -ENOENT; + } + + /* + * Confirm that our extent entry didn't stole all free space from the + * bitmap, because of the small 4Kb free space region. + */ + ret = check_num_extents_and_bitmaps(cache, 2, 1); + if (ret) + return ret; + + /* + * So now we have the range [128Mb - 256Kb, 128Mb + 768Kb[ as free + * space. Without stealing bitmap free space into extent entry space, + * we would have all this free space represented by 2 entries in the + * cache: + * + * extent entry covering range: [128Mb - 256Kb, 128Mb[ + * bitmap entry covering range: [128Mb, 128Mb + 768Kb[ + * + * Attempting to allocate the whole free space (1Mb) would fail, because + * we can't allocate from multiple entries. + * With the bitmap free space stealing, we get a single extent entry + * that represents the 1Mb free space, and therefore we're able to + * allocate the whole free space at once. + */ + if (!test_check_exists(cache, 128 * 1024 * 1024 - 256 * 1024, + 1 * 1024 * 1024)) { + test_msg("Expected region not marked as free\n"); + return -ENOENT; + } + + if (cache->free_space_ctl->free_space != (1 * 1024 * 1024 + 4096)) { + test_msg("Cache free space is not 1Mb + 4Kb\n"); + return -EINVAL; + } + + offset = btrfs_find_space_for_alloc(cache, + 0, 1 * 1024 * 1024, 0, + &max_extent_size); + if (offset != (128 * 1024 * 1024 - 256 * 1024)) { + test_msg("Failed to allocate 1Mb from space cache, returned offset is: %llu\n", + offset); + return -EINVAL; + } + + /* All that remains is a 4Kb free space region in a bitmap. Confirm. */ + ret = check_num_extents_and_bitmaps(cache, 1, 1); + if (ret) + return ret; + + if (cache->free_space_ctl->free_space != 4096) { + test_msg("Cache free space is not 4Kb\n"); + return -EINVAL; + } + + offset = btrfs_find_space_for_alloc(cache, + 0, 4096, 0, + &max_extent_size); + if (offset != (128 * 1024 * 1024 + 16 * 1024 * 1024)) { + test_msg("Failed to allocate 4Kb from space cache, returned offset is: %llu\n", + offset); + return -EINVAL; + } + + ret = check_cache_empty(cache); + if (ret) + return ret; + + __btrfs_remove_free_space_cache(cache->free_space_ctl); + + /* + * Now test a similar scenario, but where our extent entry is located + * to the right of the bitmap entry, so that we can check that stealing + * space from a bitmap to the front of an extent entry works. + */ + + /* + * Extent entry covering free space range [128Mb + 128Kb, 128Mb + 256Kb[ + */ + ret = test_add_free_space_entry(cache, 128 * 1024 * 1024 + 128 * 1024, + 128 * 1024, 0); + if (ret) { + test_msg("Couldn't add extent entry %d\n", ret); + return ret; + } + + /* Bitmap entry covering free space range [0, 128Mb - 512Kb[ */ + ret = test_add_free_space_entry(cache, 0, + 128 * 1024 * 1024 - 512 * 1024, 1); + if (ret) { + test_msg("Couldn't add bitmap entry %d\n", ret); + return ret; + } + + ret = check_num_extents_and_bitmaps(cache, 2, 1); + if (ret) + return ret; + + /* + * Now make only the last 256Kb of the bitmap marked as free, so that + * we end up with only the following ranges marked as free space: + * + * [128Mb + 128b, 128Mb + 256Kb[ + * [128Mb - 768Kb, 128Mb - 512Kb[ + */ + ret = btrfs_remove_free_space(cache, + 0, + 128 * 1024 * 1024 - 768 * 1024); + if (ret) { + test_msg("Failed to free part of bitmap space %d\n", ret); + return ret; + } + + /* Confirm that only those 2 ranges are marked as free. */ + if (!test_check_exists(cache, 128 * 1024 * 1024 + 128 * 1024, + 128 * 1024)) { + test_msg("Free space range missing\n"); + return -ENOENT; + } + if (!test_check_exists(cache, 128 * 1024 * 1024 - 768 * 1024, + 256 * 1024)) { + test_msg("Free space range missing\n"); + return -ENOENT; + } + + /* + * Confirm that the bitmap range [0, 128Mb - 768Kb[ isn't marked + * as free anymore. + */ + if (test_check_exists(cache, 0, + 128 * 1024 * 1024 - 768 * 1024)) { + test_msg("Bitmap region not removed from space cache\n"); + return -EINVAL; + } + + /* + * Confirm that the region [128Mb - 512Kb, 128Mb[, which is + * covered by the bitmap, isn't marked as free. + */ + if (test_check_exists(cache, 128 * 1024 * 1024 - 512 * 1024, + 512 * 1024)) { + test_msg("Invalid bitmap region marked as free\n"); + return -EINVAL; + } + + /* + * Now lets mark the region [128Mb - 512Kb, 128Mb[ as free too. But, + * lets make sure the free space cache marks it as free in the bitmap, + * and doesn't insert a new extent entry to represent this region. + */ + ret = btrfs_add_free_space(cache, 128 * 1024 * 1024 - 512 * 1024, + 512 * 1024); + if (ret) { + test_msg("Error adding free space: %d\n", ret); + return ret; + } + /* Confirm the region is marked as free. */ + if (!test_check_exists(cache, 128 * 1024 * 1024 - 512 * 1024, + 512 * 1024)) { + test_msg("Bitmap region not marked as free\n"); + return -ENOENT; + } + + /* + * Confirm that no new extent entries or bitmap entries were added to + * the cache after adding that free space region. + */ + ret = check_num_extents_and_bitmaps(cache, 2, 1); + if (ret) + return ret; + + /* + * Now lets add a small free space region to the left of the previous + * one, which is not contiguous with it and is part of the bitmap too. + * The goal is to test that the bitmap entry space stealing doesn't + * steal this space region. + */ + ret = btrfs_add_free_space(cache, 32 * 1024 * 1024, 8192); + if (ret) { + test_msg("Error adding free space: %d\n", ret); + return ret; + } + + /* + * Now mark the region [128Mb, 128Mb + 128Kb[ as free too. This will + * expand the range covered by the existing extent entry that represents + * the free space [128Mb + 128Kb, 128Mb + 256Kb[. + */ + ret = btrfs_add_free_space(cache, 128 * 1024 * 1024, 128 * 1024); + if (ret) { + test_msg("Error adding free space: %d\n", ret); + return ret; + } + /* Confirm the region is marked as free. */ + if (!test_check_exists(cache, 128 * 1024 * 1024, 128 * 1024)) { + test_msg("Extent region not marked as free\n"); + return -ENOENT; + } + + /* + * Confirm that our extent entry didn't stole all free space from the + * bitmap, because of the small 8Kb free space region. + */ + ret = check_num_extents_and_bitmaps(cache, 2, 1); + if (ret) + return ret; + + /* + * So now we have the range [128Mb - 768Kb, 128Mb + 256Kb[ as free + * space. Without stealing bitmap free space into extent entry space, + * we would have all this free space represented by 2 entries in the + * cache: + * + * extent entry covering range: [128Mb, 128Mb + 256Kb[ + * bitmap entry covering range: [128Mb - 768Kb, 128Mb[ + * + * Attempting to allocate the whole free space (1Mb) would fail, because + * we can't allocate from multiple entries. + * With the bitmap free space stealing, we get a single extent entry + * that represents the 1Mb free space, and therefore we're able to + * allocate the whole free space at once. + */ + if (!test_check_exists(cache, 128 * 1024 * 1024 - 768 * 1024, + 1 * 1024 * 1024)) { + test_msg("Expected region not marked as free\n"); + return -ENOENT; + } + + if (cache->free_space_ctl->free_space != (1 * 1024 * 1024 + 8192)) { + test_msg("Cache free space is not 1Mb + 8Kb\n"); + return -EINVAL; + } + + offset = btrfs_find_space_for_alloc(cache, + 0, 1 * 1024 * 1024, 0, + &max_extent_size); + if (offset != (128 * 1024 * 1024 - 768 * 1024)) { + test_msg("Failed to allocate 1Mb from space cache, returned offset is: %llu\n", + offset); + return -EINVAL; + } + + /* All that remains is a 8Kb free space region in a bitmap. Confirm. */ + ret = check_num_extents_and_bitmaps(cache, 1, 1); + if (ret) + return ret; + + if (cache->free_space_ctl->free_space != 8192) { + test_msg("Cache free space is not 8Kb\n"); + return -EINVAL; + } + + offset = btrfs_find_space_for_alloc(cache, + 0, 8192, 0, + &max_extent_size); + if (offset != (32 * 1024 * 1024)) { + test_msg("Failed to allocate 8Kb from space cache, returned offset is: %llu\n", + offset); + return -EINVAL; + } + + ret = check_cache_empty(cache); + if (ret) + return ret; + + cache->free_space_ctl->op->use_bitmap = use_bitmap_op; + __btrfs_remove_free_space_cache(cache->free_space_ctl); + + return 0; +} + int btrfs_test_free_space_cache(void) { struct btrfs_block_group_cache *cache; @@ -386,6 +898,8 @@ int btrfs_test_free_space_cache(void) ret = test_bitmaps_and_extents(cache); if (ret) goto out; + + ret = test_steal_space_from_bitmap_to_extent(cache); out: __btrfs_remove_free_space_cache(cache->free_space_ctl); kfree(cache->free_space_ctl); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index d89c6d3542ca..dcaae3616728 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -386,7 +386,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type, int ret; /* Send isn't supposed to start transactions. */ - ASSERT(current->journal_info != (void *)BTRFS_SEND_TRANS_STUB); + ASSERT(current->journal_info != BTRFS_SEND_TRANS_STUB); if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) return ERR_PTR(-EROFS); @@ -408,7 +408,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type, if (num_items > 0 && root != root->fs_info->chunk_root) { if (root->fs_info->quota_enabled && is_fstree(root->root_key.objectid)) { - qgroup_reserved = num_items * root->leafsize; + qgroup_reserved = num_items * root->nodesize; ret = btrfs_qgroup_reserve(root, qgroup_reserved); if (ret) return ERR_PTR(ret); @@ -418,7 +418,7 @@ start_transaction(struct btrfs_root *root, u64 num_items, unsigned int type, /* * Do the reservation for the relocation root creation */ - if (unlikely(need_reserve_reloc_root(root))) { + if (need_reserve_reloc_root(root)) { num_bytes += root->nodesize; reloc_reserved = true; } @@ -609,7 +609,6 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid) if (transid <= root->fs_info->last_trans_committed) goto out; - ret = -EINVAL; /* find specified transaction */ spin_lock(&root->fs_info->trans_lock); list_for_each_entry(t, &root->fs_info->trans_list, list) { @@ -625,9 +624,16 @@ int btrfs_wait_for_commit(struct btrfs_root *root, u64 transid) } } spin_unlock(&root->fs_info->trans_lock); - /* The specified transaction doesn't exist */ - if (!cur_trans) + + /* + * The specified transaction doesn't exist, or we + * raced with btrfs_commit_transaction + */ + if (!cur_trans) { + if (transid > root->fs_info->last_trans_committed) + ret = -EINVAL; goto out; + } } else { /* find newest transaction that is committing | committed */ spin_lock(&root->fs_info->trans_lock); @@ -851,6 +857,8 @@ int btrfs_wait_marked_extents(struct btrfs_root *root, struct extent_state *cached_state = NULL; u64 start = 0; u64 end; + struct btrfs_inode *btree_ino = BTRFS_I(root->fs_info->btree_inode); + bool errors = false; while (!find_first_extent_bit(dirty_pages, start, &start, &end, EXTENT_NEED_WAIT, &cached_state)) { @@ -864,6 +872,26 @@ int btrfs_wait_marked_extents(struct btrfs_root *root, } if (err) werr = err; + + if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { + if ((mark & EXTENT_DIRTY) && + test_and_clear_bit(BTRFS_INODE_BTREE_LOG1_ERR, + &btree_ino->runtime_flags)) + errors = true; + + if ((mark & EXTENT_NEW) && + test_and_clear_bit(BTRFS_INODE_BTREE_LOG2_ERR, + &btree_ino->runtime_flags)) + errors = true; + } else { + if (test_and_clear_bit(BTRFS_INODE_BTREE_ERR, + &btree_ino->runtime_flags)) + errors = true; + } + + if (errors && !werr) + werr = -EIO; + return werr; } @@ -1629,6 +1657,7 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, { struct btrfs_transaction *cur_trans = trans->transaction; struct btrfs_transaction *prev_trans = NULL; + struct btrfs_inode *btree_ino = BTRFS_I(root->fs_info->btree_inode); int ret; /* Stop the commit early if ->aborted is set */ @@ -1868,6 +1897,12 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans, memcpy(root->fs_info->super_for_commit, root->fs_info->super_copy, sizeof(*root->fs_info->super_copy)); + btrfs_update_commit_device_size(root->fs_info); + btrfs_update_commit_device_bytes_used(root, cur_trans); + + clear_bit(BTRFS_INODE_BTREE_LOG1_ERR, &btree_ino->runtime_flags); + clear_bit(BTRFS_INODE_BTREE_LOG2_ERR, &btree_ino->runtime_flags); + spin_lock(&root->fs_info->trans_lock); cur_trans->state = TRANS_STATE_UNBLOCKED; root->fs_info->running_transaction = NULL; @@ -1981,9 +2016,6 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_root *root) ret = btrfs_drop_snapshot(root, NULL, 0, 0); else ret = btrfs_drop_snapshot(root, NULL, 1, 0); - /* - * If we encounter a transaction abort during snapshot cleaning, we - * don't want to crash here - */ + return (ret < 0) ? 0 : 1; } diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 579be51b27e5..d8f40e1a5d2d 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -79,7 +79,7 @@ struct btrfs_transaction { #define TRANS_EXTWRITERS (__TRANS_USERSPACE | __TRANS_START | \ __TRANS_ATTACH) -#define BTRFS_SEND_TRANS_STUB 1 +#define BTRFS_SEND_TRANS_STUB ((void *)1) struct btrfs_trans_handle { u64 transid; diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 1d1ba083ca6e..1475979e5718 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -97,7 +97,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, int inode_only, const loff_t start, - const loff_t end); + const loff_t end, + struct btrfs_log_ctx *ctx); static int link_to_fixup_dir(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, u64 objectid); @@ -1498,7 +1499,7 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, return -EIO; key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; - btrfs_set_key_type(&key, BTRFS_ORPHAN_ITEM_KEY); + key.type = BTRFS_ORPHAN_ITEM_KEY; key.offset = objectid; ret = btrfs_insert_empty_item(trans, root, path, &key, 0); @@ -1637,6 +1638,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, found_key.type == log_key.type && found_key.offset == log_key.offset && btrfs_dir_type(path->nodes[0], dst_di) == log_type) { + update_size = false; goto out; } @@ -2157,7 +2159,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans, bytenr = btrfs_node_blockptr(cur, path->slots[*level]); ptr_gen = btrfs_node_ptr_generation(cur, path->slots[*level]); - blocksize = btrfs_level_size(root, *level - 1); + blocksize = root->nodesize; parent = path->nodes[*level]; root_owner = btrfs_header_owner(parent); @@ -2983,8 +2985,6 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, min_key.type = key_type; min_key.offset = min_offset; - path->keep_locks = 1; - ret = btrfs_search_forward(root, &min_key, path, trans->transid); /* @@ -3364,7 +3364,7 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, * or deletes of this inode don't have to relog the inode * again */ - if (btrfs_key_type(ins_keys + i) == BTRFS_EXTENT_DATA_KEY && + if (ins_keys[i].type == BTRFS_EXTENT_DATA_KEY && !skip_csum) { int found_type; extent = btrfs_item_ptr(src, start_slot + i, @@ -3573,107 +3573,33 @@ static int extent_cmp(void *priv, struct list_head *a, struct list_head *b) return 0; } -static int log_one_extent(struct btrfs_trans_handle *trans, - struct inode *inode, struct btrfs_root *root, - struct extent_map *em, struct btrfs_path *path, - struct list_head *logged_list) +static int wait_ordered_extents(struct btrfs_trans_handle *trans, + struct inode *inode, + struct btrfs_root *root, + const struct extent_map *em, + const struct list_head *logged_list, + bool *ordered_io_error) { - struct btrfs_root *log = root->log_root; - struct btrfs_file_extent_item *fi; - struct extent_buffer *leaf; struct btrfs_ordered_extent *ordered; - struct list_head ordered_sums; - struct btrfs_map_token token; - struct btrfs_key key; + struct btrfs_root *log = root->log_root; u64 mod_start = em->mod_start; u64 mod_len = em->mod_len; + const bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; u64 csum_offset; u64 csum_len; - u64 extent_offset = em->start - em->orig_start; - u64 block_len; - int ret; - bool skip_csum = BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM; - int extent_inserted = 0; - - INIT_LIST_HEAD(&ordered_sums); - btrfs_init_map_token(&token); - - ret = __btrfs_drop_extents(trans, log, inode, path, em->start, - em->start + em->len, NULL, 0, 1, - sizeof(*fi), &extent_inserted); - if (ret) - return ret; - - if (!extent_inserted) { - key.objectid = btrfs_ino(inode); - key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = em->start; - - ret = btrfs_insert_empty_item(trans, log, path, &key, - sizeof(*fi)); - if (ret) - return ret; - } - leaf = path->nodes[0]; - fi = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); - - btrfs_set_token_file_extent_generation(leaf, fi, em->generation, - &token); - if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) { - skip_csum = true; - btrfs_set_token_file_extent_type(leaf, fi, - BTRFS_FILE_EXTENT_PREALLOC, - &token); - } else { - btrfs_set_token_file_extent_type(leaf, fi, - BTRFS_FILE_EXTENT_REG, - &token); - if (em->block_start == EXTENT_MAP_HOLE) - skip_csum = true; - } - - block_len = max(em->block_len, em->orig_block_len); - if (em->compress_type != BTRFS_COMPRESS_NONE) { - btrfs_set_token_file_extent_disk_bytenr(leaf, fi, - em->block_start, - &token); - btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len, - &token); - } else if (em->block_start < EXTENT_MAP_LAST_BYTE) { - btrfs_set_token_file_extent_disk_bytenr(leaf, fi, - em->block_start - - extent_offset, &token); - btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len, - &token); - } else { - btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token); - btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0, - &token); - } - - btrfs_set_token_file_extent_offset(leaf, fi, - em->start - em->orig_start, - &token); - btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token); - btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->ram_bytes, &token); - btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type, - &token); - btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token); - btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token); - btrfs_mark_buffer_dirty(leaf); + LIST_HEAD(ordered_sums); + int ret = 0; - btrfs_release_path(path); - if (ret) { - return ret; - } + *ordered_io_error = false; - if (skip_csum) + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) || + em->block_start == EXTENT_MAP_HOLE) return 0; /* - * First check and see if our csums are on our outstanding ordered - * extents. + * Wait far any ordered extent that covers our extent map. If it + * finishes without an error, first check and see if our csums are on + * our outstanding ordered extents. */ list_for_each_entry(ordered, logged_list, log_list) { struct btrfs_ordered_sum *sum; @@ -3685,6 +3611,24 @@ static int log_one_extent(struct btrfs_trans_handle *trans, mod_start + mod_len <= ordered->file_offset) continue; + if (!test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) && + !test_bit(BTRFS_ORDERED_IOERR, &ordered->flags) && + !test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) { + const u64 start = ordered->file_offset; + const u64 end = ordered->file_offset + ordered->len - 1; + + WARN_ON(ordered->inode != inode); + filemap_fdatawrite_range(inode->i_mapping, start, end); + } + + wait_event(ordered->wait, + (test_bit(BTRFS_ORDERED_IO_DONE, &ordered->flags) || + test_bit(BTRFS_ORDERED_IOERR, &ordered->flags))); + + if (test_bit(BTRFS_ORDERED_IOERR, &ordered->flags)) { + *ordered_io_error = true; + break; + } /* * We are going to copy all the csums on this ordered extent, so * go ahead and adjust mod_start and mod_len in case this @@ -3716,6 +3660,9 @@ static int log_one_extent(struct btrfs_trans_handle *trans, } } + if (skip_csum) + continue; + /* * To keep us from looping for the above case of an ordered * extent that falls inside of the logged extent. @@ -3733,18 +3680,16 @@ static int log_one_extent(struct btrfs_trans_handle *trans, list_for_each_entry(sum, &ordered->list, list) { ret = btrfs_csum_file_blocks(trans, log, sum); if (ret) - goto unlocked; + break; } - } -unlocked: - if (!mod_len || ret) + if (*ordered_io_error || !mod_len || ret || skip_csum) return ret; if (em->compress_type) { csum_offset = 0; - csum_len = block_len; + csum_len = max(em->block_len, em->orig_block_len); } else { csum_offset = mod_start - em->start; csum_len = mod_len; @@ -3771,11 +3716,106 @@ unlocked: return ret; } +static int log_one_extent(struct btrfs_trans_handle *trans, + struct inode *inode, struct btrfs_root *root, + const struct extent_map *em, + struct btrfs_path *path, + const struct list_head *logged_list, + struct btrfs_log_ctx *ctx) +{ + struct btrfs_root *log = root->log_root; + struct btrfs_file_extent_item *fi; + struct extent_buffer *leaf; + struct btrfs_map_token token; + struct btrfs_key key; + u64 extent_offset = em->start - em->orig_start; + u64 block_len; + int ret; + int extent_inserted = 0; + bool ordered_io_err = false; + + ret = wait_ordered_extents(trans, inode, root, em, logged_list, + &ordered_io_err); + if (ret) + return ret; + + if (ordered_io_err) { + ctx->io_err = -EIO; + return 0; + } + + btrfs_init_map_token(&token); + + ret = __btrfs_drop_extents(trans, log, inode, path, em->start, + em->start + em->len, NULL, 0, 1, + sizeof(*fi), &extent_inserted); + if (ret) + return ret; + + if (!extent_inserted) { + key.objectid = btrfs_ino(inode); + key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = em->start; + + ret = btrfs_insert_empty_item(trans, log, path, &key, + sizeof(*fi)); + if (ret) + return ret; + } + leaf = path->nodes[0]; + fi = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); + + btrfs_set_token_file_extent_generation(leaf, fi, em->generation, + &token); + if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) + btrfs_set_token_file_extent_type(leaf, fi, + BTRFS_FILE_EXTENT_PREALLOC, + &token); + else + btrfs_set_token_file_extent_type(leaf, fi, + BTRFS_FILE_EXTENT_REG, + &token); + + block_len = max(em->block_len, em->orig_block_len); + if (em->compress_type != BTRFS_COMPRESS_NONE) { + btrfs_set_token_file_extent_disk_bytenr(leaf, fi, + em->block_start, + &token); + btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len, + &token); + } else if (em->block_start < EXTENT_MAP_LAST_BYTE) { + btrfs_set_token_file_extent_disk_bytenr(leaf, fi, + em->block_start - + extent_offset, &token); + btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, block_len, + &token); + } else { + btrfs_set_token_file_extent_disk_bytenr(leaf, fi, 0, &token); + btrfs_set_token_file_extent_disk_num_bytes(leaf, fi, 0, + &token); + } + + btrfs_set_token_file_extent_offset(leaf, fi, extent_offset, &token); + btrfs_set_token_file_extent_num_bytes(leaf, fi, em->len, &token); + btrfs_set_token_file_extent_ram_bytes(leaf, fi, em->ram_bytes, &token); + btrfs_set_token_file_extent_compression(leaf, fi, em->compress_type, + &token); + btrfs_set_token_file_extent_encryption(leaf, fi, 0, &token); + btrfs_set_token_file_extent_other_encoding(leaf, fi, 0, &token); + btrfs_mark_buffer_dirty(leaf); + + btrfs_release_path(path); + + return ret; +} + static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, struct btrfs_path *path, - struct list_head *logged_list) + struct list_head *logged_list, + struct btrfs_log_ctx *ctx) { struct extent_map *em, *n; struct list_head extents; @@ -3833,7 +3873,8 @@ process: write_unlock(&tree->lock); - ret = log_one_extent(trans, inode, root, em, path, logged_list); + ret = log_one_extent(trans, inode, root, em, path, logged_list, + ctx); write_lock(&tree->lock); clear_em_logging(tree, em); free_extent_map(em); @@ -3863,7 +3904,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, int inode_only, const loff_t start, - const loff_t end) + const loff_t end, + struct btrfs_log_ctx *ctx) { struct btrfs_path *path; struct btrfs_path *dst_path; @@ -3964,7 +4006,6 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, err = ret; goto out_unlock; } - path->keep_locks = 1; while (1) { ins_nr = 0; @@ -3994,7 +4035,8 @@ again: if (ret < 0) { err = ret; goto out_unlock; - } if (ret) { + } + if (ret) { ins_nr = 0; btrfs_release_path(path); continue; @@ -4048,7 +4090,7 @@ log_extents: btrfs_release_path(dst_path); if (fast_search) { ret = btrfs_log_changed_extents(trans, root, inode, dst_path, - &logged_list); + &logged_list, ctx); if (ret) { err = ret; goto out_unlock; @@ -4238,7 +4280,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, if (ret) goto end_no_trans; - ret = btrfs_log_inode(trans, root, inode, inode_only, start, end); + ret = btrfs_log_inode(trans, root, inode, inode_only, start, end, ctx); if (ret) goto end_trans; @@ -4267,7 +4309,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, if (BTRFS_I(inode)->generation > root->fs_info->last_trans_committed) { ret = btrfs_log_inode(trans, root, inode, inode_only, - 0, LLONG_MAX); + 0, LLONG_MAX, ctx); if (ret) goto end_trans; } @@ -4359,7 +4401,7 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) again: key.objectid = BTRFS_TREE_LOG_OBJECTID; key.offset = (u64)-1; - btrfs_set_key_type(&key, BTRFS_ROOT_ITEM_KEY); + key.type = BTRFS_ROOT_ITEM_KEY; while (1) { ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index e2e798ae7cd7..154990c26dcb 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -28,6 +28,7 @@ struct btrfs_log_ctx { int log_ret; int log_transid; + int io_err; struct list_head list; }; @@ -35,6 +36,7 @@ static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx) { ctx->log_ret = 0; ctx->log_transid = 0; + ctx->io_err = 0; INIT_LIST_HEAD(&ctx->list); } diff --git a/fs/btrfs/uuid-tree.c b/fs/btrfs/uuid-tree.c index f6a4c03ee7d8..778282944530 100644 --- a/fs/btrfs/uuid-tree.c +++ b/fs/btrfs/uuid-tree.c @@ -279,7 +279,6 @@ int btrfs_uuid_tree_iterate(struct btrfs_fs_info *fs_info, key.offset = 0; again_search_slot: - path->keep_locks = 1; ret = btrfs_search_forward(root, &key, path, 0); if (ret) { if (ret > 0) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 2c2d6d1d8eee..d47289c715c8 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -50,7 +50,7 @@ static void __btrfs_reset_dev_stats(struct btrfs_device *dev); static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev); static void btrfs_dev_stat_print_on_load(struct btrfs_device *device); -static DEFINE_MUTEX(uuid_mutex); +DEFINE_MUTEX(uuid_mutex); static LIST_HEAD(fs_uuids); static void lock_chunks(struct btrfs_root *root) @@ -74,6 +74,7 @@ static struct btrfs_fs_devices *__alloc_fs_devices(void) mutex_init(&fs_devs->device_list_mutex); INIT_LIST_HEAD(&fs_devs->devices); + INIT_LIST_HEAD(&fs_devs->resized_devices); INIT_LIST_HEAD(&fs_devs->alloc_list); INIT_LIST_HEAD(&fs_devs->list); @@ -154,11 +155,13 @@ static struct btrfs_device *__alloc_device(void) INIT_LIST_HEAD(&dev->dev_list); INIT_LIST_HEAD(&dev->dev_alloc_list); + INIT_LIST_HEAD(&dev->resized_list); spin_lock_init(&dev->io_lock); spin_lock_init(&dev->reada_lock); atomic_set(&dev->reada_in_flight, 0); + atomic_set(&dev->dev_stats_ccnt, 0); INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_WAIT); INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_WAIT); @@ -474,14 +477,13 @@ static noinline int device_list_add(const char *path, return PTR_ERR(fs_devices); list_add(&fs_devices->list, &fs_uuids); - fs_devices->latest_devid = devid; - fs_devices->latest_trans = found_transid; device = NULL; } else { device = __find_device(&fs_devices->devices, devid, disk_super->dev_item.uuid); } + if (!device) { if (fs_devices->opened) return -EBUSY; @@ -565,10 +567,6 @@ static noinline int device_list_add(const char *path, if (!fs_devices->opened) device->generation = found_transid; - if (found_transid > fs_devices->latest_trans) { - fs_devices->latest_devid = devid; - fs_devices->latest_trans = found_transid; - } *fs_devices_ret = fs_devices; return ret; @@ -584,8 +582,7 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) if (IS_ERR(fs_devices)) return fs_devices; - fs_devices->latest_devid = orig->latest_devid; - fs_devices->latest_trans = orig->latest_trans; + mutex_lock(&orig->device_list_mutex); fs_devices->total_devices = orig->total_devices; /* We have held the volume lock, it is safe to get the devices. */ @@ -614,8 +611,10 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) device->fs_devices = fs_devices; fs_devices->num_devices++; } + mutex_unlock(&orig->device_list_mutex); return fs_devices; error: + mutex_unlock(&orig->device_list_mutex); free_fs_devices(fs_devices); return ERR_PTR(-ENOMEM); } @@ -624,10 +623,7 @@ void btrfs_close_extra_devices(struct btrfs_fs_info *fs_info, struct btrfs_fs_devices *fs_devices, int step) { struct btrfs_device *device, *next; - - struct block_device *latest_bdev = NULL; - u64 latest_devid = 0; - u64 latest_transid = 0; + struct btrfs_device *latest_dev = NULL; mutex_lock(&uuid_mutex); again: @@ -635,11 +631,9 @@ again: list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { if (device->in_fs_metadata) { if (!device->is_tgtdev_for_dev_replace && - (!latest_transid || - device->generation > latest_transid)) { - latest_devid = device->devid; - latest_transid = device->generation; - latest_bdev = device->bdev; + (!latest_dev || + device->generation > latest_dev->generation)) { + latest_dev = device; } continue; } @@ -681,9 +675,7 @@ again: goto again; } - fs_devices->latest_bdev = latest_bdev; - fs_devices->latest_devid = latest_devid; - fs_devices->latest_trans = latest_transid; + fs_devices->latest_bdev = latest_dev->bdev; mutex_unlock(&uuid_mutex); } @@ -732,8 +724,6 @@ static int __btrfs_close_devices(struct btrfs_fs_devices *fs_devices) fs_devices->rw_devices--; } - if (device->can_discard) - fs_devices->num_can_discard--; if (device->missing) fs_devices->missing_devices--; @@ -798,11 +788,9 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, struct block_device *bdev; struct list_head *head = &fs_devices->devices; struct btrfs_device *device; - struct block_device *latest_bdev = NULL; + struct btrfs_device *latest_dev = NULL; struct buffer_head *bh; struct btrfs_super_block *disk_super; - u64 latest_devid = 0; - u64 latest_transid = 0; u64 devid; int seeding = 1; int ret = 0; @@ -830,11 +818,9 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, goto error_brelse; device->generation = btrfs_super_generation(disk_super); - if (!latest_transid || device->generation > latest_transid) { - latest_devid = devid; - latest_transid = device->generation; - latest_bdev = bdev; - } + if (!latest_dev || + device->generation > latest_dev->generation) + latest_dev = device; if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) { device->writeable = 0; @@ -844,10 +830,8 @@ static int __btrfs_open_devices(struct btrfs_fs_devices *fs_devices, } q = bdev_get_queue(bdev); - if (blk_queue_discard(q)) { + if (blk_queue_discard(q)) device->can_discard = 1; - fs_devices->num_can_discard++; - } device->bdev = bdev; device->in_fs_metadata = 0; @@ -877,9 +861,7 @@ error_brelse: } fs_devices->seeding = seeding; fs_devices->opened = 1; - fs_devices->latest_bdev = latest_bdev; - fs_devices->latest_devid = latest_devid; - fs_devices->latest_trans = latest_transid; + fs_devices->latest_bdev = latest_dev->bdev; fs_devices->total_rw_bytes = 0; out: return ret; @@ -1053,7 +1035,7 @@ int btrfs_account_dev_extents_size(struct btrfs_device *device, u64 start, if (key.objectid > device->devid) break; - if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) + if (key.type != BTRFS_DEV_EXTENT_KEY) goto next; dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent); @@ -1205,7 +1187,7 @@ again: if (key.objectid > device->devid) break; - if (btrfs_key_type(&key) != BTRFS_DEV_EXTENT_KEY) + if (key.type != BTRFS_DEV_EXTENT_KEY) goto next; if (key.offset > search_start) { @@ -1284,7 +1266,7 @@ out: static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, struct btrfs_device *device, - u64 start) + u64 start, u64 *dev_extent_len) { int ret; struct btrfs_path *path; @@ -1326,13 +1308,8 @@ again: goto out; } - if (device->bytes_used > 0) { - u64 len = btrfs_dev_extent_length(leaf, extent); - device->bytes_used -= len; - spin_lock(&root->fs_info->free_chunk_lock); - root->fs_info->free_chunk_space += len; - spin_unlock(&root->fs_info->free_chunk_lock); - } + *dev_extent_len = btrfs_dev_extent_length(leaf, extent); + ret = btrfs_del_item(trans, root, path); if (ret) { btrfs_error(root->fs_info, ret, @@ -1482,8 +1459,10 @@ static int btrfs_add_device(struct btrfs_trans_handle *trans, btrfs_set_device_io_align(leaf, dev_item, device->io_align); btrfs_set_device_io_width(leaf, dev_item, device->io_width); btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); - btrfs_set_device_total_bytes(leaf, dev_item, device->disk_total_bytes); - btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used); + btrfs_set_device_total_bytes(leaf, dev_item, + btrfs_device_get_disk_total_bytes(device)); + btrfs_set_device_bytes_used(leaf, dev_item, + btrfs_device_get_bytes_used(device)); btrfs_set_device_group(leaf, dev_item, 0); btrfs_set_device_seek_speed(leaf, dev_item, 0); btrfs_set_device_bandwidth(leaf, dev_item, 0); @@ -1539,7 +1518,6 @@ static int btrfs_rm_dev_item(struct btrfs_root *root, key.objectid = BTRFS_DEV_ITEMS_OBJECTID; key.type = BTRFS_DEV_ITEM_KEY; key.offset = device->devid; - lock_chunks(root); ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret < 0) @@ -1555,7 +1533,6 @@ static int btrfs_rm_dev_item(struct btrfs_root *root, goto out; out: btrfs_free_path(path); - unlock_chunks(root); btrfs_commit_transaction(trans, root); return ret; } @@ -1671,8 +1648,8 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) if (device->writeable) { lock_chunks(root); list_del_init(&device->dev_alloc_list); + device->fs_devices->rw_devices--; unlock_chunks(root); - root->fs_info->fs_devices->rw_devices--; clear_super = true; } @@ -1691,11 +1668,6 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) if (ret) goto error_undo; - spin_lock(&root->fs_info->free_chunk_lock); - root->fs_info->free_chunk_space = device->total_bytes - - device->bytes_used; - spin_unlock(&root->fs_info->free_chunk_lock); - device->in_fs_metadata = 0; btrfs_scrub_cancel_dev(root->fs_info, device); @@ -1749,9 +1721,7 @@ int btrfs_rm_device(struct btrfs_root *root, char *device_path) fs_devices = fs_devices->seed; } cur_devices->seed = NULL; - lock_chunks(root); __btrfs_close_devices(cur_devices); - unlock_chunks(root); free_fs_devices(cur_devices); } @@ -1824,8 +1794,8 @@ error_undo: lock_chunks(root); list_add(&device->dev_alloc_list, &root->fs_info->fs_devices->alloc_list); + device->fs_devices->rw_devices++; unlock_chunks(root); - root->fs_info->fs_devices->rw_devices++; } goto error_brelse; } @@ -1833,29 +1803,57 @@ error_undo: void btrfs_rm_dev_replace_srcdev(struct btrfs_fs_info *fs_info, struct btrfs_device *srcdev) { + struct btrfs_fs_devices *fs_devices; + WARN_ON(!mutex_is_locked(&fs_info->fs_devices->device_list_mutex)); + /* + * in case of fs with no seed, srcdev->fs_devices will point + * to fs_devices of fs_info. However when the dev being replaced is + * a seed dev it will point to the seed's local fs_devices. In short + * srcdev will have its correct fs_devices in both the cases. + */ + fs_devices = srcdev->fs_devices; + list_del_rcu(&srcdev->dev_list); list_del_rcu(&srcdev->dev_alloc_list); - fs_info->fs_devices->num_devices--; - if (srcdev->missing) { - fs_info->fs_devices->missing_devices--; - fs_info->fs_devices->rw_devices++; - } - if (srcdev->can_discard) - fs_info->fs_devices->num_can_discard--; - if (srcdev->bdev) { - fs_info->fs_devices->open_devices--; + fs_devices->num_devices--; + if (srcdev->missing) + fs_devices->missing_devices--; - /* - * zero out the old super if it is not writable - * (e.g. seed device) - */ - if (srcdev->writeable) - btrfs_scratch_superblock(srcdev); + if (srcdev->writeable) { + fs_devices->rw_devices--; + /* zero out the old super if it is writable */ + btrfs_scratch_superblock(srcdev); } + if (srcdev->bdev) + fs_devices->open_devices--; + call_rcu(&srcdev->rcu, free_device); + + /* + * unless fs_devices is seed fs, num_devices shouldn't go + * zero + */ + BUG_ON(!fs_devices->num_devices && !fs_devices->seeding); + + /* if this is no devs we rather delete the fs_devices */ + if (!fs_devices->num_devices) { + struct btrfs_fs_devices *tmp_fs_devices; + + tmp_fs_devices = fs_info->fs_devices; + while (tmp_fs_devices) { + if (tmp_fs_devices->seed == fs_devices) { + tmp_fs_devices->seed = fs_devices->seed; + break; + } + tmp_fs_devices = tmp_fs_devices->seed; + } + fs_devices->seed = NULL; + __btrfs_close_devices(fs_devices); + free_fs_devices(fs_devices); + } } void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, @@ -1863,6 +1861,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, { struct btrfs_device *next_device; + mutex_lock(&uuid_mutex); WARN_ON(!tgtdev); mutex_lock(&fs_info->fs_devices->device_list_mutex); if (tgtdev->bdev) { @@ -1870,8 +1869,6 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, fs_info->fs_devices->open_devices--; } fs_info->fs_devices->num_devices--; - if (tgtdev->can_discard) - fs_info->fs_devices->num_can_discard++; next_device = list_entry(fs_info->fs_devices->devices.next, struct btrfs_device, dev_list); @@ -1884,6 +1881,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, call_rcu(&tgtdev->rcu, free_device); mutex_unlock(&fs_info->fs_devices->device_list_mutex); + mutex_unlock(&uuid_mutex); } static int btrfs_find_device_by_path(struct btrfs_root *root, char *device_path, @@ -1982,17 +1980,17 @@ static int btrfs_prepare_sprout(struct btrfs_root *root) mutex_lock(&root->fs_info->fs_devices->device_list_mutex); list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices, synchronize_rcu); + list_for_each_entry(device, &seed_devices->devices, dev_list) + device->fs_devices = seed_devices; + lock_chunks(root); list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list); - list_for_each_entry(device, &seed_devices->devices, dev_list) { - device->fs_devices = seed_devices; - } + unlock_chunks(root); fs_devices->seeding = 0; fs_devices->num_devices = 0; fs_devices->open_devices = 0; fs_devices->missing_devices = 0; - fs_devices->num_can_discard = 0; fs_devices->rotating = 0; fs_devices->seed = seed_devices; @@ -2092,7 +2090,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) struct list_head *devices; struct super_block *sb = root->fs_info->sb; struct rcu_string *name; - u64 total_bytes; + u64 tmp; int seeding_dev = 0; int ret = 0; @@ -2148,8 +2146,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) goto error; } - lock_chunks(root); - q = bdev_get_queue(bdev); if (blk_queue_discard(q)) device->can_discard = 1; @@ -2160,6 +2156,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) device->sector_size = root->sectorsize; device->total_bytes = i_size_read(bdev->bd_inode); device->disk_total_bytes = device->total_bytes; + device->commit_total_bytes = device->total_bytes; device->dev_root = root->fs_info->dev_root; device->bdev = bdev; device->in_fs_metadata = 1; @@ -2177,6 +2174,7 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) device->fs_devices = root->fs_info->fs_devices; mutex_lock(&root->fs_info->fs_devices->device_list_mutex); + lock_chunks(root); list_add_rcu(&device->dev_list, &root->fs_info->fs_devices->devices); list_add(&device->dev_alloc_list, &root->fs_info->fs_devices->alloc_list); @@ -2184,8 +2182,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) root->fs_info->fs_devices->open_devices++; root->fs_info->fs_devices->rw_devices++; root->fs_info->fs_devices->total_devices++; - if (device->can_discard) - root->fs_info->fs_devices->num_can_discard++; root->fs_info->fs_devices->total_rw_bytes += device->total_bytes; spin_lock(&root->fs_info->free_chunk_lock); @@ -2195,26 +2191,45 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) if (!blk_queue_nonrot(bdev_get_queue(bdev))) root->fs_info->fs_devices->rotating = 1; - total_bytes = btrfs_super_total_bytes(root->fs_info->super_copy); + tmp = btrfs_super_total_bytes(root->fs_info->super_copy); btrfs_set_super_total_bytes(root->fs_info->super_copy, - total_bytes + device->total_bytes); + tmp + device->total_bytes); - total_bytes = btrfs_super_num_devices(root->fs_info->super_copy); + tmp = btrfs_super_num_devices(root->fs_info->super_copy); btrfs_set_super_num_devices(root->fs_info->super_copy, - total_bytes + 1); + tmp + 1); /* add sysfs device entry */ btrfs_kobj_add_device(root->fs_info, device); + /* + * we've got more storage, clear any full flags on the space + * infos + */ + btrfs_clear_space_info_full(root->fs_info); + + unlock_chunks(root); mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); if (seeding_dev) { - char fsid_buf[BTRFS_UUID_UNPARSED_SIZE]; + lock_chunks(root); ret = init_first_rw_device(trans, root, device); + unlock_chunks(root); if (ret) { btrfs_abort_transaction(trans, root, ret); goto error_trans; } + } + + ret = btrfs_add_device(trans, root, device); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto error_trans; + } + + if (seeding_dev) { + char fsid_buf[BTRFS_UUID_UNPARSED_SIZE]; + ret = btrfs_finish_sprout(trans, root); if (ret) { btrfs_abort_transaction(trans, root, ret); @@ -2228,21 +2243,8 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) root->fs_info->fsid); if (kobject_rename(&root->fs_info->super_kobj, fsid_buf)) goto error_trans; - } else { - ret = btrfs_add_device(trans, root, device); - if (ret) { - btrfs_abort_transaction(trans, root, ret); - goto error_trans; - } } - /* - * we've got more storage, clear any full flags on the space - * infos - */ - btrfs_clear_space_info_full(root->fs_info); - - unlock_chunks(root); root->fs_info->num_tolerated_disk_barrier_failures = btrfs_calc_num_tolerated_disk_barrier_failures(root->fs_info); ret = btrfs_commit_transaction(trans, root); @@ -2274,7 +2276,6 @@ int btrfs_init_new_device(struct btrfs_root *root, char *device_path) return ret; error_trans: - unlock_chunks(root); btrfs_end_transaction(trans, root); rcu_string_free(device->name); btrfs_kobj_rm_device(root->fs_info, device); @@ -2289,6 +2290,7 @@ error: } int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path, + struct btrfs_device *srcdev, struct btrfs_device **device_out) { struct request_queue *q; @@ -2301,24 +2303,38 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path, int ret = 0; *device_out = NULL; - if (fs_info->fs_devices->seeding) + if (fs_info->fs_devices->seeding) { + btrfs_err(fs_info, "the filesystem is a seed filesystem!"); return -EINVAL; + } bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL, fs_info->bdev_holder); - if (IS_ERR(bdev)) + if (IS_ERR(bdev)) { + btrfs_err(fs_info, "target device %s is invalid!", device_path); return PTR_ERR(bdev); + } filemap_write_and_wait(bdev->bd_inode->i_mapping); devices = &fs_info->fs_devices->devices; list_for_each_entry(device, devices, dev_list) { if (device->bdev == bdev) { + btrfs_err(fs_info, "target device is in the filesystem!"); ret = -EEXIST; goto error; } } + + if (i_size_read(bdev->bd_inode) < + btrfs_device_get_total_bytes(srcdev)) { + btrfs_err(fs_info, "target device is smaller than source device!"); + ret = -EINVAL; + goto error; + } + + device = btrfs_alloc_device(NULL, &devid, NULL); if (IS_ERR(device)) { ret = PTR_ERR(device); @@ -2342,8 +2358,12 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path, device->io_width = root->sectorsize; device->io_align = root->sectorsize; device->sector_size = root->sectorsize; - device->total_bytes = i_size_read(bdev->bd_inode); - device->disk_total_bytes = device->total_bytes; + device->total_bytes = btrfs_device_get_total_bytes(srcdev); + device->disk_total_bytes = btrfs_device_get_disk_total_bytes(srcdev); + device->bytes_used = btrfs_device_get_bytes_used(srcdev); + ASSERT(list_empty(&srcdev->resized_list)); + device->commit_total_bytes = srcdev->commit_total_bytes; + device->commit_bytes_used = device->bytes_used; device->dev_root = fs_info->dev_root; device->bdev = bdev; device->in_fs_metadata = 1; @@ -2355,8 +2375,6 @@ int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path, list_add(&device->dev_list, &fs_info->fs_devices->devices); fs_info->fs_devices->num_devices++; fs_info->fs_devices->open_devices++; - if (device->can_discard) - fs_info->fs_devices->num_can_discard++; mutex_unlock(&root->fs_info->fs_devices->device_list_mutex); *device_out = device; @@ -2415,8 +2433,10 @@ static noinline int btrfs_update_device(struct btrfs_trans_handle *trans, btrfs_set_device_io_align(leaf, dev_item, device->io_align); btrfs_set_device_io_width(leaf, dev_item, device->io_width); btrfs_set_device_sector_size(leaf, dev_item, device->sector_size); - btrfs_set_device_total_bytes(leaf, dev_item, device->disk_total_bytes); - btrfs_set_device_bytes_used(leaf, dev_item, device->bytes_used); + btrfs_set_device_total_bytes(leaf, dev_item, + btrfs_device_get_disk_total_bytes(device)); + btrfs_set_device_bytes_used(leaf, dev_item, + btrfs_device_get_bytes_used(device)); btrfs_mark_buffer_dirty(leaf); out: @@ -2424,40 +2444,44 @@ out: return ret; } -static int __btrfs_grow_device(struct btrfs_trans_handle *trans, +int btrfs_grow_device(struct btrfs_trans_handle *trans, struct btrfs_device *device, u64 new_size) { struct btrfs_super_block *super_copy = device->dev_root->fs_info->super_copy; - u64 old_total = btrfs_super_total_bytes(super_copy); - u64 diff = new_size - device->total_bytes; + struct btrfs_fs_devices *fs_devices; + u64 old_total; + u64 diff; if (!device->writeable) return -EACCES; + + lock_chunks(device->dev_root); + old_total = btrfs_super_total_bytes(super_copy); + diff = new_size - device->total_bytes; + if (new_size <= device->total_bytes || - device->is_tgtdev_for_dev_replace) + device->is_tgtdev_for_dev_replace) { + unlock_chunks(device->dev_root); return -EINVAL; + } + + fs_devices = device->dev_root->fs_info->fs_devices; btrfs_set_super_total_bytes(super_copy, old_total + diff); device->fs_devices->total_rw_bytes += diff; - device->total_bytes = new_size; - device->disk_total_bytes = new_size; + btrfs_device_set_total_bytes(device, new_size); + btrfs_device_set_disk_total_bytes(device, new_size); btrfs_clear_space_info_full(device->dev_root->fs_info); + if (list_empty(&device->resized_list)) + list_add_tail(&device->resized_list, + &fs_devices->resized_devices); + unlock_chunks(device->dev_root); return btrfs_update_device(trans, device); } -int btrfs_grow_device(struct btrfs_trans_handle *trans, - struct btrfs_device *device, u64 new_size) -{ - int ret; - lock_chunks(device->dev_root); - ret = __btrfs_grow_device(trans, device, new_size); - unlock_chunks(device->dev_root); - return ret; -} - static int btrfs_free_chunk(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 chunk_tree, u64 chunk_objectid, @@ -2509,6 +2533,7 @@ static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64 u32 cur; struct btrfs_key key; + lock_chunks(root); array_size = btrfs_super_sys_array_size(super_copy); ptr = super_copy->sys_chunk_array; @@ -2538,79 +2563,95 @@ static int btrfs_del_sys_chunk(struct btrfs_root *root, u64 chunk_objectid, u64 cur += len; } } + unlock_chunks(root); return ret; } -static int btrfs_relocate_chunk(struct btrfs_root *root, - u64 chunk_tree, u64 chunk_objectid, - u64 chunk_offset) +int btrfs_remove_chunk(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 chunk_offset) { struct extent_map_tree *em_tree; - struct btrfs_root *extent_root; - struct btrfs_trans_handle *trans; struct extent_map *em; + struct btrfs_root *extent_root = root->fs_info->extent_root; struct map_lookup *map; - int ret; - int i; + u64 dev_extent_len = 0; + u64 chunk_objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; + u64 chunk_tree = root->fs_info->chunk_root->objectid; + int i, ret = 0; + /* Just in case */ root = root->fs_info->chunk_root; - extent_root = root->fs_info->extent_root; em_tree = &root->fs_info->mapping_tree.map_tree; - ret = btrfs_can_relocate(extent_root, chunk_offset); - if (ret) - return -ENOSPC; - - /* step one, relocate all the extents inside this chunk */ - ret = btrfs_relocate_block_group(extent_root, chunk_offset); - if (ret) - return ret; - - trans = btrfs_start_transaction(root, 0); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - btrfs_std_error(root->fs_info, ret); - return ret; - } - - lock_chunks(root); - - /* - * step two, delete the device extents and the - * chunk tree entries - */ read_lock(&em_tree->lock); em = lookup_extent_mapping(em_tree, chunk_offset, 1); read_unlock(&em_tree->lock); - BUG_ON(!em || em->start > chunk_offset || - em->start + em->len < chunk_offset); + if (!em || em->start > chunk_offset || + em->start + em->len < chunk_offset) { + /* + * This is a logic error, but we don't want to just rely on the + * user having built with ASSERT enabled, so if ASSERT doens't + * do anything we still error out. + */ + ASSERT(0); + if (em) + free_extent_map(em); + return -EINVAL; + } map = (struct map_lookup *)em->bdev; for (i = 0; i < map->num_stripes; i++) { - ret = btrfs_free_dev_extent(trans, map->stripes[i].dev, - map->stripes[i].physical); - BUG_ON(ret); + struct btrfs_device *device = map->stripes[i].dev; + ret = btrfs_free_dev_extent(trans, device, + map->stripes[i].physical, + &dev_extent_len); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } + + if (device->bytes_used > 0) { + lock_chunks(root); + btrfs_device_set_bytes_used(device, + device->bytes_used - dev_extent_len); + spin_lock(&root->fs_info->free_chunk_lock); + root->fs_info->free_chunk_space += dev_extent_len; + spin_unlock(&root->fs_info->free_chunk_lock); + btrfs_clear_space_info_full(root->fs_info); + unlock_chunks(root); + } if (map->stripes[i].dev) { ret = btrfs_update_device(trans, map->stripes[i].dev); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } } } ret = btrfs_free_chunk(trans, root, chunk_tree, chunk_objectid, chunk_offset); - - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } trace_btrfs_chunk_free(root, map, chunk_offset, em->len); if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) { ret = btrfs_del_sys_chunk(root, chunk_objectid, chunk_offset); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, root, ret); + goto out; + } } ret = btrfs_remove_block_group(trans, extent_root, chunk_offset); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, extent_root, ret); + goto out; + } write_lock(&em_tree->lock); remove_extent_mapping(em_tree, em); @@ -2618,12 +2659,46 @@ static int btrfs_relocate_chunk(struct btrfs_root *root, /* once for the tree */ free_extent_map(em); +out: /* once for us */ free_extent_map(em); + return ret; +} - unlock_chunks(root); +static int btrfs_relocate_chunk(struct btrfs_root *root, + u64 chunk_tree, u64 chunk_objectid, + u64 chunk_offset) +{ + struct btrfs_root *extent_root; + struct btrfs_trans_handle *trans; + int ret; + + root = root->fs_info->chunk_root; + extent_root = root->fs_info->extent_root; + + ret = btrfs_can_relocate(extent_root, chunk_offset); + if (ret) + return -ENOSPC; + + /* step one, relocate all the extents inside this chunk */ + ret = btrfs_relocate_block_group(extent_root, chunk_offset); + if (ret) + return ret; + + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); + btrfs_std_error(root->fs_info, ret); + return ret; + } + + /* + * step two, delete the device extents and the + * chunk tree entries + */ + ret = btrfs_remove_chunk(trans, root, chunk_offset); btrfs_end_transaction(trans, root); - return 0; + return ret; } static int btrfs_relocate_sys_chunks(struct btrfs_root *root) @@ -2676,8 +2751,8 @@ again: found_key.offset); if (ret == -ENOSPC) failed++; - else if (ret) - BUG(); + else + BUG_ON(ret); } if (found_key.offset == 0) @@ -3084,11 +3159,12 @@ static int __btrfs_balance(struct btrfs_fs_info *fs_info) /* step one make some room on all the devices */ devices = &fs_info->fs_devices->devices; list_for_each_entry(device, devices, dev_list) { - old_size = device->total_bytes; + old_size = btrfs_device_get_total_bytes(device); size_to_free = div_factor(old_size, 1); size_to_free = min(size_to_free, (u64)1 * 1024 * 1024); if (!device->writeable || - device->total_bytes - device->bytes_used > size_to_free || + btrfs_device_get_total_bytes(device) - + btrfs_device_get_bytes_used(device) > size_to_free || device->is_tgtdev_for_dev_replace) continue; @@ -3643,8 +3719,6 @@ static int btrfs_uuid_scan_kthread(void *data) max_key.type = BTRFS_ROOT_ITEM_KEY; max_key.offset = (u64)-1; - path->keep_locks = 1; - while (1) { ret = btrfs_search_forward(root, &key, path, 0); if (ret) { @@ -3896,8 +3970,8 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) struct btrfs_key key; struct btrfs_super_block *super_copy = root->fs_info->super_copy; u64 old_total = btrfs_super_total_bytes(super_copy); - u64 old_size = device->total_bytes; - u64 diff = device->total_bytes - new_size; + u64 old_size = btrfs_device_get_total_bytes(device); + u64 diff = old_size - new_size; if (device->is_tgtdev_for_dev_replace) return -EINVAL; @@ -3910,7 +3984,7 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) lock_chunks(root); - device->total_bytes = new_size; + btrfs_device_set_total_bytes(device, new_size); if (device->writeable) { device->fs_devices->total_rw_bytes -= diff; spin_lock(&root->fs_info->free_chunk_lock); @@ -3976,7 +4050,7 @@ again: ret = -ENOSPC; lock_chunks(root); - device->total_bytes = old_size; + btrfs_device_set_total_bytes(device, old_size); if (device->writeable) device->fs_devices->total_rw_bytes += diff; spin_lock(&root->fs_info->free_chunk_lock); @@ -3994,18 +4068,17 @@ again: } lock_chunks(root); + btrfs_device_set_disk_total_bytes(device, new_size); + if (list_empty(&device->resized_list)) + list_add_tail(&device->resized_list, + &root->fs_info->fs_devices->resized_devices); - device->disk_total_bytes = new_size; - /* Now btrfs_update_device() will change the on-disk size. */ - ret = btrfs_update_device(trans, device); - if (ret) { - unlock_chunks(root); - btrfs_end_transaction(trans, root); - goto done; - } WARN_ON(diff > old_total); btrfs_set_super_total_bytes(super_copy, old_total - diff); unlock_chunks(root); + + /* Now btrfs_update_device() will change the on-disk size. */ + ret = btrfs_update_device(trans, device); btrfs_end_transaction(trans, root); done: btrfs_free_path(path); @@ -4021,10 +4094,13 @@ static int btrfs_add_system_chunk(struct btrfs_root *root, u32 array_size; u8 *ptr; + lock_chunks(root); array_size = btrfs_super_sys_array_size(super_copy); if (array_size + item_size + sizeof(disk_key) - > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) + > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) { + unlock_chunks(root); return -EFBIG; + } ptr = super_copy->sys_chunk_array + array_size; btrfs_cpu_key_to_disk(&disk_key, key); @@ -4033,6 +4109,8 @@ static int btrfs_add_system_chunk(struct btrfs_root *root, memcpy(ptr, chunk, item_size); item_size += sizeof(disk_key); btrfs_set_super_sys_array_size(super_copy, array_size + item_size); + unlock_chunks(root); + return 0; } @@ -4402,6 +4480,16 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, if (ret) goto error_del_extent; + for (i = 0; i < map->num_stripes; i++) { + num_bytes = map->stripes[i].dev->bytes_used + stripe_size; + btrfs_device_set_bytes_used(map->stripes[i].dev, num_bytes); + } + + spin_lock(&extent_root->fs_info->free_chunk_lock); + extent_root->fs_info->free_chunk_space -= (stripe_size * + map->num_stripes); + spin_unlock(&extent_root->fs_info->free_chunk_lock); + free_extent_map(em); check_raid56_incompat_flag(extent_root->fs_info, type); @@ -4473,7 +4561,6 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, device = map->stripes[i].dev; dev_offset = map->stripes[i].physical; - device->bytes_used += stripe_size; ret = btrfs_update_device(trans, device); if (ret) goto out; @@ -4486,11 +4573,6 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, goto out; } - spin_lock(&extent_root->fs_info->free_chunk_lock); - extent_root->fs_info->free_chunk_space -= (stripe_size * - map->num_stripes); - spin_unlock(&extent_root->fs_info->free_chunk_lock); - stripe = &chunk->stripe; for (i = 0; i < map->num_stripes; i++) { device = map->stripes[i].dev; @@ -4570,16 +4652,25 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans, alloc_profile = btrfs_get_alloc_profile(fs_info->chunk_root, 0); ret = __btrfs_alloc_chunk(trans, extent_root, sys_chunk_offset, alloc_profile); - if (ret) { - btrfs_abort_transaction(trans, root, ret); - goto out; + return ret; +} + +static inline int btrfs_chunk_max_errors(struct map_lookup *map) +{ + int max_errors; + + if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | + BTRFS_BLOCK_GROUP_RAID10 | + BTRFS_BLOCK_GROUP_RAID5 | + BTRFS_BLOCK_GROUP_DUP)) { + max_errors = 1; + } else if (map->type & BTRFS_BLOCK_GROUP_RAID6) { + max_errors = 2; + } else { + max_errors = 0; } - ret = btrfs_add_device(trans, fs_info->chunk_root, device); - if (ret) - btrfs_abort_transaction(trans, root, ret); -out: - return ret; + return max_errors; } int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset) @@ -4588,6 +4679,7 @@ int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset) struct map_lookup *map; struct btrfs_mapping_tree *map_tree = &root->fs_info->mapping_tree; int readonly = 0; + int miss_ndevs = 0; int i; read_lock(&map_tree->map_tree.lock); @@ -4596,18 +4688,27 @@ int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset) if (!em) return 1; - if (btrfs_test_opt(root, DEGRADED)) { - free_extent_map(em); - return 0; - } - map = (struct map_lookup *)em->bdev; for (i = 0; i < map->num_stripes; i++) { + if (map->stripes[i].dev->missing) { + miss_ndevs++; + continue; + } + if (!map->stripes[i].dev->writeable) { readonly = 1; - break; + goto end; } } + + /* + * If the number of missing devices is larger than max errors, + * we can not write the data into that chunk successfully, so + * set it readonly. + */ + if (miss_ndevs > btrfs_chunk_max_errors(map)) + readonly = 1; +end: free_extent_map(em); return readonly; } @@ -5008,6 +5109,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, num_stripes = min_t(u64, map->num_stripes, stripe_nr_end - stripe_nr_orig); stripe_index = do_div(stripe_nr, map->num_stripes); + if (!(rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS))) + mirror_num = 1; } else if (map->type & BTRFS_BLOCK_GROUP_RAID1) { if (rw & (REQ_WRITE | REQ_DISCARD | REQ_GET_READ_MIRRORS)) num_stripes = map->num_stripes; @@ -5111,6 +5214,9 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, /* We distribute the parity blocks across stripes */ tmp = stripe_nr + stripe_index; stripe_index = do_div(tmp, map->num_stripes); + if (!(rw & (REQ_WRITE | REQ_DISCARD | + REQ_GET_READ_MIRRORS)) && mirror_num <= 1) + mirror_num = 1; } } else { /* @@ -5218,16 +5324,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, } } - if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) { - if (map->type & (BTRFS_BLOCK_GROUP_RAID1 | - BTRFS_BLOCK_GROUP_RAID10 | - BTRFS_BLOCK_GROUP_RAID5 | - BTRFS_BLOCK_GROUP_DUP)) { - max_errors = 1; - } else if (map->type & BTRFS_BLOCK_GROUP_RAID6) { - max_errors = 2; - } - } + if (rw & (REQ_WRITE | REQ_GET_READ_MIRRORS)) + max_errors = btrfs_chunk_max_errors(map); if (dev_replace_is_ongoing && (rw & (REQ_WRITE | REQ_DISCARD)) && dev_replace->tgtdev != NULL) { @@ -5610,8 +5708,8 @@ static void submit_stripe_bio(struct btrfs_root *root, struct btrfs_bio *bbio, name = rcu_dereference(dev->name); pr_debug("btrfs_map_bio: rw %d, sector=%llu, dev=%lu " "(%s id %llu), size=%u\n", rw, - (u64)bio->bi_sector, (u_long)dev->bdev->bd_dev, - name->str, dev->devid, bio->bi_size); + (u64)bio->bi_iter.bi_sector, (u_long)dev->bdev->bd_dev, + name->str, dev->devid, bio->bi_iter.bi_size); rcu_read_unlock(); } #endif @@ -5789,10 +5887,10 @@ struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid, } static struct btrfs_device *add_missing_dev(struct btrfs_root *root, + struct btrfs_fs_devices *fs_devices, u64 devid, u8 *dev_uuid) { struct btrfs_device *device; - struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; device = btrfs_alloc_device(NULL, &devid, dev_uuid); if (IS_ERR(device)) @@ -5929,7 +6027,8 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, } if (!map->stripes[i].dev) { map->stripes[i].dev = - add_missing_dev(root, devid, uuid); + add_missing_dev(root, root->fs_info->fs_devices, + devid, uuid); if (!map->stripes[i].dev) { free_extent_map(em); return -EIO; @@ -5956,7 +6055,9 @@ static void fill_device_from_item(struct extent_buffer *leaf, device->devid = btrfs_device_id(leaf, dev_item); device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item); device->total_bytes = device->disk_total_bytes; + device->commit_total_bytes = device->disk_total_bytes; device->bytes_used = btrfs_device_bytes_used(leaf, dev_item); + device->commit_bytes_used = device->bytes_used; device->type = btrfs_device_type(leaf, dev_item); device->io_align = btrfs_device_io_align(leaf, dev_item); device->io_width = btrfs_device_io_width(leaf, dev_item); @@ -5968,7 +6069,8 @@ static void fill_device_from_item(struct extent_buffer *leaf, read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE); } -static int open_seed_devices(struct btrfs_root *root, u8 *fsid) +static struct btrfs_fs_devices *open_seed_devices(struct btrfs_root *root, + u8 *fsid) { struct btrfs_fs_devices *fs_devices; int ret; @@ -5977,49 +6079,56 @@ static int open_seed_devices(struct btrfs_root *root, u8 *fsid) fs_devices = root->fs_info->fs_devices->seed; while (fs_devices) { - if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE)) { - ret = 0; - goto out; - } + if (!memcmp(fs_devices->fsid, fsid, BTRFS_UUID_SIZE)) + return fs_devices; + fs_devices = fs_devices->seed; } fs_devices = find_fsid(fsid); if (!fs_devices) { - ret = -ENOENT; - goto out; + if (!btrfs_test_opt(root, DEGRADED)) + return ERR_PTR(-ENOENT); + + fs_devices = alloc_fs_devices(fsid); + if (IS_ERR(fs_devices)) + return fs_devices; + + fs_devices->seeding = 1; + fs_devices->opened = 1; + return fs_devices; } fs_devices = clone_fs_devices(fs_devices); - if (IS_ERR(fs_devices)) { - ret = PTR_ERR(fs_devices); - goto out; - } + if (IS_ERR(fs_devices)) + return fs_devices; ret = __btrfs_open_devices(fs_devices, FMODE_READ, root->fs_info->bdev_holder); if (ret) { free_fs_devices(fs_devices); + fs_devices = ERR_PTR(ret); goto out; } if (!fs_devices->seeding) { __btrfs_close_devices(fs_devices); free_fs_devices(fs_devices); - ret = -EINVAL; + fs_devices = ERR_PTR(-EINVAL); goto out; } fs_devices->seed = root->fs_info->fs_devices->seed; root->fs_info->fs_devices->seed = fs_devices; out: - return ret; + return fs_devices; } static int read_one_dev(struct btrfs_root *root, struct extent_buffer *leaf, struct btrfs_dev_item *dev_item) { + struct btrfs_fs_devices *fs_devices = root->fs_info->fs_devices; struct btrfs_device *device; u64 devid; int ret; @@ -6033,31 +6142,48 @@ static int read_one_dev(struct btrfs_root *root, BTRFS_UUID_SIZE); if (memcmp(fs_uuid, root->fs_info->fsid, BTRFS_UUID_SIZE)) { - ret = open_seed_devices(root, fs_uuid); - if (ret && !btrfs_test_opt(root, DEGRADED)) - return ret; + fs_devices = open_seed_devices(root, fs_uuid); + if (IS_ERR(fs_devices)) + return PTR_ERR(fs_devices); } device = btrfs_find_device(root->fs_info, devid, dev_uuid, fs_uuid); - if (!device || !device->bdev) { + if (!device) { if (!btrfs_test_opt(root, DEGRADED)) return -EIO; - if (!device) { - btrfs_warn(root->fs_info, "devid %llu missing", devid); - device = add_missing_dev(root, devid, dev_uuid); - if (!device) - return -ENOMEM; - } else if (!device->missing) { + btrfs_warn(root->fs_info, "devid %llu missing", devid); + device = add_missing_dev(root, fs_devices, devid, dev_uuid); + if (!device) + return -ENOMEM; + } else { + if (!device->bdev && !btrfs_test_opt(root, DEGRADED)) + return -EIO; + + if(!device->bdev && !device->missing) { /* * this happens when a device that was properly setup * in the device info lists suddenly goes bad. * device->bdev is NULL, and so we have to set * device->missing to one here */ - root->fs_info->fs_devices->missing_devices++; + device->fs_devices->missing_devices++; device->missing = 1; } + + /* Move the device to its own fs_devices */ + if (device->fs_devices != fs_devices) { + ASSERT(device->missing); + + list_move(&device->dev_list, &fs_devices->devices); + device->fs_devices->num_devices--; + fs_devices->num_devices++; + + device->fs_devices->missing_devices--; + fs_devices->missing_devices++; + + device->fs_devices = fs_devices; + } } if (device->fs_devices != root->fs_info->fs_devices) { @@ -6373,16 +6499,18 @@ int btrfs_run_dev_stats(struct btrfs_trans_handle *trans, struct btrfs_root *dev_root = fs_info->dev_root; struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; struct btrfs_device *device; + int stats_cnt; int ret = 0; mutex_lock(&fs_devices->device_list_mutex); list_for_each_entry(device, &fs_devices->devices, dev_list) { - if (!device->dev_stats_valid || !device->dev_stats_dirty) + if (!device->dev_stats_valid || !btrfs_dev_stats_dirty(device)) continue; + stats_cnt = atomic_read(&device->dev_stats_ccnt); ret = update_dev_stat_item(trans, dev_root, device); if (!ret) - device->dev_stats_dirty = 0; + atomic_sub(stats_cnt, &device->dev_stats_ccnt); } mutex_unlock(&fs_devices->device_list_mutex); @@ -6481,3 +6609,51 @@ int btrfs_scratch_superblock(struct btrfs_device *device) return 0; } + +/* + * Update the size of all devices, which is used for writing out the + * super blocks. + */ +void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info) +{ + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_device *curr, *next; + + if (list_empty(&fs_devices->resized_devices)) + return; + + mutex_lock(&fs_devices->device_list_mutex); + lock_chunks(fs_info->dev_root); + list_for_each_entry_safe(curr, next, &fs_devices->resized_devices, + resized_list) { + list_del_init(&curr->resized_list); + curr->commit_total_bytes = curr->disk_total_bytes; + } + unlock_chunks(fs_info->dev_root); + mutex_unlock(&fs_devices->device_list_mutex); +} + +/* Must be invoked during the transaction commit */ +void btrfs_update_commit_device_bytes_used(struct btrfs_root *root, + struct btrfs_transaction *transaction) +{ + struct extent_map *em; + struct map_lookup *map; + struct btrfs_device *dev; + int i; + + if (list_empty(&transaction->pending_chunks)) + return; + + /* In order to kick the device replace finish process */ + lock_chunks(root); + list_for_each_entry(em, &transaction->pending_chunks, list) { + map = (struct map_lookup *)em->bdev; + + for (i = 0; i < map->num_stripes; i++) { + dev = map->stripes[i].dev; + dev->commit_bytes_used = dev->bytes_used; + } + } + unlock_chunks(root); +} diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 2aaa00c47816..08980fa23039 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -24,6 +24,8 @@ #include <linux/btrfs.h> #include "async-thread.h" +extern struct mutex uuid_mutex; + #define BTRFS_STRIPE_LEN (64 * 1024) struct buffer_head; @@ -32,41 +34,59 @@ struct btrfs_pending_bios { struct bio *tail; }; +/* + * Use sequence counter to get consistent device stat data on + * 32-bit processors. + */ +#if BITS_PER_LONG==32 && defined(CONFIG_SMP) +#include <linux/seqlock.h> +#define __BTRFS_NEED_DEVICE_DATA_ORDERED +#define btrfs_device_data_ordered_init(device) \ + seqcount_init(&device->data_seqcount) +#else +#define btrfs_device_data_ordered_init(device) do { } while (0) +#endif + struct btrfs_device { struct list_head dev_list; struct list_head dev_alloc_list; struct btrfs_fs_devices *fs_devices; + struct btrfs_root *dev_root; + struct rcu_string *name; + + u64 generation; + + spinlock_t io_lock ____cacheline_aligned; + int running_pending; /* regular prio bios */ struct btrfs_pending_bios pending_bios; /* WRITE_SYNC bios */ struct btrfs_pending_bios pending_sync_bios; - u64 generation; - int running_pending; + struct block_device *bdev; + + /* the mode sent to blkdev_get */ + fmode_t mode; + int writeable; int in_fs_metadata; int missing; int can_discard; int is_tgtdev_for_dev_replace; - spinlock_t io_lock; - /* the mode sent to blkdev_get */ - fmode_t mode; - - struct block_device *bdev; - - - struct rcu_string *name; +#ifdef __BTRFS_NEED_DEVICE_DATA_ORDERED + seqcount_t data_seqcount; +#endif /* the internal btrfs device id */ u64 devid; - /* size of the device */ + /* size of the device in memory */ u64 total_bytes; - /* size of the disk */ + /* size of the device on disk */ u64 disk_total_bytes; /* bytes used */ @@ -83,10 +103,26 @@ struct btrfs_device { /* minimal io size for this device */ u32 sector_size; - /* physical drive uuid (or lvm uuid) */ u8 uuid[BTRFS_UUID_SIZE]; + /* + * size of the device on the current transaction + * + * This variant is update when committing the transaction, + * and protected by device_list_mutex + */ + u64 commit_total_bytes; + + /* bytes used on the current transaction */ + u64 commit_bytes_used; + /* + * used to manage the device which is resized + * + * It is protected by chunk_lock. + */ + struct list_head resized_list; + /* for sending down flush barriers */ int nobarriers; struct bio *flush_bio; @@ -107,26 +143,90 @@ struct btrfs_device { struct radix_tree_root reada_zones; struct radix_tree_root reada_extents; - /* disk I/O failure stats. For detailed description refer to * enum btrfs_dev_stat_values in ioctl.h */ int dev_stats_valid; - int dev_stats_dirty; /* counters need to be written to disk */ + + /* Counter to record the change of device stats */ + atomic_t dev_stats_ccnt; atomic_t dev_stat_values[BTRFS_DEV_STAT_VALUES_MAX]; }; +/* + * If we read those variants at the context of their own lock, we needn't + * use the following helpers, reading them directly is safe. + */ +#if BITS_PER_LONG==32 && defined(CONFIG_SMP) +#define BTRFS_DEVICE_GETSET_FUNCS(name) \ +static inline u64 \ +btrfs_device_get_##name(const struct btrfs_device *dev) \ +{ \ + u64 size; \ + unsigned int seq; \ + \ + do { \ + seq = read_seqcount_begin(&dev->data_seqcount); \ + size = dev->name; \ + } while (read_seqcount_retry(&dev->data_seqcount, seq)); \ + return size; \ +} \ + \ +static inline void \ +btrfs_device_set_##name(struct btrfs_device *dev, u64 size) \ +{ \ + preempt_disable(); \ + write_seqcount_begin(&dev->data_seqcount); \ + dev->name = size; \ + write_seqcount_end(&dev->data_seqcount); \ + preempt_enable(); \ +} +#elif BITS_PER_LONG==32 && defined(CONFIG_PREEMPT) +#define BTRFS_DEVICE_GETSET_FUNCS(name) \ +static inline u64 \ +btrfs_device_get_##name(const struct btrfs_device *dev) \ +{ \ + u64 size; \ + \ + preempt_disable(); \ + size = dev->name; \ + preempt_enable(); \ + return size; \ +} \ + \ +static inline void \ +btrfs_device_set_##name(struct btrfs_device *dev, u64 size) \ +{ \ + preempt_disable(); \ + dev->name = size; \ + preempt_enable(); \ +} +#else +#define BTRFS_DEVICE_GETSET_FUNCS(name) \ +static inline u64 \ +btrfs_device_get_##name(const struct btrfs_device *dev) \ +{ \ + return dev->name; \ +} \ + \ +static inline void \ +btrfs_device_set_##name(struct btrfs_device *dev, u64 size) \ +{ \ + dev->name = size; \ +} +#endif + +BTRFS_DEVICE_GETSET_FUNCS(total_bytes); +BTRFS_DEVICE_GETSET_FUNCS(disk_total_bytes); +BTRFS_DEVICE_GETSET_FUNCS(bytes_used); + struct btrfs_fs_devices { u8 fsid[BTRFS_FSID_SIZE]; /* FS specific uuid */ - /* the device with this id has the most recent copy of the super */ - u64 latest_devid; - u64 latest_trans; u64 num_devices; u64 open_devices; u64 rw_devices; u64 missing_devices; u64 total_rw_bytes; - u64 num_can_discard; u64 total_devices; struct block_device *latest_bdev; @@ -139,6 +239,7 @@ struct btrfs_fs_devices { struct mutex device_list_mutex; struct list_head devices; + struct list_head resized_devices; /* devices not currently being allocated */ struct list_head alloc_list; struct list_head list; @@ -167,8 +268,9 @@ struct btrfs_fs_devices { */ typedef void (btrfs_io_bio_end_io_t) (struct btrfs_io_bio *bio, int err); struct btrfs_io_bio { - unsigned long mirror_num; - unsigned long stripe_index; + unsigned int mirror_num; + unsigned int stripe_index; + u64 logical; u8 *csum; u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE]; u8 *csum_allocated; @@ -325,6 +427,7 @@ struct btrfs_device *btrfs_find_device(struct btrfs_fs_info *fs_info, u64 devid, int btrfs_shrink_device(struct btrfs_device *device, u64 new_size); int btrfs_init_new_device(struct btrfs_root *root, char *path); int btrfs_init_dev_replace_tgtdev(struct btrfs_root *root, char *device_path, + struct btrfs_device *srcdev, struct btrfs_device **device_out); int btrfs_balance(struct btrfs_balance_control *bctl, struct btrfs_ioctl_balance_args *bargs); @@ -360,11 +463,20 @@ unsigned long btrfs_full_stripe_len(struct btrfs_root *root, int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root, u64 chunk_offset, u64 chunk_size); +int btrfs_remove_chunk(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 chunk_offset); + +static inline int btrfs_dev_stats_dirty(struct btrfs_device *dev) +{ + return atomic_read(&dev->dev_stats_ccnt); +} + static inline void btrfs_dev_stat_inc(struct btrfs_device *dev, int index) { atomic_inc(dev->dev_stat_values + index); - dev->dev_stats_dirty = 1; + smp_mb__before_atomic(); + atomic_inc(&dev->dev_stats_ccnt); } static inline int btrfs_dev_stat_read(struct btrfs_device *dev, @@ -379,7 +491,8 @@ static inline int btrfs_dev_stat_read_and_reset(struct btrfs_device *dev, int ret; ret = atomic_xchg(dev->dev_stat_values + index, 0); - dev->dev_stats_dirty = 1; + smp_mb__before_atomic(); + atomic_inc(&dev->dev_stats_ccnt); return ret; } @@ -387,7 +500,8 @@ static inline void btrfs_dev_stat_set(struct btrfs_device *dev, int index, unsigned long val) { atomic_set(dev->dev_stat_values + index, val); - dev->dev_stats_dirty = 1; + smp_mb__before_atomic(); + atomic_inc(&dev->dev_stats_ccnt); } static inline void btrfs_dev_stat_reset(struct btrfs_device *dev, @@ -395,4 +509,8 @@ static inline void btrfs_dev_stat_reset(struct btrfs_device *dev, { btrfs_dev_stat_set(dev, index, 0); } + +void btrfs_update_commit_device_size(struct btrfs_fs_info *fs_info); +void btrfs_update_commit_device_bytes_used(struct btrfs_root *root, + struct btrfs_transaction *transaction); #endif diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index ad8328d797ea..dcf20131fbe4 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -237,7 +237,7 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) * first xattr that we find and walk forward */ key.objectid = btrfs_ino(inode); - btrfs_set_key_type(&key, BTRFS_XATTR_ITEM_KEY); + key.type = BTRFS_XATTR_ITEM_KEY; key.offset = 0; path = btrfs_alloc_path(); @@ -273,7 +273,7 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size) /* check to make sure this item is what we want */ if (found_key.objectid != key.objectid) break; - if (btrfs_key_type(&found_key) != BTRFS_XATTR_ITEM_KEY) + if (found_key.type != BTRFS_XATTR_ITEM_KEY) break; di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item); diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index b67d8fc81277..759fa4e2de8f 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -33,8 +33,7 @@ #include "compression.h" struct workspace { - z_stream inf_strm; - z_stream def_strm; + z_stream strm; char *buf; struct list_head list; }; @@ -43,8 +42,7 @@ static void zlib_free_workspace(struct list_head *ws) { struct workspace *workspace = list_entry(ws, struct workspace, list); - vfree(workspace->def_strm.workspace); - vfree(workspace->inf_strm.workspace); + vfree(workspace->strm.workspace); kfree(workspace->buf); kfree(workspace); } @@ -52,17 +50,17 @@ static void zlib_free_workspace(struct list_head *ws) static struct list_head *zlib_alloc_workspace(void) { struct workspace *workspace; + int workspacesize; workspace = kzalloc(sizeof(*workspace), GFP_NOFS); if (!workspace) return ERR_PTR(-ENOMEM); - workspace->def_strm.workspace = vmalloc(zlib_deflate_workspacesize( - MAX_WBITS, MAX_MEM_LEVEL)); - workspace->inf_strm.workspace = vmalloc(zlib_inflate_workspacesize()); + workspacesize = max(zlib_deflate_workspacesize(MAX_WBITS, MAX_MEM_LEVEL), + zlib_inflate_workspacesize()); + workspace->strm.workspace = vmalloc(workspacesize); workspace->buf = kmalloc(PAGE_CACHE_SIZE, GFP_NOFS); - if (!workspace->def_strm.workspace || - !workspace->inf_strm.workspace || !workspace->buf) + if (!workspace->strm.workspace || !workspace->buf) goto fail; INIT_LIST_HEAD(&workspace->list); @@ -96,14 +94,14 @@ static int zlib_compress_pages(struct list_head *ws, *total_out = 0; *total_in = 0; - if (Z_OK != zlib_deflateInit(&workspace->def_strm, 3)) { + if (Z_OK != zlib_deflateInit(&workspace->strm, 3)) { printk(KERN_WARNING "BTRFS: deflateInit failed\n"); ret = -EIO; goto out; } - workspace->def_strm.total_in = 0; - workspace->def_strm.total_out = 0; + workspace->strm.total_in = 0; + workspace->strm.total_out = 0; in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT); data_in = kmap(in_page); @@ -117,25 +115,25 @@ static int zlib_compress_pages(struct list_head *ws, pages[0] = out_page; nr_pages = 1; - workspace->def_strm.next_in = data_in; - workspace->def_strm.next_out = cpage_out; - workspace->def_strm.avail_out = PAGE_CACHE_SIZE; - workspace->def_strm.avail_in = min(len, PAGE_CACHE_SIZE); + workspace->strm.next_in = data_in; + workspace->strm.next_out = cpage_out; + workspace->strm.avail_out = PAGE_CACHE_SIZE; + workspace->strm.avail_in = min(len, PAGE_CACHE_SIZE); - while (workspace->def_strm.total_in < len) { - ret = zlib_deflate(&workspace->def_strm, Z_SYNC_FLUSH); + while (workspace->strm.total_in < len) { + ret = zlib_deflate(&workspace->strm, Z_SYNC_FLUSH); if (ret != Z_OK) { printk(KERN_DEBUG "BTRFS: deflate in loop returned %d\n", ret); - zlib_deflateEnd(&workspace->def_strm); + zlib_deflateEnd(&workspace->strm); ret = -EIO; goto out; } /* we're making it bigger, give up */ - if (workspace->def_strm.total_in > 8192 && - workspace->def_strm.total_in < - workspace->def_strm.total_out) { + if (workspace->strm.total_in > 8192 && + workspace->strm.total_in < + workspace->strm.total_out) { ret = -E2BIG; goto out; } @@ -143,7 +141,7 @@ static int zlib_compress_pages(struct list_head *ws, * before the total_in so we will pull in a new page for * the stream end if required */ - if (workspace->def_strm.avail_out == 0) { + if (workspace->strm.avail_out == 0) { kunmap(out_page); if (nr_pages == nr_dest_pages) { out_page = NULL; @@ -158,19 +156,19 @@ static int zlib_compress_pages(struct list_head *ws, cpage_out = kmap(out_page); pages[nr_pages] = out_page; nr_pages++; - workspace->def_strm.avail_out = PAGE_CACHE_SIZE; - workspace->def_strm.next_out = cpage_out; + workspace->strm.avail_out = PAGE_CACHE_SIZE; + workspace->strm.next_out = cpage_out; } /* we're all done */ - if (workspace->def_strm.total_in >= len) + if (workspace->strm.total_in >= len) break; /* we've read in a full page, get a new one */ - if (workspace->def_strm.avail_in == 0) { - if (workspace->def_strm.total_out > max_out) + if (workspace->strm.avail_in == 0) { + if (workspace->strm.total_out > max_out) break; - bytes_left = len - workspace->def_strm.total_in; + bytes_left = len - workspace->strm.total_in; kunmap(in_page); page_cache_release(in_page); @@ -178,28 +176,28 @@ static int zlib_compress_pages(struct list_head *ws, in_page = find_get_page(mapping, start >> PAGE_CACHE_SHIFT); data_in = kmap(in_page); - workspace->def_strm.avail_in = min(bytes_left, + workspace->strm.avail_in = min(bytes_left, PAGE_CACHE_SIZE); - workspace->def_strm.next_in = data_in; + workspace->strm.next_in = data_in; } } - workspace->def_strm.avail_in = 0; - ret = zlib_deflate(&workspace->def_strm, Z_FINISH); - zlib_deflateEnd(&workspace->def_strm); + workspace->strm.avail_in = 0; + ret = zlib_deflate(&workspace->strm, Z_FINISH); + zlib_deflateEnd(&workspace->strm); if (ret != Z_STREAM_END) { ret = -EIO; goto out; } - if (workspace->def_strm.total_out >= workspace->def_strm.total_in) { + if (workspace->strm.total_out >= workspace->strm.total_in) { ret = -E2BIG; goto out; } ret = 0; - *total_out = workspace->def_strm.total_out; - *total_in = workspace->def_strm.total_in; + *total_out = workspace->strm.total_out; + *total_in = workspace->strm.total_in; out: *out_pages = nr_pages; if (out_page) @@ -225,19 +223,18 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in, size_t total_out = 0; unsigned long page_in_index = 0; unsigned long page_out_index = 0; - unsigned long total_pages_in = (srclen + PAGE_CACHE_SIZE - 1) / - PAGE_CACHE_SIZE; + unsigned long total_pages_in = DIV_ROUND_UP(srclen, PAGE_CACHE_SIZE); unsigned long buf_start; unsigned long pg_offset; data_in = kmap(pages_in[page_in_index]); - workspace->inf_strm.next_in = data_in; - workspace->inf_strm.avail_in = min_t(size_t, srclen, PAGE_CACHE_SIZE); - workspace->inf_strm.total_in = 0; + workspace->strm.next_in = data_in; + workspace->strm.avail_in = min_t(size_t, srclen, PAGE_CACHE_SIZE); + workspace->strm.total_in = 0; - workspace->inf_strm.total_out = 0; - workspace->inf_strm.next_out = workspace->buf; - workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; + workspace->strm.total_out = 0; + workspace->strm.next_out = workspace->buf; + workspace->strm.avail_out = PAGE_CACHE_SIZE; pg_offset = 0; /* If it's deflate, and it's got no preset dictionary, then @@ -247,21 +244,21 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in, !(((data_in[0]<<8) + data_in[1]) % 31)) { wbits = -((data_in[0] >> 4) + 8); - workspace->inf_strm.next_in += 2; - workspace->inf_strm.avail_in -= 2; + workspace->strm.next_in += 2; + workspace->strm.avail_in -= 2; } - if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { + if (Z_OK != zlib_inflateInit2(&workspace->strm, wbits)) { printk(KERN_WARNING "BTRFS: inflateInit failed\n"); return -EIO; } - while (workspace->inf_strm.total_in < srclen) { - ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH); + while (workspace->strm.total_in < srclen) { + ret = zlib_inflate(&workspace->strm, Z_NO_FLUSH); if (ret != Z_OK && ret != Z_STREAM_END) break; buf_start = total_out; - total_out = workspace->inf_strm.total_out; + total_out = workspace->strm.total_out; /* we didn't make progress in this inflate call, we're done */ if (buf_start == total_out) @@ -276,10 +273,10 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in, goto done; } - workspace->inf_strm.next_out = workspace->buf; - workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; + workspace->strm.next_out = workspace->buf; + workspace->strm.avail_out = PAGE_CACHE_SIZE; - if (workspace->inf_strm.avail_in == 0) { + if (workspace->strm.avail_in == 0) { unsigned long tmp; kunmap(pages_in[page_in_index]); page_in_index++; @@ -288,9 +285,9 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in, break; } data_in = kmap(pages_in[page_in_index]); - workspace->inf_strm.next_in = data_in; - tmp = srclen - workspace->inf_strm.total_in; - workspace->inf_strm.avail_in = min(tmp, + workspace->strm.next_in = data_in; + tmp = srclen - workspace->strm.total_in; + workspace->strm.avail_in = min(tmp, PAGE_CACHE_SIZE); } } @@ -299,7 +296,7 @@ static int zlib_decompress_biovec(struct list_head *ws, struct page **pages_in, else ret = 0; done: - zlib_inflateEnd(&workspace->inf_strm); + zlib_inflateEnd(&workspace->strm); if (data_in) kunmap(pages_in[page_in_index]); return ret; @@ -317,13 +314,13 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in, unsigned long total_out = 0; char *kaddr; - workspace->inf_strm.next_in = data_in; - workspace->inf_strm.avail_in = srclen; - workspace->inf_strm.total_in = 0; + workspace->strm.next_in = data_in; + workspace->strm.avail_in = srclen; + workspace->strm.total_in = 0; - workspace->inf_strm.next_out = workspace->buf; - workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; - workspace->inf_strm.total_out = 0; + workspace->strm.next_out = workspace->buf; + workspace->strm.avail_out = PAGE_CACHE_SIZE; + workspace->strm.total_out = 0; /* If it's deflate, and it's got no preset dictionary, then we can tell zlib to skip the adler32 check. */ if (srclen > 2 && !(data_in[1] & PRESET_DICT) && @@ -331,11 +328,11 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in, !(((data_in[0]<<8) + data_in[1]) % 31)) { wbits = -((data_in[0] >> 4) + 8); - workspace->inf_strm.next_in += 2; - workspace->inf_strm.avail_in -= 2; + workspace->strm.next_in += 2; + workspace->strm.avail_in -= 2; } - if (Z_OK != zlib_inflateInit2(&workspace->inf_strm, wbits)) { + if (Z_OK != zlib_inflateInit2(&workspace->strm, wbits)) { printk(KERN_WARNING "BTRFS: inflateInit failed\n"); return -EIO; } @@ -346,12 +343,12 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in, unsigned long bytes; unsigned long pg_offset = 0; - ret = zlib_inflate(&workspace->inf_strm, Z_NO_FLUSH); + ret = zlib_inflate(&workspace->strm, Z_NO_FLUSH); if (ret != Z_OK && ret != Z_STREAM_END) break; buf_start = total_out; - total_out = workspace->inf_strm.total_out; + total_out = workspace->strm.total_out; if (total_out == buf_start) { ret = -EIO; @@ -377,8 +374,8 @@ static int zlib_decompress(struct list_head *ws, unsigned char *data_in, pg_offset += bytes; bytes_left -= bytes; next: - workspace->inf_strm.next_out = workspace->buf; - workspace->inf_strm.avail_out = PAGE_CACHE_SIZE; + workspace->strm.next_out = workspace->buf; + workspace->strm.avail_out = PAGE_CACHE_SIZE; } if (ret != Z_STREAM_END && bytes_left != 0) @@ -386,7 +383,7 @@ next: else ret = 0; - zlib_inflateEnd(&workspace->inf_strm); + zlib_inflateEnd(&workspace->strm); return ret; } diff --git a/fs/buffer.c b/fs/buffer.c index 3588a80854b2..6c48f20eddd4 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -993,7 +993,7 @@ init_page_buffers(struct page *page, struct block_device *bdev, */ static int grow_dev_page(struct block_device *bdev, sector_t block, - pgoff_t index, int size, int sizebits) + pgoff_t index, int size, int sizebits, gfp_t gfp) { struct inode *inode = bdev->bd_inode; struct page *page; @@ -1002,8 +1002,8 @@ grow_dev_page(struct block_device *bdev, sector_t block, int ret = 0; /* Will call free_more_memory() */ gfp_t gfp_mask; - gfp_mask = mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS; - gfp_mask |= __GFP_MOVABLE; + gfp_mask = (mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS) | gfp; + /* * XXX: __getblk_slow() can not really deal with failure and * will endlessly loop on improvised global reclaim. Prefer @@ -1060,7 +1060,7 @@ failed: * that page was dirty, the buffers are set dirty also. */ static int -grow_buffers(struct block_device *bdev, sector_t block, int size) +grow_buffers(struct block_device *bdev, sector_t block, int size, gfp_t gfp) { pgoff_t index; int sizebits; @@ -1087,11 +1087,12 @@ grow_buffers(struct block_device *bdev, sector_t block, int size) } /* Create a page with the proper size buffers.. */ - return grow_dev_page(bdev, block, index, size, sizebits); + return grow_dev_page(bdev, block, index, size, sizebits, gfp); } -static struct buffer_head * -__getblk_slow(struct block_device *bdev, sector_t block, int size) +struct buffer_head * +__getblk_slow(struct block_device *bdev, sector_t block, + unsigned size, gfp_t gfp) { /* Size must be multiple of hard sectorsize */ if (unlikely(size & (bdev_logical_block_size(bdev)-1) || @@ -1113,13 +1114,14 @@ __getblk_slow(struct block_device *bdev, sector_t block, int size) if (bh) return bh; - ret = grow_buffers(bdev, block, size); + ret = grow_buffers(bdev, block, size, gfp); if (ret < 0) return NULL; if (ret == 0) free_more_memory(); } } +EXPORT_SYMBOL(__getblk_slow); /* * The relationship between dirty buffers and dirty pages: @@ -1253,7 +1255,7 @@ static struct buffer_head *__bread_slow(struct buffer_head *bh) * a local interrupt disable for that. */ -#define BH_LRU_SIZE 8 +#define BH_LRU_SIZE 16 struct bh_lru { struct buffer_head *bhs[BH_LRU_SIZE]; @@ -1331,8 +1333,8 @@ lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size) for (i = 0; i < BH_LRU_SIZE; i++) { struct buffer_head *bh = __this_cpu_read(bh_lrus.bhs[i]); - if (bh && bh->b_bdev == bdev && - bh->b_blocknr == block && bh->b_size == size) { + if (bh && bh->b_blocknr == block && bh->b_bdev == bdev && + bh->b_size == size) { if (i) { while (i) { __this_cpu_write(bh_lrus.bhs[i], @@ -1373,24 +1375,25 @@ __find_get_block(struct block_device *bdev, sector_t block, unsigned size) EXPORT_SYMBOL(__find_get_block); /* - * __getblk will locate (and, if necessary, create) the buffer_head + * __getblk_gfp() will locate (and, if necessary, create) the buffer_head * which corresponds to the passed block_device, block and size. The * returned buffer has its reference count incremented. * - * __getblk() will lock up the machine if grow_dev_page's try_to_free_buffers() - * attempt is failing. FIXME, perhaps? + * __getblk_gfp() will lock up the machine if grow_dev_page's + * try_to_free_buffers() attempt is failing. FIXME, perhaps? */ struct buffer_head * -__getblk(struct block_device *bdev, sector_t block, unsigned size) +__getblk_gfp(struct block_device *bdev, sector_t block, + unsigned size, gfp_t gfp) { struct buffer_head *bh = __find_get_block(bdev, block, size); might_sleep(); if (bh == NULL) - bh = __getblk_slow(bdev, block, size); + bh = __getblk_slow(bdev, block, size, gfp); return bh; } -EXPORT_SYMBOL(__getblk); +EXPORT_SYMBOL(__getblk_gfp); /* * Do async read-ahead on a buffer.. @@ -1406,24 +1409,28 @@ void __breadahead(struct block_device *bdev, sector_t block, unsigned size) EXPORT_SYMBOL(__breadahead); /** - * __bread() - reads a specified block and returns the bh + * __bread_gfp() - reads a specified block and returns the bh * @bdev: the block_device to read from * @block: number of block * @size: size (in bytes) to read - * + * @gfp: page allocation flag + * * Reads a specified block, and returns buffer head that contains it. + * The page cache can be allocated from non-movable area + * not to prevent page migration if you set gfp to zero. * It returns NULL if the block was unreadable. */ struct buffer_head * -__bread(struct block_device *bdev, sector_t block, unsigned size) +__bread_gfp(struct block_device *bdev, sector_t block, + unsigned size, gfp_t gfp) { - struct buffer_head *bh = __getblk(bdev, block, size); + struct buffer_head *bh = __getblk_gfp(bdev, block, size, gfp); if (likely(bh) && !buffer_uptodate(bh)) bh = __bread_slow(bh); return bh; } -EXPORT_SYMBOL(__bread); +EXPORT_SYMBOL(__bread_gfp); /* * invalidate_bh_lrus() is called rarely - but not only at unmount. @@ -2082,6 +2089,7 @@ int generic_write_end(struct file *file, struct address_space *mapping, struct page *page, void *fsdata) { struct inode *inode = mapping->host; + loff_t old_size = inode->i_size; int i_size_changed = 0; copied = block_write_end(file, mapping, pos, len, copied, page, fsdata); @@ -2101,6 +2109,8 @@ int generic_write_end(struct file *file, struct address_space *mapping, unlock_page(page); page_cache_release(page); + if (old_size < pos) + pagecache_isize_extended(inode, old_size, pos); /* * Don't mark the inode dirty under page lock. First, it unnecessarily * makes the holding time of page lock longer. Second, it forces lock @@ -2318,6 +2328,11 @@ static int cont_expand_zero(struct file *file, struct address_space *mapping, err = 0; balance_dirty_pages_ratelimited(mapping); + + if (unlikely(fatal_signal_pending(current))) { + err = -EINTR; + goto out; + } } /* page covers the boundary, find the boundary offset */ @@ -2956,7 +2971,7 @@ static void end_bio_bh_io_sync(struct bio *bio, int err) /* * This allows us to do IO even on the odd last sectors - * of a device, even if the bh block size is some multiple + * of a device, even if the block size is some multiple * of the physical sector size. * * We'll just truncate the bio to the size of the device, @@ -2966,10 +2981,11 @@ static void end_bio_bh_io_sync(struct bio *bio, int err) * errors, this only handles the "we need to be able to * do IO at the final sector" case. */ -static void guard_bh_eod(int rw, struct bio *bio, struct buffer_head *bh) +void guard_bio_eod(int rw, struct bio *bio) { sector_t maxsector; - unsigned bytes; + struct bio_vec *bvec = &bio->bi_io_vec[bio->bi_vcnt - 1]; + unsigned truncated_bytes; maxsector = i_size_read(bio->bi_bdev->bd_inode) >> 9; if (!maxsector) @@ -2984,23 +3000,20 @@ static void guard_bh_eod(int rw, struct bio *bio, struct buffer_head *bh) return; maxsector -= bio->bi_iter.bi_sector; - bytes = bio->bi_iter.bi_size; - if (likely((bytes >> 9) <= maxsector)) + if (likely((bio->bi_iter.bi_size >> 9) <= maxsector)) return; - /* Uhhuh. We've got a bh that straddles the device size! */ - bytes = maxsector << 9; + /* Uhhuh. We've got a bio that straddles the device size! */ + truncated_bytes = bio->bi_iter.bi_size - (maxsector << 9); /* Truncate the bio.. */ - bio->bi_iter.bi_size = bytes; - bio->bi_io_vec[0].bv_len = bytes; + bio->bi_iter.bi_size -= truncated_bytes; + bvec->bv_len -= truncated_bytes; /* ..and clear the end of the buffer for reads */ if ((rw & RW_MASK) == READ) { - void *kaddr = kmap_atomic(bh->b_page); - memset(kaddr + bh_offset(bh) + bytes, 0, bh->b_size - bytes); - kunmap_atomic(kaddr); - flush_dcache_page(bh->b_page); + zero_user(bvec->bv_page, bvec->bv_offset + bvec->bv_len, + truncated_bytes); } } @@ -3041,7 +3054,7 @@ int _submit_bh(int rw, struct buffer_head *bh, unsigned long bio_flags) bio->bi_flags |= bio_flags; /* Take care of bh's that straddle the end of the device */ - guard_bh_eod(rw, bio, bh); + guard_bio_eod(rw, bio); if (buffer_meta(bh)) rw |= REQ_META; diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c index 584743d456c3..1c7293c3a93a 100644 --- a/fs/cachefiles/interface.c +++ b/fs/cachefiles/interface.c @@ -268,20 +268,27 @@ static void cachefiles_drop_object(struct fscache_object *_object) ASSERT((atomic_read(&object->usage) & 0xffff0000) != 0x6b6b0000); #endif - /* delete retired objects */ - if (test_bit(FSCACHE_OBJECT_RETIRED, &object->fscache.flags) && - _object != cache->cache.fsdef - ) { - _debug("- retire object OBJ%x", object->fscache.debug_id); - cachefiles_begin_secure(cache, &saved_cred); - cachefiles_delete_object(cache, object); - cachefiles_end_secure(cache, saved_cred); - } + /* We need to tidy the object up if we did in fact manage to open it. + * It's possible for us to get here before the object is fully + * initialised if the parent goes away or the object gets retired + * before we set it up. + */ + if (object->dentry) { + /* delete retired objects */ + if (test_bit(FSCACHE_OBJECT_RETIRED, &object->fscache.flags) && + _object != cache->cache.fsdef + ) { + _debug("- retire object OBJ%x", object->fscache.debug_id); + cachefiles_begin_secure(cache, &saved_cred); + cachefiles_delete_object(cache, object); + cachefiles_end_secure(cache, saved_cred); + } - /* close the filesystem stuff attached to the object */ - if (object->backer != object->dentry) - dput(object->backer); - object->backer = NULL; + /* close the filesystem stuff attached to the object */ + if (object->backer != object->dentry) + dput(object->backer); + object->backer = NULL; + } /* note that the object is now inactive */ if (test_bit(CACHEFILES_OBJECT_ACTIVE, &object->flags)) { diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index dad7d9542a24..e12f189d539b 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -189,7 +189,7 @@ try_again: /* an old object from a previous incarnation is hogging the slot - we * need to wait for it to be destroyed */ wait_for_old_object: - if (fscache_object_is_live(&object->fscache)) { + if (fscache_object_is_live(&xobject->fscache)) { pr_err("\n"); pr_err("Error: Unexpected object collision\n"); cachefiles_printk_object(object, xobject); diff --git a/fs/cachefiles/rdwr.c b/fs/cachefiles/rdwr.c index 25e745b8eb1b..616db0e77b44 100644 --- a/fs/cachefiles/rdwr.c +++ b/fs/cachefiles/rdwr.c @@ -880,7 +880,6 @@ int cachefiles_write_page(struct fscache_storage *op, struct page *page) { struct cachefiles_object *object; struct cachefiles_cache *cache; - mm_segment_t old_fs; struct file *file; struct path path; loff_t pos, eof; @@ -914,36 +913,27 @@ int cachefiles_write_page(struct fscache_storage *op, struct page *page) if (IS_ERR(file)) { ret = PTR_ERR(file); } else { - ret = -EIO; - if (file->f_op->write) { - pos = (loff_t) page->index << PAGE_SHIFT; - - /* we mustn't write more data than we have, so we have - * to beware of a partial page at EOF */ - eof = object->fscache.store_limit_l; - len = PAGE_SIZE; - if (eof & ~PAGE_MASK) { - ASSERTCMP(pos, <, eof); - if (eof - pos < PAGE_SIZE) { - _debug("cut short %llx to %llx", - pos, eof); - len = eof - pos; - ASSERTCMP(pos + len, ==, eof); - } + pos = (loff_t) page->index << PAGE_SHIFT; + + /* we mustn't write more data than we have, so we have + * to beware of a partial page at EOF */ + eof = object->fscache.store_limit_l; + len = PAGE_SIZE; + if (eof & ~PAGE_MASK) { + ASSERTCMP(pos, <, eof); + if (eof - pos < PAGE_SIZE) { + _debug("cut short %llx to %llx", + pos, eof); + len = eof - pos; + ASSERTCMP(pos + len, ==, eof); } - - data = kmap(page); - file_start_write(file); - old_fs = get_fs(); - set_fs(KERNEL_DS); - ret = file->f_op->write( - file, (const void __user *) data, len, &pos); - set_fs(old_fs); - kunmap(page); - file_end_write(file); - if (ret != len) - ret = -EIO; } + + data = kmap(page); + ret = __kernel_write(file, data, len, &pos); + kunmap(page); + if (ret != len) + ret = -EIO; fput(file); } diff --git a/fs/ceph/acl.c b/fs/ceph/acl.c index cebf2ebefb55..5bd853ba44ff 100644 --- a/fs/ceph/acl.c +++ b/fs/ceph/acl.c @@ -169,36 +169,109 @@ out: return ret; } -int ceph_init_acl(struct dentry *dentry, struct inode *inode, struct inode *dir) +int ceph_pre_init_acls(struct inode *dir, umode_t *mode, + struct ceph_acls_info *info) { - struct posix_acl *default_acl, *acl; - umode_t new_mode = inode->i_mode; - int error; - - error = posix_acl_create(dir, &new_mode, &default_acl, &acl); - if (error) - return error; - - if (!default_acl && !acl) { - cache_no_acl(inode); - if (new_mode != inode->i_mode) { - struct iattr newattrs = { - .ia_mode = new_mode, - .ia_valid = ATTR_MODE, - }; - error = ceph_setattr(dentry, &newattrs); + struct posix_acl *acl, *default_acl; + size_t val_size1 = 0, val_size2 = 0; + struct ceph_pagelist *pagelist = NULL; + void *tmp_buf = NULL; + int err; + + err = posix_acl_create(dir, mode, &default_acl, &acl); + if (err) + return err; + + if (acl) { + int ret = posix_acl_equiv_mode(acl, mode); + if (ret < 0) + goto out_err; + if (ret == 0) { + posix_acl_release(acl); + acl = NULL; } - return error; } - if (default_acl) { - error = ceph_set_acl(inode, default_acl, ACL_TYPE_DEFAULT); - posix_acl_release(default_acl); - } + if (!default_acl && !acl) + return 0; + + if (acl) + val_size1 = posix_acl_xattr_size(acl->a_count); + if (default_acl) + val_size2 = posix_acl_xattr_size(default_acl->a_count); + + err = -ENOMEM; + tmp_buf = kmalloc(max(val_size1, val_size2), GFP_NOFS); + if (!tmp_buf) + goto out_err; + pagelist = kmalloc(sizeof(struct ceph_pagelist), GFP_NOFS); + if (!pagelist) + goto out_err; + ceph_pagelist_init(pagelist); + + err = ceph_pagelist_reserve(pagelist, PAGE_SIZE); + if (err) + goto out_err; + + ceph_pagelist_encode_32(pagelist, acl && default_acl ? 2 : 1); + if (acl) { - if (!error) - error = ceph_set_acl(inode, acl, ACL_TYPE_ACCESS); - posix_acl_release(acl); + size_t len = strlen(POSIX_ACL_XATTR_ACCESS); + err = ceph_pagelist_reserve(pagelist, len + val_size1 + 8); + if (err) + goto out_err; + ceph_pagelist_encode_string(pagelist, POSIX_ACL_XATTR_ACCESS, + len); + err = posix_acl_to_xattr(&init_user_ns, acl, + tmp_buf, val_size1); + if (err < 0) + goto out_err; + ceph_pagelist_encode_32(pagelist, val_size1); + ceph_pagelist_append(pagelist, tmp_buf, val_size1); } - return error; + if (default_acl) { + size_t len = strlen(POSIX_ACL_XATTR_DEFAULT); + err = ceph_pagelist_reserve(pagelist, len + val_size2 + 8); + if (err) + goto out_err; + err = ceph_pagelist_encode_string(pagelist, + POSIX_ACL_XATTR_DEFAULT, len); + err = posix_acl_to_xattr(&init_user_ns, default_acl, + tmp_buf, val_size2); + if (err < 0) + goto out_err; + ceph_pagelist_encode_32(pagelist, val_size2); + ceph_pagelist_append(pagelist, tmp_buf, val_size2); + } + + kfree(tmp_buf); + + info->acl = acl; + info->default_acl = default_acl; + info->pagelist = pagelist; + return 0; + +out_err: + posix_acl_release(acl); + posix_acl_release(default_acl); + kfree(tmp_buf); + if (pagelist) + ceph_pagelist_release(pagelist); + return err; +} + +void ceph_init_inode_acls(struct inode* inode, struct ceph_acls_info *info) +{ + if (!inode) + return; + ceph_set_cached_acl(inode, ACL_TYPE_ACCESS, info->acl); + ceph_set_cached_acl(inode, ACL_TYPE_DEFAULT, info->default_acl); +} + +void ceph_release_acls_info(struct ceph_acls_info *info) +{ + posix_acl_release(info->acl); + posix_acl_release(info->default_acl); + if (info->pagelist) + ceph_pagelist_release(info->pagelist); } diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 90b3954d48ed..18c06bbaf136 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1076,12 +1076,6 @@ retry_locked: /* past end of file? */ i_size = inode->i_size; /* caller holds i_mutex */ - if (i_size + len > inode->i_sb->s_maxbytes) { - /* file is too big */ - r = -EINVAL; - goto fail; - } - if (page_off >= i_size || (pos_in_page == 0 && (pos+len) >= i_size && end_in_page - pos_in_page != PAGE_CACHE_SIZE)) { @@ -1099,9 +1093,6 @@ retry_locked: if (r < 0) goto fail_nosnap; goto retry_locked; - -fail: - up_read(&mdsc->snap_rwsem); fail_nosnap: unlock_page(page); return r; diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 6d1cd45dca89..659f2ea9e6f7 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2397,12 +2397,12 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc, u64 max_size = le64_to_cpu(grant->max_size); struct timespec mtime, atime, ctime; int check_caps = 0; - bool wake = 0; - bool writeback = 0; - bool queue_trunc = 0; - bool queue_invalidate = 0; - bool queue_revalidate = 0; - bool deleted_inode = 0; + bool wake = false; + bool writeback = false; + bool queue_trunc = false; + bool queue_invalidate = false; + bool queue_revalidate = false; + bool deleted_inode = false; dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n", inode, cap, mds, seq, ceph_cap_string(newcaps)); @@ -2437,7 +2437,7 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc, /* there were locked pages.. invalidate later in a separate thread. */ if (ci->i_rdcache_revoking != ci->i_rdcache_gen) { - queue_invalidate = 1; + queue_invalidate = true; ci->i_rdcache_revoking = ci->i_rdcache_gen; } } @@ -2466,7 +2466,7 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc, set_nlink(inode, le32_to_cpu(grant->nlink)); if (inode->i_nlink == 0 && (newcaps & (CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL))) - deleted_inode = 1; + deleted_inode = true; } if ((issued & CEPH_CAP_XATTR_EXCL) == 0 && grant->xattr_len) { @@ -2487,7 +2487,7 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc, /* Do we need to revalidate our fscache cookie. Don't bother on the * first cache cap as we already validate at cookie creation time. */ if ((issued & CEPH_CAP_FILE_CACHE) && ci->i_rdcache_gen > 1) - queue_revalidate = 1; + queue_revalidate = true; if (newcaps & CEPH_CAP_ANY_RD) { /* ctime/mtime/atime? */ @@ -2516,7 +2516,7 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc, ci->i_wanted_max_size = 0; /* reset */ ci->i_requested_max_size = 0; } - wake = 1; + wake = true; } } @@ -2546,7 +2546,7 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc, ceph_cap_string(newcaps), ceph_cap_string(revoking)); if (revoking & used & CEPH_CAP_FILE_BUFFER) - writeback = 1; /* initiate writeback; will delay ack */ + writeback = true; /* initiate writeback; will delay ack */ else if (revoking == CEPH_CAP_FILE_CACHE && (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 && queue_invalidate) @@ -2572,7 +2572,7 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc, cap->implemented |= newcaps; /* add bits only, to * avoid stepping on a * pending revocation */ - wake = 1; + wake = true; } BUG_ON(cap->issued & ~cap->implemented); @@ -2586,7 +2586,7 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc, kick_flushing_inode_caps(mdsc, session, inode); up_read(&mdsc->snap_rwsem); if (newcaps & ~issued) - wake = 1; + wake = true; } if (queue_trunc) { @@ -3045,6 +3045,12 @@ void ceph_handle_caps(struct ceph_mds_session *session, } } + /* lookup ino */ + inode = ceph_find_inode(sb, vino); + ci = ceph_inode(inode); + dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino, + vino.snap, inode); + mutex_lock(&session->s_mutex); session->s_seq++; dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq, @@ -3053,11 +3059,6 @@ void ceph_handle_caps(struct ceph_mds_session *session, if (op == CEPH_CAP_OP_IMPORT) ceph_add_cap_releases(mdsc, session); - /* lookup ino */ - inode = ceph_find_inode(sb, vino); - ci = ceph_inode(inode); - dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino, - vino.snap, inode); if (!inode) { dout(" i don't have ino %llx\n", vino.ino); diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 5a743ac141ab..5d5a4c8c8496 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -158,10 +158,47 @@ static int dentry_lru_show(struct seq_file *s, void *ptr) return 0; } +static int mds_sessions_show(struct seq_file *s, void *ptr) +{ + struct ceph_fs_client *fsc = s->private; + struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_auth_client *ac = fsc->client->monc.auth; + struct ceph_options *opt = fsc->client->options; + int mds = -1; + + mutex_lock(&mdsc->mutex); + + /* The 'num' portion of an 'entity name' */ + seq_printf(s, "global_id %llu\n", ac->global_id); + + /* The -o name mount argument */ + seq_printf(s, "name \"%s\"\n", opt->name ? opt->name : ""); + + /* The list of MDS session rank+state */ + for (mds = 0; mds < mdsc->max_sessions; mds++) { + struct ceph_mds_session *session = + __ceph_lookup_mds_session(mdsc, mds); + if (!session) { + continue; + } + mutex_unlock(&mdsc->mutex); + seq_printf(s, "mds.%d %s\n", + session->s_mds, + ceph_session_state_name(session->s_state)); + + ceph_put_mds_session(session); + mutex_lock(&mdsc->mutex); + } + mutex_unlock(&mdsc->mutex); + + return 0; +} + CEPH_DEFINE_SHOW_FUNC(mdsmap_show) CEPH_DEFINE_SHOW_FUNC(mdsc_show) CEPH_DEFINE_SHOW_FUNC(caps_show) CEPH_DEFINE_SHOW_FUNC(dentry_lru_show) +CEPH_DEFINE_SHOW_FUNC(mds_sessions_show) /* @@ -193,6 +230,7 @@ void ceph_fs_debugfs_cleanup(struct ceph_fs_client *fsc) debugfs_remove(fsc->debugfs_bdi); debugfs_remove(fsc->debugfs_congestion_kb); debugfs_remove(fsc->debugfs_mdsmap); + debugfs_remove(fsc->debugfs_mds_sessions); debugfs_remove(fsc->debugfs_caps); debugfs_remove(fsc->debugfs_mdsc); debugfs_remove(fsc->debugfs_dentry_lru); @@ -231,6 +269,14 @@ int ceph_fs_debugfs_init(struct ceph_fs_client *fsc) if (!fsc->debugfs_mdsmap) goto out; + fsc->debugfs_mds_sessions = debugfs_create_file("mds_sessions", + 0600, + fsc->client->debugfs_dir, + fsc, + &mds_sessions_show_fops); + if (!fsc->debugfs_mds_sessions) + goto out; + fsc->debugfs_mdsc = debugfs_create_file("mdsc", 0600, fsc->client->debugfs_dir, diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index c29d6ae68874..e6d63f8f98c0 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -682,17 +682,22 @@ static int ceph_mknod(struct inode *dir, struct dentry *dentry, struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; + struct ceph_acls_info acls = {}; int err; if (ceph_snap(dir) != CEPH_NOSNAP) return -EROFS; + err = ceph_pre_init_acls(dir, &mode, &acls); + if (err < 0) + return err; + dout("mknod in dir %p dentry %p mode 0%ho rdev %d\n", dir, dentry, mode, rdev); req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_MKNOD, USE_AUTH_MDS); if (IS_ERR(req)) { - d_drop(dentry); - return PTR_ERR(req); + err = PTR_ERR(req); + goto out; } req->r_dentry = dget(dentry); req->r_num_caps = 2; @@ -701,15 +706,20 @@ static int ceph_mknod(struct inode *dir, struct dentry *dentry, req->r_args.mknod.rdev = cpu_to_le32(rdev); req->r_dentry_drop = CEPH_CAP_FILE_SHARED; req->r_dentry_unless = CEPH_CAP_FILE_EXCL; + if (acls.pagelist) { + req->r_pagelist = acls.pagelist; + acls.pagelist = NULL; + } err = ceph_mdsc_do_request(mdsc, dir, req); if (!err && !req->r_reply_info.head->is_dentry) err = ceph_handle_notrace_create(dir, dentry); ceph_mdsc_put_request(req); - +out: if (!err) - ceph_init_acl(dentry, dentry->d_inode, dir); + ceph_init_inode_acls(dentry->d_inode, &acls); else d_drop(dentry); + ceph_release_acls_info(&acls); return err; } @@ -733,8 +743,8 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry, dout("symlink in dir %p dentry %p to '%s'\n", dir, dentry, dest); req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SYMLINK, USE_AUTH_MDS); if (IS_ERR(req)) { - d_drop(dentry); - return PTR_ERR(req); + err = PTR_ERR(req); + goto out; } req->r_dentry = dget(dentry); req->r_num_caps = 2; @@ -746,9 +756,8 @@ static int ceph_symlink(struct inode *dir, struct dentry *dentry, if (!err && !req->r_reply_info.head->is_dentry) err = ceph_handle_notrace_create(dir, dentry); ceph_mdsc_put_request(req); - if (!err) - ceph_init_acl(dentry, dentry->d_inode, dir); - else +out: + if (err) d_drop(dentry); return err; } @@ -758,6 +767,7 @@ static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; + struct ceph_acls_info acls = {}; int err = -EROFS; int op; @@ -772,6 +782,12 @@ static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) } else { goto out; } + + mode |= S_IFDIR; + err = ceph_pre_init_acls(dir, &mode, &acls); + if (err < 0) + goto out; + req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); if (IS_ERR(req)) { err = PTR_ERR(req); @@ -784,15 +800,20 @@ static int ceph_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) req->r_args.mkdir.mode = cpu_to_le32(mode); req->r_dentry_drop = CEPH_CAP_FILE_SHARED; req->r_dentry_unless = CEPH_CAP_FILE_EXCL; + if (acls.pagelist) { + req->r_pagelist = acls.pagelist; + acls.pagelist = NULL; + } err = ceph_mdsc_do_request(mdsc, dir, req); if (!err && !req->r_reply_info.head->is_dentry) err = ceph_handle_notrace_create(dir, dentry); ceph_mdsc_put_request(req); out: if (!err) - ceph_init_acl(dentry, dentry->d_inode, dir); + ceph_init_inode_acls(dentry->d_inode, &acls); else d_drop(dentry); + ceph_release_acls_info(&acls); return err; } @@ -1069,7 +1090,6 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) ceph_dentry_lru_touch(dentry); } else { ceph_dir_clear_complete(dir); - d_drop(dentry); } iput(dir); return valid; diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 2eb02f80a0ab..d7e0da8366e6 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -235,6 +235,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_request *req; struct dentry *dn; + struct ceph_acls_info acls = {}; int err; dout("atomic_open %p dentry %p '%.*s' %s flags %d mode 0%o\n", @@ -248,22 +249,34 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, if (err < 0) return err; + if (flags & O_CREAT) { + err = ceph_pre_init_acls(dir, &mode, &acls); + if (err < 0) + return err; + } + /* do the open */ req = prepare_open_request(dir->i_sb, flags, mode); - if (IS_ERR(req)) - return PTR_ERR(req); + if (IS_ERR(req)) { + err = PTR_ERR(req); + goto out_acl; + } req->r_dentry = dget(dentry); req->r_num_caps = 2; if (flags & O_CREAT) { req->r_dentry_drop = CEPH_CAP_FILE_SHARED; req->r_dentry_unless = CEPH_CAP_FILE_EXCL; + if (acls.pagelist) { + req->r_pagelist = acls.pagelist; + acls.pagelist = NULL; + } } req->r_locked_dir = dir; /* caller holds dir->i_mutex */ err = ceph_mdsc_do_request(mdsc, (flags & (O_CREAT|O_TRUNC)) ? dir : NULL, req); if (err) - goto out_err; + goto out_req; err = ceph_handle_snapdir(req, dentry, err); if (err == 0 && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry) @@ -278,7 +291,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, dn = NULL; } if (err) - goto out_err; + goto out_req; if (dn || dentry->d_inode == NULL || S_ISLNK(dentry->d_inode->i_mode)) { /* make vfs retry on splice, ENOENT, or symlink */ dout("atomic_open finish_no_open on dn %p\n", dn); @@ -286,15 +299,17 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, } else { dout("atomic_open finish_open on dn %p\n", dn); if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) { - ceph_init_acl(dentry, dentry->d_inode, dir); + ceph_init_inode_acls(dentry->d_inode, &acls); *opened |= FILE_CREATED; } err = finish_open(file, dentry, ceph_open, opened); } -out_err: +out_req: if (!req->r_err && req->r_target_inode) ceph_put_fmode(ceph_inode(req->r_target_inode), req->r_fmode); ceph_mdsc_put_request(req); +out_acl: + ceph_release_acls_info(&acls); dout("atomic_open result=%d\n", err); return err; } @@ -826,8 +841,7 @@ again: ceph_put_cap_refs(ci, got); if (checkeof && ret >= 0) { - int statret = ceph_do_getattr(inode, - CEPH_STAT_CAP_SIZE); + int statret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false); /* hit EOF or hole? */ if (statret == 0 && iocb->ki_pos < inode->i_size && @@ -836,7 +850,6 @@ again: ", reading more\n", iocb->ki_pos, inode->i_size); - iov_iter_advance(to, ret); read += ret; len -= ret; checkeof = 0; @@ -995,7 +1008,7 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence) mutex_lock(&inode->i_mutex); if (whence == SEEK_END || whence == SEEK_DATA || whence == SEEK_HOLE) { - ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE); + ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false); if (ret < 0) { offset = ret; goto out; diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 04c89c266cec..7b6139004401 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -766,7 +766,7 @@ static int fill_inode(struct inode *inode, /* xattrs */ /* note that if i_xattrs.len <= 4, i_xattrs.data will still be NULL. */ - if ((issued & CEPH_CAP_XATTR_EXCL) == 0 && + if ((ci->i_xattrs.version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) && le64_to_cpu(info->xattr_version) > ci->i_xattrs.version) { if (ci->i_xattrs.blob) ceph_buffer_put(ci->i_xattrs.blob); @@ -1813,10 +1813,6 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) if (ia_valid & ATTR_SIZE) { dout("setattr %p size %lld -> %lld\n", inode, inode->i_size, attr->ia_size); - if (attr->ia_size > inode->i_sb->s_maxbytes) { - err = -EINVAL; - goto out; - } if ((issued & CEPH_CAP_FILE_EXCL) && attr->ia_size > inode->i_size) { inode->i_size = attr->ia_size; @@ -1896,8 +1892,6 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) if (mask & CEPH_SETATTR_SIZE) __ceph_do_pending_vmtruncate(inode); return err; -out: - spin_unlock(&ci->i_ceph_lock); out_put: ceph_mdsc_put_request(req); return err; @@ -1907,7 +1901,7 @@ out_put: * Verify that we have a lease on the given mask. If not, * do a getattr against an mds. */ -int ceph_do_getattr(struct inode *inode, int mask) +int ceph_do_getattr(struct inode *inode, int mask, bool force) { struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb); struct ceph_mds_client *mdsc = fsc->mdsc; @@ -1920,7 +1914,7 @@ int ceph_do_getattr(struct inode *inode, int mask) } dout("do_getattr inode %p mask %s mode 0%o\n", inode, ceph_cap_string(mask), inode->i_mode); - if (ceph_caps_issued_mask(ceph_inode(inode), mask, 1)) + if (!force && ceph_caps_issued_mask(ceph_inode(inode), mask, 1)) return 0; req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS); @@ -1948,7 +1942,7 @@ int ceph_permission(struct inode *inode, int mask) if (mask & MAY_NOT_BLOCK) return -ECHILD; - err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED); + err = ceph_do_getattr(inode, CEPH_CAP_AUTH_SHARED, false); if (!err) err = generic_permission(inode, mask); @@ -1966,7 +1960,7 @@ int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry, struct ceph_inode_info *ci = ceph_inode(inode); int err; - err = ceph_do_getattr(inode, CEPH_STAT_CAP_INODE_ALL); + err = ceph_do_getattr(inode, CEPH_STAT_CAP_INODE_ALL, false); if (!err) { generic_fillattr(inode, stat); stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino); diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c index a822a6e58290..f851d8d70158 100644 --- a/fs/ceph/ioctl.c +++ b/fs/ceph/ioctl.c @@ -19,7 +19,7 @@ static long ceph_ioctl_get_layout(struct file *file, void __user *arg) struct ceph_ioctl_layout l; int err; - err = ceph_do_getattr(file_inode(file), CEPH_STAT_CAP_LAYOUT); + err = ceph_do_getattr(file_inode(file), CEPH_STAT_CAP_LAYOUT, false); if (!err) { l.stripe_unit = ceph_file_layout_su(ci->i_layout); l.stripe_count = ceph_file_layout_stripe_count(ci->i_layout); @@ -41,7 +41,7 @@ static long __validate_layout(struct ceph_mds_client *mdsc, /* validate striping parameters */ if ((l->object_size & ~PAGE_MASK) || (l->stripe_unit & ~PAGE_MASK) || - (l->stripe_unit != 0 && + ((unsigned)l->stripe_unit != 0 && ((unsigned)l->object_size % (unsigned)l->stripe_unit))) return -EINVAL; @@ -74,7 +74,7 @@ static long ceph_ioctl_set_layout(struct file *file, void __user *arg) return -EFAULT; /* validate changed params against current layout */ - err = ceph_do_getattr(file_inode(file), CEPH_STAT_CAP_LAYOUT); + err = ceph_do_getattr(file_inode(file), CEPH_STAT_CAP_LAYOUT, false); if (err) return err; diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index bad07c09f91e..a92d3f5c6c12 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -7,6 +7,7 @@ #include <linux/sched.h> #include <linux/debugfs.h> #include <linux/seq_file.h> +#include <linux/utsname.h> #include "super.h" #include "mds_client.h" @@ -334,7 +335,7 @@ static void destroy_reply_info(struct ceph_mds_reply_info_parsed *info) /* * sessions */ -static const char *session_state_name(int s) +const char *ceph_session_state_name(int s) { switch (s) { case CEPH_MDS_SESSION_NEW: return "new"; @@ -542,6 +543,8 @@ void ceph_mdsc_release_request(struct kref *kref) } kfree(req->r_path1); kfree(req->r_path2); + if (req->r_pagelist) + ceph_pagelist_release(req->r_pagelist); put_request_session(req); ceph_unreserve_caps(req->r_mdsc, &req->r_caps_reservation); kfree(req); @@ -812,6 +815,74 @@ static struct ceph_msg *create_session_msg(u32 op, u64 seq) h = msg->front.iov_base; h->op = cpu_to_le32(op); h->seq = cpu_to_le64(seq); + + return msg; +} + +/* + * session message, specialization for CEPH_SESSION_REQUEST_OPEN + * to include additional client metadata fields. + */ +static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u64 seq) +{ + struct ceph_msg *msg; + struct ceph_mds_session_head *h; + int i = -1; + int metadata_bytes = 0; + int metadata_key_count = 0; + struct ceph_options *opt = mdsc->fsc->client->options; + void *p; + + const char* metadata[3][2] = { + {"hostname", utsname()->nodename}, + {"entity_id", opt->name ? opt->name : ""}, + {NULL, NULL} + }; + + /* Calculate serialized length of metadata */ + metadata_bytes = 4; /* map length */ + for (i = 0; metadata[i][0] != NULL; ++i) { + metadata_bytes += 8 + strlen(metadata[i][0]) + + strlen(metadata[i][1]); + metadata_key_count++; + } + + /* Allocate the message */ + msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h) + metadata_bytes, + GFP_NOFS, false); + if (!msg) { + pr_err("create_session_msg ENOMEM creating msg\n"); + return NULL; + } + h = msg->front.iov_base; + h->op = cpu_to_le32(CEPH_SESSION_REQUEST_OPEN); + h->seq = cpu_to_le64(seq); + + /* + * Serialize client metadata into waiting buffer space, using + * the format that userspace expects for map<string, string> + */ + msg->hdr.version = 2; /* ClientSession messages with metadata are v2 */ + + /* The write pointer, following the session_head structure */ + p = msg->front.iov_base + sizeof(*h); + + /* Number of entries in the map */ + ceph_encode_32(&p, metadata_key_count); + + /* Two length-prefixed strings for each entry in the map */ + for (i = 0; metadata[i][0] != NULL; ++i) { + size_t const key_len = strlen(metadata[i][0]); + size_t const val_len = strlen(metadata[i][1]); + + ceph_encode_32(&p, key_len); + memcpy(p, metadata[i][0], key_len); + p += key_len; + ceph_encode_32(&p, val_len); + memcpy(p, metadata[i][1], val_len); + p += val_len; + } + return msg; } @@ -835,7 +906,7 @@ static int __open_session(struct ceph_mds_client *mdsc, session->s_renew_requested = jiffies; /* send connect message */ - msg = create_session_msg(CEPH_SESSION_REQUEST_OPEN, session->s_seq); + msg = create_session_open_msg(mdsc, session->s_seq); if (!msg) return -ENOMEM; ceph_con_send(&session->s_con, msg); @@ -1164,7 +1235,7 @@ static int send_flushmsg_ack(struct ceph_mds_client *mdsc, struct ceph_msg *msg; dout("send_flushmsg_ack to mds%d (%s)s seq %lld\n", - session->s_mds, session_state_name(session->s_state), seq); + session->s_mds, ceph_session_state_name(session->s_state), seq); msg = create_session_msg(CEPH_SESSION_FLUSHMSG_ACK, seq); if (!msg) return -ENOMEM; @@ -1216,7 +1287,7 @@ static int request_close_session(struct ceph_mds_client *mdsc, struct ceph_msg *msg; dout("request_close_session mds%d state %s seq %lld\n", - session->s_mds, session_state_name(session->s_state), + session->s_mds, ceph_session_state_name(session->s_state), session->s_seq); msg = create_session_msg(CEPH_SESSION_REQUEST_CLOSE, session->s_seq); if (!msg) @@ -1847,13 +1918,15 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc, msg->front.iov_len = p - msg->front.iov_base; msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); - if (req->r_data_len) { - /* outbound data set only by ceph_sync_setxattr() */ - BUG_ON(!req->r_pages); - ceph_msg_data_add_pages(msg, req->r_pages, req->r_data_len, 0); + if (req->r_pagelist) { + struct ceph_pagelist *pagelist = req->r_pagelist; + atomic_inc(&pagelist->refcnt); + ceph_msg_data_add_pagelist(msg, pagelist); + msg->hdr.data_len = cpu_to_le32(pagelist->length); + } else { + msg->hdr.data_len = 0; } - msg->hdr.data_len = cpu_to_le32(req->r_data_len); msg->hdr.data_off = cpu_to_le16(0); out_free2: @@ -2007,7 +2080,7 @@ static int __do_request(struct ceph_mds_client *mdsc, req->r_session = get_session(session); dout("do_request mds%d session %p state %s\n", mds, session, - session_state_name(session->s_state)); + ceph_session_state_name(session->s_state)); if (session->s_state != CEPH_MDS_SESSION_OPEN && session->s_state != CEPH_MDS_SESSION_HUNG) { if (session->s_state == CEPH_MDS_SESSION_NEW || @@ -2078,6 +2151,7 @@ static void kick_requests(struct ceph_mds_client *mdsc, int mds) if (req->r_session && req->r_session->s_mds == mds) { dout(" kicking tid %llu\n", req->r_tid); + list_del_init(&req->r_wait); __do_request(mdsc, req); } } @@ -2444,7 +2518,7 @@ static void handle_session(struct ceph_mds_session *session, dout("handle_session mds%d %s %p state %s seq %llu\n", mds, ceph_session_op_name(op), session, - session_state_name(session->s_state), seq); + ceph_session_state_name(session->s_state), seq); if (session->s_state == CEPH_MDS_SESSION_HUNG) { session->s_state = CEPH_MDS_SESSION_OPEN; @@ -2471,9 +2545,8 @@ static void handle_session(struct ceph_mds_session *session, if (session->s_state == CEPH_MDS_SESSION_RECONNECTING) pr_info("mds%d reconnect denied\n", session->s_mds); remove_session_caps(session); - wake = 1; /* for good measure */ + wake = 2; /* for good measure */ wake_up_all(&mdsc->session_close_wq); - kick_requests(mdsc, mds); break; case CEPH_SESSION_STALE: @@ -2503,6 +2576,8 @@ static void handle_session(struct ceph_mds_session *session, if (wake) { mutex_lock(&mdsc->mutex); __wake_requests(mdsc, &session->s_waiting); + if (wake == 2) + kick_requests(mdsc, mds); mutex_unlock(&mdsc->mutex); } return; @@ -2695,18 +2770,8 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, session->s_state = CEPH_MDS_SESSION_RECONNECTING; session->s_seq = 0; - ceph_con_close(&session->s_con); - ceph_con_open(&session->s_con, - CEPH_ENTITY_TYPE_MDS, mds, - ceph_mdsmap_get_addr(mdsc->mdsmap, mds)); - - /* replay unsafe requests */ - replay_unsafe_requests(mdsc, session); - - down_read(&mdsc->snap_rwsem); - dout("session %p state %s\n", session, - session_state_name(session->s_state)); + ceph_session_state_name(session->s_state)); spin_lock(&session->s_gen_ttl_lock); session->s_cap_gen++; @@ -2723,6 +2788,19 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc, discard_cap_releases(mdsc, session); spin_unlock(&session->s_cap_lock); + /* trim unused caps to reduce MDS's cache rejoin time */ + shrink_dcache_parent(mdsc->fsc->sb->s_root); + + ceph_con_close(&session->s_con); + ceph_con_open(&session->s_con, + CEPH_ENTITY_TYPE_MDS, mds, + ceph_mdsmap_get_addr(mdsc->mdsmap, mds)); + + /* replay unsafe requests */ + replay_unsafe_requests(mdsc, session); + + down_read(&mdsc->snap_rwsem); + /* traverse this session's caps */ s_nr_caps = session->s_nr_caps; err = ceph_pagelist_encode_32(pagelist, s_nr_caps); @@ -2791,7 +2869,6 @@ fail: mutex_unlock(&session->s_mutex); fail_nomsg: ceph_pagelist_release(pagelist); - kfree(pagelist); fail_nopagelist: pr_err("error %d preparing reconnect for mds%d\n", err, mds); return; @@ -2827,7 +2904,7 @@ static void check_new_map(struct ceph_mds_client *mdsc, ceph_mdsmap_is_laggy(oldmap, i) ? " (laggy)" : "", ceph_mds_state_name(newstate), ceph_mdsmap_is_laggy(newmap, i) ? " (laggy)" : "", - session_state_name(s->s_state)); + ceph_session_state_name(s->s_state)); if (i >= newmap->m_max_mds || memcmp(ceph_mdsmap_get_addr(oldmap, i), @@ -2939,14 +3016,15 @@ static void handle_lease(struct ceph_mds_client *mdsc, if (dname.len != get_unaligned_le32(h+1)) goto bad; - mutex_lock(&session->s_mutex); - session->s_seq++; - /* lookup inode */ inode = ceph_find_inode(sb, vino); dout("handle_lease %s, ino %llx %p %.*s\n", ceph_lease_op_name(h->action), vino.ino, inode, dname.len, dname.name); + + mutex_lock(&session->s_mutex); + session->s_seq++; + if (inode == NULL) { dout("handle_lease no inode %llx\n", vino.ino); goto release; diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index e00737cf523c..3288359353e9 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -202,9 +202,7 @@ struct ceph_mds_request { bool r_direct_is_hash; /* true if r_direct_hash is valid */ /* data payload is used for xattr ops */ - struct page **r_pages; - int r_num_pages; - int r_data_len; + struct ceph_pagelist *r_pagelist; /* what caps shall we drop? */ int r_inode_drop, r_inode_unless; @@ -332,6 +330,8 @@ ceph_get_mds_session(struct ceph_mds_session *s) return s; } +extern const char *ceph_session_state_name(int s); + extern void ceph_put_mds_session(struct ceph_mds_session *s); extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc, diff --git a/fs/ceph/super.h b/fs/ceph/super.h index 12b20744e386..b82f507979b8 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -95,6 +95,7 @@ struct ceph_fs_client { struct dentry *debugfs_congestion_kb; struct dentry *debugfs_bdi; struct dentry *debugfs_mdsc, *debugfs_mdsmap; + struct dentry *debugfs_mds_sessions; #endif #ifdef CONFIG_CEPH_FSCACHE @@ -714,7 +715,7 @@ extern void ceph_queue_vmtruncate(struct inode *inode); extern void ceph_queue_invalidate(struct inode *inode); extern void ceph_queue_writeback(struct inode *inode); -extern int ceph_do_getattr(struct inode *inode, int mask); +extern int ceph_do_getattr(struct inode *inode, int mask, bool force); extern int ceph_permission(struct inode *inode, int mask); extern int ceph_setattr(struct dentry *dentry, struct iattr *attr); extern int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry, @@ -733,15 +734,23 @@ extern void __ceph_build_xattrs_blob(struct ceph_inode_info *ci); extern void __ceph_destroy_xattrs(struct ceph_inode_info *ci); extern void __init ceph_xattr_init(void); extern void ceph_xattr_exit(void); +extern const struct xattr_handler *ceph_xattr_handlers[]; /* acl.c */ -extern const struct xattr_handler *ceph_xattr_handlers[]; +struct ceph_acls_info { + void *default_acl; + void *acl; + struct ceph_pagelist *pagelist; +}; #ifdef CONFIG_CEPH_FS_POSIX_ACL struct posix_acl *ceph_get_acl(struct inode *, int); int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type); -int ceph_init_acl(struct dentry *, struct inode *, struct inode *); +int ceph_pre_init_acls(struct inode *dir, umode_t *mode, + struct ceph_acls_info *info); +void ceph_init_inode_acls(struct inode *inode, struct ceph_acls_info *info); +void ceph_release_acls_info(struct ceph_acls_info *info); static inline void ceph_forget_all_cached_acls(struct inode *inode) { @@ -753,12 +762,18 @@ static inline void ceph_forget_all_cached_acls(struct inode *inode) #define ceph_get_acl NULL #define ceph_set_acl NULL -static inline int ceph_init_acl(struct dentry *dentry, struct inode *inode, - struct inode *dir) +static inline int ceph_pre_init_acls(struct inode *dir, umode_t *mode, + struct ceph_acls_info *info) { return 0; } - +static inline void ceph_init_inode_acls(struct inode *inode, + struct ceph_acls_info *info) +{ +} +static inline void ceph_release_acls_info(struct ceph_acls_info *info) +{ +} static inline int ceph_acl_chmod(struct dentry *dentry, struct inode *inode) { return 0; diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 12f58d22e017..678b0d2bbbc4 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -1,4 +1,5 @@ #include <linux/ceph/ceph_debug.h> +#include <linux/ceph/pagelist.h> #include "super.h" #include "mds_client.h" @@ -284,8 +285,7 @@ static size_t ceph_vxattrs_name_size(struct ceph_vxattr *vxattrs) return ceph_dir_vxattrs_name_size; if (vxattrs == ceph_file_vxattrs) return ceph_file_vxattrs_name_size; - BUG(); - + BUG_ON(vxattrs); return 0; } @@ -736,24 +736,20 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value, dout("getxattr %p ver=%lld index_ver=%lld\n", inode, ci->i_xattrs.version, ci->i_xattrs.index_version); - if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) && - (ci->i_xattrs.index_version >= ci->i_xattrs.version)) { - goto get_xattr; - } else { + if (ci->i_xattrs.version == 0 || + !__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1)) { spin_unlock(&ci->i_ceph_lock); /* get xattrs from mds (if we don't already have them) */ - err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR); + err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR, true); if (err) return err; + spin_lock(&ci->i_ceph_lock); } - spin_lock(&ci->i_ceph_lock); - err = __build_xattrs(inode); if (err < 0) goto out; -get_xattr: err = -ENODATA; /* == ENOATTR */ xattr = __get_xattr(ci, name); if (!xattr) @@ -798,23 +794,18 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size) dout("listxattr %p ver=%lld index_ver=%lld\n", inode, ci->i_xattrs.version, ci->i_xattrs.index_version); - if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) && - (ci->i_xattrs.index_version >= ci->i_xattrs.version)) { - goto list_xattr; - } else { + if (ci->i_xattrs.version == 0 || + !__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1)) { spin_unlock(&ci->i_ceph_lock); - err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR); + err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR, true); if (err) return err; + spin_lock(&ci->i_ceph_lock); } - spin_lock(&ci->i_ceph_lock); - err = __build_xattrs(inode); if (err < 0) goto out; - -list_xattr: /* * Start with virtual dir xattr names (if any) (including * terminating '\0' characters for each). @@ -860,35 +851,25 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name, struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_mds_request *req; struct ceph_mds_client *mdsc = fsc->mdsc; + struct ceph_pagelist *pagelist = NULL; int err; - int i, nr_pages; - struct page **pages = NULL; - void *kaddr; - - /* copy value into some pages */ - nr_pages = calc_pages_for(0, size); - if (nr_pages) { - pages = kmalloc(sizeof(pages[0])*nr_pages, GFP_NOFS); - if (!pages) + + if (value) { + /* copy value into pagelist */ + pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS); + if (!pagelist) return -ENOMEM; - err = -ENOMEM; - for (i = 0; i < nr_pages; i++) { - pages[i] = __page_cache_alloc(GFP_NOFS); - if (!pages[i]) { - nr_pages = i; - goto out; - } - kaddr = kmap(pages[i]); - memcpy(kaddr, value + i*PAGE_CACHE_SIZE, - min(PAGE_CACHE_SIZE, size-i*PAGE_CACHE_SIZE)); - } + + ceph_pagelist_init(pagelist); + err = ceph_pagelist_append(pagelist, value, size); + if (err) + goto out; + } else { + flags |= CEPH_XATTR_REMOVE; } dout("setxattr value=%.*s\n", (int)size, value); - if (!value) - flags |= CEPH_XATTR_REMOVE; - /* do request */ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETXATTR, USE_AUTH_MDS); @@ -903,9 +884,8 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name, req->r_args.setxattr.flags = cpu_to_le32(flags); req->r_path2 = kstrdup(name, GFP_NOFS); - req->r_pages = pages; - req->r_num_pages = nr_pages; - req->r_data_len = size; + req->r_pagelist = pagelist; + pagelist = NULL; dout("xattr.ver (before): %lld\n", ci->i_xattrs.version); err = ceph_mdsc_do_request(mdsc, NULL, req); @@ -913,11 +893,8 @@ static int ceph_sync_setxattr(struct dentry *dentry, const char *name, dout("xattr.ver (after): %lld\n", ci->i_xattrs.version); out: - if (pages) { - for (i = 0; i < nr_pages; i++) - __free_page(pages[i]); - kfree(pages); - } + if (pagelist) + ceph_pagelist_release(pagelist); return err; } @@ -968,7 +945,7 @@ int __ceph_setxattr(struct dentry *dentry, const char *name, retry: issued = __ceph_caps_issued(ci, NULL); dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued)); - if (!(issued & CEPH_CAP_XATTR_EXCL)) + if (ci->i_xattrs.version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) goto do_sync; __build_xattrs(inode); @@ -1077,7 +1054,7 @@ retry: issued = __ceph_caps_issued(ci, NULL); dout("removexattr %p issued %s\n", inode, ceph_cap_string(issued)); - if (!(issued & CEPH_CAP_XATTR_EXCL)) + if (ci->i_xattrs.version == 0 || !(issued & CEPH_CAP_XATTR_EXCL)) goto do_sync; __build_xattrs(inode); diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index 58df174deb10..b8602f199815 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c @@ -195,15 +195,15 @@ char *cifs_compose_mount_options(const char *sb_mountdata, else noff = tkn_e - (sb_mountdata + off) + 1; - if (strnicmp(sb_mountdata + off, "unc=", 4) == 0) { + if (strncasecmp(sb_mountdata + off, "unc=", 4) == 0) { off += noff; continue; } - if (strnicmp(sb_mountdata + off, "ip=", 3) == 0) { + if (strncasecmp(sb_mountdata + off, "ip=", 3) == 0) { off += noff; continue; } - if (strnicmp(sb_mountdata + off, "prefixpath=", 11) == 0) { + if (strncasecmp(sb_mountdata + off, "prefixpath=", 11) == 0) { off += noff; continue; } diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h index 9409fa10bd5c..3182273a3407 100644 --- a/fs/cifs/cifs_fs_sb.h +++ b/fs/cifs/cifs_fs_sb.h @@ -45,6 +45,7 @@ #define CIFS_MOUNT_POSIXACL 0x100000 /* mirror of MS_POSIXACL in mnt_cifs_flags */ #define CIFS_MOUNT_CIFS_BACKUPUID 0x200000 /* backup intent bit for a user */ #define CIFS_MOUNT_CIFS_BACKUPGID 0x400000 /* backup intent bit for a group */ +#define CIFS_MOUNT_MAP_SFM_CHR 0x800000 /* SFM/MAC mapping for illegal chars */ struct cifs_sb_info { struct rb_root tlink_tree; diff --git a/fs/cifs/cifs_spnego.c b/fs/cifs/cifs_spnego.c index a3e932547617..f4cf200b3c76 100644 --- a/fs/cifs/cifs_spnego.c +++ b/fs/cifs/cifs_spnego.c @@ -62,7 +62,6 @@ cifs_spnego_key_destroy(struct key *key) struct key_type cifs_spnego_key_type = { .name = "cifs.spnego", .instantiate = cifs_spnego_key_instantiate, - .match = user_match, .destroy = cifs_spnego_key_destroy, .describe = user_describe, }; diff --git a/fs/cifs/cifs_unicode.c b/fs/cifs/cifs_unicode.c index 15e9505aa35f..0303c6793d90 100644 --- a/fs/cifs/cifs_unicode.c +++ b/fs/cifs/cifs_unicode.c @@ -20,6 +20,7 @@ */ #include <linux/fs.h> #include <linux/slab.h> +#include "cifs_fs_sb.h" #include "cifs_unicode.h" #include "cifs_uniupr.h" #include "cifspdu.h" @@ -61,26 +62,24 @@ cifs_utf16_bytes(const __le16 *from, int maxbytes, return outlen; } -/* - * cifs_mapchar - convert a host-endian char to proper char in codepage - * @target - where converted character should be copied - * @src_char - 2 byte host-endian source character - * @cp - codepage to which character should be converted - * @mapchar - should character be mapped according to mapchars mount option? - * - * This function handles the conversion of a single character. It is the - * responsibility of the caller to ensure that the target buffer is large - * enough to hold the result of the conversion (at least NLS_MAX_CHARSET_SIZE). - */ -static int -cifs_mapchar(char *target, const __u16 src_char, const struct nls_table *cp, - bool mapchar) +int cifs_remap(struct cifs_sb_info *cifs_sb) { - int len = 1; + int map_type; + + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SFM_CHR) + map_type = SFM_MAP_UNI_RSVD; + else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR) + map_type = SFU_MAP_UNI_RSVD; + else + map_type = NO_MAP_UNI_RSVD; - if (!mapchar) - goto cp_convert; + return map_type; +} +/* Convert character using the SFU - "Services for Unix" remapping range */ +static bool +convert_sfu_char(const __u16 src_char, char *target) +{ /* * BB: Cannot handle remapping UNI_SLASH until all the calls to * build_path_from_dentry are modified, as they use slash as @@ -106,19 +105,74 @@ cifs_mapchar(char *target, const __u16 src_char, const struct nls_table *cp, *target = '<'; break; default: - goto cp_convert; + return false; } + return true; +} + +/* Convert character using the SFM - "Services for Mac" remapping range */ +static bool +convert_sfm_char(const __u16 src_char, char *target) +{ + switch (src_char) { + case SFM_COLON: + *target = ':'; + break; + case SFM_ASTERISK: + *target = '*'; + break; + case SFM_QUESTION: + *target = '?'; + break; + case SFM_PIPE: + *target = '|'; + break; + case SFM_GRTRTHAN: + *target = '>'; + break; + case SFM_LESSTHAN: + *target = '<'; + break; + case SFM_SLASH: + *target = '\\'; + break; + default: + return false; + } + return true; +} -out: - return len; -cp_convert: +/* + * cifs_mapchar - convert a host-endian char to proper char in codepage + * @target - where converted character should be copied + * @src_char - 2 byte host-endian source character + * @cp - codepage to which character should be converted + * @map_type - How should the 7 NTFS/SMB reserved characters be mapped to UCS2? + * + * This function handles the conversion of a single character. It is the + * responsibility of the caller to ensure that the target buffer is large + * enough to hold the result of the conversion (at least NLS_MAX_CHARSET_SIZE). + */ +static int +cifs_mapchar(char *target, const __u16 src_char, const struct nls_table *cp, + int maptype) +{ + int len = 1; + + if ((maptype == SFM_MAP_UNI_RSVD) && convert_sfm_char(src_char, target)) + return len; + else if ((maptype == SFU_MAP_UNI_RSVD) && + convert_sfu_char(src_char, target)) + return len; + + /* if character not one of seven in special remap set */ len = cp->uni2char(src_char, target, NLS_MAX_CHARSET_SIZE); if (len <= 0) { *target = '?'; len = 1; } - goto out; + return len; } /* @@ -145,7 +199,7 @@ cp_convert: */ int cifs_from_utf16(char *to, const __le16 *from, int tolen, int fromlen, - const struct nls_table *codepage, bool mapchar) + const struct nls_table *codepage, int map_type) { int i, charlen, safelen; int outlen = 0; @@ -172,13 +226,13 @@ cifs_from_utf16(char *to, const __le16 *from, int tolen, int fromlen, * conversion bleed into the null terminator */ if (outlen >= safelen) { - charlen = cifs_mapchar(tmp, ftmp, codepage, mapchar); + charlen = cifs_mapchar(tmp, ftmp, codepage, map_type); if ((outlen + charlen) > (tolen - nullsize)) break; } /* put converted char into 'to' buffer */ - charlen = cifs_mapchar(&to[outlen], ftmp, codepage, mapchar); + charlen = cifs_mapchar(&to[outlen], ftmp, codepage, map_type); outlen += charlen; } @@ -267,7 +321,7 @@ cifs_strndup_from_utf16(const char *src, const int maxlen, if (!dst) return NULL; cifs_from_utf16(dst, (__le16 *) src, len, maxlen, codepage, - false); + NO_MAP_UNI_RSVD); } else { len = strnlen(src, maxlen); len++; @@ -280,6 +334,66 @@ cifs_strndup_from_utf16(const char *src, const int maxlen, return dst; } +static __le16 convert_to_sfu_char(char src_char) +{ + __le16 dest_char; + + switch (src_char) { + case ':': + dest_char = cpu_to_le16(UNI_COLON); + break; + case '*': + dest_char = cpu_to_le16(UNI_ASTERISK); + break; + case '?': + dest_char = cpu_to_le16(UNI_QUESTION); + break; + case '<': + dest_char = cpu_to_le16(UNI_LESSTHAN); + break; + case '>': + dest_char = cpu_to_le16(UNI_GRTRTHAN); + break; + case '|': + dest_char = cpu_to_le16(UNI_PIPE); + break; + default: + dest_char = 0; + } + + return dest_char; +} + +static __le16 convert_to_sfm_char(char src_char) +{ + __le16 dest_char; + + switch (src_char) { + case ':': + dest_char = cpu_to_le16(SFM_COLON); + break; + case '*': + dest_char = cpu_to_le16(SFM_ASTERISK); + break; + case '?': + dest_char = cpu_to_le16(SFM_QUESTION); + break; + case '<': + dest_char = cpu_to_le16(SFM_LESSTHAN); + break; + case '>': + dest_char = cpu_to_le16(SFM_GRTRTHAN); + break; + case '|': + dest_char = cpu_to_le16(SFM_PIPE); + break; + default: + dest_char = 0; + } + + return dest_char; +} + /* * Convert 16 bit Unicode pathname to wire format from string in current code * page. Conversion may involve remapping up the six characters that are @@ -288,7 +402,7 @@ cifs_strndup_from_utf16(const char *src, const int maxlen, */ int cifsConvertToUTF16(__le16 *target, const char *source, int srclen, - const struct nls_table *cp, int mapChars) + const struct nls_table *cp, int map_chars) { int i, charlen; int j = 0; @@ -296,39 +410,30 @@ cifsConvertToUTF16(__le16 *target, const char *source, int srclen, __le16 dst_char; wchar_t tmp; - if (!mapChars) + if (map_chars == NO_MAP_UNI_RSVD) return cifs_strtoUTF16(target, source, PATH_MAX, cp); for (i = 0; i < srclen; j++) { src_char = source[i]; charlen = 1; - switch (src_char) { - case 0: + + /* check if end of string */ + if (src_char == 0) goto ctoUTF16_out; - case ':': - dst_char = cpu_to_le16(UNI_COLON); - break; - case '*': - dst_char = cpu_to_le16(UNI_ASTERISK); - break; - case '?': - dst_char = cpu_to_le16(UNI_QUESTION); - break; - case '<': - dst_char = cpu_to_le16(UNI_LESSTHAN); - break; - case '>': - dst_char = cpu_to_le16(UNI_GRTRTHAN); - break; - case '|': - dst_char = cpu_to_le16(UNI_PIPE); - break; + + /* see if we must remap this char */ + if (map_chars == SFU_MAP_UNI_RSVD) + dst_char = convert_to_sfu_char(src_char); + else if (map_chars == SFM_MAP_UNI_RSVD) + dst_char = convert_to_sfm_char(src_char); + else + dst_char = 0; /* * FIXME: We can not handle remapping backslash (UNI_SLASH) * until all the calls to build_path_from_dentry are modified, * as they use backslash as separator. */ - default: + if (dst_char == 0) { charlen = cp->char2uni(source + i, srclen - i, &tmp); dst_char = cpu_to_le16(tmp); diff --git a/fs/cifs/cifs_unicode.h b/fs/cifs/cifs_unicode.h index d8eac3b6cefb..bdc52cb9a676 100644 --- a/fs/cifs/cifs_unicode.h +++ b/fs/cifs/cifs_unicode.h @@ -52,6 +52,34 @@ #define UNI_PIPE (__u16) ('|' + 0xF000) #define UNI_SLASH (__u16) ('\\' + 0xF000) +/* + * Macs use an older "SFM" mapping of the symbols above. Fortunately it does + * not conflict (although almost does) with the mapping above. + */ + +#define SFM_ASTERISK ((__u16) 0xF021) +#define SFM_QUESTION ((__u16) 0xF025) +#define SFM_COLON ((__u16) 0xF022) +#define SFM_GRTRTHAN ((__u16) 0xF024) +#define SFM_LESSTHAN ((__u16) 0xF023) +#define SFM_PIPE ((__u16) 0xF027) +#define SFM_SLASH ((__u16) 0xF026) + +/* + * Mapping mechanism to use when one of the seven reserved characters is + * encountered. We can only map using one of the mechanisms at a time + * since otherwise readdir could return directory entries which we would + * not be able to open + * + * NO_MAP_UNI_RSVD = do not perform any remapping of the character + * SFM_MAP_UNI_RSVD = map reserved characters using SFM scheme (MAC compatible) + * SFU_MAP_UNI_RSVD = map reserved characters ala SFU ("mapchars" option) + * + */ +#define NO_MAP_UNI_RSVD 0 +#define SFM_MAP_UNI_RSVD 1 +#define SFU_MAP_UNI_RSVD 2 + /* Just define what we want from uniupr.h. We don't want to define the tables * in each source file. */ @@ -75,7 +103,7 @@ extern const struct UniCaseRange CifsUniLowerRange[]; #ifdef __KERNEL__ int cifs_from_utf16(char *to, const __le16 *from, int tolen, int fromlen, - const struct nls_table *codepage, bool mapchar); + const struct nls_table *cp, int map_type); int cifs_utf16_bytes(const __le16 *from, int maxbytes, const struct nls_table *codepage); int cifs_strtoUTF16(__le16 *, const char *, int, const struct nls_table *); @@ -84,6 +112,7 @@ char *cifs_strndup_from_utf16(const char *src, const int maxlen, const struct nls_table *codepage); extern int cifsConvertToUTF16(__le16 *target, const char *source, int maxlen, const struct nls_table *cp, int mapChars); +extern int cifs_remap(struct cifs_sb_info *cifs_sb); #ifdef CONFIG_CIFS_SMB2 extern __le16 *cifs_strndup_to_utf16(const char *src, const int maxlen, int *utf16_len, const struct nls_table *cp, diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index 7ff866dbb89e..6d00c419cbae 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -84,7 +84,6 @@ static struct key_type cifs_idmap_key_type = { .instantiate = cifs_idmap_key_instantiate, .destroy = cifs_idmap_key_destroy, .describe = user_describe, - .match = user_match, }; static char * diff --git a/fs/cifs/cifsencrypt.c b/fs/cifs/cifsencrypt.c index 4934347321d3..4ac7445e6ec7 100644 --- a/fs/cifs/cifsencrypt.c +++ b/fs/cifs/cifsencrypt.c @@ -431,7 +431,7 @@ find_domain_name(struct cifs_ses *ses, const struct nls_table *nls_cp) return -ENOMEM; cifs_from_utf16(ses->domainName, (__le16 *)blobptr, attrsize, attrsize, - nls_cp, false); + nls_cp, NO_MAP_UNI_RSVD); break; } } diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 889b98455750..9d7996e8e793 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -813,7 +813,8 @@ static loff_t cifs_llseek(struct file *file, loff_t offset, int whence) return generic_file_llseek(file, offset, whence); } -static int cifs_setlease(struct file *file, long arg, struct file_lock **lease) +static int +cifs_setlease(struct file *file, long arg, struct file_lock **lease, void **priv) { /* * Note that this is called by vfs setlease with i_lock held to @@ -829,7 +830,7 @@ static int cifs_setlease(struct file *file, long arg, struct file_lock **lease) if (arg == F_UNLCK || ((arg == F_RDLCK) && CIFS_CACHE_READ(CIFS_I(inode))) || ((arg == F_WRLCK) && CIFS_CACHE_WRITE(CIFS_I(inode)))) - return generic_setlease(file, arg, lease); + return generic_setlease(file, arg, lease, priv); else if (tlink_tcon(cfile->tlink)->local_lease && !CIFS_CACHE_READ(CIFS_I(inode))) /* @@ -840,7 +841,7 @@ static int cifs_setlease(struct file *file, long arg, struct file_lock **lease) * knows that the file won't be changed on the server by anyone * else. */ - return generic_setlease(file, arg, lease); + return generic_setlease(file, arg, lease, priv); else return -EAGAIN; } diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 25b8392bfdd2..02a33e529904 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -323,11 +323,11 @@ struct smb_version_operations { int (*async_writev)(struct cifs_writedata *, void (*release)(struct kref *)); /* sync read from the server */ - int (*sync_read)(const unsigned int, struct cifsFileInfo *, + int (*sync_read)(const unsigned int, struct cifs_fid *, struct cifs_io_parms *, unsigned int *, char **, int *); /* sync write to the server */ - int (*sync_write)(const unsigned int, struct cifsFileInfo *, + int (*sync_write)(const unsigned int, struct cifs_fid *, struct cifs_io_parms *, unsigned int *, struct kvec *, unsigned long); /* open dir, start readdir */ @@ -466,6 +466,7 @@ struct smb_vol { bool direct_io:1; bool strict_io:1; /* strict cache behavior */ bool remap:1; /* set to remap seven reserved chars in filenames */ + bool sfu_remap:1; /* remap seven reserved chars ala SFU */ bool posix_paths:1; /* unset to not ask for posix pathnames. */ bool no_linux_ext:1; bool sfu_emul:1; @@ -499,6 +500,7 @@ struct smb_vol { #define CIFS_MOUNT_MASK (CIFS_MOUNT_NO_PERM | CIFS_MOUNT_SET_UID | \ CIFS_MOUNT_SERVER_INUM | CIFS_MOUNT_DIRECT_IO | \ CIFS_MOUNT_NO_XATTR | CIFS_MOUNT_MAP_SPECIAL_CHR | \ + CIFS_MOUNT_MAP_SFM_CHR | \ CIFS_MOUNT_UNX_EMUL | CIFS_MOUNT_NO_BRL | \ CIFS_MOUNT_CIFS_ACL | CIFS_MOUNT_OVERR_UID | \ CIFS_MOUNT_OVERR_GID | CIFS_MOUNT_DYNPERM | \ diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index 66f65001a6d8..61d00a6e398f 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -867,7 +867,7 @@ CIFSSMBDelFile(const unsigned int xid, struct cifs_tcon *tcon, const char *name, int rc = 0; int bytes_returned; int name_len; - int remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR; + int remap = cifs_remap(cifs_sb); DelFileRetry: rc = smb_init(SMB_COM_DELETE, 1, tcon, (void **) &pSMB, @@ -913,7 +913,7 @@ CIFSSMBRmDir(const unsigned int xid, struct cifs_tcon *tcon, const char *name, int rc = 0; int bytes_returned; int name_len; - int remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR; + int remap = cifs_remap(cifs_sb); cifs_dbg(FYI, "In CIFSSMBRmDir\n"); RmDirRetry: @@ -958,7 +958,7 @@ CIFSSMBMkDir(const unsigned int xid, struct cifs_tcon *tcon, const char *name, CREATE_DIRECTORY_RSP *pSMBr = NULL; int bytes_returned; int name_len; - int remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR; + int remap = cifs_remap(cifs_sb); cifs_dbg(FYI, "In CIFSSMBMkDir\n"); MkDirRetry: @@ -1280,7 +1280,7 @@ CIFS_open(const unsigned int xid, struct cifs_open_parms *oparms, int *oplock, __u16 count; struct cifs_sb_info *cifs_sb = oparms->cifs_sb; struct cifs_tcon *tcon = oparms->tcon; - int remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR; + int remap = cifs_remap(cifs_sb); const struct nls_table *nls = cifs_sb->local_nls; int create_options = oparms->create_options; int desired_access = oparms->desired_access; @@ -2572,7 +2572,7 @@ CIFSSMBRename(const unsigned int xid, struct cifs_tcon *tcon, int bytes_returned; int name_len, name_len2; __u16 count; - int remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR; + int remap = cifs_remap(cifs_sb); cifs_dbg(FYI, "In CIFSSMBRename\n"); renameRetry: @@ -2968,7 +2968,7 @@ CIFSCreateHardLink(const unsigned int xid, struct cifs_tcon *tcon, int bytes_returned; int name_len, name_len2; __u16 count; - int remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR; + int remap = cifs_remap(cifs_sb); cifs_dbg(FYI, "In CIFSCreateHardLink\n"); winCreateHardLinkRetry: @@ -4367,7 +4367,7 @@ findFirstRetry: return rc; nls_codepage = cifs_sb->local_nls; - remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR; + remap = cifs_remap(cifs_sb); if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { name_len = @@ -5527,7 +5527,7 @@ CIFSSMBSetEOF(const unsigned int xid, struct cifs_tcon *tcon, int name_len; int rc = 0; int bytes_returned = 0; - int remap = cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR; + int remap = cifs_remap(cifs_sb); __u16 params, byte_count, data_count, param_offset, offset; diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 36ca2045009b..24fa08d261fb 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -70,6 +70,7 @@ enum { Opt_forcegid, Opt_noforcegid, Opt_noblocksend, Opt_noautotune, Opt_hard, Opt_soft, Opt_perm, Opt_noperm, + Opt_mapposix, Opt_nomapposix, Opt_mapchars, Opt_nomapchars, Opt_sfu, Opt_nosfu, Opt_nodfs, Opt_posixpaths, Opt_noposixpaths, Opt_nounix, @@ -124,8 +125,10 @@ static const match_table_t cifs_mount_option_tokens = { { Opt_soft, "soft" }, { Opt_perm, "perm" }, { Opt_noperm, "noperm" }, - { Opt_mapchars, "mapchars" }, + { Opt_mapchars, "mapchars" }, /* SFU style */ { Opt_nomapchars, "nomapchars" }, + { Opt_mapposix, "mapposix" }, /* SFM style */ + { Opt_nomapposix, "nomapposix" }, { Opt_sfu, "sfu" }, { Opt_nosfu, "nosfu" }, { Opt_nodfs, "nodfs" }, @@ -1231,6 +1234,14 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, vol->linux_uid = current_uid(); vol->linux_gid = current_gid(); + /* + * default to SFM style remapping of seven reserved characters + * unless user overrides it or we negotiate CIFS POSIX where + * it is unnecessary. Can not simultaneously use more than one mapping + * since then readdir could list files that open could not open + */ + vol->remap = true; + /* default to only allowing write access to owner of the mount */ vol->dir_mode = vol->file_mode = S_IRUGO | S_IXUGO | S_IWUSR; @@ -1338,10 +1349,18 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, vol->noperm = 1; break; case Opt_mapchars: - vol->remap = 1; + vol->sfu_remap = true; + vol->remap = false; /* disable SFM mapping */ break; case Opt_nomapchars: - vol->remap = 0; + vol->sfu_remap = false; + break; + case Opt_mapposix: + vol->remap = true; + vol->sfu_remap = false; /* disable SFU mapping */ + break; + case Opt_nomapposix: + vol->remap = false; break; case Opt_sfu: vol->sfu_emul = 1; @@ -1718,7 +1737,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, goto cifs_parse_mount_err; } - if (strnicmp(string, "default", 7) != 0) { + if (strncasecmp(string, "default", 7) != 0) { vol->iocharset = kstrdup(string, GFP_KERNEL); if (!vol->iocharset) { @@ -1790,7 +1809,7 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, if (string == NULL) goto out_nomem; - if (strnicmp(string, "1", 1) == 0) { + if (strncasecmp(string, "1", 1) == 0) { /* This is the default */ break; } @@ -3197,6 +3216,8 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info, if (pvolume_info->server_ino) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_SERVER_INUM; if (pvolume_info->remap) + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_MAP_SFM_CHR; + if (pvolume_info->sfu_remap) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_MAP_SPECIAL_CHR; if (pvolume_info->no_xattr) cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_NO_XATTR; @@ -3239,10 +3260,20 @@ void cifs_setup_cifs_sb(struct smb_vol *pvolume_info, } if (pvolume_info->mfsymlinks) { if (pvolume_info->sfu_emul) { - cifs_dbg(VFS, "mount option mfsymlinks ignored if sfu mount option is used\n"); - } else { - cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_MF_SYMLINKS; + /* + * Our SFU ("Services for Unix" emulation does not allow + * creating symlinks but does allow reading existing SFU + * symlinks (it does allow both creating and reading SFU + * style mknod and FIFOs though). When "mfsymlinks" and + * "sfu" are both enabled at the same time, it allows + * reading both types of symlinks, but will only create + * them with mfsymlinks format. This allows better + * Apple compatibility (probably better for Samba too) + * while still recognizing old Windows style symlinks. + */ + cifs_dbg(VFS, "mount options mfsymlinks and sfu both enabled\n"); } + cifs_sb->mnt_cifs_flags |= CIFS_MOUNT_MF_SYMLINKS; } if ((pvolume_info->cifs_acl) && (pvolume_info->dynperm)) @@ -3330,8 +3361,7 @@ expand_dfs_referral(const unsigned int xid, struct cifs_ses *ses, ref_path = check_prefix ? full_path + 1 : volume_info->UNC + 1; rc = get_dfs_path(xid, ses, ref_path, cifs_sb->local_nls, - &num_referrals, &referrals, - cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + &num_referrals, &referrals, cifs_remap(cifs_sb)); if (!rc && num_referrals > 0) { char *fake_devname = NULL; diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 6cbd9c688cfe..b72bc29cba23 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -461,8 +461,8 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry, xid = get_xid(); - cifs_dbg(FYI, "parent inode = 0x%p name is: %s and dentry = 0x%p\n", - inode, direntry->d_name.name, direntry); + cifs_dbg(FYI, "parent inode = 0x%p name is: %pd and dentry = 0x%p\n", + inode, direntry, direntry); tlink = cifs_sb_tlink(CIFS_SB(inode->i_sb)); if (IS_ERR(tlink)) { @@ -540,8 +540,8 @@ int cifs_create(struct inode *inode, struct dentry *direntry, umode_t mode, struct cifs_fid fid; __u32 oplock; - cifs_dbg(FYI, "cifs_create parent inode = 0x%p name is: %s and dentry = 0x%p\n", - inode, direntry->d_name.name, direntry); + cifs_dbg(FYI, "cifs_create parent inode = 0x%p name is: %pd and dentry = 0x%p\n", + inode, direntry, direntry); tlink = cifs_sb_tlink(CIFS_SB(inode->i_sb)); rc = PTR_ERR(tlink); @@ -577,12 +577,13 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode, struct cifs_io_parms io_parms; char *full_path = NULL; struct inode *newinode = NULL; - int oplock = 0; + __u32 oplock = 0; struct cifs_fid fid; struct cifs_open_parms oparms; FILE_ALL_INFO *buf = NULL; unsigned int bytes_written; struct win_dev *pdev; + struct kvec iov[2]; if (!old_valid_dev(device_number)) return -EINVAL; @@ -658,7 +659,11 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode, oparms.fid = &fid; oparms.reconnect = false; - rc = CIFS_open(xid, &oparms, &oplock, buf); + if (tcon->ses->server->oplocks) + oplock = REQ_OPLOCK; + else + oplock = 0; + rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, buf); if (rc) goto mknod_out; @@ -668,25 +673,26 @@ int cifs_mknod(struct inode *inode, struct dentry *direntry, umode_t mode, */ pdev = (struct win_dev *)buf; - io_parms.netfid = fid.netfid; io_parms.pid = current->tgid; io_parms.tcon = tcon; io_parms.offset = 0; io_parms.length = sizeof(struct win_dev); + iov[1].iov_base = buf; + iov[1].iov_len = sizeof(struct win_dev); if (S_ISCHR(mode)) { memcpy(pdev->type, "IntxCHR", 8); pdev->major = cpu_to_le64(MAJOR(device_number)); pdev->minor = cpu_to_le64(MINOR(device_number)); - rc = CIFSSMBWrite(xid, &io_parms, &bytes_written, (char *)pdev, - NULL, 0); + rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms, + &bytes_written, iov, 1); } else if (S_ISBLK(mode)) { memcpy(pdev->type, "IntxBLK", 8); pdev->major = cpu_to_le64(MAJOR(device_number)); pdev->minor = cpu_to_le64(MINOR(device_number)); - rc = CIFSSMBWrite(xid, &io_parms, &bytes_written, (char *)pdev, - NULL, 0); + rc = tcon->ses->server->ops->sync_write(xid, &fid, &io_parms, + &bytes_written, iov, 1); } /* else if (S_ISFIFO) */ - CIFSSMBClose(xid, tcon, fid.netfid); + tcon->ses->server->ops->close(xid, tcon, &fid); d_drop(direntry); /* FIXME: add code here to set EAs */ @@ -713,8 +719,8 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, xid = get_xid(); - cifs_dbg(FYI, "parent inode = 0x%p name is: %s and dentry = 0x%p\n", - parent_dir_inode, direntry->d_name.name, direntry); + cifs_dbg(FYI, "parent inode = 0x%p name is: %pd and dentry = 0x%p\n", + parent_dir_inode, direntry, direntry); /* check whether path exists */ @@ -833,7 +839,7 @@ cifs_d_revalidate(struct dentry *direntry, unsigned int flags) { int rc = 0; - cifs_dbg(FYI, "In cifs d_delete, name = %s\n", direntry->d_name.name); + cifs_dbg(FYI, "In cifs d_delete, name = %pd\n", direntry); return rc; } */ diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 5f29354b072a..3e4d00a06c44 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -1650,8 +1650,8 @@ cifs_write(struct cifsFileInfo *open_file, __u32 pid, const char *write_data, cifs_sb = CIFS_SB(dentry->d_sb); - cifs_dbg(FYI, "write %zd bytes to offset %lld of %s\n", - write_size, *offset, dentry->d_name.name); + cifs_dbg(FYI, "write %zd bytes to offset %lld of %pd\n", + write_size, *offset, dentry); tcon = tlink_tcon(open_file->tlink); server = tcon->ses->server; @@ -1687,8 +1687,8 @@ cifs_write(struct cifsFileInfo *open_file, __u32 pid, const char *write_data, io_parms.tcon = tcon; io_parms.offset = *offset; io_parms.length = len; - rc = server->ops->sync_write(xid, open_file, &io_parms, - &bytes_written, iov, 1); + rc = server->ops->sync_write(xid, &open_file->fid, + &io_parms, &bytes_written, iov, 1); } if (rc || (bytes_written == 0)) { if (total_written) @@ -2273,8 +2273,8 @@ int cifs_strict_fsync(struct file *file, loff_t start, loff_t end, xid = get_xid(); - cifs_dbg(FYI, "Sync file - name: %s datasync: 0x%x\n", - file->f_path.dentry->d_name.name, datasync); + cifs_dbg(FYI, "Sync file - name: %pD datasync: 0x%x\n", + file, datasync); if (!CIFS_CACHE_READ(CIFS_I(inode))) { rc = cifs_zap_mapping(inode); @@ -2315,8 +2315,8 @@ int cifs_fsync(struct file *file, loff_t start, loff_t end, int datasync) xid = get_xid(); - cifs_dbg(FYI, "Sync file - name: %s datasync: 0x%x\n", - file->f_path.dentry->d_name.name, datasync); + cifs_dbg(FYI, "Sync file - name: %pD datasync: 0x%x\n", + file, datasync); tcon = tlink_tcon(smbfile->tlink); if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NOSSYNC)) { @@ -3206,7 +3206,7 @@ cifs_read(struct file *file, char *read_data, size_t read_size, loff_t *offset) io_parms.tcon = tcon; io_parms.offset = *offset; io_parms.length = current_read_size; - rc = server->ops->sync_read(xid, open_file, &io_parms, + rc = server->ops->sync_read(xid, &open_file->fid, &io_parms, &bytes_read, &cur_offset, &buf_type); } while (rc == -EAGAIN); diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 7899a40465b3..197cb503d528 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -30,6 +30,7 @@ #include "cifsproto.h" #include "cifs_debug.h" #include "cifs_fs_sb.h" +#include "cifs_unicode.h" #include "fscache.h" @@ -412,7 +413,7 @@ cifs_sfu_type(struct cifs_fattr *fattr, const char *path, struct cifs_sb_info *cifs_sb, unsigned int xid) { int rc; - int oplock = 0; + __u32 oplock; struct tcon_link *tlink; struct cifs_tcon *tcon; struct cifs_fid fid; @@ -451,8 +452,13 @@ cifs_sfu_type(struct cifs_fattr *fattr, const char *path, oparms.fid = &fid; oparms.reconnect = false; - rc = CIFS_open(xid, &oparms, &oplock, NULL); + if (tcon->ses->server->oplocks) + oplock = REQ_OPLOCK; + else + oplock = 0; + rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, NULL); if (rc) { + cifs_dbg(FYI, "check sfu type of %s, open rc = %d\n", path, rc); cifs_put_tlink(tlink); return rc; } @@ -464,7 +470,8 @@ cifs_sfu_type(struct cifs_fattr *fattr, const char *path, io_parms.offset = 0; io_parms.length = 24; - rc = CIFSSMBRead(xid, &io_parms, &bytes_read, &pbuf, &buf_type); + rc = tcon->ses->server->ops->sync_read(xid, &fid, &io_parms, + &bytes_read, &pbuf, &buf_type); if ((rc == 0) && (bytes_read >= 8)) { if (memcmp("IntxBLK", pbuf, 8) == 0) { cifs_dbg(FYI, "Block device\n"); @@ -504,7 +511,8 @@ cifs_sfu_type(struct cifs_fattr *fattr, const char *path, fattr->cf_dtype = DT_REG; rc = -EOPNOTSUPP; /* or some unknown SFU type */ } - CIFSSMBClose(xid, tcon, fid.netfid); + + tcon->ses->server->ops->close(xid, tcon, &fid); cifs_put_tlink(tlink); return rc; } @@ -539,7 +547,7 @@ static int cifs_sfu_mode(struct cifs_fattr *fattr, const unsigned char *path, rc = tcon->ses->server->ops->query_all_EAs(xid, tcon, path, "SETFILEBITS", ea_value, 4 /* size of buf */, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + cifs_remap(cifs_sb)); cifs_put_tlink(tlink); if (rc < 0) return (int)rc; @@ -952,11 +960,18 @@ struct inode *cifs_root_iget(struct super_block *sb) struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); xid = get_xid(); - if (tcon->unix_ext) + if (tcon->unix_ext) { rc = cifs_get_inode_info_unix(&inode, "", sb, xid); - else - rc = cifs_get_inode_info(&inode, "", NULL, sb, xid, NULL); + /* some servers mistakenly claim POSIX support */ + if (rc != -EOPNOTSUPP) + goto iget_no_retry; + cifs_dbg(VFS, "server does not support POSIX extensions"); + tcon->unix_ext = false; + } + + rc = cifs_get_inode_info(&inode, "", NULL, sb, xid, NULL); +iget_no_retry: if (!inode) { inode = ERR_PTR(rc); goto out; @@ -1117,8 +1132,7 @@ cifs_rename_pending_delete(const char *full_path, struct dentry *dentry, /* rename the file */ rc = CIFSSMBRenameOpenFile(xid, tcon, fid.netfid, NULL, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + cifs_remap(cifs_sb)); if (rc != 0) { rc = -EBUSY; goto undo_setattr; @@ -1159,8 +1173,7 @@ out: */ undo_rename: CIFSSMBRenameOpenFile(xid, tcon, fid.netfid, dentry->d_name.name, - cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + cifs_sb->local_nls, cifs_remap(cifs_sb)); undo_setattr: if (dosattr != origattr) { info_buf->Attributes = cpu_to_le32(origattr); @@ -1226,7 +1239,7 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry) le64_to_cpu(tcon->fsUnixInfo.Capability))) { rc = CIFSPOSIXDelFile(xid, tcon, full_path, SMB_POSIX_UNLINK_FILE_TARGET, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + cifs_remap(cifs_sb)); cifs_dbg(FYI, "posix del rc %d\n", rc); if ((rc == 0) || (rc == -ENOENT)) goto psx_del_no_retry; @@ -1349,8 +1362,7 @@ cifs_mkdir_qinfo(struct inode *parent, struct dentry *dentry, umode_t mode, } CIFSSMBUnixSetPathInfo(xid, tcon, full_path, &args, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + cifs_remap(cifs_sb)); } else { struct TCP_Server_Info *server = tcon->ses->server; if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_CIFS_ACL) && @@ -1392,8 +1404,7 @@ cifs_posix_mkdir(struct inode *inode, struct dentry *dentry, umode_t mode, mode &= ~current_umask(); rc = CIFSPOSIXCreate(xid, tcon, SMB_O_DIRECTORY | SMB_O_CREAT, mode, NULL /* netfid */, info, &oplock, full_path, - cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + cifs_sb->local_nls, cifs_remap(cifs_sb)); if (rc == -EOPNOTSUPP) goto posix_mkdir_out; else if (rc) { @@ -1419,8 +1430,8 @@ cifs_posix_mkdir(struct inode *inode, struct dentry *dentry, umode_t mode, d_instantiate(dentry, newinode); #ifdef CONFIG_CIFS_DEBUG2 - cifs_dbg(FYI, "instantiated dentry %p %s to inode %p\n", - dentry, dentry->d_name.name, newinode); + cifs_dbg(FYI, "instantiated dentry %p %pd to inode %p\n", + dentry, dentry, newinode); if (newinode->i_nlink != 2) cifs_dbg(FYI, "unexpected number of links %d\n", @@ -1617,8 +1628,7 @@ cifs_do_rename(const unsigned int xid, struct dentry *from_dentry, if (rc == 0) { rc = CIFSSMBRenameOpenFile(xid, tcon, fid.netfid, (const char *) to_dentry->d_name.name, - cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + cifs_sb->local_nls, cifs_remap(cifs_sb)); CIFSSMBClose(xid, tcon, fid.netfid); } do_rename_exit: @@ -1694,16 +1704,14 @@ cifs_rename2(struct inode *source_dir, struct dentry *source_dentry, tmprc = CIFSSMBUnixQPathInfo(xid, tcon, from_name, info_buf_source, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + cifs_remap(cifs_sb)); if (tmprc != 0) goto unlink_target; tmprc = CIFSSMBUnixQPathInfo(xid, tcon, to_name, info_buf_target, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + cifs_remap(cifs_sb)); if (tmprc == 0 && (info_buf_source->UniqueId == info_buf_target->UniqueId)) { @@ -2068,8 +2076,7 @@ cifs_set_file_size(struct inode *inode, struct iattr *attrs, rc = SMBLegacyOpen(xid, tcon, full_path, FILE_OPEN, GENERIC_WRITE, CREATE_NOT_DIR, &netfid, &oplock, NULL, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + cifs_remap(cifs_sb)); if (rc == 0) { unsigned int bytes_written; @@ -2111,8 +2118,8 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs) struct cifs_unix_set_info_args *args = NULL; struct cifsFileInfo *open_file; - cifs_dbg(FYI, "setattr_unix on file %s attrs->ia_valid=0x%x\n", - direntry->d_name.name, attrs->ia_valid); + cifs_dbg(FYI, "setattr_unix on file %pd attrs->ia_valid=0x%x\n", + direntry, attrs->ia_valid); xid = get_xid(); @@ -2254,8 +2261,8 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) xid = get_xid(); - cifs_dbg(FYI, "setattr on file %s attrs->iavalid 0x%x\n", - direntry->d_name.name, attrs->ia_valid); + cifs_dbg(FYI, "setattr on file %pd attrs->iavalid 0x%x\n", + direntry, attrs->ia_valid); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_PERM) attrs->ia_valid |= ATTR_FORCE; diff --git a/fs/cifs/link.c b/fs/cifs/link.c index 5657416d3483..2ec6037f61c7 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -28,6 +28,10 @@ #include "cifsproto.h" #include "cifs_debug.h" #include "cifs_fs_sb.h" +#include "cifs_unicode.h" +#ifdef CONFIG_CIFS_SMB2 +#include "smb2proto.h" +#endif /* * M-F Symlink Functions - Begin @@ -401,6 +405,134 @@ cifs_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, } /* + * SMB 2.1/SMB3 Protocol specific functions + */ +#ifdef CONFIG_CIFS_SMB2 +int +smb3_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, const unsigned char *path, + char *pbuf, unsigned int *pbytes_read) +{ + int rc; + struct cifs_fid fid; + struct cifs_open_parms oparms; + struct cifs_io_parms io_parms; + int buf_type = CIFS_NO_BUFFER; + __le16 *utf16_path; + __u8 oplock = SMB2_OPLOCK_LEVEL_II; + struct smb2_file_all_info *pfile_info = NULL; + + oparms.tcon = tcon; + oparms.cifs_sb = cifs_sb; + oparms.desired_access = GENERIC_READ; + oparms.create_options = CREATE_NOT_DIR; + if (backup_cred(cifs_sb)) + oparms.create_options |= CREATE_OPEN_BACKUP_INTENT; + oparms.disposition = FILE_OPEN; + oparms.fid = &fid; + oparms.reconnect = false; + + utf16_path = cifs_convert_path_to_utf16(path, cifs_sb); + if (utf16_path == NULL) + return -ENOMEM; + + pfile_info = kzalloc(sizeof(struct smb2_file_all_info) + PATH_MAX * 2, + GFP_KERNEL); + + if (pfile_info == NULL) { + kfree(utf16_path); + return -ENOMEM; + } + + rc = SMB2_open(xid, &oparms, utf16_path, &oplock, pfile_info, NULL); + if (rc) + goto qmf_out_open_fail; + + if (pfile_info->EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE)) { + /* it's not a symlink */ + rc = -ENOENT; /* Is there a better rc to return? */ + goto qmf_out; + } + + io_parms.netfid = fid.netfid; + io_parms.pid = current->tgid; + io_parms.tcon = tcon; + io_parms.offset = 0; + io_parms.length = CIFS_MF_SYMLINK_FILE_SIZE; + io_parms.persistent_fid = fid.persistent_fid; + io_parms.volatile_fid = fid.volatile_fid; + rc = SMB2_read(xid, &io_parms, pbytes_read, &pbuf, &buf_type); +qmf_out: + SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid); +qmf_out_open_fail: + kfree(utf16_path); + kfree(pfile_info); + return rc; +} + +int +smb3_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, const unsigned char *path, + char *pbuf, unsigned int *pbytes_written) +{ + int rc; + struct cifs_fid fid; + struct cifs_open_parms oparms; + struct cifs_io_parms io_parms; + int create_options = CREATE_NOT_DIR; + __le16 *utf16_path; + __u8 oplock = SMB2_OPLOCK_LEVEL_EXCLUSIVE; + struct kvec iov[2]; + + if (backup_cred(cifs_sb)) + create_options |= CREATE_OPEN_BACKUP_INTENT; + + cifs_dbg(FYI, "%s: path: %s\n", __func__, path); + + utf16_path = cifs_convert_path_to_utf16(path, cifs_sb); + if (!utf16_path) + return -ENOMEM; + + oparms.tcon = tcon; + oparms.cifs_sb = cifs_sb; + oparms.desired_access = GENERIC_WRITE; + oparms.create_options = create_options; + oparms.disposition = FILE_CREATE; + oparms.fid = &fid; + oparms.reconnect = false; + + rc = SMB2_open(xid, &oparms, utf16_path, &oplock, NULL, NULL); + if (rc) { + kfree(utf16_path); + return rc; + } + + io_parms.netfid = fid.netfid; + io_parms.pid = current->tgid; + io_parms.tcon = tcon; + io_parms.offset = 0; + io_parms.length = CIFS_MF_SYMLINK_FILE_SIZE; + io_parms.persistent_fid = fid.persistent_fid; + io_parms.volatile_fid = fid.volatile_fid; + + /* iov[0] is reserved for smb header */ + iov[1].iov_base = pbuf; + iov[1].iov_len = CIFS_MF_SYMLINK_FILE_SIZE; + + rc = SMB2_write(xid, &io_parms, pbytes_written, iov, 1); + + /* Make sure we wrote all of the symlink data */ + if ((rc == 0) && (*pbytes_written != CIFS_MF_SYMLINK_FILE_SIZE)) + rc = -EIO; + + SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid); + + kfree(utf16_path); + return rc; +} +#endif /* CONFIG_CIFS_SMB2 */ + +/* * M-F Symlink Functions - End */ @@ -435,8 +567,7 @@ cifs_hardlink(struct dentry *old_file, struct inode *inode, if (tcon->unix_ext) rc = CIFSUnixCreateHardLink(xid, tcon, from_name, to_name, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + cifs_remap(cifs_sb)); else { server = tcon->ses->server; if (!server->ops->create_hardlink) { @@ -461,11 +592,7 @@ cifs_hardlink(struct dentry *old_file, struct inode *inode, spin_lock(&old_file->d_inode->i_lock); inc_nlink(old_file->d_inode); spin_unlock(&old_file->d_inode->i_lock); - /* - * BB should we make this contingent on superblock flag - * NOATIME? - */ - /* old_file->d_inode->i_ctime = CURRENT_TIME; */ + /* * parent dir timestamps will update from srv within a * second, would it really be worth it to set the parent @@ -475,7 +602,9 @@ cifs_hardlink(struct dentry *old_file, struct inode *inode, } /* * if not oplocked will force revalidate to get info on source - * file from srv + * file from srv. Note Samba server prior to 4.2 has bug - + * not updating src file ctime on hardlinks but Windows servers + * handle it properly */ cifsInode->time = 0; diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index b334a89d6a66..8fd2a95860ba 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -87,8 +87,6 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name, return; if (dentry) { - int err; - inode = dentry->d_inode; if (inode) { /* @@ -105,10 +103,8 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name, goto out; } } - err = d_invalidate(dentry); + d_invalidate(dentry); dput(dentry); - if (err) - return; } /* @@ -243,7 +239,7 @@ int get_symlink_reparse_path(char *full_path, struct cifs_sb_info *cifs_sb, rc = CIFSSMBOpen(xid, ptcon, full_path, FILE_OPEN, GENERIC_READ, OPEN_REPARSE_POINT, &fid, &oplock, NULL, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR); + cifs_remap(cifs_sb); if (!rc) { tmpbuffer = kmalloc(maxpath); rc = CIFSSMBQueryReparseLinkInfo(xid, ptcon, full_path, @@ -708,15 +704,15 @@ static int cifs_filldir(char *find_entry, struct file *file, if (file_info->srch_inf.unicode) { struct nls_table *nlt = cifs_sb->local_nls; + int map_type; + map_type = cifs_remap(cifs_sb); name.name = scratch_buf; name.len = cifs_from_utf16((char *)name.name, (__le16 *)de.name, UNICODE_NAME_MAX, min_t(size_t, de.namelen, - (size_t)max_len), nlt, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + (size_t)max_len), nlt, map_type); name.len -= nls_nullsize(nlt); } else { name.name = de.name; diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index 52131d8cb4d5..d2979036a4c7 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -23,6 +23,7 @@ #include "cifsproto.h" #include "cifs_debug.h" #include "cifspdu.h" +#include "cifs_unicode.h" /* * An NT cancel request header looks just like the original request except: @@ -530,13 +531,11 @@ cifs_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon, rc = CIFSSMBQPathInfo(xid, tcon, full_path, file_info, 0 /* not legacy */, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + cifs_remap(cifs_sb)); if (rc == -EOPNOTSUPP || rc == -EINVAL) rc = SMBQueryInformation(xid, tcon, full_path, file_info, - cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + cifs_sb->local_nls, cifs_remap(cifs_sb)); kfree(file_info); return rc; } @@ -552,8 +551,7 @@ cifs_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, /* could do find first instead but this returns more info */ rc = CIFSSMBQPathInfo(xid, tcon, full_path, data, 0 /* not legacy */, - cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + cifs_sb->local_nls, cifs_remap(cifs_sb)); /* * BB optimize code so we do not make the above call when server claims * no NT SMB support and the above call failed at least once - set flag @@ -562,8 +560,7 @@ cifs_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, if ((rc == -EOPNOTSUPP) || (rc == -EINVAL)) { rc = SMBQueryInformation(xid, tcon, full_path, data, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + cifs_remap(cifs_sb)); *adjustTZ = true; } @@ -611,8 +608,7 @@ cifs_get_srv_inum(const unsigned int xid, struct cifs_tcon *tcon, */ return CIFSGetSrvInodeNumber(xid, tcon, full_path, uniqueid, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + cifs_remap(cifs_sb)); } static int @@ -703,8 +699,7 @@ cifs_mkdir_setinfo(struct inode *inode, const char *full_path, dosattrs = cifsInode->cifsAttrs|ATTR_READONLY; info.Attributes = cpu_to_le32(dosattrs); rc = CIFSSMBSetPathInfo(xid, tcon, full_path, &info, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + cifs_remap(cifs_sb)); if (rc == 0) cifsInode->cifsAttrs = dosattrs; } @@ -720,8 +715,7 @@ cifs_open_file(const unsigned int xid, struct cifs_open_parms *oparms, oparms->create_options, &oparms->fid->netfid, oplock, buf, oparms->cifs_sb->local_nls, - oparms->cifs_sb->mnt_cifs_flags - & CIFS_MOUNT_MAP_SPECIAL_CHR); + cifs_remap(oparms->cifs_sb)); return CIFS_open(xid, oparms, oplock, buf); } @@ -749,21 +743,21 @@ cifs_flush_file(const unsigned int xid, struct cifs_tcon *tcon, } static int -cifs_sync_read(const unsigned int xid, struct cifsFileInfo *cfile, +cifs_sync_read(const unsigned int xid, struct cifs_fid *pfid, struct cifs_io_parms *parms, unsigned int *bytes_read, char **buf, int *buf_type) { - parms->netfid = cfile->fid.netfid; + parms->netfid = pfid->netfid; return CIFSSMBRead(xid, parms, bytes_read, buf, buf_type); } static int -cifs_sync_write(const unsigned int xid, struct cifsFileInfo *cfile, +cifs_sync_write(const unsigned int xid, struct cifs_fid *pfid, struct cifs_io_parms *parms, unsigned int *written, struct kvec *iov, unsigned long nr_segs) { - parms->netfid = cfile->fid.netfid; + parms->netfid = pfid->netfid; return CIFSSMBWrite2(xid, parms, written, iov, nr_segs); } @@ -800,8 +794,7 @@ smb_set_file_info(struct inode *inode, const char *full_path, tcon = tlink_tcon(tlink); rc = CIFSSMBSetPathInfo(xid, tcon, full_path, buf, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + cifs_remap(cifs_sb)); if (rc == 0) { cinode->cifsAttrs = le32_to_cpu(buf->Attributes); goto out; diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index 4aa7a0f07d6e..1a08a34838fc 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -379,6 +379,14 @@ cifs_convert_path_to_utf16(const char *from, struct cifs_sb_info *cifs_sb) int len; const char *start_of_path; __le16 *to; + int map_type; + + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SFM_CHR) + map_type = SFM_MAP_UNI_RSVD; + else if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MAP_SPECIAL_CHR) + map_type = SFU_MAP_UNI_RSVD; + else + map_type = NO_MAP_UNI_RSVD; /* Windows doesn't allow paths beginning with \ */ if (from[0] == '\\') @@ -386,9 +394,7 @@ cifs_convert_path_to_utf16(const char *from, struct cifs_sb_info *cifs_sb) else start_of_path = from; to = cifs_strndup_to_utf16(start_of_path, PATH_MAX, &len, - cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + cifs_sb->local_nls, map_type); return to; } diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index f522193b7184..c5f521bcdee2 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -265,15 +265,18 @@ SMB3_request_interfaces(const unsigned int xid, struct cifs_tcon *tcon) FSCTL_QUERY_NETWORK_INTERFACE_INFO, true /* is_fsctl */, NULL /* no data input */, 0 /* no data input */, (char **)&out_buf, &ret_data_len); - - if ((rc == 0) && (ret_data_len > 0)) { + if (rc != 0) + cifs_dbg(VFS, "error %d on ioctl to get interface list\n", rc); + else if (ret_data_len < sizeof(struct network_interface_info_ioctl_rsp)) { + cifs_dbg(VFS, "server returned bad net interface info buf\n"); + rc = -EINVAL; + } else { /* Dump info on first interface */ cifs_dbg(FYI, "Adapter Capability 0x%x\t", le32_to_cpu(out_buf->Capability)); cifs_dbg(FYI, "Link Speed %lld\n", le64_to_cpu(out_buf->LinkSpeed)); - } else - cifs_dbg(VFS, "error %d on ioctl to get interface list\n", rc); + } return rc; } @@ -711,23 +714,23 @@ smb2_read_data_length(char *buf) static int -smb2_sync_read(const unsigned int xid, struct cifsFileInfo *cfile, +smb2_sync_read(const unsigned int xid, struct cifs_fid *pfid, struct cifs_io_parms *parms, unsigned int *bytes_read, char **buf, int *buf_type) { - parms->persistent_fid = cfile->fid.persistent_fid; - parms->volatile_fid = cfile->fid.volatile_fid; + parms->persistent_fid = pfid->persistent_fid; + parms->volatile_fid = pfid->volatile_fid; return SMB2_read(xid, parms, bytes_read, buf, buf_type); } static int -smb2_sync_write(const unsigned int xid, struct cifsFileInfo *cfile, +smb2_sync_write(const unsigned int xid, struct cifs_fid *pfid, struct cifs_io_parms *parms, unsigned int *written, struct kvec *iov, unsigned long nr_segs) { - parms->persistent_fid = cfile->fid.persistent_fid; - parms->volatile_fid = cfile->fid.volatile_fid; + parms->persistent_fid = pfid->persistent_fid; + parms->volatile_fid = pfid->volatile_fid; return SMB2_write(xid, parms, written, iov, nr_segs); } @@ -1452,6 +1455,8 @@ struct smb_version_operations smb21_operations = { .rename = smb2_rename_path, .create_hardlink = smb2_create_hardlink, .query_symlink = smb2_query_symlink, + .query_mf_symlink = smb3_query_mf_symlink, + .create_mf_symlink = smb3_create_mf_symlink, .open = smb2_open_file, .set_fid = smb2_set_fid, .close = smb2_close_file, @@ -1531,6 +1536,8 @@ struct smb_version_operations smb30_operations = { .rename = smb2_rename_path, .create_hardlink = smb2_create_hardlink, .query_symlink = smb2_query_symlink, + .query_mf_symlink = smb3_query_mf_symlink, + .create_mf_symlink = smb3_create_mf_symlink, .open = smb2_open_file, .set_fid = smb2_set_fid, .close = smb2_close_file, diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 74b3a6684383..8f1672bb82d5 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -1098,6 +1098,8 @@ SMB2_open(const unsigned int xid, struct cifs_open_parms *oparms, __le16 *path, if (oparms->create_options & CREATE_OPTION_READONLY) file_attributes |= ATTR_READONLY; + if (oparms->create_options & CREATE_OPTION_SPECIAL) + file_attributes |= ATTR_SYSTEM; req->ImpersonationLevel = IL_IMPERSONATION; req->DesiredAccess = cpu_to_le32(oparms->desired_access); diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index fbe486c285a9..e3188abdafd0 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -352,6 +352,8 @@ struct smb2_tree_disconnect_rsp { #define FILE_ATTRIBUTE_OFFLINE 0x00001000 #define FILE_ATTRIBUTE_NOT_CONTENT_INDEXED 0x00002000 #define FILE_ATTRIBUTE_ENCRYPTED 0x00004000 +#define FILE_ATTRIBUTE_INTEGRITY_STREAM 0x00008000 +#define FILE_ATTRIBUTE_NO_SCRUB_DATA 0x00020000 /* Oplock levels */ #define SMB2_OPLOCK_LEVEL_NONE 0x00 diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index 67e8ce8055de..79dc650c18b2 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -82,7 +82,13 @@ extern int smb2_rename_path(const unsigned int xid, struct cifs_tcon *tcon, extern int smb2_create_hardlink(const unsigned int xid, struct cifs_tcon *tcon, const char *from_name, const char *to_name, struct cifs_sb_info *cifs_sb); - +extern int smb3_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, const unsigned char *path, + char *pbuf, unsigned int *pbytes_written); +extern int smb3_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, + struct cifs_sb_info *cifs_sb, + const unsigned char *path, char *pbuf, + unsigned int *pbytes_read); extern int smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms, __u32 *oplock, FILE_ALL_INFO *buf); diff --git a/fs/cifs/smbencrypt.c b/fs/cifs/smbencrypt.c index 43eb1367b103..6c1566366a66 100644 --- a/fs/cifs/smbencrypt.c +++ b/fs/cifs/smbencrypt.c @@ -29,6 +29,7 @@ #include <linux/string.h> #include <linux/kernel.h> #include <linux/random.h> +#include "cifs_fs_sb.h" #include "cifs_unicode.h" #include "cifspdu.h" #include "cifsglob.h" diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index 5ac836a86b18..72a4d10653d6 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -28,6 +28,8 @@ #include "cifsglob.h" #include "cifsproto.h" #include "cifs_debug.h" +#include "cifs_fs_sb.h" +#include "cifs_unicode.h" #define MAX_EA_VALUE_SIZE 65535 #define CIFS_XATTR_DOS_ATTRIB "user.DosAttrib" @@ -85,8 +87,7 @@ int cifs_removexattr(struct dentry *direntry, const char *ea_name) if (pTcon->ses->server->ops->set_EA) rc = pTcon->ses->server->ops->set_EA(xid, pTcon, full_path, ea_name, NULL, (__u16)0, - cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + cifs_sb->local_nls, cifs_remap(cifs_sb)); } remove_ea_exit: kfree(full_path); @@ -154,8 +155,7 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name, if (pTcon->ses->server->ops->set_EA) rc = pTcon->ses->server->ops->set_EA(xid, pTcon, full_path, ea_name, ea_value, (__u16)value_size, - cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + cifs_sb->local_nls, cifs_remap(cifs_sb)); } else if (strncmp(ea_name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) { if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) @@ -165,8 +165,7 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name, if (pTcon->ses->server->ops->set_EA) rc = pTcon->ses->server->ops->set_EA(xid, pTcon, full_path, ea_name, ea_value, (__u16)value_size, - cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + cifs_sb->local_nls, cifs_remap(cifs_sb)); } else if (strncmp(ea_name, CIFS_XATTR_CIFS_ACL, strlen(CIFS_XATTR_CIFS_ACL)) == 0) { #ifdef CONFIG_CIFS_ACL @@ -199,8 +198,7 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name, rc = CIFSSMBSetPosixACL(xid, pTcon, full_path, ea_value, (const int)value_size, ACL_TYPE_ACCESS, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + cifs_remap(cifs_sb)); cifs_dbg(FYI, "set POSIX ACL rc %d\n", rc); #else cifs_dbg(FYI, "set POSIX ACL not supported\n"); @@ -212,8 +210,7 @@ int cifs_setxattr(struct dentry *direntry, const char *ea_name, rc = CIFSSMBSetPosixACL(xid, pTcon, full_path, ea_value, (const int)value_size, ACL_TYPE_DEFAULT, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + cifs_remap(cifs_sb)); cifs_dbg(FYI, "set POSIX default ACL rc %d\n", rc); #else cifs_dbg(FYI, "set default POSIX ACL not supported\n"); @@ -285,8 +282,7 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name, if (pTcon->ses->server->ops->query_all_EAs) rc = pTcon->ses->server->ops->query_all_EAs(xid, pTcon, full_path, ea_name, ea_value, buf_size, - cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + cifs_sb->local_nls, cifs_remap(cifs_sb)); } else if (strncmp(ea_name, XATTR_OS2_PREFIX, XATTR_OS2_PREFIX_LEN) == 0) { if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) goto get_ea_exit; @@ -295,8 +291,7 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name, if (pTcon->ses->server->ops->query_all_EAs) rc = pTcon->ses->server->ops->query_all_EAs(xid, pTcon, full_path, ea_name, ea_value, buf_size, - cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + cifs_sb->local_nls, cifs_remap(cifs_sb)); } else if (strncmp(ea_name, POSIX_ACL_XATTR_ACCESS, strlen(POSIX_ACL_XATTR_ACCESS)) == 0) { #ifdef CONFIG_CIFS_POSIX @@ -304,8 +299,7 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name, rc = CIFSSMBGetPosixACL(xid, pTcon, full_path, ea_value, buf_size, ACL_TYPE_ACCESS, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + cifs_remap(cifs_sb)); #else cifs_dbg(FYI, "Query POSIX ACL not supported yet\n"); #endif /* CONFIG_CIFS_POSIX */ @@ -316,8 +310,7 @@ ssize_t cifs_getxattr(struct dentry *direntry, const char *ea_name, rc = CIFSSMBGetPosixACL(xid, pTcon, full_path, ea_value, buf_size, ACL_TYPE_DEFAULT, cifs_sb->local_nls, - cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + cifs_remap(cifs_sb)); #else cifs_dbg(FYI, "Query POSIX default ACL not supported yet\n"); #endif /* CONFIG_CIFS_POSIX */ @@ -421,8 +414,7 @@ ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size) if (pTcon->ses->server->ops->query_all_EAs) rc = pTcon->ses->server->ops->query_all_EAs(xid, pTcon, full_path, NULL, data, buf_size, - cifs_sb->local_nls, cifs_sb->mnt_cifs_flags & - CIFS_MOUNT_MAP_SPECIAL_CHR); + cifs_sb->local_nls, cifs_remap(cifs_sb)); list_ea_exit: kfree(full_path); free_xid(xid); diff --git a/fs/compat.c b/fs/compat.c index 66d3d3c6b4b2..b13df99f3534 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -794,25 +794,21 @@ COMPAT_SYSCALL_DEFINE5(mount, const char __user *, dev_name, char *kernel_type; unsigned long data_page; char *kernel_dev; - struct filename *dir; int retval; - retval = copy_mount_string(type, &kernel_type); - if (retval < 0) + kernel_type = copy_mount_string(type); + retval = PTR_ERR(kernel_type); + if (IS_ERR(kernel_type)) goto out; - dir = getname(dir_name); - retval = PTR_ERR(dir); - if (IS_ERR(dir)) + kernel_dev = copy_mount_string(dev_name); + retval = PTR_ERR(kernel_dev); + if (IS_ERR(kernel_dev)) goto out1; - retval = copy_mount_string(dev_name, &kernel_dev); - if (retval < 0) - goto out2; - retval = copy_mount_options(data, &data_page); if (retval < 0) - goto out3; + goto out2; retval = -EINVAL; @@ -821,19 +817,17 @@ COMPAT_SYSCALL_DEFINE5(mount, const char __user *, dev_name, do_ncp_super_data_conv((void *)data_page); } else if (!strcmp(kernel_type, NFS4_NAME)) { if (do_nfs4_super_data_conv((void *) data_page)) - goto out4; + goto out3; } } - retval = do_mount(kernel_dev, dir->name, kernel_type, + retval = do_mount(kernel_dev, dir_name, kernel_type, flags, (void*)data_page); - out4: - free_page(data_page); out3: - kfree(kernel_dev); + free_page(data_page); out2: - putname(dir); + kfree(kernel_dev); out1: kfree(kernel_type); out: diff --git a/fs/coredump.c b/fs/coredump.c index a93f7e6ea4cf..b5c86ffd5033 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -199,6 +199,14 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm) err = cn_printf(cn, "%d", task_tgid_nr(current)); break; + case 'i': + err = cn_printf(cn, "%d", + task_pid_vnr(current)); + break; + case 'I': + err = cn_printf(cn, "%d", + task_pid_nr(current)); + break; /* uid */ case 'u': err = cn_printf(cn, "%d", cred->uid); diff --git a/fs/dcache.c b/fs/dcache.c index cb25a1a5e307..3ffef7f4e5cd 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -235,18 +235,49 @@ static inline int dentry_cmp(const struct dentry *dentry, const unsigned char *c return dentry_string_cmp(cs, ct, tcount); } +struct external_name { + union { + atomic_t count; + struct rcu_head head; + } u; + unsigned char name[]; +}; + +static inline struct external_name *external_name(struct dentry *dentry) +{ + return container_of(dentry->d_name.name, struct external_name, name[0]); +} + static void __d_free(struct rcu_head *head) { struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu); WARN_ON(!hlist_unhashed(&dentry->d_alias)); - if (dname_external(dentry)) - kfree(dentry->d_name.name); kmem_cache_free(dentry_cache, dentry); } +static void __d_free_external(struct rcu_head *head) +{ + struct dentry *dentry = container_of(head, struct dentry, d_u.d_rcu); + WARN_ON(!hlist_unhashed(&dentry->d_alias)); + kfree(external_name(dentry)); + kmem_cache_free(dentry_cache, dentry); +} + +static inline int dname_external(const struct dentry *dentry) +{ + return dentry->d_name.name != dentry->d_iname; +} + static void dentry_free(struct dentry *dentry) { + if (unlikely(dname_external(dentry))) { + struct external_name *p = external_name(dentry); + if (likely(atomic_dec_and_test(&p->u.count))) { + call_rcu(&dentry->d_u.d_rcu, __d_free_external); + return; + } + } /* if dentry was never visible to RCU, immediate free is OK */ if (!(dentry->d_flags & DCACHE_RCUACCESS)) __d_free(&dentry->d_u.d_rcu); @@ -456,7 +487,7 @@ static void __dentry_kill(struct dentry *dentry) * inform the fs via d_prune that this dentry is about to be * unhashed and destroyed. */ - if ((dentry->d_flags & DCACHE_OP_PRUNE) && !d_unhashed(dentry)) + if (dentry->d_flags & DCACHE_OP_PRUNE) dentry->d_op->d_prune(dentry); if (dentry->d_flags & DCACHE_LRU_LIST) { @@ -619,62 +650,6 @@ kill_it: } EXPORT_SYMBOL(dput); -/** - * d_invalidate - invalidate a dentry - * @dentry: dentry to invalidate - * - * Try to invalidate the dentry if it turns out to be - * possible. If there are other dentries that can be - * reached through this one we can't delete it and we - * return -EBUSY. On success we return 0. - * - * no dcache lock. - */ - -int d_invalidate(struct dentry * dentry) -{ - /* - * If it's already been dropped, return OK. - */ - spin_lock(&dentry->d_lock); - if (d_unhashed(dentry)) { - spin_unlock(&dentry->d_lock); - return 0; - } - /* - * Check whether to do a partial shrink_dcache - * to get rid of unused child entries. - */ - if (!list_empty(&dentry->d_subdirs)) { - spin_unlock(&dentry->d_lock); - shrink_dcache_parent(dentry); - spin_lock(&dentry->d_lock); - } - - /* - * Somebody else still using it? - * - * If it's a directory, we can't drop it - * for fear of somebody re-populating it - * with children (even though dropping it - * would make it unreachable from the root, - * we might still populate it if it was a - * working directory or similar). - * We also need to leave mountpoints alone, - * directory or not. - */ - if (dentry->d_lockref.count > 1 && dentry->d_inode) { - if (S_ISDIR(dentry->d_inode->i_mode) || d_mountpoint(dentry)) { - spin_unlock(&dentry->d_lock); - return -EBUSY; - } - } - - __d_drop(dentry); - spin_unlock(&dentry->d_lock); - return 0; -} -EXPORT_SYMBOL(d_invalidate); /* This must be called with d_lock held */ static inline void __dget_dlock(struct dentry *dentry) @@ -735,7 +710,8 @@ EXPORT_SYMBOL(dget_parent); * acquire the reference to alias and return it. Otherwise return NULL. * Notice that if inode is a directory there can be only one alias and * it can be unhashed only if it has no children, or if it is the root - * of a filesystem. + * of a filesystem, or if the directory was renamed and d_revalidate + * was the first vfs operation to notice. * * If the inode has an IS_ROOT, DCACHE_DISCONNECTED alias, then prefer * any other hashed alias over that one. @@ -799,20 +775,13 @@ restart: hlist_for_each_entry(dentry, &inode->i_dentry, d_alias) { spin_lock(&dentry->d_lock); if (!dentry->d_lockref.count) { - /* - * inform the fs via d_prune that this dentry - * is about to be unhashed and destroyed. - */ - if ((dentry->d_flags & DCACHE_OP_PRUNE) && - !d_unhashed(dentry)) - dentry->d_op->d_prune(dentry); - - __dget_dlock(dentry); - __d_drop(dentry); - spin_unlock(&dentry->d_lock); - spin_unlock(&inode->i_lock); - dput(dentry); - goto restart; + struct dentry *parent = lock_parent(dentry); + if (likely(!dentry->d_lockref.count)) { + __dentry_kill(dentry); + goto restart; + } + if (parent) + spin_unlock(&parent->d_lock); } spin_unlock(&dentry->d_lock); } @@ -1193,7 +1162,7 @@ EXPORT_SYMBOL(have_submounts); * reachable (e.g. NFS can unhash a directory dentry and then the complete * subtree can become unreachable). * - * Only one of check_submounts_and_drop() and d_set_mounted() must succeed. For + * Only one of d_invalidate() and d_set_mounted() must succeed. For * this reason take rename_lock and d_lock on dentry and ancestors. */ int d_set_mounted(struct dentry *dentry) @@ -1202,7 +1171,7 @@ int d_set_mounted(struct dentry *dentry) int ret = -ENOENT; write_seqlock(&rename_lock); for (p = dentry->d_parent; !IS_ROOT(p); p = p->d_parent) { - /* Need exclusion wrt. check_submounts_and_drop() */ + /* Need exclusion wrt. d_invalidate() */ spin_lock(&p->d_lock); if (unlikely(d_unhashed(p))) { spin_unlock(&p->d_lock); @@ -1346,70 +1315,84 @@ void shrink_dcache_for_umount(struct super_block *sb) } } -static enum d_walk_ret check_and_collect(void *_data, struct dentry *dentry) +struct detach_data { + struct select_data select; + struct dentry *mountpoint; +}; +static enum d_walk_ret detach_and_collect(void *_data, struct dentry *dentry) { - struct select_data *data = _data; + struct detach_data *data = _data; if (d_mountpoint(dentry)) { - data->found = -EBUSY; + __dget_dlock(dentry); + data->mountpoint = dentry; return D_WALK_QUIT; } - return select_collect(_data, dentry); + return select_collect(&data->select, dentry); } static void check_and_drop(void *_data) { - struct select_data *data = _data; + struct detach_data *data = _data; - if (d_mountpoint(data->start)) - data->found = -EBUSY; - if (!data->found) - __d_drop(data->start); + if (!data->mountpoint && !data->select.found) + __d_drop(data->select.start); } /** - * check_submounts_and_drop - prune dcache, check for submounts and drop + * d_invalidate - detach submounts, prune dcache, and drop + * @dentry: dentry to invalidate (aka detach, prune and drop) * - * All done as a single atomic operation relative to has_unlinked_ancestor(). - * Returns 0 if successfully unhashed @parent. If there were submounts then - * return -EBUSY. + * no dcache lock. * - * @dentry: dentry to prune and drop + * The final d_drop is done as an atomic operation relative to + * rename_lock ensuring there are no races with d_set_mounted. This + * ensures there are no unhashed dentries on the path to a mountpoint. */ -int check_submounts_and_drop(struct dentry *dentry) +void d_invalidate(struct dentry *dentry) { - int ret = 0; + /* + * If it's already been dropped, return OK. + */ + spin_lock(&dentry->d_lock); + if (d_unhashed(dentry)) { + spin_unlock(&dentry->d_lock); + return; + } + spin_unlock(&dentry->d_lock); /* Negative dentries can be dropped without further checks */ if (!dentry->d_inode) { d_drop(dentry); - goto out; + return; } for (;;) { - struct select_data data; + struct detach_data data; - INIT_LIST_HEAD(&data.dispose); - data.start = dentry; - data.found = 0; + data.mountpoint = NULL; + INIT_LIST_HEAD(&data.select.dispose); + data.select.start = dentry; + data.select.found = 0; + + d_walk(dentry, &data, detach_and_collect, check_and_drop); - d_walk(dentry, &data, check_and_collect, check_and_drop); - ret = data.found; + if (data.select.found) + shrink_dentry_list(&data.select.dispose); - if (!list_empty(&data.dispose)) - shrink_dentry_list(&data.dispose); + if (data.mountpoint) { + detach_mounts(data.mountpoint); + dput(data.mountpoint); + } - if (ret <= 0) + if (!data.mountpoint && !data.select.found) break; cond_resched(); } - -out: - return ret; } -EXPORT_SYMBOL(check_submounts_and_drop); +EXPORT_SYMBOL(d_invalidate); /** * __d_alloc - allocate a dcache entry @@ -1438,11 +1421,14 @@ struct dentry *__d_alloc(struct super_block *sb, const struct qstr *name) */ dentry->d_iname[DNAME_INLINE_LEN-1] = 0; if (name->len > DNAME_INLINE_LEN-1) { - dname = kmalloc(name->len + 1, GFP_KERNEL); - if (!dname) { + size_t size = offsetof(struct external_name, name[1]); + struct external_name *p = kmalloc(size + name->len, GFP_KERNEL); + if (!p) { kmem_cache_free(dentry_cache, dentry); return NULL; } + atomic_set(&p->u.count, 1); + dname = p->name; } else { dname = dentry->d_iname; } @@ -2112,10 +2098,10 @@ struct dentry *d_lookup(const struct dentry *parent, const struct qstr *name) struct dentry *dentry; unsigned seq; - do { - seq = read_seqbegin(&rename_lock); - dentry = __d_lookup(parent, name); - if (dentry) + do { + seq = read_seqbegin(&rename_lock); + dentry = __d_lookup(parent, name); + if (dentry) break; } while (read_seqretry(&rename_lock, seq)); return dentry; @@ -2372,11 +2358,10 @@ void dentry_update_name_case(struct dentry *dentry, struct qstr *name) } EXPORT_SYMBOL(dentry_update_name_case); -static void switch_names(struct dentry *dentry, struct dentry *target, - bool exchange) +static void swap_names(struct dentry *dentry, struct dentry *target) { - if (dname_external(target)) { - if (dname_external(dentry)) { + if (unlikely(dname_external(target))) { + if (unlikely(dname_external(dentry))) { /* * Both external: swap the pointers */ @@ -2392,7 +2377,7 @@ static void switch_names(struct dentry *dentry, struct dentry *target, target->d_name.name = target->d_iname; } } else { - if (dname_external(dentry)) { + if (unlikely(dname_external(dentry))) { /* * dentry:external, target:internal. Give dentry's * storage to target and make dentry internal @@ -2407,12 +2392,6 @@ static void switch_names(struct dentry *dentry, struct dentry *target, */ unsigned int i; BUILD_BUG_ON(!IS_ALIGNED(DNAME_INLINE_LEN, sizeof(long))); - if (!exchange) { - memcpy(dentry->d_iname, target->d_name.name, - target->d_name.len + 1); - dentry->d_name.hash_len = target->d_name.hash_len; - return; - } for (i = 0; i < DNAME_INLINE_LEN / sizeof(long); i++) { swap(((long *) &dentry->d_iname)[i], ((long *) &target->d_iname)[i]); @@ -2422,6 +2401,24 @@ static void switch_names(struct dentry *dentry, struct dentry *target, swap(dentry->d_name.hash_len, target->d_name.hash_len); } +static void copy_name(struct dentry *dentry, struct dentry *target) +{ + struct external_name *old_name = NULL; + if (unlikely(dname_external(dentry))) + old_name = external_name(dentry); + if (unlikely(dname_external(target))) { + atomic_inc(&external_name(target)->u.count); + dentry->d_name = target->d_name; + } else { + memcpy(dentry->d_iname, target->d_name.name, + target->d_name.len + 1); + dentry->d_name.name = dentry->d_iname; + dentry->d_name.hash_len = target->d_name.hash_len; + } + if (old_name && likely(atomic_dec_and_test(&old_name->u.count))) + kfree_rcu(old_name, u.head); +} + static void dentry_lock_for_move(struct dentry *dentry, struct dentry *target) { /* @@ -2518,7 +2515,10 @@ static void __d_move(struct dentry *dentry, struct dentry *target, } /* Switch the names.. */ - switch_names(dentry, target, exchange); + if (exchange) + swap_names(dentry, target); + else + copy_name(dentry, target); /* ... and switch them in the tree */ if (IS_ROOT(dentry)) { @@ -2625,10 +2625,8 @@ static struct dentry *__d_unalias(struct inode *inode, goto out_err; m2 = &alias->d_parent->d_inode->i_mutex; out_unalias: - if (likely(!d_mountpoint(alias))) { - __d_move(alias, dentry, false); - ret = alias; - } + __d_move(alias, dentry, false); + ret = alias; out_err: spin_unlock(&inode->i_lock); if (m2) @@ -2675,11 +2673,13 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) if (!IS_ROOT(new)) { spin_unlock(&inode->i_lock); dput(new); + iput(inode); return ERR_PTR(-EIO); } if (d_ancestor(new, dentry)) { spin_unlock(&inode->i_lock); dput(new); + iput(inode); return ERR_PTR(-EIO); } write_seqlock(&rename_lock); @@ -2810,6 +2810,9 @@ static int prepend(char **buffer, int *buflen, const char *str, int namelen) * the beginning of the name. The sequence number check at the caller will * retry it again when a d_move() does happen. So any garbage in the buffer * due to mismatched pointer and length will be discarded. + * + * Data dependency barrier is needed to make sure that we see that terminating + * NUL. Alpha strikes again, film at 11... */ static int prepend_name(char **buffer, int *buflen, struct qstr *name) { @@ -2817,6 +2820,8 @@ static int prepend_name(char **buffer, int *buflen, struct qstr *name) u32 dlen = ACCESS_ONCE(name->len); char *p; + smp_read_barrier_depends(); + *buflen -= dlen + 1; if (*buflen < 0) return -ENAMETOOLONG; diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c index f704458ea5f5..e0ab3a93eeff 100644 --- a/fs/dlm/plock.c +++ b/fs/dlm/plock.c @@ -30,7 +30,7 @@ struct plock_op { struct plock_xop { struct plock_op xop; - void *callback; + int (*callback)(struct file_lock *fl, int result); void *fl; void *file; struct file_lock flc; @@ -190,7 +190,7 @@ static int dlm_plock_callback(struct plock_op *op) struct file *file; struct file_lock *fl; struct file_lock *flc; - int (*notify)(void *, void *, int) = NULL; + int (*notify)(struct file_lock *fl, int result) = NULL; struct plock_xop *xop = (struct plock_xop *)op; int rv = 0; @@ -209,7 +209,7 @@ static int dlm_plock_callback(struct plock_op *op) notify = xop->callback; if (op->info.rv) { - notify(fl, NULL, op->info.rv); + notify(fl, op->info.rv); goto out; } @@ -228,7 +228,7 @@ static int dlm_plock_callback(struct plock_op *op) (unsigned long long)op->info.number, file, fl); } - rv = notify(fl, NULL, 0); + rv = notify(fl, 0); if (rv) { /* XXX: We need to cancel the fs lock here: */ log_print("dlm_plock_callback: lock granted after lock request " diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c index 9d61947d473a..f3f5e72a29ba 100644 --- a/fs/dlm/rcom.c +++ b/fs/dlm/rcom.c @@ -206,7 +206,7 @@ static void receive_rcom_status(struct dlm_ls *ls, struct dlm_rcom *rc_in) rs = (struct rcom_status *)rc_in->rc_buf; - if (!(rs->rs_flags & DLM_RSF_NEED_SLOTS)) { + if (!(le32_to_cpu(rs->rs_flags) & DLM_RSF_NEED_SLOTS)) { status = dlm_recover_status(ls); goto do_create; } diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index db0fad3269c0..f5bce9096555 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -229,8 +229,8 @@ static int ecryptfs_open(struct inode *inode, struct file *file) if (rc) { printk(KERN_ERR "%s: Error attempting to initialize " "the lower file for the dentry with name " - "[%s]; rc = [%d]\n", __func__, - ecryptfs_dentry->d_name.name, rc); + "[%pd]; rc = [%d]\n", __func__, + ecryptfs_dentry, rc); goto out_free; } if ((ecryptfs_inode_to_private(inode)->lower_file->f_flags & O_ACCMODE) @@ -327,7 +327,7 @@ ecryptfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) struct file *lower_file = ecryptfs_file_to_lower(file); long rc = -ENOIOCTLCMD; - if (lower_file->f_op && lower_file->f_op->compat_ioctl) + if (lower_file->f_op->compat_ioctl) rc = lower_file->f_op->compat_ioctl(lower_file, cmd, arg); return rc; } diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index d4a9431ec73c..1686dc2da9fd 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -53,9 +53,7 @@ static void unlock_dir(struct dentry *dir) static int ecryptfs_inode_test(struct inode *inode, void *lower_inode) { - if (ecryptfs_inode_to_lower(inode) == (struct inode *)lower_inode) - return 1; - return 0; + return ecryptfs_inode_to_lower(inode) == lower_inode; } static int ecryptfs_inode_set(struct inode *inode, void *opaque) @@ -192,12 +190,6 @@ ecryptfs_do_create(struct inode *directory_inode, lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry); lower_dir_dentry = lock_parent(lower_dentry); - if (IS_ERR(lower_dir_dentry)) { - ecryptfs_printk(KERN_ERR, "Error locking directory of " - "dentry\n"); - inode = ERR_CAST(lower_dir_dentry); - goto out; - } rc = vfs_create(lower_dir_dentry->d_inode, lower_dentry, mode, true); if (rc) { printk(KERN_ERR "%s: Failure to create dentry in lower fs; " @@ -215,7 +207,6 @@ ecryptfs_do_create(struct inode *directory_inode, fsstack_copy_inode_size(directory_inode, lower_dir_dentry->d_inode); out_lock: unlock_dir(lower_dir_dentry); -out: return inode; } @@ -250,8 +241,8 @@ int ecryptfs_initialize_file(struct dentry *ecryptfs_dentry, if (rc) { printk(KERN_ERR "%s: Error attempting to initialize " "the lower file for the dentry with name " - "[%s]; rc = [%d]\n", __func__, - ecryptfs_dentry->d_name.name, rc); + "[%pd]; rc = [%d]\n", __func__, + ecryptfs_dentry, rc); goto out; } rc = ecryptfs_write_metadata(ecryptfs_dentry, ecryptfs_inode); @@ -313,8 +304,8 @@ static int ecryptfs_i_size_read(struct dentry *dentry, struct inode *inode) if (rc) { printk(KERN_ERR "%s: Error attempting to initialize " "the lower file for the dentry with name " - "[%s]; rc = [%d]\n", __func__, - dentry->d_name.name, rc); + "[%pd]; rc = [%d]\n", __func__, + dentry, rc); return rc; } @@ -418,8 +409,8 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, if (IS_ERR(lower_dentry)) { rc = PTR_ERR(lower_dentry); ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned " - "[%d] on lower_dentry = [%s]\n", __func__, rc, - ecryptfs_dentry->d_name.name); + "[%d] on lower_dentry = [%pd]\n", __func__, rc, + ecryptfs_dentry); goto out; } if (lower_dentry->d_inode) @@ -1039,7 +1030,7 @@ ecryptfs_setxattr(struct dentry *dentry, const char *name, const void *value, } rc = vfs_setxattr(lower_dentry, name, value, size, flags); - if (!rc) + if (!rc && dentry->d_inode) fsstack_copy_attr_all(dentry->d_inode, lower_dentry->d_inode); out: return rc; diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index 4725a07f003c..635e8e16a5b7 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -26,7 +26,6 @@ */ #include <linux/string.h> -#include <linux/syscalls.h> #include <linux/pagemap.h> #include <linux/key.h> #include <linux/random.h> @@ -1846,7 +1845,6 @@ int ecryptfs_parse_packet_set(struct ecryptfs_crypt_stat *crypt_stat, "(Tag 11 not allowed by itself)\n"); rc = -EIO; goto out_wipe_list; - break; default: ecryptfs_printk(KERN_DEBUG, "No packet at offset [%zd] " "of the file header; hex value of " diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index 1b119d3bf924..c4cd1fd86cc2 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -566,6 +566,13 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags s->s_maxbytes = path.dentry->d_sb->s_maxbytes; s->s_blocksize = path.dentry->d_sb->s_blocksize; s->s_magic = ECRYPTFS_SUPER_MAGIC; + s->s_stack_depth = path.dentry->d_sb->s_stack_depth + 1; + + rc = -EINVAL; + if (s->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) { + pr_err("eCryptfs: maximum fs stacking depth exceeded\n"); + goto out_free; + } inode = ecryptfs_get_inode(path.dentry->d_inode, s); rc = PTR_ERR(inode); diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c index e57380e5f6bd..286f10b0363b 100644 --- a/fs/ecryptfs/messaging.c +++ b/fs/ecryptfs/messaging.c @@ -434,8 +434,7 @@ void ecryptfs_release_messaging(void) mutex_lock(&ecryptfs_msg_ctx_lists_mux); for (i = 0; i < ecryptfs_message_buf_len; i++) { mutex_lock(&ecryptfs_msg_ctx_arr[i].mux); - if (ecryptfs_msg_ctx_arr[i].msg) - kfree(ecryptfs_msg_ctx_arr[i].msg); + kfree(ecryptfs_msg_ctx_arr[i].msg); mutex_unlock(&ecryptfs_msg_ctx_arr[i].mux); } kfree(ecryptfs_msg_ctx_arr); diff --git a/fs/exec.c b/fs/exec.c index a2b42a98c743..7302b75a9820 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1372,18 +1372,23 @@ int search_binary_handler(struct linux_binprm *bprm) read_unlock(&binfmt_lock); bprm->recursion_depth++; retval = fmt->load_binary(bprm); + read_lock(&binfmt_lock); + put_binfmt(fmt); bprm->recursion_depth--; - if (retval >= 0 || retval != -ENOEXEC || - bprm->mm == NULL || bprm->file == NULL) { - put_binfmt(fmt); + if (retval < 0 && !bprm->mm) { + /* we got to flush_old_exec() and failed after it */ + read_unlock(&binfmt_lock); + force_sigsegv(SIGSEGV, current); + return retval; + } + if (retval != -ENOEXEC || !bprm->file) { + read_unlock(&binfmt_lock); return retval; } - read_lock(&binfmt_lock); - put_binfmt(fmt); } read_unlock(&binfmt_lock); - if (need_retry && retval == -ENOEXEC) { + if (need_retry) { if (printable(bprm->buf[0]) && printable(bprm->buf[1]) && printable(bprm->buf[2]) && printable(bprm->buf[3])) return retval; diff --git a/fs/exofs/Kbuild b/fs/exofs/Kbuild index 389ba8312d5d..b47c7b8dc275 100644 --- a/fs/exofs/Kbuild +++ b/fs/exofs/Kbuild @@ -4,7 +4,7 @@ # Copyright (C) 2008 Panasas Inc. All rights reserved. # # Authors: -# Boaz Harrosh <bharrosh@panasas.com> +# Boaz Harrosh <ooo@electrozaur.com> # # This program is free software; you can redistribute it and/or modify # it under the terms of the GNU General Public License version 2 diff --git a/fs/exofs/common.h b/fs/exofs/common.h index 3bbd46956d77..7d88ef566213 100644 --- a/fs/exofs/common.h +++ b/fs/exofs/common.h @@ -4,7 +4,7 @@ * Copyright (C) 2005, 2006 * Avishay Traeger (avishay@gmail.com) * Copyright (C) 2008, 2009 - * Boaz Harrosh <bharrosh@panasas.com> + * Boaz Harrosh <ooo@electrozaur.com> * * Copyrights for code taken from ext2: * Copyright (C) 1992, 1993, 1994, 1995 diff --git a/fs/exofs/dir.c b/fs/exofs/dir.c index 49f51ab4caac..d7defd557601 100644 --- a/fs/exofs/dir.c +++ b/fs/exofs/dir.c @@ -2,7 +2,7 @@ * Copyright (C) 2005, 2006 * Avishay Traeger (avishay@gmail.com) * Copyright (C) 2008, 2009 - * Boaz Harrosh <bharrosh@panasas.com> + * Boaz Harrosh <ooo@electrozaur.com> * * Copyrights for code taken from ext2: * Copyright (C) 1992, 1993, 1994, 1995 diff --git a/fs/exofs/exofs.h b/fs/exofs/exofs.h index fffe86fd7a42..ad9cac670a47 100644 --- a/fs/exofs/exofs.h +++ b/fs/exofs/exofs.h @@ -2,7 +2,7 @@ * Copyright (C) 2005, 2006 * Avishay Traeger (avishay@gmail.com) * Copyright (C) 2008, 2009 - * Boaz Harrosh <bharrosh@panasas.com> + * Boaz Harrosh <ooo@electrozaur.com> * * Copyrights for code taken from ext2: * Copyright (C) 1992, 1993, 1994, 1995 diff --git a/fs/exofs/file.c b/fs/exofs/file.c index 71bf8e4fb5d4..1a376b42d305 100644 --- a/fs/exofs/file.c +++ b/fs/exofs/file.c @@ -2,7 +2,7 @@ * Copyright (C) 2005, 2006 * Avishay Traeger (avishay@gmail.com) * Copyright (C) 2008, 2009 - * Boaz Harrosh <bharrosh@panasas.com> + * Boaz Harrosh <ooo@electrozaur.com> * * Copyrights for code taken from ext2: * Copyright (C) 1992, 1993, 1994, 1995 diff --git a/fs/exofs/inode.c b/fs/exofs/inode.c index 3f9cafd73931..f1d3d4eb8c4f 100644 --- a/fs/exofs/inode.c +++ b/fs/exofs/inode.c @@ -2,7 +2,7 @@ * Copyright (C) 2005, 2006 * Avishay Traeger (avishay@gmail.com) * Copyright (C) 2008, 2009 - * Boaz Harrosh <bharrosh@panasas.com> + * Boaz Harrosh <ooo@electrozaur.com> * * Copyrights for code taken from ext2: * Copyright (C) 1992, 1993, 1994, 1995 diff --git a/fs/exofs/namei.c b/fs/exofs/namei.c index 4731fd991efe..28907460e8fa 100644 --- a/fs/exofs/namei.c +++ b/fs/exofs/namei.c @@ -2,7 +2,7 @@ * Copyright (C) 2005, 2006 * Avishay Traeger (avishay@gmail.com) * Copyright (C) 2008, 2009 - * Boaz Harrosh <bharrosh@panasas.com> + * Boaz Harrosh <ooo@electrozaur.com> * * Copyrights for code taken from ext2: * Copyright (C) 1992, 1993, 1994, 1995 diff --git a/fs/exofs/ore.c b/fs/exofs/ore.c index cfc0205d62c4..7bd8ac8dfb28 100644 --- a/fs/exofs/ore.c +++ b/fs/exofs/ore.c @@ -2,7 +2,7 @@ * Copyright (C) 2005, 2006 * Avishay Traeger (avishay@gmail.com) * Copyright (C) 2008, 2009 - * Boaz Harrosh <bharrosh@panasas.com> + * Boaz Harrosh <ooo@electrozaur.com> * * This file is part of exofs. * @@ -29,7 +29,7 @@ #include "ore_raid.h" -MODULE_AUTHOR("Boaz Harrosh <bharrosh@panasas.com>"); +MODULE_AUTHOR("Boaz Harrosh <ooo@electrozaur.com>"); MODULE_DESCRIPTION("Objects Raid Engine ore.ko"); MODULE_LICENSE("GPL"); diff --git a/fs/exofs/ore_raid.c b/fs/exofs/ore_raid.c index 84529b8a331b..27cbdb697649 100644 --- a/fs/exofs/ore_raid.c +++ b/fs/exofs/ore_raid.c @@ -1,6 +1,6 @@ /* * Copyright (C) 2011 - * Boaz Harrosh <bharrosh@panasas.com> + * Boaz Harrosh <ooo@electrozaur.com> * * This file is part of the objects raid engine (ore). * diff --git a/fs/exofs/ore_raid.h b/fs/exofs/ore_raid.h index cf6375d82129..a6e746775570 100644 --- a/fs/exofs/ore_raid.h +++ b/fs/exofs/ore_raid.h @@ -1,6 +1,6 @@ /* * Copyright (C) from 2011 - * Boaz Harrosh <bharrosh@panasas.com> + * Boaz Harrosh <ooo@electrozaur.com> * * This file is part of the objects raid engine (ore). * diff --git a/fs/exofs/super.c b/fs/exofs/super.c index ed73ed8ebbee..95965503afcb 100644 --- a/fs/exofs/super.c +++ b/fs/exofs/super.c @@ -2,7 +2,7 @@ * Copyright (C) 2005, 2006 * Avishay Traeger (avishay@gmail.com) * Copyright (C) 2008, 2009 - * Boaz Harrosh <bharrosh@panasas.com> + * Boaz Harrosh <ooo@electrozaur.com> * * Copyrights for code taken from ext2: * Copyright (C) 1992, 1993, 1994, 1995 diff --git a/fs/exofs/symlink.c b/fs/exofs/symlink.c index 4dd687c3e747..832e2624b80b 100644 --- a/fs/exofs/symlink.c +++ b/fs/exofs/symlink.c @@ -2,7 +2,7 @@ * Copyright (C) 2005, 2006 * Avishay Traeger (avishay@gmail.com) * Copyright (C) 2008, 2009 - * Boaz Harrosh <bharrosh@panasas.com> + * Boaz Harrosh <ooo@electrozaur.com> * * Copyrights for code taken from ext2: * Copyright (C) 1992, 1993, 1994, 1995 diff --git a/fs/exofs/sys.c b/fs/exofs/sys.c index 1b4f2f95fc37..5e6a2c0a1f0b 100644 --- a/fs/exofs/sys.c +++ b/fs/exofs/sys.c @@ -1,7 +1,7 @@ /* * Copyright (C) 2012 * Sachin Bhamare <sbhamare@panasas.com> - * Boaz Harrosh <bharrosh@panasas.com> + * Boaz Harrosh <ooo@electrozaur.com> * * This file is part of exofs. * diff --git a/fs/ext2/super.c b/fs/ext2/super.c index b88edc05c230..170dc41e8bf4 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -1067,14 +1067,14 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) ext2_rsv_window_add(sb, &sbi->s_rsv_window_head); err = percpu_counter_init(&sbi->s_freeblocks_counter, - ext2_count_free_blocks(sb)); + ext2_count_free_blocks(sb), GFP_KERNEL); if (!err) { err = percpu_counter_init(&sbi->s_freeinodes_counter, - ext2_count_free_inodes(sb)); + ext2_count_free_inodes(sb), GFP_KERNEL); } if (!err) { err = percpu_counter_init(&sbi->s_dirs_counter, - ext2_count_dirs(sb)); + ext2_count_dirs(sb), GFP_KERNEL); } if (err) { ext2_msg(sb, KERN_ERR, "error: insufficient memory"); diff --git a/fs/ext3/ext3.h b/fs/ext3/ext3.h index e85ff15a060e..fc3cdcf24aed 100644 --- a/fs/ext3/ext3.h +++ b/fs/ext3/ext3.h @@ -237,6 +237,8 @@ struct ext3_new_group_data { #define EXT3_IOC32_GETVERSION_OLD FS_IOC32_GETVERSION #define EXT3_IOC32_SETVERSION_OLD FS_IOC32_SETVERSION +/* Number of supported quota types */ +#define EXT3_MAXQUOTAS 2 /* * Mount options @@ -248,7 +250,7 @@ struct ext3_mount_options { unsigned long s_commit_interval; #ifdef CONFIG_QUOTA int s_jquota_fmt; - char *s_qf_names[MAXQUOTAS]; + char *s_qf_names[EXT3_MAXQUOTAS]; #endif }; @@ -669,7 +671,7 @@ struct ext3_sb_info { unsigned long s_commit_interval; struct block_device *journal_bdev; #ifdef CONFIG_QUOTA - char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */ + char *s_qf_names[EXT3_MAXQUOTAS]; /* Names of quota files with journalled quota */ int s_jquota_fmt; /* Format of quota to use */ #endif }; @@ -1183,9 +1185,9 @@ extern const struct inode_operations ext3_fast_symlink_inode_operations; #define EXT3_QUOTA_INIT_BLOCKS(sb) 0 #define EXT3_QUOTA_DEL_BLOCKS(sb) 0 #endif -#define EXT3_MAXQUOTAS_TRANS_BLOCKS(sb) (MAXQUOTAS*EXT3_QUOTA_TRANS_BLOCKS(sb)) -#define EXT3_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT3_QUOTA_INIT_BLOCKS(sb)) -#define EXT3_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT3_QUOTA_DEL_BLOCKS(sb)) +#define EXT3_MAXQUOTAS_TRANS_BLOCKS(sb) (EXT3_MAXQUOTAS*EXT3_QUOTA_TRANS_BLOCKS(sb)) +#define EXT3_MAXQUOTAS_INIT_BLOCKS(sb) (EXT3_MAXQUOTAS*EXT3_QUOTA_INIT_BLOCKS(sb)) +#define EXT3_MAXQUOTAS_DEL_BLOCKS(sb) (EXT3_MAXQUOTAS*EXT3_QUOTA_DEL_BLOCKS(sb)) int ext3_mark_iloc_dirty(handle_t *handle, diff --git a/fs/ext3/super.c b/fs/ext3/super.c index 622e88249024..7015db0bafd1 100644 --- a/fs/ext3/super.c +++ b/fs/ext3/super.c @@ -441,7 +441,7 @@ static void ext3_put_super (struct super_block * sb) percpu_counter_destroy(&sbi->s_dirs_counter); brelse(sbi->s_sbh); #ifdef CONFIG_QUOTA - for (i = 0; i < MAXQUOTAS; i++) + for (i = 0; i < EXT3_MAXQUOTAS; i++) kfree(sbi->s_qf_names[i]); #endif @@ -1555,7 +1555,7 @@ static void ext3_orphan_cleanup (struct super_block * sb, /* Needed for iput() to work correctly and not trash data */ sb->s_flags |= MS_ACTIVE; /* Turn on quotas so that they are updated correctly */ - for (i = 0; i < MAXQUOTAS; i++) { + for (i = 0; i < EXT3_MAXQUOTAS; i++) { if (EXT3_SB(sb)->s_qf_names[i]) { int ret = ext3_quota_on_mount(sb, i); if (ret < 0) @@ -1606,7 +1606,7 @@ static void ext3_orphan_cleanup (struct super_block * sb, PLURAL(nr_truncates)); #ifdef CONFIG_QUOTA /* Turn quotas off */ - for (i = 0; i < MAXQUOTAS; i++) { + for (i = 0; i < EXT3_MAXQUOTAS; i++) { if (sb_dqopt(sb)->files[i]) dquot_quota_off(sb, i); } @@ -2039,14 +2039,14 @@ static int ext3_fill_super (struct super_block *sb, void *data, int silent) goto failed_mount2; } err = percpu_counter_init(&sbi->s_freeblocks_counter, - ext3_count_free_blocks(sb)); + ext3_count_free_blocks(sb), GFP_KERNEL); if (!err) { err = percpu_counter_init(&sbi->s_freeinodes_counter, - ext3_count_free_inodes(sb)); + ext3_count_free_inodes(sb), GFP_KERNEL); } if (!err) { err = percpu_counter_init(&sbi->s_dirs_counter, - ext3_count_dirs(sb)); + ext3_count_dirs(sb), GFP_KERNEL); } if (err) { ext3_msg(sb, KERN_ERR, "error: insufficient memory"); @@ -2139,7 +2139,7 @@ failed_mount2: kfree(sbi->s_group_desc); failed_mount: #ifdef CONFIG_QUOTA - for (i = 0; i < MAXQUOTAS; i++) + for (i = 0; i < EXT3_MAXQUOTAS; i++) kfree(sbi->s_qf_names[i]); #endif ext3_blkdev_remove(sbi); @@ -2659,7 +2659,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data) old_opts.s_commit_interval = sbi->s_commit_interval; #ifdef CONFIG_QUOTA old_opts.s_jquota_fmt = sbi->s_jquota_fmt; - for (i = 0; i < MAXQUOTAS; i++) + for (i = 0; i < EXT3_MAXQUOTAS; i++) if (sbi->s_qf_names[i]) { old_opts.s_qf_names[i] = kstrdup(sbi->s_qf_names[i], GFP_KERNEL); @@ -2763,7 +2763,7 @@ static int ext3_remount (struct super_block * sb, int * flags, char * data) } #ifdef CONFIG_QUOTA /* Release old quota file names */ - for (i = 0; i < MAXQUOTAS; i++) + for (i = 0; i < EXT3_MAXQUOTAS; i++) kfree(old_opts.s_qf_names[i]); #endif if (enable_quota) @@ -2777,7 +2777,7 @@ restore_opts: sbi->s_commit_interval = old_opts.s_commit_interval; #ifdef CONFIG_QUOTA sbi->s_jquota_fmt = old_opts.s_jquota_fmt; - for (i = 0; i < MAXQUOTAS; i++) { + for (i = 0; i < EXT3_MAXQUOTAS; i++) { kfree(sbi->s_qf_names[i]); sbi->s_qf_names[i] = old_opts.s_qf_names[i]; } diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 581ef40fbe90..83a6f497c4e0 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -176,7 +176,7 @@ static unsigned int num_clusters_in_group(struct super_block *sb, } /* Initializes an uninitialized block bitmap */ -static void ext4_init_block_bitmap(struct super_block *sb, +static int ext4_init_block_bitmap(struct super_block *sb, struct buffer_head *bh, ext4_group_t block_group, struct ext4_group_desc *gdp) @@ -192,7 +192,6 @@ static void ext4_init_block_bitmap(struct super_block *sb, /* If checksum is bad mark all blocks used to prevent allocation * essentially implementing a per-group read-only flag. */ if (!ext4_group_desc_csum_verify(sb, block_group, gdp)) { - ext4_error(sb, "Checksum bad for group %u", block_group); grp = ext4_get_group_info(sb, block_group); if (!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) percpu_counter_sub(&sbi->s_freeclusters_counter, @@ -205,7 +204,7 @@ static void ext4_init_block_bitmap(struct super_block *sb, count); } set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT, &grp->bb_state); - return; + return -EIO; } memset(bh->b_data, 0, sb->s_blocksize); @@ -243,6 +242,7 @@ static void ext4_init_block_bitmap(struct super_block *sb, sb->s_blocksize * 8, bh->b_data); ext4_block_bitmap_csum_set(sb, block_group, gdp, bh); ext4_group_desc_csum_set(sb, block_group, gdp); + return 0; } /* Return the number of free blocks in a block group. It is used when @@ -438,11 +438,15 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group) } ext4_lock_group(sb, block_group); if (desc->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT)) { - ext4_init_block_bitmap(sb, bh, block_group, desc); + int err; + + err = ext4_init_block_bitmap(sb, bh, block_group, desc); set_bitmap_uptodate(bh); set_buffer_uptodate(bh); ext4_unlock_group(sb, block_group); unlock_buffer(bh); + if (err) + ext4_error(sb, "Checksum bad for grp %u", block_group); return bh; } ext4_unlock_group(sb, block_group); @@ -636,8 +640,7 @@ ext4_fsblk_t ext4_new_meta_blocks(handle_t *handle, struct inode *inode, * Account for the allocated meta blocks. We will never * fail EDQUOT for metdata, but we do account for it. */ - if (!(*errp) && - ext4_test_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED)) { + if (!(*errp) && (flags & EXT4_MB_DELALLOC_RESERVED)) { spin_lock(&EXT4_I(inode)->i_block_reservation_lock); spin_unlock(&EXT4_I(inode)->i_block_reservation_lock); dquot_alloc_block_nofail(inode, diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c index 3285aa5a706a..b610779a958c 100644 --- a/fs/ext4/bitmap.c +++ b/fs/ext4/bitmap.c @@ -24,8 +24,7 @@ int ext4_inode_bitmap_csum_verify(struct super_block *sb, ext4_group_t group, __u32 provided, calculated; struct ext4_sb_info *sbi = EXT4_SB(sb); - if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + if (!ext4_has_metadata_csum(sb)) return 1; provided = le16_to_cpu(gdp->bg_inode_bitmap_csum_lo); @@ -46,8 +45,7 @@ void ext4_inode_bitmap_csum_set(struct super_block *sb, ext4_group_t group, __u32 csum; struct ext4_sb_info *sbi = EXT4_SB(sb); - if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + if (!ext4_has_metadata_csum(sb)) return; csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz); @@ -65,8 +63,7 @@ int ext4_block_bitmap_csum_verify(struct super_block *sb, ext4_group_t group, struct ext4_sb_info *sbi = EXT4_SB(sb); int sz = EXT4_CLUSTERS_PER_GROUP(sb) / 8; - if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + if (!ext4_has_metadata_csum(sb)) return 1; provided = le16_to_cpu(gdp->bg_block_bitmap_csum_lo); @@ -91,8 +88,7 @@ void ext4_block_bitmap_csum_set(struct super_block *sb, ext4_group_t group, __u32 csum; struct ext4_sb_info *sbi = EXT4_SB(sb); - if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + if (!ext4_has_metadata_csum(sb)) return; csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz); diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 0bb3f9ea0832..c24143ea9c08 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -151,13 +151,11 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) &file->f_ra, file, index, 1); file->f_ra.prev_pos = (loff_t)index << PAGE_CACHE_SHIFT; - bh = ext4_bread(NULL, inode, map.m_lblk, 0, &err); + bh = ext4_bread(NULL, inode, map.m_lblk, 0); + if (IS_ERR(bh)) + return PTR_ERR(bh); } - /* - * We ignore I/O errors on directories so users have a chance - * of recovering data when there's a bad sector - */ if (!bh) { if (!dir_has_error) { EXT4_ERROR_FILE(file, 0, diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index b0c225cdb52c..c55a1faaed58 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -572,15 +572,15 @@ enum { /* * The bit position of these flags must not overlap with any of the - * EXT4_GET_BLOCKS_*. They are used by ext4_ext_find_extent(), + * EXT4_GET_BLOCKS_*. They are used by ext4_find_extent(), * read_extent_tree_block(), ext4_split_extent_at(), * ext4_ext_insert_extent(), and ext4_ext_create_new_leaf(). * EXT4_EX_NOCACHE is used to indicate that the we shouldn't be * caching the extents when reading from the extent tree while a * truncate or punch hole operation is in progress. */ -#define EXT4_EX_NOCACHE 0x0400 -#define EXT4_EX_FORCE_CACHE 0x0800 +#define EXT4_EX_NOCACHE 0x40000000 +#define EXT4_EX_FORCE_CACHE 0x20000000 /* * Flags used by ext4_free_blocks @@ -890,6 +890,7 @@ struct ext4_inode_info { struct ext4_es_tree i_es_tree; rwlock_t i_es_lock; struct list_head i_es_lru; + unsigned int i_es_all_nr; /* protected by i_es_lock */ unsigned int i_es_lru_nr; /* protected by i_es_lock */ unsigned long i_touch_when; /* jiffies of last accessing */ @@ -1174,6 +1175,9 @@ struct ext4_super_block { #define EXT4_MF_MNTDIR_SAMPLED 0x0001 #define EXT4_MF_FS_ABORTED 0x0002 /* Fatal error detected */ +/* Number of quota types we support */ +#define EXT4_MAXQUOTAS 2 + /* * fourth extended-fs super-block data in memory */ @@ -1237,7 +1241,7 @@ struct ext4_sb_info { u32 s_min_batch_time; struct block_device *journal_bdev; #ifdef CONFIG_QUOTA - char *s_qf_names[MAXQUOTAS]; /* Names of quota files with journalled quota */ + char *s_qf_names[EXT4_MAXQUOTAS]; /* Names of quota files with journalled quota */ int s_jquota_fmt; /* Format of quota to use */ #endif unsigned int s_want_extra_isize; /* New inodes should reserve # bytes */ @@ -1330,8 +1334,7 @@ struct ext4_sb_info { /* Reclaim extents from extent status tree */ struct shrinker s_es_shrinker; struct list_head s_es_lru; - unsigned long s_es_last_sorted; - struct percpu_counter s_extent_cache_cnt; + struct ext4_es_stats s_es_stats; struct mb_cache *s_mb_cache; spinlock_t s_es_lru_lock ____cacheline_aligned_in_smp; @@ -1399,7 +1402,6 @@ enum { EXT4_STATE_EXT_MIGRATE, /* Inode is migrating */ EXT4_STATE_DIO_UNWRITTEN, /* need convert on dio done*/ EXT4_STATE_NEWENTRY, /* File just added to dir */ - EXT4_STATE_DELALLOC_RESERVED, /* blks already reserved for delalloc */ EXT4_STATE_DIOREAD_LOCK, /* Disable support for dio read nolocking */ EXT4_STATE_MAY_INLINE_DATA, /* may have in-inode data */ @@ -2086,10 +2088,8 @@ extern int ext4_group_add_blocks(handle_t *handle, struct super_block *sb, extern int ext4_trim_fs(struct super_block *, struct fstrim_range *); /* inode.c */ -struct buffer_head *ext4_getblk(handle_t *, struct inode *, - ext4_lblk_t, int, int *); -struct buffer_head *ext4_bread(handle_t *, struct inode *, - ext4_lblk_t, int, int *); +struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int); +struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int); int ext4_get_block_write(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); int ext4_get_block(struct inode *inode, sector_t iblock, @@ -2109,6 +2109,7 @@ int do_journal_get_write_access(handle_t *handle, #define CONVERT_INLINE_DATA 2 extern struct inode *ext4_iget(struct super_block *, unsigned long); +extern struct inode *ext4_iget_normal(struct super_block *, unsigned long); extern int ext4_write_inode(struct inode *, struct writeback_control *); extern int ext4_setattr(struct dentry *, struct iattr *); extern int ext4_getattr(struct vfsmount *mnt, struct dentry *dentry, @@ -2332,10 +2333,18 @@ extern int ext4_register_li_request(struct super_block *sb, static inline int ext4_has_group_desc_csum(struct super_block *sb) { return EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_GDT_CSUM | - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM); + EXT4_FEATURE_RO_COMPAT_GDT_CSUM) || + (EXT4_SB(sb)->s_chksum_driver != NULL); } +static inline int ext4_has_metadata_csum(struct super_block *sb) +{ + WARN_ON_ONCE(EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) && + !EXT4_SB(sb)->s_chksum_driver); + + return (EXT4_SB(sb)->s_chksum_driver != NULL); +} static inline ext4_fsblk_t ext4_blocks_count(struct ext4_super_block *es) { return ((ext4_fsblk_t)le32_to_cpu(es->s_blocks_count_hi) << 32) | @@ -2731,21 +2740,26 @@ extern int ext4_can_extents_be_merged(struct inode *inode, struct ext4_extent *ex1, struct ext4_extent *ex2); extern int ext4_ext_insert_extent(handle_t *, struct inode *, - struct ext4_ext_path *, + struct ext4_ext_path **, struct ext4_extent *, int); -extern struct ext4_ext_path *ext4_ext_find_extent(struct inode *, ext4_lblk_t, - struct ext4_ext_path *, - int flags); +extern struct ext4_ext_path *ext4_find_extent(struct inode *, ext4_lblk_t, + struct ext4_ext_path **, + int flags); extern void ext4_ext_drop_refs(struct ext4_ext_path *); extern int ext4_ext_check_inode(struct inode *inode); extern int ext4_find_delalloc_range(struct inode *inode, ext4_lblk_t lblk_start, ext4_lblk_t lblk_end); extern int ext4_find_delalloc_cluster(struct inode *inode, ext4_lblk_t lblk); +extern ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path); extern int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); extern int ext4_ext_precache(struct inode *inode); extern int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len); +extern int ext4_swap_extents(handle_t *handle, struct inode *inode1, + struct inode *inode2, ext4_lblk_t lblk1, + ext4_lblk_t lblk2, ext4_lblk_t count, + int mark_unwritten,int *err); /* move_extent.c */ extern void ext4_double_down_write_data_sem(struct inode *first, @@ -2755,8 +2769,6 @@ extern void ext4_double_up_write_data_sem(struct inode *orig_inode, extern int ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 start_orig, __u64 start_donor, __u64 len, __u64 *moved_len); -extern int mext_next_extent(struct inode *inode, struct ext4_ext_path *path, - struct ext4_extent **extent); /* page-io.c */ extern int __init ext4_init_pageio(void); diff --git a/fs/ext4/ext4_extents.h b/fs/ext4/ext4_extents.h index a867f5ca9991..3c9381547094 100644 --- a/fs/ext4/ext4_extents.h +++ b/fs/ext4/ext4_extents.h @@ -123,6 +123,7 @@ find_ext4_extent_tail(struct ext4_extent_header *eh) struct ext4_ext_path { ext4_fsblk_t p_block; __u16 p_depth; + __u16 p_maxdepth; struct ext4_extent *p_ext; struct ext4_extent_idx *p_idx; struct ext4_extent_header *p_hdr; diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index 0074e0d23d6e..3445035c7e01 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -256,8 +256,8 @@ int __ext4_handle_dirty_metadata(const char *where, unsigned int line, set_buffer_prio(bh); if (ext4_handle_valid(handle)) { err = jbd2_journal_dirty_metadata(handle, bh); - /* Errors can only happen if there is a bug */ - if (WARN_ON_ONCE(err)) { + /* Errors can only happen due to aborted journal or a nasty bug */ + if (!is_handle_aborted(handle) && WARN_ON_ONCE(err)) { ext4_journal_abort_handle(where, line, __func__, bh, handle, err); if (inode == NULL) { diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 17c00ff202f2..9c5b49fb281e 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -102,9 +102,9 @@ #define EXT4_QUOTA_INIT_BLOCKS(sb) 0 #define EXT4_QUOTA_DEL_BLOCKS(sb) 0 #endif -#define EXT4_MAXQUOTAS_TRANS_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_TRANS_BLOCKS(sb)) -#define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb)) -#define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb)) +#define EXT4_MAXQUOTAS_TRANS_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_TRANS_BLOCKS(sb)) +#define EXT4_MAXQUOTAS_INIT_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_INIT_BLOCKS(sb)) +#define EXT4_MAXQUOTAS_DEL_BLOCKS(sb) (EXT4_MAXQUOTAS*EXT4_QUOTA_DEL_BLOCKS(sb)) static inline int ext4_jbd2_credits_xattr(struct inode *inode) { diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 74292a71b384..37043d0b2be8 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -73,8 +73,7 @@ static int ext4_extent_block_csum_verify(struct inode *inode, { struct ext4_extent_tail *et; - if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + if (!ext4_has_metadata_csum(inode->i_sb)) return 1; et = find_ext4_extent_tail(eh); @@ -88,8 +87,7 @@ static void ext4_extent_block_csum_set(struct inode *inode, { struct ext4_extent_tail *et; - if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + if (!ext4_has_metadata_csum(inode->i_sb)) return; et = find_ext4_extent_tail(eh); @@ -98,14 +96,14 @@ static void ext4_extent_block_csum_set(struct inode *inode, static int ext4_split_extent(handle_t *handle, struct inode *inode, - struct ext4_ext_path *path, + struct ext4_ext_path **ppath, struct ext4_map_blocks *map, int split_flag, int flags); static int ext4_split_extent_at(handle_t *handle, struct inode *inode, - struct ext4_ext_path *path, + struct ext4_ext_path **ppath, ext4_lblk_t split, int split_flag, int flags); @@ -291,6 +289,20 @@ static inline int ext4_ext_space_root_idx(struct inode *inode, int check) return size; } +static inline int +ext4_force_split_extent_at(handle_t *handle, struct inode *inode, + struct ext4_ext_path **ppath, ext4_lblk_t lblk, + int nofail) +{ + struct ext4_ext_path *path = *ppath; + int unwritten = ext4_ext_is_unwritten(path[path->p_depth].p_ext); + + return ext4_split_extent_at(handle, inode, ppath, lblk, unwritten ? + EXT4_EXT_MARK_UNWRIT1|EXT4_EXT_MARK_UNWRIT2 : 0, + EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_PRE_IO | + (nofail ? EXT4_GET_BLOCKS_METADATA_NOFAIL:0)); +} + /* * Calculate the number of metadata blocks needed * to allocate @blocks @@ -695,9 +707,11 @@ static void ext4_ext_show_move(struct inode *inode, struct ext4_ext_path *path, void ext4_ext_drop_refs(struct ext4_ext_path *path) { - int depth = path->p_depth; - int i; + int depth, i; + if (!path) + return; + depth = path->p_depth; for (i = 0; i <= depth; i++, path++) if (path->p_bh) { brelse(path->p_bh); @@ -841,24 +855,32 @@ int ext4_ext_tree_init(handle_t *handle, struct inode *inode) } struct ext4_ext_path * -ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, - struct ext4_ext_path *path, int flags) +ext4_find_extent(struct inode *inode, ext4_lblk_t block, + struct ext4_ext_path **orig_path, int flags) { struct ext4_extent_header *eh; struct buffer_head *bh; - short int depth, i, ppos = 0, alloc = 0; + struct ext4_ext_path *path = orig_path ? *orig_path : NULL; + short int depth, i, ppos = 0; int ret; eh = ext_inode_hdr(inode); depth = ext_depth(inode); - /* account possible depth increase */ + if (path) { + ext4_ext_drop_refs(path); + if (depth > path[0].p_maxdepth) { + kfree(path); + *orig_path = path = NULL; + } + } if (!path) { + /* account possible depth increase */ path = kzalloc(sizeof(struct ext4_ext_path) * (depth + 2), GFP_NOFS); - if (!path) + if (unlikely(!path)) return ERR_PTR(-ENOMEM); - alloc = 1; + path[0].p_maxdepth = depth + 1; } path[0].p_hdr = eh; path[0].p_bh = NULL; @@ -876,7 +898,7 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, bh = read_extent_tree_block(inode, path[ppos].p_block, --i, flags); - if (IS_ERR(bh)) { + if (unlikely(IS_ERR(bh))) { ret = PTR_ERR(bh); goto err; } @@ -910,8 +932,9 @@ ext4_ext_find_extent(struct inode *inode, ext4_lblk_t block, err: ext4_ext_drop_refs(path); - if (alloc) - kfree(path); + kfree(path); + if (orig_path) + *orig_path = NULL; return ERR_PTR(ret); } @@ -1238,16 +1261,24 @@ cleanup: * just created block */ static int ext4_ext_grow_indepth(handle_t *handle, struct inode *inode, - unsigned int flags, - struct ext4_extent *newext) + unsigned int flags) { struct ext4_extent_header *neh; struct buffer_head *bh; - ext4_fsblk_t newblock; + ext4_fsblk_t newblock, goal = 0; + struct ext4_super_block *es = EXT4_SB(inode->i_sb)->s_es; int err = 0; - newblock = ext4_ext_new_meta_block(handle, inode, NULL, - newext, &err, flags); + /* Try to prepend new index to old one */ + if (ext_depth(inode)) + goal = ext4_idx_pblock(EXT_FIRST_INDEX(ext_inode_hdr(inode))); + if (goal > le32_to_cpu(es->s_first_data_block)) { + flags |= EXT4_MB_HINT_TRY_GOAL; + goal--; + } else + goal = ext4_inode_to_goal_block(inode); + newblock = ext4_new_meta_blocks(handle, inode, goal, flags, + NULL, &err); if (newblock == 0) return err; @@ -1314,9 +1345,10 @@ out: static int ext4_ext_create_new_leaf(handle_t *handle, struct inode *inode, unsigned int mb_flags, unsigned int gb_flags, - struct ext4_ext_path *path, + struct ext4_ext_path **ppath, struct ext4_extent *newext) { + struct ext4_ext_path *path = *ppath; struct ext4_ext_path *curp; int depth, i, err = 0; @@ -1340,23 +1372,21 @@ repeat: goto out; /* refill path */ - ext4_ext_drop_refs(path); - path = ext4_ext_find_extent(inode, + path = ext4_find_extent(inode, (ext4_lblk_t)le32_to_cpu(newext->ee_block), - path, gb_flags); + ppath, gb_flags); if (IS_ERR(path)) err = PTR_ERR(path); } else { /* tree is full, time to grow in depth */ - err = ext4_ext_grow_indepth(handle, inode, mb_flags, newext); + err = ext4_ext_grow_indepth(handle, inode, mb_flags); if (err) goto out; /* refill path */ - ext4_ext_drop_refs(path); - path = ext4_ext_find_extent(inode, + path = ext4_find_extent(inode, (ext4_lblk_t)le32_to_cpu(newext->ee_block), - path, gb_flags); + ppath, gb_flags); if (IS_ERR(path)) { err = PTR_ERR(path); goto out; @@ -1559,7 +1589,7 @@ found_extent: * allocated block. Thus, index entries have to be consistent * with leaves. */ -static ext4_lblk_t +ext4_lblk_t ext4_ext_next_allocated_block(struct ext4_ext_path *path) { int depth; @@ -1802,6 +1832,7 @@ static void ext4_ext_try_to_merge_up(handle_t *handle, sizeof(struct ext4_extent_idx); s += sizeof(struct ext4_extent_header); + path[1].p_maxdepth = path[0].p_maxdepth; memcpy(path[0].p_hdr, path[1].p_hdr, s); path[0].p_depth = 0; path[0].p_ext = EXT_FIRST_EXTENT(path[0].p_hdr) + @@ -1896,9 +1927,10 @@ out: * creating new leaf in the no-space case. */ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, - struct ext4_ext_path *path, + struct ext4_ext_path **ppath, struct ext4_extent *newext, int gb_flags) { + struct ext4_ext_path *path = *ppath; struct ext4_extent_header *eh; struct ext4_extent *ex, *fex; struct ext4_extent *nearex; /* nearest extent */ @@ -1907,6 +1939,8 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, ext4_lblk_t next; int mb_flags = 0, unwritten; + if (gb_flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) + mb_flags |= EXT4_MB_DELALLOC_RESERVED; if (unlikely(ext4_ext_get_actual_len(newext) == 0)) { EXT4_ERROR_INODE(inode, "ext4_ext_get_actual_len(newext) == 0"); return -EIO; @@ -1925,7 +1959,7 @@ int ext4_ext_insert_extent(handle_t *handle, struct inode *inode, /* * Try to see whether we should rather test the extent on * right from ex, or from the left of ex. This is because - * ext4_ext_find_extent() can return either extent on the + * ext4_find_extent() can return either extent on the * left, or on the right from the searched position. This * will make merging more effective. */ @@ -2008,7 +2042,7 @@ prepend: if (next != EXT_MAX_BLOCKS) { ext_debug("next leaf block - %u\n", next); BUG_ON(npath != NULL); - npath = ext4_ext_find_extent(inode, next, NULL, 0); + npath = ext4_find_extent(inode, next, NULL, 0); if (IS_ERR(npath)) return PTR_ERR(npath); BUG_ON(npath->p_depth != path->p_depth); @@ -2028,9 +2062,9 @@ prepend: * We're gonna add a new leaf in the tree. */ if (gb_flags & EXT4_GET_BLOCKS_METADATA_NOFAIL) - mb_flags = EXT4_MB_USE_RESERVED; + mb_flags |= EXT4_MB_USE_RESERVED; err = ext4_ext_create_new_leaf(handle, inode, mb_flags, gb_flags, - path, newext); + ppath, newext); if (err) goto cleanup; depth = ext_depth(inode); @@ -2108,10 +2142,8 @@ merge: err = ext4_ext_dirty(handle, inode, path + path->p_depth); cleanup: - if (npath) { - ext4_ext_drop_refs(npath); - kfree(npath); - } + ext4_ext_drop_refs(npath); + kfree(npath); return err; } @@ -2133,13 +2165,7 @@ static int ext4_fill_fiemap_extents(struct inode *inode, /* find extent for this block */ down_read(&EXT4_I(inode)->i_data_sem); - if (path && ext_depth(inode) != depth) { - /* depth was changed. we have to realloc path */ - kfree(path); - path = NULL; - } - - path = ext4_ext_find_extent(inode, block, path, 0); + path = ext4_find_extent(inode, block, &path, 0); if (IS_ERR(path)) { up_read(&EXT4_I(inode)->i_data_sem); err = PTR_ERR(path); @@ -2156,7 +2182,6 @@ static int ext4_fill_fiemap_extents(struct inode *inode, } ex = path[depth].p_ext; next = ext4_ext_next_allocated_block(path); - ext4_ext_drop_refs(path); flags = 0; exists = 0; @@ -2266,11 +2291,8 @@ static int ext4_fill_fiemap_extents(struct inode *inode, block = es.es_lblk + es.es_len; } - if (path) { - ext4_ext_drop_refs(path); - kfree(path); - } - + ext4_ext_drop_refs(path); + kfree(path); return err; } @@ -2826,7 +2848,7 @@ again: ext4_lblk_t ee_block; /* find extent for this block */ - path = ext4_ext_find_extent(inode, end, NULL, EXT4_EX_NOCACHE); + path = ext4_find_extent(inode, end, NULL, EXT4_EX_NOCACHE); if (IS_ERR(path)) { ext4_journal_stop(handle); return PTR_ERR(path); @@ -2854,24 +2876,14 @@ again: */ if (end >= ee_block && end < ee_block + ext4_ext_get_actual_len(ex) - 1) { - int split_flag = 0; - - if (ext4_ext_is_unwritten(ex)) - split_flag = EXT4_EXT_MARK_UNWRIT1 | - EXT4_EXT_MARK_UNWRIT2; - /* * Split the extent in two so that 'end' is the last * block in the first new extent. Also we should not * fail removing space due to ENOSPC so try to use * reserved block if that happens. */ - err = ext4_split_extent_at(handle, inode, path, - end + 1, split_flag, - EXT4_EX_NOCACHE | - EXT4_GET_BLOCKS_PRE_IO | - EXT4_GET_BLOCKS_METADATA_NOFAIL); - + err = ext4_force_split_extent_at(handle, inode, &path, + end + 1, 1); if (err < 0) goto out; } @@ -2893,7 +2905,7 @@ again: ext4_journal_stop(handle); return -ENOMEM; } - path[0].p_depth = depth; + path[0].p_maxdepth = path[0].p_depth = depth; path[0].p_hdr = ext_inode_hdr(inode); i = 0; @@ -3013,10 +3025,9 @@ again: out: ext4_ext_drop_refs(path); kfree(path); - if (err == -EAGAIN) { - path = NULL; + path = NULL; + if (err == -EAGAIN) goto again; - } ext4_journal_stop(handle); return err; @@ -3130,11 +3141,12 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) */ static int ext4_split_extent_at(handle_t *handle, struct inode *inode, - struct ext4_ext_path *path, + struct ext4_ext_path **ppath, ext4_lblk_t split, int split_flag, int flags) { + struct ext4_ext_path *path = *ppath; ext4_fsblk_t newblock; ext4_lblk_t ee_block; struct ext4_extent *ex, newex, orig_ex, zero_ex; @@ -3205,7 +3217,7 @@ static int ext4_split_extent_at(handle_t *handle, if (split_flag & EXT4_EXT_MARK_UNWRIT2) ext4_ext_mark_unwritten(ex2); - err = ext4_ext_insert_extent(handle, inode, path, &newex, flags); + err = ext4_ext_insert_extent(handle, inode, ppath, &newex, flags); if (err == -ENOSPC && (EXT4_EXT_MAY_ZEROOUT & split_flag)) { if (split_flag & (EXT4_EXT_DATA_VALID1|EXT4_EXT_DATA_VALID2)) { if (split_flag & EXT4_EXT_DATA_VALID1) { @@ -3271,11 +3283,12 @@ fix_extent_len: */ static int ext4_split_extent(handle_t *handle, struct inode *inode, - struct ext4_ext_path *path, + struct ext4_ext_path **ppath, struct ext4_map_blocks *map, int split_flag, int flags) { + struct ext4_ext_path *path = *ppath; ext4_lblk_t ee_block; struct ext4_extent *ex; unsigned int ee_len, depth; @@ -3298,7 +3311,7 @@ static int ext4_split_extent(handle_t *handle, EXT4_EXT_MARK_UNWRIT2; if (split_flag & EXT4_EXT_DATA_VALID2) split_flag1 |= EXT4_EXT_DATA_VALID1; - err = ext4_split_extent_at(handle, inode, path, + err = ext4_split_extent_at(handle, inode, ppath, map->m_lblk + map->m_len, split_flag1, flags1); if (err) goto out; @@ -3309,8 +3322,7 @@ static int ext4_split_extent(handle_t *handle, * Update path is required because previous ext4_split_extent_at() may * result in split of original leaf or extent zeroout. */ - ext4_ext_drop_refs(path); - path = ext4_ext_find_extent(inode, map->m_lblk, path, 0); + path = ext4_find_extent(inode, map->m_lblk, ppath, 0); if (IS_ERR(path)) return PTR_ERR(path); depth = ext_depth(inode); @@ -3330,7 +3342,7 @@ static int ext4_split_extent(handle_t *handle, split_flag1 |= split_flag & (EXT4_EXT_MAY_ZEROOUT | EXT4_EXT_MARK_UNWRIT2); } - err = ext4_split_extent_at(handle, inode, path, + err = ext4_split_extent_at(handle, inode, ppath, map->m_lblk, split_flag1, flags); if (err) goto out; @@ -3364,9 +3376,10 @@ out: static int ext4_ext_convert_to_initialized(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, - struct ext4_ext_path *path, + struct ext4_ext_path **ppath, int flags) { + struct ext4_ext_path *path = *ppath; struct ext4_sb_info *sbi; struct ext4_extent_header *eh; struct ext4_map_blocks split_map; @@ -3590,7 +3603,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle, } } - allocated = ext4_split_extent(handle, inode, path, + allocated = ext4_split_extent(handle, inode, ppath, &split_map, split_flag, flags); if (allocated < 0) err = allocated; @@ -3629,9 +3642,10 @@ out: static int ext4_split_convert_extents(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, - struct ext4_ext_path *path, + struct ext4_ext_path **ppath, int flags) { + struct ext4_ext_path *path = *ppath; ext4_lblk_t eof_block; ext4_lblk_t ee_block; struct ext4_extent *ex; @@ -3665,74 +3679,15 @@ static int ext4_split_convert_extents(handle_t *handle, split_flag |= (EXT4_EXT_MARK_UNWRIT2 | EXT4_EXT_DATA_VALID2); } flags |= EXT4_GET_BLOCKS_PRE_IO; - return ext4_split_extent(handle, inode, path, map, split_flag, flags); -} - -static int ext4_convert_initialized_extents(handle_t *handle, - struct inode *inode, - struct ext4_map_blocks *map, - struct ext4_ext_path *path) -{ - struct ext4_extent *ex; - ext4_lblk_t ee_block; - unsigned int ee_len; - int depth; - int err = 0; - - depth = ext_depth(inode); - ex = path[depth].p_ext; - ee_block = le32_to_cpu(ex->ee_block); - ee_len = ext4_ext_get_actual_len(ex); - - ext_debug("%s: inode %lu, logical" - "block %llu, max_blocks %u\n", __func__, inode->i_ino, - (unsigned long long)ee_block, ee_len); - - if (ee_block != map->m_lblk || ee_len > map->m_len) { - err = ext4_split_convert_extents(handle, inode, map, path, - EXT4_GET_BLOCKS_CONVERT_UNWRITTEN); - if (err < 0) - goto out; - ext4_ext_drop_refs(path); - path = ext4_ext_find_extent(inode, map->m_lblk, path, 0); - if (IS_ERR(path)) { - err = PTR_ERR(path); - goto out; - } - depth = ext_depth(inode); - ex = path[depth].p_ext; - if (!ex) { - EXT4_ERROR_INODE(inode, "unexpected hole at %lu", - (unsigned long) map->m_lblk); - err = -EIO; - goto out; - } - } - - err = ext4_ext_get_access(handle, inode, path + depth); - if (err) - goto out; - /* first mark the extent as unwritten */ - ext4_ext_mark_unwritten(ex); - - /* note: ext4_ext_correct_indexes() isn't needed here because - * borders are not changed - */ - ext4_ext_try_to_merge(handle, inode, path, ex); - - /* Mark modified extent as dirty */ - err = ext4_ext_dirty(handle, inode, path + path->p_depth); -out: - ext4_ext_show_leaf(inode, path); - return err; + return ext4_split_extent(handle, inode, ppath, map, split_flag, flags); } - static int ext4_convert_unwritten_extents_endio(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, - struct ext4_ext_path *path) + struct ext4_ext_path **ppath) { + struct ext4_ext_path *path = *ppath; struct ext4_extent *ex; ext4_lblk_t ee_block; unsigned int ee_len; @@ -3761,16 +3716,13 @@ static int ext4_convert_unwritten_extents_endio(handle_t *handle, inode->i_ino, (unsigned long long)ee_block, ee_len, (unsigned long long)map->m_lblk, map->m_len); #endif - err = ext4_split_convert_extents(handle, inode, map, path, + err = ext4_split_convert_extents(handle, inode, map, ppath, EXT4_GET_BLOCKS_CONVERT); if (err < 0) - goto out; - ext4_ext_drop_refs(path); - path = ext4_ext_find_extent(inode, map->m_lblk, path, 0); - if (IS_ERR(path)) { - err = PTR_ERR(path); - goto out; - } + return err; + path = ext4_find_extent(inode, map->m_lblk, ppath, 0); + if (IS_ERR(path)) + return PTR_ERR(path); depth = ext_depth(inode); ex = path[depth].p_ext; } @@ -3963,12 +3915,16 @@ get_reserved_cluster_alloc(struct inode *inode, ext4_lblk_t lblk_start, } static int -ext4_ext_convert_initialized_extent(handle_t *handle, struct inode *inode, - struct ext4_map_blocks *map, - struct ext4_ext_path *path, int flags, - unsigned int allocated, ext4_fsblk_t newblock) +convert_initialized_extent(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, + struct ext4_ext_path **ppath, int flags, + unsigned int allocated, ext4_fsblk_t newblock) { - int ret = 0; + struct ext4_ext_path *path = *ppath; + struct ext4_extent *ex; + ext4_lblk_t ee_block; + unsigned int ee_len; + int depth; int err = 0; /* @@ -3978,28 +3934,67 @@ ext4_ext_convert_initialized_extent(handle_t *handle, struct inode *inode, if (map->m_len > EXT_UNWRITTEN_MAX_LEN) map->m_len = EXT_UNWRITTEN_MAX_LEN / 2; - ret = ext4_convert_initialized_extents(handle, inode, map, - path); - if (ret >= 0) { - ext4_update_inode_fsync_trans(handle, inode, 1); - err = check_eofblocks_fl(handle, inode, map->m_lblk, - path, map->m_len); - } else - err = ret; + depth = ext_depth(inode); + ex = path[depth].p_ext; + ee_block = le32_to_cpu(ex->ee_block); + ee_len = ext4_ext_get_actual_len(ex); + + ext_debug("%s: inode %lu, logical" + "block %llu, max_blocks %u\n", __func__, inode->i_ino, + (unsigned long long)ee_block, ee_len); + + if (ee_block != map->m_lblk || ee_len > map->m_len) { + err = ext4_split_convert_extents(handle, inode, map, ppath, + EXT4_GET_BLOCKS_CONVERT_UNWRITTEN); + if (err < 0) + return err; + path = ext4_find_extent(inode, map->m_lblk, ppath, 0); + if (IS_ERR(path)) + return PTR_ERR(path); + depth = ext_depth(inode); + ex = path[depth].p_ext; + if (!ex) { + EXT4_ERROR_INODE(inode, "unexpected hole at %lu", + (unsigned long) map->m_lblk); + return -EIO; + } + } + + err = ext4_ext_get_access(handle, inode, path + depth); + if (err) + return err; + /* first mark the extent as unwritten */ + ext4_ext_mark_unwritten(ex); + + /* note: ext4_ext_correct_indexes() isn't needed here because + * borders are not changed + */ + ext4_ext_try_to_merge(handle, inode, path, ex); + + /* Mark modified extent as dirty */ + err = ext4_ext_dirty(handle, inode, path + path->p_depth); + if (err) + return err; + ext4_ext_show_leaf(inode, path); + + ext4_update_inode_fsync_trans(handle, inode, 1); + err = check_eofblocks_fl(handle, inode, map->m_lblk, path, map->m_len); + if (err) + return err; map->m_flags |= EXT4_MAP_UNWRITTEN; if (allocated > map->m_len) allocated = map->m_len; map->m_len = allocated; - - return err ? err : allocated; + return allocated; } static int ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, - struct ext4_ext_path *path, int flags, + struct ext4_ext_path **ppath, int flags, unsigned int allocated, ext4_fsblk_t newblock) { + struct ext4_ext_path *path = *ppath; int ret = 0; int err = 0; ext4_io_end_t *io = ext4_inode_aio(inode); @@ -4021,8 +4016,8 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, /* get_block() before submit the IO, split the extent */ if (flags & EXT4_GET_BLOCKS_PRE_IO) { - ret = ext4_split_convert_extents(handle, inode, map, - path, flags | EXT4_GET_BLOCKS_CONVERT); + ret = ext4_split_convert_extents(handle, inode, map, ppath, + flags | EXT4_GET_BLOCKS_CONVERT); if (ret <= 0) goto out; /* @@ -4040,7 +4035,7 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, /* IO end_io complete, convert the filled extent to written */ if (flags & EXT4_GET_BLOCKS_CONVERT) { ret = ext4_convert_unwritten_extents_endio(handle, inode, map, - path); + ppath); if (ret >= 0) { ext4_update_inode_fsync_trans(handle, inode, 1); err = check_eofblocks_fl(handle, inode, map->m_lblk, @@ -4078,7 +4073,7 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, } /* buffered write, writepage time, convert*/ - ret = ext4_ext_convert_to_initialized(handle, inode, map, path, flags); + ret = ext4_ext_convert_to_initialized(handle, inode, map, ppath, flags); if (ret >= 0) ext4_update_inode_fsync_trans(handle, inode, 1); out: @@ -4279,7 +4274,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); /* find extent for this block */ - path = ext4_ext_find_extent(inode, map->m_lblk, NULL, 0); + path = ext4_find_extent(inode, map->m_lblk, NULL, 0); if (IS_ERR(path)) { err = PTR_ERR(path); path = NULL; @@ -4291,7 +4286,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, /* * consistent leaf must not be empty; * this situation is possible, though, _during_ tree modification; - * this is why assert can't be put in ext4_ext_find_extent() + * this is why assert can't be put in ext4_find_extent() */ if (unlikely(path[depth].p_ext == NULL && depth != 0)) { EXT4_ERROR_INODE(inode, "bad extent address " @@ -4331,15 +4326,15 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, */ if ((!ext4_ext_is_unwritten(ex)) && (flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) { - allocated = ext4_ext_convert_initialized_extent( - handle, inode, map, path, flags, - allocated, newblock); + allocated = convert_initialized_extent( + handle, inode, map, &path, + flags, allocated, newblock); goto out2; } else if (!ext4_ext_is_unwritten(ex)) goto out; ret = ext4_ext_handle_unwritten_extents( - handle, inode, map, path, flags, + handle, inode, map, &path, flags, allocated, newblock); if (ret < 0) err = ret; @@ -4376,7 +4371,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, /* * If we are doing bigalloc, check to see if the extent returned - * by ext4_ext_find_extent() implies a cluster we can use. + * by ext4_find_extent() implies a cluster we can use. */ if (cluster_offset && ex && get_implied_cluster_alloc(inode->i_sb, map, ex, path)) { @@ -4451,6 +4446,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, ar.flags = 0; if (flags & EXT4_GET_BLOCKS_NO_NORMALIZE) ar.flags |= EXT4_MB_HINT_NOPREALLOC; + if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) + ar.flags |= EXT4_MB_DELALLOC_RESERVED; newblock = ext4_mb_new_blocks(handle, &ar, &err); if (!newblock) goto out2; @@ -4486,7 +4483,7 @@ got_allocated_blocks: err = check_eofblocks_fl(handle, inode, map->m_lblk, path, ar.len); if (!err) - err = ext4_ext_insert_extent(handle, inode, path, + err = ext4_ext_insert_extent(handle, inode, &path, &newex, flags); if (!err && set_unwritten) { @@ -4619,10 +4616,8 @@ out: map->m_pblk = newblock; map->m_len = allocated; out2: - if (path) { - ext4_ext_drop_refs(path); - kfree(path); - } + ext4_ext_drop_refs(path); + kfree(path); trace_ext4_ext_map_blocks_exit(inode, flags, map, err ? err : allocated); @@ -4799,7 +4794,8 @@ static long ext4_zero_range(struct file *file, loff_t offset, max_blocks -= lblk; flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT | - EXT4_GET_BLOCKS_CONVERT_UNWRITTEN; + EXT4_GET_BLOCKS_CONVERT_UNWRITTEN | + EXT4_EX_NOCACHE; if (mode & FALLOC_FL_KEEP_SIZE) flags |= EXT4_GET_BLOCKS_KEEP_SIZE; @@ -4837,15 +4833,21 @@ static long ext4_zero_range(struct file *file, loff_t offset, ext4_inode_block_unlocked_dio(inode); inode_dio_wait(inode); + ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, + flags, mode); + if (ret) + goto out_dio; /* * Remove entire range from the extent status tree. + * + * ext4_es_remove_extent(inode, lblk, max_blocks) is + * NOT sufficient. I'm not sure why this is the case, + * but let's be conservative and remove the extent + * status tree for the entire inode. There should be + * no outstanding delalloc extents thanks to the + * filemap_write_and_wait_range() call above. */ - ret = ext4_es_remove_extent(inode, lblk, max_blocks); - if (ret) - goto out_dio; - - ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, - flags, mode); + ret = ext4_es_remove_extent(inode, 0, EXT_MAX_BLOCKS); if (ret) goto out_dio; } @@ -5304,36 +5306,31 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, struct ext4_ext_path *path; int ret = 0, depth; struct ext4_extent *extent; - ext4_lblk_t stop_block, current_block; + ext4_lblk_t stop_block; ext4_lblk_t ex_start, ex_end; /* Let path point to the last extent */ - path = ext4_ext_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 0); + path = ext4_find_extent(inode, EXT_MAX_BLOCKS - 1, NULL, 0); if (IS_ERR(path)) return PTR_ERR(path); depth = path->p_depth; extent = path[depth].p_ext; - if (!extent) { - ext4_ext_drop_refs(path); - kfree(path); - return ret; - } + if (!extent) + goto out; stop_block = le32_to_cpu(extent->ee_block) + ext4_ext_get_actual_len(extent); - ext4_ext_drop_refs(path); - kfree(path); /* Nothing to shift, if hole is at the end of file */ if (start >= stop_block) - return ret; + goto out; /* * Don't start shifting extents until we make sure the hole is big * enough to accomodate the shift. */ - path = ext4_ext_find_extent(inode, start - 1, NULL, 0); + path = ext4_find_extent(inode, start - 1, &path, 0); if (IS_ERR(path)) return PTR_ERR(path); depth = path->p_depth; @@ -5346,8 +5343,6 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, ex_start = 0; ex_end = 0; } - ext4_ext_drop_refs(path); - kfree(path); if ((start == ex_start && shift > ex_start) || (shift > start - ex_end)) @@ -5355,7 +5350,7 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, /* Its safe to start updating extents */ while (start < stop_block) { - path = ext4_ext_find_extent(inode, start, NULL, 0); + path = ext4_find_extent(inode, start, &path, 0); if (IS_ERR(path)) return PTR_ERR(path); depth = path->p_depth; @@ -5365,27 +5360,23 @@ ext4_ext_shift_extents(struct inode *inode, handle_t *handle, (unsigned long) start); return -EIO; } - - current_block = le32_to_cpu(extent->ee_block); - if (start > current_block) { + if (start > le32_to_cpu(extent->ee_block)) { /* Hole, move to the next extent */ - ret = mext_next_extent(inode, path, &extent); - if (ret != 0) { - ext4_ext_drop_refs(path); - kfree(path); - if (ret == 1) - ret = 0; - break; + if (extent < EXT_LAST_EXTENT(path[depth].p_hdr)) { + path[depth].p_ext++; + } else { + start = ext4_ext_next_allocated_block(path); + continue; } } ret = ext4_ext_shift_path_extents(path, shift, inode, handle, &start); - ext4_ext_drop_refs(path); - kfree(path); if (ret) break; } - +out: + ext4_ext_drop_refs(path); + kfree(path); return ret; } @@ -5508,3 +5499,199 @@ out_mutex: mutex_unlock(&inode->i_mutex); return ret; } + +/** + * ext4_swap_extents - Swap extents between two inodes + * + * @inode1: First inode + * @inode2: Second inode + * @lblk1: Start block for first inode + * @lblk2: Start block for second inode + * @count: Number of blocks to swap + * @mark_unwritten: Mark second inode's extents as unwritten after swap + * @erp: Pointer to save error value + * + * This helper routine does exactly what is promise "swap extents". All other + * stuff such as page-cache locking consistency, bh mapping consistency or + * extent's data copying must be performed by caller. + * Locking: + * i_mutex is held for both inodes + * i_data_sem is locked for write for both inodes + * Assumptions: + * All pages from requested range are locked for both inodes + */ +int +ext4_swap_extents(handle_t *handle, struct inode *inode1, + struct inode *inode2, ext4_lblk_t lblk1, ext4_lblk_t lblk2, + ext4_lblk_t count, int unwritten, int *erp) +{ + struct ext4_ext_path *path1 = NULL; + struct ext4_ext_path *path2 = NULL; + int replaced_count = 0; + + BUG_ON(!rwsem_is_locked(&EXT4_I(inode1)->i_data_sem)); + BUG_ON(!rwsem_is_locked(&EXT4_I(inode2)->i_data_sem)); + BUG_ON(!mutex_is_locked(&inode1->i_mutex)); + BUG_ON(!mutex_is_locked(&inode1->i_mutex)); + + *erp = ext4_es_remove_extent(inode1, lblk1, count); + if (unlikely(*erp)) + return 0; + *erp = ext4_es_remove_extent(inode2, lblk2, count); + if (unlikely(*erp)) + return 0; + + while (count) { + struct ext4_extent *ex1, *ex2, tmp_ex; + ext4_lblk_t e1_blk, e2_blk; + int e1_len, e2_len, len; + int split = 0; + + path1 = ext4_find_extent(inode1, lblk1, NULL, EXT4_EX_NOCACHE); + if (unlikely(IS_ERR(path1))) { + *erp = PTR_ERR(path1); + path1 = NULL; + finish: + count = 0; + goto repeat; + } + path2 = ext4_find_extent(inode2, lblk2, NULL, EXT4_EX_NOCACHE); + if (unlikely(IS_ERR(path2))) { + *erp = PTR_ERR(path2); + path2 = NULL; + goto finish; + } + ex1 = path1[path1->p_depth].p_ext; + ex2 = path2[path2->p_depth].p_ext; + /* Do we have somthing to swap ? */ + if (unlikely(!ex2 || !ex1)) + goto finish; + + e1_blk = le32_to_cpu(ex1->ee_block); + e2_blk = le32_to_cpu(ex2->ee_block); + e1_len = ext4_ext_get_actual_len(ex1); + e2_len = ext4_ext_get_actual_len(ex2); + + /* Hole handling */ + if (!in_range(lblk1, e1_blk, e1_len) || + !in_range(lblk2, e2_blk, e2_len)) { + ext4_lblk_t next1, next2; + + /* if hole after extent, then go to next extent */ + next1 = ext4_ext_next_allocated_block(path1); + next2 = ext4_ext_next_allocated_block(path2); + /* If hole before extent, then shift to that extent */ + if (e1_blk > lblk1) + next1 = e1_blk; + if (e2_blk > lblk2) + next2 = e1_blk; + /* Do we have something to swap */ + if (next1 == EXT_MAX_BLOCKS || next2 == EXT_MAX_BLOCKS) + goto finish; + /* Move to the rightest boundary */ + len = next1 - lblk1; + if (len < next2 - lblk2) + len = next2 - lblk2; + if (len > count) + len = count; + lblk1 += len; + lblk2 += len; + count -= len; + goto repeat; + } + + /* Prepare left boundary */ + if (e1_blk < lblk1) { + split = 1; + *erp = ext4_force_split_extent_at(handle, inode1, + &path1, lblk1, 0); + if (unlikely(*erp)) + goto finish; + } + if (e2_blk < lblk2) { + split = 1; + *erp = ext4_force_split_extent_at(handle, inode2, + &path2, lblk2, 0); + if (unlikely(*erp)) + goto finish; + } + /* ext4_split_extent_at() may result in leaf extent split, + * path must to be revalidated. */ + if (split) + goto repeat; + + /* Prepare right boundary */ + len = count; + if (len > e1_blk + e1_len - lblk1) + len = e1_blk + e1_len - lblk1; + if (len > e2_blk + e2_len - lblk2) + len = e2_blk + e2_len - lblk2; + + if (len != e1_len) { + split = 1; + *erp = ext4_force_split_extent_at(handle, inode1, + &path1, lblk1 + len, 0); + if (unlikely(*erp)) + goto finish; + } + if (len != e2_len) { + split = 1; + *erp = ext4_force_split_extent_at(handle, inode2, + &path2, lblk2 + len, 0); + if (*erp) + goto finish; + } + /* ext4_split_extent_at() may result in leaf extent split, + * path must to be revalidated. */ + if (split) + goto repeat; + + BUG_ON(e2_len != e1_len); + *erp = ext4_ext_get_access(handle, inode1, path1 + path1->p_depth); + if (unlikely(*erp)) + goto finish; + *erp = ext4_ext_get_access(handle, inode2, path2 + path2->p_depth); + if (unlikely(*erp)) + goto finish; + + /* Both extents are fully inside boundaries. Swap it now */ + tmp_ex = *ex1; + ext4_ext_store_pblock(ex1, ext4_ext_pblock(ex2)); + ext4_ext_store_pblock(ex2, ext4_ext_pblock(&tmp_ex)); + ex1->ee_len = cpu_to_le16(e2_len); + ex2->ee_len = cpu_to_le16(e1_len); + if (unwritten) + ext4_ext_mark_unwritten(ex2); + if (ext4_ext_is_unwritten(&tmp_ex)) + ext4_ext_mark_unwritten(ex1); + + ext4_ext_try_to_merge(handle, inode2, path2, ex2); + ext4_ext_try_to_merge(handle, inode1, path1, ex1); + *erp = ext4_ext_dirty(handle, inode2, path2 + + path2->p_depth); + if (unlikely(*erp)) + goto finish; + *erp = ext4_ext_dirty(handle, inode1, path1 + + path1->p_depth); + /* + * Looks scarry ah..? second inode already points to new blocks, + * and it was successfully dirtied. But luckily error may happen + * only due to journal error, so full transaction will be + * aborted anyway. + */ + if (unlikely(*erp)) + goto finish; + lblk1 += len; + lblk2 += len; + replaced_count += len; + count -= len; + + repeat: + ext4_ext_drop_refs(path1); + kfree(path1); + ext4_ext_drop_refs(path2); + kfree(path2); + path1 = path2 = NULL; + } + return replaced_count; +} diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index 0b7e28e7eaa4..94e7855ae71b 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -11,6 +11,8 @@ */ #include <linux/rbtree.h> #include <linux/list_sort.h> +#include <linux/proc_fs.h> +#include <linux/seq_file.h> #include "ext4.h" #include "extents_status.h" @@ -313,19 +315,27 @@ ext4_es_alloc_extent(struct inode *inode, ext4_lblk_t lblk, ext4_lblk_t len, */ if (!ext4_es_is_delayed(es)) { EXT4_I(inode)->i_es_lru_nr++; - percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_extent_cache_cnt); + percpu_counter_inc(&EXT4_SB(inode->i_sb)-> + s_es_stats.es_stats_lru_cnt); } + EXT4_I(inode)->i_es_all_nr++; + percpu_counter_inc(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt); + return es; } static void ext4_es_free_extent(struct inode *inode, struct extent_status *es) { + EXT4_I(inode)->i_es_all_nr--; + percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_es_stats.es_stats_all_cnt); + /* Decrease the lru counter when this es is not delayed */ if (!ext4_es_is_delayed(es)) { BUG_ON(EXT4_I(inode)->i_es_lru_nr == 0); EXT4_I(inode)->i_es_lru_nr--; - percpu_counter_dec(&EXT4_SB(inode->i_sb)->s_extent_cache_cnt); + percpu_counter_dec(&EXT4_SB(inode->i_sb)-> + s_es_stats.es_stats_lru_cnt); } kmem_cache_free(ext4_es_cachep, es); @@ -426,7 +436,7 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode, unsigned short ee_len; int depth, ee_status, es_status; - path = ext4_ext_find_extent(inode, es->es_lblk, NULL, EXT4_EX_NOCACHE); + path = ext4_find_extent(inode, es->es_lblk, NULL, EXT4_EX_NOCACHE); if (IS_ERR(path)) return; @@ -499,10 +509,8 @@ static void ext4_es_insert_extent_ext_check(struct inode *inode, } } out: - if (path) { - ext4_ext_drop_refs(path); - kfree(path); - } + ext4_ext_drop_refs(path); + kfree(path); } static void ext4_es_insert_extent_ind_check(struct inode *inode, @@ -731,6 +739,7 @@ int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, struct extent_status *es) { struct ext4_es_tree *tree; + struct ext4_es_stats *stats; struct extent_status *es1 = NULL; struct rb_node *node; int found = 0; @@ -767,11 +776,15 @@ int ext4_es_lookup_extent(struct inode *inode, ext4_lblk_t lblk, } out: + stats = &EXT4_SB(inode->i_sb)->s_es_stats; if (found) { BUG_ON(!es1); es->es_lblk = es1->es_lblk; es->es_len = es1->es_len; es->es_pblk = es1->es_pblk; + stats->es_stats_cache_hits++; + } else { + stats->es_stats_cache_misses++; } read_unlock(&EXT4_I(inode)->i_es_lock); @@ -933,11 +946,16 @@ static int __ext4_es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, struct ext4_inode_info *locked_ei) { struct ext4_inode_info *ei; + struct ext4_es_stats *es_stats; struct list_head *cur, *tmp; LIST_HEAD(skipped); + ktime_t start_time; + u64 scan_time; int nr_shrunk = 0; int retried = 0, skip_precached = 1, nr_skipped = 0; + es_stats = &sbi->s_es_stats; + start_time = ktime_get(); spin_lock(&sbi->s_es_lru_lock); retry: @@ -948,7 +966,8 @@ retry: * If we have already reclaimed all extents from extent * status tree, just stop the loop immediately. */ - if (percpu_counter_read_positive(&sbi->s_extent_cache_cnt) == 0) + if (percpu_counter_read_positive( + &es_stats->es_stats_lru_cnt) == 0) break; ei = list_entry(cur, struct ext4_inode_info, i_es_lru); @@ -958,7 +977,7 @@ retry: * time. Normally we try hard to avoid shrinking * precached inodes, but we will as a last resort. */ - if ((sbi->s_es_last_sorted < ei->i_touch_when) || + if ((es_stats->es_stats_last_sorted < ei->i_touch_when) || (skip_precached && ext4_test_inode_state(&ei->vfs_inode, EXT4_STATE_EXT_PRECACHED))) { nr_skipped++; @@ -992,7 +1011,7 @@ retry: if ((nr_shrunk == 0) && nr_skipped && !retried) { retried++; list_sort(NULL, &sbi->s_es_lru, ext4_inode_touch_time_cmp); - sbi->s_es_last_sorted = jiffies; + es_stats->es_stats_last_sorted = jiffies; ei = list_first_entry(&sbi->s_es_lru, struct ext4_inode_info, i_es_lru); /* @@ -1010,6 +1029,22 @@ retry: if (locked_ei && nr_shrunk == 0) nr_shrunk = __es_try_to_reclaim_extents(locked_ei, nr_to_scan); + scan_time = ktime_to_ns(ktime_sub(ktime_get(), start_time)); + if (likely(es_stats->es_stats_scan_time)) + es_stats->es_stats_scan_time = (scan_time + + es_stats->es_stats_scan_time*3) / 4; + else + es_stats->es_stats_scan_time = scan_time; + if (scan_time > es_stats->es_stats_max_scan_time) + es_stats->es_stats_max_scan_time = scan_time; + if (likely(es_stats->es_stats_shrunk)) + es_stats->es_stats_shrunk = (nr_shrunk + + es_stats->es_stats_shrunk*3) / 4; + else + es_stats->es_stats_shrunk = nr_shrunk; + + trace_ext4_es_shrink(sbi->s_sb, nr_shrunk, scan_time, skip_precached, + nr_skipped, retried); return nr_shrunk; } @@ -1020,8 +1055,8 @@ static unsigned long ext4_es_count(struct shrinker *shrink, struct ext4_sb_info *sbi; sbi = container_of(shrink, struct ext4_sb_info, s_es_shrinker); - nr = percpu_counter_read_positive(&sbi->s_extent_cache_cnt); - trace_ext4_es_shrink_enter(sbi->s_sb, sc->nr_to_scan, nr); + nr = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_lru_cnt); + trace_ext4_es_shrink_count(sbi->s_sb, sc->nr_to_scan, nr); return nr; } @@ -1033,31 +1068,160 @@ static unsigned long ext4_es_scan(struct shrinker *shrink, int nr_to_scan = sc->nr_to_scan; int ret, nr_shrunk; - ret = percpu_counter_read_positive(&sbi->s_extent_cache_cnt); - trace_ext4_es_shrink_enter(sbi->s_sb, nr_to_scan, ret); + ret = percpu_counter_read_positive(&sbi->s_es_stats.es_stats_lru_cnt); + trace_ext4_es_shrink_scan_enter(sbi->s_sb, nr_to_scan, ret); if (!nr_to_scan) return ret; nr_shrunk = __ext4_es_shrink(sbi, nr_to_scan, NULL); - trace_ext4_es_shrink_exit(sbi->s_sb, nr_shrunk, ret); + trace_ext4_es_shrink_scan_exit(sbi->s_sb, nr_shrunk, ret); return nr_shrunk; } -void ext4_es_register_shrinker(struct ext4_sb_info *sbi) +static void *ext4_es_seq_shrinker_info_start(struct seq_file *seq, loff_t *pos) { + return *pos ? NULL : SEQ_START_TOKEN; +} + +static void * +ext4_es_seq_shrinker_info_next(struct seq_file *seq, void *v, loff_t *pos) +{ + return NULL; +} + +static int ext4_es_seq_shrinker_info_show(struct seq_file *seq, void *v) +{ + struct ext4_sb_info *sbi = seq->private; + struct ext4_es_stats *es_stats = &sbi->s_es_stats; + struct ext4_inode_info *ei, *max = NULL; + unsigned int inode_cnt = 0; + + if (v != SEQ_START_TOKEN) + return 0; + + /* here we just find an inode that has the max nr. of objects */ + spin_lock(&sbi->s_es_lru_lock); + list_for_each_entry(ei, &sbi->s_es_lru, i_es_lru) { + inode_cnt++; + if (max && max->i_es_all_nr < ei->i_es_all_nr) + max = ei; + else if (!max) + max = ei; + } + spin_unlock(&sbi->s_es_lru_lock); + + seq_printf(seq, "stats:\n %lld objects\n %lld reclaimable objects\n", + percpu_counter_sum_positive(&es_stats->es_stats_all_cnt), + percpu_counter_sum_positive(&es_stats->es_stats_lru_cnt)); + seq_printf(seq, " %lu/%lu cache hits/misses\n", + es_stats->es_stats_cache_hits, + es_stats->es_stats_cache_misses); + if (es_stats->es_stats_last_sorted != 0) + seq_printf(seq, " %u ms last sorted interval\n", + jiffies_to_msecs(jiffies - + es_stats->es_stats_last_sorted)); + if (inode_cnt) + seq_printf(seq, " %d inodes on lru list\n", inode_cnt); + + seq_printf(seq, "average:\n %llu us scan time\n", + div_u64(es_stats->es_stats_scan_time, 1000)); + seq_printf(seq, " %lu shrunk objects\n", es_stats->es_stats_shrunk); + if (inode_cnt) + seq_printf(seq, + "maximum:\n %lu inode (%u objects, %u reclaimable)\n" + " %llu us max scan time\n", + max->vfs_inode.i_ino, max->i_es_all_nr, max->i_es_lru_nr, + div_u64(es_stats->es_stats_max_scan_time, 1000)); + + return 0; +} + +static void ext4_es_seq_shrinker_info_stop(struct seq_file *seq, void *v) +{ +} + +static const struct seq_operations ext4_es_seq_shrinker_info_ops = { + .start = ext4_es_seq_shrinker_info_start, + .next = ext4_es_seq_shrinker_info_next, + .stop = ext4_es_seq_shrinker_info_stop, + .show = ext4_es_seq_shrinker_info_show, +}; + +static int +ext4_es_seq_shrinker_info_open(struct inode *inode, struct file *file) +{ + int ret; + + ret = seq_open(file, &ext4_es_seq_shrinker_info_ops); + if (!ret) { + struct seq_file *m = file->private_data; + m->private = PDE_DATA(inode); + } + + return ret; +} + +static int +ext4_es_seq_shrinker_info_release(struct inode *inode, struct file *file) +{ + return seq_release(inode, file); +} + +static const struct file_operations ext4_es_seq_shrinker_info_fops = { + .owner = THIS_MODULE, + .open = ext4_es_seq_shrinker_info_open, + .read = seq_read, + .llseek = seq_lseek, + .release = ext4_es_seq_shrinker_info_release, +}; + +int ext4_es_register_shrinker(struct ext4_sb_info *sbi) +{ + int err; + INIT_LIST_HEAD(&sbi->s_es_lru); spin_lock_init(&sbi->s_es_lru_lock); - sbi->s_es_last_sorted = 0; + sbi->s_es_stats.es_stats_last_sorted = 0; + sbi->s_es_stats.es_stats_shrunk = 0; + sbi->s_es_stats.es_stats_cache_hits = 0; + sbi->s_es_stats.es_stats_cache_misses = 0; + sbi->s_es_stats.es_stats_scan_time = 0; + sbi->s_es_stats.es_stats_max_scan_time = 0; + err = percpu_counter_init(&sbi->s_es_stats.es_stats_all_cnt, 0, GFP_KERNEL); + if (err) + return err; + err = percpu_counter_init(&sbi->s_es_stats.es_stats_lru_cnt, 0, GFP_KERNEL); + if (err) + goto err1; + sbi->s_es_shrinker.scan_objects = ext4_es_scan; sbi->s_es_shrinker.count_objects = ext4_es_count; sbi->s_es_shrinker.seeks = DEFAULT_SEEKS; - register_shrinker(&sbi->s_es_shrinker); + err = register_shrinker(&sbi->s_es_shrinker); + if (err) + goto err2; + + if (sbi->s_proc) + proc_create_data("es_shrinker_info", S_IRUGO, sbi->s_proc, + &ext4_es_seq_shrinker_info_fops, sbi); + + return 0; + +err2: + percpu_counter_destroy(&sbi->s_es_stats.es_stats_lru_cnt); +err1: + percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt); + return err; } void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi) { + if (sbi->s_proc) + remove_proc_entry("es_shrinker_info", sbi->s_proc); + percpu_counter_destroy(&sbi->s_es_stats.es_stats_all_cnt); + percpu_counter_destroy(&sbi->s_es_stats.es_stats_lru_cnt); unregister_shrinker(&sbi->s_es_shrinker); } diff --git a/fs/ext4/extents_status.h b/fs/ext4/extents_status.h index f1b62a419920..efd5f970b501 100644 --- a/fs/ext4/extents_status.h +++ b/fs/ext4/extents_status.h @@ -64,6 +64,17 @@ struct ext4_es_tree { struct extent_status *cache_es; /* recently accessed extent */ }; +struct ext4_es_stats { + unsigned long es_stats_last_sorted; + unsigned long es_stats_shrunk; + unsigned long es_stats_cache_hits; + unsigned long es_stats_cache_misses; + u64 es_stats_scan_time; + u64 es_stats_max_scan_time; + struct percpu_counter es_stats_all_cnt; + struct percpu_counter es_stats_lru_cnt; +}; + extern int __init ext4_init_es(void); extern void ext4_exit_es(void); extern void ext4_es_init_tree(struct ext4_es_tree *tree); @@ -138,7 +149,7 @@ static inline void ext4_es_store_pblock_status(struct extent_status *es, (pb & ~ES_MASK)); } -extern void ext4_es_register_shrinker(struct ext4_sb_info *sbi); +extern int ext4_es_register_shrinker(struct ext4_sb_info *sbi); extern void ext4_es_unregister_shrinker(struct ext4_sb_info *sbi); extern void ext4_es_lru_add(struct inode *inode); extern void ext4_es_lru_del(struct inode *inode); diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 5b87fc36aab8..8012a5daf401 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1011,8 +1011,7 @@ got: spin_unlock(&sbi->s_next_gen_lock); /* Precompute checksum seed for inode metadata */ - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) { + if (ext4_has_metadata_csum(sb)) { __u32 csum; __le32 inum = cpu_to_le32(inode->i_ino); __le32 gen = cpu_to_le32(inode->i_generation); diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index e75f840000a0..36b369697a13 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -318,34 +318,24 @@ static int ext4_blks_to_allocate(Indirect *branch, int k, unsigned int blks, * ext4_alloc_block() (normally -ENOSPC). Otherwise we set the chain * as described above and return 0. */ -static int ext4_alloc_branch(handle_t *handle, struct inode *inode, - ext4_lblk_t iblock, int indirect_blks, - int *blks, ext4_fsblk_t goal, - ext4_lblk_t *offsets, Indirect *branch) +static int ext4_alloc_branch(handle_t *handle, + struct ext4_allocation_request *ar, + int indirect_blks, ext4_lblk_t *offsets, + Indirect *branch) { - struct ext4_allocation_request ar; struct buffer_head * bh; ext4_fsblk_t b, new_blocks[4]; __le32 *p; int i, j, err, len = 1; - /* - * Set up for the direct block allocation - */ - memset(&ar, 0, sizeof(ar)); - ar.inode = inode; - ar.len = *blks; - ar.logical = iblock; - if (S_ISREG(inode->i_mode)) - ar.flags = EXT4_MB_HINT_DATA; - for (i = 0; i <= indirect_blks; i++) { if (i == indirect_blks) { - ar.goal = goal; - new_blocks[i] = ext4_mb_new_blocks(handle, &ar, &err); + new_blocks[i] = ext4_mb_new_blocks(handle, ar, &err); } else - goal = new_blocks[i] = ext4_new_meta_blocks(handle, inode, - goal, 0, NULL, &err); + ar->goal = new_blocks[i] = ext4_new_meta_blocks(handle, + ar->inode, ar->goal, + ar->flags & EXT4_MB_DELALLOC_RESERVED, + NULL, &err); if (err) { i--; goto failed; @@ -354,7 +344,7 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode, if (i == 0) continue; - bh = branch[i].bh = sb_getblk(inode->i_sb, new_blocks[i-1]); + bh = branch[i].bh = sb_getblk(ar->inode->i_sb, new_blocks[i-1]); if (unlikely(!bh)) { err = -ENOMEM; goto failed; @@ -372,7 +362,7 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode, b = new_blocks[i]; if (i == indirect_blks) - len = ar.len; + len = ar->len; for (j = 0; j < len; j++) *p++ = cpu_to_le32(b++); @@ -381,11 +371,10 @@ static int ext4_alloc_branch(handle_t *handle, struct inode *inode, unlock_buffer(bh); BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_metadata(handle, inode, bh); + err = ext4_handle_dirty_metadata(handle, ar->inode, bh); if (err) goto failed; } - *blks = ar.len; return 0; failed: for (; i >= 0; i--) { @@ -396,10 +385,10 @@ failed: * existing before ext4_alloc_branch() was called. */ if (i > 0 && i != indirect_blks && branch[i].bh) - ext4_forget(handle, 1, inode, branch[i].bh, + ext4_forget(handle, 1, ar->inode, branch[i].bh, branch[i].bh->b_blocknr); - ext4_free_blocks(handle, inode, NULL, new_blocks[i], - (i == indirect_blks) ? ar.len : 1, 0); + ext4_free_blocks(handle, ar->inode, NULL, new_blocks[i], + (i == indirect_blks) ? ar->len : 1, 0); } return err; } @@ -419,9 +408,9 @@ failed: * inode (->i_blocks, etc.). In case of success we end up with the full * chain to new block and return 0. */ -static int ext4_splice_branch(handle_t *handle, struct inode *inode, - ext4_lblk_t block, Indirect *where, int num, - int blks) +static int ext4_splice_branch(handle_t *handle, + struct ext4_allocation_request *ar, + Indirect *where, int num) { int i; int err = 0; @@ -446,9 +435,9 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode, * Update the host buffer_head or inode to point to more just allocated * direct blocks blocks */ - if (num == 0 && blks > 1) { + if (num == 0 && ar->len > 1) { current_block = le32_to_cpu(where->key) + 1; - for (i = 1; i < blks; i++) + for (i = 1; i < ar->len; i++) *(where->p + i) = cpu_to_le32(current_block++); } @@ -465,14 +454,14 @@ static int ext4_splice_branch(handle_t *handle, struct inode *inode, */ jbd_debug(5, "splicing indirect only\n"); BUFFER_TRACE(where->bh, "call ext4_handle_dirty_metadata"); - err = ext4_handle_dirty_metadata(handle, inode, where->bh); + err = ext4_handle_dirty_metadata(handle, ar->inode, where->bh); if (err) goto err_out; } else { /* * OK, we spliced it into the inode itself on a direct block. */ - ext4_mark_inode_dirty(handle, inode); + ext4_mark_inode_dirty(handle, ar->inode); jbd_debug(5, "splicing direct\n"); } return err; @@ -484,11 +473,11 @@ err_out: * need to revoke the block, which is why we don't * need to set EXT4_FREE_BLOCKS_METADATA. */ - ext4_free_blocks(handle, inode, where[i].bh, 0, 1, + ext4_free_blocks(handle, ar->inode, where[i].bh, 0, 1, EXT4_FREE_BLOCKS_FORGET); } - ext4_free_blocks(handle, inode, NULL, le32_to_cpu(where[num].key), - blks, 0); + ext4_free_blocks(handle, ar->inode, NULL, le32_to_cpu(where[num].key), + ar->len, 0); return err; } @@ -525,11 +514,11 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags) { + struct ext4_allocation_request ar; int err = -EIO; ext4_lblk_t offsets[4]; Indirect chain[4]; Indirect *partial; - ext4_fsblk_t goal; int indirect_blks; int blocks_to_boundary = 0; int depth; @@ -579,7 +568,16 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, return -ENOSPC; } - goal = ext4_find_goal(inode, map->m_lblk, partial); + /* Set up for the direct block allocation */ + memset(&ar, 0, sizeof(ar)); + ar.inode = inode; + ar.logical = map->m_lblk; + if (S_ISREG(inode->i_mode)) + ar.flags = EXT4_MB_HINT_DATA; + if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) + ar.flags |= EXT4_MB_DELALLOC_RESERVED; + + ar.goal = ext4_find_goal(inode, map->m_lblk, partial); /* the number of blocks need to allocate for [d,t]indirect blocks */ indirect_blks = (chain + depth) - partial - 1; @@ -588,13 +586,13 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, * Next look up the indirect map to count the totoal number of * direct blocks to allocate for this branch. */ - count = ext4_blks_to_allocate(partial, indirect_blks, - map->m_len, blocks_to_boundary); + ar.len = ext4_blks_to_allocate(partial, indirect_blks, + map->m_len, blocks_to_boundary); + /* * Block out ext4_truncate while we alter the tree */ - err = ext4_alloc_branch(handle, inode, map->m_lblk, indirect_blks, - &count, goal, + err = ext4_alloc_branch(handle, &ar, indirect_blks, offsets + (partial - chain), partial); /* @@ -605,14 +603,14 @@ int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, * may need to return -EAGAIN upwards in the worst case. --sct */ if (!err) - err = ext4_splice_branch(handle, inode, map->m_lblk, - partial, indirect_blks, count); + err = ext4_splice_branch(handle, &ar, partial, indirect_blks); if (err) goto cleanup; map->m_flags |= EXT4_MAP_NEW; ext4_update_inode_fsync_trans(handle, inode, 1); + count = ar.len; got_it: map->m_flags |= EXT4_MAP_MAPPED; map->m_pblk = le32_to_cpu(chain[depth-1].key); diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index bea662bd0ca6..3ea62695abce 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -594,6 +594,7 @@ retry: if (ret) { unlock_page(page); page_cache_release(page); + page = NULL; ext4_orphan_add(handle, inode); up_write(&EXT4_I(inode)->xattr_sem); sem_held = 0; @@ -613,7 +614,8 @@ retry: if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; - block_commit_write(page, from, to); + if (page) + block_commit_write(page, from, to); out: if (page) { unlock_page(page); @@ -1126,8 +1128,7 @@ static int ext4_finish_convert_inline_dir(handle_t *handle, memcpy((void *)de, buf + EXT4_INLINE_DOTDOT_SIZE, inline_size - EXT4_INLINE_DOTDOT_SIZE); - if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + if (ext4_has_metadata_csum(inode->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); inode->i_size = inode->i_sb->s_blocksize; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 3aa26e9117c4..e9777f93cf05 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -83,8 +83,7 @@ static int ext4_inode_csum_verify(struct inode *inode, struct ext4_inode *raw, if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != cpu_to_le32(EXT4_OS_LINUX) || - !EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + !ext4_has_metadata_csum(inode->i_sb)) return 1; provided = le16_to_cpu(raw->i_checksum_lo); @@ -105,8 +104,7 @@ static void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw, if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != cpu_to_le32(EXT4_OS_LINUX) || - !EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + !ext4_has_metadata_csum(inode->i_sb)) return; csum = ext4_inode_csum(inode, raw, ei); @@ -224,16 +222,15 @@ void ext4_evict_inode(struct inode *inode) goto no_delete; } - if (!is_bad_inode(inode)) - dquot_initialize(inode); + if (is_bad_inode(inode)) + goto no_delete; + dquot_initialize(inode); if (ext4_should_order_data(inode)) ext4_begin_ordered_truncate(inode, 0); truncate_inode_pages_final(&inode->i_data); WARN_ON(atomic_read(&EXT4_I(inode)->i_ioend_count)); - if (is_bad_inode(inode)) - goto no_delete; /* * Protect us against freezing - iput() caller didn't have to have any @@ -590,20 +587,12 @@ found: /* * New blocks allocate and/or writing to unwritten extent * will possibly result in updating i_data, so we take - * the write lock of i_data_sem, and call get_blocks() + * the write lock of i_data_sem, and call get_block() * with create == 1 flag. */ down_write(&EXT4_I(inode)->i_data_sem); /* - * if the caller is from delayed allocation writeout path - * we have already reserved fs blocks for allocation - * let the underlying get_block() function know to - * avoid double accounting - */ - if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) - ext4_set_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED); - /* * We need to check for EXT4 here because migrate * could have changed the inode type in between */ @@ -631,8 +620,6 @@ found: (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE)) ext4_da_update_reserve_space(inode, retval, 1); } - if (flags & EXT4_GET_BLOCKS_DELALLOC_RESERVE) - ext4_clear_inode_state(inode, EXT4_STATE_DELALLOC_RESERVED); if (retval > 0) { unsigned int status; @@ -734,11 +721,11 @@ int ext4_get_block(struct inode *inode, sector_t iblock, * `handle' can be NULL if create is zero */ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, - ext4_lblk_t block, int create, int *errp) + ext4_lblk_t block, int create) { struct ext4_map_blocks map; struct buffer_head *bh; - int fatal = 0, err; + int err; J_ASSERT(handle != NULL || create == 0); @@ -747,21 +734,14 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, err = ext4_map_blocks(handle, inode, &map, create ? EXT4_GET_BLOCKS_CREATE : 0); - /* ensure we send some value back into *errp */ - *errp = 0; - - if (create && err == 0) - err = -ENOSPC; /* should never happen */ + if (err == 0) + return create ? ERR_PTR(-ENOSPC) : NULL; if (err < 0) - *errp = err; - if (err <= 0) - return NULL; + return ERR_PTR(err); bh = sb_getblk(inode->i_sb, map.m_pblk); - if (unlikely(!bh)) { - *errp = -ENOMEM; - return NULL; - } + if (unlikely(!bh)) + return ERR_PTR(-ENOMEM); if (map.m_flags & EXT4_MAP_NEW) { J_ASSERT(create != 0); J_ASSERT(handle != NULL); @@ -775,44 +755,44 @@ struct buffer_head *ext4_getblk(handle_t *handle, struct inode *inode, */ lock_buffer(bh); BUFFER_TRACE(bh, "call get_create_access"); - fatal = ext4_journal_get_create_access(handle, bh); - if (!fatal && !buffer_uptodate(bh)) { + err = ext4_journal_get_create_access(handle, bh); + if (unlikely(err)) { + unlock_buffer(bh); + goto errout; + } + if (!buffer_uptodate(bh)) { memset(bh->b_data, 0, inode->i_sb->s_blocksize); set_buffer_uptodate(bh); } unlock_buffer(bh); BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); err = ext4_handle_dirty_metadata(handle, inode, bh); - if (!fatal) - fatal = err; - } else { + if (unlikely(err)) + goto errout; + } else BUFFER_TRACE(bh, "not a new buffer"); - } - if (fatal) { - *errp = fatal; - brelse(bh); - bh = NULL; - } return bh; +errout: + brelse(bh); + return ERR_PTR(err); } struct buffer_head *ext4_bread(handle_t *handle, struct inode *inode, - ext4_lblk_t block, int create, int *err) + ext4_lblk_t block, int create) { struct buffer_head *bh; - bh = ext4_getblk(handle, inode, block, create, err); - if (!bh) + bh = ext4_getblk(handle, inode, block, create); + if (IS_ERR(bh)) return bh; - if (buffer_uptodate(bh)) + if (!bh || buffer_uptodate(bh)) return bh; ll_rw_block(READ | REQ_META | REQ_PRIO, 1, &bh); wait_on_buffer(bh); if (buffer_uptodate(bh)) return bh; put_bh(bh); - *err = -EIO; - return NULL; + return ERR_PTR(-EIO); } int ext4_walk_page_buffers(handle_t *handle, @@ -1536,7 +1516,7 @@ out_unlock: } /* - * This is a special get_blocks_t callback which is used by + * This is a special get_block_t callback which is used by * ext4_da_write_begin(). It will either return mapped block or * reserve space for a single block. * @@ -2011,12 +1991,10 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) * in data loss. So use reserved blocks to allocate metadata if * possible. * - * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE if the blocks - * in question are delalloc blocks. This affects functions in many - * different parts of the allocation call path. This flag exists - * primarily because we don't want to change *many* call functions, so - * ext4_map_blocks() will set the EXT4_STATE_DELALLOC_RESERVED flag - * once the inode's allocation semaphore is taken. + * We pass in the magic EXT4_GET_BLOCKS_DELALLOC_RESERVE if + * the blocks in question are delalloc blocks. This indicates + * that the blocks and quotas has already been checked when + * the data was copied into the page cache. */ get_blocks_flags = EXT4_GET_BLOCKS_CREATE | EXT4_GET_BLOCKS_METADATA_NOFAIL; @@ -2515,6 +2493,20 @@ static int ext4_nonda_switch(struct super_block *sb) return 0; } +/* We always reserve for an inode update; the superblock could be there too */ +static int ext4_da_write_credits(struct inode *inode, loff_t pos, unsigned len) +{ + if (likely(EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_LARGE_FILE))) + return 1; + + if (pos + len <= 0x7fffffffULL) + return 1; + + /* We might need to update the superblock to set LARGE_FILE */ + return 2; +} + static int ext4_da_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) @@ -2565,7 +2557,8 @@ retry_grab: * of file which has an already mapped buffer. */ retry_journal: - handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, 1); + handle = ext4_journal_start(inode, EXT4_HT_WRITE_PAGE, + ext4_da_write_credits(inode, pos, len)); if (IS_ERR(handle)) { page_cache_release(page); return PTR_ERR(handle); @@ -2658,10 +2651,7 @@ static int ext4_da_write_end(struct file *file, if (copied && new_i_size > EXT4_I(inode)->i_disksize) { if (ext4_has_inline_data(inode) || ext4_da_should_update_i_disksize(page, end)) { - down_write(&EXT4_I(inode)->i_data_sem); - if (new_i_size > EXT4_I(inode)->i_disksize) - EXT4_I(inode)->i_disksize = new_i_size; - up_write(&EXT4_I(inode)->i_data_sem); + ext4_update_i_disksize(inode, new_i_size); /* We need to mark inode dirty even if * new_i_size is less that inode->i_size * bu greater than i_disksize.(hint delalloc) @@ -3936,8 +3926,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) ei->i_extra_isize = 0; /* Precompute checksum seed for inode metadata */ - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) { + if (ext4_has_metadata_csum(sb)) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); __u32 csum; __le32 inum = cpu_to_le32(inode->i_ino); @@ -4127,6 +4116,13 @@ bad_inode: return ERR_PTR(ret); } +struct inode *ext4_iget_normal(struct super_block *sb, unsigned long ino) +{ + if (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) + return ERR_PTR(-EIO); + return ext4_iget(sb, ino); +} + static int ext4_inode_blocks_set(handle_t *handle, struct ext4_inode *raw_inode, struct ext4_inode_info *ei) @@ -4226,7 +4222,8 @@ static int ext4_do_update_inode(handle_t *handle, EXT4_INODE_SET_XTIME(i_atime, inode, raw_inode); EXT4_EINODE_SET_XTIME(i_crtime, ei, raw_inode); - if (ext4_inode_blocks_set(handle, raw_inode, ei)) { + err = ext4_inode_blocks_set(handle, raw_inode, ei); + if (err) { spin_unlock(&ei->i_raw_lock); goto out_brelse; } @@ -4536,8 +4533,12 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) ext4_orphan_del(NULL, inode); goto err_out; } - } else + } else { + loff_t oldsize = inode->i_size; + i_size_write(inode, attr->ia_size); + pagecache_isize_extended(inode, oldsize, inode->i_size); + } /* * Blocks are going to be removed from the inode. Wait diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 0f2252ec274d..bfda18a15592 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -331,8 +331,7 @@ flags_out: if (!inode_owner_or_capable(inode)) return -EPERM; - if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) { + if (ext4_has_metadata_csum(inode->i_sb)) { ext4_warning(sb, "Setting inode version is not " "supported with metadata_csum enabled."); return -ENOTTY; @@ -532,9 +531,17 @@ group_add_out: } case EXT4_IOC_SWAP_BOOT: + { + int err; if (!(filp->f_mode & FMODE_WRITE)) return -EBADF; - return swap_inode_boot_loader(sb, inode); + err = mnt_want_write_file(filp); + if (err) + return err; + err = swap_inode_boot_loader(sb, inode); + mnt_drop_write_file(filp); + return err; + } case EXT4_IOC_RESIZE_FS: { ext4_fsblk_t n_blocks_count; diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index 8b0f9ef517d6..dbfe15c2533c 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -3155,9 +3155,8 @@ ext4_mb_normalize_request(struct ext4_allocation_context *ac, "start %lu, size %lu, fe_logical %lu", (unsigned long) start, (unsigned long) size, (unsigned long) ac->ac_o_ex.fe_logical); + BUG(); } - BUG_ON(start + size <= ac->ac_o_ex.fe_logical && - start > ac->ac_o_ex.fe_logical); BUG_ON(size <= 0 || size > EXT4_BLOCKS_PER_GROUP(ac->ac_sb)); /* now prepare goal request */ @@ -4134,7 +4133,7 @@ static void ext4_mb_group_or_file(struct ext4_allocation_context *ac) * per cpu locality group is to reduce the contention between block * request from multiple CPUs. */ - ac->ac_lg = __this_cpu_ptr(sbi->s_locality_groups); + ac->ac_lg = raw_cpu_ptr(sbi->s_locality_groups); /* we're going to use group allocation */ ac->ac_flags |= EXT4_MB_HINT_GROUP_ALLOC; @@ -4410,14 +4409,7 @@ ext4_fsblk_t ext4_mb_new_blocks(handle_t *handle, if (IS_NOQUOTA(ar->inode)) ar->flags |= EXT4_MB_USE_ROOT_BLOCKS; - /* - * For delayed allocation, we could skip the ENOSPC and - * EDQUOT check, as blocks and quotas have been already - * reserved when data being copied into pagecache. - */ - if (ext4_test_inode_state(ar->inode, EXT4_STATE_DELALLOC_RESERVED)) - ar->flags |= EXT4_MB_DELALLOC_RESERVED; - else { + if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0) { /* Without delayed allocation we need to verify * there is enough free blocks to do block allocation * and verify allocation doesn't exceed the quota limits. @@ -4528,8 +4520,7 @@ out: if (inquota && ar->len < inquota) dquot_free_block(ar->inode, EXT4_C2B(sbi, inquota - ar->len)); if (!ar->len) { - if (!ext4_test_inode_state(ar->inode, - EXT4_STATE_DELALLOC_RESERVED)) + if ((ar->flags & EXT4_MB_DELALLOC_RESERVED) == 0) /* release all the reserved blocks if non delalloc */ percpu_counter_sub(&sbi->s_dirtyclusters_counter, reserv_clstrs); diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index d3567f27bae7..a432634f2e6a 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -41,8 +41,7 @@ static int finish_range(handle_t *handle, struct inode *inode, ext4_ext_store_pblock(&newext, lb->first_pblock); /* Locking only for convinience since we are operating on temp inode */ down_write(&EXT4_I(inode)->i_data_sem); - path = ext4_ext_find_extent(inode, lb->first_block, NULL, 0); - + path = ext4_find_extent(inode, lb->first_block, NULL, 0); if (IS_ERR(path)) { retval = PTR_ERR(path); path = NULL; @@ -81,13 +80,11 @@ static int finish_range(handle_t *handle, struct inode *inode, goto err_out; } } - retval = ext4_ext_insert_extent(handle, inode, path, &newext, 0); + retval = ext4_ext_insert_extent(handle, inode, &path, &newext, 0); err_out: up_write((&EXT4_I(inode)->i_data_sem)); - if (path) { - ext4_ext_drop_refs(path); - kfree(path); - } + ext4_ext_drop_refs(path); + kfree(path); lb->first_pblock = 0; return retval; } diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index 32bce844c2e1..8313ca3324ec 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -20,8 +20,7 @@ static __le32 ext4_mmp_csum(struct super_block *sb, struct mmp_struct *mmp) static int ext4_mmp_csum_verify(struct super_block *sb, struct mmp_struct *mmp) { - if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + if (!ext4_has_metadata_csum(sb)) return 1; return mmp->mmp_checksum == ext4_mmp_csum(sb, mmp); @@ -29,8 +28,7 @@ static int ext4_mmp_csum_verify(struct super_block *sb, struct mmp_struct *mmp) static void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp) { - if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + if (!ext4_has_metadata_csum(sb)) return; mmp->mmp_checksum = ext4_mmp_csum(sb, mmp); diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 671a74b14fd7..9f2311bc9c4f 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -27,120 +27,26 @@ * @lblock: logical block number to find an extent path * @path: pointer to an extent path pointer (for output) * - * ext4_ext_find_extent wrapper. Return 0 on success, or a negative error value + * ext4_find_extent wrapper. Return 0 on success, or a negative error value * on failure. */ static inline int get_ext_path(struct inode *inode, ext4_lblk_t lblock, - struct ext4_ext_path **orig_path) + struct ext4_ext_path **ppath) { - int ret = 0; struct ext4_ext_path *path; - path = ext4_ext_find_extent(inode, lblock, *orig_path, EXT4_EX_NOCACHE); + path = ext4_find_extent(inode, lblock, ppath, EXT4_EX_NOCACHE); if (IS_ERR(path)) - ret = PTR_ERR(path); - else if (path[ext_depth(inode)].p_ext == NULL) - ret = -ENODATA; - else - *orig_path = path; - - return ret; -} - -/** - * copy_extent_status - Copy the extent's initialization status - * - * @src: an extent for getting initialize status - * @dest: an extent to be set the status - */ -static void -copy_extent_status(struct ext4_extent *src, struct ext4_extent *dest) -{ - if (ext4_ext_is_unwritten(src)) - ext4_ext_mark_unwritten(dest); - else - dest->ee_len = cpu_to_le16(ext4_ext_get_actual_len(dest)); -} - -/** - * mext_next_extent - Search for the next extent and set it to "extent" - * - * @inode: inode which is searched - * @path: this will obtain data for the next extent - * @extent: pointer to the next extent we have just gotten - * - * Search the next extent in the array of ext4_ext_path structure (@path) - * and set it to ext4_extent structure (@extent). In addition, the member of - * @path (->p_ext) also points the next extent. Return 0 on success, 1 if - * ext4_ext_path structure refers to the last extent, or a negative error - * value on failure. - */ -int -mext_next_extent(struct inode *inode, struct ext4_ext_path *path, - struct ext4_extent **extent) -{ - struct ext4_extent_header *eh; - int ppos, leaf_ppos = path->p_depth; - - ppos = leaf_ppos; - if (EXT_LAST_EXTENT(path[ppos].p_hdr) > path[ppos].p_ext) { - /* leaf block */ - *extent = ++path[ppos].p_ext; - path[ppos].p_block = ext4_ext_pblock(path[ppos].p_ext); - return 0; - } - - while (--ppos >= 0) { - if (EXT_LAST_INDEX(path[ppos].p_hdr) > - path[ppos].p_idx) { - int cur_ppos = ppos; - - /* index block */ - path[ppos].p_idx++; - path[ppos].p_block = ext4_idx_pblock(path[ppos].p_idx); - if (path[ppos+1].p_bh) - brelse(path[ppos+1].p_bh); - path[ppos+1].p_bh = - sb_bread(inode->i_sb, path[ppos].p_block); - if (!path[ppos+1].p_bh) - return -EIO; - path[ppos+1].p_hdr = - ext_block_hdr(path[ppos+1].p_bh); - - /* Halfway index block */ - while (++cur_ppos < leaf_ppos) { - path[cur_ppos].p_idx = - EXT_FIRST_INDEX(path[cur_ppos].p_hdr); - path[cur_ppos].p_block = - ext4_idx_pblock(path[cur_ppos].p_idx); - if (path[cur_ppos+1].p_bh) - brelse(path[cur_ppos+1].p_bh); - path[cur_ppos+1].p_bh = sb_bread(inode->i_sb, - path[cur_ppos].p_block); - if (!path[cur_ppos+1].p_bh) - return -EIO; - path[cur_ppos+1].p_hdr = - ext_block_hdr(path[cur_ppos+1].p_bh); - } - - path[leaf_ppos].p_ext = *extent = NULL; - - eh = path[leaf_ppos].p_hdr; - if (le16_to_cpu(eh->eh_entries) == 0) - /* empty leaf is found */ - return -ENODATA; - - /* leaf block */ - path[leaf_ppos].p_ext = *extent = - EXT_FIRST_EXTENT(path[leaf_ppos].p_hdr); - path[leaf_ppos].p_block = - ext4_ext_pblock(path[leaf_ppos].p_ext); - return 0; - } + return PTR_ERR(path); + if (path[ext_depth(inode)].p_ext == NULL) { + ext4_ext_drop_refs(path); + kfree(path); + *ppath = NULL; + return -ENODATA; } - /* We found the last extent */ - return 1; + *ppath = path; + return 0; } /** @@ -178,417 +84,6 @@ ext4_double_up_write_data_sem(struct inode *orig_inode, } /** - * mext_insert_across_blocks - Insert extents across leaf block - * - * @handle: journal handle - * @orig_inode: original inode - * @o_start: first original extent to be changed - * @o_end: last original extent to be changed - * @start_ext: first new extent to be inserted - * @new_ext: middle of new extent to be inserted - * @end_ext: last new extent to be inserted - * - * Allocate a new leaf block and insert extents into it. Return 0 on success, - * or a negative error value on failure. - */ -static int -mext_insert_across_blocks(handle_t *handle, struct inode *orig_inode, - struct ext4_extent *o_start, struct ext4_extent *o_end, - struct ext4_extent *start_ext, struct ext4_extent *new_ext, - struct ext4_extent *end_ext) -{ - struct ext4_ext_path *orig_path = NULL; - ext4_lblk_t eblock = 0; - int new_flag = 0; - int end_flag = 0; - int err = 0; - - if (start_ext->ee_len && new_ext->ee_len && end_ext->ee_len) { - if (o_start == o_end) { - - /* start_ext new_ext end_ext - * donor |---------|-----------|--------| - * orig |------------------------------| - */ - end_flag = 1; - } else { - - /* start_ext new_ext end_ext - * donor |---------|----------|---------| - * orig |---------------|--------------| - */ - o_end->ee_block = end_ext->ee_block; - o_end->ee_len = end_ext->ee_len; - ext4_ext_store_pblock(o_end, ext4_ext_pblock(end_ext)); - } - - o_start->ee_len = start_ext->ee_len; - eblock = le32_to_cpu(start_ext->ee_block); - new_flag = 1; - - } else if (start_ext->ee_len && new_ext->ee_len && - !end_ext->ee_len && o_start == o_end) { - - /* start_ext new_ext - * donor |--------------|---------------| - * orig |------------------------------| - */ - o_start->ee_len = start_ext->ee_len; - eblock = le32_to_cpu(start_ext->ee_block); - new_flag = 1; - - } else if (!start_ext->ee_len && new_ext->ee_len && - end_ext->ee_len && o_start == o_end) { - - /* new_ext end_ext - * donor |--------------|---------------| - * orig |------------------------------| - */ - o_end->ee_block = end_ext->ee_block; - o_end->ee_len = end_ext->ee_len; - ext4_ext_store_pblock(o_end, ext4_ext_pblock(end_ext)); - - /* - * Set 0 to the extent block if new_ext was - * the first block. - */ - if (new_ext->ee_block) - eblock = le32_to_cpu(new_ext->ee_block); - - new_flag = 1; - } else { - ext4_debug("ext4 move extent: Unexpected insert case\n"); - return -EIO; - } - - if (new_flag) { - err = get_ext_path(orig_inode, eblock, &orig_path); - if (err) - goto out; - - if (ext4_ext_insert_extent(handle, orig_inode, - orig_path, new_ext, 0)) - goto out; - } - - if (end_flag) { - err = get_ext_path(orig_inode, - le32_to_cpu(end_ext->ee_block) - 1, &orig_path); - if (err) - goto out; - - if (ext4_ext_insert_extent(handle, orig_inode, - orig_path, end_ext, 0)) - goto out; - } -out: - if (orig_path) { - ext4_ext_drop_refs(orig_path); - kfree(orig_path); - } - - return err; - -} - -/** - * mext_insert_inside_block - Insert new extent to the extent block - * - * @o_start: first original extent to be moved - * @o_end: last original extent to be moved - * @start_ext: first new extent to be inserted - * @new_ext: middle of new extent to be inserted - * @end_ext: last new extent to be inserted - * @eh: extent header of target leaf block - * @range_to_move: used to decide how to insert extent - * - * Insert extents into the leaf block. The extent (@o_start) is overwritten - * by inserted extents. - */ -static void -mext_insert_inside_block(struct ext4_extent *o_start, - struct ext4_extent *o_end, - struct ext4_extent *start_ext, - struct ext4_extent *new_ext, - struct ext4_extent *end_ext, - struct ext4_extent_header *eh, - int range_to_move) -{ - int i = 0; - unsigned long len; - - /* Move the existing extents */ - if (range_to_move && o_end < EXT_LAST_EXTENT(eh)) { - len = (unsigned long)(EXT_LAST_EXTENT(eh) + 1) - - (unsigned long)(o_end + 1); - memmove(o_end + 1 + range_to_move, o_end + 1, len); - } - - /* Insert start entry */ - if (start_ext->ee_len) - o_start[i++].ee_len = start_ext->ee_len; - - /* Insert new entry */ - if (new_ext->ee_len) { - o_start[i] = *new_ext; - ext4_ext_store_pblock(&o_start[i++], ext4_ext_pblock(new_ext)); - } - - /* Insert end entry */ - if (end_ext->ee_len) - o_start[i] = *end_ext; - - /* Increment the total entries counter on the extent block */ - le16_add_cpu(&eh->eh_entries, range_to_move); -} - -/** - * mext_insert_extents - Insert new extent - * - * @handle: journal handle - * @orig_inode: original inode - * @orig_path: path indicates first extent to be changed - * @o_start: first original extent to be changed - * @o_end: last original extent to be changed - * @start_ext: first new extent to be inserted - * @new_ext: middle of new extent to be inserted - * @end_ext: last new extent to be inserted - * - * Call the function to insert extents. If we cannot add more extents into - * the leaf block, we call mext_insert_across_blocks() to create a - * new leaf block. Otherwise call mext_insert_inside_block(). Return 0 - * on success, or a negative error value on failure. - */ -static int -mext_insert_extents(handle_t *handle, struct inode *orig_inode, - struct ext4_ext_path *orig_path, - struct ext4_extent *o_start, - struct ext4_extent *o_end, - struct ext4_extent *start_ext, - struct ext4_extent *new_ext, - struct ext4_extent *end_ext) -{ - struct ext4_extent_header *eh; - unsigned long need_slots, slots_range; - int range_to_move, depth, ret; - - /* - * The extents need to be inserted - * start_extent + new_extent + end_extent. - */ - need_slots = (start_ext->ee_len ? 1 : 0) + (end_ext->ee_len ? 1 : 0) + - (new_ext->ee_len ? 1 : 0); - - /* The number of slots between start and end */ - slots_range = ((unsigned long)(o_end + 1) - (unsigned long)o_start + 1) - / sizeof(struct ext4_extent); - - /* Range to move the end of extent */ - range_to_move = need_slots - slots_range; - depth = orig_path->p_depth; - orig_path += depth; - eh = orig_path->p_hdr; - - if (depth) { - /* Register to journal */ - BUFFER_TRACE(orig_path->p_bh, "get_write_access"); - ret = ext4_journal_get_write_access(handle, orig_path->p_bh); - if (ret) - return ret; - } - - /* Expansion */ - if (range_to_move > 0 && - (range_to_move > le16_to_cpu(eh->eh_max) - - le16_to_cpu(eh->eh_entries))) { - - ret = mext_insert_across_blocks(handle, orig_inode, o_start, - o_end, start_ext, new_ext, end_ext); - if (ret < 0) - return ret; - } else - mext_insert_inside_block(o_start, o_end, start_ext, new_ext, - end_ext, eh, range_to_move); - - return ext4_ext_dirty(handle, orig_inode, orig_path); -} - -/** - * mext_leaf_block - Move one leaf extent block into the inode. - * - * @handle: journal handle - * @orig_inode: original inode - * @orig_path: path indicates first extent to be changed - * @dext: donor extent - * @from: start offset on the target file - * - * In order to insert extents into the leaf block, we must divide the extent - * in the leaf block into three extents. The one is located to be inserted - * extents, and the others are located around it. - * - * Therefore, this function creates structures to save extents of the leaf - * block, and inserts extents by calling mext_insert_extents() with - * created extents. Return 0 on success, or a negative error value on failure. - */ -static int -mext_leaf_block(handle_t *handle, struct inode *orig_inode, - struct ext4_ext_path *orig_path, struct ext4_extent *dext, - ext4_lblk_t *from) -{ - struct ext4_extent *oext, *o_start, *o_end, *prev_ext; - struct ext4_extent new_ext, start_ext, end_ext; - ext4_lblk_t new_ext_end; - int oext_alen, new_ext_alen, end_ext_alen; - int depth = ext_depth(orig_inode); - int ret; - - start_ext.ee_block = end_ext.ee_block = 0; - o_start = o_end = oext = orig_path[depth].p_ext; - oext_alen = ext4_ext_get_actual_len(oext); - start_ext.ee_len = end_ext.ee_len = 0; - - new_ext.ee_block = cpu_to_le32(*from); - ext4_ext_store_pblock(&new_ext, ext4_ext_pblock(dext)); - new_ext.ee_len = dext->ee_len; - new_ext_alen = ext4_ext_get_actual_len(&new_ext); - new_ext_end = le32_to_cpu(new_ext.ee_block) + new_ext_alen - 1; - - /* - * Case: original extent is first - * oext |--------| - * new_ext |--| - * start_ext |--| - */ - if (le32_to_cpu(oext->ee_block) < le32_to_cpu(new_ext.ee_block) && - le32_to_cpu(new_ext.ee_block) < - le32_to_cpu(oext->ee_block) + oext_alen) { - start_ext.ee_len = cpu_to_le16(le32_to_cpu(new_ext.ee_block) - - le32_to_cpu(oext->ee_block)); - start_ext.ee_block = oext->ee_block; - copy_extent_status(oext, &start_ext); - } else if (oext > EXT_FIRST_EXTENT(orig_path[depth].p_hdr)) { - prev_ext = oext - 1; - /* - * We can merge new_ext into previous extent, - * if these are contiguous and same extent type. - */ - if (ext4_can_extents_be_merged(orig_inode, prev_ext, - &new_ext)) { - o_start = prev_ext; - start_ext.ee_len = cpu_to_le16( - ext4_ext_get_actual_len(prev_ext) + - new_ext_alen); - start_ext.ee_block = oext->ee_block; - copy_extent_status(prev_ext, &start_ext); - new_ext.ee_len = 0; - } - } - - /* - * Case: new_ext_end must be less than oext - * oext |-----------| - * new_ext |-------| - */ - if (le32_to_cpu(oext->ee_block) + oext_alen - 1 < new_ext_end) { - EXT4_ERROR_INODE(orig_inode, - "new_ext_end(%u) should be less than or equal to " - "oext->ee_block(%u) + oext_alen(%d) - 1", - new_ext_end, le32_to_cpu(oext->ee_block), - oext_alen); - ret = -EIO; - goto out; - } - - /* - * Case: new_ext is smaller than original extent - * oext |---------------| - * new_ext |-----------| - * end_ext |---| - */ - if (le32_to_cpu(oext->ee_block) <= new_ext_end && - new_ext_end < le32_to_cpu(oext->ee_block) + oext_alen - 1) { - end_ext.ee_len = - cpu_to_le16(le32_to_cpu(oext->ee_block) + - oext_alen - 1 - new_ext_end); - copy_extent_status(oext, &end_ext); - end_ext_alen = ext4_ext_get_actual_len(&end_ext); - ext4_ext_store_pblock(&end_ext, - (ext4_ext_pblock(o_end) + oext_alen - end_ext_alen)); - end_ext.ee_block = - cpu_to_le32(le32_to_cpu(o_end->ee_block) + - oext_alen - end_ext_alen); - } - - ret = mext_insert_extents(handle, orig_inode, orig_path, o_start, - o_end, &start_ext, &new_ext, &end_ext); -out: - return ret; -} - -/** - * mext_calc_swap_extents - Calculate extents for extent swapping. - * - * @tmp_dext: the extent that will belong to the original inode - * @tmp_oext: the extent that will belong to the donor inode - * @orig_off: block offset of original inode - * @donor_off: block offset of donor inode - * @max_count: the maximum length of extents - * - * Return 0 on success, or a negative error value on failure. - */ -static int -mext_calc_swap_extents(struct ext4_extent *tmp_dext, - struct ext4_extent *tmp_oext, - ext4_lblk_t orig_off, ext4_lblk_t donor_off, - ext4_lblk_t max_count) -{ - ext4_lblk_t diff, orig_diff; - struct ext4_extent dext_old, oext_old; - - BUG_ON(orig_off != donor_off); - - /* original and donor extents have to cover the same block offset */ - if (orig_off < le32_to_cpu(tmp_oext->ee_block) || - le32_to_cpu(tmp_oext->ee_block) + - ext4_ext_get_actual_len(tmp_oext) - 1 < orig_off) - return -ENODATA; - - if (orig_off < le32_to_cpu(tmp_dext->ee_block) || - le32_to_cpu(tmp_dext->ee_block) + - ext4_ext_get_actual_len(tmp_dext) - 1 < orig_off) - return -ENODATA; - - dext_old = *tmp_dext; - oext_old = *tmp_oext; - - /* When tmp_dext is too large, pick up the target range. */ - diff = donor_off - le32_to_cpu(tmp_dext->ee_block); - - ext4_ext_store_pblock(tmp_dext, ext4_ext_pblock(tmp_dext) + diff); - le32_add_cpu(&tmp_dext->ee_block, diff); - le16_add_cpu(&tmp_dext->ee_len, -diff); - - if (max_count < ext4_ext_get_actual_len(tmp_dext)) - tmp_dext->ee_len = cpu_to_le16(max_count); - - orig_diff = orig_off - le32_to_cpu(tmp_oext->ee_block); - ext4_ext_store_pblock(tmp_oext, ext4_ext_pblock(tmp_oext) + orig_diff); - - /* Adjust extent length if donor extent is larger than orig */ - if (ext4_ext_get_actual_len(tmp_dext) > - ext4_ext_get_actual_len(tmp_oext) - orig_diff) - tmp_dext->ee_len = cpu_to_le16(le16_to_cpu(tmp_oext->ee_len) - - orig_diff); - - tmp_oext->ee_len = cpu_to_le16(ext4_ext_get_actual_len(tmp_dext)); - - copy_extent_status(&oext_old, tmp_dext); - copy_extent_status(&dext_old, tmp_oext); - - return 0; -} - -/** * mext_check_coverage - Check that all extents in range has the same type * * @inode: inode in question @@ -619,171 +114,25 @@ mext_check_coverage(struct inode *inode, ext4_lblk_t from, ext4_lblk_t count, } ret = 1; out: - if (path) { - ext4_ext_drop_refs(path); - kfree(path); - } + ext4_ext_drop_refs(path); + kfree(path); return ret; } /** - * mext_replace_branches - Replace original extents with new extents - * - * @handle: journal handle - * @orig_inode: original inode - * @donor_inode: donor inode - * @from: block offset of orig_inode - * @count: block count to be replaced - * @err: pointer to save return value - * - * Replace original inode extents and donor inode extents page by page. - * We implement this replacement in the following three steps: - * 1. Save the block information of original and donor inodes into - * dummy extents. - * 2. Change the block information of original inode to point at the - * donor inode blocks. - * 3. Change the block information of donor inode to point at the saved - * original inode blocks in the dummy extents. - * - * Return replaced block count. - */ -static int -mext_replace_branches(handle_t *handle, struct inode *orig_inode, - struct inode *donor_inode, ext4_lblk_t from, - ext4_lblk_t count, int *err) -{ - struct ext4_ext_path *orig_path = NULL; - struct ext4_ext_path *donor_path = NULL; - struct ext4_extent *oext, *dext; - struct ext4_extent tmp_dext, tmp_oext; - ext4_lblk_t orig_off = from, donor_off = from; - int depth; - int replaced_count = 0; - int dext_alen; - - *err = ext4_es_remove_extent(orig_inode, from, count); - if (*err) - goto out; - - *err = ext4_es_remove_extent(donor_inode, from, count); - if (*err) - goto out; - - /* Get the original extent for the block "orig_off" */ - *err = get_ext_path(orig_inode, orig_off, &orig_path); - if (*err) - goto out; - - /* Get the donor extent for the head */ - *err = get_ext_path(donor_inode, donor_off, &donor_path); - if (*err) - goto out; - depth = ext_depth(orig_inode); - oext = orig_path[depth].p_ext; - tmp_oext = *oext; - - depth = ext_depth(donor_inode); - dext = donor_path[depth].p_ext; - if (unlikely(!dext)) - goto missing_donor_extent; - tmp_dext = *dext; - - *err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, - donor_off, count); - if (*err) - goto out; - - /* Loop for the donor extents */ - while (1) { - /* The extent for donor must be found. */ - if (unlikely(!dext)) { - missing_donor_extent: - EXT4_ERROR_INODE(donor_inode, - "The extent for donor must be found"); - *err = -EIO; - goto out; - } else if (donor_off != le32_to_cpu(tmp_dext.ee_block)) { - EXT4_ERROR_INODE(donor_inode, - "Donor offset(%u) and the first block of donor " - "extent(%u) should be equal", - donor_off, - le32_to_cpu(tmp_dext.ee_block)); - *err = -EIO; - goto out; - } - - /* Set donor extent to orig extent */ - *err = mext_leaf_block(handle, orig_inode, - orig_path, &tmp_dext, &orig_off); - if (*err) - goto out; - - /* Set orig extent to donor extent */ - *err = mext_leaf_block(handle, donor_inode, - donor_path, &tmp_oext, &donor_off); - if (*err) - goto out; - - dext_alen = ext4_ext_get_actual_len(&tmp_dext); - replaced_count += dext_alen; - donor_off += dext_alen; - orig_off += dext_alen; - - BUG_ON(replaced_count > count); - /* Already moved the expected blocks */ - if (replaced_count >= count) - break; - - if (orig_path) - ext4_ext_drop_refs(orig_path); - *err = get_ext_path(orig_inode, orig_off, &orig_path); - if (*err) - goto out; - depth = ext_depth(orig_inode); - oext = orig_path[depth].p_ext; - tmp_oext = *oext; - - if (donor_path) - ext4_ext_drop_refs(donor_path); - *err = get_ext_path(donor_inode, donor_off, &donor_path); - if (*err) - goto out; - depth = ext_depth(donor_inode); - dext = donor_path[depth].p_ext; - tmp_dext = *dext; - - *err = mext_calc_swap_extents(&tmp_dext, &tmp_oext, orig_off, - donor_off, count - replaced_count); - if (*err) - goto out; - } - -out: - if (orig_path) { - ext4_ext_drop_refs(orig_path); - kfree(orig_path); - } - if (donor_path) { - ext4_ext_drop_refs(donor_path); - kfree(donor_path); - } - - return replaced_count; -} - -/** * mext_page_double_lock - Grab and lock pages on both @inode1 and @inode2 * * @inode1: the inode structure * @inode2: the inode structure - * @index: page index + * @index1: page index + * @index2: page index * @page: result page vector * * Grab two locked pages for inode's by inode order */ static int mext_page_double_lock(struct inode *inode1, struct inode *inode2, - pgoff_t index, struct page *page[2]) + pgoff_t index1, pgoff_t index2, struct page *page[2]) { struct address_space *mapping[2]; unsigned fl = AOP_FLAG_NOFS; @@ -793,15 +142,18 @@ mext_page_double_lock(struct inode *inode1, struct inode *inode2, mapping[0] = inode1->i_mapping; mapping[1] = inode2->i_mapping; } else { + pgoff_t tmp = index1; + index1 = index2; + index2 = tmp; mapping[0] = inode2->i_mapping; mapping[1] = inode1->i_mapping; } - page[0] = grab_cache_page_write_begin(mapping[0], index, fl); + page[0] = grab_cache_page_write_begin(mapping[0], index1, fl); if (!page[0]) return -ENOMEM; - page[1] = grab_cache_page_write_begin(mapping[1], index, fl); + page[1] = grab_cache_page_write_begin(mapping[1], index2, fl); if (!page[1]) { unlock_page(page[0]); page_cache_release(page[0]); @@ -893,25 +245,27 @@ out: * @o_filp: file structure of original file * @donor_inode: donor inode * @orig_page_offset: page index on original file + * @donor_page_offset: page index on donor file * @data_offset_in_page: block index where data swapping starts * @block_len_in_page: the number of blocks to be swapped * @unwritten: orig extent is unwritten or not * @err: pointer to save return value * * Save the data in original inode blocks and replace original inode extents - * with donor inode extents by calling mext_replace_branches(). + * with donor inode extents by calling ext4_swap_extents(). * Finally, write out the saved data in new original inode blocks. Return * replaced block count. */ static int move_extent_per_page(struct file *o_filp, struct inode *donor_inode, - pgoff_t orig_page_offset, int data_offset_in_page, - int block_len_in_page, int unwritten, int *err) + pgoff_t orig_page_offset, pgoff_t donor_page_offset, + int data_offset_in_page, + int block_len_in_page, int unwritten, int *err) { struct inode *orig_inode = file_inode(o_filp); struct page *pagep[2] = {NULL, NULL}; handle_t *handle; - ext4_lblk_t orig_blk_offset; + ext4_lblk_t orig_blk_offset, donor_blk_offset; unsigned long blocksize = orig_inode->i_sb->s_blocksize; unsigned int w_flags = 0; unsigned int tmp_data_size, data_size, replaced_size; @@ -939,6 +293,9 @@ again: orig_blk_offset = orig_page_offset * blocks_per_page + data_offset_in_page; + donor_blk_offset = donor_page_offset * blocks_per_page + + data_offset_in_page; + /* Calculate data_size */ if ((orig_blk_offset + block_len_in_page - 1) == ((orig_inode->i_size - 1) >> orig_inode->i_blkbits)) { @@ -959,7 +316,7 @@ again: replaced_size = data_size; *err = mext_page_double_lock(orig_inode, donor_inode, orig_page_offset, - pagep); + donor_page_offset, pagep); if (unlikely(*err < 0)) goto stop_journal; /* @@ -978,7 +335,7 @@ again: if (*err) goto drop_data_sem; - unwritten &= mext_check_coverage(donor_inode, orig_blk_offset, + unwritten &= mext_check_coverage(donor_inode, donor_blk_offset, block_len_in_page, 1, err); if (*err) goto drop_data_sem; @@ -994,9 +351,10 @@ again: *err = -EBUSY; goto drop_data_sem; } - replaced_count = mext_replace_branches(handle, orig_inode, - donor_inode, orig_blk_offset, - block_len_in_page, err); + replaced_count = ext4_swap_extents(handle, orig_inode, + donor_inode, orig_blk_offset, + donor_blk_offset, + block_len_in_page, 1, err); drop_data_sem: ext4_double_up_write_data_sem(orig_inode, donor_inode); goto unlock_pages; @@ -1014,9 +372,9 @@ data_copy: goto unlock_pages; } ext4_double_down_write_data_sem(orig_inode, donor_inode); - replaced_count = mext_replace_branches(handle, orig_inode, donor_inode, - orig_blk_offset, - block_len_in_page, err); + replaced_count = ext4_swap_extents(handle, orig_inode, donor_inode, + orig_blk_offset, donor_blk_offset, + block_len_in_page, 1, err); ext4_double_up_write_data_sem(orig_inode, donor_inode); if (*err) { if (replaced_count) { @@ -1061,9 +419,9 @@ repair_branches: * Try to swap extents to it's original places */ ext4_double_down_write_data_sem(orig_inode, donor_inode); - replaced_count = mext_replace_branches(handle, donor_inode, orig_inode, - orig_blk_offset, - block_len_in_page, &err2); + replaced_count = ext4_swap_extents(handle, donor_inode, orig_inode, + orig_blk_offset, donor_blk_offset, + block_len_in_page, 0, &err2); ext4_double_up_write_data_sem(orig_inode, donor_inode); if (replaced_count != block_len_in_page) { EXT4_ERROR_INODE_BLOCK(orig_inode, (sector_t)(orig_blk_offset), @@ -1093,10 +451,14 @@ mext_check_arguments(struct inode *orig_inode, struct inode *donor_inode, __u64 orig_start, __u64 donor_start, __u64 *len) { - ext4_lblk_t orig_blocks, donor_blocks; + __u64 orig_eof, donor_eof; unsigned int blkbits = orig_inode->i_blkbits; unsigned int blocksize = 1 << blkbits; + orig_eof = (i_size_read(orig_inode) + blocksize - 1) >> blkbits; + donor_eof = (i_size_read(donor_inode) + blocksize - 1) >> blkbits; + + if (donor_inode->i_mode & (S_ISUID|S_ISGID)) { ext4_debug("ext4 move extent: suid or sgid is set" " to donor file [ino:orig %lu, donor %lu]\n", @@ -1112,7 +474,7 @@ mext_check_arguments(struct inode *orig_inode, ext4_debug("ext4 move extent: The argument files should " "not be swapfile [ino:orig %lu, donor %lu]\n", orig_inode->i_ino, donor_inode->i_ino); - return -EINVAL; + return -EBUSY; } /* Ext4 move extent supports only extent based file */ @@ -1132,67 +494,28 @@ mext_check_arguments(struct inode *orig_inode, } /* Start offset should be same */ - if (orig_start != donor_start) { + if ((orig_start & ~(PAGE_MASK >> orig_inode->i_blkbits)) != + (donor_start & ~(PAGE_MASK >> orig_inode->i_blkbits))) { ext4_debug("ext4 move extent: orig and donor's start " - "offset are not same [ino:orig %lu, donor %lu]\n", + "offset are not alligned [ino:orig %lu, donor %lu]\n", orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; } if ((orig_start >= EXT_MAX_BLOCKS) || + (donor_start >= EXT_MAX_BLOCKS) || (*len > EXT_MAX_BLOCKS) || + (donor_start + *len >= EXT_MAX_BLOCKS) || (orig_start + *len >= EXT_MAX_BLOCKS)) { ext4_debug("ext4 move extent: Can't handle over [%u] blocks " "[ino:orig %lu, donor %lu]\n", EXT_MAX_BLOCKS, orig_inode->i_ino, donor_inode->i_ino); return -EINVAL; } - - if (orig_inode->i_size > donor_inode->i_size) { - donor_blocks = (donor_inode->i_size + blocksize - 1) >> blkbits; - /* TODO: eliminate this artificial restriction */ - if (orig_start >= donor_blocks) { - ext4_debug("ext4 move extent: orig start offset " - "[%llu] should be less than donor file blocks " - "[%u] [ino:orig %lu, donor %lu]\n", - orig_start, donor_blocks, - orig_inode->i_ino, donor_inode->i_ino); - return -EINVAL; - } - - /* TODO: eliminate this artificial restriction */ - if (orig_start + *len > donor_blocks) { - ext4_debug("ext4 move extent: End offset [%llu] should " - "be less than donor file blocks [%u]." - "So adjust length from %llu to %llu " - "[ino:orig %lu, donor %lu]\n", - orig_start + *len, donor_blocks, - *len, donor_blocks - orig_start, - orig_inode->i_ino, donor_inode->i_ino); - *len = donor_blocks - orig_start; - } - } else { - orig_blocks = (orig_inode->i_size + blocksize - 1) >> blkbits; - if (orig_start >= orig_blocks) { - ext4_debug("ext4 move extent: start offset [%llu] " - "should be less than original file blocks " - "[%u] [ino:orig %lu, donor %lu]\n", - orig_start, orig_blocks, - orig_inode->i_ino, donor_inode->i_ino); - return -EINVAL; - } - - if (orig_start + *len > orig_blocks) { - ext4_debug("ext4 move extent: Adjust length " - "from %llu to %llu. Because it should be " - "less than original file blocks " - "[ino:orig %lu, donor %lu]\n", - *len, orig_blocks - orig_start, - orig_inode->i_ino, donor_inode->i_ino); - *len = orig_blocks - orig_start; - } - } - + if (orig_eof < orig_start + *len - 1) + *len = orig_eof - orig_start; + if (donor_eof < donor_start + *len - 1) + *len = donor_eof - donor_start; if (!*len) { ext4_debug("ext4 move extent: len should not be 0 " "[ino:orig %lu, donor %lu]\n", orig_inode->i_ino, @@ -1208,60 +531,26 @@ mext_check_arguments(struct inode *orig_inode, * * @o_filp: file structure of the original file * @d_filp: file structure of the donor file - * @orig_start: start offset in block for orig - * @donor_start: start offset in block for donor + * @orig_blk: start offset in block for orig + * @donor_blk: start offset in block for donor * @len: the number of blocks to be moved * @moved_len: moved block length * * This function returns 0 and moved block length is set in moved_len * if succeed, otherwise returns error value. * - * Note: ext4_move_extents() proceeds the following order. - * 1:ext4_move_extents() calculates the last block number of moving extent - * function by the start block number (orig_start) and the number of blocks - * to be moved (len) specified as arguments. - * If the {orig, donor}_start points a hole, the extent's start offset - * pointed by ext_cur (current extent), holecheck_path, orig_path are set - * after hole behind. - * 2:Continue step 3 to step 5, until the holecheck_path points to last_extent - * or the ext_cur exceeds the block_end which is last logical block number. - * 3:To get the length of continues area, call mext_next_extent() - * specified with the ext_cur (initial value is holecheck_path) re-cursive, - * until find un-continuous extent, the start logical block number exceeds - * the block_end or the extent points to the last extent. - * 4:Exchange the original inode data with donor inode data - * from orig_page_offset to seq_end_page. - * The start indexes of data are specified as arguments. - * That of the original inode is orig_page_offset, - * and the donor inode is also orig_page_offset - * (To easily handle blocksize != pagesize case, the offset for the - * donor inode is block unit). - * 5:Update holecheck_path and orig_path to points a next proceeding extent, - * then returns to step 2. - * 6:Release holecheck_path, orig_path and set the len to moved_len - * which shows the number of moved blocks. - * The moved_len is useful for the command to calculate the file offset - * for starting next move extent ioctl. - * 7:Return 0 on success, or a negative error value on failure. */ int -ext4_move_extents(struct file *o_filp, struct file *d_filp, - __u64 orig_start, __u64 donor_start, __u64 len, - __u64 *moved_len) +ext4_move_extents(struct file *o_filp, struct file *d_filp, __u64 orig_blk, + __u64 donor_blk, __u64 len, __u64 *moved_len) { struct inode *orig_inode = file_inode(o_filp); struct inode *donor_inode = file_inode(d_filp); - struct ext4_ext_path *orig_path = NULL, *holecheck_path = NULL; - struct ext4_extent *ext_prev, *ext_cur, *ext_dummy; - ext4_lblk_t block_start = orig_start; - ext4_lblk_t block_end, seq_start, add_blocks, file_end, seq_blocks = 0; - ext4_lblk_t rest_blocks; - pgoff_t orig_page_offset = 0, seq_end_page; - int ret, depth, last_extent = 0; + struct ext4_ext_path *path = NULL; int blocks_per_page = PAGE_CACHE_SIZE >> orig_inode->i_blkbits; - int data_offset_in_page; - int block_len_in_page; - int unwritten; + ext4_lblk_t o_end, o_start = orig_blk; + ext4_lblk_t d_start = donor_blk; + int ret; if (orig_inode->i_sb != donor_inode->i_sb) { ext4_debug("ext4 move extent: The argument files " @@ -1303,121 +592,58 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, /* Protect extent tree against block allocations via delalloc */ ext4_double_down_write_data_sem(orig_inode, donor_inode); /* Check the filesystem environment whether move_extent can be done */ - ret = mext_check_arguments(orig_inode, donor_inode, orig_start, - donor_start, &len); + ret = mext_check_arguments(orig_inode, donor_inode, orig_blk, + donor_blk, &len); if (ret) goto out; + o_end = o_start + len; - file_end = (i_size_read(orig_inode) - 1) >> orig_inode->i_blkbits; - block_end = block_start + len - 1; - if (file_end < block_end) - len -= block_end - file_end; + while (o_start < o_end) { + struct ext4_extent *ex; + ext4_lblk_t cur_blk, next_blk; + pgoff_t orig_page_index, donor_page_index; + int offset_in_page; + int unwritten, cur_len; - ret = get_ext_path(orig_inode, block_start, &orig_path); - if (ret) - goto out; - - /* Get path structure to check the hole */ - ret = get_ext_path(orig_inode, block_start, &holecheck_path); - if (ret) - goto out; - - depth = ext_depth(orig_inode); - ext_cur = holecheck_path[depth].p_ext; - - /* - * Get proper starting location of block replacement if block_start was - * within the hole. - */ - if (le32_to_cpu(ext_cur->ee_block) + - ext4_ext_get_actual_len(ext_cur) - 1 < block_start) { - /* - * The hole exists between extents or the tail of - * original file. - */ - last_extent = mext_next_extent(orig_inode, - holecheck_path, &ext_cur); - if (last_extent < 0) { - ret = last_extent; - goto out; - } - last_extent = mext_next_extent(orig_inode, orig_path, - &ext_dummy); - if (last_extent < 0) { - ret = last_extent; + ret = get_ext_path(orig_inode, o_start, &path); + if (ret) goto out; - } - seq_start = le32_to_cpu(ext_cur->ee_block); - } else if (le32_to_cpu(ext_cur->ee_block) > block_start) - /* The hole exists at the beginning of original file. */ - seq_start = le32_to_cpu(ext_cur->ee_block); - else - seq_start = block_start; - - /* No blocks within the specified range. */ - if (le32_to_cpu(ext_cur->ee_block) > block_end) { - ext4_debug("ext4 move extent: The specified range of file " - "may be the hole\n"); - ret = -EINVAL; - goto out; - } - - /* Adjust start blocks */ - add_blocks = min(le32_to_cpu(ext_cur->ee_block) + - ext4_ext_get_actual_len(ext_cur), block_end + 1) - - max(le32_to_cpu(ext_cur->ee_block), block_start); - - while (!last_extent && le32_to_cpu(ext_cur->ee_block) <= block_end) { - seq_blocks += add_blocks; - - /* Adjust tail blocks */ - if (seq_start + seq_blocks - 1 > block_end) - seq_blocks = block_end - seq_start + 1; - - ext_prev = ext_cur; - last_extent = mext_next_extent(orig_inode, holecheck_path, - &ext_cur); - if (last_extent < 0) { - ret = last_extent; - break; - } - add_blocks = ext4_ext_get_actual_len(ext_cur); - - /* - * Extend the length of contiguous block (seq_blocks) - * if extents are contiguous. - */ - if (ext4_can_extents_be_merged(orig_inode, - ext_prev, ext_cur) && - block_end >= le32_to_cpu(ext_cur->ee_block) && - !last_extent) + ex = path[path->p_depth].p_ext; + next_blk = ext4_ext_next_allocated_block(path); + cur_blk = le32_to_cpu(ex->ee_block); + cur_len = ext4_ext_get_actual_len(ex); + /* Check hole before the start pos */ + if (cur_blk + cur_len - 1 < o_start) { + if (next_blk == EXT_MAX_BLOCKS) { + o_start = o_end; + ret = -ENODATA; + goto out; + } + d_start += next_blk - o_start; + o_start = next_blk; continue; - - /* Is original extent is unwritten */ - unwritten = ext4_ext_is_unwritten(ext_prev); - - data_offset_in_page = seq_start % blocks_per_page; - - /* - * Calculate data blocks count that should be swapped - * at the first page. - */ - if (data_offset_in_page + seq_blocks > blocks_per_page) { - /* Swapped blocks are across pages */ - block_len_in_page = - blocks_per_page - data_offset_in_page; - } else { - /* Swapped blocks are in a page */ - block_len_in_page = seq_blocks; + /* Check hole after the start pos */ + } else if (cur_blk > o_start) { + /* Skip hole */ + d_start += cur_blk - o_start; + o_start = cur_blk; + /* Extent inside requested range ?*/ + if (cur_blk >= o_end) + goto out; + } else { /* in_range(o_start, o_blk, o_len) */ + cur_len += cur_blk - o_start; } - - orig_page_offset = seq_start >> - (PAGE_CACHE_SHIFT - orig_inode->i_blkbits); - seq_end_page = (seq_start + seq_blocks - 1) >> - (PAGE_CACHE_SHIFT - orig_inode->i_blkbits); - seq_start = le32_to_cpu(ext_cur->ee_block); - rest_blocks = seq_blocks; - + unwritten = ext4_ext_is_unwritten(ex); + if (o_end - o_start < cur_len) + cur_len = o_end - o_start; + + orig_page_index = o_start >> (PAGE_CACHE_SHIFT - + orig_inode->i_blkbits); + donor_page_index = d_start >> (PAGE_CACHE_SHIFT - + donor_inode->i_blkbits); + offset_in_page = o_start % blocks_per_page; + if (cur_len > blocks_per_page- offset_in_page) + cur_len = blocks_per_page - offset_in_page; /* * Up semaphore to avoid following problems: * a. transaction deadlock among ext4_journal_start, @@ -1426,77 +652,29 @@ ext4_move_extents(struct file *o_filp, struct file *d_filp, * in move_extent_per_page */ ext4_double_up_write_data_sem(orig_inode, donor_inode); - - while (orig_page_offset <= seq_end_page) { - - /* Swap original branches with new branches */ - block_len_in_page = move_extent_per_page( - o_filp, donor_inode, - orig_page_offset, - data_offset_in_page, - block_len_in_page, - unwritten, &ret); - - /* Count how many blocks we have exchanged */ - *moved_len += block_len_in_page; - if (ret < 0) - break; - if (*moved_len > len) { - EXT4_ERROR_INODE(orig_inode, - "We replaced blocks too much! " - "sum of replaced: %llu requested: %llu", - *moved_len, len); - ret = -EIO; - break; - } - - orig_page_offset++; - data_offset_in_page = 0; - rest_blocks -= block_len_in_page; - if (rest_blocks > blocks_per_page) - block_len_in_page = blocks_per_page; - else - block_len_in_page = rest_blocks; - } - + /* Swap original branches with new branches */ + move_extent_per_page(o_filp, donor_inode, + orig_page_index, donor_page_index, + offset_in_page, cur_len, + unwritten, &ret); ext4_double_down_write_data_sem(orig_inode, donor_inode); if (ret < 0) break; - - /* Decrease buffer counter */ - if (holecheck_path) - ext4_ext_drop_refs(holecheck_path); - ret = get_ext_path(orig_inode, seq_start, &holecheck_path); - if (ret) - break; - depth = holecheck_path->p_depth; - - /* Decrease buffer counter */ - if (orig_path) - ext4_ext_drop_refs(orig_path); - ret = get_ext_path(orig_inode, seq_start, &orig_path); - if (ret) - break; - - ext_cur = holecheck_path[depth].p_ext; - add_blocks = ext4_ext_get_actual_len(ext_cur); - seq_blocks = 0; - + o_start += cur_len; + d_start += cur_len; } + *moved_len = o_start - orig_blk; + if (*moved_len > len) + *moved_len = len; + out: if (*moved_len) { ext4_discard_preallocations(orig_inode); ext4_discard_preallocations(donor_inode); } - if (orig_path) { - ext4_ext_drop_refs(orig_path); - kfree(orig_path); - } - if (holecheck_path) { - ext4_ext_drop_refs(holecheck_path); - kfree(holecheck_path); - } + ext4_ext_drop_refs(path); + kfree(path); ext4_double_up_write_data_sem(orig_inode, donor_inode); ext4_inode_resume_unlocked_dio(orig_inode); ext4_inode_resume_unlocked_dio(donor_inode); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 603e4ebbd0ac..123798c5ac31 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -53,7 +53,7 @@ static struct buffer_head *ext4_append(handle_t *handle, ext4_lblk_t *block) { struct buffer_head *bh; - int err = 0; + int err; if (unlikely(EXT4_SB(inode->i_sb)->s_max_dir_size_kb && ((inode->i_size >> 10) >= @@ -62,9 +62,9 @@ static struct buffer_head *ext4_append(handle_t *handle, *block = inode->i_size >> inode->i_sb->s_blocksize_bits; - bh = ext4_bread(handle, inode, *block, 1, &err); - if (!bh) - return ERR_PTR(err); + bh = ext4_bread(handle, inode, *block, 1); + if (IS_ERR(bh)) + return bh; inode->i_size += inode->i_sb->s_blocksize; EXT4_I(inode)->i_disksize = inode->i_size; BUFFER_TRACE(bh, "get_write_access"); @@ -94,20 +94,20 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode, { struct buffer_head *bh; struct ext4_dir_entry *dirent; - int err = 0, is_dx_block = 0; + int is_dx_block = 0; - bh = ext4_bread(NULL, inode, block, 0, &err); - if (!bh) { - if (err == 0) { - ext4_error_inode(inode, __func__, line, block, - "Directory hole found"); - return ERR_PTR(-EIO); - } + bh = ext4_bread(NULL, inode, block, 0); + if (IS_ERR(bh)) { __ext4_warning(inode->i_sb, __func__, line, - "error reading directory block " - "(ino %lu, block %lu)", inode->i_ino, + "error %ld reading directory block " + "(ino %lu, block %lu)", PTR_ERR(bh), inode->i_ino, (unsigned long) block); - return ERR_PTR(err); + + return bh; + } + if (!bh) { + ext4_error_inode(inode, __func__, line, block, "Directory hole found"); + return ERR_PTR(-EIO); } dirent = (struct ext4_dir_entry *) bh->b_data; /* Determine whether or not we have an index block */ @@ -124,8 +124,7 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode, "directory leaf block found instead of index block"); return ERR_PTR(-EIO); } - if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) || + if (!ext4_has_metadata_csum(inode->i_sb) || buffer_verified(bh)) return bh; @@ -253,8 +252,7 @@ static unsigned dx_node_limit(struct inode *dir); static struct dx_frame *dx_probe(const struct qstr *d_name, struct inode *dir, struct dx_hash_info *hinfo, - struct dx_frame *frame, - int *err); + struct dx_frame *frame); static void dx_release(struct dx_frame *frames); static int dx_make_map(struct ext4_dir_entry_2 *de, unsigned blocksize, struct dx_hash_info *hinfo, struct dx_map_entry map[]); @@ -270,8 +268,7 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash, __u32 *start_hash); static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name, - struct ext4_dir_entry_2 **res_dir, - int *err); + struct ext4_dir_entry_2 **res_dir); static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, struct inode *inode); @@ -340,8 +337,7 @@ int ext4_dirent_csum_verify(struct inode *inode, struct ext4_dir_entry *dirent) { struct ext4_dir_entry_tail *t; - if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + if (!ext4_has_metadata_csum(inode->i_sb)) return 1; t = get_dirent_tail(inode, dirent); @@ -362,8 +358,7 @@ static void ext4_dirent_csum_set(struct inode *inode, { struct ext4_dir_entry_tail *t; - if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + if (!ext4_has_metadata_csum(inode->i_sb)) return; t = get_dirent_tail(inode, dirent); @@ -438,8 +433,7 @@ static int ext4_dx_csum_verify(struct inode *inode, struct dx_tail *t; int count_offset, limit, count; - if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + if (!ext4_has_metadata_csum(inode->i_sb)) return 1; c = get_dx_countlimit(inode, dirent, &count_offset); @@ -468,8 +462,7 @@ static void ext4_dx_csum_set(struct inode *inode, struct ext4_dir_entry *dirent) struct dx_tail *t; int count_offset, limit, count; - if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + if (!ext4_has_metadata_csum(inode->i_sb)) return; c = get_dx_countlimit(inode, dirent, &count_offset); @@ -557,8 +550,7 @@ static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize) unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) - EXT4_DIR_REC_LEN(2) - infosize; - if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + if (ext4_has_metadata_csum(dir->i_sb)) entry_space -= sizeof(struct dx_tail); return entry_space / sizeof(struct dx_entry); } @@ -567,8 +559,7 @@ static inline unsigned dx_node_limit(struct inode *dir) { unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0); - if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + if (ext4_has_metadata_csum(dir->i_sb)) entry_space -= sizeof(struct dx_tail); return entry_space / sizeof(struct dx_entry); } @@ -641,7 +632,9 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, u32 range = i < count - 1? (dx_get_hash(entries + 1) - hash): ~hash; struct stats stats; printk("%s%3u:%03u hash %8x/%8x ",levels?"":" ", i, block, hash, range); - if (!(bh = ext4_bread (NULL,dir, block, 0,&err))) continue; + bh = ext4_bread(NULL,dir, block, 0); + if (!bh || IS_ERR(bh)) + continue; stats = levels? dx_show_entries(hinfo, dir, ((struct dx_node *) bh->b_data)->entries, levels - 1): dx_show_leaf(hinfo, (struct ext4_dir_entry_2 *) bh->b_data, blocksize, 0); @@ -669,29 +662,25 @@ struct stats dx_show_entries(struct dx_hash_info *hinfo, struct inode *dir, */ static struct dx_frame * dx_probe(const struct qstr *d_name, struct inode *dir, - struct dx_hash_info *hinfo, struct dx_frame *frame_in, int *err) + struct dx_hash_info *hinfo, struct dx_frame *frame_in) { unsigned count, indirect; struct dx_entry *at, *entries, *p, *q, *m; struct dx_root *root; - struct buffer_head *bh; struct dx_frame *frame = frame_in; + struct dx_frame *ret_err = ERR_PTR(ERR_BAD_DX_DIR); u32 hash; - frame->bh = NULL; - bh = ext4_read_dirblock(dir, 0, INDEX); - if (IS_ERR(bh)) { - *err = PTR_ERR(bh); - goto fail; - } - root = (struct dx_root *) bh->b_data; + frame->bh = ext4_read_dirblock(dir, 0, INDEX); + if (IS_ERR(frame->bh)) + return (struct dx_frame *) frame->bh; + + root = (struct dx_root *) frame->bh->b_data; if (root->info.hash_version != DX_HASH_TEA && root->info.hash_version != DX_HASH_HALF_MD4 && root->info.hash_version != DX_HASH_LEGACY) { ext4_warning(dir->i_sb, "Unrecognised inode hash code %d", root->info.hash_version); - brelse(bh); - *err = ERR_BAD_DX_DIR; goto fail; } hinfo->hash_version = root->info.hash_version; @@ -705,16 +694,12 @@ dx_probe(const struct qstr *d_name, struct inode *dir, if (root->info.unused_flags & 1) { ext4_warning(dir->i_sb, "Unimplemented inode hash flags: %#06x", root->info.unused_flags); - brelse(bh); - *err = ERR_BAD_DX_DIR; goto fail; } if ((indirect = root->info.indirect_levels) > 1) { ext4_warning(dir->i_sb, "Unimplemented inode hash depth: %#06x", root->info.indirect_levels); - brelse(bh); - *err = ERR_BAD_DX_DIR; goto fail; } @@ -724,27 +709,21 @@ dx_probe(const struct qstr *d_name, struct inode *dir, if (dx_get_limit(entries) != dx_root_limit(dir, root->info.info_length)) { ext4_warning(dir->i_sb, "dx entry: limit != root limit"); - brelse(bh); - *err = ERR_BAD_DX_DIR; goto fail; } dxtrace(printk("Look up %x", hash)); - while (1) - { + while (1) { count = dx_get_count(entries); if (!count || count > dx_get_limit(entries)) { ext4_warning(dir->i_sb, "dx entry: no count or count > limit"); - brelse(bh); - *err = ERR_BAD_DX_DIR; - goto fail2; + goto fail; } p = entries + 1; q = entries + count - 1; - while (p <= q) - { + while (p <= q) { m = p + (q - p)/2; dxtrace(printk(".")); if (dx_get_hash(m) > hash) @@ -753,8 +732,7 @@ dx_probe(const struct qstr *d_name, struct inode *dir, p = m + 1; } - if (0) // linear search cross check - { + if (0) { // linear search cross check unsigned n = count - 1; at = entries; while (n--) @@ -771,38 +749,35 @@ dx_probe(const struct qstr *d_name, struct inode *dir, at = p - 1; dxtrace(printk(" %x->%u\n", at == entries? 0: dx_get_hash(at), dx_get_block(at))); - frame->bh = bh; frame->entries = entries; frame->at = at; - if (!indirect--) return frame; - bh = ext4_read_dirblock(dir, dx_get_block(at), INDEX); - if (IS_ERR(bh)) { - *err = PTR_ERR(bh); - goto fail2; + if (!indirect--) + return frame; + frame++; + frame->bh = ext4_read_dirblock(dir, dx_get_block(at), INDEX); + if (IS_ERR(frame->bh)) { + ret_err = (struct dx_frame *) frame->bh; + frame->bh = NULL; + goto fail; } - entries = ((struct dx_node *) bh->b_data)->entries; + entries = ((struct dx_node *) frame->bh->b_data)->entries; if (dx_get_limit(entries) != dx_node_limit (dir)) { ext4_warning(dir->i_sb, "dx entry: limit != node limit"); - brelse(bh); - *err = ERR_BAD_DX_DIR; - goto fail2; + goto fail; } - frame++; - frame->bh = NULL; } -fail2: +fail: while (frame >= frame_in) { brelse(frame->bh); frame--; } -fail: - if (*err == ERR_BAD_DX_DIR) + if (ret_err == ERR_PTR(ERR_BAD_DX_DIR)) ext4_warning(dir->i_sb, "Corrupt dir inode %lu, running e2fsck is " "recommended.", dir->i_ino); - return NULL; + return ret_err; } static void dx_release (struct dx_frame *frames) @@ -988,9 +963,9 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, } hinfo.hash = start_hash; hinfo.minor_hash = 0; - frame = dx_probe(NULL, dir, &hinfo, frames, &err); - if (!frame) - return err; + frame = dx_probe(NULL, dir, &hinfo, frames); + if (IS_ERR(frame)) + return PTR_ERR(frame); /* Add '.' and '..' from the htree header */ if (!start_hash && !start_minor_hash) { @@ -1227,8 +1202,7 @@ static struct buffer_head * ext4_find_entry (struct inode *dir, buffer */ int num = 0; ext4_lblk_t nblocks; - int i, err = 0; - int namelen; + int i, namelen; *res_dir = NULL; sb = dir->i_sb; @@ -1258,17 +1232,13 @@ static struct buffer_head * ext4_find_entry (struct inode *dir, goto restart; } if (is_dx(dir)) { - bh = ext4_dx_find_entry(dir, d_name, res_dir, &err); + bh = ext4_dx_find_entry(dir, d_name, res_dir); /* * On success, or if the error was file not found, * return. Otherwise, fall back to doing a search the * old fashioned way. */ - if (err == -ENOENT) - return NULL; - if (err && err != ERR_BAD_DX_DIR) - return ERR_PTR(err); - if (bh) + if (!IS_ERR(bh) || PTR_ERR(bh) != ERR_BAD_DX_DIR) return bh; dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, " "falling back\n")); @@ -1298,10 +1268,10 @@ restart: break; } num++; - bh = ext4_getblk(NULL, dir, b++, 0, &err); - if (unlikely(err)) { + bh = ext4_getblk(NULL, dir, b++, 0); + if (unlikely(IS_ERR(bh))) { if (ra_max == 0) - return ERR_PTR(err); + return bh; break; } bh_use[ra_max] = bh; @@ -1366,7 +1336,7 @@ cleanup_and_exit: } static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct qstr *d_name, - struct ext4_dir_entry_2 **res_dir, int *err) + struct ext4_dir_entry_2 **res_dir) { struct super_block * sb = dir->i_sb; struct dx_hash_info hinfo; @@ -1375,25 +1345,23 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q ext4_lblk_t block; int retval; - if (!(frame = dx_probe(d_name, dir, &hinfo, frames, err))) - return NULL; + frame = dx_probe(d_name, dir, &hinfo, frames); + if (IS_ERR(frame)) + return (struct buffer_head *) frame; do { block = dx_get_block(frame->at); bh = ext4_read_dirblock(dir, block, DIRENT); - if (IS_ERR(bh)) { - *err = PTR_ERR(bh); + if (IS_ERR(bh)) goto errout; - } + retval = search_dirblock(bh, dir, d_name, block << EXT4_BLOCK_SIZE_BITS(sb), res_dir); - if (retval == 1) { /* Success! */ - dx_release(frames); - return bh; - } + if (retval == 1) + goto success; brelse(bh); if (retval == -1) { - *err = ERR_BAD_DX_DIR; + bh = ERR_PTR(ERR_BAD_DX_DIR); goto errout; } @@ -1402,18 +1370,19 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, const struct q frames, NULL); if (retval < 0) { ext4_warning(sb, - "error reading index page in directory #%lu", - dir->i_ino); - *err = retval; + "error %d reading index page in directory #%lu", + retval, dir->i_ino); + bh = ERR_PTR(retval); goto errout; } } while (retval == 1); - *err = -ENOENT; + bh = NULL; errout: dxtrace(printk(KERN_DEBUG "%s not found\n", d_name->name)); - dx_release (frames); - return NULL; +success: + dx_release(frames); + return bh; } static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) @@ -1441,7 +1410,7 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi dentry); return ERR_PTR(-EIO); } - inode = ext4_iget(dir->i_sb, ino); + inode = ext4_iget_normal(dir->i_sb, ino); if (inode == ERR_PTR(-ESTALE)) { EXT4_ERROR_INODE(dir, "deleted inode referenced: %u", @@ -1474,7 +1443,7 @@ struct dentry *ext4_get_parent(struct dentry *child) return ERR_PTR(-EIO); } - return d_obtain_alias(ext4_iget(child->d_inode->i_sb, ino)); + return d_obtain_alias(ext4_iget_normal(child->d_inode->i_sb, ino)); } /* @@ -1533,7 +1502,7 @@ static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize) */ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, struct buffer_head **bh,struct dx_frame *frame, - struct dx_hash_info *hinfo, int *error) + struct dx_hash_info *hinfo) { unsigned blocksize = dir->i_sb->s_blocksize; unsigned count, continued; @@ -1548,16 +1517,14 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, int csum_size = 0; int err = 0, i; - if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + if (ext4_has_metadata_csum(dir->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); bh2 = ext4_append(handle, dir, &newblock); if (IS_ERR(bh2)) { brelse(*bh); *bh = NULL; - *error = PTR_ERR(bh2); - return NULL; + return (struct ext4_dir_entry_2 *) bh2; } BUFFER_TRACE(*bh, "get_write_access"); @@ -1617,8 +1584,7 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, dxtrace(dx_show_leaf (hinfo, (struct ext4_dir_entry_2 *) data2, blocksize, 1)); /* Which block gets the new entry? */ - if (hinfo->hash >= hash2) - { + if (hinfo->hash >= hash2) { swap(*bh, bh2); de = de2; } @@ -1638,8 +1604,7 @@ journal_error: brelse(bh2); *bh = NULL; ext4_std_error(dir->i_sb, err); - *error = err; - return NULL; + return ERR_PTR(err); } int ext4_find_dest_de(struct inode *dir, struct inode *inode, @@ -1718,8 +1683,7 @@ static int add_dirent_to_buf(handle_t *handle, struct dentry *dentry, int csum_size = 0; int err; - if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + if (ext4_has_metadata_csum(inode->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); if (!de) { @@ -1786,8 +1750,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, struct fake_dirent *fde; int csum_size = 0; - if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + if (ext4_has_metadata_csum(inode->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); blocksize = dir->i_sb->s_blocksize; @@ -1862,8 +1825,8 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, ext4_handle_dirty_dx_node(handle, dir, frame->bh); ext4_handle_dirty_dirent_node(handle, dir, bh); - de = do_split(handle,dir, &bh, frame, &hinfo, &retval); - if (!de) { + de = do_split(handle,dir, &bh, frame, &hinfo); + if (IS_ERR(de)) { /* * Even if the block split failed, we have to properly write * out all the changes we did so far. Otherwise we can end up @@ -1871,7 +1834,7 @@ static int make_indexed_dir(handle_t *handle, struct dentry *dentry, */ ext4_mark_inode_dirty(handle, dir); dx_release(frames); - return retval; + return PTR_ERR(de); } dx_release(frames); @@ -1904,8 +1867,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, ext4_lblk_t block, blocks; int csum_size = 0; - if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + if (ext4_has_metadata_csum(inode->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); sb = dir->i_sb; @@ -1982,9 +1944,9 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, struct ext4_dir_entry_2 *de; int err; - frame = dx_probe(&dentry->d_name, dir, &hinfo, frames, &err); - if (!frame) - return err; + frame = dx_probe(&dentry->d_name, dir, &hinfo, frames); + if (IS_ERR(frame)) + return PTR_ERR(frame); entries = frame->entries; at = frame->at; bh = ext4_read_dirblock(dir, dx_get_block(frame->at), DIRENT); @@ -2095,9 +2057,11 @@ static int ext4_dx_add_entry(handle_t *handle, struct dentry *dentry, goto cleanup; } } - de = do_split(handle, dir, &bh, frame, &hinfo, &err); - if (!de) + de = do_split(handle, dir, &bh, frame, &hinfo); + if (IS_ERR(de)) { + err = PTR_ERR(de); goto cleanup; + } err = add_dirent_to_buf(handle, dentry, inode, de, bh); goto cleanup; @@ -2167,8 +2131,7 @@ static int ext4_delete_entry(handle_t *handle, return err; } - if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + if (ext4_has_metadata_csum(dir->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); BUFFER_TRACE(bh, "get_write_access"); @@ -2387,8 +2350,7 @@ static int ext4_init_new_dir(handle_t *handle, struct inode *dir, int csum_size = 0; int err; - if (EXT4_HAS_RO_COMPAT_FEATURE(dir->i_sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + if (ext4_has_metadata_csum(dir->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { @@ -2403,10 +2365,6 @@ static int ext4_init_new_dir(handle_t *handle, struct inode *dir, dir_block = ext4_append(handle, inode, &block); if (IS_ERR(dir_block)) return PTR_ERR(dir_block); - BUFFER_TRACE(dir_block, "get_write_access"); - err = ext4_journal_get_write_access(handle, dir_block); - if (err) - goto out; de = (struct ext4_dir_entry_2 *)dir_block->b_data; ext4_init_dot_dotdot(inode, de, blocksize, csum_size, dir->i_ino, 0); set_nlink(inode, 2); @@ -2573,7 +2531,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) int err = 0, rc; bool dirty = false; - if (!sbi->s_journal) + if (!sbi->s_journal || is_bad_inode(inode)) return 0; WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) && @@ -3190,6 +3148,39 @@ static void ext4_update_dir_count(handle_t *handle, struct ext4_renament *ent) } } +static struct inode *ext4_whiteout_for_rename(struct ext4_renament *ent, + int credits, handle_t **h) +{ + struct inode *wh; + handle_t *handle; + int retries = 0; + + /* + * for inode block, sb block, group summaries, + * and inode bitmap + */ + credits += (EXT4_MAXQUOTAS_TRANS_BLOCKS(ent->dir->i_sb) + + EXT4_XATTR_TRANS_BLOCKS + 4); +retry: + wh = ext4_new_inode_start_handle(ent->dir, S_IFCHR | WHITEOUT_MODE, + &ent->dentry->d_name, 0, NULL, + EXT4_HT_DIR, credits); + + handle = ext4_journal_current_handle(); + if (IS_ERR(wh)) { + if (handle) + ext4_journal_stop(handle); + if (PTR_ERR(wh) == -ENOSPC && + ext4_should_retry_alloc(ent->dir->i_sb, &retries)) + goto retry; + } else { + *h = handle; + init_special_inode(wh, wh->i_mode, WHITEOUT_DEV); + wh->i_op = &ext4_special_inode_operations; + } + return wh; +} + /* * Anybody can rename anything with this: the permission checks are left to the * higher-level routines. @@ -3199,7 +3190,8 @@ static void ext4_update_dir_count(handle_t *handle, struct ext4_renament *ent) * This comes from rename(const char *oldpath, const char *newpath) */ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, - struct inode *new_dir, struct dentry *new_dentry) + struct inode *new_dir, struct dentry *new_dentry, + unsigned int flags) { handle_t *handle = NULL; struct ext4_renament old = { @@ -3214,6 +3206,9 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, }; int force_reread; int retval; + struct inode *whiteout = NULL; + int credits; + u8 old_file_type; dquot_initialize(old.dir); dquot_initialize(new.dir); @@ -3252,11 +3247,17 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, if (new.inode && !test_opt(new.dir->i_sb, NO_AUTO_DA_ALLOC)) ext4_alloc_da_blocks(old.inode); - handle = ext4_journal_start(old.dir, EXT4_HT_DIR, - (2 * EXT4_DATA_TRANS_BLOCKS(old.dir->i_sb) + - EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2)); - if (IS_ERR(handle)) - return PTR_ERR(handle); + credits = (2 * EXT4_DATA_TRANS_BLOCKS(old.dir->i_sb) + + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 2); + if (!(flags & RENAME_WHITEOUT)) { + handle = ext4_journal_start(old.dir, EXT4_HT_DIR, credits); + if (IS_ERR(handle)) + return PTR_ERR(handle); + } else { + whiteout = ext4_whiteout_for_rename(&old, credits, &handle); + if (IS_ERR(whiteout)) + return PTR_ERR(whiteout); + } if (IS_DIRSYNC(old.dir) || IS_DIRSYNC(new.dir)) ext4_handle_sync(handle); @@ -3284,13 +3285,26 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, */ force_reread = (new.dir->i_ino == old.dir->i_ino && ext4_test_inode_flag(new.dir, EXT4_INODE_INLINE_DATA)); + + old_file_type = old.de->file_type; + if (whiteout) { + /* + * Do this before adding a new entry, so the old entry is sure + * to be still pointing to the valid old entry. + */ + retval = ext4_setent(handle, &old, whiteout->i_ino, + EXT4_FT_CHRDEV); + if (retval) + goto end_rename; + ext4_mark_inode_dirty(handle, whiteout); + } if (!new.bh) { retval = ext4_add_entry(handle, new.dentry, old.inode); if (retval) goto end_rename; } else { retval = ext4_setent(handle, &new, - old.inode->i_ino, old.de->file_type); + old.inode->i_ino, old_file_type); if (retval) goto end_rename; } @@ -3305,10 +3319,12 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, old.inode->i_ctime = ext4_current_time(old.inode); ext4_mark_inode_dirty(handle, old.inode); - /* - * ok, that's it - */ - ext4_rename_delete(handle, &old, force_reread); + if (!whiteout) { + /* + * ok, that's it + */ + ext4_rename_delete(handle, &old, force_reread); + } if (new.inode) { ext4_dec_count(handle, new.inode); @@ -3344,6 +3360,12 @@ end_rename: brelse(old.dir_bh); brelse(old.bh); brelse(new.bh); + if (whiteout) { + if (retval) + drop_nlink(whiteout); + unlock_new_inode(whiteout); + iput(whiteout); + } if (handle) ext4_journal_stop(handle); return retval; @@ -3476,18 +3498,15 @@ static int ext4_rename2(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry, unsigned int flags) { - if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) return -EINVAL; if (flags & RENAME_EXCHANGE) { return ext4_cross_rename(old_dir, old_dentry, new_dir, new_dentry); } - /* - * Existence checking was done by the VFS, otherwise "RENAME_NOREPLACE" - * is equivalent to regular rename. - */ - return ext4_rename(old_dir, old_dentry, new_dir, new_dentry); + + return ext4_rename(old_dir, old_dentry, new_dir, new_dentry, flags); } /* diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 1e43b905ff98..f298c60f907d 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1212,8 +1212,7 @@ static int ext4_set_bitmap_checksums(struct super_block *sb, { struct buffer_head *bh; - if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + if (!ext4_has_metadata_csum(sb)) return 0; bh = ext4_get_bitmap(sb, group_data->inode_bitmap); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 0b28b36e7915..1eda6ab0ef9d 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -70,7 +70,6 @@ static void ext4_mark_recovery_complete(struct super_block *sb, static void ext4_clear_journal_err(struct super_block *sb, struct ext4_super_block *es); static int ext4_sync_fs(struct super_block *sb, int wait); -static int ext4_sync_fs_nojournal(struct super_block *sb, int wait); static int ext4_remount(struct super_block *sb, int *flags, char *data); static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf); static int ext4_unfreeze(struct super_block *sb); @@ -141,8 +140,7 @@ static __le32 ext4_superblock_csum(struct super_block *sb, static int ext4_superblock_csum_verify(struct super_block *sb, struct ext4_super_block *es) { - if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + if (!ext4_has_metadata_csum(sb)) return 1; return es->s_checksum == ext4_superblock_csum(sb, es); @@ -152,8 +150,7 @@ void ext4_superblock_csum_set(struct super_block *sb) { struct ext4_super_block *es = EXT4_SB(sb)->s_es; - if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + if (!ext4_has_metadata_csum(sb)) return; es->s_checksum = ext4_superblock_csum(sb, es); @@ -820,10 +817,9 @@ static void ext4_put_super(struct super_block *sb) percpu_counter_destroy(&sbi->s_freeinodes_counter); percpu_counter_destroy(&sbi->s_dirs_counter); percpu_counter_destroy(&sbi->s_dirtyclusters_counter); - percpu_counter_destroy(&sbi->s_extent_cache_cnt); brelse(sbi->s_sbh); #ifdef CONFIG_QUOTA - for (i = 0; i < MAXQUOTAS; i++) + for (i = 0; i < EXT4_MAXQUOTAS; i++) kfree(sbi->s_qf_names[i]); #endif @@ -885,6 +881,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) ext4_es_init_tree(&ei->i_es_tree); rwlock_init(&ei->i_es_lock); INIT_LIST_HEAD(&ei->i_es_lru); + ei->i_es_all_nr = 0; ei->i_es_lru_nr = 0; ei->i_touch_when = 0; ei->i_reserved_data_blocks = 0; @@ -1002,7 +999,7 @@ static struct inode *ext4_nfs_get_inode(struct super_block *sb, * Currently we don't know the generation for parent directory, so * a generation of 0 means "accept any" */ - inode = ext4_iget(sb, ino); + inode = ext4_iget_normal(sb, ino); if (IS_ERR(inode)) return ERR_CAST(inode); if (generation && inode->i_generation != generation) { @@ -1124,25 +1121,6 @@ static const struct super_operations ext4_sops = { .bdev_try_to_free_page = bdev_try_to_free_page, }; -static const struct super_operations ext4_nojournal_sops = { - .alloc_inode = ext4_alloc_inode, - .destroy_inode = ext4_destroy_inode, - .write_inode = ext4_write_inode, - .dirty_inode = ext4_dirty_inode, - .drop_inode = ext4_drop_inode, - .evict_inode = ext4_evict_inode, - .sync_fs = ext4_sync_fs_nojournal, - .put_super = ext4_put_super, - .statfs = ext4_statfs, - .remount_fs = ext4_remount, - .show_options = ext4_show_options, -#ifdef CONFIG_QUOTA - .quota_read = ext4_quota_read, - .quota_write = ext4_quota_write, -#endif - .bdev_try_to_free_page = bdev_try_to_free_page, -}; - static const struct export_operations ext4_export_ops = { .fh_to_dentry = ext4_fh_to_dentry, .fh_to_parent = ext4_fh_to_parent, @@ -1712,13 +1690,6 @@ static int parse_options(char *options, struct super_block *sb, "not specified"); return 0; } - } else { - if (sbi->s_jquota_fmt) { - ext4_msg(sb, KERN_ERR, "journaled quota format " - "specified with no journaling " - "enabled"); - return 0; - } } #endif if (test_opt(sb, DIOREAD_NOLOCK)) { @@ -2016,8 +1987,7 @@ static __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group, __u16 crc = 0; __le32 le_group = cpu_to_le32(block_group); - if ((sbi->s_es->s_feature_ro_compat & - cpu_to_le32(EXT4_FEATURE_RO_COMPAT_METADATA_CSUM))) { + if (ext4_has_metadata_csum(sbi->s_sb)) { /* Use new metadata_csum algorithm */ __le16 save_csum; __u32 csum32; @@ -2035,6 +2005,10 @@ static __le16 ext4_group_desc_csum(struct ext4_sb_info *sbi, __u32 block_group, } /* old crc16 code */ + if (!(sbi->s_es->s_feature_ro_compat & + cpu_to_le32(EXT4_FEATURE_RO_COMPAT_GDT_CSUM))) + return 0; + offset = offsetof(struct ext4_group_desc, bg_checksum); crc = crc16(~0, sbi->s_es->s_uuid, sizeof(sbi->s_es->s_uuid)); @@ -2191,7 +2165,7 @@ static void ext4_orphan_cleanup(struct super_block *sb, if (EXT4_SB(sb)->s_mount_state & EXT4_ERROR_FS) { /* don't clear list on RO mount w/ errors */ if (es->s_last_orphan && !(s_flags & MS_RDONLY)) { - jbd_debug(1, "Errors on filesystem, " + ext4_msg(sb, KERN_INFO, "Errors on filesystem, " "clearing orphan list.\n"); es->s_last_orphan = 0; } @@ -2207,7 +2181,7 @@ static void ext4_orphan_cleanup(struct super_block *sb, /* Needed for iput() to work correctly and not trash data */ sb->s_flags |= MS_ACTIVE; /* Turn on quotas so that they are updated correctly */ - for (i = 0; i < MAXQUOTAS; i++) { + for (i = 0; i < EXT4_MAXQUOTAS; i++) { if (EXT4_SB(sb)->s_qf_names[i]) { int ret = ext4_quota_on_mount(sb, i); if (ret < 0) @@ -2263,7 +2237,7 @@ static void ext4_orphan_cleanup(struct super_block *sb, PLURAL(nr_truncates)); #ifdef CONFIG_QUOTA /* Turn quotas off */ - for (i = 0; i < MAXQUOTAS; i++) { + for (i = 0; i < EXT4_MAXQUOTAS; i++) { if (sb_dqopt(sb)->files[i]) dquot_quota_off(sb, i); } @@ -2548,6 +2522,16 @@ static ssize_t sbi_ui_store(struct ext4_attr *a, return count; } +static ssize_t es_ui_show(struct ext4_attr *a, + struct ext4_sb_info *sbi, char *buf) +{ + + unsigned int *ui = (unsigned int *) (((char *) sbi->s_es) + + a->u.offset); + + return snprintf(buf, PAGE_SIZE, "%u\n", *ui); +} + static ssize_t reserved_clusters_show(struct ext4_attr *a, struct ext4_sb_info *sbi, char *buf) { @@ -2601,14 +2585,29 @@ static struct ext4_attr ext4_attr_##_name = { \ .offset = offsetof(struct ext4_sb_info, _elname),\ }, \ } + +#define EXT4_ATTR_OFFSET_ES(_name,_mode,_show,_store,_elname) \ +static struct ext4_attr ext4_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = _mode }, \ + .show = _show, \ + .store = _store, \ + .u = { \ + .offset = offsetof(struct ext4_super_block, _elname), \ + }, \ +} + #define EXT4_ATTR(name, mode, show, store) \ static struct ext4_attr ext4_attr_##name = __ATTR(name, mode, show, store) #define EXT4_INFO_ATTR(name) EXT4_ATTR(name, 0444, NULL, NULL) #define EXT4_RO_ATTR(name) EXT4_ATTR(name, 0444, name##_show, NULL) #define EXT4_RW_ATTR(name) EXT4_ATTR(name, 0644, name##_show, name##_store) + +#define EXT4_RO_ATTR_ES_UI(name, elname) \ + EXT4_ATTR_OFFSET_ES(name, 0444, es_ui_show, NULL, elname) #define EXT4_RW_ATTR_SBI_UI(name, elname) \ EXT4_ATTR_OFFSET(name, 0644, sbi_ui_show, sbi_ui_store, elname) + #define ATTR_LIST(name) &ext4_attr_##name.attr #define EXT4_DEPRECATED_ATTR(_name, _val) \ static struct ext4_attr ext4_attr_##_name = { \ @@ -2641,6 +2640,9 @@ EXT4_RW_ATTR_SBI_UI(warning_ratelimit_interval_ms, s_warning_ratelimit_state.int EXT4_RW_ATTR_SBI_UI(warning_ratelimit_burst, s_warning_ratelimit_state.burst); EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval); EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst); +EXT4_RO_ATTR_ES_UI(errors_count, s_error_count); +EXT4_RO_ATTR_ES_UI(first_error_time, s_first_error_time); +EXT4_RO_ATTR_ES_UI(last_error_time, s_last_error_time); static struct attribute *ext4_attrs[] = { ATTR_LIST(delayed_allocation_blocks), @@ -2664,6 +2666,9 @@ static struct attribute *ext4_attrs[] = { ATTR_LIST(warning_ratelimit_burst), ATTR_LIST(msg_ratelimit_interval_ms), ATTR_LIST(msg_ratelimit_burst), + ATTR_LIST(errors_count), + ATTR_LIST(first_error_time), + ATTR_LIST(last_error_time), NULL, }; @@ -2723,9 +2728,25 @@ static void ext4_feat_release(struct kobject *kobj) complete(&ext4_feat->f_kobj_unregister); } +static ssize_t ext4_feat_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "supported\n"); +} + +/* + * We can not use ext4_attr_show/store because it relies on the kobject + * being embedded in the ext4_sb_info structure which is definitely not + * true in this case. + */ +static const struct sysfs_ops ext4_feat_ops = { + .show = ext4_feat_show, + .store = NULL, +}; + static struct kobj_type ext4_feat_ktype = { .default_attrs = ext4_feat_attrs, - .sysfs_ops = &ext4_attr_ops, + .sysfs_ops = &ext4_feat_ops, .release = ext4_feat_release, }; @@ -3179,8 +3200,7 @@ static int set_journal_csum_feature_set(struct super_block *sb) int compat, incompat; struct ext4_sb_info *sbi = EXT4_SB(sb); - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) { + if (ext4_has_metadata_csum(sb)) { /* journal checksum v3 */ compat = 0; incompat = JBD2_FEATURE_INCOMPAT_CSUM_V3; @@ -3190,6 +3210,10 @@ static int set_journal_csum_feature_set(struct super_block *sb) incompat = 0; } + jbd2_journal_clear_features(sbi->s_journal, + JBD2_FEATURE_COMPAT_CHECKSUM, 0, + JBD2_FEATURE_INCOMPAT_CSUM_V3 | + JBD2_FEATURE_INCOMPAT_CSUM_V2); if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { ret = jbd2_journal_set_features(sbi->s_journal, compat, 0, @@ -3202,11 +3226,8 @@ static int set_journal_csum_feature_set(struct super_block *sb) jbd2_journal_clear_features(sbi->s_journal, 0, 0, JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); } else { - jbd2_journal_clear_features(sbi->s_journal, - JBD2_FEATURE_COMPAT_CHECKSUM, 0, - JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT | - JBD2_FEATURE_INCOMPAT_CSUM_V3 | - JBD2_FEATURE_INCOMPAT_CSUM_V2); + jbd2_journal_clear_features(sbi->s_journal, 0, 0, + JBD2_FEATURE_INCOMPAT_ASYNC_COMMIT); } return ret; @@ -3436,7 +3457,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) logical_sb_block = sb_block; } - if (!(bh = sb_bread(sb, logical_sb_block))) { + if (!(bh = sb_bread_unmovable(sb, logical_sb_block))) { ext4_msg(sb, KERN_ERR, "unable to read superblock"); goto out_fail; } @@ -3487,8 +3508,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) } /* Precompute checksum seed for all metadata */ - if (EXT4_HAS_RO_COMPAT_FEATURE(sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + if (ext4_has_metadata_csum(sb)) sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid, sizeof(es->s_uuid)); @@ -3519,8 +3539,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) set_opt(sb, ERRORS_CONT); else set_opt(sb, ERRORS_RO); - if (def_mount_opts & EXT4_DEFM_BLOCK_VALIDITY) - set_opt(sb, BLOCK_VALIDITY); + /* block_validity enabled by default; disable with noblock_validity */ + set_opt(sb, BLOCK_VALIDITY); if (def_mount_opts & EXT4_DEFM_DISCARD) set_opt(sb, DISCARD); @@ -3646,7 +3666,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) brelse(bh); logical_sb_block = sb_block * EXT4_MIN_BLOCK_SIZE; offset = do_div(logical_sb_block, blocksize); - bh = sb_bread(sb, logical_sb_block); + bh = sb_bread_unmovable(sb, logical_sb_block); if (!bh) { ext4_msg(sb, KERN_ERR, "Can't read superblock on 2nd try"); @@ -3868,7 +3888,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) for (i = 0; i < db_count; i++) { block = descriptor_loc(sb, logical_sb_block, i); - sbi->s_group_desc[i] = sb_bread(sb, block); + sbi->s_group_desc[i] = sb_bread_unmovable(sb, block); if (!sbi->s_group_desc[i]) { ext4_msg(sb, KERN_ERR, "can't read group descriptor %d", i); @@ -3890,12 +3910,8 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sbi->s_err_report.data = (unsigned long) sb; /* Register extent status tree shrinker */ - ext4_es_register_shrinker(sbi); - - if ((err = percpu_counter_init(&sbi->s_extent_cache_cnt, 0)) != 0) { - ext4_msg(sb, KERN_ERR, "insufficient memory"); + if (ext4_es_register_shrinker(sbi)) goto failed_mount3; - } sbi->s_stripe = ext4_get_stripe_size(sbi); sbi->s_extent_max_zeroout_kb = 32; @@ -3903,11 +3919,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) /* * set up enough so that it can read an inode */ - if (!test_opt(sb, NOLOAD) && - EXT4_HAS_COMPAT_FEATURE(sb, EXT4_FEATURE_COMPAT_HAS_JOURNAL)) - sb->s_op = &ext4_sops; - else - sb->s_op = &ext4_nojournal_sops; + sb->s_op = &ext4_sops; sb->s_export_op = &ext4_export_ops; sb->s_xattr = ext4_xattr_handlers; #ifdef CONFIG_QUOTA @@ -4106,17 +4118,20 @@ no_journal: block = ext4_count_free_clusters(sb); ext4_free_blocks_count_set(sbi->s_es, EXT4_C2B(sbi, block)); - err = percpu_counter_init(&sbi->s_freeclusters_counter, block); + err = percpu_counter_init(&sbi->s_freeclusters_counter, block, + GFP_KERNEL); if (!err) { unsigned long freei = ext4_count_free_inodes(sb); sbi->s_es->s_free_inodes_count = cpu_to_le32(freei); - err = percpu_counter_init(&sbi->s_freeinodes_counter, freei); + err = percpu_counter_init(&sbi->s_freeinodes_counter, freei, + GFP_KERNEL); } if (!err) err = percpu_counter_init(&sbi->s_dirs_counter, - ext4_count_dirs(sb)); + ext4_count_dirs(sb), GFP_KERNEL); if (!err) - err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0); + err = percpu_counter_init(&sbi->s_dirtyclusters_counter, 0, + GFP_KERNEL); if (err) { ext4_msg(sb, KERN_ERR, "insufficient memory"); goto failed_mount6; @@ -4225,10 +4240,9 @@ failed_mount_wq: jbd2_journal_destroy(sbi->s_journal); sbi->s_journal = NULL; } -failed_mount3: ext4_es_unregister_shrinker(sbi); +failed_mount3: del_timer_sync(&sbi->s_err_report); - percpu_counter_destroy(&sbi->s_extent_cache_cnt); if (sbi->s_mmp_tsk) kthread_stop(sbi->s_mmp_tsk); failed_mount2: @@ -4243,7 +4257,7 @@ failed_mount: remove_proc_entry(sb->s_id, ext4_proc_root); } #ifdef CONFIG_QUOTA - for (i = 0; i < MAXQUOTAS; i++) + for (i = 0; i < EXT4_MAXQUOTAS; i++) kfree(sbi->s_qf_names[i]); #endif ext4_blkdev_remove(sbi); @@ -4371,6 +4385,15 @@ static journal_t *ext4_get_dev_journal(struct super_block *sb, goto out_bdev; } + if ((le32_to_cpu(es->s_feature_ro_compat) & + EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) && + es->s_checksum != ext4_superblock_csum(sb, es)) { + ext4_msg(sb, KERN_ERR, "external journal has " + "corrupt superblock"); + brelse(bh); + goto out_bdev; + } + if (memcmp(EXT4_SB(sb)->s_es->s_journal_uuid, es->s_uuid, 16)) { ext4_msg(sb, KERN_ERR, "journal UUID does not match"); brelse(bh); @@ -4673,15 +4696,19 @@ static int ext4_sync_fs(struct super_block *sb, int wait) * being sent at the end of the function. But we can skip it if * transaction_commit will do it for us. */ - target = jbd2_get_latest_transaction(sbi->s_journal); - if (wait && sbi->s_journal->j_flags & JBD2_BARRIER && - !jbd2_trans_will_send_data_barrier(sbi->s_journal, target)) + if (sbi->s_journal) { + target = jbd2_get_latest_transaction(sbi->s_journal); + if (wait && sbi->s_journal->j_flags & JBD2_BARRIER && + !jbd2_trans_will_send_data_barrier(sbi->s_journal, target)) + needs_barrier = true; + + if (jbd2_journal_start_commit(sbi->s_journal, &target)) { + if (wait) + ret = jbd2_log_wait_commit(sbi->s_journal, + target); + } + } else if (wait && test_opt(sb, BARRIER)) needs_barrier = true; - - if (jbd2_journal_start_commit(sbi->s_journal, &target)) { - if (wait) - ret = jbd2_log_wait_commit(sbi->s_journal, target); - } if (needs_barrier) { int err; err = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL); @@ -4692,19 +4719,6 @@ static int ext4_sync_fs(struct super_block *sb, int wait) return ret; } -static int ext4_sync_fs_nojournal(struct super_block *sb, int wait) -{ - int ret = 0; - - trace_ext4_sync_fs(sb, wait); - flush_workqueue(EXT4_SB(sb)->rsv_conversion_wq); - dquot_writeback_dquots(sb, -1); - if (wait && test_opt(sb, BARRIER)) - ret = blkdev_issue_flush(sb->s_bdev, GFP_KERNEL, NULL); - - return ret; -} - /* * LVM calls this function before a (read-only) snapshot is created. This * gives us a chance to flush the journal completely and mark the fs clean. @@ -4723,23 +4737,26 @@ static int ext4_freeze(struct super_block *sb) journal = EXT4_SB(sb)->s_journal; - /* Now we set up the journal barrier. */ - jbd2_journal_lock_updates(journal); + if (journal) { + /* Now we set up the journal barrier. */ + jbd2_journal_lock_updates(journal); - /* - * Don't clear the needs_recovery flag if we failed to flush - * the journal. - */ - error = jbd2_journal_flush(journal); - if (error < 0) - goto out; + /* + * Don't clear the needs_recovery flag if we failed to + * flush the journal. + */ + error = jbd2_journal_flush(journal); + if (error < 0) + goto out; + } /* Journal blocked and flushed, clear needs_recovery flag. */ EXT4_CLEAR_INCOMPAT_FEATURE(sb, EXT4_FEATURE_INCOMPAT_RECOVER); error = ext4_commit_super(sb, 1); out: - /* we rely on upper layer to stop further updates */ - jbd2_journal_unlock_updates(EXT4_SB(sb)->s_journal); + if (journal) + /* we rely on upper layer to stop further updates */ + jbd2_journal_unlock_updates(journal); return error; } @@ -4770,7 +4787,7 @@ struct ext4_mount_options { u32 s_min_batch_time, s_max_batch_time; #ifdef CONFIG_QUOTA int s_jquota_fmt; - char *s_qf_names[MAXQUOTAS]; + char *s_qf_names[EXT4_MAXQUOTAS]; #endif }; @@ -4800,7 +4817,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) old_opts.s_max_batch_time = sbi->s_max_batch_time; #ifdef CONFIG_QUOTA old_opts.s_jquota_fmt = sbi->s_jquota_fmt; - for (i = 0; i < MAXQUOTAS; i++) + for (i = 0; i < EXT4_MAXQUOTAS; i++) if (sbi->s_qf_names[i]) { old_opts.s_qf_names[i] = kstrdup(sbi->s_qf_names[i], GFP_KERNEL); @@ -4961,7 +4978,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) #ifdef CONFIG_QUOTA /* Release old quota file names */ - for (i = 0; i < MAXQUOTAS; i++) + for (i = 0; i < EXT4_MAXQUOTAS; i++) kfree(old_opts.s_qf_names[i]); if (enable_quota) { if (sb_any_quota_suspended(sb)) @@ -4990,7 +5007,7 @@ restore_opts: sbi->s_max_batch_time = old_opts.s_max_batch_time; #ifdef CONFIG_QUOTA sbi->s_jquota_fmt = old_opts.s_jquota_fmt; - for (i = 0; i < MAXQUOTAS; i++) { + for (i = 0; i < EXT4_MAXQUOTAS; i++) { kfree(sbi->s_qf_names[i]); sbi->s_qf_names[i] = old_opts.s_qf_names[i]; } @@ -5193,7 +5210,7 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id, { int err; struct inode *qf_inode; - unsigned long qf_inums[MAXQUOTAS] = { + unsigned long qf_inums[EXT4_MAXQUOTAS] = { le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum), le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum) }; @@ -5221,13 +5238,13 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id, static int ext4_enable_quotas(struct super_block *sb) { int type, err = 0; - unsigned long qf_inums[MAXQUOTAS] = { + unsigned long qf_inums[EXT4_MAXQUOTAS] = { le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum), le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum) }; sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE; - for (type = 0; type < MAXQUOTAS; type++) { + for (type = 0; type < EXT4_MAXQUOTAS; type++) { if (qf_inums[type]) { err = ext4_quota_enable(sb, type, QFMT_VFS_V1, DQUOT_USAGE_ENABLED); @@ -5305,7 +5322,6 @@ static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, { struct inode *inode = sb_dqopt(sb)->files[type]; ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb); - int err = 0; int offset = off & (sb->s_blocksize - 1); int tocopy; size_t toread; @@ -5320,9 +5336,9 @@ static ssize_t ext4_quota_read(struct super_block *sb, int type, char *data, while (toread > 0) { tocopy = sb->s_blocksize - offset < toread ? sb->s_blocksize - offset : toread; - bh = ext4_bread(NULL, inode, blk, 0, &err); - if (err) - return err; + bh = ext4_bread(NULL, inode, blk, 0); + if (IS_ERR(bh)) + return PTR_ERR(bh); if (!bh) /* A hole? */ memset(data, 0, tocopy); else @@ -5343,8 +5359,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, { struct inode *inode = sb_dqopt(sb)->files[type]; ext4_lblk_t blk = off >> EXT4_BLOCK_SIZE_BITS(sb); - int err = 0; - int offset = off & (sb->s_blocksize - 1); + int err, offset = off & (sb->s_blocksize - 1); struct buffer_head *bh; handle_t *handle = journal_current_handle(); @@ -5365,14 +5380,16 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, return -EIO; } - bh = ext4_bread(handle, inode, blk, 1, &err); + bh = ext4_bread(handle, inode, blk, 1); + if (IS_ERR(bh)) + return PTR_ERR(bh); if (!bh) goto out; BUFFER_TRACE(bh, "get write access"); err = ext4_journal_get_write_access(handle, bh); if (err) { brelse(bh); - goto out; + return err; } lock_buffer(bh); memcpy(bh->b_data+offset, data, len); @@ -5381,8 +5398,6 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, err = ext4_handle_dirty_metadata(handle, NULL, bh); brelse(bh); out: - if (err) - return err; if (inode->i_size < off + len) { i_size_write(inode, off + len); EXT4_I(inode)->i_disksize = inode->i_size; diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index e7387337060c..1e09fc77395c 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -142,8 +142,7 @@ static int ext4_xattr_block_csum_verify(struct inode *inode, sector_t block_nr, struct ext4_xattr_header *hdr) { - if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) && + if (ext4_has_metadata_csum(inode->i_sb) && (hdr->h_checksum != ext4_xattr_block_csum(inode, block_nr, hdr))) return 0; return 1; @@ -153,8 +152,7 @@ static void ext4_xattr_block_csum_set(struct inode *inode, sector_t block_nr, struct ext4_xattr_header *hdr) { - if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, - EXT4_FEATURE_RO_COMPAT_METADATA_CSUM)) + if (!ext4_has_metadata_csum(inode->i_sb)) return; hdr->h_checksum = ext4_xattr_block_csum(inode, block_nr, hdr); @@ -190,14 +188,28 @@ ext4_listxattr(struct dentry *dentry, char *buffer, size_t size) } static int -ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end) +ext4_xattr_check_names(struct ext4_xattr_entry *entry, void *end, + void *value_start) { - while (!IS_LAST_ENTRY(entry)) { - struct ext4_xattr_entry *next = EXT4_XATTR_NEXT(entry); + struct ext4_xattr_entry *e = entry; + + while (!IS_LAST_ENTRY(e)) { + struct ext4_xattr_entry *next = EXT4_XATTR_NEXT(e); if ((void *)next >= end) return -EIO; - entry = next; + e = next; } + + while (!IS_LAST_ENTRY(entry)) { + if (entry->e_value_size != 0 && + (value_start + le16_to_cpu(entry->e_value_offs) < + (void *)e + sizeof(__u32) || + value_start + le16_to_cpu(entry->e_value_offs) + + le32_to_cpu(entry->e_value_size) > end)) + return -EIO; + entry = EXT4_XATTR_NEXT(entry); + } + return 0; } @@ -214,7 +226,8 @@ ext4_xattr_check_block(struct inode *inode, struct buffer_head *bh) return -EIO; if (!ext4_xattr_block_csum_verify(inode, bh->b_blocknr, BHDR(bh))) return -EIO; - error = ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size); + error = ext4_xattr_check_names(BFIRST(bh), bh->b_data + bh->b_size, + bh->b_data); if (!error) set_buffer_verified(bh); return error; @@ -331,7 +344,7 @@ ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name, header = IHDR(inode, raw_inode); entry = IFIRST(header); end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; - error = ext4_xattr_check_names(entry, end); + error = ext4_xattr_check_names(entry, end, entry); if (error) goto cleanup; error = ext4_xattr_find_entry(&entry, name_index, name, @@ -463,7 +476,7 @@ ext4_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size) raw_inode = ext4_raw_inode(&iloc); header = IHDR(inode, raw_inode); end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; - error = ext4_xattr_check_names(IFIRST(header), end); + error = ext4_xattr_check_names(IFIRST(header), end, IFIRST(header)); if (error) goto cleanup; error = ext4_xattr_list_entries(dentry, IFIRST(header), @@ -899,14 +912,8 @@ inserted: if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) goal = goal & EXT4_MAX_BLOCK_FILE_PHYS; - /* - * take i_data_sem because we will test - * i_delalloc_reserved_flag in ext4_mb_new_blocks - */ - down_read(&EXT4_I(inode)->i_data_sem); block = ext4_new_meta_blocks(handle, inode, goal, 0, NULL, &error); - up_read((&EXT4_I(inode)->i_data_sem)); if (error) goto cleanup; @@ -986,7 +993,8 @@ int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, is->s.here = is->s.first; is->s.end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { - error = ext4_xattr_check_names(IFIRST(header), is->s.end); + error = ext4_xattr_check_names(IFIRST(header), is->s.end, + IFIRST(header)); if (error) return error; /* Find the named attribute. */ diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index ec3b7a5381fa..dd10a031c052 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -72,7 +72,22 @@ out: return page; } -static inline int get_max_meta_blks(struct f2fs_sb_info *sbi, int type) +struct page *get_meta_page_ra(struct f2fs_sb_info *sbi, pgoff_t index) +{ + bool readahead = false; + struct page *page; + + page = find_get_page(META_MAPPING(sbi), index); + if (!page || (page && !PageUptodate(page))) + readahead = true; + f2fs_put_page(page, 0); + + if (readahead) + ra_meta_pages(sbi, index, MAX_BIO_BLOCKS(sbi), META_POR); + return get_meta_page(sbi, index); +} + +static inline block_t get_max_meta_blks(struct f2fs_sb_info *sbi, int type) { switch (type) { case META_NAT: @@ -82,6 +97,8 @@ static inline int get_max_meta_blks(struct f2fs_sb_info *sbi, int type) case META_SSA: case META_CP: return 0; + case META_POR: + return MAX_BLKADDR(sbi); default: BUG(); } @@ -90,12 +107,12 @@ static inline int get_max_meta_blks(struct f2fs_sb_info *sbi, int type) /* * Readahead CP/NAT/SIT/SSA pages */ -int ra_meta_pages(struct f2fs_sb_info *sbi, int start, int nrpages, int type) +int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type) { block_t prev_blk_addr = 0; struct page *page; - int blkno = start; - int max_blks = get_max_meta_blks(sbi, type); + block_t blkno = start; + block_t max_blks = get_max_meta_blks(sbi, type); struct f2fs_io_info fio = { .type = META, @@ -125,7 +142,11 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, int start, int nrpages, int type) break; case META_SSA: case META_CP: - /* get ssa/cp block addr */ + case META_POR: + if (unlikely(blkno >= max_blks)) + goto out; + if (unlikely(blkno < SEG0_BLKADDR(sbi))) + goto out; blk_addr = blkno; break; default: @@ -151,8 +172,7 @@ out: static int f2fs_write_meta_page(struct page *page, struct writeback_control *wbc) { - struct inode *inode = page->mapping->host; - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct f2fs_sb_info *sbi = F2FS_P_SB(page); trace_f2fs_writepage(page, META); @@ -177,7 +197,7 @@ redirty_out: static int f2fs_write_meta_pages(struct address_space *mapping, struct writeback_control *wbc) { - struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb); + struct f2fs_sb_info *sbi = F2FS_M_SB(mapping); long diff, written; trace_f2fs_writepages(mapping->host, wbc, META); @@ -259,15 +279,12 @@ continue_unlock: static int f2fs_set_meta_page_dirty(struct page *page) { - struct address_space *mapping = page->mapping; - struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb); - trace_f2fs_set_page_dirty(page, META); SetPageUptodate(page); if (!PageDirty(page)) { __set_page_dirty_nobuffers(page); - inc_page_count(sbi, F2FS_DIRTY_META); + inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_META); return 1; } return 0; @@ -378,7 +395,7 @@ int acquire_orphan_inode(struct f2fs_sb_info *sbi) void release_orphan_inode(struct f2fs_sb_info *sbi) { spin_lock(&sbi->ino_lock[ORPHAN_INO]); - f2fs_bug_on(sbi->n_orphans == 0); + f2fs_bug_on(sbi, sbi->n_orphans == 0); sbi->n_orphans--; spin_unlock(&sbi->ino_lock[ORPHAN_INO]); } @@ -398,7 +415,7 @@ void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) static void recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) { struct inode *inode = f2fs_iget(sbi->sb, ino); - f2fs_bug_on(IS_ERR(inode)); + f2fs_bug_on(sbi, IS_ERR(inode)); clear_nlink(inode); /* truncate all the data during iput */ @@ -459,7 +476,7 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk) list_for_each_entry(orphan, head, list) { if (!page) { page = find_get_page(META_MAPPING(sbi), start_blk++); - f2fs_bug_on(!page); + f2fs_bug_on(sbi, !page); orphan_blk = (struct f2fs_orphan_block *)page_address(page); memset(orphan_blk, 0, sizeof(*orphan_blk)); @@ -619,7 +636,7 @@ fail_no_cp: static int __add_dirty_inode(struct inode *inode, struct dir_inode_entry *new) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); if (is_inode_flag_set(F2FS_I(inode), FI_DIRTY_DIR)) return -EEXIST; @@ -631,32 +648,38 @@ static int __add_dirty_inode(struct inode *inode, struct dir_inode_entry *new) return 0; } -void set_dirty_dir_page(struct inode *inode, struct page *page) +void update_dirty_page(struct inode *inode, struct page *page) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct dir_inode_entry *new; int ret = 0; - if (!S_ISDIR(inode->i_mode)) + if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode)) return; + if (!S_ISDIR(inode->i_mode)) { + inode_inc_dirty_pages(inode); + goto out; + } + new = f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS); new->inode = inode; INIT_LIST_HEAD(&new->list); spin_lock(&sbi->dir_inode_lock); ret = __add_dirty_inode(inode, new); - inode_inc_dirty_dents(inode); - SetPagePrivate(page); + inode_inc_dirty_pages(inode); spin_unlock(&sbi->dir_inode_lock); if (ret) kmem_cache_free(inode_entry_slab, new); +out: + SetPagePrivate(page); } void add_dirty_dir_inode(struct inode *inode) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct dir_inode_entry *new = f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS); int ret = 0; @@ -674,14 +697,14 @@ void add_dirty_dir_inode(struct inode *inode) void remove_dirty_dir_inode(struct inode *inode) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct dir_inode_entry *entry; if (!S_ISDIR(inode->i_mode)) return; spin_lock(&sbi->dir_inode_lock); - if (get_dirty_dents(inode) || + if (get_dirty_pages(inode) || !is_inode_flag_set(F2FS_I(inode), FI_DIRTY_DIR)) { spin_unlock(&sbi->dir_inode_lock); return; @@ -802,11 +825,12 @@ static void wait_on_all_pages_writeback(struct f2fs_sb_info *sbi) finish_wait(&sbi->cp_wait, &wait); } -static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) +static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); - nid_t last_nid = 0; + struct f2fs_nm_info *nm_i = NM_I(sbi); + nid_t last_nid = nm_i->next_scan_nid; block_t start_blk; struct page *cp_page; unsigned int data_sum_blocks, orphan_blocks; @@ -869,7 +893,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) ckpt->cp_pack_start_sum = cpu_to_le32(1 + cp_payload_blks + orphan_blocks); - if (is_umount) { + if (cpc->reason == CP_UMOUNT) { set_ckpt_flags(ckpt, CP_UMOUNT_FLAG); ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS+ cp_payload_blks + data_sum_blocks + @@ -886,6 +910,9 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) else clear_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG); + if (sbi->need_fsck) + set_ckpt_flags(ckpt, CP_FSCK_FLAG); + /* update SIT/NAT bitmap */ get_sit_bitmap(sbi, __bitmap_ptr(sbi, SIT_BITMAP)); get_nat_bitmap(sbi, __bitmap_ptr(sbi, NAT_BITMAP)); @@ -920,7 +947,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) write_data_summaries(sbi, start_blk); start_blk += data_sum_blocks; - if (is_umount) { + if (cpc->reason == CP_UMOUNT) { write_node_summaries(sbi, start_blk); start_blk += NR_CURSEG_NODE_TYPE; } @@ -960,23 +987,23 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) /* * We guarantee that this checkpoint procedure will not fail. */ -void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) +void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); unsigned long long ckpt_ver; - trace_f2fs_write_checkpoint(sbi->sb, is_umount, "start block_ops"); + trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "start block_ops"); mutex_lock(&sbi->cp_mutex); - if (!sbi->s_dirty) + if (!sbi->s_dirty && cpc->reason != CP_DISCARD) goto out; if (unlikely(f2fs_cp_error(sbi))) goto out; if (block_operations(sbi)) goto out; - trace_f2fs_write_checkpoint(sbi->sb, is_umount, "finish block_ops"); + trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish block_ops"); f2fs_submit_merged_bio(sbi, DATA, WRITE); f2fs_submit_merged_bio(sbi, NODE, WRITE); @@ -992,16 +1019,16 @@ void write_checkpoint(struct f2fs_sb_info *sbi, bool is_umount) /* write cached NAT/SIT entries to NAT/SIT area */ flush_nat_entries(sbi); - flush_sit_entries(sbi); + flush_sit_entries(sbi, cpc); /* unlock all the fs_lock[] in do_checkpoint() */ - do_checkpoint(sbi, is_umount); + do_checkpoint(sbi, cpc); unblock_operations(sbi); stat_inc_cp_count(sbi->stat_info); out: mutex_unlock(&sbi->cp_mutex); - trace_f2fs_write_checkpoint(sbi->sb, is_umount, "finish checkpoint"); + trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish checkpoint"); } void init_ino_entry_info(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 76de83e25a89..8e58c4cc2cb9 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -85,7 +85,7 @@ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr, bio = bio_alloc(GFP_NOIO, npages); bio->bi_bdev = sbi->sb->s_bdev; - bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(sbi, blk_addr); + bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blk_addr); bio->bi_end_io = is_read ? f2fs_read_end_io : f2fs_write_end_io; bio->bi_private = sbi; @@ -193,7 +193,7 @@ void f2fs_submit_page_mbio(struct f2fs_sb_info *sbi, struct page *page, __submit_merged_bio(io); alloc_new: if (io->bio == NULL) { - int bio_blocks = MAX_BIO_BLOCKS(max_hw_blocks(sbi)); + int bio_blocks = MAX_BIO_BLOCKS(sbi); io->bio = __bio_alloc(sbi, blk_addr, bio_blocks, is_read); io->fio = *fio; @@ -236,7 +236,7 @@ static void __set_data_blkaddr(struct dnode_of_data *dn, block_t new_addr) int reserve_new_block(struct dnode_of_data *dn) { - struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC))) return -EPERM; @@ -258,7 +258,7 @@ int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index) int err; /* if inode_page exists, index should be zero */ - f2fs_bug_on(!need_put && index); + f2fs_bug_on(F2FS_I_SB(dn->inode), !need_put && index); err = get_dnode_of_data(dn, index, ALLOC_NODE); if (err) @@ -321,7 +321,7 @@ void update_extent_cache(block_t blk_addr, struct dnode_of_data *dn) block_t start_blkaddr, end_blkaddr; int need_update = true; - f2fs_bug_on(blk_addr == NEW_ADDR); + f2fs_bug_on(F2FS_I_SB(dn->inode), blk_addr == NEW_ADDR); fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) + dn->ofs_in_node; @@ -396,7 +396,6 @@ end_update: struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct address_space *mapping = inode->i_mapping; struct dnode_of_data dn; struct page *page; @@ -429,7 +428,7 @@ struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync) return page; } - err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr, + err = f2fs_submit_page_bio(F2FS_I_SB(inode), page, dn.data_blkaddr, sync ? READ_SYNC : READA); if (err) return ERR_PTR(err); @@ -451,7 +450,6 @@ struct page *find_data_page(struct inode *inode, pgoff_t index, bool sync) */ struct page *get_lock_data_page(struct inode *inode, pgoff_t index) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct address_space *mapping = inode->i_mapping; struct dnode_of_data dn; struct page *page; @@ -490,7 +488,8 @@ repeat: return page; } - err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr, READ_SYNC); + err = f2fs_submit_page_bio(F2FS_I_SB(inode), page, + dn.data_blkaddr, READ_SYNC); if (err) return ERR_PTR(err); @@ -517,7 +516,6 @@ repeat: struct page *get_new_data_page(struct inode *inode, struct page *ipage, pgoff_t index, bool new_i_size) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct address_space *mapping = inode->i_mapping; struct page *page; struct dnode_of_data dn; @@ -541,8 +539,8 @@ repeat: zero_user_segment(page, 0, PAGE_CACHE_SIZE); SetPageUptodate(page); } else { - err = f2fs_submit_page_bio(sbi, page, dn.data_blkaddr, - READ_SYNC); + err = f2fs_submit_page_bio(F2FS_I_SB(inode), page, + dn.data_blkaddr, READ_SYNC); if (err) goto put_err; @@ -573,10 +571,12 @@ put_err: static int __allocate_data_block(struct dnode_of_data *dn) { - struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); + struct f2fs_inode_info *fi = F2FS_I(dn->inode); struct f2fs_summary sum; block_t new_blkaddr; struct node_info ni; + pgoff_t fofs; int type; if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC))) @@ -599,6 +599,12 @@ static int __allocate_data_block(struct dnode_of_data *dn) update_extent_cache(new_blkaddr, dn); clear_inode_flag(F2FS_I(dn->inode), FI_NO_EXTENT); + /* update i_size */ + fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) + + dn->ofs_in_node; + if (i_size_read(dn->inode) < ((fofs + 1) << PAGE_CACHE_SHIFT)) + i_size_write(dn->inode, ((fofs + 1) << PAGE_CACHE_SHIFT)); + dn->data_blkaddr = new_blkaddr; return 0; } @@ -614,7 +620,6 @@ static int __allocate_data_block(struct dnode_of_data *dn) static int __get_data_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create, bool fiemap) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); unsigned int blkbits = inode->i_sb->s_blocksize_bits; unsigned maxblocks = bh_result->b_size >> blkbits; struct dnode_of_data dn; @@ -630,8 +635,8 @@ static int __get_data_block(struct inode *inode, sector_t iblock, goto out; if (create) { - f2fs_balance_fs(sbi); - f2fs_lock_op(sbi); + f2fs_balance_fs(F2FS_I_SB(inode)); + f2fs_lock_op(F2FS_I_SB(inode)); } /* When reading holes, we need its node page */ @@ -707,7 +712,7 @@ put_out: f2fs_put_dnode(&dn); unlock_out: if (create) - f2fs_unlock_op(sbi); + f2fs_unlock_op(F2FS_I_SB(inode)); out: trace_f2fs_get_data_block(inode, iblock, bh_result, err); return err; @@ -804,7 +809,7 @@ static int f2fs_write_data_page(struct page *page, struct writeback_control *wbc) { struct inode *inode = page->mapping->host; - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); loff_t i_size = i_size_read(inode); const pgoff_t end_index = ((unsigned long long) i_size) >> PAGE_CACHE_SHIFT; @@ -846,7 +851,7 @@ write: if (unlikely(f2fs_cp_error(sbi))) { SetPageError(page); unlock_page(page); - return 0; + goto out; } if (!wbc->for_reclaim) @@ -866,7 +871,7 @@ done: clear_cold_data(page); out: - inode_dec_dirty_dents(inode); + inode_dec_dirty_pages(inode); unlock_page(page); if (need_balance_fs) f2fs_balance_fs(sbi); @@ -892,7 +897,7 @@ static int f2fs_write_data_pages(struct address_space *mapping, struct writeback_control *wbc) { struct inode *inode = mapping->host; - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); bool locked = false; int ret; long diff; @@ -904,7 +909,7 @@ static int f2fs_write_data_pages(struct address_space *mapping, return 0; if (S_ISDIR(inode->i_mode) && wbc->sync_mode == WB_SYNC_NONE && - get_dirty_dents(inode) < nr_pages_to_skip(sbi, DATA) && + get_dirty_pages(inode) < nr_pages_to_skip(sbi, DATA) && available_free_memory(sbi, DIRTY_DENTS)) goto skip_write; @@ -926,7 +931,7 @@ static int f2fs_write_data_pages(struct address_space *mapping, return ret; skip_write: - wbc->pages_skipped += get_dirty_dents(inode); + wbc->pages_skipped += get_dirty_pages(inode); return 0; } @@ -945,7 +950,7 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, struct page **pagep, void **fsdata) { struct inode *inode = mapping->host; - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct page *page; pgoff_t index = ((unsigned long long) pos) >> PAGE_CACHE_SHIFT; struct dnode_of_data dn; @@ -1047,7 +1052,10 @@ static int f2fs_write_end(struct file *file, trace_f2fs_write_end(inode, pos, len, copied); - set_page_dirty(page); + if (f2fs_is_atomic_file(inode) || f2fs_is_volatile_file(inode)) + register_inmem_page(inode, page); + else + set_page_dirty(page); if (pos + copied > i_size_read(inode)) { i_size_write(inode, pos + copied); @@ -1092,9 +1100,6 @@ static ssize_t f2fs_direct_IO(int rw, struct kiocb *iocb, if (check_direct_IO(inode, rw, iter, offset)) return 0; - /* clear fsync mark to recover these blocks */ - fsync_mark_clear(F2FS_SB(inode->i_sb), inode->i_ino); - trace_f2fs_direct_IO_enter(inode, offset, count, rw); err = blockdev_direct_IO(rw, iocb, inode, iter, offset, get_data_block); @@ -1110,8 +1115,12 @@ static void f2fs_invalidate_data_page(struct page *page, unsigned int offset, unsigned int length) { struct inode *inode = page->mapping->host; + + if (offset % PAGE_CACHE_SIZE || length != PAGE_CACHE_SIZE) + return; + if (PageDirty(page)) - inode_dec_dirty_dents(inode); + inode_dec_dirty_pages(inode); ClearPagePrivate(page); } @@ -1133,7 +1142,7 @@ static int f2fs_set_data_page_dirty(struct page *page) if (!PageDirty(page)) { __set_page_dirty_nobuffers(page); - set_dirty_dir_page(inode, page); + update_dirty_page(inode, page); return 1; } return 0; diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index fecebdbfd781..0a91ab813a9e 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -93,7 +93,7 @@ static void update_sit_info(struct f2fs_sb_info *sbi) total_vblocks = 0; blks_per_sec = sbi->segs_per_sec * (1 << sbi->log_blocks_per_seg); hblks_per_sec = blks_per_sec / 2; - for (segno = 0; segno < TOTAL_SEGS(sbi); segno += sbi->segs_per_sec) { + for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) { vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec); dist = abs(vblocks - hblks_per_sec); bimodal += dist * dist; @@ -103,7 +103,7 @@ static void update_sit_info(struct f2fs_sb_info *sbi) ndirty++; } } - dist = TOTAL_SECS(sbi) * hblks_per_sec * hblks_per_sec / 100; + dist = MAIN_SECS(sbi) * hblks_per_sec * hblks_per_sec / 100; si->bimodal = bimodal / dist; if (si->dirty_count) si->avg_vblocks = total_vblocks / ndirty; @@ -131,17 +131,17 @@ static void update_mem_info(struct f2fs_sb_info *sbi) /* build sit */ si->base_mem += sizeof(struct sit_info); - si->base_mem += TOTAL_SEGS(sbi) * sizeof(struct seg_entry); - si->base_mem += f2fs_bitmap_size(TOTAL_SEGS(sbi)); - si->base_mem += 2 * SIT_VBLOCK_MAP_SIZE * TOTAL_SEGS(sbi); + si->base_mem += MAIN_SEGS(sbi) * sizeof(struct seg_entry); + si->base_mem += f2fs_bitmap_size(MAIN_SEGS(sbi)); + si->base_mem += 2 * SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi); if (sbi->segs_per_sec > 1) - si->base_mem += TOTAL_SECS(sbi) * sizeof(struct sec_entry); + si->base_mem += MAIN_SECS(sbi) * sizeof(struct sec_entry); si->base_mem += __bitmap_size(sbi, SIT_BITMAP); /* build free segmap */ si->base_mem += sizeof(struct free_segmap_info); - si->base_mem += f2fs_bitmap_size(TOTAL_SEGS(sbi)); - si->base_mem += f2fs_bitmap_size(TOTAL_SECS(sbi)); + si->base_mem += f2fs_bitmap_size(MAIN_SEGS(sbi)); + si->base_mem += f2fs_bitmap_size(MAIN_SECS(sbi)); /* build curseg */ si->base_mem += sizeof(struct curseg_info) * NR_CURSEG_TYPE; @@ -149,8 +149,8 @@ static void update_mem_info(struct f2fs_sb_info *sbi) /* build dirty segmap */ si->base_mem += sizeof(struct dirty_seglist_info); - si->base_mem += NR_DIRTY_TYPE * f2fs_bitmap_size(TOTAL_SEGS(sbi)); - si->base_mem += f2fs_bitmap_size(TOTAL_SECS(sbi)); + si->base_mem += NR_DIRTY_TYPE * f2fs_bitmap_size(MAIN_SEGS(sbi)); + si->base_mem += f2fs_bitmap_size(MAIN_SECS(sbi)); /* build nm */ si->base_mem += sizeof(struct f2fs_nm_info); diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 155fb056b7f1..b54f87149c09 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -126,7 +126,7 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page, * For the most part, it should be a bug when name_len is zero. * We stop here for figuring out where the bugs has occurred. */ - f2fs_bug_on(!de->name_len); + f2fs_bug_on(F2FS_P_SB(dentry_page), !de->name_len); bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); } @@ -151,7 +151,7 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, bool room = false; int max_slots = 0; - f2fs_bug_on(level > MAX_DIR_HASH_DEPTH); + f2fs_bug_on(F2FS_I_SB(dir), level > MAX_DIR_HASH_DEPTH); nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level); nblock = bucket_blocks(level); @@ -284,10 +284,9 @@ static void init_dent_inode(const struct qstr *name, struct page *ipage) int update_dent_inode(struct inode *inode, const struct qstr *name) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct page *page; - page = get_node_page(sbi, inode->i_ino); + page = get_node_page(F2FS_I_SB(inode), inode->i_ino); if (IS_ERR(page)) return PTR_ERR(page); @@ -337,7 +336,6 @@ static int make_empty_dir(struct inode *inode, static struct page *init_inode_metadata(struct inode *inode, struct inode *dir, const struct qstr *name) { - struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); struct page *page; int err; @@ -360,7 +358,7 @@ static struct page *init_inode_metadata(struct inode *inode, if (err) goto put_error; } else { - page = get_node_page(F2FS_SB(dir->i_sb), inode->i_ino); + page = get_node_page(F2FS_I_SB(dir), inode->i_ino); if (IS_ERR(page)) return page; @@ -381,7 +379,7 @@ static struct page *init_inode_metadata(struct inode *inode, * we should remove this inode from orphan list. */ if (inode->i_nlink == 0) - remove_orphan_inode(sbi, inode->i_ino); + remove_orphan_inode(F2FS_I_SB(dir), inode->i_ino); inc_nlink(inode); } return page; @@ -571,8 +569,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, { struct f2fs_dentry_block *dentry_blk; unsigned int bit_pos; - struct address_space *mapping = page->mapping; - struct inode *dir = mapping->host; + struct inode *dir = page->mapping->host; int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len)); int i; @@ -594,7 +591,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, dir->i_ctime = dir->i_mtime = CURRENT_TIME; if (inode) { - struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); down_write(&F2FS_I(inode)->i_sem); @@ -621,7 +618,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, truncate_hole(dir, page->index, page->index + 1); clear_page_dirty_for_io(page); ClearPageUptodate(page); - inode_dec_dirty_dents(dir); + inode_dec_dirty_pages(dir); } f2fs_put_page(page, 1); } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index e921242186f6..8171e80b2ee9 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -21,10 +21,16 @@ #include <linux/sched.h> #ifdef CONFIG_F2FS_CHECK_FS -#define f2fs_bug_on(condition) BUG_ON(condition) +#define f2fs_bug_on(sbi, condition) BUG_ON(condition) #define f2fs_down_write(x, y) down_write_nest_lock(x, y) #else -#define f2fs_bug_on(condition) WARN_ON(condition) +#define f2fs_bug_on(sbi, condition) \ + do { \ + if (unlikely(condition)) { \ + WARN_ON(1); \ + sbi->need_fsck = true; \ + } \ + } while (0) #define f2fs_down_write(x, y) down_write(x) #endif @@ -90,6 +96,20 @@ enum { SIT_BITMAP }; +enum { + CP_UMOUNT, + CP_SYNC, + CP_DISCARD, +}; + +struct cp_control { + int reason; + __u64 trim_start; + __u64 trim_end; + __u64 trim_minlen; + __u64 trimmed; +}; + /* * For CP/NAT/SIT/SSA readahead */ @@ -97,7 +117,8 @@ enum { META_CP, META_NAT, META_SIT, - META_SSA + META_SSA, + META_POR, }; /* for the list of ino */ @@ -130,7 +151,9 @@ struct discard_entry { struct fsync_inode_entry { struct list_head list; /* list head */ struct inode *inode; /* vfs inode pointer */ - block_t blkaddr; /* block address locating the last inode */ + block_t blkaddr; /* block address locating the last fsync */ + block_t last_dentry; /* block address locating the last dentry */ + block_t last_inode; /* block address locating the last inode */ }; #define nats_in_cursum(sum) (le16_to_cpu(sum->n_nats)) @@ -141,6 +164,9 @@ struct fsync_inode_entry { #define sit_in_journal(sum, i) (sum->sit_j.entries[i].se) #define segno_in_journal(sum, i) (sum->sit_j.entries[i].segno) +#define MAX_NAT_JENTRIES(sum) (NAT_JOURNAL_ENTRIES - nats_in_cursum(sum)) +#define MAX_SIT_JENTRIES(sum) (SIT_JOURNAL_ENTRIES - sits_in_cursum(sum)) + static inline int update_nats_in_cursum(struct f2fs_summary_block *rs, int i) { int before = nats_in_cursum(rs); @@ -155,11 +181,24 @@ static inline int update_sits_in_cursum(struct f2fs_summary_block *rs, int i) return before; } +static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size, + int type) +{ + if (type == NAT_JOURNAL) + return size <= MAX_NAT_JENTRIES(sum); + return size <= MAX_SIT_JENTRIES(sum); +} + /* * ioctl commands */ -#define F2FS_IOC_GETFLAGS FS_IOC_GETFLAGS -#define F2FS_IOC_SETFLAGS FS_IOC_SETFLAGS +#define F2FS_IOC_GETFLAGS FS_IOC_GETFLAGS +#define F2FS_IOC_SETFLAGS FS_IOC_SETFLAGS + +#define F2FS_IOCTL_MAGIC 0xf5 +#define F2FS_IOC_START_ATOMIC_WRITE _IO(F2FS_IOCTL_MAGIC, 1) +#define F2FS_IOC_COMMIT_ATOMIC_WRITE _IO(F2FS_IOCTL_MAGIC, 2) +#define F2FS_IOC_START_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 3) #if defined(__KERNEL__) && defined(CONFIG_COMPAT) /* @@ -222,13 +261,16 @@ struct f2fs_inode_info { /* Use below internally in f2fs*/ unsigned long flags; /* use to pass per-file flags */ struct rw_semaphore i_sem; /* protect fi info */ - atomic_t dirty_dents; /* # of dirty dentry pages */ + atomic_t dirty_pages; /* # of dirty pages */ f2fs_hash_t chash; /* hash value of given file name */ unsigned int clevel; /* maximum level of given file name */ nid_t i_xattr_nid; /* node id that contains xattrs */ unsigned long long xattr_ver; /* cp version of xattr modification */ struct extent_info ext; /* in-memory extent cache entry */ struct dir_inode_entry *dirty_dir; /* the pointer of dirty dir */ + + struct list_head inmem_pages; /* inmemory pages managed by f2fs */ + struct mutex inmem_lock; /* lock for inmemory pages */ }; static inline void get_extent_info(struct extent_info *ext, @@ -260,11 +302,10 @@ struct f2fs_nm_info { /* NAT cache management */ struct radix_tree_root nat_root;/* root of the nat entry cache */ + struct radix_tree_root nat_set_root;/* root of the nat set cache */ rwlock_t nat_tree_lock; /* protect nat_tree_lock */ - unsigned int nat_cnt; /* the # of cached nat entries */ struct list_head nat_entries; /* cached nat entry list (clean) */ - struct list_head dirty_nat_entries; /* cached nat entry list (dirty) */ - struct list_head nat_entry_set; /* nat entry set list */ + unsigned int nat_cnt; /* the # of cached nat entries */ unsigned int dirty_nat_cnt; /* total num of nat entries in set */ /* free node ids management */ @@ -332,18 +373,16 @@ enum { }; struct flush_cmd { - struct flush_cmd *next; struct completion wait; + struct llist_node llnode; int ret; }; struct flush_cmd_control { struct task_struct *f2fs_issue_flush; /* flush thread */ wait_queue_head_t flush_wait_queue; /* waiting queue for wake-up */ - struct flush_cmd *issue_list; /* list for command issue */ - struct flush_cmd *dispatch_list; /* list for command dispatch */ - spinlock_t issue_lock; /* for issue list lock */ - struct flush_cmd *issue_tail; /* list tail of issue list */ + struct llist_head issue_list; /* list for command issue */ + struct llist_node *dispatch_list; /* list for command dispatch */ }; struct f2fs_sm_info { @@ -369,8 +408,11 @@ struct f2fs_sm_info { int nr_discards; /* # of discards in the list */ int max_discards; /* max. discards to be issued */ + struct list_head sit_entry_set; /* sit entry set list */ + unsigned int ipu_policy; /* in-place-update policy */ unsigned int min_ipu_util; /* in-place-update threshold */ + unsigned int min_fsync_blocks; /* threshold for fsync */ /* for flush command control */ struct flush_cmd_control *cmd_control_info; @@ -434,6 +476,7 @@ struct f2fs_sb_info { struct buffer_head *raw_super_buf; /* buffer head of raw sb */ struct f2fs_super_block *raw_super; /* raw super block pointer */ int s_dirty; /* dirty flag for checkpoint */ + bool need_fsck; /* need fsck.f2fs to fix */ /* for node-related operations */ struct f2fs_nm_info *nm_info; /* node manager */ @@ -539,6 +582,21 @@ static inline struct f2fs_sb_info *F2FS_SB(struct super_block *sb) return sb->s_fs_info; } +static inline struct f2fs_sb_info *F2FS_I_SB(struct inode *inode) +{ + return F2FS_SB(inode->i_sb); +} + +static inline struct f2fs_sb_info *F2FS_M_SB(struct address_space *mapping) +{ + return F2FS_I_SB(mapping->host); +} + +static inline struct f2fs_sb_info *F2FS_P_SB(struct page *page) +{ + return F2FS_M_SB(page->mapping); +} + static inline struct f2fs_super_block *F2FS_RAW_SUPER(struct f2fs_sb_info *sbi) { return (struct f2fs_super_block *)(sbi->raw_super); @@ -703,8 +761,8 @@ static inline void dec_valid_block_count(struct f2fs_sb_info *sbi, blkcnt_t count) { spin_lock(&sbi->stat_lock); - f2fs_bug_on(sbi->total_valid_block_count < (block_t) count); - f2fs_bug_on(inode->i_blocks < count); + f2fs_bug_on(sbi, sbi->total_valid_block_count < (block_t) count); + f2fs_bug_on(sbi, inode->i_blocks < count); inode->i_blocks -= count; sbi->total_valid_block_count -= (block_t)count; spin_unlock(&sbi->stat_lock); @@ -716,10 +774,11 @@ static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type) F2FS_SET_SB_DIRT(sbi); } -static inline void inode_inc_dirty_dents(struct inode *inode) +static inline void inode_inc_dirty_pages(struct inode *inode) { - inc_page_count(F2FS_SB(inode->i_sb), F2FS_DIRTY_DENTS); - atomic_inc(&F2FS_I(inode)->dirty_dents); + atomic_inc(&F2FS_I(inode)->dirty_pages); + if (S_ISDIR(inode->i_mode)) + inc_page_count(F2FS_I_SB(inode), F2FS_DIRTY_DENTS); } static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type) @@ -727,13 +786,15 @@ static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type) atomic_dec(&sbi->nr_pages[count_type]); } -static inline void inode_dec_dirty_dents(struct inode *inode) +static inline void inode_dec_dirty_pages(struct inode *inode) { - if (!S_ISDIR(inode->i_mode)) + if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode)) return; - dec_page_count(F2FS_SB(inode->i_sb), F2FS_DIRTY_DENTS); - atomic_dec(&F2FS_I(inode)->dirty_dents); + atomic_dec(&F2FS_I(inode)->dirty_pages); + + if (S_ISDIR(inode->i_mode)) + dec_page_count(F2FS_I_SB(inode), F2FS_DIRTY_DENTS); } static inline int get_pages(struct f2fs_sb_info *sbi, int count_type) @@ -741,9 +802,9 @@ static inline int get_pages(struct f2fs_sb_info *sbi, int count_type) return atomic_read(&sbi->nr_pages[count_type]); } -static inline int get_dirty_dents(struct inode *inode) +static inline int get_dirty_pages(struct inode *inode) { - return atomic_read(&F2FS_I(inode)->dirty_dents); + return atomic_read(&F2FS_I(inode)->dirty_pages); } static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type) @@ -848,9 +909,9 @@ static inline void dec_valid_node_count(struct f2fs_sb_info *sbi, { spin_lock(&sbi->stat_lock); - f2fs_bug_on(!sbi->total_valid_block_count); - f2fs_bug_on(!sbi->total_valid_node_count); - f2fs_bug_on(!inode->i_blocks); + f2fs_bug_on(sbi, !sbi->total_valid_block_count); + f2fs_bug_on(sbi, !sbi->total_valid_node_count); + f2fs_bug_on(sbi, !inode->i_blocks); inode->i_blocks--; sbi->total_valid_node_count--; @@ -867,7 +928,7 @@ static inline unsigned int valid_node_count(struct f2fs_sb_info *sbi) static inline void inc_valid_inode_count(struct f2fs_sb_info *sbi) { spin_lock(&sbi->stat_lock); - f2fs_bug_on(sbi->total_valid_inode_count == sbi->total_node_count); + f2fs_bug_on(sbi, sbi->total_valid_inode_count == sbi->total_node_count); sbi->total_valid_inode_count++; spin_unlock(&sbi->stat_lock); } @@ -875,7 +936,7 @@ static inline void inc_valid_inode_count(struct f2fs_sb_info *sbi) static inline void dec_valid_inode_count(struct f2fs_sb_info *sbi) { spin_lock(&sbi->stat_lock); - f2fs_bug_on(!sbi->total_valid_inode_count); + f2fs_bug_on(sbi, !sbi->total_valid_inode_count); sbi->total_valid_inode_count--; spin_unlock(&sbi->stat_lock); } @@ -891,7 +952,7 @@ static inline void f2fs_put_page(struct page *page, int unlock) return; if (unlock) { - f2fs_bug_on(!PageLocked(page)); + f2fs_bug_on(F2FS_P_SB(page), !PageLocked(page)); unlock_page(page); } page_cache_release(page); @@ -998,7 +1059,9 @@ enum { FI_INLINE_DATA, /* used for inline data*/ FI_APPEND_WRITE, /* inode has appended data */ FI_UPDATE_WRITE, /* inode has in-place-update data */ - FI_NEED_IPU, /* used fo ipu for fdatasync */ + FI_NEED_IPU, /* used for ipu per file */ + FI_ATOMIC_FILE, /* indicate atomic file */ + FI_VOLATILE_FILE, /* indicate volatile file */ }; static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag) @@ -1085,6 +1148,16 @@ static inline int f2fs_has_inline_data(struct inode *inode) return is_inode_flag_set(F2FS_I(inode), FI_INLINE_DATA); } +static inline bool f2fs_is_atomic_file(struct inode *inode) +{ + return is_inode_flag_set(F2FS_I(inode), FI_ATOMIC_FILE); +} + +static inline bool f2fs_is_volatile_file(struct inode *inode) +{ + return is_inode_flag_set(F2FS_I(inode), FI_VOLATILE_FILE); +} + static inline void *inline_data_addr(struct page *page) { struct f2fs_inode *ri = F2FS_INODE(page); @@ -1141,6 +1214,7 @@ void update_inode(struct inode *, struct page *); void update_inode_page(struct inode *); int f2fs_write_inode(struct inode *, struct writeback_control *); void f2fs_evict_inode(struct inode *); +void handle_failed_inode(struct inode *); /* * namei.c @@ -1188,9 +1262,9 @@ struct dnode_of_data; struct node_info; bool available_free_memory(struct f2fs_sb_info *, int); -int is_checkpointed_node(struct f2fs_sb_info *, nid_t); -bool fsync_mark_done(struct f2fs_sb_info *, nid_t); -void fsync_mark_clear(struct f2fs_sb_info *, nid_t); +bool is_checkpointed_node(struct f2fs_sb_info *, nid_t); +bool has_fsynced_inode(struct f2fs_sb_info *, nid_t); +bool need_inode_block_update(struct f2fs_sb_info *, nid_t); void get_node_info(struct f2fs_sb_info *, nid_t, struct node_info *); int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int); int truncate_inode_blocks(struct inode *, pgoff_t); @@ -1221,6 +1295,8 @@ void destroy_node_manager_caches(void); /* * segment.c */ +void register_inmem_page(struct inode *, struct page *); +void commit_inmem_pages(struct inode *, bool); void f2fs_balance_fs(struct f2fs_sb_info *); void f2fs_balance_fs_bg(struct f2fs_sb_info *); int f2fs_issue_flush(struct f2fs_sb_info *); @@ -1229,9 +1305,11 @@ void destroy_flush_cmd_control(struct f2fs_sb_info *); void invalidate_blocks(struct f2fs_sb_info *, block_t); void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t); void clear_prefree_segments(struct f2fs_sb_info *); +void release_discard_addrs(struct f2fs_sb_info *); void discard_next_dnode(struct f2fs_sb_info *, block_t); int npages_for_summary_flush(struct f2fs_sb_info *); void allocate_new_segments(struct f2fs_sb_info *); +int f2fs_trim_fs(struct f2fs_sb_info *, struct fstrim_range *); struct page *get_sum_page(struct f2fs_sb_info *, unsigned int); void write_meta_page(struct f2fs_sb_info *, struct page *); void write_node_page(struct f2fs_sb_info *, struct page *, @@ -1248,7 +1326,7 @@ void write_data_summaries(struct f2fs_sb_info *, block_t); void write_node_summaries(struct f2fs_sb_info *, block_t); int lookup_journal_in_cursum(struct f2fs_summary_block *, int, unsigned int, int); -void flush_sit_entries(struct f2fs_sb_info *); +void flush_sit_entries(struct f2fs_sb_info *, struct cp_control *); int build_segment_manager(struct f2fs_sb_info *); void destroy_segment_manager(struct f2fs_sb_info *); int __init create_segment_manager_caches(void); @@ -1259,7 +1337,8 @@ void destroy_segment_manager_caches(void); */ struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t); struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t); -int ra_meta_pages(struct f2fs_sb_info *, int, int, int); +struct page *get_meta_page_ra(struct f2fs_sb_info *, pgoff_t); +int ra_meta_pages(struct f2fs_sb_info *, block_t, int, int); long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long); void add_dirty_inode(struct f2fs_sb_info *, nid_t, int type); void remove_dirty_inode(struct f2fs_sb_info *, nid_t, int type); @@ -1271,11 +1350,11 @@ void add_orphan_inode(struct f2fs_sb_info *, nid_t); void remove_orphan_inode(struct f2fs_sb_info *, nid_t); void recover_orphan_inodes(struct f2fs_sb_info *); int get_valid_checkpoint(struct f2fs_sb_info *); -void set_dirty_dir_page(struct inode *, struct page *); +void update_dirty_page(struct inode *, struct page *); void add_dirty_dir_inode(struct inode *); void remove_dirty_dir_inode(struct inode *); void sync_dirty_dir_inodes(struct f2fs_sb_info *); -void write_checkpoint(struct f2fs_sb_info *, bool); +void write_checkpoint(struct f2fs_sb_info *, struct cp_control *); void init_ino_entry_info(struct f2fs_sb_info *); int __init create_checkpoint_caches(void); void destroy_checkpoint_caches(void); @@ -1359,12 +1438,12 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) #define stat_inc_inline_inode(inode) \ do { \ if (f2fs_has_inline_data(inode)) \ - ((F2FS_SB(inode->i_sb))->inline_inode++); \ + ((F2FS_I_SB(inode))->inline_inode++); \ } while (0) #define stat_dec_inline_inode(inode) \ do { \ if (f2fs_has_inline_data(inode)) \ - ((F2FS_SB(inode->i_sb))->inline_inode--); \ + ((F2FS_I_SB(inode))->inline_inode--); \ } while (0) #define stat_inc_seg_type(sbi, curseg) \ diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 060aee65aee8..8e68bb64f835 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -33,7 +33,7 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, { struct page *page = vmf->page; struct inode *inode = file_inode(vma->vm_file); - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct dnode_of_data dn; int err; @@ -117,7 +117,7 @@ static int get_parent_ino(struct inode *inode, nid_t *pino) static inline bool need_do_checkpoint(struct inode *inode) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); bool need_cp = false; if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1) @@ -138,7 +138,8 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) { struct inode *inode = file->f_mapping->host; struct f2fs_inode_info *fi = F2FS_I(inode); - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + nid_t ino = inode->i_ino; int ret = 0; bool need_cp = false; struct writeback_control wbc = { @@ -153,12 +154,11 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) trace_f2fs_sync_file_enter(inode); /* if fdatasync is triggered, let's do in-place-update */ - if (datasync) + if (get_dirty_pages(inode) <= SM_I(sbi)->min_fsync_blocks) set_inode_flag(fi, FI_NEED_IPU); - ret = filemap_write_and_wait_range(inode->i_mapping, start, end); - if (datasync) - clear_inode_flag(fi, FI_NEED_IPU); + clear_inode_flag(fi, FI_NEED_IPU); + if (ret) { trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret); return ret; @@ -168,13 +168,22 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * if there is no written data, don't waste time to write recovery info. */ if (!is_inode_flag_set(fi, FI_APPEND_WRITE) && - !exist_written_data(sbi, inode->i_ino, APPEND_INO)) { + !exist_written_data(sbi, ino, APPEND_INO)) { + struct page *i = find_get_page(NODE_MAPPING(sbi), ino); + + /* But we need to avoid that there are some inode updates */ + if ((i && PageDirty(i)) || need_inode_block_update(sbi, ino)) { + f2fs_put_page(i, 0); + goto go_write; + } + f2fs_put_page(i, 0); + if (is_inode_flag_set(fi, FI_UPDATE_WRITE) || - exist_written_data(sbi, inode->i_ino, UPDATE_INO)) + exist_written_data(sbi, ino, UPDATE_INO)) goto flush_out; goto out; } - +go_write: /* guarantee free sections for fsync */ f2fs_balance_fs(sbi); @@ -207,26 +216,28 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) up_write(&fi->i_sem); } } else { - /* if there is no written node page, write its inode page */ - while (!sync_node_pages(sbi, inode->i_ino, &wbc)) { - if (fsync_mark_done(sbi, inode->i_ino)) - goto out; +sync_nodes: + sync_node_pages(sbi, ino, &wbc); + + if (need_inode_block_update(sbi, ino)) { mark_inode_dirty_sync(inode); ret = f2fs_write_inode(inode, NULL); if (ret) goto out; + goto sync_nodes; } - ret = wait_on_node_pages_writeback(sbi, inode->i_ino); + + ret = wait_on_node_pages_writeback(sbi, ino); if (ret) goto out; /* once recovery info is written, don't need to tack this */ - remove_dirty_inode(sbi, inode->i_ino, APPEND_INO); + remove_dirty_inode(sbi, ino, APPEND_INO); clear_inode_flag(fi, FI_APPEND_WRITE); flush_out: - remove_dirty_inode(sbi, inode->i_ino, UPDATE_INO); + remove_dirty_inode(sbi, ino, UPDATE_INO); clear_inode_flag(fi, FI_UPDATE_WRITE); - ret = f2fs_issue_flush(F2FS_SB(inode->i_sb)); + ret = f2fs_issue_flush(F2FS_I_SB(inode)); } out: trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret); @@ -353,6 +364,8 @@ static loff_t f2fs_llseek(struct file *file, loff_t offset, int whence) maxbytes, i_size_read(inode)); case SEEK_DATA: case SEEK_HOLE: + if (offset < 0) + return -ENXIO; return f2fs_seek_block(file, offset, whence); } @@ -369,7 +382,7 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma) int truncate_data_blocks_range(struct dnode_of_data *dn, int count) { int nr_free = 0, ofs = dn->ofs_in_node; - struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); struct f2fs_node *raw_node; __le32 *addr; @@ -432,7 +445,7 @@ out: int truncate_blocks(struct inode *inode, u64 from, bool lock) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); unsigned int blocksize = inode->i_sb->s_blocksize; struct dnode_of_data dn; pgoff_t free_from; @@ -463,7 +476,7 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock) count = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); count -= dn.ofs_in_node; - f2fs_bug_on(count < 0); + f2fs_bug_on(sbi, count < 0); if (dn.ofs_in_node || IS_INODE(dn.node_page)) { truncate_data_blocks_range(&dn, count); @@ -547,15 +560,22 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) if (err) return err; - if ((attr->ia_valid & ATTR_SIZE) && - attr->ia_size != i_size_read(inode)) { + if (attr->ia_valid & ATTR_SIZE) { err = f2fs_convert_inline_data(inode, attr->ia_size, NULL); if (err) return err; - truncate_setsize(inode, attr->ia_size); - f2fs_truncate(inode); - f2fs_balance_fs(F2FS_SB(inode->i_sb)); + if (attr->ia_size != i_size_read(inode)) { + truncate_setsize(inode, attr->ia_size); + f2fs_truncate(inode); + f2fs_balance_fs(F2FS_I_SB(inode)); + } else { + /* + * giving a chance to truncate blocks past EOF which + * are fallocated with FALLOC_FL_KEEP_SIZE. + */ + f2fs_truncate(inode); + } } __setattr_copy(inode, attr); @@ -589,7 +609,7 @@ const struct inode_operations f2fs_file_inode_operations = { static void fill_zero(struct inode *inode, pgoff_t index, loff_t start, loff_t len) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct page *page; if (!len) @@ -638,6 +658,13 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) loff_t off_start, off_end; int ret = 0; + if (!S_ISREG(inode->i_mode)) + return -EOPNOTSUPP; + + /* skip punching hole beyond i_size */ + if (offset >= inode->i_size) + return ret; + ret = f2fs_convert_inline_data(inode, MAX_INLINE_DATA + 1, NULL); if (ret) return ret; @@ -661,7 +688,7 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) if (pg_start < pg_end) { struct address_space *mapping = inode->i_mapping; loff_t blk_start, blk_end; - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); f2fs_balance_fs(sbi); @@ -682,7 +709,7 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) static int expand_inode_data(struct inode *inode, loff_t offset, loff_t len, int mode) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); pgoff_t index, pg_start, pg_end; loff_t new_size = i_size_read(inode); loff_t off_start, off_end; @@ -778,61 +805,157 @@ static inline __u32 f2fs_mask_flags(umode_t mode, __u32 flags) return flags & F2FS_OTHER_FLMASK; } -long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +static int f2fs_ioc_getflags(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_inode_info *fi = F2FS_I(inode); + unsigned int flags = fi->i_flags & FS_FL_USER_VISIBLE; + return put_user(flags, (int __user *)arg); +} + +static int f2fs_ioc_setflags(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); struct f2fs_inode_info *fi = F2FS_I(inode); - unsigned int flags; + unsigned int flags = fi->i_flags & FS_FL_USER_VISIBLE; + unsigned int oldflags; int ret; - switch (cmd) { - case F2FS_IOC_GETFLAGS: - flags = fi->i_flags & FS_FL_USER_VISIBLE; - return put_user(flags, (int __user *) arg); - case F2FS_IOC_SETFLAGS: - { - unsigned int oldflags; + ret = mnt_want_write_file(filp); + if (ret) + return ret; - ret = mnt_want_write_file(filp); - if (ret) - return ret; + if (!inode_owner_or_capable(inode)) { + ret = -EACCES; + goto out; + } - if (!inode_owner_or_capable(inode)) { - ret = -EACCES; - goto out; - } + if (get_user(flags, (int __user *)arg)) { + ret = -EFAULT; + goto out; + } + + flags = f2fs_mask_flags(inode->i_mode, flags); + + mutex_lock(&inode->i_mutex); - if (get_user(flags, (int __user *) arg)) { - ret = -EFAULT; + oldflags = fi->i_flags; + + if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { + if (!capable(CAP_LINUX_IMMUTABLE)) { + mutex_unlock(&inode->i_mutex); + ret = -EPERM; goto out; } + } - flags = f2fs_mask_flags(inode->i_mode, flags); + flags = flags & FS_FL_USER_MODIFIABLE; + flags |= oldflags & ~FS_FL_USER_MODIFIABLE; + fi->i_flags = flags; + mutex_unlock(&inode->i_mutex); - mutex_lock(&inode->i_mutex); + f2fs_set_inode_flags(inode); + inode->i_ctime = CURRENT_TIME; + mark_inode_dirty(inode); +out: + mnt_drop_write_file(filp); + return ret; +} - oldflags = fi->i_flags; +static int f2fs_ioc_start_atomic_write(struct file *filp) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { - if (!capable(CAP_LINUX_IMMUTABLE)) { - mutex_unlock(&inode->i_mutex); - ret = -EPERM; - goto out; - } - } + if (!inode_owner_or_capable(inode)) + return -EACCES; - flags = flags & FS_FL_USER_MODIFIABLE; - flags |= oldflags & ~FS_FL_USER_MODIFIABLE; - fi->i_flags = flags; - mutex_unlock(&inode->i_mutex); + f2fs_balance_fs(sbi); - f2fs_set_inode_flags(inode); - inode->i_ctime = CURRENT_TIME; - mark_inode_dirty(inode); -out: - mnt_drop_write_file(filp); + set_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); + + return f2fs_convert_inline_data(inode, MAX_INLINE_DATA + 1, NULL); +} + +static int f2fs_ioc_commit_atomic_write(struct file *filp) +{ + struct inode *inode = file_inode(filp); + int ret; + + if (!inode_owner_or_capable(inode)) + return -EACCES; + + if (f2fs_is_volatile_file(inode)) + return 0; + + ret = mnt_want_write_file(filp); + if (ret) return ret; - } + + if (f2fs_is_atomic_file(inode)) + commit_inmem_pages(inode, false); + + ret = f2fs_sync_file(filp, 0, LONG_MAX, 0); + mnt_drop_write_file(filp); + return ret; +} + +static int f2fs_ioc_start_volatile_write(struct file *filp) +{ + struct inode *inode = file_inode(filp); + + if (!inode_owner_or_capable(inode)) + return -EACCES; + + set_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE); + return 0; +} + +static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct super_block *sb = inode->i_sb; + struct request_queue *q = bdev_get_queue(sb->s_bdev); + struct fstrim_range range; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!blk_queue_discard(q)) + return -EOPNOTSUPP; + + if (copy_from_user(&range, (struct fstrim_range __user *)arg, + sizeof(range))) + return -EFAULT; + + range.minlen = max((unsigned int)range.minlen, + q->limits.discard_granularity); + ret = f2fs_trim_fs(F2FS_SB(sb), &range); + if (ret < 0) + return ret; + + if (copy_to_user((struct fstrim_range __user *)arg, &range, + sizeof(range))) + return -EFAULT; + return 0; +} + +long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +{ + switch (cmd) { + case F2FS_IOC_GETFLAGS: + return f2fs_ioc_getflags(filp, arg); + case F2FS_IOC_SETFLAGS: + return f2fs_ioc_setflags(filp, arg); + case F2FS_IOC_START_ATOMIC_WRITE: + return f2fs_ioc_start_atomic_write(filp); + case F2FS_IOC_COMMIT_ATOMIC_WRITE: + return f2fs_ioc_commit_atomic_write(filp); + case F2FS_IOC_START_VOLATILE_WRITE: + return f2fs_ioc_start_volatile_write(filp); + case FITRIM: + return f2fs_ioc_fitrim(filp, arg); default: return -ENOTTY; } diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 943a31db7cc3..2a8f4acdb86b 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -193,7 +193,7 @@ static unsigned int check_bg_victims(struct f2fs_sb_info *sbi) * selected by background GC before. * Those segments guarantee they have small valid blocks. */ - for_each_set_bit(secno, dirty_i->victim_secmap, TOTAL_SECS(sbi)) { + for_each_set_bit(secno, dirty_i->victim_secmap, MAIN_SECS(sbi)) { if (sec_usage_check(sbi, secno)) continue; clear_bit(secno, dirty_i->victim_secmap); @@ -263,14 +263,14 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, unsigned int secno, max_cost; int nsearched = 0; + mutex_lock(&dirty_i->seglist_lock); + p.alloc_mode = alloc_mode; select_policy(sbi, gc_type, type, &p); p.min_segno = NULL_SEGNO; p.min_cost = max_cost = get_max_cost(sbi, &p); - mutex_lock(&dirty_i->seglist_lock); - if (p.alloc_mode == LFS && gc_type == FG_GC) { p.min_segno = check_bg_victims(sbi); if (p.min_segno != NULL_SEGNO) @@ -281,9 +281,8 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, unsigned long cost; unsigned int segno; - segno = find_next_bit(p.dirty_segmap, - TOTAL_SEGS(sbi), p.offset); - if (segno >= TOTAL_SEGS(sbi)) { + segno = find_next_bit(p.dirty_segmap, MAIN_SEGS(sbi), p.offset); + if (segno >= MAIN_SEGS(sbi)) { if (sbi->last_victim[p.gc_mode]) { sbi->last_victim[p.gc_mode] = 0; p.offset = 0; @@ -423,6 +422,12 @@ next_step: if (IS_ERR(node_page)) continue; + /* block may become invalid during get_node_page */ + if (check_valid_map(sbi, segno, off) == 0) { + f2fs_put_page(node_page, 1); + continue; + } + /* set page dirty and write it */ if (gc_type == FG_GC) { f2fs_wait_on_page_writeback(node_page, NODE); @@ -531,7 +536,7 @@ static void move_data_page(struct inode *inode, struct page *page, int gc_type) f2fs_wait_on_page_writeback(page, DATA); if (clear_page_dirty_for_io(page)) - inode_dec_dirty_dents(inode); + inode_dec_dirty_pages(inode); set_cold_data(page); do_write_data_page(page, &fio); clear_cold_data(page); @@ -688,6 +693,9 @@ int f2fs_gc(struct f2fs_sb_info *sbi) int gc_type = BG_GC; int nfree = 0; int ret = -1; + struct cp_control cpc = { + .reason = CP_SYNC, + }; INIT_LIST_HEAD(&ilist); gc_more: @@ -698,7 +706,7 @@ gc_more: if (gc_type == BG_GC && has_not_enough_free_secs(sbi, nfree)) { gc_type = FG_GC; - write_checkpoint(sbi, false); + write_checkpoint(sbi, &cpc); } if (!__get_victim(sbi, &segno, gc_type, NO_CHECK_TYPE)) @@ -723,7 +731,7 @@ gc_more: goto gc_more; if (gc_type == FG_GC) - write_checkpoint(sbi, false); + write_checkpoint(sbi, &cpc); stop: mutex_unlock(&sbi->gc_mutex); diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 3e8ecdf3742b..88036fd75797 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -15,11 +15,13 @@ bool f2fs_may_inline(struct inode *inode) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); block_t nr_blocks; loff_t i_size; - if (!test_opt(sbi, INLINE_DATA)) + if (!test_opt(F2FS_I_SB(inode), INLINE_DATA)) + return false; + + if (f2fs_is_atomic_file(inode)) return false; nr_blocks = F2FS_I(inode)->i_xattr_nid ? 3 : 2; @@ -35,7 +37,6 @@ bool f2fs_may_inline(struct inode *inode) int f2fs_read_inline_data(struct inode *inode, struct page *page) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct page *ipage; void *src_addr, *dst_addr; @@ -44,7 +45,7 @@ int f2fs_read_inline_data(struct inode *inode, struct page *page) goto out; } - ipage = get_node_page(sbi, inode->i_ino); + ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino); if (IS_ERR(ipage)) { unlock_page(page); return PTR_ERR(ipage); @@ -73,7 +74,7 @@ static int __f2fs_convert_inline_data(struct inode *inode, struct page *page) struct dnode_of_data dn; void *src_addr, *dst_addr; block_t new_blk_addr; - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_io_info fio = { .type = DATA, .rw = WRITE_SYNC | REQ_PRIO, @@ -189,13 +190,12 @@ int f2fs_write_inline_data(struct inode *inode, void truncate_inline_data(struct inode *inode, u64 from) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); struct page *ipage; if (from >= MAX_INLINE_DATA) return; - ipage = get_node_page(sbi, inode->i_ino); + ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino); if (IS_ERR(ipage)) return; @@ -209,7 +209,7 @@ void truncate_inline_data(struct inode *inode, u64 from) bool recover_inline_data(struct inode *inode, struct page *npage) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode *ri = NULL; void *src_addr, *dst_addr; struct page *ipage; @@ -229,7 +229,7 @@ bool recover_inline_data(struct inode *inode, struct page *npage) ri && (ri->i_inline & F2FS_INLINE_DATA)) { process_inline: ipage = get_node_page(sbi, inode->i_ino); - f2fs_bug_on(IS_ERR(ipage)); + f2fs_bug_on(sbi, IS_ERR(ipage)); f2fs_wait_on_page_writeback(ipage, NODE); @@ -243,7 +243,7 @@ process_inline: if (f2fs_has_inline_data(inode)) { ipage = get_node_page(sbi, inode->i_ino); - f2fs_bug_on(IS_ERR(ipage)); + f2fs_bug_on(sbi, IS_ERR(ipage)); f2fs_wait_on_page_writeback(ipage, NODE); zero_user_segment(ipage, INLINE_DATA_OFFSET, INLINE_DATA_OFFSET + MAX_INLINE_DATA); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 2c39999f3868..0deead4505e7 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -69,7 +69,7 @@ static void __set_inode_rdev(struct inode *inode, struct f2fs_inode *ri) static int do_read_inode(struct inode *inode) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); struct page *node_page; struct f2fs_inode *ri; @@ -218,7 +218,7 @@ void update_inode(struct inode *inode, struct page *node_page) void update_inode_page(struct inode *inode) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct page *node_page; retry: node_page = get_node_page(sbi, inode->i_ino); @@ -238,7 +238,7 @@ retry: int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); if (inode->i_ino == F2FS_NODE_INO(sbi) || inode->i_ino == F2FS_META_INO(sbi)) @@ -266,9 +266,13 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) */ void f2fs_evict_inode(struct inode *inode) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); nid_t xnid = F2FS_I(inode)->i_xattr_nid; + /* some remained atomic pages should discarded */ + if (f2fs_is_atomic_file(inode) || f2fs_is_volatile_file(inode)) + commit_inmem_pages(inode, true); + trace_f2fs_evict_inode(inode); truncate_inode_pages_final(&inode->i_data); @@ -276,7 +280,7 @@ void f2fs_evict_inode(struct inode *inode) inode->i_ino == F2FS_META_INO(sbi)) goto out_clear; - f2fs_bug_on(get_dirty_dents(inode)); + f2fs_bug_on(sbi, get_dirty_pages(inode)); remove_dirty_dir_inode(inode); if (inode->i_nlink || is_bad_inode(inode)) @@ -306,3 +310,26 @@ no_delete: out_clear: clear_inode(inode); } + +/* caller should call f2fs_lock_op() */ +void handle_failed_inode(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + clear_nlink(inode); + make_bad_inode(inode); + unlock_new_inode(inode); + + i_size_write(inode, 0); + if (F2FS_HAS_BLOCKS(inode)) + f2fs_truncate(inode); + + remove_inode_page(inode); + stat_dec_inline_inode(inode); + + alloc_nid_failed(sbi, inode->i_ino); + f2fs_unlock_op(sbi); + + /* iput will drop the inode object */ + iput(inode); +} diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index ee103fd7283c..0d2526e5aa11 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -23,7 +23,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) { - struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); nid_t ino; struct inode *inode; bool nid_free = false; @@ -102,7 +102,7 @@ static inline void set_cold_files(struct f2fs_sb_info *sbi, struct inode *inode, static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, bool excl) { - struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); struct inode *inode; nid_t ino = 0; int err; @@ -123,9 +123,9 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); - f2fs_unlock_op(sbi); if (err) goto out; + f2fs_unlock_op(sbi); alloc_nid_done(sbi, ino); @@ -133,9 +133,7 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, unlock_new_inode(inode); return 0; out: - clear_nlink(inode); - iget_failed(inode); - alloc_nid_failed(sbi, ino); + handle_failed_inode(inode); return err; } @@ -143,7 +141,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *dentry) { struct inode *inode = old_dentry->d_inode; - struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); int err; f2fs_balance_fs(sbi); @@ -154,15 +152,16 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, set_inode_flag(F2FS_I(inode), FI_INC_LINK); f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); - f2fs_unlock_op(sbi); if (err) goto out; + f2fs_unlock_op(sbi); d_instantiate(dentry, inode); return 0; out: clear_inode_flag(F2FS_I(inode), FI_INC_LINK); iput(inode); + f2fs_unlock_op(sbi); return err; } @@ -203,7 +202,7 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, static int f2fs_unlink(struct inode *dir, struct dentry *dentry) { - struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); struct inode *inode = dentry->d_inode; struct f2fs_dir_entry *de; struct page *page; @@ -237,7 +236,7 @@ fail: static int f2fs_symlink(struct inode *dir, struct dentry *dentry, const char *symname) { - struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); struct inode *inode; size_t symlen = strlen(symname) + 1; int err; @@ -253,9 +252,9 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); - f2fs_unlock_op(sbi); if (err) goto out; + f2fs_unlock_op(sbi); err = page_symlink(inode, symname, symlen); alloc_nid_done(sbi, inode->i_ino); @@ -264,15 +263,13 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, unlock_new_inode(inode); return err; out: - clear_nlink(inode); - iget_failed(inode); - alloc_nid_failed(sbi, inode->i_ino); + handle_failed_inode(inode); return err; } static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { - struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); struct inode *inode; int err; @@ -290,9 +287,9 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) set_inode_flag(F2FS_I(inode), FI_INC_LINK); f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); - f2fs_unlock_op(sbi); if (err) goto out_fail; + f2fs_unlock_op(sbi); alloc_nid_done(sbi, inode->i_ino); @@ -303,9 +300,7 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) out_fail: clear_inode_flag(F2FS_I(inode), FI_INC_LINK); - clear_nlink(inode); - iget_failed(inode); - alloc_nid_failed(sbi, inode->i_ino); + handle_failed_inode(inode); return err; } @@ -320,7 +315,7 @@ static int f2fs_rmdir(struct inode *dir, struct dentry *dentry) static int f2fs_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, dev_t rdev) { - struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); struct inode *inode; int err = 0; @@ -338,25 +333,23 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); - f2fs_unlock_op(sbi); if (err) goto out; + f2fs_unlock_op(sbi); alloc_nid_done(sbi, inode->i_ino); d_instantiate(dentry, inode); unlock_new_inode(inode); return 0; out: - clear_nlink(inode); - iget_failed(inode); - alloc_nid_failed(sbi, inode->i_ino); + handle_failed_inode(inode); return err; } static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { - struct f2fs_sb_info *sbi = F2FS_SB(old_dir->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(old_dir); struct inode *old_inode = old_dentry->d_inode; struct inode *new_inode = new_dentry->d_inode; struct page *old_dir_page; @@ -480,8 +473,7 @@ out: static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *new_dir, struct dentry *new_dentry) { - struct super_block *sb = old_dir->i_sb; - struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(old_dir); struct inode *old_inode = old_dentry->d_inode; struct inode *new_inode = new_dentry->d_inode; struct page *old_dir_page, *new_dir_page; @@ -642,7 +634,7 @@ static int f2fs_rename2(struct inode *old_dir, struct dentry *old_dentry, static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) { - struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(dir); struct inode *inode; int err; @@ -678,10 +670,7 @@ static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) release_out: release_orphan_inode(sbi); out: - f2fs_unlock_op(sbi); - clear_nlink(inode); - iget_failed(inode); - alloc_nid_failed(sbi, inode->i_ino); + handle_failed_inode(inode); return err; } diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 45378196e19a..44b8afef43d9 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -54,7 +54,6 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type) static void clear_node_page_dirty(struct page *page) { struct address_space *mapping = page->mapping; - struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb); unsigned int long flags; if (PageDirty(page)) { @@ -65,7 +64,7 @@ static void clear_node_page_dirty(struct page *page) spin_unlock_irqrestore(&mapping->tree_lock, flags); clear_page_dirty_for_io(page); - dec_page_count(sbi, F2FS_DIRTY_NODES); + dec_page_count(F2FS_M_SB(mapping), F2FS_DIRTY_NODES); } ClearPageUptodate(page); } @@ -92,7 +91,7 @@ static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid) /* get current nat block page with lock */ src_page = get_meta_page(sbi, src_off); dst_page = grab_meta_page(sbi, dst_off); - f2fs_bug_on(PageDirty(src_page)); + f2fs_bug_on(sbi, PageDirty(src_page)); src_addr = page_address(src_page); dst_addr = page_address(dst_page); @@ -124,44 +123,99 @@ static void __del_from_nat_cache(struct f2fs_nm_info *nm_i, struct nat_entry *e) kmem_cache_free(nat_entry_slab, e); } -int is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid) +static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i, + struct nat_entry *ne) +{ + nid_t set = NAT_BLOCK_OFFSET(ne->ni.nid); + struct nat_entry_set *head; + + if (get_nat_flag(ne, IS_DIRTY)) + return; +retry: + head = radix_tree_lookup(&nm_i->nat_set_root, set); + if (!head) { + head = f2fs_kmem_cache_alloc(nat_entry_set_slab, GFP_ATOMIC); + + INIT_LIST_HEAD(&head->entry_list); + INIT_LIST_HEAD(&head->set_list); + head->set = set; + head->entry_cnt = 0; + + if (radix_tree_insert(&nm_i->nat_set_root, set, head)) { + cond_resched(); + goto retry; + } + } + list_move_tail(&ne->list, &head->entry_list); + nm_i->dirty_nat_cnt++; + head->entry_cnt++; + set_nat_flag(ne, IS_DIRTY, true); +} + +static void __clear_nat_cache_dirty(struct f2fs_nm_info *nm_i, + struct nat_entry *ne) +{ + nid_t set = ne->ni.nid / NAT_ENTRY_PER_BLOCK; + struct nat_entry_set *head; + + head = radix_tree_lookup(&nm_i->nat_set_root, set); + if (head) { + list_move_tail(&ne->list, &nm_i->nat_entries); + set_nat_flag(ne, IS_DIRTY, false); + head->entry_cnt--; + nm_i->dirty_nat_cnt--; + } +} + +static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i, + nid_t start, unsigned int nr, struct nat_entry_set **ep) +{ + return radix_tree_gang_lookup(&nm_i->nat_set_root, (void **)ep, + start, nr); +} + +bool is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct nat_entry *e; - int is_cp = 1; + bool is_cp = true; read_lock(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, nid); - if (e && !e->checkpointed) - is_cp = 0; + if (e && !get_nat_flag(e, IS_CHECKPOINTED)) + is_cp = false; read_unlock(&nm_i->nat_tree_lock); return is_cp; } -bool fsync_mark_done(struct f2fs_sb_info *sbi, nid_t nid) +bool has_fsynced_inode(struct f2fs_sb_info *sbi, nid_t ino) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct nat_entry *e; - bool fsync_done = false; + bool fsynced = false; read_lock(&nm_i->nat_tree_lock); - e = __lookup_nat_cache(nm_i, nid); - if (e) - fsync_done = e->fsync_done; + e = __lookup_nat_cache(nm_i, ino); + if (e && get_nat_flag(e, HAS_FSYNCED_INODE)) + fsynced = true; read_unlock(&nm_i->nat_tree_lock); - return fsync_done; + return fsynced; } -void fsync_mark_clear(struct f2fs_sb_info *sbi, nid_t nid) +bool need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct nat_entry *e; + bool need_update = true; - write_lock(&nm_i->nat_tree_lock); - e = __lookup_nat_cache(nm_i, nid); - if (e) - e->fsync_done = false; - write_unlock(&nm_i->nat_tree_lock); + read_lock(&nm_i->nat_tree_lock); + e = __lookup_nat_cache(nm_i, ino); + if (e && get_nat_flag(e, HAS_LAST_FSYNC) && + (get_nat_flag(e, IS_CHECKPOINTED) || + get_nat_flag(e, HAS_FSYNCED_INODE))) + need_update = false; + read_unlock(&nm_i->nat_tree_lock); + return need_update; } static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid) @@ -177,7 +231,7 @@ static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid) } memset(new, 0, sizeof(struct nat_entry)); nat_set_nid(new, nid); - new->checkpointed = true; + nat_reset_flag(new); list_add_tail(&new->list, &nm_i->nat_entries); nm_i->nat_cnt++; return new; @@ -216,7 +270,7 @@ retry: goto retry; } e->ni = *ni; - f2fs_bug_on(ni->blk_addr == NEW_ADDR); + f2fs_bug_on(sbi, ni->blk_addr == NEW_ADDR); } else if (new_blkaddr == NEW_ADDR) { /* * when nid is reallocated, @@ -224,16 +278,16 @@ retry: * So, reinitialize it with new information. */ e->ni = *ni; - f2fs_bug_on(ni->blk_addr != NULL_ADDR); + f2fs_bug_on(sbi, ni->blk_addr != NULL_ADDR); } /* sanity check */ - f2fs_bug_on(nat_get_blkaddr(e) != ni->blk_addr); - f2fs_bug_on(nat_get_blkaddr(e) == NULL_ADDR && + f2fs_bug_on(sbi, nat_get_blkaddr(e) != ni->blk_addr); + f2fs_bug_on(sbi, nat_get_blkaddr(e) == NULL_ADDR && new_blkaddr == NULL_ADDR); - f2fs_bug_on(nat_get_blkaddr(e) == NEW_ADDR && + f2fs_bug_on(sbi, nat_get_blkaddr(e) == NEW_ADDR && new_blkaddr == NEW_ADDR); - f2fs_bug_on(nat_get_blkaddr(e) != NEW_ADDR && + f2fs_bug_on(sbi, nat_get_blkaddr(e) != NEW_ADDR && nat_get_blkaddr(e) != NULL_ADDR && new_blkaddr == NEW_ADDR); @@ -245,12 +299,17 @@ retry: /* change address */ nat_set_blkaddr(e, new_blkaddr); + if (new_blkaddr == NEW_ADDR || new_blkaddr == NULL_ADDR) + set_nat_flag(e, IS_CHECKPOINTED, false); __set_nat_cache_dirty(nm_i, e); /* update fsync_mark if its inode nat entry is still alive */ e = __lookup_nat_cache(nm_i, ni->ino); - if (e) - e->fsync_done = fsync_done; + if (e) { + if (fsync_done && ni->nid == ni->ino) + set_nat_flag(e, HAS_FSYNCED_INODE, true); + set_nat_flag(e, HAS_LAST_FSYNC, fsync_done); + } write_unlock(&nm_i->nat_tree_lock); } @@ -411,7 +470,7 @@ got: */ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) { - struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); struct page *npage[4]; struct page *parent; int offset[4]; @@ -504,15 +563,15 @@ release_out: static void truncate_node(struct dnode_of_data *dn) { - struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); struct node_info ni; get_node_info(sbi, dn->nid, &ni); if (dn->inode->i_blocks == 0) { - f2fs_bug_on(ni.blk_addr != NULL_ADDR); + f2fs_bug_on(sbi, ni.blk_addr != NULL_ADDR); goto invalidate; } - f2fs_bug_on(ni.blk_addr == NULL_ADDR); + f2fs_bug_on(sbi, ni.blk_addr == NULL_ADDR); /* Deallocate node address */ invalidate_blocks(sbi, ni.blk_addr); @@ -540,14 +599,13 @@ invalidate: static int truncate_dnode(struct dnode_of_data *dn) { - struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); struct page *page; if (dn->nid == 0) return 1; /* get direct node */ - page = get_node_page(sbi, dn->nid); + page = get_node_page(F2FS_I_SB(dn->inode), dn->nid); if (IS_ERR(page) && PTR_ERR(page) == -ENOENT) return 1; else if (IS_ERR(page)) @@ -564,7 +622,6 @@ static int truncate_dnode(struct dnode_of_data *dn) static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs, int ofs, int depth) { - struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); struct dnode_of_data rdn = *dn; struct page *page; struct f2fs_node *rn; @@ -578,7 +635,7 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs, trace_f2fs_truncate_nodes_enter(dn->inode, dn->nid, dn->data_blkaddr); - page = get_node_page(sbi, dn->nid); + page = get_node_page(F2FS_I_SB(dn->inode), dn->nid); if (IS_ERR(page)) { trace_f2fs_truncate_nodes_exit(dn->inode, PTR_ERR(page)); return PTR_ERR(page); @@ -636,7 +693,6 @@ out_err: static int truncate_partial_nodes(struct dnode_of_data *dn, struct f2fs_inode *ri, int *offset, int depth) { - struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); struct page *pages[2]; nid_t nid[3]; nid_t child_nid; @@ -651,7 +707,7 @@ static int truncate_partial_nodes(struct dnode_of_data *dn, /* get indirect nodes in the path */ for (i = 0; i < idx + 1; i++) { /* reference count'll be increased */ - pages[i] = get_node_page(sbi, nid[i]); + pages[i] = get_node_page(F2FS_I_SB(dn->inode), nid[i]); if (IS_ERR(pages[i])) { err = PTR_ERR(pages[i]); idx = i - 1; @@ -696,7 +752,7 @@ fail: */ int truncate_inode_blocks(struct inode *inode, pgoff_t from) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); int err = 0, cont = 1; int level, offset[4], noffset[4]; unsigned int nofs = 0; @@ -792,7 +848,7 @@ fail: int truncate_xattr_node(struct inode *inode, struct page *page) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); nid_t nid = F2FS_I(inode)->i_xattr_nid; struct dnode_of_data dn; struct page *npage; @@ -840,7 +896,8 @@ void remove_inode_page(struct inode *inode) truncate_data_blocks_range(&dn, 1); /* 0 is possible, after f2fs_new_inode() has failed */ - f2fs_bug_on(inode->i_blocks != 0 && inode->i_blocks != 1); + f2fs_bug_on(F2FS_I_SB(inode), + inode->i_blocks != 0 && inode->i_blocks != 1); /* will put inode & node pages */ truncate_node(&dn); @@ -860,7 +917,7 @@ struct page *new_inode_page(struct inode *inode) struct page *new_node_page(struct dnode_of_data *dn, unsigned int ofs, struct page *ipage) { - struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); struct node_info old_ni, new_ni; struct page *page; int err; @@ -880,7 +937,7 @@ struct page *new_node_page(struct dnode_of_data *dn, get_node_info(sbi, dn->nid, &old_ni); /* Reinitialize old_ni with new node page */ - f2fs_bug_on(old_ni.blk_addr != NULL_ADDR); + f2fs_bug_on(sbi, old_ni.blk_addr != NULL_ADDR); new_ni = old_ni; new_ni.ino = dn->inode->i_ino; set_node_addr(sbi, &new_ni, NEW_ADDR, false); @@ -918,7 +975,7 @@ fail: */ static int read_node_page(struct page *page, int rw) { - struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb); + struct f2fs_sb_info *sbi = F2FS_P_SB(page); struct node_info ni; get_node_info(sbi, page->index, &ni); @@ -994,7 +1051,7 @@ got_it: */ struct page *get_node_page_ra(struct page *parent, int start) { - struct f2fs_sb_info *sbi = F2FS_SB(parent->mapping->host->i_sb); + struct f2fs_sb_info *sbi = F2FS_P_SB(parent); struct blk_plug plug; struct page *page; int err, i, end; @@ -1124,10 +1181,14 @@ continue_unlock: /* called by fsync() */ if (ino && IS_DNODE(page)) { - int mark = !is_checkpointed_node(sbi, ino); set_fsync_mark(page, 1); - if (IS_INODE(page)) - set_dentry_mark(page, mark); + if (IS_INODE(page)) { + if (!is_checkpointed_node(sbi, ino) && + !has_fsynced_inode(sbi, ino)) + set_dentry_mark(page, 1); + else + set_dentry_mark(page, 0); + } nwritten++; } else { set_fsync_mark(page, 0); @@ -1206,7 +1267,7 @@ int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino) static int f2fs_write_node_page(struct page *page, struct writeback_control *wbc) { - struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb); + struct f2fs_sb_info *sbi = F2FS_P_SB(page); nid_t nid; block_t new_addr; struct node_info ni; @@ -1226,7 +1287,7 @@ static int f2fs_write_node_page(struct page *page, /* get old block addr of this node page */ nid = nid_of_node(page); - f2fs_bug_on(page->index != nid); + f2fs_bug_on(sbi, page->index != nid); get_node_info(sbi, nid, &ni); @@ -1257,7 +1318,7 @@ redirty_out: static int f2fs_write_node_pages(struct address_space *mapping, struct writeback_control *wbc) { - struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb); + struct f2fs_sb_info *sbi = F2FS_M_SB(mapping); long diff; trace_f2fs_writepages(mapping->host, wbc, NODE); @@ -1282,15 +1343,12 @@ skip_write: static int f2fs_set_node_page_dirty(struct page *page) { - struct address_space *mapping = page->mapping; - struct f2fs_sb_info *sbi = F2FS_SB(mapping->host->i_sb); - trace_f2fs_set_page_dirty(page, NODE); SetPageUptodate(page); if (!PageDirty(page)) { __set_page_dirty_nobuffers(page); - inc_page_count(sbi, F2FS_DIRTY_NODES); + inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES); SetPagePrivate(page); return 1; } @@ -1301,9 +1359,8 @@ static void f2fs_invalidate_node_page(struct page *page, unsigned int offset, unsigned int length) { struct inode *inode = page->mapping->host; - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); if (PageDirty(page)) - dec_page_count(sbi, F2FS_DIRTY_NODES); + dec_page_count(F2FS_I_SB(inode), F2FS_DIRTY_NODES); ClearPagePrivate(page); } @@ -1356,7 +1413,8 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) read_lock(&nm_i->nat_tree_lock); ne = __lookup_nat_cache(nm_i, nid); if (ne && - (!ne->checkpointed || nat_get_blkaddr(ne) != NULL_ADDR)) + (!get_nat_flag(ne, IS_CHECKPOINTED) || + nat_get_blkaddr(ne) != NULL_ADDR)) allocated = true; read_unlock(&nm_i->nat_tree_lock); if (allocated) @@ -1413,7 +1471,7 @@ static void scan_nat_page(struct f2fs_sb_info *sbi, break; blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr); - f2fs_bug_on(blk_addr == NEW_ADDR); + f2fs_bug_on(sbi, blk_addr == NEW_ADDR); if (blk_addr == NULL_ADDR) { if (add_free_nid(sbi, start_nid, true) < 0) break; @@ -1483,12 +1541,12 @@ retry: /* We should not use stale free nids created by build_free_nids */ if (nm_i->fcnt && !on_build_free_nids(nm_i)) { - f2fs_bug_on(list_empty(&nm_i->free_nid_list)); + f2fs_bug_on(sbi, list_empty(&nm_i->free_nid_list)); list_for_each_entry(i, &nm_i->free_nid_list, list) if (i->state == NID_NEW) break; - f2fs_bug_on(i->state != NID_NEW); + f2fs_bug_on(sbi, i->state != NID_NEW); *nid = i->nid; i->state = NID_ALLOC; nm_i->fcnt--; @@ -1514,7 +1572,7 @@ void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid) spin_lock(&nm_i->free_nid_list_lock); i = __lookup_free_nid_list(nm_i, nid); - f2fs_bug_on(!i || i->state != NID_ALLOC); + f2fs_bug_on(sbi, !i || i->state != NID_ALLOC); __del_from_free_nid_list(nm_i, i); spin_unlock(&nm_i->free_nid_list_lock); @@ -1535,7 +1593,7 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) spin_lock(&nm_i->free_nid_list_lock); i = __lookup_free_nid_list(nm_i, nid); - f2fs_bug_on(!i || i->state != NID_ALLOC); + f2fs_bug_on(sbi, !i || i->state != NID_ALLOC); if (!available_free_memory(sbi, FREE_NIDS)) { __del_from_free_nid_list(nm_i, i); need_free = true; @@ -1551,14 +1609,13 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) void recover_inline_xattr(struct inode *inode, struct page *page) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); void *src_addr, *dst_addr; size_t inline_size; struct page *ipage; struct f2fs_inode *ri; - ipage = get_node_page(sbi, inode->i_ino); - f2fs_bug_on(IS_ERR(ipage)); + ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino); + f2fs_bug_on(F2FS_I_SB(inode), IS_ERR(ipage)); ri = F2FS_INODE(page); if (!(ri->i_inline & F2FS_INLINE_XATTR)) { @@ -1579,7 +1636,7 @@ update_inode: void recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); nid_t prev_xnid = F2FS_I(inode)->i_xattr_nid; nid_t new_xnid = nid_of_node(page); struct node_info ni; @@ -1590,7 +1647,7 @@ void recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr) /* Deallocate node address */ get_node_info(sbi, prev_xnid, &ni); - f2fs_bug_on(ni.blk_addr == NULL_ADDR); + f2fs_bug_on(sbi, ni.blk_addr == NULL_ADDR); invalidate_blocks(sbi, ni.blk_addr); dec_valid_node_count(sbi, inode); set_node_addr(sbi, &ni, NULL_ADDR, false); @@ -1598,7 +1655,7 @@ void recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr) recover_xnid: /* 2: allocate new xattr nid */ if (unlikely(!inc_valid_node_count(sbi, inode))) - f2fs_bug_on(1); + f2fs_bug_on(sbi, 1); remove_free_nid(NM_I(sbi), new_xnid); get_node_info(sbi, new_xnid, &ni); @@ -1691,7 +1748,7 @@ int restore_node_summary(struct f2fs_sb_info *sbi, struct f2fs_summary *sum_entry; struct inode *inode = sbi->sb->s_bdev->bd_inode; block_t addr; - int bio_blocks = MAX_BIO_BLOCKS(max_hw_blocks(sbi)); + int bio_blocks = MAX_BIO_BLOCKS(sbi); struct page *pages[bio_blocks]; int i, idx, last_offset, nrpages, err = 0; @@ -1733,89 +1790,6 @@ skip: return err; } -static struct nat_entry_set *grab_nat_entry_set(void) -{ - struct nat_entry_set *nes = - f2fs_kmem_cache_alloc(nat_entry_set_slab, GFP_ATOMIC); - - nes->entry_cnt = 0; - INIT_LIST_HEAD(&nes->set_list); - INIT_LIST_HEAD(&nes->entry_list); - return nes; -} - -static void release_nat_entry_set(struct nat_entry_set *nes, - struct f2fs_nm_info *nm_i) -{ - f2fs_bug_on(!list_empty(&nes->entry_list)); - - nm_i->dirty_nat_cnt -= nes->entry_cnt; - list_del(&nes->set_list); - kmem_cache_free(nat_entry_set_slab, nes); -} - -static void adjust_nat_entry_set(struct nat_entry_set *nes, - struct list_head *head) -{ - struct nat_entry_set *next = nes; - - if (list_is_last(&nes->set_list, head)) - return; - - list_for_each_entry_continue(next, head, set_list) - if (nes->entry_cnt <= next->entry_cnt) - break; - - list_move_tail(&nes->set_list, &next->set_list); -} - -static void add_nat_entry(struct nat_entry *ne, struct list_head *head) -{ - struct nat_entry_set *nes; - nid_t start_nid = START_NID(ne->ni.nid); - - list_for_each_entry(nes, head, set_list) { - if (nes->start_nid == start_nid) { - list_move_tail(&ne->list, &nes->entry_list); - nes->entry_cnt++; - adjust_nat_entry_set(nes, head); - return; - } - } - - nes = grab_nat_entry_set(); - - nes->start_nid = start_nid; - list_move_tail(&ne->list, &nes->entry_list); - nes->entry_cnt++; - list_add(&nes->set_list, head); -} - -static void merge_nats_in_set(struct f2fs_sb_info *sbi) -{ - struct f2fs_nm_info *nm_i = NM_I(sbi); - struct list_head *dirty_list = &nm_i->dirty_nat_entries; - struct list_head *set_list = &nm_i->nat_entry_set; - struct nat_entry *ne, *tmp; - - write_lock(&nm_i->nat_tree_lock); - list_for_each_entry_safe(ne, tmp, dirty_list, list) { - if (nat_get_blkaddr(ne) == NEW_ADDR) - continue; - add_nat_entry(ne, set_list); - nm_i->dirty_nat_cnt++; - } - write_unlock(&nm_i->nat_tree_lock); -} - -static bool __has_cursum_space(struct f2fs_summary_block *sum, int size) -{ - if (nats_in_cursum(sum) + size <= NAT_JOURNAL_ENTRIES) - return true; - else - return false; -} - static void remove_nats_in_journal(struct f2fs_sb_info *sbi) { struct f2fs_nm_info *nm_i = NM_I(sbi); @@ -1850,99 +1824,130 @@ found: mutex_unlock(&curseg->curseg_mutex); } -/* - * This function is called during the checkpointing process. - */ -void flush_nat_entries(struct f2fs_sb_info *sbi) +static void __adjust_nat_entry_set(struct nat_entry_set *nes, + struct list_head *head, int max) { - struct f2fs_nm_info *nm_i = NM_I(sbi); - struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); - struct f2fs_summary_block *sum = curseg->sum_blk; - struct nat_entry_set *nes, *tmp; - struct list_head *head = &nm_i->nat_entry_set; - bool to_journal = true; + struct nat_entry_set *cur; - /* merge nat entries of dirty list to nat entry set temporarily */ - merge_nats_in_set(sbi); + if (nes->entry_cnt >= max) + goto add_out; - /* - * if there are no enough space in journal to store dirty nat - * entries, remove all entries from journal and merge them - * into nat entry set. - */ - if (!__has_cursum_space(sum, nm_i->dirty_nat_cnt)) { - remove_nats_in_journal(sbi); - - /* - * merge nat entries of dirty list to nat entry set temporarily - */ - merge_nats_in_set(sbi); + list_for_each_entry(cur, head, set_list) { + if (cur->entry_cnt >= nes->entry_cnt) { + list_add(&nes->set_list, cur->set_list.prev); + return; + } } +add_out: + list_add_tail(&nes->set_list, head); +} - if (!nm_i->dirty_nat_cnt) - return; +static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, + struct nat_entry_set *set) +{ + struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); + struct f2fs_summary_block *sum = curseg->sum_blk; + nid_t start_nid = set->set * NAT_ENTRY_PER_BLOCK; + bool to_journal = true; + struct f2fs_nat_block *nat_blk; + struct nat_entry *ne, *cur; + struct page *page = NULL; /* * there are two steps to flush nat entries: * #1, flush nat entries to journal in current hot data summary block. * #2, flush nat entries to nat page. */ - list_for_each_entry_safe(nes, tmp, head, set_list) { - struct f2fs_nat_block *nat_blk; - struct nat_entry *ne, *cur; - struct page *page; - nid_t start_nid = nes->start_nid; + if (!__has_cursum_space(sum, set->entry_cnt, NAT_JOURNAL)) + to_journal = false; - if (to_journal && !__has_cursum_space(sum, nes->entry_cnt)) - to_journal = false; + if (to_journal) { + mutex_lock(&curseg->curseg_mutex); + } else { + page = get_next_nat_page(sbi, start_nid); + nat_blk = page_address(page); + f2fs_bug_on(sbi, !nat_blk); + } + + /* flush dirty nats in nat entry set */ + list_for_each_entry_safe(ne, cur, &set->entry_list, list) { + struct f2fs_nat_entry *raw_ne; + nid_t nid = nat_get_nid(ne); + int offset; + + if (nat_get_blkaddr(ne) == NEW_ADDR) + continue; if (to_journal) { - mutex_lock(&curseg->curseg_mutex); + offset = lookup_journal_in_cursum(sum, + NAT_JOURNAL, nid, 1); + f2fs_bug_on(sbi, offset < 0); + raw_ne = &nat_in_journal(sum, offset); + nid_in_journal(sum, offset) = cpu_to_le32(nid); } else { - page = get_next_nat_page(sbi, start_nid); - nat_blk = page_address(page); - f2fs_bug_on(!nat_blk); + raw_ne = &nat_blk->entries[nid - start_nid]; } + raw_nat_from_node_info(raw_ne, &ne->ni); - /* flush dirty nats in nat entry set */ - list_for_each_entry_safe(ne, cur, &nes->entry_list, list) { - struct f2fs_nat_entry *raw_ne; - nid_t nid = nat_get_nid(ne); - int offset; + write_lock(&NM_I(sbi)->nat_tree_lock); + nat_reset_flag(ne); + __clear_nat_cache_dirty(NM_I(sbi), ne); + write_unlock(&NM_I(sbi)->nat_tree_lock); - if (to_journal) { - offset = lookup_journal_in_cursum(sum, - NAT_JOURNAL, nid, 1); - f2fs_bug_on(offset < 0); - raw_ne = &nat_in_journal(sum, offset); - nid_in_journal(sum, offset) = cpu_to_le32(nid); - } else { - raw_ne = &nat_blk->entries[nid - start_nid]; - } - raw_nat_from_node_info(raw_ne, &ne->ni); + if (nat_get_blkaddr(ne) == NULL_ADDR) + add_free_nid(sbi, nid, false); + } - if (nat_get_blkaddr(ne) == NULL_ADDR && - add_free_nid(sbi, nid, false) <= 0) { - write_lock(&nm_i->nat_tree_lock); - __del_from_nat_cache(nm_i, ne); - write_unlock(&nm_i->nat_tree_lock); - } else { - write_lock(&nm_i->nat_tree_lock); - __clear_nat_cache_dirty(nm_i, ne); - write_unlock(&nm_i->nat_tree_lock); - } - } + if (to_journal) + mutex_unlock(&curseg->curseg_mutex); + else + f2fs_put_page(page, 1); - if (to_journal) - mutex_unlock(&curseg->curseg_mutex); - else - f2fs_put_page(page, 1); + if (!set->entry_cnt) { + radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set); + kmem_cache_free(nat_entry_set_slab, set); + } +} + +/* + * This function is called during the checkpointing process. + */ +void flush_nat_entries(struct f2fs_sb_info *sbi) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); + struct f2fs_summary_block *sum = curseg->sum_blk; + struct nat_entry_set *setvec[NATVEC_SIZE]; + struct nat_entry_set *set, *tmp; + unsigned int found; + nid_t set_idx = 0; + LIST_HEAD(sets); + + /* + * if there are no enough space in journal to store dirty nat + * entries, remove all entries from journal and merge them + * into nat entry set. + */ + if (!__has_cursum_space(sum, nm_i->dirty_nat_cnt, NAT_JOURNAL)) + remove_nats_in_journal(sbi); - release_nat_entry_set(nes, nm_i); + if (!nm_i->dirty_nat_cnt) + return; + + while ((found = __gang_lookup_nat_set(nm_i, + set_idx, NATVEC_SIZE, setvec))) { + unsigned idx; + set_idx = setvec[found - 1]->set + 1; + for (idx = 0; idx < found; idx++) + __adjust_nat_entry_set(setvec[idx], &sets, + MAX_NAT_JENTRIES(sum)); } - f2fs_bug_on(!list_empty(head)); - f2fs_bug_on(nm_i->dirty_nat_cnt); + /* flush dirty nats in nat entry set */ + list_for_each_entry_safe(set, tmp, &sets, set_list) + __flush_nat_entry_set(sbi, set); + + f2fs_bug_on(sbi, nm_i->dirty_nat_cnt); } static int init_node_manager(struct f2fs_sb_info *sbi) @@ -1969,9 +1974,8 @@ static int init_node_manager(struct f2fs_sb_info *sbi) INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC); INIT_LIST_HEAD(&nm_i->free_nid_list); INIT_RADIX_TREE(&nm_i->nat_root, GFP_ATOMIC); + INIT_RADIX_TREE(&nm_i->nat_set_root, GFP_ATOMIC); INIT_LIST_HEAD(&nm_i->nat_entries); - INIT_LIST_HEAD(&nm_i->dirty_nat_entries); - INIT_LIST_HEAD(&nm_i->nat_entry_set); mutex_init(&nm_i->build_lock); spin_lock_init(&nm_i->free_nid_list_lock); @@ -2020,14 +2024,14 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) /* destroy free nid list */ spin_lock(&nm_i->free_nid_list_lock); list_for_each_entry_safe(i, next_i, &nm_i->free_nid_list, list) { - f2fs_bug_on(i->state == NID_ALLOC); + f2fs_bug_on(sbi, i->state == NID_ALLOC); __del_from_free_nid_list(nm_i, i); nm_i->fcnt--; spin_unlock(&nm_i->free_nid_list_lock); kmem_cache_free(free_nid_slab, i); spin_lock(&nm_i->free_nid_list_lock); } - f2fs_bug_on(nm_i->fcnt); + f2fs_bug_on(sbi, nm_i->fcnt); spin_unlock(&nm_i->free_nid_list_lock); /* destroy nat cache */ @@ -2039,7 +2043,7 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) for (idx = 0; idx < found; idx++) __del_from_nat_cache(nm_i, natvec[idx]); } - f2fs_bug_on(nm_i->nat_cnt); + f2fs_bug_on(sbi, nm_i->nat_cnt); write_unlock(&nm_i->nat_tree_lock); kfree(nm_i->nat_bitmap); diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 8a116a407599..8d5e6e0dd840 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -39,10 +39,16 @@ struct node_info { unsigned char version; /* version of the node */ }; +enum { + IS_CHECKPOINTED, /* is it checkpointed before? */ + HAS_FSYNCED_INODE, /* is the inode fsynced before? */ + HAS_LAST_FSYNC, /* has the latest node fsync mark? */ + IS_DIRTY, /* this nat entry is dirty? */ +}; + struct nat_entry { struct list_head list; /* for clean or dirty nat list */ - bool checkpointed; /* whether it is checkpointed or not */ - bool fsync_done; /* whether the latest node has fsync mark */ + unsigned char flag; /* for node information bits */ struct node_info ni; /* in-memory node information */ }; @@ -55,18 +61,32 @@ struct nat_entry { #define nat_get_version(nat) (nat->ni.version) #define nat_set_version(nat, v) (nat->ni.version = v) -#define __set_nat_cache_dirty(nm_i, ne) \ - do { \ - ne->checkpointed = false; \ - list_move_tail(&ne->list, &nm_i->dirty_nat_entries); \ - } while (0) -#define __clear_nat_cache_dirty(nm_i, ne) \ - do { \ - ne->checkpointed = true; \ - list_move_tail(&ne->list, &nm_i->nat_entries); \ - } while (0) #define inc_node_version(version) (++version) +static inline void set_nat_flag(struct nat_entry *ne, + unsigned int type, bool set) +{ + unsigned char mask = 0x01 << type; + if (set) + ne->flag |= mask; + else + ne->flag &= ~mask; +} + +static inline bool get_nat_flag(struct nat_entry *ne, unsigned int type) +{ + unsigned char mask = 0x01 << type; + return ne->flag & mask; +} + +static inline void nat_reset_flag(struct nat_entry *ne) +{ + /* these states can be set only after checkpoint was done */ + set_nat_flag(ne, IS_CHECKPOINTED, true); + set_nat_flag(ne, HAS_FSYNCED_INODE, false); + set_nat_flag(ne, HAS_LAST_FSYNC, true); +} + static inline void node_info_from_raw_nat(struct node_info *ni, struct f2fs_nat_entry *raw_ne) { @@ -90,9 +110,9 @@ enum mem_type { }; struct nat_entry_set { - struct list_head set_list; /* link with all nat sets */ + struct list_head set_list; /* link with other nat sets */ struct list_head entry_list; /* link with dirty nat entries */ - nid_t start_nid; /* start nid of nats in set */ + nid_t set; /* set number*/ unsigned int entry_cnt; /* the # of nat entries in set */ }; @@ -110,18 +130,19 @@ struct free_nid { int state; /* in use or not: NID_NEW or NID_ALLOC */ }; -static inline int next_free_nid(struct f2fs_sb_info *sbi, nid_t *nid) +static inline void next_free_nid(struct f2fs_sb_info *sbi, nid_t *nid) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *fnid; - if (nm_i->fcnt <= 0) - return -1; spin_lock(&nm_i->free_nid_list_lock); + if (nm_i->fcnt <= 0) { + spin_unlock(&nm_i->free_nid_list_lock); + return; + } fnid = list_entry(nm_i->free_nid_list.next, struct free_nid, list); *nid = fnid->nid; spin_unlock(&nm_i->free_nid_list_lock); - return 0; } /* @@ -197,8 +218,7 @@ static inline void copy_node_footer(struct page *dst, struct page *src) static inline void fill_node_footer_blkaddr(struct page *page, block_t blkaddr) { - struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb); - struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + struct f2fs_checkpoint *ckpt = F2FS_CKPT(F2FS_P_SB(page)); struct f2fs_node *rn = F2FS_NODE(page); rn->footer.cp_ver = ckpt->checkpoint_ver; diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 756c41cd2582..ebd013225788 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -14,6 +14,37 @@ #include "node.h" #include "segment.h" +/* + * Roll forward recovery scenarios. + * + * [Term] F: fsync_mark, D: dentry_mark + * + * 1. inode(x) | CP | inode(x) | dnode(F) + * -> Update the latest inode(x). + * + * 2. inode(x) | CP | inode(F) | dnode(F) + * -> No problem. + * + * 3. inode(x) | CP | dnode(F) | inode(x) + * -> Recover to the latest dnode(F), and drop the last inode(x) + * + * 4. inode(x) | CP | dnode(F) | inode(F) + * -> No problem. + * + * 5. CP | inode(x) | dnode(F) + * -> The inode(DF) was missing. Should drop this dnode(F). + * + * 6. CP | inode(DF) | dnode(F) + * -> No problem. + * + * 7. CP | dnode(F) | inode(DF) + * -> If f2fs_iget fails, then goto next to find inode(DF). + * + * 8. CP | dnode(F) | inode(x) + * -> If f2fs_iget fails, then goto next to find inode(DF). + * But it will fail due to no inode(DF). + */ + static struct kmem_cache *fsync_entry_slab; bool space_for_roll_forward(struct f2fs_sb_info *sbi) @@ -36,7 +67,7 @@ static struct fsync_inode_entry *get_fsync_inode(struct list_head *head, return NULL; } -static int recover_dentry(struct page *ipage, struct inode *inode) +static int recover_dentry(struct inode *inode, struct page *ipage) { struct f2fs_inode *raw_inode = F2FS_INODE(ipage); nid_t pino = le32_to_cpu(raw_inode->i_pino); @@ -75,7 +106,7 @@ retry: err = -EEXIST; goto out_unmap_put; } - err = acquire_orphan_inode(F2FS_SB(inode->i_sb)); + err = acquire_orphan_inode(F2FS_I_SB(inode)); if (err) { iput(einode); goto out_unmap_put; @@ -110,35 +141,28 @@ out: return err; } -static int recover_inode(struct inode *inode, struct page *node_page) +static void recover_inode(struct inode *inode, struct page *page) { - struct f2fs_inode *raw_inode = F2FS_INODE(node_page); - - if (!IS_INODE(node_page)) - return 0; + struct f2fs_inode *raw = F2FS_INODE(page); - inode->i_mode = le16_to_cpu(raw_inode->i_mode); - i_size_write(inode, le64_to_cpu(raw_inode->i_size)); - inode->i_atime.tv_sec = le64_to_cpu(raw_inode->i_mtime); - inode->i_ctime.tv_sec = le64_to_cpu(raw_inode->i_ctime); - inode->i_mtime.tv_sec = le64_to_cpu(raw_inode->i_mtime); - inode->i_atime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec); - inode->i_ctime.tv_nsec = le32_to_cpu(raw_inode->i_ctime_nsec); - inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec); - - if (is_dent_dnode(node_page)) - return recover_dentry(node_page, inode); + inode->i_mode = le16_to_cpu(raw->i_mode); + i_size_write(inode, le64_to_cpu(raw->i_size)); + inode->i_atime.tv_sec = le64_to_cpu(raw->i_mtime); + inode->i_ctime.tv_sec = le64_to_cpu(raw->i_ctime); + inode->i_mtime.tv_sec = le64_to_cpu(raw->i_mtime); + inode->i_atime.tv_nsec = le32_to_cpu(raw->i_mtime_nsec); + inode->i_ctime.tv_nsec = le32_to_cpu(raw->i_ctime_nsec); + inode->i_mtime.tv_nsec = le32_to_cpu(raw->i_mtime_nsec); f2fs_msg(inode->i_sb, KERN_NOTICE, "recover_inode: ino = %x, name = %s", - ino_of_node(node_page), raw_inode->i_name); - return 0; + ino_of_node(page), F2FS_INODE(page)->i_name); } static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) { unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi)); struct curseg_info *curseg; - struct page *page; + struct page *page = NULL; block_t blkaddr; int err = 0; @@ -146,20 +170,13 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); - /* read node page */ - page = alloc_page(GFP_F2FS_ZERO); - if (!page) - return -ENOMEM; - lock_page(page); - while (1) { struct fsync_inode_entry *entry; - err = f2fs_submit_page_bio(sbi, page, blkaddr, READ_SYNC); - if (err) - return err; + if (blkaddr < MAIN_BLKADDR(sbi) || blkaddr >= MAX_BLKADDR(sbi)) + return 0; - lock_page(page); + page = get_meta_page_ra(sbi, blkaddr); if (cp_ver != cpver_of_node(page)) break; @@ -180,33 +197,38 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) } /* add this fsync inode to the list */ - entry = kmem_cache_alloc(fsync_entry_slab, GFP_NOFS); + entry = kmem_cache_alloc(fsync_entry_slab, GFP_F2FS_ZERO); if (!entry) { err = -ENOMEM; break; } - + /* + * CP | dnode(F) | inode(DF) + * For this case, we should not give up now. + */ entry->inode = f2fs_iget(sbi->sb, ino_of_node(page)); if (IS_ERR(entry->inode)) { err = PTR_ERR(entry->inode); kmem_cache_free(fsync_entry_slab, entry); + if (err == -ENOENT) + goto next; break; } list_add_tail(&entry->list, head); } entry->blkaddr = blkaddr; - err = recover_inode(entry->inode, page); - if (err && err != -ENOENT) - break; + if (IS_INODE(page)) { + entry->last_inode = blkaddr; + if (is_dent_dnode(page)) + entry->last_dentry = blkaddr; + } next: /* check next segment */ blkaddr = next_blkaddr_of_node(page); + f2fs_put_page(page, 1); } - - unlock_page(page); - __free_pages(page, 0); - + f2fs_put_page(page, 1); return err; } @@ -279,16 +301,30 @@ got_it: ino = ino_of_node(node_page); f2fs_put_page(node_page, 1); - /* Deallocate previous index in the node page */ - inode = f2fs_iget(sbi->sb, ino); - if (IS_ERR(inode)) - return PTR_ERR(inode); + if (ino != dn->inode->i_ino) { + /* Deallocate previous index in the node page */ + inode = f2fs_iget(sbi->sb, ino); + if (IS_ERR(inode)) + return PTR_ERR(inode); + } else { + inode = dn->inode; + } bidx = start_bidx_of_node(offset, F2FS_I(inode)) + - le16_to_cpu(sum.ofs_in_node); + le16_to_cpu(sum.ofs_in_node); - truncate_hole(inode, bidx, bidx + 1); - iput(inode); + if (ino != dn->inode->i_ino) { + truncate_hole(inode, bidx, bidx + 1); + iput(inode); + } else { + struct dnode_of_data tdn; + set_new_dnode(&tdn, inode, dn->inode_page, NULL, 0); + if (get_dnode_of_data(&tdn, bidx, LOOKUP_NODE)) + return 0; + if (tdn.data_blkaddr != NULL_ADDR) + truncate_data_blocks_range(&tdn, 1); + f2fs_put_page(tdn.node_page, 1); + } return 0; } @@ -331,8 +367,8 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, f2fs_wait_on_page_writeback(dn.node_page, NODE); get_node_info(sbi, dn.nid, &ni); - f2fs_bug_on(ni.ino != ino_of_node(page)); - f2fs_bug_on(ofs_of_node(dn.node_page) != ofs_of_node(page)); + f2fs_bug_on(sbi, ni.ino != ino_of_node(page)); + f2fs_bug_on(sbi, ofs_of_node(dn.node_page) != ofs_of_node(page)); for (; start < end; start++) { block_t src, dest; @@ -344,7 +380,7 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, if (src == NULL_ADDR) { err = reserve_new_block(&dn); /* We should not get -ENOSPC */ - f2fs_bug_on(err); + f2fs_bug_on(sbi, err); } /* Check the previous node page having this index */ @@ -386,7 +422,7 @@ static int recover_data(struct f2fs_sb_info *sbi, { unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi)); struct curseg_info *curseg; - struct page *page; + struct page *page = NULL; int err = 0; block_t blkaddr; @@ -394,32 +430,41 @@ static int recover_data(struct f2fs_sb_info *sbi, curseg = CURSEG_I(sbi, type); blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); - /* read node page */ - page = alloc_page(GFP_F2FS_ZERO); - if (!page) - return -ENOMEM; - - lock_page(page); - while (1) { struct fsync_inode_entry *entry; - err = f2fs_submit_page_bio(sbi, page, blkaddr, READ_SYNC); - if (err) - return err; + if (blkaddr < MAIN_BLKADDR(sbi) || blkaddr >= MAX_BLKADDR(sbi)) + break; - lock_page(page); + page = get_meta_page_ra(sbi, blkaddr); - if (cp_ver != cpver_of_node(page)) + if (cp_ver != cpver_of_node(page)) { + f2fs_put_page(page, 1); break; + } entry = get_fsync_inode(head, ino_of_node(page)); if (!entry) goto next; - + /* + * inode(x) | CP | inode(x) | dnode(F) + * In this case, we can lose the latest inode(x). + * So, call recover_inode for the inode update. + */ + if (entry->last_inode == blkaddr) + recover_inode(entry->inode, page); + if (entry->last_dentry == blkaddr) { + err = recover_dentry(entry->inode, page); + if (err) { + f2fs_put_page(page, 1); + break; + } + } err = do_recover_data(sbi, entry->inode, page, blkaddr); - if (err) + if (err) { + f2fs_put_page(page, 1); break; + } if (entry->blkaddr == blkaddr) { iput(entry->inode); @@ -429,11 +474,8 @@ static int recover_data(struct f2fs_sb_info *sbi, next: /* check next segment */ blkaddr = next_blkaddr_of_node(page); + f2fs_put_page(page, 1); } - - unlock_page(page); - __free_pages(page, 0); - if (!err) allocate_new_segments(sbi); return err; @@ -474,11 +516,15 @@ int recover_fsync_data(struct f2fs_sb_info *sbi) /* step #2: recover data */ err = recover_data(sbi, &inode_list, CURSEG_WARM_NODE); if (!err) - f2fs_bug_on(!list_empty(&inode_list)); + f2fs_bug_on(sbi, !list_empty(&inode_list)); out: destroy_fsync_dnodes(&inode_list); kmem_cache_destroy(fsync_entry_slab); + /* truncate meta pages to be used by the recovery */ + truncate_inode_pages_range(META_MAPPING(sbi), + MAIN_BLKADDR(sbi) << PAGE_CACHE_SHIFT, -1); + if (err) { truncate_inode_pages_final(NODE_MAPPING(sbi)); truncate_inode_pages_final(META_MAPPING(sbi)); @@ -494,8 +540,11 @@ out: set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG); mutex_unlock(&sbi->cp_mutex); } else if (need_writecp) { + struct cp_control cpc = { + .reason = CP_SYNC, + }; mutex_unlock(&sbi->cp_mutex); - write_checkpoint(sbi, false); + write_checkpoint(sbi, &cpc); } else { mutex_unlock(&sbi->cp_mutex); } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 0aa337cd5bba..923cb76fdc46 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -25,6 +25,8 @@ #define __reverse_ffz(x) __reverse_ffs(~(x)) static struct kmem_cache *discard_entry_slab; +static struct kmem_cache *sit_entry_set_slab; +static struct kmem_cache *inmem_entry_slab; /* * __reverse_ffs is copied from include/asm-generic/bitops/__ffs.h since @@ -172,6 +174,60 @@ found_middle: return result + __reverse_ffz(tmp); } +void register_inmem_page(struct inode *inode, struct page *page) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + struct inmem_pages *new; + + new = f2fs_kmem_cache_alloc(inmem_entry_slab, GFP_NOFS); + + /* add atomic page indices to the list */ + new->page = page; + INIT_LIST_HEAD(&new->list); + + /* increase reference count with clean state */ + mutex_lock(&fi->inmem_lock); + get_page(page); + list_add_tail(&new->list, &fi->inmem_pages); + mutex_unlock(&fi->inmem_lock); +} + +void commit_inmem_pages(struct inode *inode, bool abort) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_inode_info *fi = F2FS_I(inode); + struct inmem_pages *cur, *tmp; + bool submit_bio = false; + struct f2fs_io_info fio = { + .type = DATA, + .rw = WRITE_SYNC, + }; + + f2fs_balance_fs(sbi); + f2fs_lock_op(sbi); + + mutex_lock(&fi->inmem_lock); + list_for_each_entry_safe(cur, tmp, &fi->inmem_pages, list) { + lock_page(cur->page); + if (!abort && cur->page->mapping == inode->i_mapping) { + f2fs_wait_on_page_writeback(cur->page, DATA); + if (clear_page_dirty_for_io(cur->page)) + inode_dec_dirty_pages(inode); + do_write_data_page(cur->page, &fio); + submit_bio = true; + } + f2fs_put_page(cur->page, 1); + list_del(&cur->list); + kmem_cache_free(inmem_entry_slab, cur); + } + if (submit_bio) + f2fs_submit_merged_bio(sbi, DATA, WRITE); + mutex_unlock(&fi->inmem_lock); + + filemap_fdatawait_range(inode->i_mapping, 0, LLONG_MAX); + f2fs_unlock_op(sbi); +} + /* * This function balances dirty node and dentry pages. * In addition, it controls garbage collection. @@ -205,24 +261,20 @@ repeat: if (kthread_should_stop()) return 0; - spin_lock(&fcc->issue_lock); - if (fcc->issue_list) { - fcc->dispatch_list = fcc->issue_list; - fcc->issue_list = fcc->issue_tail = NULL; - } - spin_unlock(&fcc->issue_lock); - - if (fcc->dispatch_list) { + if (!llist_empty(&fcc->issue_list)) { struct bio *bio = bio_alloc(GFP_NOIO, 0); struct flush_cmd *cmd, *next; int ret; + fcc->dispatch_list = llist_del_all(&fcc->issue_list); + fcc->dispatch_list = llist_reverse_order(fcc->dispatch_list); + bio->bi_bdev = sbi->sb->s_bdev; ret = submit_bio_wait(WRITE_FLUSH, bio); - for (cmd = fcc->dispatch_list; cmd; cmd = next) { + llist_for_each_entry_safe(cmd, next, + fcc->dispatch_list, llnode) { cmd->ret = ret; - next = cmd->next; complete(&cmd->wait); } bio_put(bio); @@ -230,7 +282,7 @@ repeat: } wait_event_interruptible(*q, - kthread_should_stop() || fcc->issue_list); + kthread_should_stop() || !llist_empty(&fcc->issue_list)); goto repeat; } @@ -249,15 +301,8 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi) return blkdev_issue_flush(sbi->sb->s_bdev, GFP_KERNEL, NULL); init_completion(&cmd.wait); - cmd.next = NULL; - spin_lock(&fcc->issue_lock); - if (fcc->issue_list) - fcc->issue_tail->next = &cmd; - else - fcc->issue_list = &cmd; - fcc->issue_tail = &cmd; - spin_unlock(&fcc->issue_lock); + llist_add(&cmd.llnode, &fcc->issue_list); if (!fcc->dispatch_list) wake_up(&fcc->flush_wait_queue); @@ -276,8 +321,8 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi) fcc = kzalloc(sizeof(struct flush_cmd_control), GFP_KERNEL); if (!fcc) return -ENOMEM; - spin_lock_init(&fcc->issue_lock); init_waitqueue_head(&fcc->flush_wait_queue); + init_llist_head(&fcc->issue_list); SM_I(sbi)->cmd_control_info = fcc; fcc->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi, "f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev)); @@ -317,6 +362,10 @@ static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, struct seg_entry *sentry = get_seg_entry(sbi, segno); enum dirty_type t = sentry->type; + if (unlikely(t >= DIRTY)) { + f2fs_bug_on(sbi, 1); + return; + } if (!test_and_set_bit(segno, dirty_i->dirty_segmap[t])) dirty_i->nr_dirty[t]++; } @@ -376,8 +425,8 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) static int f2fs_issue_discard(struct f2fs_sb_info *sbi, block_t blkstart, block_t blklen) { - sector_t start = SECTOR_FROM_BLOCK(sbi, blkstart); - sector_t len = SECTOR_FROM_BLOCK(sbi, blklen); + sector_t start = SECTOR_FROM_BLOCK(blkstart); + sector_t len = SECTOR_FROM_BLOCK(blklen); trace_f2fs_issue_discard(sbi->sb, blkstart, blklen); return blkdev_issue_discard(sbi->sb->s_bdev, start, len, GFP_NOFS, 0); } @@ -392,22 +441,48 @@ void discard_next_dnode(struct f2fs_sb_info *sbi, block_t blkaddr) } } -static void add_discard_addrs(struct f2fs_sb_info *sbi, - unsigned int segno, struct seg_entry *se) +static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct list_head *head = &SM_I(sbi)->discard_list; struct discard_entry *new; int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long); int max_blocks = sbi->blocks_per_seg; + struct seg_entry *se = get_seg_entry(sbi, cpc->trim_start); unsigned long *cur_map = (unsigned long *)se->cur_valid_map; unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map; unsigned long dmap[entries]; unsigned int start = 0, end = -1; + bool force = (cpc->reason == CP_DISCARD); int i; - if (!test_opt(sbi, DISCARD)) + if (!force && !test_opt(sbi, DISCARD)) return; + if (force && !se->valid_blocks) { + struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + /* + * if this segment is registered in the prefree list, then + * we should skip adding a discard candidate, and let the + * checkpoint do that later. + */ + mutex_lock(&dirty_i->seglist_lock); + if (test_bit(cpc->trim_start, dirty_i->dirty_segmap[PRE])) { + mutex_unlock(&dirty_i->seglist_lock); + cpc->trimmed += sbi->blocks_per_seg; + return; + } + mutex_unlock(&dirty_i->seglist_lock); + + new = f2fs_kmem_cache_alloc(discard_entry_slab, GFP_NOFS); + INIT_LIST_HEAD(&new->list); + new->blkaddr = START_BLOCK(sbi, cpc->trim_start); + new->len = sbi->blocks_per_seg; + list_add_tail(&new->list, head); + SM_I(sbi)->nr_discards += sbi->blocks_per_seg; + cpc->trimmed += sbi->blocks_per_seg; + return; + } + /* zero block will be discarded through the prefree list */ if (!se->valid_blocks || se->valid_blocks == max_blocks) return; @@ -416,23 +491,39 @@ static void add_discard_addrs(struct f2fs_sb_info *sbi, for (i = 0; i < entries; i++) dmap[i] = (cur_map[i] ^ ckpt_map[i]) & ckpt_map[i]; - while (SM_I(sbi)->nr_discards <= SM_I(sbi)->max_discards) { + while (force || SM_I(sbi)->nr_discards <= SM_I(sbi)->max_discards) { start = __find_rev_next_bit(dmap, max_blocks, end + 1); if (start >= max_blocks) break; end = __find_rev_next_zero_bit(dmap, max_blocks, start + 1); + if (end - start < cpc->trim_minlen) + continue; + new = f2fs_kmem_cache_alloc(discard_entry_slab, GFP_NOFS); INIT_LIST_HEAD(&new->list); - new->blkaddr = START_BLOCK(sbi, segno) + start; + new->blkaddr = START_BLOCK(sbi, cpc->trim_start) + start; new->len = end - start; + cpc->trimmed += end - start; list_add_tail(&new->list, head); SM_I(sbi)->nr_discards += end - start; } } +void release_discard_addrs(struct f2fs_sb_info *sbi) +{ + struct list_head *head = &(SM_I(sbi)->discard_list); + struct discard_entry *entry, *this; + + /* drop caches */ + list_for_each_entry_safe(entry, this, head, list) { + list_del(&entry->list); + kmem_cache_free(discard_entry_slab, entry); + } +} + /* * Should call clear_prefree_segments after checkpoint is done. */ @@ -440,10 +531,9 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); unsigned int segno; - unsigned int total_segs = TOTAL_SEGS(sbi); mutex_lock(&dirty_i->seglist_lock); - for_each_set_bit(segno, dirty_i->dirty_segmap[PRE], total_segs) + for_each_set_bit(segno, dirty_i->dirty_segmap[PRE], MAIN_SEGS(sbi)) __set_test_and_free(sbi, segno); mutex_unlock(&dirty_i->seglist_lock); } @@ -454,17 +544,17 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi) struct discard_entry *entry, *this; struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); unsigned long *prefree_map = dirty_i->dirty_segmap[PRE]; - unsigned int total_segs = TOTAL_SEGS(sbi); unsigned int start = 0, end = -1; mutex_lock(&dirty_i->seglist_lock); while (1) { int i; - start = find_next_bit(prefree_map, total_segs, end + 1); - if (start >= total_segs) + start = find_next_bit(prefree_map, MAIN_SEGS(sbi), end + 1); + if (start >= MAIN_SEGS(sbi)) break; - end = find_next_zero_bit(prefree_map, total_segs, start + 1); + end = find_next_zero_bit(prefree_map, MAIN_SEGS(sbi), + start + 1); for (i = start; i < end; i++) clear_bit(i, prefree_map); @@ -488,11 +578,16 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi) } } -static void __mark_sit_entry_dirty(struct f2fs_sb_info *sbi, unsigned int segno) +static bool __mark_sit_entry_dirty(struct f2fs_sb_info *sbi, unsigned int segno) { struct sit_info *sit_i = SIT_I(sbi); - if (!__test_and_set_bit(segno, sit_i->dirty_sentries_bitmap)) + + if (!__test_and_set_bit(segno, sit_i->dirty_sentries_bitmap)) { sit_i->dirty_sentries++; + return false; + } + + return true; } static void __set_sit_entry_type(struct f2fs_sb_info *sbi, int type, @@ -516,7 +611,7 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) new_vblocks = se->valid_blocks + del; offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr); - f2fs_bug_on((new_vblocks >> (sizeof(unsigned short) << 3) || + f2fs_bug_on(sbi, (new_vblocks >> (sizeof(unsigned short) << 3) || (new_vblocks > sbi->blocks_per_seg))); se->valid_blocks = new_vblocks; @@ -526,10 +621,10 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) /* Update valid block bitmap */ if (del > 0) { if (f2fs_set_bit(offset, se->cur_valid_map)) - BUG(); + f2fs_bug_on(sbi, 1); } else { if (!f2fs_clear_bit(offset, se->cur_valid_map)) - BUG(); + f2fs_bug_on(sbi, 1); } if (!f2fs_test_bit(offset, se->ckpt_valid_map)) se->ckpt_valid_blocks += del; @@ -558,7 +653,7 @@ void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr) unsigned int segno = GET_SEGNO(sbi, addr); struct sit_info *sit_i = SIT_I(sbi); - f2fs_bug_on(addr == NULL_ADDR); + f2fs_bug_on(sbi, addr == NULL_ADDR); if (addr == NEW_ADDR) return; @@ -634,7 +729,7 @@ static int is_next_segment_free(struct f2fs_sb_info *sbi, int type) unsigned int segno = curseg->segno + 1; struct free_segmap_info *free_i = FREE_I(sbi); - if (segno < TOTAL_SEGS(sbi) && segno % sbi->segs_per_sec) + if (segno < MAIN_SEGS(sbi) && segno % sbi->segs_per_sec) return !test_bit(segno, free_i->free_segmap); return 0; } @@ -648,7 +743,7 @@ static void get_new_segment(struct f2fs_sb_info *sbi, { struct free_segmap_info *free_i = FREE_I(sbi); unsigned int segno, secno, zoneno; - unsigned int total_zones = TOTAL_SECS(sbi) / sbi->secs_per_zone; + unsigned int total_zones = MAIN_SECS(sbi) / sbi->secs_per_zone; unsigned int hint = *newseg / sbi->segs_per_sec; unsigned int old_zoneno = GET_ZONENO_FROM_SEGNO(sbi, *newseg); unsigned int left_start = hint; @@ -660,18 +755,18 @@ static void get_new_segment(struct f2fs_sb_info *sbi, if (!new_sec && ((*newseg + 1) % sbi->segs_per_sec)) { segno = find_next_zero_bit(free_i->free_segmap, - TOTAL_SEGS(sbi), *newseg + 1); + MAIN_SEGS(sbi), *newseg + 1); if (segno - *newseg < sbi->segs_per_sec - (*newseg % sbi->segs_per_sec)) goto got_it; } find_other_zone: - secno = find_next_zero_bit(free_i->free_secmap, TOTAL_SECS(sbi), hint); - if (secno >= TOTAL_SECS(sbi)) { + secno = find_next_zero_bit(free_i->free_secmap, MAIN_SECS(sbi), hint); + if (secno >= MAIN_SECS(sbi)) { if (dir == ALLOC_RIGHT) { secno = find_next_zero_bit(free_i->free_secmap, - TOTAL_SECS(sbi), 0); - f2fs_bug_on(secno >= TOTAL_SECS(sbi)); + MAIN_SECS(sbi), 0); + f2fs_bug_on(sbi, secno >= MAIN_SECS(sbi)); } else { go_left = 1; left_start = hint - 1; @@ -686,8 +781,8 @@ find_other_zone: continue; } left_start = find_next_zero_bit(free_i->free_secmap, - TOTAL_SECS(sbi), 0); - f2fs_bug_on(left_start >= TOTAL_SECS(sbi)); + MAIN_SECS(sbi), 0); + f2fs_bug_on(sbi, left_start >= MAIN_SECS(sbi)); break; } secno = left_start; @@ -726,7 +821,7 @@ skip_left: } got_it: /* set it as dirty segment in free segmap */ - f2fs_bug_on(test_bit(segno, free_i->free_segmap)); + f2fs_bug_on(sbi, test_bit(segno, free_i->free_segmap)); __set_inuse(sbi, segno); *newseg = segno; write_unlock(&free_i->segmap_lock); @@ -898,6 +993,37 @@ static const struct segment_allocation default_salloc_ops = { .allocate_segment = allocate_segment_by_default, }; +int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) +{ + __u64 start = range->start >> sbi->log_blocksize; + __u64 end = start + (range->len >> sbi->log_blocksize) - 1; + unsigned int start_segno, end_segno; + struct cp_control cpc; + + if (range->minlen > SEGMENT_SIZE(sbi) || start >= MAX_BLKADDR(sbi) || + range->len < sbi->blocksize) + return -EINVAL; + + if (end <= MAIN_BLKADDR(sbi)) + goto out; + + /* start/end segment number in main_area */ + start_segno = (start <= MAIN_BLKADDR(sbi)) ? 0 : GET_SEGNO(sbi, start); + end_segno = (end >= MAX_BLKADDR(sbi)) ? MAIN_SEGS(sbi) - 1 : + GET_SEGNO(sbi, end); + cpc.reason = CP_DISCARD; + cpc.trim_start = start_segno; + cpc.trim_end = end_segno; + cpc.trim_minlen = range->minlen >> sbi->log_blocksize; + cpc.trimmed = 0; + + /* do checkpoint to issue discard commands safely */ + write_checkpoint(sbi, &cpc); +out: + range->len = cpc.trimmed << sbi->log_blocksize; + return 0; +} + static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type) { struct curseg_info *curseg = CURSEG_I(sbi, type); @@ -953,15 +1079,15 @@ static int __get_segment_type_6(struct page *page, enum page_type p_type) static int __get_segment_type(struct page *page, enum page_type p_type) { - struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb); - switch (sbi->active_logs) { + switch (F2FS_P_SB(page)->active_logs) { case 2: return __get_segment_type_2(page, p_type); case 4: return __get_segment_type_4(page, p_type); } /* NR_CURSEG_TYPE(6) logs by default */ - f2fs_bug_on(sbi->active_logs != NR_CURSEG_TYPE); + f2fs_bug_on(F2FS_P_SB(page), + F2FS_P_SB(page)->active_logs != NR_CURSEG_TYPE); return __get_segment_type_6(page, p_type); } @@ -1041,11 +1167,11 @@ void write_node_page(struct f2fs_sb_info *sbi, struct page *page, void write_data_page(struct page *page, struct dnode_of_data *dn, block_t *new_blkaddr, struct f2fs_io_info *fio) { - struct f2fs_sb_info *sbi = F2FS_SB(dn->inode->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); struct f2fs_summary sum; struct node_info ni; - f2fs_bug_on(dn->data_blkaddr == NULL_ADDR); + f2fs_bug_on(sbi, dn->data_blkaddr == NULL_ADDR); get_node_info(sbi, dn->nid, &ni); set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); @@ -1055,9 +1181,7 @@ void write_data_page(struct page *page, struct dnode_of_data *dn, void rewrite_data_page(struct page *page, block_t old_blkaddr, struct f2fs_io_info *fio) { - struct inode *inode = page->mapping->host; - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); - f2fs_submit_page_mbio(sbi, page, old_blkaddr, fio); + f2fs_submit_page_mbio(F2FS_P_SB(page), page, old_blkaddr, fio); } void recover_data_page(struct f2fs_sb_info *sbi, @@ -1130,8 +1254,9 @@ out: void f2fs_wait_on_page_writeback(struct page *page, enum page_type type) { - struct f2fs_sb_info *sbi = F2FS_SB(page->mapping->host->i_sb); if (PageWriteback(page)) { + struct f2fs_sb_info *sbi = F2FS_P_SB(page); + if (is_merged_page(sbi, page, type)) f2fs_submit_merged_bio(sbi, type, WRITE); wait_on_page_writeback(page); @@ -1400,7 +1525,7 @@ static struct page *get_current_sit_page(struct f2fs_sb_info *sbi, unsigned int segno) { struct sit_info *sit_i = SIT_I(sbi); - unsigned int offset = SIT_BLOCK_OFFSET(sit_i, segno); + unsigned int offset = SIT_BLOCK_OFFSET(segno); block_t blk_addr = sit_i->sit_base_addr + offset; check_seg_range(sbi, segno); @@ -1426,7 +1551,7 @@ static struct page *get_next_sit_page(struct f2fs_sb_info *sbi, /* get current sit block page without lock */ src_page = get_meta_page(sbi, src_off); dst_page = grab_meta_page(sbi, dst_off); - f2fs_bug_on(PageDirty(src_page)); + f2fs_bug_on(sbi, PageDirty(src_page)); src_addr = page_address(src_page); dst_addr = page_address(dst_page); @@ -1440,101 +1565,192 @@ static struct page *get_next_sit_page(struct f2fs_sb_info *sbi, return dst_page; } -static bool flush_sits_in_journal(struct f2fs_sb_info *sbi) +static struct sit_entry_set *grab_sit_entry_set(void) +{ + struct sit_entry_set *ses = + f2fs_kmem_cache_alloc(sit_entry_set_slab, GFP_ATOMIC); + + ses->entry_cnt = 0; + INIT_LIST_HEAD(&ses->set_list); + return ses; +} + +static void release_sit_entry_set(struct sit_entry_set *ses) +{ + list_del(&ses->set_list); + kmem_cache_free(sit_entry_set_slab, ses); +} + +static void adjust_sit_entry_set(struct sit_entry_set *ses, + struct list_head *head) +{ + struct sit_entry_set *next = ses; + + if (list_is_last(&ses->set_list, head)) + return; + + list_for_each_entry_continue(next, head, set_list) + if (ses->entry_cnt <= next->entry_cnt) + break; + + list_move_tail(&ses->set_list, &next->set_list); +} + +static void add_sit_entry(unsigned int segno, struct list_head *head) +{ + struct sit_entry_set *ses; + unsigned int start_segno = START_SEGNO(segno); + + list_for_each_entry(ses, head, set_list) { + if (ses->start_segno == start_segno) { + ses->entry_cnt++; + adjust_sit_entry_set(ses, head); + return; + } + } + + ses = grab_sit_entry_set(); + + ses->start_segno = start_segno; + ses->entry_cnt++; + list_add(&ses->set_list, head); +} + +static void add_sits_in_set(struct f2fs_sb_info *sbi) +{ + struct f2fs_sm_info *sm_info = SM_I(sbi); + struct list_head *set_list = &sm_info->sit_entry_set; + unsigned long *bitmap = SIT_I(sbi)->dirty_sentries_bitmap; + unsigned int segno; + + for_each_set_bit(segno, bitmap, MAIN_SEGS(sbi)) + add_sit_entry(segno, set_list); +} + +static void remove_sits_in_journal(struct f2fs_sb_info *sbi) { struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA); struct f2fs_summary_block *sum = curseg->sum_blk; int i; - /* - * If the journal area in the current summary is full of sit entries, - * all the sit entries will be flushed. Otherwise the sit entries - * are not able to replace with newly hot sit entries. - */ - if (sits_in_cursum(sum) >= SIT_JOURNAL_ENTRIES) { - for (i = sits_in_cursum(sum) - 1; i >= 0; i--) { - unsigned int segno; - segno = le32_to_cpu(segno_in_journal(sum, i)); - __mark_sit_entry_dirty(sbi, segno); - } - update_sits_in_cursum(sum, -sits_in_cursum(sum)); - return true; + for (i = sits_in_cursum(sum) - 1; i >= 0; i--) { + unsigned int segno; + bool dirtied; + + segno = le32_to_cpu(segno_in_journal(sum, i)); + dirtied = __mark_sit_entry_dirty(sbi, segno); + + if (!dirtied) + add_sit_entry(segno, &SM_I(sbi)->sit_entry_set); } - return false; + update_sits_in_cursum(sum, -sits_in_cursum(sum)); } /* * CP calls this function, which flushes SIT entries including sit_journal, * and moves prefree segs to free segs. */ -void flush_sit_entries(struct f2fs_sb_info *sbi) +void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct sit_info *sit_i = SIT_I(sbi); unsigned long *bitmap = sit_i->dirty_sentries_bitmap; struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA); struct f2fs_summary_block *sum = curseg->sum_blk; - unsigned long nsegs = TOTAL_SEGS(sbi); - struct page *page = NULL; - struct f2fs_sit_block *raw_sit = NULL; - unsigned int start = 0, end = 0; - unsigned int segno; - bool flushed; + struct sit_entry_set *ses, *tmp; + struct list_head *head = &SM_I(sbi)->sit_entry_set; + bool to_journal = true; + struct seg_entry *se; mutex_lock(&curseg->curseg_mutex); mutex_lock(&sit_i->sentry_lock); /* - * "flushed" indicates whether sit entries in journal are flushed - * to the SIT area or not. + * add and account sit entries of dirty bitmap in sit entry + * set temporarily */ - flushed = flush_sits_in_journal(sbi); + add_sits_in_set(sbi); - for_each_set_bit(segno, bitmap, nsegs) { - struct seg_entry *se = get_seg_entry(sbi, segno); - int sit_offset, offset; + /* + * if there are no enough space in journal to store dirty sit + * entries, remove all entries from journal and add and account + * them in sit entry set. + */ + if (!__has_cursum_space(sum, sit_i->dirty_sentries, SIT_JOURNAL)) + remove_sits_in_journal(sbi); - sit_offset = SIT_ENTRY_OFFSET(sit_i, segno); + if (!sit_i->dirty_sentries) + goto out; - /* add discard candidates */ - if (SM_I(sbi)->nr_discards < SM_I(sbi)->max_discards) - add_discard_addrs(sbi, segno, se); + /* + * there are two steps to flush sit entries: + * #1, flush sit entries to journal in current cold data summary block. + * #2, flush sit entries to sit page. + */ + list_for_each_entry_safe(ses, tmp, head, set_list) { + struct page *page; + struct f2fs_sit_block *raw_sit = NULL; + unsigned int start_segno = ses->start_segno; + unsigned int end = min(start_segno + SIT_ENTRY_PER_BLOCK, + (unsigned long)MAIN_SEGS(sbi)); + unsigned int segno = start_segno; + + if (to_journal && + !__has_cursum_space(sum, ses->entry_cnt, SIT_JOURNAL)) + to_journal = false; + + if (!to_journal) { + page = get_next_sit_page(sbi, start_segno); + raw_sit = page_address(page); + } - if (flushed) - goto to_sit_page; + /* flush dirty sit entries in region of current sit set */ + for_each_set_bit_from(segno, bitmap, end) { + int offset, sit_offset; - offset = lookup_journal_in_cursum(sum, SIT_JOURNAL, segno, 1); - if (offset >= 0) { - segno_in_journal(sum, offset) = cpu_to_le32(segno); - seg_info_to_raw_sit(se, &sit_in_journal(sum, offset)); - goto flush_done; - } -to_sit_page: - if (!page || (start > segno) || (segno > end)) { - if (page) { - f2fs_put_page(page, 1); - page = NULL; + se = get_seg_entry(sbi, segno); + + /* add discard candidates */ + if (SM_I(sbi)->nr_discards < SM_I(sbi)->max_discards) { + cpc->trim_start = segno; + add_discard_addrs(sbi, cpc); } - start = START_SEGNO(sit_i, segno); - end = start + SIT_ENTRY_PER_BLOCK - 1; + if (to_journal) { + offset = lookup_journal_in_cursum(sum, + SIT_JOURNAL, segno, 1); + f2fs_bug_on(sbi, offset < 0); + segno_in_journal(sum, offset) = + cpu_to_le32(segno); + seg_info_to_raw_sit(se, + &sit_in_journal(sum, offset)); + } else { + sit_offset = SIT_ENTRY_OFFSET(sit_i, segno); + seg_info_to_raw_sit(se, + &raw_sit->entries[sit_offset]); + } - /* read sit block that will be updated */ - page = get_next_sit_page(sbi, start); - raw_sit = page_address(page); + __clear_bit(segno, bitmap); + sit_i->dirty_sentries--; + ses->entry_cnt--; } - /* udpate entry in SIT block */ - seg_info_to_raw_sit(se, &raw_sit->entries[sit_offset]); -flush_done: - __clear_bit(segno, bitmap); - sit_i->dirty_sentries--; + if (!to_journal) + f2fs_put_page(page, 1); + + f2fs_bug_on(sbi, ses->entry_cnt); + release_sit_entry_set(ses); + } + + f2fs_bug_on(sbi, !list_empty(head)); + f2fs_bug_on(sbi, sit_i->dirty_sentries); +out: + if (cpc->reason == CP_DISCARD) { + for (; cpc->trim_start <= cpc->trim_end; cpc->trim_start++) + add_discard_addrs(sbi, cpc); } mutex_unlock(&sit_i->sentry_lock); mutex_unlock(&curseg->curseg_mutex); - /* writeout last modified SIT block */ - f2fs_put_page(page, 1); - set_prefree_as_free_segments(sbi); } @@ -1554,16 +1770,16 @@ static int build_sit_info(struct f2fs_sb_info *sbi) SM_I(sbi)->sit_info = sit_i; - sit_i->sentries = vzalloc(TOTAL_SEGS(sbi) * sizeof(struct seg_entry)); + sit_i->sentries = vzalloc(MAIN_SEGS(sbi) * sizeof(struct seg_entry)); if (!sit_i->sentries) return -ENOMEM; - bitmap_size = f2fs_bitmap_size(TOTAL_SEGS(sbi)); + bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); sit_i->dirty_sentries_bitmap = kzalloc(bitmap_size, GFP_KERNEL); if (!sit_i->dirty_sentries_bitmap) return -ENOMEM; - for (start = 0; start < TOTAL_SEGS(sbi); start++) { + for (start = 0; start < MAIN_SEGS(sbi); start++) { sit_i->sentries[start].cur_valid_map = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); sit_i->sentries[start].ckpt_valid_map @@ -1574,7 +1790,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi) } if (sbi->segs_per_sec > 1) { - sit_i->sec_entries = vzalloc(TOTAL_SECS(sbi) * + sit_i->sec_entries = vzalloc(MAIN_SECS(sbi) * sizeof(struct sec_entry)); if (!sit_i->sec_entries) return -ENOMEM; @@ -1609,7 +1825,6 @@ static int build_sit_info(struct f2fs_sb_info *sbi) static int build_free_segmap(struct f2fs_sb_info *sbi) { - struct f2fs_sm_info *sm_info = SM_I(sbi); struct free_segmap_info *free_i; unsigned int bitmap_size, sec_bitmap_size; @@ -1620,12 +1835,12 @@ static int build_free_segmap(struct f2fs_sb_info *sbi) SM_I(sbi)->free_info = free_i; - bitmap_size = f2fs_bitmap_size(TOTAL_SEGS(sbi)); + bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); free_i->free_segmap = kmalloc(bitmap_size, GFP_KERNEL); if (!free_i->free_segmap) return -ENOMEM; - sec_bitmap_size = f2fs_bitmap_size(TOTAL_SECS(sbi)); + sec_bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi)); free_i->free_secmap = kmalloc(sec_bitmap_size, GFP_KERNEL); if (!free_i->free_secmap) return -ENOMEM; @@ -1635,8 +1850,7 @@ static int build_free_segmap(struct f2fs_sb_info *sbi) memset(free_i->free_secmap, 0xff, sec_bitmap_size); /* init free segmap information */ - free_i->start_segno = - (unsigned int) GET_SEGNO_FROM_SEG0(sbi, sm_info->main_blkaddr); + free_i->start_segno = GET_SEGNO_FROM_SEG0(sbi, MAIN_BLKADDR(sbi)); free_i->free_segments = 0; free_i->free_sections = 0; rwlock_init(&free_i->segmap_lock); @@ -1673,7 +1887,7 @@ static void build_sit_entries(struct f2fs_sb_info *sbi) int sit_blk_cnt = SIT_BLK_CNT(sbi); unsigned int i, start, end; unsigned int readed, start_blk = 0; - int nrpages = MAX_BIO_BLOCKS(max_hw_blocks(sbi)); + int nrpages = MAX_BIO_BLOCKS(sbi); do { readed = ra_meta_pages(sbi, start_blk, nrpages, META_SIT); @@ -1681,7 +1895,7 @@ static void build_sit_entries(struct f2fs_sb_info *sbi) start = start_blk * sit_i->sents_per_block; end = (start_blk + readed) * sit_i->sents_per_block; - for (; start < end && start < TOTAL_SEGS(sbi); start++) { + for (; start < end && start < MAIN_SEGS(sbi); start++) { struct seg_entry *se = &sit_i->sentries[start]; struct f2fs_sit_block *sit_blk; struct f2fs_sit_entry sit; @@ -1719,7 +1933,7 @@ static void init_free_segmap(struct f2fs_sb_info *sbi) unsigned int start; int type; - for (start = 0; start < TOTAL_SEGS(sbi); start++) { + for (start = 0; start < MAIN_SEGS(sbi); start++) { struct seg_entry *sentry = get_seg_entry(sbi, start); if (!sentry->valid_blocks) __set_free(sbi, start); @@ -1736,18 +1950,22 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); struct free_segmap_info *free_i = FREE_I(sbi); - unsigned int segno = 0, offset = 0, total_segs = TOTAL_SEGS(sbi); + unsigned int segno = 0, offset = 0; unsigned short valid_blocks; while (1) { /* find dirty segment based on free segmap */ - segno = find_next_inuse(free_i, total_segs, offset); - if (segno >= total_segs) + segno = find_next_inuse(free_i, MAIN_SEGS(sbi), offset); + if (segno >= MAIN_SEGS(sbi)) break; offset = segno + 1; valid_blocks = get_valid_blocks(sbi, segno, 0); - if (valid_blocks >= sbi->blocks_per_seg || !valid_blocks) + if (valid_blocks == sbi->blocks_per_seg || !valid_blocks) + continue; + if (valid_blocks > sbi->blocks_per_seg) { + f2fs_bug_on(sbi, 1); continue; + } mutex_lock(&dirty_i->seglist_lock); __locate_dirty_segment(sbi, segno, DIRTY); mutex_unlock(&dirty_i->seglist_lock); @@ -1757,7 +1975,7 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi) static int init_victim_secmap(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); - unsigned int bitmap_size = f2fs_bitmap_size(TOTAL_SECS(sbi)); + unsigned int bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi)); dirty_i->victim_secmap = kzalloc(bitmap_size, GFP_KERNEL); if (!dirty_i->victim_secmap) @@ -1778,7 +1996,7 @@ static int build_dirty_segmap(struct f2fs_sb_info *sbi) SM_I(sbi)->dirty_info = dirty_i; mutex_init(&dirty_i->seglist_lock); - bitmap_size = f2fs_bitmap_size(TOTAL_SEGS(sbi)); + bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); for (i = 0; i < NR_DIRTY_TYPE; i++) { dirty_i->dirty_segmap[i] = kzalloc(bitmap_size, GFP_KERNEL); @@ -1802,7 +2020,7 @@ static void init_min_max_mtime(struct f2fs_sb_info *sbi) sit_i->min_mtime = LLONG_MAX; - for (segno = 0; segno < TOTAL_SEGS(sbi); segno += sbi->segs_per_sec) { + for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) { unsigned int i; unsigned long long mtime = 0; @@ -1840,13 +2058,16 @@ int build_segment_manager(struct f2fs_sb_info *sbi) sm_info->ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr); sm_info->rec_prefree_segments = sm_info->main_segments * DEF_RECLAIM_PREFREE_SEGMENTS / 100; - sm_info->ipu_policy = F2FS_IPU_DISABLE; + sm_info->ipu_policy = 1 << F2FS_IPU_FSYNC; sm_info->min_ipu_util = DEF_MIN_IPU_UTIL; + sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS; INIT_LIST_HEAD(&sm_info->discard_list); sm_info->nr_discards = 0; sm_info->max_discards = 0; + INIT_LIST_HEAD(&sm_info->sit_entry_set); + if (test_opt(sbi, FLUSH_MERGE) && !f2fs_readonly(sbi->sb)) { err = create_flush_cmd_control(sbi); if (err) @@ -1942,7 +2163,7 @@ static void destroy_sit_info(struct f2fs_sb_info *sbi) return; if (sit_i->sentries) { - for (start = 0; start < TOTAL_SEGS(sbi); start++) { + for (start = 0; start < MAIN_SEGS(sbi); start++) { kfree(sit_i->sentries[start].cur_valid_map); kfree(sit_i->sentries[start].ckpt_valid_map); } @@ -1976,11 +2197,30 @@ int __init create_segment_manager_caches(void) discard_entry_slab = f2fs_kmem_cache_create("discard_entry", sizeof(struct discard_entry)); if (!discard_entry_slab) - return -ENOMEM; + goto fail; + + sit_entry_set_slab = f2fs_kmem_cache_create("sit_entry_set", + sizeof(struct nat_entry_set)); + if (!sit_entry_set_slab) + goto destory_discard_entry; + + inmem_entry_slab = f2fs_kmem_cache_create("inmem_page_entry", + sizeof(struct inmem_pages)); + if (!inmem_entry_slab) + goto destroy_sit_entry_set; return 0; + +destroy_sit_entry_set: + kmem_cache_destroy(sit_entry_set_slab); +destory_discard_entry: + kmem_cache_destroy(discard_entry_slab); +fail: + return -ENOMEM; } void destroy_segment_manager_caches(void) { + kmem_cache_destroy(sit_entry_set_slab); kmem_cache_destroy(discard_entry_slab); + kmem_cache_destroy(inmem_entry_slab); } diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index ff483257283b..2495bec1c621 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -45,16 +45,26 @@ (secno == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno / \ sbi->segs_per_sec)) \ -#define START_BLOCK(sbi, segno) \ - (SM_I(sbi)->seg0_blkaddr + \ +#define MAIN_BLKADDR(sbi) (SM_I(sbi)->main_blkaddr) +#define SEG0_BLKADDR(sbi) (SM_I(sbi)->seg0_blkaddr) + +#define MAIN_SEGS(sbi) (SM_I(sbi)->main_segments) +#define MAIN_SECS(sbi) (sbi->total_sections) + +#define TOTAL_SEGS(sbi) (SM_I(sbi)->segment_count) +#define TOTAL_BLKS(sbi) (TOTAL_SEGS(sbi) << sbi->log_blocks_per_seg) + +#define MAX_BLKADDR(sbi) (SEG0_BLKADDR(sbi) + TOTAL_BLKS(sbi)) +#define SEGMENT_SIZE(sbi) (1ULL << (sbi->log_blocksize + \ + sbi->log_blocks_per_seg)) + +#define START_BLOCK(sbi, segno) (SEG0_BLKADDR(sbi) + \ (GET_R2L_SEGNO(FREE_I(sbi), segno) << sbi->log_blocks_per_seg)) + #define NEXT_FREE_BLKADDR(sbi, curseg) \ (START_BLOCK(sbi, curseg->segno) + curseg->next_blkoff) -#define MAIN_BASE_BLOCK(sbi) (SM_I(sbi)->main_blkaddr) - -#define GET_SEGOFF_FROM_SEG0(sbi, blk_addr) \ - ((blk_addr) - SM_I(sbi)->seg0_blkaddr) +#define GET_SEGOFF_FROM_SEG0(sbi, blk_addr) ((blk_addr) - SEG0_BLKADDR(sbi)) #define GET_SEGNO_FROM_SEG0(sbi, blk_addr) \ (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) >> sbi->log_blocks_per_seg) #define GET_BLKOFF_FROM_SEG0(sbi, blk_addr) \ @@ -77,23 +87,21 @@ #define SIT_ENTRY_OFFSET(sit_i, segno) \ (segno % sit_i->sents_per_block) -#define SIT_BLOCK_OFFSET(sit_i, segno) \ +#define SIT_BLOCK_OFFSET(segno) \ (segno / SIT_ENTRY_PER_BLOCK) -#define START_SEGNO(sit_i, segno) \ - (SIT_BLOCK_OFFSET(sit_i, segno) * SIT_ENTRY_PER_BLOCK) +#define START_SEGNO(segno) \ + (SIT_BLOCK_OFFSET(segno) * SIT_ENTRY_PER_BLOCK) #define SIT_BLK_CNT(sbi) \ - ((TOTAL_SEGS(sbi) + SIT_ENTRY_PER_BLOCK - 1) / SIT_ENTRY_PER_BLOCK) + ((MAIN_SEGS(sbi) + SIT_ENTRY_PER_BLOCK - 1) / SIT_ENTRY_PER_BLOCK) #define f2fs_bitmap_size(nr) \ (BITS_TO_LONGS(nr) * sizeof(unsigned long)) -#define TOTAL_SEGS(sbi) (SM_I(sbi)->main_segments) -#define TOTAL_SECS(sbi) (sbi->total_sections) -#define SECTOR_FROM_BLOCK(sbi, blk_addr) \ - (((sector_t)blk_addr) << (sbi)->log_sectors_per_block) -#define SECTOR_TO_BLOCK(sbi, sectors) \ - (sectors >> (sbi)->log_sectors_per_block) -#define MAX_BIO_BLOCKS(max_hw_blocks) \ - (min((int)max_hw_blocks, BIO_MAX_PAGES)) +#define SECTOR_FROM_BLOCK(blk_addr) \ + (((sector_t)blk_addr) << F2FS_LOG_SECTORS_PER_BLOCK) +#define SECTOR_TO_BLOCK(sectors) \ + (sectors >> F2FS_LOG_SECTORS_PER_BLOCK) +#define MAX_BIO_BLOCKS(sbi) \ + ((int)min((int)max_hw_blocks(sbi), BIO_MAX_PAGES)) /* * indicate a block allocation direction: RIGHT and LEFT. @@ -167,6 +175,11 @@ struct segment_allocation { void (*allocate_segment)(struct f2fs_sb_info *, int, bool); }; +struct inmem_pages { + struct list_head list; + struct page *page; +}; + struct sit_info { const struct segment_allocation *s_ops; @@ -237,6 +250,12 @@ struct curseg_info { unsigned int next_segno; /* preallocated segment */ }; +struct sit_entry_set { + struct list_head set_list; /* link with all sit sets */ + unsigned int start_segno; /* start segno of sits in set */ + unsigned int entry_cnt; /* the # of sit entries in set */ +}; + /* * inline functions */ @@ -316,7 +335,7 @@ static inline void __set_free(struct f2fs_sb_info *sbi, unsigned int segno) clear_bit(segno, free_i->free_segmap); free_i->free_segments++; - next = find_next_bit(free_i->free_segmap, TOTAL_SEGS(sbi), start_segno); + next = find_next_bit(free_i->free_segmap, MAIN_SEGS(sbi), start_segno); if (next >= start_segno + sbi->segs_per_sec) { clear_bit(secno, free_i->free_secmap); free_i->free_sections++; @@ -430,8 +449,10 @@ static inline int reserved_sections(struct f2fs_sb_info *sbi) static inline bool need_SSR(struct f2fs_sb_info *sbi) { - return (prefree_segments(sbi) / sbi->segs_per_sec) - + free_sections(sbi) < overprovision_sections(sbi); + int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); + int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); + return free_sections(sbi) <= (node_secs + 2 * dent_secs + + reserved_sections(sbi) + 1); } static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed) @@ -466,48 +487,47 @@ static inline int utilization(struct f2fs_sb_info *sbi) * F2FS_IPU_UTIL - if FS utilization is over threashold, * F2FS_IPU_SSR_UTIL - if SSR mode is activated and FS utilization is over * threashold, + * F2FS_IPU_FSYNC - activated in fsync path only for high performance flash + * storages. IPU will be triggered only if the # of dirty + * pages over min_fsync_blocks. * F2FS_IPUT_DISABLE - disable IPU. (=default option) */ #define DEF_MIN_IPU_UTIL 70 +#define DEF_MIN_FSYNC_BLOCKS 8 enum { F2FS_IPU_FORCE, F2FS_IPU_SSR, F2FS_IPU_UTIL, F2FS_IPU_SSR_UTIL, - F2FS_IPU_DISABLE, + F2FS_IPU_FSYNC, }; static inline bool need_inplace_update(struct inode *inode) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + unsigned int policy = SM_I(sbi)->ipu_policy; /* IPU can be done only for the user data */ - if (S_ISDIR(inode->i_mode)) + if (S_ISDIR(inode->i_mode) || f2fs_is_atomic_file(inode)) return false; - /* this is only set during fdatasync */ - if (is_inode_flag_set(F2FS_I(inode), FI_NEED_IPU)) + if (policy & (0x1 << F2FS_IPU_FORCE)) + return true; + if (policy & (0x1 << F2FS_IPU_SSR) && need_SSR(sbi)) + return true; + if (policy & (0x1 << F2FS_IPU_UTIL) && + utilization(sbi) > SM_I(sbi)->min_ipu_util) + return true; + if (policy & (0x1 << F2FS_IPU_SSR_UTIL) && need_SSR(sbi) && + utilization(sbi) > SM_I(sbi)->min_ipu_util) return true; - switch (SM_I(sbi)->ipu_policy) { - case F2FS_IPU_FORCE: + /* this is only set during fdatasync */ + if (policy & (0x1 << F2FS_IPU_FSYNC) && + is_inode_flag_set(F2FS_I(inode), FI_NEED_IPU)) return true; - case F2FS_IPU_SSR: - if (need_SSR(sbi)) - return true; - break; - case F2FS_IPU_UTIL: - if (utilization(sbi) > SM_I(sbi)->min_ipu_util) - return true; - break; - case F2FS_IPU_SSR_UTIL: - if (need_SSR(sbi) && utilization(sbi) > SM_I(sbi)->min_ipu_util) - return true; - break; - case F2FS_IPU_DISABLE: - break; - } + return false; } @@ -534,18 +554,13 @@ static inline unsigned short curseg_blkoff(struct f2fs_sb_info *sbi, int type) #ifdef CONFIG_F2FS_CHECK_FS static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno) { - unsigned int end_segno = SM_I(sbi)->segment_count - 1; - BUG_ON(segno > end_segno); + BUG_ON(segno > TOTAL_SEGS(sbi) - 1); } static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr) { - struct f2fs_sm_info *sm_info = SM_I(sbi); - block_t total_blks = sm_info->segment_count << sbi->log_blocks_per_seg; - block_t start_addr = sm_info->seg0_blkaddr; - block_t end_addr = start_addr + total_blks - 1; - BUG_ON(blk_addr < start_addr); - BUG_ON(blk_addr > end_addr); + BUG_ON(blk_addr < SEG0_BLKADDR(sbi)); + BUG_ON(blk_addr >= MAX_BLKADDR(sbi)); } /* @@ -554,8 +569,6 @@ static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr) static inline void check_block_count(struct f2fs_sb_info *sbi, int segno, struct f2fs_sit_entry *raw_sit) { - struct f2fs_sm_info *sm_info = SM_I(sbi); - unsigned int end_segno = sm_info->segment_count - 1; bool is_valid = test_bit_le(0, raw_sit->valid_map) ? true : false; int valid_blocks = 0; int cur_pos = 0, next_pos; @@ -564,7 +577,7 @@ static inline void check_block_count(struct f2fs_sb_info *sbi, BUG_ON(GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg); /* check boundary of a given segment number */ - BUG_ON(segno > end_segno); + BUG_ON(segno > TOTAL_SEGS(sbi) - 1); /* check bitmap with valid block count */ do { @@ -583,16 +596,39 @@ static inline void check_block_count(struct f2fs_sb_info *sbi, BUG_ON(GET_SIT_VBLOCKS(raw_sit) != valid_blocks); } #else -#define check_seg_range(sbi, segno) -#define verify_block_addr(sbi, blk_addr) -#define check_block_count(sbi, segno, raw_sit) +static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno) +{ + if (segno > TOTAL_SEGS(sbi) - 1) + sbi->need_fsck = true; +} + +static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr) +{ + if (blk_addr < SEG0_BLKADDR(sbi) || blk_addr >= MAX_BLKADDR(sbi)) + sbi->need_fsck = true; +} + +/* + * Summary block is always treated as an invalid block + */ +static inline void check_block_count(struct f2fs_sb_info *sbi, + int segno, struct f2fs_sit_entry *raw_sit) +{ + /* check segment usage */ + if (GET_SIT_VBLOCKS(raw_sit) > sbi->blocks_per_seg) + sbi->need_fsck = true; + + /* check boundary of a given segment number */ + if (segno > TOTAL_SEGS(sbi) - 1) + sbi->need_fsck = true; +} #endif static inline pgoff_t current_sit_addr(struct f2fs_sb_info *sbi, unsigned int start) { struct sit_info *sit_i = SIT_I(sbi); - unsigned int offset = SIT_BLOCK_OFFSET(sit_i, start); + unsigned int offset = SIT_BLOCK_OFFSET(start); block_t blk_addr = sit_i->sit_base_addr + offset; check_seg_range(sbi, start); @@ -619,7 +655,7 @@ static inline pgoff_t next_sit_addr(struct f2fs_sb_info *sbi, static inline void set_to_next_sit(struct sit_info *sit_i, unsigned int start) { - unsigned int block_off = SIT_BLOCK_OFFSET(sit_i, start); + unsigned int block_off = SIT_BLOCK_OFFSET(start); if (f2fs_test_bit(block_off, sit_i->sit_bitmap)) f2fs_clear_bit(block_off, sit_i->sit_bitmap); @@ -666,7 +702,7 @@ static inline unsigned int max_hw_blocks(struct f2fs_sb_info *sbi) { struct block_device *bdev = sbi->sb->s_bdev; struct request_queue *q = bdev_get_queue(bdev); - return SECTOR_TO_BLOCK(sbi, queue_max_sectors(q)); + return SECTOR_TO_BLOCK(queue_max_sectors(q)); } /* @@ -683,7 +719,7 @@ static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type) else if (type == NODE) return 3 * sbi->blocks_per_seg; else if (type == META) - return MAX_BIO_BLOCKS(max_hw_blocks(sbi)); + return MAX_BIO_BLOCKS(sbi); else return 0; } @@ -706,7 +742,7 @@ static inline long nr_pages_to_write(struct f2fs_sb_info *sbi, int type, else if (type == NODE) desired = 3 * max_hw_blocks(sbi); else - desired = MAX_BIO_BLOCKS(max_hw_blocks(sbi)); + desired = MAX_BIO_BLOCKS(sbi); wbc->nr_to_write = desired; return desired - nr_to_write; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 41bdf511003d..41d6f700f4ee 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -190,6 +190,7 @@ F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, max_small_discards, max_discards); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks); F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level); @@ -204,6 +205,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(max_small_discards), ATTR_LIST(ipu_policy), ATTR_LIST(min_ipu_util), + ATTR_LIST(min_fsync_blocks), ATTR_LIST(max_victim_search), ATTR_LIST(dir_level), ATTR_LIST(ram_thresh), @@ -366,11 +368,13 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) /* Initialize f2fs-specific inode info */ fi->vfs_inode.i_version = 1; - atomic_set(&fi->dirty_dents, 0); + atomic_set(&fi->dirty_pages, 0); fi->i_current_depth = 1; fi->i_advise = 0; rwlock_init(&fi->ext.ext_lock); init_rwsem(&fi->i_sem); + INIT_LIST_HEAD(&fi->inmem_pages); + mutex_init(&fi->inmem_lock); set_inode_flag(fi, FI_NEW_INODE); @@ -432,14 +436,19 @@ static void f2fs_put_super(struct super_block *sb) stop_gc_thread(sbi); /* We don't need to do checkpoint when it's clean */ - if (sbi->s_dirty) - write_checkpoint(sbi, true); + if (sbi->s_dirty) { + struct cp_control cpc = { + .reason = CP_UMOUNT, + }; + write_checkpoint(sbi, &cpc); + } /* * normally superblock is clean, so we need to release this. * In addition, EIO will skip do checkpoint, we need this as well. */ release_dirty_inode(sbi); + release_discard_addrs(sbi); iput(sbi->node_inode); iput(sbi->meta_inode); @@ -464,8 +473,11 @@ int f2fs_sync_fs(struct super_block *sb, int sync) trace_f2fs_sync_fs(sb, sync); if (sync) { + struct cp_control cpc = { + .reason = CP_SYNC, + }; mutex_lock(&sbi->gc_mutex); - write_checkpoint(sbi, false); + write_checkpoint(sbi, &cpc); mutex_unlock(&sbi->gc_mutex); } else { f2fs_balance_fs(sbi); @@ -616,6 +628,9 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) org_mount_opt = sbi->mount_opt; active_logs = sbi->active_logs; + sbi->mount_opt.opt = 0; + sbi->active_logs = NR_CURSEG_TYPE; + /* parse mount options */ err = parse_options(sb, data); if (err) @@ -786,14 +801,22 @@ static int sanity_check_raw_super(struct super_block *sb, return 1; } - if (le32_to_cpu(raw_super->log_sectorsize) != - F2FS_LOG_SECTOR_SIZE) { - f2fs_msg(sb, KERN_INFO, "Invalid log sectorsize"); + /* Currently, support 512/1024/2048/4096 bytes sector size */ + if (le32_to_cpu(raw_super->log_sectorsize) > + F2FS_MAX_LOG_SECTOR_SIZE || + le32_to_cpu(raw_super->log_sectorsize) < + F2FS_MIN_LOG_SECTOR_SIZE) { + f2fs_msg(sb, KERN_INFO, "Invalid log sectorsize (%u)", + le32_to_cpu(raw_super->log_sectorsize)); return 1; } - if (le32_to_cpu(raw_super->log_sectors_per_block) != - F2FS_LOG_SECTORS_PER_BLOCK) { - f2fs_msg(sb, KERN_INFO, "Invalid log sectors per block"); + if (le32_to_cpu(raw_super->log_sectors_per_block) + + le32_to_cpu(raw_super->log_sectorsize) != + F2FS_MAX_LOG_SECTOR_SIZE) { + f2fs_msg(sb, KERN_INFO, + "Invalid log sectors per block(%u) log sectorsize(%u)", + le32_to_cpu(raw_super->log_sectors_per_block), + le32_to_cpu(raw_super->log_sectorsize)); return 1; } return 0; @@ -849,6 +872,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi) atomic_set(&sbi->nr_pages[i], 0); sbi->dir_level = DEF_DIR_LEVEL; + sbi->need_fsck = false; } /* @@ -1082,6 +1106,9 @@ try_onemore: if (err) goto free_proc; + if (!retry) + sbi->need_fsck = true; + /* recover fsynced data */ if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) { err = recover_fsync_data(sbi); diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 728a5dc3dc16..deca8728117b 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -266,7 +266,7 @@ static struct f2fs_xattr_entry *__find_xattr(void *base_addr, int index, static void *read_all_xattrs(struct inode *inode, struct page *ipage) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_xattr_header *header; size_t size = PAGE_SIZE, inline_size = 0; void *txattr_addr; @@ -325,7 +325,7 @@ fail: static inline int write_all_xattrs(struct inode *inode, __u32 hsize, void *txattr_addr, struct page *ipage) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); size_t inline_size = 0; void *xattr_addr; struct page *xpage; @@ -373,7 +373,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, alloc_nid_failed(sbi, new_nid); return PTR_ERR(xpage); } - f2fs_bug_on(new_nid); + f2fs_bug_on(sbi, new_nid); f2fs_wait_on_page_writeback(xpage, NODE); } else { struct dnode_of_data dn; @@ -596,7 +596,7 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name, const void *value, size_t size, struct page *ipage, int flags) { - struct f2fs_sb_info *sbi = F2FS_SB(inode->i_sb); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); int err; /* this case is only from init_inode_metadata */ diff --git a/fs/fat/misc.c b/fs/fat/misc.c index 628e22a5a543..d8da2d2e30ae 100644 --- a/fs/fat/misc.c +++ b/fs/fat/misc.c @@ -164,8 +164,6 @@ int fat_chain_add(struct inode *inode, int new_dclus, int nr_cluster) return 0; } -extern struct timezone sys_tz; - /* * The epoch of FAT timestamp is 1980. * : bits : value diff --git a/fs/fcntl.c b/fs/fcntl.c index 22d1c3df61ac..99d440a4a6ba 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -98,26 +98,19 @@ static void f_modown(struct file *filp, struct pid *pid, enum pid_type type, write_unlock_irq(&filp->f_owner.lock); } -int __f_setown(struct file *filp, struct pid *pid, enum pid_type type, +void __f_setown(struct file *filp, struct pid *pid, enum pid_type type, int force) { - int err; - - err = security_file_set_fowner(filp); - if (err) - return err; - + security_file_set_fowner(filp); f_modown(filp, pid, type, force); - return 0; } EXPORT_SYMBOL(__f_setown); -int f_setown(struct file *filp, unsigned long arg, int force) +void f_setown(struct file *filp, unsigned long arg, int force) { enum pid_type type; struct pid *pid; int who = arg; - int result; type = PIDTYPE_PID; if (who < 0) { type = PIDTYPE_PGID; @@ -125,9 +118,8 @@ int f_setown(struct file *filp, unsigned long arg, int force) } rcu_read_lock(); pid = find_vpid(who); - result = __f_setown(filp, pid, type, force); + __f_setown(filp, pid, type, force); rcu_read_unlock(); - return result; } EXPORT_SYMBOL(f_setown); @@ -181,7 +173,7 @@ static int f_setown_ex(struct file *filp, unsigned long arg) if (owner.pid && !pid) ret = -ESRCH; else - ret = __f_setown(filp, pid, type, 1); + __f_setown(filp, pid, type, 1); rcu_read_unlock(); return ret; @@ -302,7 +294,8 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, force_successful_syscall_return(); break; case F_SETOWN: - err = f_setown(filp, arg, 1); + f_setown(filp, arg, 1); + err = 0; break; case F_GETOWN_EX: err = f_getown_ex(filp, arg); diff --git a/fs/file.c b/fs/file.c index 66923fe3176e..ab3eb6a88239 100644 --- a/fs/file.c +++ b/fs/file.c @@ -367,7 +367,7 @@ static struct fdtable *close_files(struct files_struct * files) struct file * file = xchg(&fdt->fd[i], NULL); if (file) { filp_close(file, files); - cond_resched(); + cond_resched_rcu_qs(); } } i++; @@ -750,6 +750,7 @@ bool get_close_on_exec(unsigned int fd) static int do_dup2(struct files_struct *files, struct file *file, unsigned fd, unsigned flags) +__releases(&files->file_lock) { struct file *tofree; struct fdtable *fdt; diff --git a/fs/file_table.c b/fs/file_table.c index 385bfd31512a..3f85411b03ce 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -150,18 +150,10 @@ over: /** * alloc_file - allocate and initialize a 'struct file' - * @mnt: the vfsmount on which the file will reside - * @dentry: the dentry representing the new file + * + * @path: the (dentry, vfsmount) pair for the new file * @mode: the mode with which the new file will be opened * @fop: the 'struct file_operations' for the new file - * - * Use this instead of get_empty_filp() to get a new - * 'struct file'. Do so because of the same initialization - * pitfalls reasons listed for init_file(). This is a - * preferred interface to using init_file(). - * - * If all the callers of init_file() are eliminated, its - * code should be moved into this function. */ struct file *alloc_file(struct path *path, fmode_t mode, const struct file_operations *fop) @@ -331,5 +323,5 @@ void __init files_init(unsigned long mempages) n = (mempages * (PAGE_SIZE / 1024)) / 10; files_stat.max_files = max_t(unsigned long, n, NR_FILE); - percpu_counter_init(&nr_files, 0); + percpu_counter_init(&nr_files, 0, GFP_KERNEL); } diff --git a/fs/fscache/object-list.c b/fs/fscache/object-list.c index b8179ca6bf9d..51dde817e1f2 100644 --- a/fs/fscache/object-list.c +++ b/fs/fscache/object-list.c @@ -380,26 +380,14 @@ no_config: static int fscache_objlist_open(struct inode *inode, struct file *file) { struct fscache_objlist_data *data; - struct seq_file *m; - int ret; - ret = seq_open(file, &fscache_objlist_ops); - if (ret < 0) - return ret; - - m = file->private_data; - - /* buffer for key extraction */ - data = kmalloc(sizeof(struct fscache_objlist_data), GFP_KERNEL); - if (!data) { - seq_release(inode, file); + data = __seq_open_private(file, &fscache_objlist_ops, sizeof(*data)); + if (!data) return -ENOMEM; - } /* get the configuration key */ fscache_objlist_config(data); - m->private = data; return 0; } diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index de1d84af9f7c..dbab798f5caf 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -274,9 +274,6 @@ out: invalid: ret = 0; - - if (!(flags & LOOKUP_RCU) && check_submounts_and_drop(entry) != 0) - ret = 1; goto out; } @@ -1289,9 +1286,7 @@ static int fuse_direntplus_link(struct file *file, d_drop(dentry); } else if (get_node_id(inode) != o->nodeid || ((o->attr.mode ^ inode->i_mode) & S_IFMT)) { - err = d_invalidate(dentry); - if (err) - goto out; + d_invalidate(dentry); } else if (is_bad_inode(inode)) { err = -EIO; goto out; diff --git a/fs/gfs2/dentry.c b/fs/gfs2/dentry.c index d3a5d4e29ba5..589f4ea9381c 100644 --- a/fs/gfs2/dentry.c +++ b/fs/gfs2/dentry.c @@ -93,9 +93,6 @@ invalid_gunlock: if (!had_lock) gfs2_glock_dq_uninit(&d_gh); invalid: - if (check_submounts_and_drop(dentry) != 0) - goto valid; - dput(parent); return 0; diff --git a/fs/gfs2/dir.c b/fs/gfs2/dir.c index 1a349f9a9685..5d4261ff5d23 100644 --- a/fs/gfs2/dir.c +++ b/fs/gfs2/dir.c @@ -2100,8 +2100,13 @@ int gfs2_diradd_alloc_required(struct inode *inode, const struct qstr *name, } if (IS_ERR(dent)) return PTR_ERR(dent); - da->bh = bh; - da->dent = dent; + + if (da->save_loc) { + da->bh = bh; + da->dent = dent; + } else { + brelse(bh); + } return 0; } diff --git a/fs/gfs2/dir.h b/fs/gfs2/dir.h index 126c65dda028..e1b309c24dab 100644 --- a/fs/gfs2/dir.h +++ b/fs/gfs2/dir.h @@ -23,6 +23,7 @@ struct gfs2_diradd { unsigned nr_blocks; struct gfs2_dirent *dent; struct buffer_head *bh; + int save_loc; }; extern struct inode *gfs2_dir_search(struct inode *dir, diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 7f4ed3daa38c..80dd44dca028 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -914,26 +914,6 @@ out_uninit: #ifdef CONFIG_GFS2_FS_LOCKING_DLM /** - * gfs2_setlease - acquire/release a file lease - * @file: the file pointer - * @arg: lease type - * @fl: file lock - * - * We don't currently have a way to enforce a lease across the whole - * cluster; until we do, disable leases (by just returning -EINVAL), - * unless the administrator has requested purely local locking. - * - * Locking: called under i_lock - * - * Returns: errno - */ - -static int gfs2_setlease(struct file *file, long arg, struct file_lock **fl) -{ - return -EINVAL; -} - -/** * gfs2_lock - acquire/release a posix lock on a file * @file: the file pointer * @cmd: either modify or retrieve lock state, possibly wait @@ -1078,7 +1058,7 @@ const struct file_operations gfs2_file_fops = { .flock = gfs2_flock, .splice_read = generic_file_splice_read, .splice_write = iter_file_splice_write, - .setlease = gfs2_setlease, + .setlease = simple_nosetlease, .fallocate = gfs2_fallocate, }; diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 7f513b1ceb2c..8f0c19d1d943 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -811,7 +811,7 @@ void gfs2_holder_init(struct gfs2_glock *gl, unsigned int state, unsigned flags, { INIT_LIST_HEAD(&gh->gh_list); gh->gh_gl = gl; - gh->gh_ip = (unsigned long)__builtin_return_address(0); + gh->gh_ip = _RET_IP_; gh->gh_owner_pid = get_pid(task_pid(current)); gh->gh_state = state; gh->gh_flags = flags; @@ -835,7 +835,7 @@ void gfs2_holder_reinit(unsigned int state, unsigned flags, struct gfs2_holder * gh->gh_state = state; gh->gh_flags = flags; gh->gh_iflags = 0; - gh->gh_ip = (unsigned long)__builtin_return_address(0); + gh->gh_ip = _RET_IP_; if (gh->gh_owner_pid) put_pid(gh->gh_owner_pid); gh->gh_owner_pid = get_pid(task_pid(current)); diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 2ffc67dce87f..1cc0bba6313f 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -93,7 +93,7 @@ static void gfs2_ail_empty_gl(struct gfs2_glock *gl) * tr->alloced is not set since the transaction structure is * on the stack */ tr.tr_reserved = 1 + gfs2_struct2blk(sdp, tr.tr_revokes, sizeof(u64)); - tr.tr_ip = (unsigned long)__builtin_return_address(0); + tr.tr_ip = _RET_IP_; sb_start_intwrite(sdp->sd_vfs); if (gfs2_log_reserve(sdp, tr.tr_reserved) < 0) { sb_end_intwrite(sdp->sd_vfs); diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index fc8ac2ee0667..c4ed823d150e 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -600,7 +600,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, int error, free_vfs_inode = 0; u32 aflags = 0; unsigned blocks = 1; - struct gfs2_diradd da = { .bh = NULL, }; + struct gfs2_diradd da = { .bh = NULL, .save_loc = 1, }; if (!name->len || name->len > GFS2_FNAMESIZE) return -ENAMETOOLONG; @@ -672,6 +672,7 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; gfs2_set_inode_blocks(inode, 1); munge_mode_uid_gid(dip, inode); + check_and_update_goal(dip); ip->i_goal = dip->i_goal; ip->i_diskflags = 0; ip->i_eattr = 0; @@ -899,7 +900,7 @@ static int gfs2_link(struct dentry *old_dentry, struct inode *dir, struct gfs2_inode *ip = GFS2_I(inode); struct gfs2_holder ghs[2]; struct buffer_head *dibh; - struct gfs2_diradd da = { .bh = NULL, }; + struct gfs2_diradd da = { .bh = NULL, .save_loc = 1, }; int error; if (S_ISDIR(inode->i_mode)) @@ -1244,6 +1245,9 @@ static int gfs2_atomic_open(struct inode *dir, struct dentry *dentry, struct dentry *d; bool excl = !!(flags & O_EXCL); + if (!d_unhashed(dentry)) + goto skip_lookup; + d = __gfs2_lookup(dir, dentry, file, opened); if (IS_ERR(d)) return PTR_ERR(d); @@ -1260,6 +1264,8 @@ static int gfs2_atomic_open(struct inode *dir, struct dentry *dentry, } BUG_ON(d != NULL); + +skip_lookup: if (!(flags & O_CREAT)) return -ENOENT; @@ -1337,7 +1343,7 @@ static int gfs2_rename(struct inode *odir, struct dentry *odentry, struct gfs2_rgrpd *nrgd; unsigned int num_gh; int dir_rename = 0; - struct gfs2_diradd da = { .nr_blocks = 0, }; + struct gfs2_diradd da = { .nr_blocks = 0, .save_loc = 0, }; unsigned int x; int error; diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c index f4cb9c0d6bbd..7474c413ffd1 100644 --- a/fs/gfs2/rgrp.c +++ b/fs/gfs2/rgrp.c @@ -577,6 +577,13 @@ struct gfs2_rgrpd *gfs2_rgrpd_get_next(struct gfs2_rgrpd *rgd) return rgd; } +void check_and_update_goal(struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + if (!ip->i_goal || gfs2_blk2rgrpd(sdp, ip->i_goal, 1) == NULL) + ip->i_goal = ip->i_no_addr; +} + void gfs2_free_clones(struct gfs2_rgrpd *rgd) { int x; @@ -1910,6 +1917,7 @@ int gfs2_inplace_reserve(struct gfs2_inode *ip, const struct gfs2_alloc_parms *a } else if (ip->i_rgd && rgrp_contains_block(ip->i_rgd, ip->i_goal)) { rs->rs_rbm.rgd = begin = ip->i_rgd; } else { + check_and_update_goal(ip); rs->rs_rbm.rgd = begin = gfs2_blk2rgrpd(sdp, ip->i_goal, 1); } if (S_ISDIR(ip->i_inode.i_mode) && (ap->aflags & GFS2_AF_ORLOV)) @@ -2089,7 +2097,7 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart, u32 blen, unsigned char new_state) { struct gfs2_rbm rbm; - struct gfs2_bitmap *bi; + struct gfs2_bitmap *bi, *bi_prev = NULL; rbm.rgd = gfs2_blk2rgrpd(sdp, bstart, 1); if (!rbm.rgd) { @@ -2098,18 +2106,22 @@ static struct gfs2_rgrpd *rgblk_free(struct gfs2_sbd *sdp, u64 bstart, return NULL; } + gfs2_rbm_from_block(&rbm, bstart); while (blen--) { - gfs2_rbm_from_block(&rbm, bstart); bi = rbm_bi(&rbm); - bstart++; - if (!bi->bi_clone) { - bi->bi_clone = kmalloc(bi->bi_bh->b_size, - GFP_NOFS | __GFP_NOFAIL); - memcpy(bi->bi_clone + bi->bi_offset, - bi->bi_bh->b_data + bi->bi_offset, bi->bi_len); + if (bi != bi_prev) { + if (!bi->bi_clone) { + bi->bi_clone = kmalloc(bi->bi_bh->b_size, + GFP_NOFS | __GFP_NOFAIL); + memcpy(bi->bi_clone + bi->bi_offset, + bi->bi_bh->b_data + bi->bi_offset, + bi->bi_len); + } + gfs2_trans_add_meta(rbm.rgd->rd_gl, bi->bi_bh); + bi_prev = bi; } - gfs2_trans_add_meta(rbm.rgd->rd_gl, bi->bi_bh); gfs2_setbit(&rbm, false, new_state); + gfs2_rbm_incr(&rbm); } return rbm.rgd; diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h index 463ab2e95d1c..5d8f085f7ade 100644 --- a/fs/gfs2/rgrp.h +++ b/fs/gfs2/rgrp.h @@ -80,4 +80,5 @@ static inline bool gfs2_rs_active(struct gfs2_blkreserv *rs) return rs && !RB_EMPTY_NODE(&rs->rs_node); } +extern void check_and_update_goal(struct gfs2_inode *ip); #endif /* __RGRP_DOT_H__ */ diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c index 0546ab4e28e8..42bfd3361979 100644 --- a/fs/gfs2/trans.c +++ b/fs/gfs2/trans.c @@ -44,7 +44,7 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, if (!tr) return -ENOMEM; - tr->tr_ip = (unsigned long)__builtin_return_address(0); + tr->tr_ip = _RET_IP_; tr->tr_blocks = blocks; tr->tr_revokes = revokes; tr->tr_reserved = 1; diff --git a/fs/hfs/hfs_fs.h b/fs/hfs/hfs_fs.h index 0524cda47a6e..95d255219b1e 100644 --- a/fs/hfs/hfs_fs.h +++ b/fs/hfs/hfs_fs.h @@ -242,8 +242,6 @@ extern int hfs_mac2asc(struct super_block *, char *, const struct hfs_name *); /* super.c */ extern void hfs_mark_mdb_dirty(struct super_block *sb); -extern struct timezone sys_tz; - /* * There are two time systems. Both are based on seconds since * a particular time/date. diff --git a/fs/internal.h b/fs/internal.h index e325b4f9c799..757ba2abf21e 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -35,6 +35,11 @@ static inline int __sync_blockdev(struct block_device *bdev, int wait) #endif /* + * buffer.c + */ +extern void guard_bio_eod(int rw, struct bio *bio); + +/* * char_dev.c */ extern void __init chrdev_init(void); @@ -42,7 +47,6 @@ extern void __init chrdev_init(void); /* * namei.c */ -extern int __inode_permission(struct inode *, int); extern int user_path_mountpoint_at(int, const char __user *, unsigned int, struct path *); extern int vfs_path_lookup(struct dentry *, struct vfsmount *, const char *, unsigned int, struct path *); @@ -51,7 +55,7 @@ extern int vfs_path_lookup(struct dentry *, struct vfsmount *, * namespace.c */ extern int copy_mount_options(const void __user *, unsigned long *); -extern int copy_mount_string(const void __user *, char **); +extern char *copy_mount_string(const void __user *); extern struct vfsmount *lookup_mnt(struct path *); extern int finish_automount(struct vfsmount *, struct path *); @@ -134,12 +138,6 @@ extern long prune_dcache_sb(struct super_block *sb, unsigned long nr_to_scan, extern int rw_verify_area(int, struct file *, const loff_t *, size_t); /* - * splice.c - */ -extern long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, - loff_t *opos, size_t len, unsigned int flags); - -/* * pipe.c */ extern const struct file_operations pipefifo_fops; diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c index 5ddaf8625d3b..881b3bd0143f 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -247,7 +247,7 @@ static int isofs_dentry_cmp_common( } if (alen == blen) { if (ci) { - if (strnicmp(name->name, str, alen) == 0) + if (strncasecmp(name->name, str, alen) == 0) return 0; } else { if (strncmp(name->name, str, alen) == 0) diff --git a/fs/jbd/journal.c b/fs/jbd/journal.c index 06fe11e0abfa..aab8549591e7 100644 --- a/fs/jbd/journal.c +++ b/fs/jbd/journal.c @@ -886,7 +886,7 @@ journal_t * journal_init_inode (struct inode *inode) goto out_err; } - bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); + bh = getblk_unmovable(journal->j_dev, blocknr, journal->j_blocksize); if (!bh) { printk(KERN_ERR "%s: Cannot get buffer for journal superblock\n", diff --git a/fs/jbd2/checkpoint.c b/fs/jbd2/checkpoint.c index 7f34f4716165..988b32ed4c87 100644 --- a/fs/jbd2/checkpoint.c +++ b/fs/jbd2/checkpoint.c @@ -96,15 +96,8 @@ static int __try_to_free_cp_buf(struct journal_head *jh) if (jh->b_transaction == NULL && !buffer_locked(bh) && !buffer_dirty(bh) && !buffer_write_io_error(bh)) { - /* - * Get our reference so that bh cannot be freed before - * we unlock it - */ - get_bh(bh); JBUFFER_TRACE(jh, "remove from checkpoint list"); ret = __jbd2_journal_remove_checkpoint(jh) + 1; - BUFFER_TRACE(bh, "release"); - __brelse(bh); } return ret; } @@ -122,8 +115,6 @@ void __jbd2_log_wait_for_space(journal_t *journal) nblocks = jbd2_space_needed(journal); while (jbd2_log_space_left(journal) < nblocks) { - if (journal->j_flags & JBD2_ABORT) - return; write_unlock(&journal->j_state_lock); mutex_lock(&journal->j_checkpoint_mutex); @@ -139,6 +130,10 @@ void __jbd2_log_wait_for_space(journal_t *journal) * trace for forensic evidence. */ write_lock(&journal->j_state_lock); + if (journal->j_flags & JBD2_ABORT) { + mutex_unlock(&journal->j_checkpoint_mutex); + return; + } spin_lock(&journal->j_list_lock); nblocks = jbd2_space_needed(journal); space_left = jbd2_log_space_left(journal); @@ -183,58 +178,6 @@ void __jbd2_log_wait_for_space(journal_t *journal) } } -/* - * Clean up transaction's list of buffers submitted for io. - * We wait for any pending IO to complete and remove any clean - * buffers. Note that we take the buffers in the opposite ordering - * from the one in which they were submitted for IO. - * - * Return 0 on success, and return <0 if some buffers have failed - * to be written out. - * - * Called with j_list_lock held. - */ -static int __wait_cp_io(journal_t *journal, transaction_t *transaction) -{ - struct journal_head *jh; - struct buffer_head *bh; - tid_t this_tid; - int released = 0; - int ret = 0; - - this_tid = transaction->t_tid; -restart: - /* Did somebody clean up the transaction in the meanwhile? */ - if (journal->j_checkpoint_transactions != transaction || - transaction->t_tid != this_tid) - return ret; - while (!released && transaction->t_checkpoint_io_list) { - jh = transaction->t_checkpoint_io_list; - bh = jh2bh(jh); - get_bh(bh); - if (buffer_locked(bh)) { - spin_unlock(&journal->j_list_lock); - wait_on_buffer(bh); - /* the journal_head may have gone by now */ - BUFFER_TRACE(bh, "brelse"); - __brelse(bh); - spin_lock(&journal->j_list_lock); - goto restart; - } - if (unlikely(buffer_write_io_error(bh))) - ret = -EIO; - - /* - * Now in whatever state the buffer currently is, we know that - * it has been written out and so we can drop it from the list - */ - released = __jbd2_journal_remove_checkpoint(jh); - __brelse(bh); - } - - return ret; -} - static void __flush_batch(journal_t *journal, int *batch_count) { @@ -255,81 +198,6 @@ __flush_batch(journal_t *journal, int *batch_count) } /* - * Try to flush one buffer from the checkpoint list to disk. - * - * Return 1 if something happened which requires us to abort the current - * scan of the checkpoint list. Return <0 if the buffer has failed to - * be written out. - * - * Called with j_list_lock held and drops it if 1 is returned - */ -static int __process_buffer(journal_t *journal, struct journal_head *jh, - int *batch_count, transaction_t *transaction) -{ - struct buffer_head *bh = jh2bh(jh); - int ret = 0; - - if (buffer_locked(bh)) { - get_bh(bh); - spin_unlock(&journal->j_list_lock); - wait_on_buffer(bh); - /* the journal_head may have gone by now */ - BUFFER_TRACE(bh, "brelse"); - __brelse(bh); - ret = 1; - } else if (jh->b_transaction != NULL) { - transaction_t *t = jh->b_transaction; - tid_t tid = t->t_tid; - - transaction->t_chp_stats.cs_forced_to_close++; - spin_unlock(&journal->j_list_lock); - if (unlikely(journal->j_flags & JBD2_UNMOUNT)) - /* - * The journal thread is dead; so starting and - * waiting for a commit to finish will cause - * us to wait for a _very_ long time. - */ - printk(KERN_ERR "JBD2: %s: " - "Waiting for Godot: block %llu\n", - journal->j_devname, - (unsigned long long) bh->b_blocknr); - jbd2_log_start_commit(journal, tid); - jbd2_log_wait_commit(journal, tid); - ret = 1; - } else if (!buffer_dirty(bh)) { - ret = 1; - if (unlikely(buffer_write_io_error(bh))) - ret = -EIO; - get_bh(bh); - BUFFER_TRACE(bh, "remove from checkpoint"); - __jbd2_journal_remove_checkpoint(jh); - spin_unlock(&journal->j_list_lock); - __brelse(bh); - } else { - /* - * Important: we are about to write the buffer, and - * possibly block, while still holding the journal lock. - * We cannot afford to let the transaction logic start - * messing around with this buffer before we write it to - * disk, as that would break recoverability. - */ - BUFFER_TRACE(bh, "queue"); - get_bh(bh); - J_ASSERT_BH(bh, !buffer_jwrite(bh)); - journal->j_chkpt_bhs[*batch_count] = bh; - __buffer_relink_io(jh); - transaction->t_chp_stats.cs_written++; - (*batch_count)++; - if (*batch_count == JBD2_NR_BATCH) { - spin_unlock(&journal->j_list_lock); - __flush_batch(journal, batch_count); - ret = 1; - } - } - return ret; -} - -/* * Perform an actual checkpoint. We take the first transaction on the * list of transactions to be checkpointed and send all its buffers * to disk. We submit larger chunks of data at once. @@ -339,9 +207,11 @@ static int __process_buffer(journal_t *journal, struct journal_head *jh, */ int jbd2_log_do_checkpoint(journal_t *journal) { - transaction_t *transaction; - tid_t this_tid; - int result; + struct journal_head *jh; + struct buffer_head *bh; + transaction_t *transaction; + tid_t this_tid; + int result, batch_count = 0; jbd_debug(1, "Start checkpoint\n"); @@ -374,45 +244,117 @@ restart: * done (maybe it's a new transaction, but it fell at the same * address). */ - if (journal->j_checkpoint_transactions == transaction && - transaction->t_tid == this_tid) { - int batch_count = 0; - struct journal_head *jh; - int retry = 0, err; - - while (!retry && transaction->t_checkpoint_list) { - jh = transaction->t_checkpoint_list; - retry = __process_buffer(journal, jh, &batch_count, - transaction); - if (retry < 0 && !result) - result = retry; - if (!retry && (need_resched() || - spin_needbreak(&journal->j_list_lock))) { - spin_unlock(&journal->j_list_lock); - retry = 1; - break; - } + if (journal->j_checkpoint_transactions != transaction || + transaction->t_tid != this_tid) + goto out; + + /* checkpoint all of the transaction's buffers */ + while (transaction->t_checkpoint_list) { + jh = transaction->t_checkpoint_list; + bh = jh2bh(jh); + + if (buffer_locked(bh)) { + spin_unlock(&journal->j_list_lock); + get_bh(bh); + wait_on_buffer(bh); + /* the journal_head may have gone by now */ + BUFFER_TRACE(bh, "brelse"); + __brelse(bh); + goto retry; } + if (jh->b_transaction != NULL) { + transaction_t *t = jh->b_transaction; + tid_t tid = t->t_tid; - if (batch_count) { - if (!retry) { - spin_unlock(&journal->j_list_lock); - retry = 1; - } - __flush_batch(journal, &batch_count); + transaction->t_chp_stats.cs_forced_to_close++; + spin_unlock(&journal->j_list_lock); + if (unlikely(journal->j_flags & JBD2_UNMOUNT)) + /* + * The journal thread is dead; so + * starting and waiting for a commit + * to finish will cause us to wait for + * a _very_ long time. + */ + printk(KERN_ERR + "JBD2: %s: Waiting for Godot: block %llu\n", + journal->j_devname, (unsigned long long) bh->b_blocknr); + + jbd2_log_start_commit(journal, tid); + jbd2_log_wait_commit(journal, tid); + goto retry; + } + if (!buffer_dirty(bh)) { + if (unlikely(buffer_write_io_error(bh)) && !result) + result = -EIO; + BUFFER_TRACE(bh, "remove from checkpoint"); + if (__jbd2_journal_remove_checkpoint(jh)) + /* The transaction was released; we're done */ + goto out; + continue; } + /* + * Important: we are about to write the buffer, and + * possibly block, while still holding the journal + * lock. We cannot afford to let the transaction + * logic start messing around with this buffer before + * we write it to disk, as that would break + * recoverability. + */ + BUFFER_TRACE(bh, "queue"); + get_bh(bh); + J_ASSERT_BH(bh, !buffer_jwrite(bh)); + journal->j_chkpt_bhs[batch_count++] = bh; + __buffer_relink_io(jh); + transaction->t_chp_stats.cs_written++; + if ((batch_count == JBD2_NR_BATCH) || + need_resched() || + spin_needbreak(&journal->j_list_lock)) + goto unlock_and_flush; + } - if (retry) { + if (batch_count) { + unlock_and_flush: + spin_unlock(&journal->j_list_lock); + retry: + if (batch_count) + __flush_batch(journal, &batch_count); spin_lock(&journal->j_list_lock); goto restart; + } + + /* + * Now we issued all of the transaction's buffers, let's deal + * with the buffers that are out for I/O. + */ +restart2: + /* Did somebody clean up the transaction in the meanwhile? */ + if (journal->j_checkpoint_transactions != transaction || + transaction->t_tid != this_tid) + goto out; + + while (transaction->t_checkpoint_io_list) { + jh = transaction->t_checkpoint_io_list; + bh = jh2bh(jh); + if (buffer_locked(bh)) { + spin_unlock(&journal->j_list_lock); + get_bh(bh); + wait_on_buffer(bh); + /* the journal_head may have gone by now */ + BUFFER_TRACE(bh, "brelse"); + __brelse(bh); + spin_lock(&journal->j_list_lock); + goto restart2; } + if (unlikely(buffer_write_io_error(bh)) && !result) + result = -EIO; + /* - * Now we have cleaned up the first transaction's checkpoint - * list. Let's clean up the second one + * Now in whatever state the buffer currently is, we + * know that it has been written out and so we can + * drop it from the list */ - err = __wait_cp_io(journal, transaction); - if (!result) - result = err; + if (__jbd2_journal_remove_checkpoint(jh)) + break; } out: spin_unlock(&journal->j_list_lock); @@ -478,18 +420,16 @@ int jbd2_cleanup_journal_tail(journal_t *journal) * Find all the written-back checkpoint buffers in the given list and * release them. * - * Called with the journal locked. * Called with j_list_lock held. - * Returns number of buffers reaped (for debug) + * Returns 1 if we freed the transaction, 0 otherwise. */ - -static int journal_clean_one_cp_list(struct journal_head *jh, int *released) +static int journal_clean_one_cp_list(struct journal_head *jh) { struct journal_head *last_jh; struct journal_head *next_jh = jh; - int ret, freed = 0; + int ret; + int freed = 0; - *released = 0; if (!jh) return 0; @@ -498,13 +438,11 @@ static int journal_clean_one_cp_list(struct journal_head *jh, int *released) jh = next_jh; next_jh = jh->b_cpnext; ret = __try_to_free_cp_buf(jh); - if (ret) { - freed++; - if (ret == 2) { - *released = 1; - return freed; - } - } + if (!ret) + return freed; + if (ret == 2) + return 1; + freed = 1; /* * This function only frees up some memory * if possible so we dont have an obligation @@ -523,49 +461,49 @@ static int journal_clean_one_cp_list(struct journal_head *jh, int *released) * * Find all the written-back checkpoint buffers in the journal and release them. * - * Called with the journal locked. * Called with j_list_lock held. - * Returns number of buffers reaped (for debug) */ - -int __jbd2_journal_clean_checkpoint_list(journal_t *journal) +void __jbd2_journal_clean_checkpoint_list(journal_t *journal) { transaction_t *transaction, *last_transaction, *next_transaction; - int ret = 0; - int released; + int ret; transaction = journal->j_checkpoint_transactions; if (!transaction) - goto out; + return; last_transaction = transaction->t_cpprev; next_transaction = transaction; do { transaction = next_transaction; next_transaction = transaction->t_cpnext; - ret += journal_clean_one_cp_list(transaction-> - t_checkpoint_list, &released); + ret = journal_clean_one_cp_list(transaction->t_checkpoint_list); /* * This function only frees up some memory if possible so we * dont have an obligation to finish processing. Bail out if * preemption requested: */ if (need_resched()) - goto out; - if (released) + return; + if (ret) continue; /* * It is essential that we are as careful as in the case of * t_checkpoint_list with removing the buffer from the list as * we can possibly see not yet submitted buffers on io_list */ - ret += journal_clean_one_cp_list(transaction-> - t_checkpoint_io_list, &released); + ret = journal_clean_one_cp_list(transaction-> + t_checkpoint_io_list); if (need_resched()) - goto out; + return; + /* + * Stop scanning if we couldn't free the transaction. This + * avoids pointless scanning of transactions which still + * weren't checkpointed. + */ + if (!ret) + return; } while (transaction != last_transaction); -out: - return ret; } /* diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 19d74d86d99c..e4dc74713a43 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -1237,7 +1237,7 @@ journal_t * jbd2_journal_init_inode (struct inode *inode) goto out_err; } - bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); + bh = getblk_unmovable(journal->j_dev, blocknr, journal->j_blocksize); if (!bh) { printk(KERN_ERR "%s: Cannot get buffer for journal superblock\n", @@ -1522,14 +1522,6 @@ static int journal_get_superblock(journal_t *journal) goto out; } - if (jbd2_journal_has_csum_v2or3(journal) && - JBD2_HAS_COMPAT_FEATURE(journal, JBD2_FEATURE_COMPAT_CHECKSUM)) { - /* Can't have checksum v1 and v2 on at the same time! */ - printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2 " - "at the same time!\n"); - goto out; - } - if (JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V2) && JBD2_HAS_INCOMPAT_FEATURE(journal, JBD2_FEATURE_INCOMPAT_CSUM_V3)) { /* Can't have checksum v2 and v3 at the same time! */ @@ -1538,6 +1530,14 @@ static int journal_get_superblock(journal_t *journal) goto out; } + if (jbd2_journal_has_csum_v2or3(journal) && + JBD2_HAS_COMPAT_FEATURE(journal, JBD2_FEATURE_COMPAT_CHECKSUM)) { + /* Can't have checksum v1 and v2 on at the same time! */ + printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2/3 " + "at the same time!\n"); + goto out; + } + if (!jbd2_verify_csum_type(journal, sb)) { printk(KERN_ERR "JBD2: Unknown checksum type\n"); goto out; diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index 9b329b55ffe3..bcbef08a4d8f 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -525,6 +525,7 @@ static int do_one_pass(journal_t *journal, !jbd2_descr_block_csum_verify(journal, bh->b_data)) { err = -EIO; + brelse(bh); goto failed; } diff --git a/fs/jffs2/jffs2_fs_sb.h b/fs/jffs2/jffs2_fs_sb.h index 413ef89c2d1b..046fee8b6e9b 100644 --- a/fs/jffs2/jffs2_fs_sb.h +++ b/fs/jffs2/jffs2_fs_sb.h @@ -134,8 +134,6 @@ struct jffs2_sb_info { struct rw_semaphore wbuf_sem; /* Protects the write buffer */ struct delayed_work wbuf_dwork; /* write-buffer write-out work */ - int wbuf_queued; /* non-zero delayed work is queued */ - spinlock_t wbuf_dwork_lock; /* protects wbuf_dwork and and wbuf_queued */ unsigned char *oobbuf; int oobavail; /* How many bytes are available for JFFS2 in OOB */ diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c index a6597d60d76d..09ed55190ee2 100644 --- a/fs/jffs2/wbuf.c +++ b/fs/jffs2/wbuf.c @@ -1162,10 +1162,6 @@ static void delayed_wbuf_sync(struct work_struct *work) struct jffs2_sb_info *c = work_to_sb(work); struct super_block *sb = OFNI_BS_2SFFJ(c); - spin_lock(&c->wbuf_dwork_lock); - c->wbuf_queued = 0; - spin_unlock(&c->wbuf_dwork_lock); - if (!(sb->s_flags & MS_RDONLY)) { jffs2_dbg(1, "%s()\n", __func__); jffs2_flush_wbuf_gc(c, 0); @@ -1180,14 +1176,9 @@ void jffs2_dirty_trigger(struct jffs2_sb_info *c) if (sb->s_flags & MS_RDONLY) return; - spin_lock(&c->wbuf_dwork_lock); - if (!c->wbuf_queued) { + delay = msecs_to_jiffies(dirty_writeback_interval * 10); + if (queue_delayed_work(system_long_wq, &c->wbuf_dwork, delay)) jffs2_dbg(1, "%s()\n", __func__); - delay = msecs_to_jiffies(dirty_writeback_interval * 10); - queue_delayed_work(system_long_wq, &c->wbuf_dwork, delay); - c->wbuf_queued = 1; - } - spin_unlock(&c->wbuf_dwork_lock); } int jffs2_nand_flash_setup(struct jffs2_sb_info *c) @@ -1211,7 +1202,6 @@ int jffs2_nand_flash_setup(struct jffs2_sb_info *c) /* Initialise write buffer */ init_rwsem(&c->wbuf_sem); - spin_lock_init(&c->wbuf_dwork_lock); INIT_DELAYED_WORK(&c->wbuf_dwork, delayed_wbuf_sync); c->wbuf_pagesize = c->mtd->writesize; c->wbuf_ofs = 0xFFFFFFFF; @@ -1251,7 +1241,6 @@ int jffs2_dataflash_setup(struct jffs2_sb_info *c) { /* Initialize write buffer */ init_rwsem(&c->wbuf_sem); - spin_lock_init(&c->wbuf_dwork_lock); INIT_DELAYED_WORK(&c->wbuf_dwork, delayed_wbuf_sync); c->wbuf_pagesize = c->mtd->erasesize; @@ -1311,7 +1300,6 @@ int jffs2_nor_wbuf_flash_setup(struct jffs2_sb_info *c) { /* Initialize write buffer */ init_rwsem(&c->wbuf_sem); - spin_lock_init(&c->wbuf_dwork_lock); INIT_DELAYED_WORK(&c->wbuf_dwork, delayed_wbuf_sync); c->wbuf_pagesize = c->mtd->writesize; @@ -1346,7 +1334,6 @@ int jffs2_ubivol_setup(struct jffs2_sb_info *c) { return 0; init_rwsem(&c->wbuf_sem); - spin_lock_init(&c->wbuf_dwork_lock); INIT_DELAYED_WORK(&c->wbuf_dwork, delayed_wbuf_sync); c->wbuf_pagesize = c->mtd->writesize; diff --git a/fs/jfs/jfs_logmgr.c b/fs/jfs/jfs_logmgr.c index 0acddf60af55..bc462dcd7a40 100644 --- a/fs/jfs/jfs_logmgr.c +++ b/fs/jfs/jfs_logmgr.c @@ -1585,7 +1585,6 @@ void jfs_flush_journal(struct jfs_log *log, int wait) set_current_state(TASK_UNINTERRUPTIBLE); LOGGC_UNLOCK(log); schedule(); - __set_current_state(TASK_RUNNING); LOGGC_LOCK(log); remove_wait_queue(&target->gcwait, &__wait); } @@ -2359,7 +2358,6 @@ int jfsIOWait(void *arg) set_current_state(TASK_INTERRUPTIBLE); spin_unlock_irq(&log_redrive_lock); schedule(); - __set_current_state(TASK_RUNNING); } } while (!kthread_should_stop()); diff --git a/fs/jfs/jfs_txnmgr.c b/fs/jfs/jfs_txnmgr.c index 564c4f279ac6..d595856453b2 100644 --- a/fs/jfs/jfs_txnmgr.c +++ b/fs/jfs/jfs_txnmgr.c @@ -136,7 +136,6 @@ static inline void TXN_SLEEP_DROP_LOCK(wait_queue_head_t * event) set_current_state(TASK_UNINTERRUPTIBLE); TXN_UNLOCK(); io_schedule(); - __set_current_state(TASK_RUNNING); remove_wait_queue(event, &wait); } @@ -2808,7 +2807,6 @@ int jfs_lazycommit(void *arg) set_current_state(TASK_INTERRUPTIBLE); LAZY_UNLOCK(flags); schedule(); - __set_current_state(TASK_RUNNING); remove_wait_queue(&jfs_commit_thread_wait, &wq); } } while (!kthread_should_stop()); @@ -2996,7 +2994,6 @@ int jfs_sync(void *arg) set_current_state(TASK_INTERRUPTIBLE); TXN_UNLOCK(); schedule(); - __set_current_state(TASK_RUNNING); } } while (!kthread_should_stop()); diff --git a/fs/jfs/super.c b/fs/jfs/super.c index adf8cb045b9e..93e897e588a8 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -550,7 +550,7 @@ static int jfs_fill_super(struct super_block *sb, void *data, int silent) inode->i_ino = 0; inode->i_size = sb->s_bdev->bd_inode->i_size; inode->i_mapping->a_ops = &jfs_metapage_aops; - insert_inode_hash(inode); + hlist_add_fake(&inode->i_hash); mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); sbi->direct_inode = inode; diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index a693f5b01ae6..1c771931bb60 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -463,21 +463,10 @@ static int kernfs_dop_revalidate(struct dentry *dentry, unsigned int flags) goto out_bad; mutex_unlock(&kernfs_mutex); -out_valid: return 1; out_bad: mutex_unlock(&kernfs_mutex); out_bad_unlocked: - /* - * @dentry doesn't match the underlying kernfs node, drop the - * dentry and force lookup. If we have submounts we must allow the - * vfs caches to lie about the state of the filesystem to prevent - * leaks and other nasty things, so use check_submounts_and_drop() - * instead of d_drop(). - */ - if (check_submounts_and_drop(dentry) != 0) - goto out_valid; - return 0; } diff --git a/fs/libfs.c b/fs/libfs.c index 88e3e00e2eca..171d2846f2a3 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -1075,3 +1075,21 @@ struct inode *alloc_anon_inode(struct super_block *s) return inode; } EXPORT_SYMBOL(alloc_anon_inode); + +/** + * simple_nosetlease - generic helper for prohibiting leases + * @filp: file pointer + * @arg: type of lease to obtain + * @flp: new lease supplied for insertion + * @priv: private data for lm_setup operation + * + * Generic helper for filesystems that do not wish to allow leases to be set. + * All arguments are ignored and it just returns -EINVAL. + */ +int +simple_nosetlease(struct file *filp, long arg, struct file_lock **flp, + void **priv) +{ + return -EINVAL; +} +EXPORT_SYMBOL(simple_nosetlease); diff --git a/fs/lockd/Makefile b/fs/lockd/Makefile index ca58d64374ca..9b320cc2a8cf 100644 --- a/fs/lockd/Makefile +++ b/fs/lockd/Makefile @@ -5,6 +5,7 @@ obj-$(CONFIG_LOCKD) += lockd.o lockd-objs-y := clntlock.o clntproc.o clntxdr.o host.o svc.o svclock.o \ - svcshare.o svcproc.o svcsubs.o mon.o xdr.o grace.o + svcshare.o svcproc.o svcsubs.o mon.o xdr.o lockd-objs-$(CONFIG_LOCKD_V4) += clnt4xdr.o xdr4.o svc4proc.o +lockd-objs-$(CONFIG_PROC_FS) += procfs.o lockd-objs := $(lockd-objs-y) diff --git a/fs/lockd/mon.c b/fs/lockd/mon.c index daa8e7514eae..9106f42c472c 100644 --- a/fs/lockd/mon.c +++ b/fs/lockd/mon.c @@ -159,6 +159,12 @@ static int nsm_mon_unmon(struct nsm_handle *nsm, u32 proc, struct nsm_res *res, msg.rpc_proc = &clnt->cl_procinfo[proc]; status = rpc_call_sync(clnt, &msg, RPC_TASK_SOFTCONN); + if (status == -ECONNREFUSED) { + dprintk("lockd: NSM upcall RPC failed, status=%d, forcing rebind\n", + status); + rpc_force_rebind(clnt); + status = rpc_call_sync(clnt, &msg, RPC_TASK_SOFTCONN); + } if (status < 0) dprintk("lockd: NSM upcall RPC failed, status=%d\n", status); diff --git a/fs/lockd/netns.h b/fs/lockd/netns.h index 5010b55628b4..097bfa3adb1c 100644 --- a/fs/lockd/netns.h +++ b/fs/lockd/netns.h @@ -11,7 +11,6 @@ struct lockd_net { struct delayed_work grace_period_end; struct lock_manager lockd_manager; - struct list_head grace_list; spinlock_t nsm_clnt_lock; unsigned int nsm_users; diff --git a/fs/lockd/procfs.c b/fs/lockd/procfs.c new file mode 100644 index 000000000000..2a0a98480e39 --- /dev/null +++ b/fs/lockd/procfs.c @@ -0,0 +1,92 @@ +/* + * Procfs support for lockd + * + * Copyright (c) 2014 Jeff Layton <jlayton@primarydata.com> + */ + +#include <linux/fs.h> +#include <linux/proc_fs.h> +#include <linux/module.h> +#include <linux/nsproxy.h> +#include <net/net_namespace.h> + +#include "netns.h" +#include "procfs.h" + +/* + * We only allow strings that start with 'Y', 'y', or '1'. + */ +static ssize_t +nlm_end_grace_write(struct file *file, const char __user *buf, size_t size, + loff_t *pos) +{ + char *data; + struct lockd_net *ln = net_generic(current->nsproxy->net_ns, + lockd_net_id); + + if (size < 1) + return -EINVAL; + + data = simple_transaction_get(file, buf, size); + if (IS_ERR(data)) + return PTR_ERR(data); + + switch(data[0]) { + case 'Y': + case 'y': + case '1': + locks_end_grace(&ln->lockd_manager); + break; + default: + return -EINVAL; + } + + return size; +} + +static ssize_t +nlm_end_grace_read(struct file *file, char __user *buf, size_t size, + loff_t *pos) +{ + struct lockd_net *ln = net_generic(current->nsproxy->net_ns, + lockd_net_id); + char resp[3]; + + resp[0] = list_empty(&ln->lockd_manager.list) ? 'Y' : 'N'; + resp[1] = '\n'; + resp[2] = '\0'; + + return simple_read_from_buffer(buf, size, pos, resp, sizeof(resp)); +} + +static const struct file_operations lockd_end_grace_operations = { + .write = nlm_end_grace_write, + .read = nlm_end_grace_read, + .llseek = default_llseek, + .release = simple_transaction_release, + .owner = THIS_MODULE, +}; + +int __init +lockd_create_procfs(void) +{ + struct proc_dir_entry *entry; + + entry = proc_mkdir("fs/lockd", NULL); + if (!entry) + return -ENOMEM; + entry = proc_create("nlm_end_grace", S_IRUGO|S_IWUSR, entry, + &lockd_end_grace_operations); + if (!entry) { + remove_proc_entry("fs/lockd", NULL); + return -ENOMEM; + } + return 0; +} + +void __exit +lockd_remove_procfs(void) +{ + remove_proc_entry("fs/lockd/nlm_end_grace", NULL); + remove_proc_entry("fs/lockd", NULL); +} diff --git a/fs/lockd/procfs.h b/fs/lockd/procfs.h new file mode 100644 index 000000000000..2257a1311027 --- /dev/null +++ b/fs/lockd/procfs.h @@ -0,0 +1,28 @@ +/* + * Procfs support for lockd + * + * Copyright (c) 2014 Jeff Layton <jlayton@primarydata.com> + */ +#ifndef _LOCKD_PROCFS_H +#define _LOCKD_PROCFS_H + +#include <linux/kconfig.h> + +#if IS_ENABLED(CONFIG_PROC_FS) +int lockd_create_procfs(void); +void lockd_remove_procfs(void); +#else +static inline int +lockd_create_procfs(void) +{ + return 0; +} + +static inline void +lockd_remove_procfs(void) +{ + return; +} +#endif /* IS_ENABLED(CONFIG_PROC_FS) */ + +#endif /* _LOCKD_PROCFS_H */ diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index ec9e082f9ecd..d1bb7ecfd201 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -36,6 +36,7 @@ #include <linux/nfs.h> #include "netns.h" +#include "procfs.h" #define NLMDBG_FACILITY NLMDBG_SVC #define LOCKD_BUFSIZE (1024 + NLMSVC_XDRSIZE) @@ -304,13 +305,16 @@ static int lockd_start_svc(struct svc_serv *serv) svc_sock_update_bufs(serv); serv->sv_maxconn = nlm_max_connections; - nlmsvc_task = kthread_run(lockd, nlmsvc_rqst, "%s", serv->sv_name); + nlmsvc_task = kthread_create(lockd, nlmsvc_rqst, "%s", serv->sv_name); if (IS_ERR(nlmsvc_task)) { error = PTR_ERR(nlmsvc_task); printk(KERN_WARNING "lockd_up: kthread_run failed, error=%d\n", error); goto out_task; } + nlmsvc_rqst->rq_task = nlmsvc_task; + wake_up_process(nlmsvc_task); + dprintk("lockd_up: service started\n"); return 0; @@ -581,7 +585,7 @@ static int lockd_init_net(struct net *net) struct lockd_net *ln = net_generic(net, lockd_net_id); INIT_DELAYED_WORK(&ln->grace_period_end, grace_ender); - INIT_LIST_HEAD(&ln->grace_list); + INIT_LIST_HEAD(&ln->lockd_manager.list); spin_lock_init(&ln->nsm_clnt_lock); return 0; } @@ -615,8 +619,15 @@ static int __init init_nlm(void) err = register_pernet_subsys(&lockd_net_ops); if (err) goto err_pernet; + + err = lockd_create_procfs(); + if (err) + goto err_procfs; + return 0; +err_procfs: + unregister_pernet_subsys(&lockd_net_ops); err_pernet: #ifdef CONFIG_SYSCTL unregister_sysctl_table(nlm_sysctl_table); @@ -629,6 +640,7 @@ static void __exit exit_nlm(void) { /* FIXME: delete all NLM clients */ nlm_shutdown_hosts(); + lockd_remove_procfs(); unregister_pernet_subsys(&lockd_net_ops); #ifdef CONFIG_SYSCTL unregister_sysctl_table(nlm_sysctl_table); diff --git a/fs/lockd/svclock.c b/fs/lockd/svclock.c index ab798a88ec1d..13db95f54176 100644 --- a/fs/lockd/svclock.c +++ b/fs/lockd/svclock.c @@ -245,7 +245,6 @@ nlmsvc_create_block(struct svc_rqst *rqstp, struct nlm_host *host, block->b_daemon = rqstp->rq_server; block->b_host = host; block->b_file = file; - block->b_fl = NULL; file->f_count++; /* Add to file's list of blocks */ @@ -295,7 +294,6 @@ static void nlmsvc_free_block(struct kref *kref) nlmsvc_freegrantargs(block->b_call); nlmsvc_release_call(block->b_call); nlm_release_file(block->b_file); - kfree(block->b_fl); kfree(block); } @@ -508,7 +506,6 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file, struct nlm_host *host, struct nlm_lock *lock, struct nlm_lock *conflock, struct nlm_cookie *cookie) { - struct nlm_block *block = NULL; int error; __be32 ret; @@ -519,63 +516,26 @@ nlmsvc_testlock(struct svc_rqst *rqstp, struct nlm_file *file, (long long)lock->fl.fl_start, (long long)lock->fl.fl_end); - /* Get existing block (in case client is busy-waiting) */ - block = nlmsvc_lookup_block(file, lock); - - if (block == NULL) { - struct file_lock *conf = kzalloc(sizeof(*conf), GFP_KERNEL); - - if (conf == NULL) - return nlm_granted; - block = nlmsvc_create_block(rqstp, host, file, lock, cookie); - if (block == NULL) { - kfree(conf); - return nlm_granted; - } - block->b_fl = conf; - } - if (block->b_flags & B_QUEUED) { - dprintk("lockd: nlmsvc_testlock deferred block %p flags %d fl %p\n", - block, block->b_flags, block->b_fl); - if (block->b_flags & B_TIMED_OUT) { - nlmsvc_unlink_block(block); - ret = nlm_lck_denied; - goto out; - } - if (block->b_flags & B_GOT_CALLBACK) { - nlmsvc_unlink_block(block); - if (block->b_fl != NULL - && block->b_fl->fl_type != F_UNLCK) { - lock->fl = *block->b_fl; - goto conf_lock; - } else { - ret = nlm_granted; - goto out; - } - } - ret = nlm_drop_reply; - goto out; - } - if (locks_in_grace(SVC_NET(rqstp))) { ret = nlm_lck_denied_grace_period; goto out; } + error = vfs_test_lock(file->f_file, &lock->fl); - if (error == FILE_LOCK_DEFERRED) { - ret = nlmsvc_defer_lock_rqst(rqstp, block); - goto out; - } if (error) { + /* We can't currently deal with deferred test requests */ + if (error == FILE_LOCK_DEFERRED) + WARN_ON_ONCE(1); + ret = nlm_lck_denied_nolocks; goto out; } + if (lock->fl.fl_type == F_UNLCK) { ret = nlm_granted; goto out; } -conf_lock: dprintk("lockd: conflicting lock(ty=%d, %Ld-%Ld)\n", lock->fl.fl_type, (long long)lock->fl.fl_start, (long long)lock->fl.fl_end); @@ -586,10 +546,9 @@ conf_lock: conflock->fl.fl_type = lock->fl.fl_type; conflock->fl.fl_start = lock->fl.fl_start; conflock->fl.fl_end = lock->fl.fl_end; + locks_release_private(&lock->fl); ret = nlm_lck_denied; out: - if (block) - nlmsvc_release_block(block); return ret; } @@ -660,29 +619,22 @@ nlmsvc_cancel_blocked(struct net *net, struct nlm_file *file, struct nlm_lock *l * This is a callback from the filesystem for VFS file lock requests. * It will be used if lm_grant is defined and the filesystem can not * respond to the request immediately. - * For GETLK request it will copy the reply to the nlm_block. * For SETLK or SETLKW request it will get the local posix lock. * In all cases it will move the block to the head of nlm_blocked q where * nlmsvc_retry_blocked() can send back a reply for SETLKW or revisit the * deferred rpc for GETLK and SETLK. */ static void -nlmsvc_update_deferred_block(struct nlm_block *block, struct file_lock *conf, - int result) +nlmsvc_update_deferred_block(struct nlm_block *block, int result) { block->b_flags |= B_GOT_CALLBACK; if (result == 0) block->b_granted = 1; else block->b_flags |= B_TIMED_OUT; - if (conf) { - if (block->b_fl) - __locks_copy_lock(block->b_fl, conf); - } } -static int nlmsvc_grant_deferred(struct file_lock *fl, struct file_lock *conf, - int result) +static int nlmsvc_grant_deferred(struct file_lock *fl, int result) { struct nlm_block *block; int rc = -ENOENT; @@ -697,7 +649,7 @@ static int nlmsvc_grant_deferred(struct file_lock *fl, struct file_lock *conf, rc = -ENOLCK; break; } - nlmsvc_update_deferred_block(block, conf, result); + nlmsvc_update_deferred_block(block, result); } else if (result == 0) block->b_granted = 1; diff --git a/fs/locks.c b/fs/locks.c index bb08857f90b5..735b8d3fa78c 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -230,8 +230,12 @@ void locks_release_private(struct file_lock *fl) fl->fl_ops->fl_release_private(fl); fl->fl_ops = NULL; } - fl->fl_lmops = NULL; + if (fl->fl_lmops) { + if (fl->fl_lmops->lm_put_owner) + fl->fl_lmops->lm_put_owner(fl); + fl->fl_lmops = NULL; + } } EXPORT_SYMBOL_GPL(locks_release_private); @@ -267,21 +271,10 @@ void locks_init_lock(struct file_lock *fl) EXPORT_SYMBOL(locks_init_lock); -static void locks_copy_private(struct file_lock *new, struct file_lock *fl) -{ - if (fl->fl_ops) { - if (fl->fl_ops->fl_copy_lock) - fl->fl_ops->fl_copy_lock(new, fl); - new->fl_ops = fl->fl_ops; - } - if (fl->fl_lmops) - new->fl_lmops = fl->fl_lmops; -} - /* * Initialize a new lock from an existing file_lock structure. */ -void __locks_copy_lock(struct file_lock *new, const struct file_lock *fl) +void locks_copy_conflock(struct file_lock *new, struct file_lock *fl) { new->fl_owner = fl->fl_owner; new->fl_pid = fl->fl_pid; @@ -290,22 +283,30 @@ void __locks_copy_lock(struct file_lock *new, const struct file_lock *fl) new->fl_type = fl->fl_type; new->fl_start = fl->fl_start; new->fl_end = fl->fl_end; + new->fl_lmops = fl->fl_lmops; new->fl_ops = NULL; - new->fl_lmops = NULL; + + if (fl->fl_lmops) { + if (fl->fl_lmops->lm_get_owner) + fl->fl_lmops->lm_get_owner(new, fl); + } } -EXPORT_SYMBOL(__locks_copy_lock); +EXPORT_SYMBOL(locks_copy_conflock); void locks_copy_lock(struct file_lock *new, struct file_lock *fl) { /* "new" must be a freshly-initialized lock */ WARN_ON_ONCE(new->fl_ops); - __locks_copy_lock(new, fl); + locks_copy_conflock(new, fl); + new->fl_file = fl->fl_file; new->fl_ops = fl->fl_ops; - new->fl_lmops = fl->fl_lmops; - locks_copy_private(new, fl); + if (fl->fl_ops) { + if (fl->fl_ops->fl_copy_lock) + fl->fl_ops->fl_copy_lock(new, fl); + } } EXPORT_SYMBOL(locks_copy_lock); @@ -325,17 +326,18 @@ static inline int flock_translate_cmd(int cmd) { } /* Fill in a file_lock structure with an appropriate FLOCK lock. */ -static int flock_make_lock(struct file *filp, struct file_lock **lock, - unsigned int cmd) +static struct file_lock * +flock_make_lock(struct file *filp, unsigned int cmd) { struct file_lock *fl; int type = flock_translate_cmd(cmd); + if (type < 0) - return type; + return ERR_PTR(type); fl = locks_alloc_lock(); if (fl == NULL) - return -ENOMEM; + return ERR_PTR(-ENOMEM); fl->fl_file = filp; fl->fl_owner = filp; @@ -344,8 +346,7 @@ static int flock_make_lock(struct file *filp, struct file_lock **lock, fl->fl_type = type; fl->fl_end = OFFSET_MAX; - *lock = fl; - return 0; + return fl; } static int assign_type(struct file_lock *fl, long type) @@ -426,14 +427,34 @@ static int flock_to_posix_lock(struct file *filp, struct file_lock *fl, } /* default lease lock manager operations */ -static void lease_break_callback(struct file_lock *fl) +static bool +lease_break_callback(struct file_lock *fl) { kill_fasync(&fl->fl_fasync, SIGIO, POLL_MSG); + return false; +} + +static void +lease_setup(struct file_lock *fl, void **priv) +{ + struct file *filp = fl->fl_file; + struct fasync_struct *fa = *priv; + + /* + * fasync_insert_entry() returns the old entry if any. If there was no + * old entry, then it used "priv" and inserted it into the fasync list. + * Clear the pointer to indicate that it shouldn't be freed. + */ + if (!fasync_insert_entry(fa->fa_fd, filp, &fl->fl_fasync, fa)) + *priv = NULL; + + __f_setown(filp, task_pid(current), PIDTYPE_PID, 0); } static const struct lock_manager_operations lease_manager_ops = { .lm_break = lease_break_callback, .lm_change = lease_modify, + .lm_setup = lease_setup, }; /* @@ -444,7 +465,7 @@ static int lease_init(struct file *filp, long type, struct file_lock *fl) if (assign_type(fl, type) != 0) return -EINVAL; - fl->fl_owner = current->files; + fl->fl_owner = filp; fl->fl_pid = current->tgid; fl->fl_file = filp; @@ -735,7 +756,7 @@ posix_test_lock(struct file *filp, struct file_lock *fl) break; } if (cfl) { - __locks_copy_lock(fl, cfl); + locks_copy_conflock(fl, cfl); if (cfl->fl_nspid) fl->fl_pid = pid_vnr(cfl->fl_nspid); } else @@ -941,7 +962,7 @@ static int __posix_lock_file(struct inode *inode, struct file_lock *request, str if (!posix_locks_conflict(request, fl)) continue; if (conflock) - __locks_copy_lock(conflock, fl); + locks_copy_conflock(conflock, fl); error = -EAGAIN; if (!(request->fl_flags & FL_SLEEP)) goto out; @@ -1273,7 +1294,7 @@ static void lease_clear_pending(struct file_lock *fl, int arg) } /* We already had a lease on this file; just change its type */ -int lease_modify(struct file_lock **before, int arg) +int lease_modify(struct file_lock **before, int arg, struct list_head *dispose) { struct file_lock *fl = *before; int error = assign_type(fl, arg); @@ -1292,11 +1313,10 @@ int lease_modify(struct file_lock **before, int arg) printk(KERN_ERR "locks_delete_lock: fasync == %p\n", fl->fl_fasync); fl->fl_fasync = NULL; } - locks_delete_lock(before, NULL); + locks_delete_lock(before, dispose); } return 0; } - EXPORT_SYMBOL(lease_modify); static bool past_time(unsigned long then) @@ -1307,18 +1327,20 @@ static bool past_time(unsigned long then) return time_after(jiffies, then); } -static void time_out_leases(struct inode *inode) +static void time_out_leases(struct inode *inode, struct list_head *dispose) { struct file_lock **before; struct file_lock *fl; + lockdep_assert_held(&inode->i_lock); + before = &inode->i_flock; while ((fl = *before) && IS_LEASE(fl) && lease_breaking(fl)) { trace_time_out_leases(inode, fl); if (past_time(fl->fl_downgrade_time)) - lease_modify(before, F_RDLCK); + lease_modify(before, F_RDLCK, dispose); if (past_time(fl->fl_break_time)) - lease_modify(before, F_UNLCK); + lease_modify(before, F_UNLCK, dispose); if (fl == *before) /* lease_modify may have freed fl */ before = &fl->fl_next; } @@ -1331,6 +1353,20 @@ static bool leases_conflict(struct file_lock *lease, struct file_lock *breaker) return locks_conflict(breaker, lease); } +static bool +any_leases_conflict(struct inode *inode, struct file_lock *breaker) +{ + struct file_lock *fl; + + lockdep_assert_held(&inode->i_lock); + + for (fl = inode->i_flock ; fl && IS_LEASE(fl); fl = fl->fl_next) { + if (leases_conflict(fl, breaker)) + return true; + } + return false; +} + /** * __break_lease - revoke all outstanding leases on file * @inode: the inode of the file to return @@ -1347,12 +1383,11 @@ static bool leases_conflict(struct file_lock *lease, struct file_lock *breaker) int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) { int error = 0; - struct file_lock *new_fl, *flock; - struct file_lock *fl; + struct file_lock *new_fl; + struct file_lock *fl, **before; unsigned long break_time; - int i_have_this_lease = 0; - bool lease_conflict = false; int want_write = (mode & O_ACCMODE) != O_RDONLY; + LIST_HEAD(dispose); new_fl = lease_alloc(NULL, want_write ? F_WRLCK : F_RDLCK); if (IS_ERR(new_fl)) @@ -1361,20 +1396,9 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) spin_lock(&inode->i_lock); - time_out_leases(inode); - - flock = inode->i_flock; - if ((flock == NULL) || !IS_LEASE(flock)) - goto out; + time_out_leases(inode, &dispose); - for (fl = flock; fl && IS_LEASE(fl); fl = fl->fl_next) { - if (leases_conflict(fl, new_fl)) { - lease_conflict = true; - if (fl->fl_owner == current->files) - i_have_this_lease = 1; - } - } - if (!lease_conflict) + if (!any_leases_conflict(inode, new_fl)) goto out; break_time = 0; @@ -1384,7 +1408,9 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) break_time++; /* so that 0 means no break time */ } - for (fl = flock; fl && IS_LEASE(fl); fl = fl->fl_next) { + for (before = &inode->i_flock; + ((fl = *before) != NULL) && IS_LEASE(fl); + before = &fl->fl_next) { if (!leases_conflict(fl, new_fl)) continue; if (want_write) { @@ -1393,51 +1419,56 @@ int __break_lease(struct inode *inode, unsigned int mode, unsigned int type) fl->fl_flags |= FL_UNLOCK_PENDING; fl->fl_break_time = break_time; } else { - if (lease_breaking(flock)) + if (lease_breaking(inode->i_flock)) continue; fl->fl_flags |= FL_DOWNGRADE_PENDING; fl->fl_downgrade_time = break_time; } - fl->fl_lmops->lm_break(fl); + if (fl->fl_lmops->lm_break(fl)) + locks_delete_lock(before, &dispose); } - if (i_have_this_lease || (mode & O_NONBLOCK)) { + fl = inode->i_flock; + if (!fl || !IS_LEASE(fl)) + goto out; + + if (mode & O_NONBLOCK) { trace_break_lease_noblock(inode, new_fl); error = -EWOULDBLOCK; goto out; } restart: - break_time = flock->fl_break_time; + break_time = inode->i_flock->fl_break_time; if (break_time != 0) break_time -= jiffies; if (break_time == 0) break_time++; - locks_insert_block(flock, new_fl); + locks_insert_block(inode->i_flock, new_fl); trace_break_lease_block(inode, new_fl); spin_unlock(&inode->i_lock); + locks_dispose_list(&dispose); error = wait_event_interruptible_timeout(new_fl->fl_wait, !new_fl->fl_next, break_time); spin_lock(&inode->i_lock); trace_break_lease_unblock(inode, new_fl); locks_delete_block(new_fl); if (error >= 0) { - if (error == 0) - time_out_leases(inode); /* * Wait for the next conflicting lease that has not been * broken yet */ - for (flock = inode->i_flock; flock && IS_LEASE(flock); - flock = flock->fl_next) { - if (leases_conflict(new_fl, flock)) - goto restart; - } + if (error == 0) + time_out_leases(inode, &dispose); + if (any_leases_conflict(inode, new_fl)) + goto restart; + error = 0; } out: spin_unlock(&inode->i_lock); + locks_dispose_list(&dispose); locks_free_lock(new_fl); return error; } @@ -1455,8 +1486,18 @@ EXPORT_SYMBOL(__break_lease); */ void lease_get_mtime(struct inode *inode, struct timespec *time) { - struct file_lock *flock = inode->i_flock; - if (flock && IS_LEASE(flock) && (flock->fl_type == F_WRLCK)) + bool has_lease = false; + struct file_lock *flock; + + if (inode->i_flock) { + spin_lock(&inode->i_lock); + flock = inode->i_flock; + if (flock && IS_LEASE(flock) && (flock->fl_type == F_WRLCK)) + has_lease = true; + spin_unlock(&inode->i_lock); + } + + if (has_lease) *time = current_fs_time(inode->i_sb); else *time = inode->i_mtime; @@ -1492,9 +1533,10 @@ int fcntl_getlease(struct file *filp) struct file_lock *fl; struct inode *inode = file_inode(filp); int type = F_UNLCK; + LIST_HEAD(dispose); spin_lock(&inode->i_lock); - time_out_leases(file_inode(filp)); + time_out_leases(file_inode(filp), &dispose); for (fl = file_inode(filp)->i_flock; fl && IS_LEASE(fl); fl = fl->fl_next) { if (fl->fl_file == filp) { @@ -1503,6 +1545,7 @@ int fcntl_getlease(struct file *filp) } } spin_unlock(&inode->i_lock); + locks_dispose_list(&dispose); return type; } @@ -1532,13 +1575,15 @@ check_conflicting_open(const struct dentry *dentry, const long arg) return ret; } -static int generic_add_lease(struct file *filp, long arg, struct file_lock **flp) +static int +generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **priv) { struct file_lock *fl, **before, **my_before = NULL, *lease; struct dentry *dentry = filp->f_path.dentry; struct inode *inode = dentry->d_inode; bool is_deleg = (*flp)->fl_flags & FL_DELEG; int error; + LIST_HEAD(dispose); lease = *flp; trace_generic_add_lease(inode, lease); @@ -1561,6 +1606,8 @@ static int generic_add_lease(struct file *filp, long arg, struct file_lock **flp return -EINVAL; } + spin_lock(&inode->i_lock); + time_out_leases(inode, &dispose); error = check_conflicting_open(dentry, arg); if (error) goto out; @@ -1596,10 +1643,11 @@ static int generic_add_lease(struct file *filp, long arg, struct file_lock **flp } if (my_before != NULL) { - error = lease->fl_lmops->lm_change(my_before, arg); - if (!error) - *flp = *my_before; - goto out; + lease = *my_before; + error = lease->fl_lmops->lm_change(my_before, arg, &dispose); + if (error) + goto out; + goto out_setup; } error = -EINVAL; @@ -1619,43 +1667,61 @@ static int generic_add_lease(struct file *filp, long arg, struct file_lock **flp smp_mb(); error = check_conflicting_open(dentry, arg); if (error) - locks_unlink_lock(before); + goto out_unlink; + +out_setup: + if (lease->fl_lmops->lm_setup) + lease->fl_lmops->lm_setup(lease, priv); out: + spin_unlock(&inode->i_lock); + locks_dispose_list(&dispose); if (is_deleg) mutex_unlock(&inode->i_mutex); + if (!error && !my_before) + *flp = NULL; return error; +out_unlink: + locks_unlink_lock(before); + goto out; } -static int generic_delete_lease(struct file *filp, struct file_lock **flp) +static int generic_delete_lease(struct file *filp) { + int error = -EAGAIN; struct file_lock *fl, **before; struct dentry *dentry = filp->f_path.dentry; struct inode *inode = dentry->d_inode; + LIST_HEAD(dispose); - trace_generic_delete_lease(inode, *flp); - + spin_lock(&inode->i_lock); + time_out_leases(inode, &dispose); for (before = &inode->i_flock; ((fl = *before) != NULL) && IS_LEASE(fl); before = &fl->fl_next) { - if (fl->fl_file != filp) - continue; - return (*flp)->fl_lmops->lm_change(before, F_UNLCK); + if (fl->fl_file == filp) + break; } - return -EAGAIN; + trace_generic_delete_lease(inode, fl); + if (fl) + error = fl->fl_lmops->lm_change(before, F_UNLCK, &dispose); + spin_unlock(&inode->i_lock); + locks_dispose_list(&dispose); + return error; } /** * generic_setlease - sets a lease on an open file - * @filp: file pointer - * @arg: type of lease to obtain - * @flp: input - file_lock to use, output - file_lock inserted + * @filp: file pointer + * @arg: type of lease to obtain + * @flp: input - file_lock to use, output - file_lock inserted + * @priv: private data for lm_setup (may be NULL if lm_setup + * doesn't require it) * * The (input) flp->fl_lmops->lm_break function is required * by break_lease(). - * - * Called with inode->i_lock held. */ -int generic_setlease(struct file *filp, long arg, struct file_lock **flp) +int generic_setlease(struct file *filp, long arg, struct file_lock **flp, + void **priv) { struct dentry *dentry = filp->f_path.dentry; struct inode *inode = dentry->d_inode; @@ -1669,83 +1735,52 @@ int generic_setlease(struct file *filp, long arg, struct file_lock **flp) if (error) return error; - time_out_leases(inode); - - BUG_ON(!(*flp)->fl_lmops->lm_break); - switch (arg) { case F_UNLCK: - return generic_delete_lease(filp, flp); + return generic_delete_lease(filp); case F_RDLCK: case F_WRLCK: - return generic_add_lease(filp, arg, flp); + if (!(*flp)->fl_lmops->lm_break) { + WARN_ON_ONCE(1); + return -ENOLCK; + } + return generic_add_lease(filp, arg, flp, priv); default: return -EINVAL; } } EXPORT_SYMBOL(generic_setlease); -static int __vfs_setlease(struct file *filp, long arg, struct file_lock **lease) -{ - if (filp->f_op->setlease) - return filp->f_op->setlease(filp, arg, lease); - else - return generic_setlease(filp, arg, lease); -} - /** - * vfs_setlease - sets a lease on an open file - * @filp: file pointer - * @arg: type of lease to obtain - * @lease: file_lock to use - * - * Call this to establish a lease on the file. - * The (*lease)->fl_lmops->lm_break operation must be set; if not, - * break_lease will oops! - * - * This will call the filesystem's setlease file method, if - * defined. Note that there is no getlease method; instead, the - * filesystem setlease method should call back to setlease() to - * add a lease to the inode's lease list, where fcntl_getlease() can - * find it. Since fcntl_getlease() only reports whether the current - * task holds a lease, a cluster filesystem need only do this for - * leases held by processes on this node. - * - * There is also no break_lease method; filesystems that - * handle their own leases should break leases themselves from the - * filesystem's open, create, and (on truncate) setattr methods. - * - * Warning: the only current setlease methods exist only to disable - * leases in certain cases. More vfs changes may be required to - * allow a full filesystem lease implementation. + * vfs_setlease - sets a lease on an open file + * @filp: file pointer + * @arg: type of lease to obtain + * @lease: file_lock to use when adding a lease + * @priv: private info for lm_setup when adding a lease (may be + * NULL if lm_setup doesn't require it) + * + * Call this to establish a lease on the file. The "lease" argument is not + * used for F_UNLCK requests and may be NULL. For commands that set or alter + * an existing lease, the (*lease)->fl_lmops->lm_break operation must be set; + * if not, this function will return -ENOLCK (and generate a scary-looking + * stack trace). + * + * The "priv" pointer is passed directly to the lm_setup function as-is. It + * may be NULL if the lm_setup operation doesn't require it. */ - -int vfs_setlease(struct file *filp, long arg, struct file_lock **lease) +int +vfs_setlease(struct file *filp, long arg, struct file_lock **lease, void **priv) { - struct inode *inode = file_inode(filp); - int error; - - spin_lock(&inode->i_lock); - error = __vfs_setlease(filp, arg, lease); - spin_unlock(&inode->i_lock); - - return error; + if (filp->f_op->setlease) + return filp->f_op->setlease(filp, arg, lease, priv); + else + return generic_setlease(filp, arg, lease, priv); } EXPORT_SYMBOL_GPL(vfs_setlease); -static int do_fcntl_delete_lease(struct file *filp) -{ - struct file_lock fl, *flp = &fl; - - lease_init(filp, F_UNLCK, flp); - - return vfs_setlease(filp, F_UNLCK, &flp); -} - static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg) { - struct file_lock *fl, *ret; - struct inode *inode = file_inode(filp); + struct file_lock *fl; struct fasync_struct *new; int error; @@ -1758,26 +1793,9 @@ static int do_fcntl_add_lease(unsigned int fd, struct file *filp, long arg) locks_free_lock(fl); return -ENOMEM; } - ret = fl; - spin_lock(&inode->i_lock); - error = __vfs_setlease(filp, arg, &ret); - if (error) - goto out_unlock; - if (ret == fl) - fl = NULL; - - /* - * fasync_insert_entry() returns the old entry if any. - * If there was no old entry, then it used 'new' and - * inserted it into the fasync list. Clear new so that - * we don't release it here. - */ - if (!fasync_insert_entry(fd, filp, &ret->fl_fasync, new)) - new = NULL; + new->fa_fd = fd; - error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0); -out_unlock: - spin_unlock(&inode->i_lock); + error = vfs_setlease(filp, arg, &fl, (void **)&new); if (fl) locks_free_lock(fl); if (new) @@ -1798,7 +1816,7 @@ out_unlock: int fcntl_setlease(unsigned int fd, struct file *filp, long arg) { if (arg == F_UNLCK) - return do_fcntl_delete_lease(filp); + return vfs_setlease(filp, F_UNLCK, NULL, NULL); return do_fcntl_add_lease(fd, filp, arg); } @@ -1867,9 +1885,12 @@ SYSCALL_DEFINE2(flock, unsigned int, fd, unsigned int, cmd) !(f.file->f_mode & (FMODE_READ|FMODE_WRITE))) goto out_putf; - error = flock_make_lock(f.file, &lock, cmd); - if (error) + lock = flock_make_lock(f.file, cmd); + if (IS_ERR(lock)) { + error = PTR_ERR(lock); goto out_putf; + } + if (can_sleep) lock->fl_flags |= FL_SLEEP; @@ -1981,11 +2002,13 @@ int fcntl_getlk(struct file *filp, unsigned int cmd, struct flock __user *l) if (file_lock.fl_type != F_UNLCK) { error = posix_lock_to_flock(&flock, &file_lock); if (error) - goto out; + goto rel_priv; } error = -EFAULT; if (!copy_to_user(l, &flock, sizeof(flock))) error = 0; +rel_priv: + locks_release_private(&file_lock); out: return error; } @@ -2206,7 +2229,8 @@ int fcntl_getlk64(struct file *filp, unsigned int cmd, struct flock64 __user *l) error = -EFAULT; if (!copy_to_user(l, &flock, sizeof(flock))) error = 0; - + + locks_release_private(&file_lock); out: return error; } @@ -2369,7 +2393,7 @@ void locks_remove_file(struct file *filp) while ((fl = *before) != NULL) { if (fl->fl_file == filp) { if (IS_LEASE(fl)) { - lease_modify(before, F_UNLCK); + lease_modify(before, F_UNLCK, &dispose); continue; } @@ -2593,86 +2617,6 @@ static int __init proc_locks_init(void) module_init(proc_locks_init); #endif -/** - * lock_may_read - checks that the region is free of locks - * @inode: the inode that is being read - * @start: the first byte to read - * @len: the number of bytes to read - * - * Emulates Windows locking requirements. Whole-file - * mandatory locks (share modes) can prohibit a read and - * byte-range POSIX locks can prohibit a read if they overlap. - * - * N.B. this function is only ever called - * from knfsd and ownership of locks is never checked. - */ -int lock_may_read(struct inode *inode, loff_t start, unsigned long len) -{ - struct file_lock *fl; - int result = 1; - - spin_lock(&inode->i_lock); - for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { - if (IS_POSIX(fl)) { - if (fl->fl_type == F_RDLCK) - continue; - if ((fl->fl_end < start) || (fl->fl_start > (start + len))) - continue; - } else if (IS_FLOCK(fl)) { - if (!(fl->fl_type & LOCK_MAND)) - continue; - if (fl->fl_type & LOCK_READ) - continue; - } else - continue; - result = 0; - break; - } - spin_unlock(&inode->i_lock); - return result; -} - -EXPORT_SYMBOL(lock_may_read); - -/** - * lock_may_write - checks that the region is free of locks - * @inode: the inode that is being written - * @start: the first byte to write - * @len: the number of bytes to write - * - * Emulates Windows locking requirements. Whole-file - * mandatory locks (share modes) can prohibit a write and - * byte-range POSIX locks can prohibit a write if they overlap. - * - * N.B. this function is only ever called - * from knfsd and ownership of locks is never checked. - */ -int lock_may_write(struct inode *inode, loff_t start, unsigned long len) -{ - struct file_lock *fl; - int result = 1; - - spin_lock(&inode->i_lock); - for (fl = inode->i_flock; fl != NULL; fl = fl->fl_next) { - if (IS_POSIX(fl)) { - if ((fl->fl_end < start) || (fl->fl_start > (start + len))) - continue; - } else if (IS_FLOCK(fl)) { - if (!(fl->fl_type & LOCK_MAND)) - continue; - if (fl->fl_type & LOCK_WRITE) - continue; - } else - continue; - result = 0; - break; - } - spin_unlock(&inode->i_lock); - return result; -} - -EXPORT_SYMBOL(lock_may_write); - static int __init filelock_init(void) { int i; diff --git a/fs/mount.h b/fs/mount.h index 6740a6215529..f82c62840905 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -21,6 +21,7 @@ struct mnt_pcp { struct mountpoint { struct hlist_node m_hash; struct dentry *m_dentry; + struct hlist_head m_list; int m_count; }; @@ -29,7 +30,10 @@ struct mount { struct mount *mnt_parent; struct dentry *mnt_mountpoint; struct vfsmount mnt; - struct rcu_head mnt_rcu; + union { + struct rcu_head mnt_rcu; + struct llist_node mnt_llist; + }; #ifdef CONFIG_SMP struct mnt_pcp __percpu *mnt_pcp; #else @@ -48,6 +52,7 @@ struct mount { struct mount *mnt_master; /* slave is on master->mnt_slave_list */ struct mnt_namespace *mnt_ns; /* containing namespace */ struct mountpoint *mnt_mp; /* where is it mounted */ + struct hlist_node mnt_mp_list; /* list mounts with the same mountpoint */ #ifdef CONFIG_FSNOTIFY struct hlist_head mnt_fsnotify_marks; __u32 mnt_fsnotify_mask; @@ -82,6 +87,15 @@ extern struct mount *__lookup_mnt_last(struct vfsmount *, struct dentry *); extern bool legitimize_mnt(struct vfsmount *, unsigned); +extern void __detach_mounts(struct dentry *dentry); + +static inline void detach_mounts(struct dentry *dentry) +{ + if (!d_mountpoint(dentry)) + return; + __detach_mounts(dentry); +} + static inline void get_mnt_ns(struct mnt_namespace *ns) { atomic_inc(&ns->count); @@ -112,3 +126,12 @@ struct proc_mounts { #define proc_mounts(p) (container_of((p), struct proc_mounts, m)) extern const struct seq_operations mounts_op; + +extern bool __is_local_mountpoint(struct dentry *dentry); +static inline bool is_local_mountpoint(struct dentry *dentry) +{ + if (!d_mountpoint(dentry)) + return false; + + return __is_local_mountpoint(dentry); +} diff --git a/fs/mpage.c b/fs/mpage.c index 5f9ed622274f..3e79220babac 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -28,6 +28,7 @@ #include <linux/backing-dev.h> #include <linux/pagevec.h> #include <linux/cleancache.h> +#include "internal.h" /* * I/O completion handler for multipage BIOs. @@ -57,6 +58,7 @@ static void mpage_end_io(struct bio *bio, int err) static struct bio *mpage_bio_submit(int rw, struct bio *bio) { bio->bi_end_io = mpage_end_io; + guard_bio_eod(rw, bio); submit_bio(rw, bio); return NULL; } diff --git a/fs/namei.c b/fs/namei.c index a7b05bf82d31..42df664e95e5 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -416,6 +416,7 @@ int __inode_permission(struct inode *inode, int mask) return security_inode_permission(inode, mask); } +EXPORT_SYMBOL(__inode_permission); /** * sb_permission - Check superblock-level permissions @@ -1306,7 +1307,8 @@ static struct dentry *lookup_dcache(struct qstr *name, struct dentry *dir, if (error < 0) { dput(dentry); return ERR_PTR(error); - } else if (!d_invalidate(dentry)) { + } else { + d_invalidate(dentry); dput(dentry); dentry = NULL; } @@ -1435,10 +1437,9 @@ unlazy: dput(dentry); return status; } - if (!d_invalidate(dentry)) { - dput(dentry); - goto need_lookup; - } + d_invalidate(dentry); + dput(dentry); + goto need_lookup; } path->mnt = mnt; @@ -1950,7 +1951,7 @@ static int path_lookupat(int dfd, const char *name, err = path_init(dfd, name, flags | LOOKUP_PARENT, nd, &base); if (unlikely(err)) - return err; + goto out; current->total_link_count = 0; err = link_path_walk(name, nd); @@ -1982,6 +1983,7 @@ static int path_lookupat(int dfd, const char *name, } } +out: if (base) fput(base); @@ -2301,7 +2303,7 @@ path_mountpoint(int dfd, const char *name, struct path *path, unsigned int flags err = path_init(dfd, name, flags | LOOKUP_PARENT, &nd, &base); if (unlikely(err)) - return err; + goto out; current->total_link_count = 0; err = link_path_walk(name, &nd); @@ -2382,22 +2384,17 @@ kern_path_mountpoint(int dfd, const char *name, struct path *path, } EXPORT_SYMBOL(kern_path_mountpoint); -/* - * It's inline, so penalty for filesystems that don't use sticky bit is - * minimal. - */ -static inline int check_sticky(struct inode *dir, struct inode *inode) +int __check_sticky(struct inode *dir, struct inode *inode) { kuid_t fsuid = current_fsuid(); - if (!(dir->i_mode & S_ISVTX)) - return 0; if (uid_eq(inode->i_uid, fsuid)) return 0; if (uid_eq(dir->i_uid, fsuid)) return 0; return !capable_wrt_inode_uidgid(inode, CAP_FOWNER); } +EXPORT_SYMBOL(__check_sticky); /* * Check whether we can remove a link victim from directory dir, check @@ -3063,9 +3060,12 @@ finish_open_created: error = may_open(&nd->path, acc_mode, open_flag); if (error) goto out; - file->f_path.mnt = nd->path.mnt; - error = finish_open(file, nd->path.dentry, NULL, opened); - if (error) { + + BUG_ON(*opened & FILE_OPENED); /* once it's opened, it's opened */ + error = vfs_open(&nd->path, file, current_cred()); + if (!error) { + *opened |= FILE_OPENED; + } else { if (error == -EOPENSTALE) goto stale_open; goto out; @@ -3074,7 +3074,7 @@ opened: error = open_check_o_direct(file); if (error) goto exit_fput; - error = ima_file_check(file, op->acc_mode); + error = ima_file_check(file, op->acc_mode, *opened); if (error) goto exit_fput; @@ -3565,7 +3565,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry) mutex_lock(&dentry->d_inode->i_mutex); error = -EBUSY; - if (d_mountpoint(dentry)) + if (is_local_mountpoint(dentry)) goto out; error = security_inode_rmdir(dir, dentry); @@ -3579,6 +3579,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry) dentry->d_inode->i_flags |= S_DEAD; dont_mount(dentry); + detach_mounts(dentry); out: mutex_unlock(&dentry->d_inode->i_mutex); @@ -3681,7 +3682,7 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry, struct inode **delegate return -EPERM; mutex_lock(&target->i_mutex); - if (d_mountpoint(dentry)) + if (is_local_mountpoint(dentry)) error = -EBUSY; else { error = security_inode_unlink(dir, dentry); @@ -3690,8 +3691,10 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry, struct inode **delegate if (error) goto out; error = dir->i_op->unlink(dir, dentry); - if (!error) + if (!error) { dont_mount(dentry); + detach_mounts(dentry); + } } } out: @@ -4126,7 +4129,7 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, mutex_lock(&target->i_mutex); error = -EBUSY; - if (d_mountpoint(old_dentry) || d_mountpoint(new_dentry)) + if (is_local_mountpoint(old_dentry) || is_local_mountpoint(new_dentry)) goto out; if (max_links && new_dir != old_dir) { @@ -4164,6 +4167,7 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (is_dir) target->i_flags |= S_DEAD; dont_mount(new_dentry); + detach_mounts(new_dentry); } if (!(old_dir->i_sb->s_type->fs_flags & FS_RENAME_DOES_D_MOVE)) { if (!(flags & RENAME_EXCHANGE)) @@ -4205,12 +4209,16 @@ SYSCALL_DEFINE5(renameat2, int, olddfd, const char __user *, oldname, bool should_retry = false; int error; - if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE)) + if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) return -EINVAL; - if ((flags & RENAME_NOREPLACE) && (flags & RENAME_EXCHANGE)) + if ((flags & (RENAME_NOREPLACE | RENAME_WHITEOUT)) && + (flags & RENAME_EXCHANGE)) return -EINVAL; + if ((flags & RENAME_WHITEOUT) && !capable(CAP_MKNOD)) + return -EPERM; + retry: from = user_path_parent(olddfd, oldname, &oldnd, lookup_flags); if (IS_ERR(from)) { @@ -4342,6 +4350,20 @@ SYSCALL_DEFINE2(rename, const char __user *, oldname, const char __user *, newna return sys_renameat2(AT_FDCWD, oldname, AT_FDCWD, newname, 0); } +int vfs_whiteout(struct inode *dir, struct dentry *dentry) +{ + int error = may_create(dir, dentry); + if (error) + return error; + + if (!dir->i_op->mknod) + return -EPERM; + + return dir->i_op->mknod(dir, dentry, + S_IFCHR | WHITEOUT_MODE, WHITEOUT_DEV); +} +EXPORT_SYMBOL(vfs_whiteout); + int readlink_copy(char __user *buffer, int buflen, const char *link) { int len = PTR_ERR(link); diff --git a/fs/namespace.c b/fs/namespace.c index ef42d9bee212..5b66b2b3624d 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -23,6 +23,7 @@ #include <linux/proc_ns.h> #include <linux/magic.h> #include <linux/bootmem.h> +#include <linux/task_work.h> #include "pnode.h" #include "internal.h" @@ -224,6 +225,7 @@ static struct mount *alloc_vfsmnt(const char *name) INIT_LIST_HEAD(&mnt->mnt_share); INIT_LIST_HEAD(&mnt->mnt_slave_list); INIT_LIST_HEAD(&mnt->mnt_slave); + INIT_HLIST_NODE(&mnt->mnt_mp_list); #ifdef CONFIG_FSNOTIFY INIT_HLIST_HEAD(&mnt->mnt_fsnotify_marks); #endif @@ -666,11 +668,45 @@ struct vfsmount *lookup_mnt(struct path *path) return m; } -static struct mountpoint *new_mountpoint(struct dentry *dentry) +/* + * __is_local_mountpoint - Test to see if dentry is a mountpoint in the + * current mount namespace. + * + * The common case is dentries are not mountpoints at all and that + * test is handled inline. For the slow case when we are actually + * dealing with a mountpoint of some kind, walk through all of the + * mounts in the current mount namespace and test to see if the dentry + * is a mountpoint. + * + * The mount_hashtable is not usable in the context because we + * need to identify all mounts that may be in the current mount + * namespace not just a mount that happens to have some specified + * parent mount. + */ +bool __is_local_mountpoint(struct dentry *dentry) +{ + struct mnt_namespace *ns = current->nsproxy->mnt_ns; + struct mount *mnt; + bool is_covered = false; + + if (!d_mountpoint(dentry)) + goto out; + + down_read(&namespace_sem); + list_for_each_entry(mnt, &ns->list, mnt_list) { + is_covered = (mnt->mnt_mountpoint == dentry); + if (is_covered) + break; + } + up_read(&namespace_sem); +out: + return is_covered; +} + +static struct mountpoint *lookup_mountpoint(struct dentry *dentry) { struct hlist_head *chain = mp_hash(dentry); struct mountpoint *mp; - int ret; hlist_for_each_entry(mp, chain, m_hash) { if (mp->m_dentry == dentry) { @@ -681,6 +717,14 @@ static struct mountpoint *new_mountpoint(struct dentry *dentry) return mp; } } + return NULL; +} + +static struct mountpoint *new_mountpoint(struct dentry *dentry) +{ + struct hlist_head *chain = mp_hash(dentry); + struct mountpoint *mp; + int ret; mp = kmalloc(sizeof(struct mountpoint), GFP_KERNEL); if (!mp) @@ -695,6 +739,7 @@ static struct mountpoint *new_mountpoint(struct dentry *dentry) mp->m_dentry = dentry; mp->m_count = 1; hlist_add_head(&mp->m_hash, chain); + INIT_HLIST_HEAD(&mp->m_list); return mp; } @@ -702,6 +747,7 @@ static void put_mountpoint(struct mountpoint *mp) { if (!--mp->m_count) { struct dentry *dentry = mp->m_dentry; + BUG_ON(!hlist_empty(&mp->m_list)); spin_lock(&dentry->d_lock); dentry->d_flags &= ~DCACHE_MOUNTED; spin_unlock(&dentry->d_lock); @@ -748,6 +794,7 @@ static void detach_mnt(struct mount *mnt, struct path *old_path) mnt->mnt_mountpoint = mnt->mnt.mnt_root; list_del_init(&mnt->mnt_child); hlist_del_init_rcu(&mnt->mnt_hash); + hlist_del_init(&mnt->mnt_mp_list); put_mountpoint(mnt->mnt_mp); mnt->mnt_mp = NULL; } @@ -764,6 +811,7 @@ void mnt_set_mountpoint(struct mount *mnt, child_mnt->mnt_mountpoint = dget(mp->m_dentry); child_mnt->mnt_parent = mnt; child_mnt->mnt_mp = mp; + hlist_add_head(&child_mnt->mnt_mp_list, &mp->m_list); } /* @@ -957,6 +1005,46 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, return ERR_PTR(err); } +static void cleanup_mnt(struct mount *mnt) +{ + /* + * This probably indicates that somebody messed + * up a mnt_want/drop_write() pair. If this + * happens, the filesystem was probably unable + * to make r/w->r/o transitions. + */ + /* + * The locking used to deal with mnt_count decrement provides barriers, + * so mnt_get_writers() below is safe. + */ + WARN_ON(mnt_get_writers(mnt)); + if (unlikely(mnt->mnt_pins.first)) + mnt_pin_kill(mnt); + fsnotify_vfsmount_delete(&mnt->mnt); + dput(mnt->mnt.mnt_root); + deactivate_super(mnt->mnt.mnt_sb); + mnt_free_id(mnt); + call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt); +} + +static void __cleanup_mnt(struct rcu_head *head) +{ + cleanup_mnt(container_of(head, struct mount, mnt_rcu)); +} + +static LLIST_HEAD(delayed_mntput_list); +static void delayed_mntput(struct work_struct *unused) +{ + struct llist_node *node = llist_del_all(&delayed_mntput_list); + struct llist_node *next; + + for (; node; node = next) { + next = llist_next(node); + cleanup_mnt(llist_entry(node, struct mount, mnt_llist)); + } +} +static DECLARE_DELAYED_WORK(delayed_mntput_work, delayed_mntput); + static void mntput_no_expire(struct mount *mnt) { rcu_read_lock(); @@ -982,24 +1070,18 @@ static void mntput_no_expire(struct mount *mnt) list_del(&mnt->mnt_instance); unlock_mount_hash(); - /* - * This probably indicates that somebody messed - * up a mnt_want/drop_write() pair. If this - * happens, the filesystem was probably unable - * to make r/w->r/o transitions. - */ - /* - * The locking used to deal with mnt_count decrement provides barriers, - * so mnt_get_writers() below is safe. - */ - WARN_ON(mnt_get_writers(mnt)); - if (unlikely(mnt->mnt_pins.first)) - mnt_pin_kill(mnt); - fsnotify_vfsmount_delete(&mnt->mnt); - dput(mnt->mnt.mnt_root); - deactivate_super(mnt->mnt.mnt_sb); - mnt_free_id(mnt); - call_rcu(&mnt->mnt_rcu, delayed_free_vfsmnt); + if (likely(!(mnt->mnt.mnt_flags & MNT_INTERNAL))) { + struct task_struct *task = current; + if (likely(!(task->flags & PF_KTHREAD))) { + init_task_work(&mnt->mnt_rcu, __cleanup_mnt); + if (!task_work_add(task, &mnt->mnt_rcu, true)) + return; + } + if (llist_add(&mnt->mnt_llist, &delayed_mntput_list)) + schedule_delayed_work(&delayed_mntput_work, 1); + return; + } + cleanup_mnt(mnt); } void mntput(struct vfsmount *mnt) @@ -1272,6 +1354,7 @@ void umount_tree(struct mount *mnt, int how) if (how < 2) p->mnt.mnt_flags |= MNT_SYNC_UMOUNT; if (mnt_has_parent(p)) { + hlist_del_init(&p->mnt_mp_list); put_mountpoint(p->mnt_mp); mnt_add_count(p->mnt_parent, -1); /* move the reference to mountpoint into ->mnt_ex_mountpoint */ @@ -1356,6 +1439,8 @@ static int do_umount(struct mount *mnt, int flags) * Special case for "unmounting" root ... * we just try to remount it readonly. */ + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; down_write(&sb->s_umount); if (!(sb->s_flags & MS_RDONLY)) retval = do_remount_sb(sb, MS_RDONLY, NULL, 0); @@ -1385,6 +1470,37 @@ static int do_umount(struct mount *mnt, int flags) return retval; } +/* + * __detach_mounts - lazily unmount all mounts on the specified dentry + * + * During unlink, rmdir, and d_drop it is possible to loose the path + * to an existing mountpoint, and wind up leaking the mount. + * detach_mounts allows lazily unmounting those mounts instead of + * leaking them. + * + * The caller may hold dentry->d_inode->i_mutex. + */ +void __detach_mounts(struct dentry *dentry) +{ + struct mountpoint *mp; + struct mount *mnt; + + namespace_lock(); + mp = lookup_mountpoint(dentry); + if (!mp) + goto out_unlock; + + lock_mount_hash(); + while (!hlist_empty(&mp->m_list)) { + mnt = hlist_entry(mp->m_list.first, struct mount, mnt_mp_list); + umount_tree(mnt, 2); + } + unlock_mount_hash(); + put_mountpoint(mp); +out_unlock: + namespace_unlock(); +} + /* * Is the caller allowed to modify his namespace? */ @@ -1570,6 +1686,33 @@ void drop_collected_mounts(struct vfsmount *mnt) namespace_unlock(); } +/** + * clone_private_mount - create a private clone of a path + * + * This creates a new vfsmount, which will be the clone of @path. The new will + * not be attached anywhere in the namespace and will be private (i.e. changes + * to the originating mount won't be propagated into this). + * + * Release with mntput(). + */ +struct vfsmount *clone_private_mount(struct path *path) +{ + struct mount *old_mnt = real_mount(path->mnt); + struct mount *new_mnt; + + if (IS_MNT_UNBINDABLE(old_mnt)) + return ERR_PTR(-EINVAL); + + down_read(&namespace_sem); + new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE); + up_read(&namespace_sem); + if (IS_ERR(new_mnt)) + return ERR_CAST(new_mnt); + + return &new_mnt->mnt; +} +EXPORT_SYMBOL_GPL(clone_private_mount); + int iterate_mounts(int (*f)(struct vfsmount *, void *), void *arg, struct vfsmount *root) { @@ -1742,7 +1885,9 @@ retry: namespace_lock(); mnt = lookup_mnt(path); if (likely(!mnt)) { - struct mountpoint *mp = new_mountpoint(dentry); + struct mountpoint *mp = lookup_mountpoint(dentry); + if (!mp) + mp = new_mountpoint(dentry); if (IS_ERR(mp)) { namespace_unlock(); mutex_unlock(&dentry->d_inode->i_mutex); @@ -2398,21 +2543,9 @@ int copy_mount_options(const void __user * data, unsigned long *where) return 0; } -int copy_mount_string(const void __user *data, char **where) +char *copy_mount_string(const void __user *data) { - char *tmp; - - if (!data) { - *where = NULL; - return 0; - } - - tmp = strndup_user(data, PAGE_SIZE); - if (IS_ERR(tmp)) - return PTR_ERR(tmp); - - *where = tmp; - return 0; + return data ? strndup_user(data, PAGE_SIZE) : NULL; } /* @@ -2429,7 +2562,7 @@ int copy_mount_string(const void __user *data, char **where) * Therefore, if this magic number is present, it carries no information * and must be discarded. */ -long do_mount(const char *dev_name, const char *dir_name, +long do_mount(const char *dev_name, const char __user *dir_name, const char *type_page, unsigned long flags, void *data_page) { struct path path; @@ -2441,15 +2574,11 @@ long do_mount(const char *dev_name, const char *dir_name, flags &= ~MS_MGC_MSK; /* Basic sanity checks */ - - if (!dir_name || !*dir_name || !memchr(dir_name, 0, PAGE_SIZE)) - return -EINVAL; - if (data_page) ((char *)data_page)[PAGE_SIZE - 1] = 0; /* ... and get the mountpoint */ - retval = kern_path(dir_name, LOOKUP_FOLLOW, &path); + retval = user_path(dir_name, &path); if (retval) return retval; @@ -2674,37 +2803,30 @@ SYSCALL_DEFINE5(mount, char __user *, dev_name, char __user *, dir_name, { int ret; char *kernel_type; - struct filename *kernel_dir; char *kernel_dev; unsigned long data_page; - ret = copy_mount_string(type, &kernel_type); - if (ret < 0) + kernel_type = copy_mount_string(type); + ret = PTR_ERR(kernel_type); + if (IS_ERR(kernel_type)) goto out_type; - kernel_dir = getname(dir_name); - if (IS_ERR(kernel_dir)) { - ret = PTR_ERR(kernel_dir); - goto out_dir; - } - - ret = copy_mount_string(dev_name, &kernel_dev); - if (ret < 0) + kernel_dev = copy_mount_string(dev_name); + ret = PTR_ERR(kernel_dev); + if (IS_ERR(kernel_dev)) goto out_dev; ret = copy_mount_options(data, &data_page); if (ret < 0) goto out_data; - ret = do_mount(kernel_dev, kernel_dir->name, kernel_type, flags, + ret = do_mount(kernel_dev, dir_name, kernel_type, flags, (void *) data_page); free_page(data_page); out_data: kfree(kernel_dev); out_dev: - putname(kernel_dir); -out_dir: kfree(kernel_type); out_type: return ret; @@ -2820,6 +2942,9 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, /* make sure we can reach put_old from new_root */ if (!is_path_reachable(old_mnt, old.dentry, &new)) goto out4; + /* make certain new is below the root */ + if (!is_path_reachable(new_mnt, new.dentry, &root)) + goto out4; root_mp->m_count++; /* pin it so it won't go away */ lock_mount_hash(); detach_mnt(new_mnt, &parent_path); diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c index 08b8ea8c353e..7cb751dfbeef 100644 --- a/fs/ncpfs/dir.c +++ b/fs/ncpfs/dir.c @@ -388,7 +388,6 @@ static struct dentry * ncp_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos) { struct dentry *dent = dentry; - struct list_head *next; if (d_validate(dent, parent)) { if (dent->d_name.len <= NCP_MAXPATHLEN && @@ -404,9 +403,7 @@ ncp_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos) /* If a pointer is invalid, we search the dentry. */ spin_lock(&parent->d_lock); - next = parent->d_subdirs.next; - while (next != &parent->d_subdirs) { - dent = list_entry(next, struct dentry, d_u.d_child); + list_for_each_entry(dent, &parent->d_subdirs, d_u.d_child) { if ((unsigned long)dent->d_fsdata == fpos) { if (dent->d_inode) dget(dent); @@ -415,7 +412,6 @@ ncp_dget_fpos(struct dentry *dentry, struct dentry *parent, unsigned long fpos) spin_unlock(&parent->d_lock); goto out; } - next = next->next; } spin_unlock(&parent->d_lock); return NULL; @@ -1182,9 +1178,6 @@ static int day_n[] = {0, 31, 59, 90, 120, 151, 181, 212, 243, 273, 304, 334, 0, 0, 0, 0}; /* Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec */ - -extern struct timezone sys_tz; - static int utc2local(int time) { return time - sys_tz.tz_minuteswest * 60; diff --git a/fs/ncpfs/ncplib_kernel.h b/fs/ncpfs/ncplib_kernel.h index 32c06587351a..52cb19d66ecb 100644 --- a/fs/ncpfs/ncplib_kernel.h +++ b/fs/ncpfs/ncplib_kernel.h @@ -188,20 +188,14 @@ static inline void ncp_renew_dentries(struct dentry *parent) { struct ncp_server *server = NCP_SERVER(parent->d_inode); - struct list_head *next; struct dentry *dentry; spin_lock(&parent->d_lock); - next = parent->d_subdirs.next; - while (next != &parent->d_subdirs) { - dentry = list_entry(next, struct dentry, d_u.d_child); - + list_for_each_entry(dentry, &parent->d_subdirs, d_u.d_child) { if (dentry->d_fsdata == NULL) ncp_age_dentry(server, dentry); else ncp_new_dentry(dentry); - - next = next->next; } spin_unlock(&parent->d_lock); } @@ -210,16 +204,12 @@ static inline void ncp_invalidate_dircache_entries(struct dentry *parent) { struct ncp_server *server = NCP_SERVER(parent->d_inode); - struct list_head *next; struct dentry *dentry; spin_lock(&parent->d_lock); - next = parent->d_subdirs.next; - while (next != &parent->d_subdirs) { - dentry = list_entry(next, struct dentry, d_u.d_child); + list_for_each_entry(dentry, &parent->d_subdirs, d_u.d_child) { dentry->d_fsdata = NULL; ncp_age_dentry(server, dentry); - next = next->next; } spin_unlock(&parent->d_lock); } diff --git a/fs/nfs/Makefile b/fs/nfs/Makefile index 4782e0840dcc..04cb830fa09f 100644 --- a/fs/nfs/Makefile +++ b/fs/nfs/Makefile @@ -28,6 +28,7 @@ nfsv4-y := nfs4proc.o nfs4xdr.o nfs4state.o nfs4renewd.o nfs4super.o nfs4file.o nfsv4-$(CONFIG_NFS_USE_LEGACY_DNS) += cache_lib.o nfsv4-$(CONFIG_SYSCTL) += nfs4sysctl.o nfsv4-$(CONFIG_NFS_V4_1) += pnfs.o pnfs_dev.o +nfsv4-$(CONFIG_NFS_V4_2) += nfs42proc.o obj-$(CONFIG_PNFS_FILE_LAYOUT) += filelayout/ obj-$(CONFIG_PNFS_OBJLAYOUT) += objlayout/ diff --git a/fs/nfs/blocklayout/Makefile b/fs/nfs/blocklayout/Makefile index d5815505c020..3ca14c36d08b 100644 --- a/fs/nfs/blocklayout/Makefile +++ b/fs/nfs/blocklayout/Makefile @@ -2,4 +2,5 @@ # Makefile for the pNFS block layout driver kernel module # obj-$(CONFIG_PNFS_BLOCK) += blocklayoutdriver.o -blocklayoutdriver-objs := blocklayout.o extents.o blocklayoutdev.o blocklayoutdm.o + +blocklayoutdriver-y += blocklayout.o dev.o extent_tree.o rpc_pipefs.o diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index cbb1797149d5..5228f201d3d5 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -35,7 +35,6 @@ #include <linux/mount.h> #include <linux/namei.h> #include <linux/bio.h> /* struct bio */ -#include <linux/buffer_head.h> /* various write calls */ #include <linux/prefetch.h> #include <linux/pagevec.h> @@ -50,40 +49,16 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Andy Adamson <andros@citi.umich.edu>"); MODULE_DESCRIPTION("The NFSv4.1 pNFS Block layout driver"); -static void print_page(struct page *page) +static bool is_hole(struct pnfs_block_extent *be) { - dprintk("PRINTPAGE page %p\n", page); - dprintk(" PagePrivate %d\n", PagePrivate(page)); - dprintk(" PageUptodate %d\n", PageUptodate(page)); - dprintk(" PageError %d\n", PageError(page)); - dprintk(" PageDirty %d\n", PageDirty(page)); - dprintk(" PageReferenced %d\n", PageReferenced(page)); - dprintk(" PageLocked %d\n", PageLocked(page)); - dprintk(" PageWriteback %d\n", PageWriteback(page)); - dprintk(" PageMappedToDisk %d\n", PageMappedToDisk(page)); - dprintk("\n"); -} - -/* Given the be associated with isect, determine if page data needs to be - * initialized. - */ -static int is_hole(struct pnfs_block_extent *be, sector_t isect) -{ - if (be->be_state == PNFS_BLOCK_NONE_DATA) - return 1; - else if (be->be_state != PNFS_BLOCK_INVALID_DATA) - return 0; - else - return !bl_is_sector_init(be->be_inval, isect); -} - -/* Given the be associated with isect, determine if page data can be - * written to disk. - */ -static int is_writable(struct pnfs_block_extent *be, sector_t isect) -{ - return (be->be_state == PNFS_BLOCK_READWRITE_DATA || - be->be_state == PNFS_BLOCK_INVALID_DATA); + switch (be->be_state) { + case PNFS_BLOCK_NONE_DATA: + return true; + case PNFS_BLOCK_INVALID_DATA: + return be->be_tag ? false : true; + default: + return false; + } } /* The data we are handed might be spread across several bios. We need @@ -91,9 +66,8 @@ static int is_writable(struct pnfs_block_extent *be, sector_t isect) */ struct parallel_io { struct kref refcnt; - void (*pnfs_callback) (void *data, int num_se); + void (*pnfs_callback) (void *data); void *data; - int bse_count; }; static inline struct parallel_io *alloc_parallel(void *data) @@ -104,7 +78,6 @@ static inline struct parallel_io *alloc_parallel(void *data) if (rv) { rv->data = data; kref_init(&rv->refcnt); - rv->bse_count = 0; } return rv; } @@ -119,7 +92,7 @@ static void destroy_parallel(struct kref *kref) struct parallel_io *p = container_of(kref, struct parallel_io, refcnt); dprintk("%s enter\n", __func__); - p->pnfs_callback(p->data, p->bse_count); + p->pnfs_callback(p->data); kfree(p); } @@ -141,10 +114,9 @@ bl_submit_bio(int rw, struct bio *bio) return NULL; } -static struct bio *bl_alloc_init_bio(int npg, sector_t isect, - struct pnfs_block_extent *be, - void (*end_io)(struct bio *, int err), - struct parallel_io *par) +static struct bio * +bl_alloc_init_bio(int npg, struct block_device *bdev, sector_t disk_sector, + void (*end_io)(struct bio *, int err), struct parallel_io *par) { struct bio *bio; @@ -156,58 +128,64 @@ static struct bio *bl_alloc_init_bio(int npg, sector_t isect, } if (bio) { - bio->bi_iter.bi_sector = isect - be->be_f_offset + - be->be_v_offset; - bio->bi_bdev = be->be_mdev; + bio->bi_iter.bi_sector = disk_sector; + bio->bi_bdev = bdev; bio->bi_end_io = end_io; bio->bi_private = par; } return bio; } -static struct bio *do_add_page_to_bio(struct bio *bio, int npg, int rw, - sector_t isect, struct page *page, - struct pnfs_block_extent *be, - void (*end_io)(struct bio *, int err), - struct parallel_io *par, - unsigned int offset, int len) +static struct bio * +do_add_page_to_bio(struct bio *bio, int npg, int rw, sector_t isect, + struct page *page, struct pnfs_block_dev_map *map, + struct pnfs_block_extent *be, + void (*end_io)(struct bio *, int err), + struct parallel_io *par, unsigned int offset, int *len) { - isect = isect + (offset >> SECTOR_SHIFT); + struct pnfs_block_dev *dev = + container_of(be->be_device, struct pnfs_block_dev, node); + u64 disk_addr, end; + dprintk("%s: npg %d rw %d isect %llu offset %u len %d\n", __func__, - npg, rw, (unsigned long long)isect, offset, len); + npg, rw, (unsigned long long)isect, offset, *len); + + /* translate to device offset */ + isect += be->be_v_offset; + isect -= be->be_f_offset; + + /* translate to physical disk offset */ + disk_addr = (u64)isect << SECTOR_SHIFT; + if (disk_addr < map->start || disk_addr >= map->start + map->len) { + if (!dev->map(dev, disk_addr, map)) + return ERR_PTR(-EIO); + bio = bl_submit_bio(rw, bio); + } + disk_addr += map->disk_offset; + disk_addr -= map->start; + + /* limit length to what the device mapping allows */ + end = disk_addr + *len; + if (end >= map->start + map->len) + *len = map->start + map->len - disk_addr; + retry: if (!bio) { - bio = bl_alloc_init_bio(npg, isect, be, end_io, par); + bio = bl_alloc_init_bio(npg, map->bdev, + disk_addr >> SECTOR_SHIFT, end_io, par); if (!bio) return ERR_PTR(-ENOMEM); } - if (bio_add_page(bio, page, len, offset) < len) { + if (bio_add_page(bio, page, *len, offset) < *len) { bio = bl_submit_bio(rw, bio); goto retry; } return bio; } -static struct bio *bl_add_page_to_bio(struct bio *bio, int npg, int rw, - sector_t isect, struct page *page, - struct pnfs_block_extent *be, - void (*end_io)(struct bio *, int err), - struct parallel_io *par) -{ - return do_add_page_to_bio(bio, npg, rw, isect, page, be, - end_io, par, 0, PAGE_CACHE_SIZE); -} - -/* This is basically copied from mpage_end_io_read */ static void bl_end_io_read(struct bio *bio, int err) { struct parallel_io *par = bio->bi_private; - struct bio_vec *bvec; - int i; - - if (!err) - bio_for_each_segment_all(bvec, bio, i) - SetPageUptodate(bvec->bv_page); if (err) { struct nfs_pgio_header *header = par->data; @@ -216,6 +194,7 @@ static void bl_end_io_read(struct bio *bio, int err) header->pnfs_error = -EIO; pnfs_set_lo_fail(header->lseg); } + bio_put(bio); put_parallel(par); } @@ -231,7 +210,7 @@ static void bl_read_cleanup(struct work_struct *work) } static void -bl_end_par_io_read(void *data, int unused) +bl_end_par_io_read(void *data) { struct nfs_pgio_header *hdr = data; @@ -241,88 +220,78 @@ bl_end_par_io_read(void *data, int unused) } static enum pnfs_try_status -bl_read_pagelist(struct nfs_pgio_header *hdr) +bl_read_pagelist(struct nfs_pgio_header *header) { - struct nfs_pgio_header *header = hdr; - int i, hole; + struct pnfs_block_layout *bl = BLK_LSEG2EXT(header->lseg); + struct pnfs_block_dev_map map = { .start = NFS4_MAX_UINT64 }; struct bio *bio = NULL; - struct pnfs_block_extent *be = NULL, *cow_read = NULL; + struct pnfs_block_extent be; sector_t isect, extent_length = 0; struct parallel_io *par; - loff_t f_offset = hdr->args.offset; - size_t bytes_left = hdr->args.count; + loff_t f_offset = header->args.offset; + size_t bytes_left = header->args.count; unsigned int pg_offset, pg_len; - struct page **pages = hdr->args.pages; - int pg_index = hdr->args.pgbase >> PAGE_CACHE_SHIFT; + struct page **pages = header->args.pages; + int pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT; const bool is_dio = (header->dreq != NULL); + struct blk_plug plug; + int i; dprintk("%s enter nr_pages %u offset %lld count %u\n", __func__, - hdr->page_array.npages, f_offset, - (unsigned int)hdr->args.count); + header->page_array.npages, f_offset, + (unsigned int)header->args.count); - par = alloc_parallel(hdr); + par = alloc_parallel(header); if (!par) - goto use_mds; + return PNFS_NOT_ATTEMPTED; par->pnfs_callback = bl_end_par_io_read; - /* At this point, we can no longer jump to use_mds */ + + blk_start_plug(&plug); isect = (sector_t) (f_offset >> SECTOR_SHIFT); /* Code assumes extents are page-aligned */ - for (i = pg_index; i < hdr->page_array.npages; i++) { - if (!extent_length) { + for (i = pg_index; i < header->page_array.npages; i++) { + if (extent_length <= 0) { /* We've used up the previous extent */ - bl_put_extent(be); - bl_put_extent(cow_read); bio = bl_submit_bio(READ, bio); + /* Get the next one */ - be = bl_find_get_extent(BLK_LSEG2EXT(header->lseg), - isect, &cow_read); - if (!be) { + if (!ext_tree_lookup(bl, isect, &be, false)) { header->pnfs_error = -EIO; goto out; } - extent_length = be->be_length - - (isect - be->be_f_offset); - if (cow_read) { - sector_t cow_length = cow_read->be_length - - (isect - cow_read->be_f_offset); - extent_length = min(extent_length, cow_length); - } + extent_length = be.be_length - (isect - be.be_f_offset); } + pg_offset = f_offset & ~PAGE_CACHE_MASK; if (is_dio) { - pg_offset = f_offset & ~PAGE_CACHE_MASK; if (pg_offset + bytes_left > PAGE_CACHE_SIZE) pg_len = PAGE_CACHE_SIZE - pg_offset; else pg_len = bytes_left; - - f_offset += pg_len; - bytes_left -= pg_len; - isect += (pg_offset >> SECTOR_SHIFT); } else { - pg_offset = 0; + BUG_ON(pg_offset != 0); pg_len = PAGE_CACHE_SIZE; } - hole = is_hole(be, isect); - if (hole && !cow_read) { + isect += (pg_offset >> SECTOR_SHIFT); + extent_length -= (pg_offset >> SECTOR_SHIFT); + + if (is_hole(&be)) { bio = bl_submit_bio(READ, bio); /* Fill hole w/ zeroes w/o accessing device */ dprintk("%s Zeroing page for hole\n", __func__); zero_user_segment(pages[i], pg_offset, pg_len); - print_page(pages[i]); - SetPageUptodate(pages[i]); - } else { - struct pnfs_block_extent *be_read; - be_read = (hole && cow_read) ? cow_read : be; + /* invalidate map */ + map.start = NFS4_MAX_UINT64; + } else { bio = do_add_page_to_bio(bio, - hdr->page_array.npages - i, + header->page_array.npages - i, READ, - isect, pages[i], be_read, + isect, pages[i], &map, &be, bl_end_io_read, par, - pg_offset, pg_len); + pg_offset, &pg_len); if (IS_ERR(bio)) { header->pnfs_error = PTR_ERR(bio); bio = NULL; @@ -330,75 +299,21 @@ bl_read_pagelist(struct nfs_pgio_header *hdr) } } isect += (pg_len >> SECTOR_SHIFT); - extent_length -= PAGE_CACHE_SECTORS; + extent_length -= (pg_len >> SECTOR_SHIFT); + f_offset += pg_len; + bytes_left -= pg_len; } if ((isect << SECTOR_SHIFT) >= header->inode->i_size) { - hdr->res.eof = 1; - hdr->res.count = header->inode->i_size - hdr->args.offset; + header->res.eof = 1; + header->res.count = header->inode->i_size - header->args.offset; } else { - hdr->res.count = (isect << SECTOR_SHIFT) - hdr->args.offset; + header->res.count = (isect << SECTOR_SHIFT) - header->args.offset; } out: - bl_put_extent(be); - bl_put_extent(cow_read); bl_submit_bio(READ, bio); + blk_finish_plug(&plug); put_parallel(par); return PNFS_ATTEMPTED; - - use_mds: - dprintk("Giving up and using normal NFS\n"); - return PNFS_NOT_ATTEMPTED; -} - -static void mark_extents_written(struct pnfs_block_layout *bl, - __u64 offset, __u32 count) -{ - sector_t isect, end; - struct pnfs_block_extent *be; - struct pnfs_block_short_extent *se; - - dprintk("%s(%llu, %u)\n", __func__, offset, count); - if (count == 0) - return; - isect = (offset & (long)(PAGE_CACHE_MASK)) >> SECTOR_SHIFT; - end = (offset + count + PAGE_CACHE_SIZE - 1) & (long)(PAGE_CACHE_MASK); - end >>= SECTOR_SHIFT; - while (isect < end) { - sector_t len; - be = bl_find_get_extent(bl, isect, NULL); - BUG_ON(!be); /* FIXME */ - len = min(end, be->be_f_offset + be->be_length) - isect; - if (be->be_state == PNFS_BLOCK_INVALID_DATA) { - se = bl_pop_one_short_extent(be->be_inval); - BUG_ON(!se); - bl_mark_for_commit(be, isect, len, se); - } - isect += len; - bl_put_extent(be); - } -} - -static void bl_end_io_write_zero(struct bio *bio, int err) -{ - struct parallel_io *par = bio->bi_private; - struct bio_vec *bvec; - int i; - - bio_for_each_segment_all(bvec, bio, i) { - /* This is the zeroing page we added */ - end_page_writeback(bvec->bv_page); - page_cache_release(bvec->bv_page); - } - - if (unlikely(err)) { - struct nfs_pgio_header *header = par->data; - - if (!header->pnfs_error) - header->pnfs_error = -EIO; - pnfs_set_lo_fail(header->lseg); - } - bio_put(bio); - put_parallel(par); } static void bl_end_io_write(struct bio *bio, int err) @@ -421,533 +336,118 @@ static void bl_end_io_write(struct bio *bio, int err) */ static void bl_write_cleanup(struct work_struct *work) { - struct rpc_task *task; - struct nfs_pgio_header *hdr; + struct rpc_task *task = container_of(work, struct rpc_task, u.tk_work); + struct nfs_pgio_header *hdr = + container_of(task, struct nfs_pgio_header, task); + dprintk("%s enter\n", __func__); - task = container_of(work, struct rpc_task, u.tk_work); - hdr = container_of(task, struct nfs_pgio_header, task); + if (likely(!hdr->pnfs_error)) { - /* Marks for LAYOUTCOMMIT */ - mark_extents_written(BLK_LSEG2EXT(hdr->lseg), - hdr->args.offset, hdr->args.count); + struct pnfs_block_layout *bl = BLK_LSEG2EXT(hdr->lseg); + u64 start = hdr->args.offset & (loff_t)PAGE_CACHE_MASK; + u64 end = (hdr->args.offset + hdr->args.count + + PAGE_CACHE_SIZE - 1) & (loff_t)PAGE_CACHE_MASK; + + ext_tree_mark_written(bl, start >> SECTOR_SHIFT, + (end - start) >> SECTOR_SHIFT); } + pnfs_ld_write_done(hdr); } /* Called when last of bios associated with a bl_write_pagelist call finishes */ -static void bl_end_par_io_write(void *data, int num_se) +static void bl_end_par_io_write(void *data) { struct nfs_pgio_header *hdr = data; - if (unlikely(hdr->pnfs_error)) { - bl_free_short_extents(&BLK_LSEG2EXT(hdr->lseg)->bl_inval, - num_se); - } - hdr->task.tk_status = hdr->pnfs_error; hdr->verf.committed = NFS_FILE_SYNC; INIT_WORK(&hdr->task.u.tk_work, bl_write_cleanup); schedule_work(&hdr->task.u.tk_work); } -/* FIXME STUB - mark intersection of layout and page as bad, so is not - * used again. - */ -static void mark_bad_read(void) -{ - return; -} - -/* - * map_block: map a requested I/0 block (isect) into an offset in the LVM - * block_device - */ -static void -map_block(struct buffer_head *bh, sector_t isect, struct pnfs_block_extent *be) -{ - dprintk("%s enter be=%p\n", __func__, be); - - set_buffer_mapped(bh); - bh->b_bdev = be->be_mdev; - bh->b_blocknr = (isect - be->be_f_offset + be->be_v_offset) >> - (be->be_mdev->bd_inode->i_blkbits - SECTOR_SHIFT); - - dprintk("%s isect %llu, bh->b_blocknr %ld, using bsize %Zd\n", - __func__, (unsigned long long)isect, (long)bh->b_blocknr, - bh->b_size); - return; -} - -static void -bl_read_single_end_io(struct bio *bio, int error) -{ - struct bio_vec *bvec = bio->bi_io_vec + bio->bi_vcnt - 1; - struct page *page = bvec->bv_page; - - /* Only one page in bvec */ - unlock_page(page); -} - -static int -bl_do_readpage_sync(struct page *page, struct pnfs_block_extent *be, - unsigned int offset, unsigned int len) -{ - struct bio *bio; - struct page *shadow_page; - sector_t isect; - char *kaddr, *kshadow_addr; - int ret = 0; - - dprintk("%s: offset %u len %u\n", __func__, offset, len); - - shadow_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM); - if (shadow_page == NULL) - return -ENOMEM; - - bio = bio_alloc(GFP_NOIO, 1); - if (bio == NULL) - return -ENOMEM; - - isect = (page->index << PAGE_CACHE_SECTOR_SHIFT) + - (offset / SECTOR_SIZE); - - bio->bi_iter.bi_sector = isect - be->be_f_offset + be->be_v_offset; - bio->bi_bdev = be->be_mdev; - bio->bi_end_io = bl_read_single_end_io; - - lock_page(shadow_page); - if (bio_add_page(bio, shadow_page, - SECTOR_SIZE, round_down(offset, SECTOR_SIZE)) == 0) { - unlock_page(shadow_page); - bio_put(bio); - return -EIO; - } - - submit_bio(READ, bio); - wait_on_page_locked(shadow_page); - if (unlikely(!test_bit(BIO_UPTODATE, &bio->bi_flags))) { - ret = -EIO; - } else { - kaddr = kmap_atomic(page); - kshadow_addr = kmap_atomic(shadow_page); - memcpy(kaddr + offset, kshadow_addr + offset, len); - kunmap_atomic(kshadow_addr); - kunmap_atomic(kaddr); - } - __free_page(shadow_page); - bio_put(bio); - - return ret; -} - -static int -bl_read_partial_page_sync(struct page *page, struct pnfs_block_extent *be, - unsigned int dirty_offset, unsigned int dirty_len, - bool full_page) -{ - int ret = 0; - unsigned int start, end; - - if (full_page) { - start = 0; - end = PAGE_CACHE_SIZE; - } else { - start = round_down(dirty_offset, SECTOR_SIZE); - end = round_up(dirty_offset + dirty_len, SECTOR_SIZE); - } - - dprintk("%s: offset %u len %d\n", __func__, dirty_offset, dirty_len); - if (!be) { - zero_user_segments(page, start, dirty_offset, - dirty_offset + dirty_len, end); - if (start == 0 && end == PAGE_CACHE_SIZE && - trylock_page(page)) { - SetPageUptodate(page); - unlock_page(page); - } - return ret; - } - - if (start != dirty_offset) - ret = bl_do_readpage_sync(page, be, start, dirty_offset - start); - - if (!ret && (dirty_offset + dirty_len < end)) - ret = bl_do_readpage_sync(page, be, dirty_offset + dirty_len, - end - dirty_offset - dirty_len); - - return ret; -} - -/* Given an unmapped page, zero it or read in page for COW, page is locked - * by caller. - */ -static int -init_page_for_write(struct page *page, struct pnfs_block_extent *cow_read) -{ - struct buffer_head *bh = NULL; - int ret = 0; - sector_t isect; - - dprintk("%s enter, %p\n", __func__, page); - BUG_ON(PageUptodate(page)); - if (!cow_read) { - zero_user_segment(page, 0, PAGE_SIZE); - SetPageUptodate(page); - goto cleanup; - } - - bh = alloc_page_buffers(page, PAGE_CACHE_SIZE, 0); - if (!bh) { - ret = -ENOMEM; - goto cleanup; - } - - isect = (sector_t) page->index << PAGE_CACHE_SECTOR_SHIFT; - map_block(bh, isect, cow_read); - if (!bh_uptodate_or_lock(bh)) - ret = bh_submit_read(bh); - if (ret) - goto cleanup; - SetPageUptodate(page); - -cleanup: - if (bh) - free_buffer_head(bh); - if (ret) { - /* Need to mark layout with bad read...should now - * just use nfs4 for reads and writes. - */ - mark_bad_read(); - } - return ret; -} - -/* Find or create a zeroing page marked being writeback. - * Return ERR_PTR on error, NULL to indicate skip this page and page itself - * to indicate write out. - */ -static struct page * -bl_find_get_zeroing_page(struct inode *inode, pgoff_t index, - struct pnfs_block_extent *cow_read) -{ - struct page *page; - int locked = 0; - page = find_get_page(inode->i_mapping, index); - if (page) - goto check_page; - - page = find_or_create_page(inode->i_mapping, index, GFP_NOFS); - if (unlikely(!page)) { - dprintk("%s oom\n", __func__); - return ERR_PTR(-ENOMEM); - } - locked = 1; - -check_page: - /* PageDirty: Other will write this out - * PageWriteback: Other is writing this out - * PageUptodate: It was read before - */ - if (PageDirty(page) || PageWriteback(page)) { - print_page(page); - if (locked) - unlock_page(page); - page_cache_release(page); - return NULL; - } - - if (!locked) { - lock_page(page); - locked = 1; - goto check_page; - } - if (!PageUptodate(page)) { - /* New page, readin or zero it */ - init_page_for_write(page, cow_read); - } - set_page_writeback(page); - unlock_page(page); - - return page; -} - static enum pnfs_try_status bl_write_pagelist(struct nfs_pgio_header *header, int sync) { - int i, ret, npg_zero, pg_index, last = 0; + struct pnfs_block_layout *bl = BLK_LSEG2EXT(header->lseg); + struct pnfs_block_dev_map map = { .start = NFS4_MAX_UINT64 }; struct bio *bio = NULL; - struct pnfs_block_extent *be = NULL, *cow_read = NULL; - sector_t isect, last_isect = 0, extent_length = 0; + struct pnfs_block_extent be; + sector_t isect, extent_length = 0; struct parallel_io *par = NULL; loff_t offset = header->args.offset; size_t count = header->args.count; - unsigned int pg_offset, pg_len, saved_len; struct page **pages = header->args.pages; - struct page *page; - pgoff_t index; - u64 temp; - int npg_per_block = - NFS_SERVER(header->inode)->pnfs_blksize >> PAGE_CACHE_SHIFT; + int pg_index = pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT; + unsigned int pg_len; + struct blk_plug plug; + int i; dprintk("%s enter, %Zu@%lld\n", __func__, count, offset); - if (header->dreq != NULL && - (!IS_ALIGNED(offset, NFS_SERVER(header->inode)->pnfs_blksize) || - !IS_ALIGNED(count, NFS_SERVER(header->inode)->pnfs_blksize))) { - dprintk("pnfsblock nonblock aligned DIO writes. Resend MDS\n"); - goto out_mds; - } /* At this point, header->page_aray is a (sequential) list of nfs_pages. * We want to write each, and if there is an error set pnfs_error * to have it redone using nfs. */ par = alloc_parallel(header); if (!par) - goto out_mds; + return PNFS_NOT_ATTEMPTED; par->pnfs_callback = bl_end_par_io_write; - /* At this point, have to be more careful with error handling */ - isect = (sector_t) ((offset & (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT); - be = bl_find_get_extent(BLK_LSEG2EXT(header->lseg), isect, &cow_read); - if (!be || !is_writable(be, isect)) { - dprintk("%s no matching extents!\n", __func__); - goto out_mds; - } + blk_start_plug(&plug); - /* First page inside INVALID extent */ - if (be->be_state == PNFS_BLOCK_INVALID_DATA) { - if (likely(!bl_push_one_short_extent(be->be_inval))) - par->bse_count++; - else - goto out_mds; - temp = offset >> PAGE_CACHE_SHIFT; - npg_zero = do_div(temp, npg_per_block); - isect = (sector_t) (((offset - npg_zero * PAGE_CACHE_SIZE) & - (long)PAGE_CACHE_MASK) >> SECTOR_SHIFT); - extent_length = be->be_length - (isect - be->be_f_offset); - -fill_invalid_ext: - dprintk("%s need to zero %d pages\n", __func__, npg_zero); - for (;npg_zero > 0; npg_zero--) { - if (bl_is_sector_init(be->be_inval, isect)) { - dprintk("isect %llu already init\n", - (unsigned long long)isect); - goto next_page; - } - /* page ref released in bl_end_io_write_zero */ - index = isect >> PAGE_CACHE_SECTOR_SHIFT; - dprintk("%s zero %dth page: index %lu isect %llu\n", - __func__, npg_zero, index, - (unsigned long long)isect); - page = bl_find_get_zeroing_page(header->inode, index, - cow_read); - if (unlikely(IS_ERR(page))) { - header->pnfs_error = PTR_ERR(page); - goto out; - } else if (page == NULL) - goto next_page; - - ret = bl_mark_sectors_init(be->be_inval, isect, - PAGE_CACHE_SECTORS); - if (unlikely(ret)) { - dprintk("%s bl_mark_sectors_init fail %d\n", - __func__, ret); - end_page_writeback(page); - page_cache_release(page); - header->pnfs_error = ret; - goto out; - } - if (likely(!bl_push_one_short_extent(be->be_inval))) - par->bse_count++; - else { - end_page_writeback(page); - page_cache_release(page); - header->pnfs_error = -ENOMEM; - goto out; - } - /* FIXME: This should be done in bi_end_io */ - mark_extents_written(BLK_LSEG2EXT(header->lseg), - page->index << PAGE_CACHE_SHIFT, - PAGE_CACHE_SIZE); - - bio = bl_add_page_to_bio(bio, npg_zero, WRITE, - isect, page, be, - bl_end_io_write_zero, par); - if (IS_ERR(bio)) { - header->pnfs_error = PTR_ERR(bio); - bio = NULL; - goto out; - } -next_page: - isect += PAGE_CACHE_SECTORS; - extent_length -= PAGE_CACHE_SECTORS; - } - if (last) - goto write_done; - } - bio = bl_submit_bio(WRITE, bio); + /* we always write out the whole page */ + offset = offset & (loff_t)PAGE_CACHE_MASK; + isect = offset >> SECTOR_SHIFT; - /* Middle pages */ - pg_index = header->args.pgbase >> PAGE_CACHE_SHIFT; for (i = pg_index; i < header->page_array.npages; i++) { - if (!extent_length) { + if (extent_length <= 0) { /* We've used up the previous extent */ - bl_put_extent(be); - bl_put_extent(cow_read); bio = bl_submit_bio(WRITE, bio); /* Get the next one */ - be = bl_find_get_extent(BLK_LSEG2EXT(header->lseg), - isect, &cow_read); - if (!be || !is_writable(be, isect)) { + if (!ext_tree_lookup(bl, isect, &be, true)) { header->pnfs_error = -EINVAL; goto out; } - if (be->be_state == PNFS_BLOCK_INVALID_DATA) { - if (likely(!bl_push_one_short_extent( - be->be_inval))) - par->bse_count++; - else { - header->pnfs_error = -ENOMEM; - goto out; - } - } - extent_length = be->be_length - - (isect - be->be_f_offset); - } - - dprintk("%s offset %lld count %Zu\n", __func__, offset, count); - pg_offset = offset & ~PAGE_CACHE_MASK; - if (pg_offset + count > PAGE_CACHE_SIZE) - pg_len = PAGE_CACHE_SIZE - pg_offset; - else - pg_len = count; - - saved_len = pg_len; - if (be->be_state == PNFS_BLOCK_INVALID_DATA && - !bl_is_sector_init(be->be_inval, isect)) { - ret = bl_read_partial_page_sync(pages[i], cow_read, - pg_offset, pg_len, true); - if (ret) { - dprintk("%s bl_read_partial_page_sync fail %d\n", - __func__, ret); - header->pnfs_error = ret; - goto out; - } - - ret = bl_mark_sectors_init(be->be_inval, isect, - PAGE_CACHE_SECTORS); - if (unlikely(ret)) { - dprintk("%s bl_mark_sectors_init fail %d\n", - __func__, ret); - header->pnfs_error = ret; - goto out; - } - /* Expand to full page write */ - pg_offset = 0; - pg_len = PAGE_CACHE_SIZE; - } else if ((pg_offset & (SECTOR_SIZE - 1)) || - (pg_len & (SECTOR_SIZE - 1))){ - /* ahh, nasty case. We have to do sync full sector - * read-modify-write cycles. - */ - unsigned int saved_offset = pg_offset; - ret = bl_read_partial_page_sync(pages[i], be, pg_offset, - pg_len, false); - pg_offset = round_down(pg_offset, SECTOR_SIZE); - pg_len = round_up(saved_offset + pg_len, SECTOR_SIZE) - - pg_offset; + extent_length = be.be_length - (isect - be.be_f_offset); } - + pg_len = PAGE_CACHE_SIZE; bio = do_add_page_to_bio(bio, header->page_array.npages - i, - WRITE, - isect, pages[i], be, + WRITE, isect, pages[i], &map, &be, bl_end_io_write, par, - pg_offset, pg_len); + 0, &pg_len); if (IS_ERR(bio)) { header->pnfs_error = PTR_ERR(bio); bio = NULL; goto out; } - offset += saved_len; - count -= saved_len; - isect += PAGE_CACHE_SECTORS; - last_isect = isect; - extent_length -= PAGE_CACHE_SECTORS; - } - /* Last page inside INVALID extent */ - if (be->be_state == PNFS_BLOCK_INVALID_DATA) { - bio = bl_submit_bio(WRITE, bio); - temp = last_isect >> PAGE_CACHE_SECTOR_SHIFT; - npg_zero = npg_per_block - do_div(temp, npg_per_block); - if (npg_zero < npg_per_block) { - last = 1; - goto fill_invalid_ext; - } + offset += pg_len; + count -= pg_len; + isect += (pg_len >> SECTOR_SHIFT); + extent_length -= (pg_len >> SECTOR_SHIFT); } -write_done: header->res.count = header->args.count; out: - bl_put_extent(be); - bl_put_extent(cow_read); bl_submit_bio(WRITE, bio); + blk_finish_plug(&plug); put_parallel(par); return PNFS_ATTEMPTED; -out_mds: - bl_put_extent(be); - bl_put_extent(cow_read); - kfree(par); - return PNFS_NOT_ATTEMPTED; -} - -/* FIXME - range ignored */ -static void -release_extents(struct pnfs_block_layout *bl, struct pnfs_layout_range *range) -{ - int i; - struct pnfs_block_extent *be; - - spin_lock(&bl->bl_ext_lock); - for (i = 0; i < EXTENT_LISTS; i++) { - while (!list_empty(&bl->bl_extents[i])) { - be = list_first_entry(&bl->bl_extents[i], - struct pnfs_block_extent, - be_node); - list_del(&be->be_node); - bl_put_extent(be); - } - } - spin_unlock(&bl->bl_ext_lock); -} - -static void -release_inval_marks(struct pnfs_inval_markings *marks) -{ - struct pnfs_inval_tracking *pos, *temp; - struct pnfs_block_short_extent *se, *stemp; - - list_for_each_entry_safe(pos, temp, &marks->im_tree.mtt_stub, it_link) { - list_del(&pos->it_link); - kfree(pos); - } - - list_for_each_entry_safe(se, stemp, &marks->im_extents, bse_node) { - list_del(&se->bse_node); - kfree(se); - } - return; } static void bl_free_layout_hdr(struct pnfs_layout_hdr *lo) { struct pnfs_block_layout *bl = BLK_LO2EXT(lo); + int err; dprintk("%s enter\n", __func__); - release_extents(bl, NULL); - release_inval_marks(&bl->bl_inval); + + err = ext_tree_remove(bl, true, 0, LLONG_MAX); + WARN_ON(err); + kfree(bl); } @@ -960,14 +460,11 @@ static struct pnfs_layout_hdr *bl_alloc_layout_hdr(struct inode *inode, bl = kzalloc(sizeof(*bl), gfp_flags); if (!bl) return NULL; + + bl->bl_ext_rw = RB_ROOT; + bl->bl_ext_ro = RB_ROOT; spin_lock_init(&bl->bl_ext_lock); - INIT_LIST_HEAD(&bl->bl_extents[0]); - INIT_LIST_HEAD(&bl->bl_extents[1]); - INIT_LIST_HEAD(&bl->bl_commit); - INIT_LIST_HEAD(&bl->bl_committing); - bl->bl_count = 0; - bl->bl_blocksize = NFS_SERVER(inode)->pnfs_blksize >> SECTOR_SHIFT; - BL_INIT_INVAL_MARKS(&bl->bl_inval, bl->bl_blocksize); + return &bl->bl_layout; } @@ -977,215 +474,318 @@ static void bl_free_lseg(struct pnfs_layout_segment *lseg) kfree(lseg); } -/* We pretty much ignore lseg, and store all data layout wide, so we - * can correctly merge. - */ -static struct pnfs_layout_segment *bl_alloc_lseg(struct pnfs_layout_hdr *lo, - struct nfs4_layoutget_res *lgr, - gfp_t gfp_flags) -{ - struct pnfs_layout_segment *lseg; - int status; +/* Tracks info needed to ensure extents in layout obey constraints of spec */ +struct layout_verification { + u32 mode; /* R or RW */ + u64 start; /* Expected start of next non-COW extent */ + u64 inval; /* Start of INVAL coverage */ + u64 cowread; /* End of COW read coverage */ +}; - dprintk("%s enter\n", __func__); - lseg = kzalloc(sizeof(*lseg), gfp_flags); - if (!lseg) - return ERR_PTR(-ENOMEM); - status = nfs4_blk_process_layoutget(lo, lgr, gfp_flags); - if (status) { - /* We don't want to call the full-blown bl_free_lseg, - * since on error extents were not touched. - */ - kfree(lseg); - return ERR_PTR(status); +/* Verify the extent meets the layout requirements of the pnfs-block draft, + * section 2.3.1. + */ +static int verify_extent(struct pnfs_block_extent *be, + struct layout_verification *lv) +{ + if (lv->mode == IOMODE_READ) { + if (be->be_state == PNFS_BLOCK_READWRITE_DATA || + be->be_state == PNFS_BLOCK_INVALID_DATA) + return -EIO; + if (be->be_f_offset != lv->start) + return -EIO; + lv->start += be->be_length; + return 0; } - return lseg; + /* lv->mode == IOMODE_RW */ + if (be->be_state == PNFS_BLOCK_READWRITE_DATA) { + if (be->be_f_offset != lv->start) + return -EIO; + if (lv->cowread > lv->start) + return -EIO; + lv->start += be->be_length; + lv->inval = lv->start; + return 0; + } else if (be->be_state == PNFS_BLOCK_INVALID_DATA) { + if (be->be_f_offset != lv->start) + return -EIO; + lv->start += be->be_length; + return 0; + } else if (be->be_state == PNFS_BLOCK_READ_DATA) { + if (be->be_f_offset > lv->start) + return -EIO; + if (be->be_f_offset < lv->inval) + return -EIO; + if (be->be_f_offset < lv->cowread) + return -EIO; + /* It looks like you might want to min this with lv->start, + * but you really don't. + */ + lv->inval = lv->inval + be->be_length; + lv->cowread = be->be_f_offset + be->be_length; + return 0; + } else + return -EIO; } -static void -bl_encode_layoutcommit(struct pnfs_layout_hdr *lo, struct xdr_stream *xdr, - const struct nfs4_layoutcommit_args *arg) +static int decode_sector_number(__be32 **rp, sector_t *sp) { - dprintk("%s enter\n", __func__); - encode_pnfs_block_layoutupdate(BLK_LO2EXT(lo), xdr, arg); + uint64_t s; + + *rp = xdr_decode_hyper(*rp, &s); + if (s & 0x1ff) { + printk(KERN_WARNING "NFS: %s: sector not aligned\n", __func__); + return -1; + } + *sp = s >> SECTOR_SHIFT; + return 0; } -static void -bl_cleanup_layoutcommit(struct nfs4_layoutcommit_data *lcdata) +static int +bl_alloc_extent(struct xdr_stream *xdr, struct pnfs_layout_hdr *lo, + struct layout_verification *lv, struct list_head *extents, + gfp_t gfp_mask) { - struct pnfs_layout_hdr *lo = NFS_I(lcdata->args.inode)->layout; + struct pnfs_block_extent *be; + struct nfs4_deviceid id; + int error; + __be32 *p; - dprintk("%s enter\n", __func__); - clean_pnfs_block_layoutupdate(BLK_LO2EXT(lo), &lcdata->args, lcdata->res.status); -} + p = xdr_inline_decode(xdr, 28 + NFS4_DEVICEID4_SIZE); + if (!p) + return -EIO; -static void free_blk_mountid(struct block_mount_id *mid) -{ - if (mid) { - struct pnfs_block_dev *dev, *tmp; + be = kzalloc(sizeof(*be), GFP_NOFS); + if (!be) + return -ENOMEM; - /* No need to take bm_lock as we are last user freeing bm_devlist */ - list_for_each_entry_safe(dev, tmp, &mid->bm_devlist, bm_node) { - list_del(&dev->bm_node); - bl_free_block_dev(dev); - } - kfree(mid); + memcpy(&id, p, NFS4_DEVICEID4_SIZE); + p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE); + + error = -EIO; + be->be_device = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode), &id, + lo->plh_lc_cred, gfp_mask); + if (!be->be_device) + goto out_free_be; + + /* + * The next three values are read in as bytes, but stored in the + * extent structure in 512-byte granularity. + */ + if (decode_sector_number(&p, &be->be_f_offset) < 0) + goto out_put_deviceid; + if (decode_sector_number(&p, &be->be_length) < 0) + goto out_put_deviceid; + if (decode_sector_number(&p, &be->be_v_offset) < 0) + goto out_put_deviceid; + be->be_state = be32_to_cpup(p++); + + error = verify_extent(be, lv); + if (error) { + dprintk("%s: extent verification failed\n", __func__); + goto out_put_deviceid; } + + list_add_tail(&be->be_list, extents); + return 0; + +out_put_deviceid: + nfs4_put_deviceid_node(be->be_device); +out_free_be: + kfree(be); + return error; } -/* This is mostly copied from the filelayout_get_device_info function. - * It seems much of this should be at the generic pnfs level. - */ -static struct pnfs_block_dev * -nfs4_blk_get_deviceinfo(struct nfs_server *server, const struct nfs_fh *fh, - struct nfs4_deviceid *d_id) +static struct pnfs_layout_segment * +bl_alloc_lseg(struct pnfs_layout_hdr *lo, struct nfs4_layoutget_res *lgr, + gfp_t gfp_mask) { - struct pnfs_device *dev; - struct pnfs_block_dev *rv; - u32 max_resp_sz; - int max_pages; - struct page **pages = NULL; - int i, rc; + struct layout_verification lv = { + .mode = lgr->range.iomode, + .start = lgr->range.offset >> SECTOR_SHIFT, + .inval = lgr->range.offset >> SECTOR_SHIFT, + .cowread = lgr->range.offset >> SECTOR_SHIFT, + }; + struct pnfs_block_layout *bl = BLK_LO2EXT(lo); + struct pnfs_layout_segment *lseg; + struct xdr_buf buf; + struct xdr_stream xdr; + struct page *scratch; + int status, i; + uint32_t count; + __be32 *p; + LIST_HEAD(extents); + + dprintk("---> %s\n", __func__); + + lseg = kzalloc(sizeof(*lseg), gfp_mask); + if (!lseg) + return ERR_PTR(-ENOMEM); + + status = -ENOMEM; + scratch = alloc_page(gfp_mask); + if (!scratch) + goto out; + + xdr_init_decode_pages(&xdr, &buf, + lgr->layoutp->pages, lgr->layoutp->len); + xdr_set_scratch_buffer(&xdr, page_address(scratch), PAGE_SIZE); + + status = -EIO; + p = xdr_inline_decode(&xdr, 4); + if (unlikely(!p)) + goto out_free_scratch; + + count = be32_to_cpup(p++); + dprintk("%s: number of extents %d\n", __func__, count); /* - * Use the session max response size as the basis for setting - * GETDEVICEINFO's maxcount + * Decode individual extents, putting them in temporary staging area + * until whole layout is decoded to make error recovery easier. */ - max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz; - max_pages = nfs_page_array_len(0, max_resp_sz); - dprintk("%s max_resp_sz %u max_pages %d\n", - __func__, max_resp_sz, max_pages); - - dev = kmalloc(sizeof(*dev), GFP_NOFS); - if (!dev) { - dprintk("%s kmalloc failed\n", __func__); - return ERR_PTR(-ENOMEM); + for (i = 0; i < count; i++) { + status = bl_alloc_extent(&xdr, lo, &lv, &extents, gfp_mask); + if (status) + goto process_extents; } - pages = kcalloc(max_pages, sizeof(struct page *), GFP_NOFS); - if (pages == NULL) { - kfree(dev); - return ERR_PTR(-ENOMEM); + if (lgr->range.offset + lgr->range.length != + lv.start << SECTOR_SHIFT) { + dprintk("%s Final length mismatch\n", __func__); + status = -EIO; + goto process_extents; } - for (i = 0; i < max_pages; i++) { - pages[i] = alloc_page(GFP_NOFS); - if (!pages[i]) { - rv = ERR_PTR(-ENOMEM); - goto out_free; - } + + if (lv.start < lv.cowread) { + dprintk("%s Final uncovered COW extent\n", __func__); + status = -EIO; } - memcpy(&dev->dev_id, d_id, sizeof(*d_id)); - dev->layout_type = LAYOUT_BLOCK_VOLUME; - dev->pages = pages; - dev->pgbase = 0; - dev->pglen = PAGE_SIZE * max_pages; - dev->mincount = 0; - dev->maxcount = max_resp_sz - nfs41_maxgetdevinfo_overhead; - - dprintk("%s: dev_id: %s\n", __func__, dev->dev_id.data); - rc = nfs4_proc_getdeviceinfo(server, dev, NULL); - dprintk("%s getdevice info returns %d\n", __func__, rc); - if (rc) { - rv = ERR_PTR(rc); - goto out_free; +process_extents: + while (!list_empty(&extents)) { + struct pnfs_block_extent *be = + list_first_entry(&extents, struct pnfs_block_extent, + be_list); + list_del(&be->be_list); + + if (!status) + status = ext_tree_insert(bl, be); + + if (status) { + nfs4_put_deviceid_node(be->be_device); + kfree(be); + } } - rv = nfs4_blk_decode_device(server, dev); - out_free: - for (i = 0; i < max_pages; i++) - __free_page(pages[i]); - kfree(pages); - kfree(dev); - return rv; +out_free_scratch: + __free_page(scratch); +out: + dprintk("%s returns %d\n", __func__, status); + if (status) { + kfree(lseg); + return ERR_PTR(status); + } + return lseg; } -static int -bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh) +static void +bl_return_range(struct pnfs_layout_hdr *lo, + struct pnfs_layout_range *range) { - struct block_mount_id *b_mt_id = NULL; - struct pnfs_devicelist *dlist = NULL; - struct pnfs_block_dev *bdev; - LIST_HEAD(block_disklist); - int status, i; - - dprintk("%s enter\n", __func__); + struct pnfs_block_layout *bl = BLK_LO2EXT(lo); + sector_t offset = range->offset >> SECTOR_SHIFT, end; - if (server->pnfs_blksize == 0) { - dprintk("%s Server did not return blksize\n", __func__); - return -EINVAL; - } - b_mt_id = kzalloc(sizeof(struct block_mount_id), GFP_NOFS); - if (!b_mt_id) { - status = -ENOMEM; - goto out_error; - } - /* Initialize nfs4 block layout mount id */ - spin_lock_init(&b_mt_id->bm_lock); - INIT_LIST_HEAD(&b_mt_id->bm_devlist); - - dlist = kmalloc(sizeof(struct pnfs_devicelist), GFP_NOFS); - if (!dlist) { - status = -ENOMEM; - goto out_error; + if (range->offset % 8) { + dprintk("%s: offset %lld not block size aligned\n", + __func__, range->offset); + return; } - dlist->eof = 0; - while (!dlist->eof) { - status = nfs4_proc_getdevicelist(server, fh, dlist); - if (status) - goto out_error; - dprintk("%s GETDEVICELIST numdevs=%i, eof=%i\n", - __func__, dlist->num_devs, dlist->eof); - for (i = 0; i < dlist->num_devs; i++) { - bdev = nfs4_blk_get_deviceinfo(server, fh, - &dlist->dev_id[i]); - if (IS_ERR(bdev)) { - status = PTR_ERR(bdev); - goto out_error; - } - spin_lock(&b_mt_id->bm_lock); - list_add(&bdev->bm_node, &b_mt_id->bm_devlist); - spin_unlock(&b_mt_id->bm_lock); + + if (range->length != NFS4_MAX_UINT64) { + if (range->length % 8) { + dprintk("%s: length %lld not block size aligned\n", + __func__, range->length); + return; } - } - dprintk("%s SUCCESS\n", __func__); - server->pnfs_ld_data = b_mt_id; - out_return: - kfree(dlist); - return status; + end = offset + (range->length >> SECTOR_SHIFT); + } else { + end = round_down(NFS4_MAX_UINT64, PAGE_SIZE); + } - out_error: - free_blk_mountid(b_mt_id); - goto out_return; + ext_tree_remove(bl, range->iomode & IOMODE_RW, offset, end); } static int -bl_clear_layoutdriver(struct nfs_server *server) +bl_prepare_layoutcommit(struct nfs4_layoutcommit_args *arg) +{ + return ext_tree_prepare_commit(arg); +} + +static void +bl_cleanup_layoutcommit(struct nfs4_layoutcommit_data *lcdata) { - struct block_mount_id *b_mt_id = server->pnfs_ld_data; + ext_tree_mark_committed(&lcdata->args, lcdata->res.status); +} +static int +bl_set_layoutdriver(struct nfs_server *server, const struct nfs_fh *fh) +{ dprintk("%s enter\n", __func__); - free_blk_mountid(b_mt_id); - dprintk("%s RETURNS\n", __func__); + + if (server->pnfs_blksize == 0) { + dprintk("%s Server did not return blksize\n", __func__); + return -EINVAL; + } + if (server->pnfs_blksize > PAGE_SIZE) { + printk(KERN_ERR "%s: pNFS blksize %d not supported.\n", + __func__, server->pnfs_blksize); + return -EINVAL; + } + return 0; } static bool -is_aligned_req(struct nfs_page *req, unsigned int alignment) +is_aligned_req(struct nfs_pageio_descriptor *pgio, + struct nfs_page *req, unsigned int alignment) { - return IS_ALIGNED(req->wb_offset, alignment) && - IS_ALIGNED(req->wb_bytes, alignment); + /* + * Always accept buffered writes, higher layers take care of the + * right alignment. + */ + if (pgio->pg_dreq == NULL) + return true; + + if (!IS_ALIGNED(req->wb_offset, alignment)) + return false; + + if (IS_ALIGNED(req->wb_bytes, alignment)) + return true; + + if (req_offset(req) + req->wb_bytes == i_size_read(pgio->pg_inode)) { + /* + * If the write goes up to the inode size, just write + * the full page. Data past the inode size is + * guaranteed to be zeroed by the higher level client + * code, and this behaviour is mandated by RFC 5663 + * section 2.3.2. + */ + return true; + } + + return false; } static void bl_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) { - if (pgio->pg_dreq != NULL && - !is_aligned_req(req, SECTOR_SIZE)) + if (!is_aligned_req(pgio, req, SECTOR_SIZE)) { nfs_pageio_reset_read_mds(pgio); - else - pnfs_generic_pg_init_read(pgio, req); + return; + } + + pnfs_generic_pg_init_read(pgio, req); } /* @@ -1196,10 +796,8 @@ static size_t bl_pg_test_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req) { - if (pgio->pg_dreq != NULL && - !is_aligned_req(req, SECTOR_SIZE)) + if (!is_aligned_req(pgio, req, SECTOR_SIZE)) return 0; - return pnfs_generic_pg_test(pgio, prev, req); } @@ -1229,19 +827,20 @@ static u64 pnfs_num_cont_bytes(struct inode *inode, pgoff_t idx) static void bl_pg_init_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *req) { - if (pgio->pg_dreq != NULL && - !is_aligned_req(req, PAGE_CACHE_SIZE)) { + u64 wb_size; + + if (!is_aligned_req(pgio, req, PAGE_SIZE)) { nfs_pageio_reset_write_mds(pgio); - } else { - u64 wb_size; - if (pgio->pg_dreq == NULL) - wb_size = pnfs_num_cont_bytes(pgio->pg_inode, - req->wb_index); - else - wb_size = nfs_dreq_bytes_left(pgio->pg_dreq); - - pnfs_generic_pg_init_write(pgio, req, wb_size); + return; } + + if (pgio->pg_dreq == NULL) + wb_size = pnfs_num_cont_bytes(pgio->pg_inode, + req->wb_index); + else + wb_size = nfs_dreq_bytes_left(pgio->pg_dreq); + + pnfs_generic_pg_init_write(pgio, req, wb_size); } /* @@ -1252,10 +851,8 @@ static size_t bl_pg_test_write(struct nfs_pageio_descriptor *pgio, struct nfs_page *prev, struct nfs_page *req) { - if (pgio->pg_dreq != NULL && - !is_aligned_req(req, PAGE_CACHE_SIZE)) + if (!is_aligned_req(pgio, req, PAGE_SIZE)) return 0; - return pnfs_generic_pg_test(pgio, prev, req); } @@ -1275,146 +872,24 @@ static struct pnfs_layoutdriver_type blocklayout_type = { .id = LAYOUT_BLOCK_VOLUME, .name = "LAYOUT_BLOCK_VOLUME", .owner = THIS_MODULE, + .flags = PNFS_LAYOUTRET_ON_SETATTR | + PNFS_READ_WHOLE_PAGE, .read_pagelist = bl_read_pagelist, .write_pagelist = bl_write_pagelist, .alloc_layout_hdr = bl_alloc_layout_hdr, .free_layout_hdr = bl_free_layout_hdr, .alloc_lseg = bl_alloc_lseg, .free_lseg = bl_free_lseg, - .encode_layoutcommit = bl_encode_layoutcommit, + .return_range = bl_return_range, + .prepare_layoutcommit = bl_prepare_layoutcommit, .cleanup_layoutcommit = bl_cleanup_layoutcommit, .set_layoutdriver = bl_set_layoutdriver, - .clear_layoutdriver = bl_clear_layoutdriver, + .alloc_deviceid_node = bl_alloc_deviceid_node, + .free_deviceid_node = bl_free_deviceid_node, .pg_read_ops = &bl_pg_read_ops, .pg_write_ops = &bl_pg_write_ops, }; -static const struct rpc_pipe_ops bl_upcall_ops = { - .upcall = rpc_pipe_generic_upcall, - .downcall = bl_pipe_downcall, - .destroy_msg = bl_pipe_destroy_msg, -}; - -static struct dentry *nfs4blocklayout_register_sb(struct super_block *sb, - struct rpc_pipe *pipe) -{ - struct dentry *dir, *dentry; - - dir = rpc_d_lookup_sb(sb, NFS_PIPE_DIRNAME); - if (dir == NULL) - return ERR_PTR(-ENOENT); - dentry = rpc_mkpipe_dentry(dir, "blocklayout", NULL, pipe); - dput(dir); - return dentry; -} - -static void nfs4blocklayout_unregister_sb(struct super_block *sb, - struct rpc_pipe *pipe) -{ - if (pipe->dentry) - rpc_unlink(pipe->dentry); -} - -static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event, - void *ptr) -{ - struct super_block *sb = ptr; - struct net *net = sb->s_fs_info; - struct nfs_net *nn = net_generic(net, nfs_net_id); - struct dentry *dentry; - int ret = 0; - - if (!try_module_get(THIS_MODULE)) - return 0; - - if (nn->bl_device_pipe == NULL) { - module_put(THIS_MODULE); - return 0; - } - - switch (event) { - case RPC_PIPEFS_MOUNT: - dentry = nfs4blocklayout_register_sb(sb, nn->bl_device_pipe); - if (IS_ERR(dentry)) { - ret = PTR_ERR(dentry); - break; - } - nn->bl_device_pipe->dentry = dentry; - break; - case RPC_PIPEFS_UMOUNT: - if (nn->bl_device_pipe->dentry) - nfs4blocklayout_unregister_sb(sb, nn->bl_device_pipe); - break; - default: - ret = -ENOTSUPP; - break; - } - module_put(THIS_MODULE); - return ret; -} - -static struct notifier_block nfs4blocklayout_block = { - .notifier_call = rpc_pipefs_event, -}; - -static struct dentry *nfs4blocklayout_register_net(struct net *net, - struct rpc_pipe *pipe) -{ - struct super_block *pipefs_sb; - struct dentry *dentry; - - pipefs_sb = rpc_get_sb_net(net); - if (!pipefs_sb) - return NULL; - dentry = nfs4blocklayout_register_sb(pipefs_sb, pipe); - rpc_put_sb_net(net); - return dentry; -} - -static void nfs4blocklayout_unregister_net(struct net *net, - struct rpc_pipe *pipe) -{ - struct super_block *pipefs_sb; - - pipefs_sb = rpc_get_sb_net(net); - if (pipefs_sb) { - nfs4blocklayout_unregister_sb(pipefs_sb, pipe); - rpc_put_sb_net(net); - } -} - -static int nfs4blocklayout_net_init(struct net *net) -{ - struct nfs_net *nn = net_generic(net, nfs_net_id); - struct dentry *dentry; - - init_waitqueue_head(&nn->bl_wq); - nn->bl_device_pipe = rpc_mkpipe_data(&bl_upcall_ops, 0); - if (IS_ERR(nn->bl_device_pipe)) - return PTR_ERR(nn->bl_device_pipe); - dentry = nfs4blocklayout_register_net(net, nn->bl_device_pipe); - if (IS_ERR(dentry)) { - rpc_destroy_pipe_data(nn->bl_device_pipe); - return PTR_ERR(dentry); - } - nn->bl_device_pipe->dentry = dentry; - return 0; -} - -static void nfs4blocklayout_net_exit(struct net *net) -{ - struct nfs_net *nn = net_generic(net, nfs_net_id); - - nfs4blocklayout_unregister_net(net, nn->bl_device_pipe); - rpc_destroy_pipe_data(nn->bl_device_pipe); - nn->bl_device_pipe = NULL; -} - -static struct pernet_operations nfs4blocklayout_net_ops = { - .init = nfs4blocklayout_net_init, - .exit = nfs4blocklayout_net_exit, -}; - static int __init nfs4blocklayout_init(void) { int ret; @@ -1424,20 +899,14 @@ static int __init nfs4blocklayout_init(void) ret = pnfs_register_layoutdriver(&blocklayout_type); if (ret) goto out; - - ret = rpc_pipefs_notifier_register(&nfs4blocklayout_block); + ret = bl_init_pipefs(); if (ret) - goto out_remove; - ret = register_pernet_subsys(&nfs4blocklayout_net_ops); - if (ret) - goto out_notifier; -out: - return ret; + goto out_unregister; + return 0; -out_notifier: - rpc_pipefs_notifier_unregister(&nfs4blocklayout_block); -out_remove: +out_unregister: pnfs_unregister_layoutdriver(&blocklayout_type); +out: return ret; } @@ -1446,8 +915,7 @@ static void __exit nfs4blocklayout_exit(void) dprintk("%s: NFSv4 Block Layout Driver Unregistering...\n", __func__); - rpc_pipefs_notifier_unregister(&nfs4blocklayout_block); - unregister_pernet_subsys(&nfs4blocklayout_net_ops); + bl_cleanup_pipefs(); pnfs_unregister_layoutdriver(&blocklayout_type); } diff --git a/fs/nfs/blocklayout/blocklayout.h b/fs/nfs/blocklayout/blocklayout.h index 9838fb020473..92dca9e90d8d 100644 --- a/fs/nfs/blocklayout/blocklayout.h +++ b/fs/nfs/blocklayout/blocklayout.h @@ -44,105 +44,112 @@ #define PAGE_CACHE_SECTOR_SHIFT (PAGE_CACHE_SHIFT - SECTOR_SHIFT) #define SECTOR_SIZE (1 << SECTOR_SHIFT) -struct block_mount_id { - spinlock_t bm_lock; /* protects list */ - struct list_head bm_devlist; /* holds pnfs_block_dev */ -}; +struct pnfs_block_dev; -struct pnfs_block_dev { - struct list_head bm_node; - struct nfs4_deviceid bm_mdevid; /* associated devid */ - struct block_device *bm_mdev; /* meta device itself */ - struct net *net; +enum pnfs_block_volume_type { + PNFS_BLOCK_VOLUME_SIMPLE = 0, + PNFS_BLOCK_VOLUME_SLICE = 1, + PNFS_BLOCK_VOLUME_CONCAT = 2, + PNFS_BLOCK_VOLUME_STRIPE = 3, }; -enum exstate4 { - PNFS_BLOCK_READWRITE_DATA = 0, - PNFS_BLOCK_READ_DATA = 1, - PNFS_BLOCK_INVALID_DATA = 2, /* mapped, but data is invalid */ - PNFS_BLOCK_NONE_DATA = 3 /* unmapped, it's a hole */ +#define PNFS_BLOCK_MAX_UUIDS 4 +#define PNFS_BLOCK_MAX_DEVICES 64 + +/* + * Random upper cap for the uuid length to avoid unbounded allocation. + * Not actually limited by the protocol. + */ +#define PNFS_BLOCK_UUID_LEN 128 + + +struct pnfs_block_volume { + enum pnfs_block_volume_type type; + union { + struct { + int len; + int nr_sigs; + struct { + u64 offset; + u32 sig_len; + u8 sig[PNFS_BLOCK_UUID_LEN]; + } sigs[PNFS_BLOCK_MAX_UUIDS]; + } simple; + struct { + u64 start; + u64 len; + u32 volume; + } slice; + struct { + u32 volumes_count; + u32 volumes[PNFS_BLOCK_MAX_DEVICES]; + } concat; + struct { + u64 chunk_size; + u32 volumes_count; + u32 volumes[PNFS_BLOCK_MAX_DEVICES]; + } stripe; + }; }; -#define MY_MAX_TAGS (15) /* tag bitnums used must be less than this */ +struct pnfs_block_dev_map { + sector_t start; + sector_t len; -struct my_tree { - sector_t mtt_step_size; /* Internal sector alignment */ - struct list_head mtt_stub; /* Should be a radix tree */ + sector_t disk_offset; + struct block_device *bdev; }; -struct pnfs_inval_markings { - spinlock_t im_lock; - struct my_tree im_tree; /* Sectors that need LAYOUTCOMMIT */ - sector_t im_block_size; /* Server blocksize in sectors */ - struct list_head im_extents; /* Short extents for INVAL->RW conversion */ +struct pnfs_block_dev { + struct nfs4_deviceid_node node; + + u64 start; + u64 len; + + u32 nr_children; + struct pnfs_block_dev *children; + u64 chunk_size; + + struct block_device *bdev; + u64 disk_offset; + + bool (*map)(struct pnfs_block_dev *dev, u64 offset, + struct pnfs_block_dev_map *map); }; -struct pnfs_inval_tracking { - struct list_head it_link; - int it_sector; - int it_tags; +enum exstate4 { + PNFS_BLOCK_READWRITE_DATA = 0, + PNFS_BLOCK_READ_DATA = 1, + PNFS_BLOCK_INVALID_DATA = 2, /* mapped, but data is invalid */ + PNFS_BLOCK_NONE_DATA = 3 /* unmapped, it's a hole */ }; /* sector_t fields are all in 512-byte sectors */ struct pnfs_block_extent { - struct kref be_refcnt; - struct list_head be_node; /* link into lseg list */ - struct nfs4_deviceid be_devid; /* FIXME: could use device cache instead */ - struct block_device *be_mdev; + union { + struct rb_node be_node; + struct list_head be_list; + }; + struct nfs4_deviceid_node *be_device; sector_t be_f_offset; /* the starting offset in the file */ sector_t be_length; /* the size of the extent */ sector_t be_v_offset; /* the starting offset in the volume */ enum exstate4 be_state; /* the state of this extent */ - struct pnfs_inval_markings *be_inval; /* tracks INVAL->RW transition */ +#define EXTENT_WRITTEN 1 +#define EXTENT_COMMITTING 2 + unsigned int be_tag; }; -/* Shortened extent used by LAYOUTCOMMIT */ -struct pnfs_block_short_extent { - struct list_head bse_node; - struct nfs4_deviceid bse_devid; - struct block_device *bse_mdev; - sector_t bse_f_offset; /* the starting offset in the file */ - sector_t bse_length; /* the size of the extent */ -}; - -static inline void -BL_INIT_INVAL_MARKS(struct pnfs_inval_markings *marks, sector_t blocksize) -{ - spin_lock_init(&marks->im_lock); - INIT_LIST_HEAD(&marks->im_tree.mtt_stub); - INIT_LIST_HEAD(&marks->im_extents); - marks->im_block_size = blocksize; - marks->im_tree.mtt_step_size = min((sector_t)PAGE_CACHE_SECTORS, - blocksize); -} - -enum extentclass4 { - RW_EXTENT = 0, /* READWRTE and INVAL */ - RO_EXTENT = 1, /* READ and NONE */ - EXTENT_LISTS = 2, -}; - -static inline int bl_choose_list(enum exstate4 state) -{ - if (state == PNFS_BLOCK_READ_DATA || state == PNFS_BLOCK_NONE_DATA) - return RO_EXTENT; - else - return RW_EXTENT; -} +/* on the wire size of the extent */ +#define BL_EXTENT_SIZE (7 * sizeof(__be32) + NFS4_DEVICEID4_SIZE) struct pnfs_block_layout { - struct pnfs_layout_hdr bl_layout; - struct pnfs_inval_markings bl_inval; /* tracks INVAL->RW transition */ + struct pnfs_layout_hdr bl_layout; + struct rb_root bl_ext_rw; + struct rb_root bl_ext_ro; spinlock_t bl_ext_lock; /* Protects list manipulation */ - struct list_head bl_extents[EXTENT_LISTS]; /* R and RW extents */ - struct list_head bl_commit; /* Needs layout commit */ - struct list_head bl_committing; /* Layout committing */ - unsigned int bl_count; /* entries in bl_commit */ - sector_t bl_blocksize; /* Server blocksize in sectors */ }; -#define BLK_ID(lo) ((struct block_mount_id *)(NFS_SERVER(lo->plh_inode)->pnfs_ld_data)) - static inline struct pnfs_block_layout * BLK_LO2EXT(struct pnfs_layout_hdr *lo) { @@ -171,41 +178,27 @@ struct bl_msg_hdr { #define BL_DEVICE_REQUEST_PROC 0x1 /* User level process succeeds */ #define BL_DEVICE_REQUEST_ERR 0x2 /* User level process fails */ -/* blocklayoutdev.c */ -ssize_t bl_pipe_downcall(struct file *, const char __user *, size_t); -void bl_pipe_destroy_msg(struct rpc_pipe_msg *); -void nfs4_blkdev_put(struct block_device *bdev); -struct pnfs_block_dev *nfs4_blk_decode_device(struct nfs_server *server, - struct pnfs_device *dev); -int nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo, - struct nfs4_layoutget_res *lgr, gfp_t gfp_flags); - -/* blocklayoutdm.c */ -void bl_free_block_dev(struct pnfs_block_dev *bdev); - -/* extents.c */ -struct pnfs_block_extent * -bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect, - struct pnfs_block_extent **cow_read); -int bl_mark_sectors_init(struct pnfs_inval_markings *marks, - sector_t offset, sector_t length); -void bl_put_extent(struct pnfs_block_extent *be); -struct pnfs_block_extent *bl_alloc_extent(void); -int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect); -int encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, - struct xdr_stream *xdr, - const struct nfs4_layoutcommit_args *arg); -void clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, - const struct nfs4_layoutcommit_args *arg, - int status); -int bl_add_merge_extent(struct pnfs_block_layout *bl, - struct pnfs_block_extent *new); -int bl_mark_for_commit(struct pnfs_block_extent *be, - sector_t offset, sector_t length, - struct pnfs_block_short_extent *new); -int bl_push_one_short_extent(struct pnfs_inval_markings *marks); -struct pnfs_block_short_extent * -bl_pop_one_short_extent(struct pnfs_inval_markings *marks); -void bl_free_short_extents(struct pnfs_inval_markings *marks, int num_to_free); +/* dev.c */ +struct nfs4_deviceid_node *bl_alloc_deviceid_node(struct nfs_server *server, + struct pnfs_device *pdev, gfp_t gfp_mask); +void bl_free_deviceid_node(struct nfs4_deviceid_node *d); + +/* extent_tree.c */ +int ext_tree_insert(struct pnfs_block_layout *bl, + struct pnfs_block_extent *new); +int ext_tree_remove(struct pnfs_block_layout *bl, bool rw, sector_t start, + sector_t end); +int ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start, + sector_t len); +bool ext_tree_lookup(struct pnfs_block_layout *bl, sector_t isect, + struct pnfs_block_extent *ret, bool rw); +int ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg); +void ext_tree_mark_committed(struct nfs4_layoutcommit_args *arg, int status); + +/* rpc_pipefs.c */ +dev_t bl_resolve_deviceid(struct nfs_server *server, + struct pnfs_block_volume *b, gfp_t gfp_mask); +int __init bl_init_pipefs(void); +void __exit bl_cleanup_pipefs(void); #endif /* FS_NFS_NFS4BLOCKLAYOUT_H */ diff --git a/fs/nfs/blocklayout/blocklayoutdev.c b/fs/nfs/blocklayout/blocklayoutdev.c deleted file mode 100644 index 04303b5c9361..000000000000 --- a/fs/nfs/blocklayout/blocklayoutdev.c +++ /dev/null @@ -1,384 +0,0 @@ -/* - * linux/fs/nfs/blocklayout/blocklayoutdev.c - * - * Device operations for the pnfs nfs4 file layout driver. - * - * Copyright (c) 2006 The Regents of the University of Michigan. - * All rights reserved. - * - * Andy Adamson <andros@citi.umich.edu> - * Fred Isaman <iisaman@umich.edu> - * - * permission is granted to use, copy, create derivative works and - * redistribute this software and such derivative works for any purpose, - * so long as the name of the university of michigan is not used in - * any advertising or publicity pertaining to the use or distribution - * of this software without specific, written prior authorization. if - * the above copyright notice or any other identification of the - * university of michigan is included in any copy of any portion of - * this software, then the disclaimer below must also be included. - * - * this software is provided as is, without representation from the - * university of michigan as to its fitness for any purpose, and without - * warranty by the university of michigan of any kind, either express - * or implied, including without limitation the implied warranties of - * merchantability and fitness for a particular purpose. the regents - * of the university of michigan shall not be liable for any damages, - * including special, indirect, incidental, or consequential damages, - * with respect to any claim arising out or in connection with the use - * of the software, even if it has been or is hereafter advised of the - * possibility of such damages. - */ -#include <linux/module.h> -#include <linux/buffer_head.h> /* __bread */ - -#include <linux/genhd.h> -#include <linux/blkdev.h> -#include <linux/hash.h> - -#include "blocklayout.h" - -#define NFSDBG_FACILITY NFSDBG_PNFS_LD - -static int decode_sector_number(__be32 **rp, sector_t *sp) -{ - uint64_t s; - - *rp = xdr_decode_hyper(*rp, &s); - if (s & 0x1ff) { - printk(KERN_WARNING "NFS: %s: sector not aligned\n", __func__); - return -1; - } - *sp = s >> SECTOR_SHIFT; - return 0; -} - -/* - * Release the block device - */ -void nfs4_blkdev_put(struct block_device *bdev) -{ - dprintk("%s for device %d:%d\n", __func__, MAJOR(bdev->bd_dev), - MINOR(bdev->bd_dev)); - blkdev_put(bdev, FMODE_READ); -} - -ssize_t bl_pipe_downcall(struct file *filp, const char __user *src, - size_t mlen) -{ - struct nfs_net *nn = net_generic(filp->f_dentry->d_sb->s_fs_info, - nfs_net_id); - - if (mlen != sizeof (struct bl_dev_msg)) - return -EINVAL; - - if (copy_from_user(&nn->bl_mount_reply, src, mlen) != 0) - return -EFAULT; - - wake_up(&nn->bl_wq); - - return mlen; -} - -void bl_pipe_destroy_msg(struct rpc_pipe_msg *msg) -{ - struct bl_pipe_msg *bl_pipe_msg = container_of(msg, struct bl_pipe_msg, msg); - - if (msg->errno >= 0) - return; - wake_up(bl_pipe_msg->bl_wq); -} - -/* - * Decodes pnfs_block_deviceaddr4 which is XDR encoded in dev->dev_addr_buf. - */ -struct pnfs_block_dev * -nfs4_blk_decode_device(struct nfs_server *server, - struct pnfs_device *dev) -{ - struct pnfs_block_dev *rv; - struct block_device *bd = NULL; - struct bl_pipe_msg bl_pipe_msg; - struct rpc_pipe_msg *msg = &bl_pipe_msg.msg; - struct bl_msg_hdr bl_msg = { - .type = BL_DEVICE_MOUNT, - .totallen = dev->mincount, - }; - uint8_t *dataptr; - DECLARE_WAITQUEUE(wq, current); - int offset, len, i, rc; - struct net *net = server->nfs_client->cl_net; - struct nfs_net *nn = net_generic(net, nfs_net_id); - struct bl_dev_msg *reply = &nn->bl_mount_reply; - - dprintk("%s CREATING PIPEFS MESSAGE\n", __func__); - dprintk("%s: deviceid: %s, mincount: %d\n", __func__, dev->dev_id.data, - dev->mincount); - - bl_pipe_msg.bl_wq = &nn->bl_wq; - memset(msg, 0, sizeof(*msg)); - msg->data = kzalloc(sizeof(bl_msg) + dev->mincount, GFP_NOFS); - if (!msg->data) { - rv = ERR_PTR(-ENOMEM); - goto out; - } - - memcpy(msg->data, &bl_msg, sizeof(bl_msg)); - dataptr = (uint8_t *) msg->data; - len = dev->mincount; - offset = sizeof(bl_msg); - for (i = 0; len > 0; i++) { - memcpy(&dataptr[offset], page_address(dev->pages[i]), - len < PAGE_CACHE_SIZE ? len : PAGE_CACHE_SIZE); - len -= PAGE_CACHE_SIZE; - offset += PAGE_CACHE_SIZE; - } - msg->len = sizeof(bl_msg) + dev->mincount; - - dprintk("%s CALLING USERSPACE DAEMON\n", __func__); - add_wait_queue(&nn->bl_wq, &wq); - rc = rpc_queue_upcall(nn->bl_device_pipe, msg); - if (rc < 0) { - remove_wait_queue(&nn->bl_wq, &wq); - rv = ERR_PTR(rc); - goto out; - } - - set_current_state(TASK_UNINTERRUPTIBLE); - schedule(); - __set_current_state(TASK_RUNNING); - remove_wait_queue(&nn->bl_wq, &wq); - - if (reply->status != BL_DEVICE_REQUEST_PROC) { - dprintk("%s failed to open device: %d\n", - __func__, reply->status); - rv = ERR_PTR(-EINVAL); - goto out; - } - - bd = blkdev_get_by_dev(MKDEV(reply->major, reply->minor), - FMODE_READ, NULL); - if (IS_ERR(bd)) { - dprintk("%s failed to open device : %ld\n", __func__, - PTR_ERR(bd)); - rv = ERR_CAST(bd); - goto out; - } - - rv = kzalloc(sizeof(*rv), GFP_NOFS); - if (!rv) { - rv = ERR_PTR(-ENOMEM); - goto out; - } - - rv->bm_mdev = bd; - memcpy(&rv->bm_mdevid, &dev->dev_id, sizeof(struct nfs4_deviceid)); - rv->net = net; - dprintk("%s Created device %s with bd_block_size %u\n", - __func__, - bd->bd_disk->disk_name, - bd->bd_block_size); - -out: - kfree(msg->data); - return rv; -} - -/* Map deviceid returned by the server to constructed block_device */ -static struct block_device *translate_devid(struct pnfs_layout_hdr *lo, - struct nfs4_deviceid *id) -{ - struct block_device *rv = NULL; - struct block_mount_id *mid; - struct pnfs_block_dev *dev; - - dprintk("%s enter, lo=%p, id=%p\n", __func__, lo, id); - mid = BLK_ID(lo); - spin_lock(&mid->bm_lock); - list_for_each_entry(dev, &mid->bm_devlist, bm_node) { - if (memcmp(id->data, dev->bm_mdevid.data, - NFS4_DEVICEID4_SIZE) == 0) { - rv = dev->bm_mdev; - goto out; - } - } - out: - spin_unlock(&mid->bm_lock); - dprintk("%s returning %p\n", __func__, rv); - return rv; -} - -/* Tracks info needed to ensure extents in layout obey constraints of spec */ -struct layout_verification { - u32 mode; /* R or RW */ - u64 start; /* Expected start of next non-COW extent */ - u64 inval; /* Start of INVAL coverage */ - u64 cowread; /* End of COW read coverage */ -}; - -/* Verify the extent meets the layout requirements of the pnfs-block draft, - * section 2.3.1. - */ -static int verify_extent(struct pnfs_block_extent *be, - struct layout_verification *lv) -{ - if (lv->mode == IOMODE_READ) { - if (be->be_state == PNFS_BLOCK_READWRITE_DATA || - be->be_state == PNFS_BLOCK_INVALID_DATA) - return -EIO; - if (be->be_f_offset != lv->start) - return -EIO; - lv->start += be->be_length; - return 0; - } - /* lv->mode == IOMODE_RW */ - if (be->be_state == PNFS_BLOCK_READWRITE_DATA) { - if (be->be_f_offset != lv->start) - return -EIO; - if (lv->cowread > lv->start) - return -EIO; - lv->start += be->be_length; - lv->inval = lv->start; - return 0; - } else if (be->be_state == PNFS_BLOCK_INVALID_DATA) { - if (be->be_f_offset != lv->start) - return -EIO; - lv->start += be->be_length; - return 0; - } else if (be->be_state == PNFS_BLOCK_READ_DATA) { - if (be->be_f_offset > lv->start) - return -EIO; - if (be->be_f_offset < lv->inval) - return -EIO; - if (be->be_f_offset < lv->cowread) - return -EIO; - /* It looks like you might want to min this with lv->start, - * but you really don't. - */ - lv->inval = lv->inval + be->be_length; - lv->cowread = be->be_f_offset + be->be_length; - return 0; - } else - return -EIO; -} - -/* XDR decode pnfs_block_layout4 structure */ -int -nfs4_blk_process_layoutget(struct pnfs_layout_hdr *lo, - struct nfs4_layoutget_res *lgr, gfp_t gfp_flags) -{ - struct pnfs_block_layout *bl = BLK_LO2EXT(lo); - int i, status = -EIO; - uint32_t count; - struct pnfs_block_extent *be = NULL, *save; - struct xdr_stream stream; - struct xdr_buf buf; - struct page *scratch; - __be32 *p; - struct layout_verification lv = { - .mode = lgr->range.iomode, - .start = lgr->range.offset >> SECTOR_SHIFT, - .inval = lgr->range.offset >> SECTOR_SHIFT, - .cowread = lgr->range.offset >> SECTOR_SHIFT, - }; - LIST_HEAD(extents); - - dprintk("---> %s\n", __func__); - - scratch = alloc_page(gfp_flags); - if (!scratch) - return -ENOMEM; - - xdr_init_decode_pages(&stream, &buf, lgr->layoutp->pages, lgr->layoutp->len); - xdr_set_scratch_buffer(&stream, page_address(scratch), PAGE_SIZE); - - p = xdr_inline_decode(&stream, 4); - if (unlikely(!p)) - goto out_err; - - count = be32_to_cpup(p++); - - dprintk("%s enter, number of extents %i\n", __func__, count); - p = xdr_inline_decode(&stream, (28 + NFS4_DEVICEID4_SIZE) * count); - if (unlikely(!p)) - goto out_err; - - /* Decode individual extents, putting them in temporary - * staging area until whole layout is decoded to make error - * recovery easier. - */ - for (i = 0; i < count; i++) { - be = bl_alloc_extent(); - if (!be) { - status = -ENOMEM; - goto out_err; - } - memcpy(&be->be_devid, p, NFS4_DEVICEID4_SIZE); - p += XDR_QUADLEN(NFS4_DEVICEID4_SIZE); - be->be_mdev = translate_devid(lo, &be->be_devid); - if (!be->be_mdev) - goto out_err; - - /* The next three values are read in as bytes, - * but stored as 512-byte sector lengths - */ - if (decode_sector_number(&p, &be->be_f_offset) < 0) - goto out_err; - if (decode_sector_number(&p, &be->be_length) < 0) - goto out_err; - if (decode_sector_number(&p, &be->be_v_offset) < 0) - goto out_err; - be->be_state = be32_to_cpup(p++); - if (be->be_state == PNFS_BLOCK_INVALID_DATA) - be->be_inval = &bl->bl_inval; - if (verify_extent(be, &lv)) { - dprintk("%s verify failed\n", __func__); - goto out_err; - } - list_add_tail(&be->be_node, &extents); - } - if (lgr->range.offset + lgr->range.length != - lv.start << SECTOR_SHIFT) { - dprintk("%s Final length mismatch\n", __func__); - be = NULL; - goto out_err; - } - if (lv.start < lv.cowread) { - dprintk("%s Final uncovered COW extent\n", __func__); - be = NULL; - goto out_err; - } - /* Extents decoded properly, now try to merge them in to - * existing layout extents. - */ - spin_lock(&bl->bl_ext_lock); - list_for_each_entry_safe(be, save, &extents, be_node) { - list_del(&be->be_node); - status = bl_add_merge_extent(bl, be); - if (status) { - spin_unlock(&bl->bl_ext_lock); - /* This is a fairly catastrophic error, as the - * entire layout extent lists are now corrupted. - * We should have some way to distinguish this. - */ - be = NULL; - goto out_err; - } - } - spin_unlock(&bl->bl_ext_lock); - status = 0; - out: - __free_page(scratch); - dprintk("%s returns %i\n", __func__, status); - return status; - - out_err: - bl_put_extent(be); - while (!list_empty(&extents)) { - be = list_first_entry(&extents, struct pnfs_block_extent, - be_node); - list_del(&be->be_node); - bl_put_extent(be); - } - goto out; -} diff --git a/fs/nfs/blocklayout/blocklayoutdm.c b/fs/nfs/blocklayout/blocklayoutdm.c deleted file mode 100644 index 8999cfddd866..000000000000 --- a/fs/nfs/blocklayout/blocklayoutdm.c +++ /dev/null @@ -1,108 +0,0 @@ -/* - * linux/fs/nfs/blocklayout/blocklayoutdm.c - * - * Module for the NFSv4.1 pNFS block layout driver. - * - * Copyright (c) 2007 The Regents of the University of Michigan. - * All rights reserved. - * - * Fred Isaman <iisaman@umich.edu> - * Andy Adamson <andros@citi.umich.edu> - * - * permission is granted to use, copy, create derivative works and - * redistribute this software and such derivative works for any purpose, - * so long as the name of the university of michigan is not used in - * any advertising or publicity pertaining to the use or distribution - * of this software without specific, written prior authorization. if - * the above copyright notice or any other identification of the - * university of michigan is included in any copy of any portion of - * this software, then the disclaimer below must also be included. - * - * this software is provided as is, without representation from the - * university of michigan as to its fitness for any purpose, and without - * warranty by the university of michigan of any kind, either express - * or implied, including without limitation the implied warranties of - * merchantability and fitness for a particular purpose. the regents - * of the university of michigan shall not be liable for any damages, - * including special, indirect, incidental, or consequential damages, - * with respect to any claim arising out or in connection with the use - * of the software, even if it has been or is hereafter advised of the - * possibility of such damages. - */ - -#include <linux/genhd.h> /* gendisk - used in a dprintk*/ -#include <linux/sched.h> -#include <linux/hash.h> - -#include "blocklayout.h" - -#define NFSDBG_FACILITY NFSDBG_PNFS_LD - -static void dev_remove(struct net *net, dev_t dev) -{ - struct bl_pipe_msg bl_pipe_msg; - struct rpc_pipe_msg *msg = &bl_pipe_msg.msg; - struct bl_dev_msg bl_umount_request; - struct bl_msg_hdr bl_msg = { - .type = BL_DEVICE_UMOUNT, - .totallen = sizeof(bl_umount_request), - }; - uint8_t *dataptr; - DECLARE_WAITQUEUE(wq, current); - struct nfs_net *nn = net_generic(net, nfs_net_id); - - dprintk("Entering %s\n", __func__); - - bl_pipe_msg.bl_wq = &nn->bl_wq; - memset(msg, 0, sizeof(*msg)); - msg->len = sizeof(bl_msg) + bl_msg.totallen; - msg->data = kzalloc(msg->len, GFP_NOFS); - if (!msg->data) - goto out; - - memset(&bl_umount_request, 0, sizeof(bl_umount_request)); - bl_umount_request.major = MAJOR(dev); - bl_umount_request.minor = MINOR(dev); - - memcpy(msg->data, &bl_msg, sizeof(bl_msg)); - dataptr = (uint8_t *) msg->data; - memcpy(&dataptr[sizeof(bl_msg)], &bl_umount_request, sizeof(bl_umount_request)); - - add_wait_queue(&nn->bl_wq, &wq); - if (rpc_queue_upcall(nn->bl_device_pipe, msg) < 0) { - remove_wait_queue(&nn->bl_wq, &wq); - goto out; - } - - set_current_state(TASK_UNINTERRUPTIBLE); - schedule(); - __set_current_state(TASK_RUNNING); - remove_wait_queue(&nn->bl_wq, &wq); - -out: - kfree(msg->data); -} - -/* - * Release meta device - */ -static void nfs4_blk_metadev_release(struct pnfs_block_dev *bdev) -{ - dprintk("%s Releasing\n", __func__); - nfs4_blkdev_put(bdev->bm_mdev); - dev_remove(bdev->net, bdev->bm_mdev->bd_dev); -} - -void bl_free_block_dev(struct pnfs_block_dev *bdev) -{ - if (bdev) { - if (bdev->bm_mdev) { - dprintk("%s Removing DM device: %d:%d\n", - __func__, - MAJOR(bdev->bm_mdev->bd_dev), - MINOR(bdev->bm_mdev->bd_dev)); - nfs4_blk_metadev_release(bdev); - } - kfree(bdev); - } -} diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c new file mode 100644 index 000000000000..5aed4f98df41 --- /dev/null +++ b/fs/nfs/blocklayout/dev.c @@ -0,0 +1,363 @@ +/* + * Copyright (c) 2014 Christoph Hellwig. + */ +#include <linux/sunrpc/svc.h> +#include <linux/blkdev.h> +#include <linux/nfs4.h> +#include <linux/nfs_fs.h> +#include <linux/nfs_xdr.h> + +#include "blocklayout.h" + +#define NFSDBG_FACILITY NFSDBG_PNFS_LD + +static void +bl_free_device(struct pnfs_block_dev *dev) +{ + if (dev->nr_children) { + int i; + + for (i = 0; i < dev->nr_children; i++) + bl_free_device(&dev->children[i]); + kfree(dev->children); + } else { + if (dev->bdev) + blkdev_put(dev->bdev, FMODE_READ); + } +} + +void +bl_free_deviceid_node(struct nfs4_deviceid_node *d) +{ + struct pnfs_block_dev *dev = + container_of(d, struct pnfs_block_dev, node); + + bl_free_device(dev); + kfree(dev); +} + +static int +nfs4_block_decode_volume(struct xdr_stream *xdr, struct pnfs_block_volume *b) +{ + __be32 *p; + int i; + + p = xdr_inline_decode(xdr, 4); + if (!p) + return -EIO; + b->type = be32_to_cpup(p++); + + switch (b->type) { + case PNFS_BLOCK_VOLUME_SIMPLE: + p = xdr_inline_decode(xdr, 4); + if (!p) + return -EIO; + b->simple.nr_sigs = be32_to_cpup(p++); + if (!b->simple.nr_sigs) { + dprintk("no signature\n"); + return -EIO; + } + + b->simple.len = 4 + 4; + for (i = 0; i < b->simple.nr_sigs; i++) { + p = xdr_inline_decode(xdr, 8 + 4); + if (!p) + return -EIO; + p = xdr_decode_hyper(p, &b->simple.sigs[i].offset); + b->simple.sigs[i].sig_len = be32_to_cpup(p++); + + p = xdr_inline_decode(xdr, b->simple.sigs[i].sig_len); + if (!p) + return -EIO; + memcpy(&b->simple.sigs[i].sig, p, + b->simple.sigs[i].sig_len); + + b->simple.len += 8 + 4 + b->simple.sigs[i].sig_len; + } + break; + case PNFS_BLOCK_VOLUME_SLICE: + p = xdr_inline_decode(xdr, 8 + 8 + 4); + if (!p) + return -EIO; + p = xdr_decode_hyper(p, &b->slice.start); + p = xdr_decode_hyper(p, &b->slice.len); + b->slice.volume = be32_to_cpup(p++); + break; + case PNFS_BLOCK_VOLUME_CONCAT: + p = xdr_inline_decode(xdr, 4); + if (!p) + return -EIO; + b->concat.volumes_count = be32_to_cpup(p++); + + p = xdr_inline_decode(xdr, b->concat.volumes_count * 4); + if (!p) + return -EIO; + for (i = 0; i < b->concat.volumes_count; i++) + b->concat.volumes[i] = be32_to_cpup(p++); + break; + case PNFS_BLOCK_VOLUME_STRIPE: + p = xdr_inline_decode(xdr, 8 + 4); + if (!p) + return -EIO; + p = xdr_decode_hyper(p, &b->stripe.chunk_size); + b->stripe.volumes_count = be32_to_cpup(p++); + + p = xdr_inline_decode(xdr, b->stripe.volumes_count * 4); + if (!p) + return -EIO; + for (i = 0; i < b->stripe.volumes_count; i++) + b->stripe.volumes[i] = be32_to_cpup(p++); + break; + default: + dprintk("unknown volume type!\n"); + return -EIO; + } + + return 0; +} + +static bool bl_map_simple(struct pnfs_block_dev *dev, u64 offset, + struct pnfs_block_dev_map *map) +{ + map->start = dev->start; + map->len = dev->len; + map->disk_offset = dev->disk_offset; + map->bdev = dev->bdev; + return true; +} + +static bool bl_map_concat(struct pnfs_block_dev *dev, u64 offset, + struct pnfs_block_dev_map *map) +{ + int i; + + for (i = 0; i < dev->nr_children; i++) { + struct pnfs_block_dev *child = &dev->children[i]; + + if (child->start > offset || + child->start + child->len <= offset) + continue; + + child->map(child, offset - child->start, map); + return true; + } + + dprintk("%s: ran off loop!\n", __func__); + return false; +} + +static bool bl_map_stripe(struct pnfs_block_dev *dev, u64 offset, + struct pnfs_block_dev_map *map) +{ + struct pnfs_block_dev *child; + u64 chunk; + u32 chunk_idx; + u64 disk_offset; + + chunk = div_u64(offset, dev->chunk_size); + div_u64_rem(chunk, dev->nr_children, &chunk_idx); + + if (chunk_idx > dev->nr_children) { + dprintk("%s: invalid chunk idx %d (%lld/%lld)\n", + __func__, chunk_idx, offset, dev->chunk_size); + /* error, should not happen */ + return false; + } + + /* truncate offset to the beginning of the stripe */ + offset = chunk * dev->chunk_size; + + /* disk offset of the stripe */ + disk_offset = div_u64(offset, dev->nr_children); + + child = &dev->children[chunk_idx]; + child->map(child, disk_offset, map); + + map->start += offset; + map->disk_offset += disk_offset; + map->len = dev->chunk_size; + return true; +} + +static int +bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d, + struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask); + + +static int +bl_parse_simple(struct nfs_server *server, struct pnfs_block_dev *d, + struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask) +{ + struct pnfs_block_volume *v = &volumes[idx]; + dev_t dev; + + dev = bl_resolve_deviceid(server, v, gfp_mask); + if (!dev) + return -EIO; + + d->bdev = blkdev_get_by_dev(dev, FMODE_READ, NULL); + if (IS_ERR(d->bdev)) { + printk(KERN_WARNING "pNFS: failed to open device %d:%d (%ld)\n", + MAJOR(dev), MINOR(dev), PTR_ERR(d->bdev)); + return PTR_ERR(d->bdev); + } + + + d->len = i_size_read(d->bdev->bd_inode); + d->map = bl_map_simple; + + printk(KERN_INFO "pNFS: using block device %s\n", + d->bdev->bd_disk->disk_name); + return 0; +} + +static int +bl_parse_slice(struct nfs_server *server, struct pnfs_block_dev *d, + struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask) +{ + struct pnfs_block_volume *v = &volumes[idx]; + int ret; + + ret = bl_parse_deviceid(server, d, volumes, v->slice.volume, gfp_mask); + if (ret) + return ret; + + d->disk_offset = v->slice.start; + d->len = v->slice.len; + return 0; +} + +static int +bl_parse_concat(struct nfs_server *server, struct pnfs_block_dev *d, + struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask) +{ + struct pnfs_block_volume *v = &volumes[idx]; + u64 len = 0; + int ret, i; + + d->children = kcalloc(v->concat.volumes_count, + sizeof(struct pnfs_block_dev), GFP_KERNEL); + if (!d->children) + return -ENOMEM; + + for (i = 0; i < v->concat.volumes_count; i++) { + ret = bl_parse_deviceid(server, &d->children[i], + volumes, v->concat.volumes[i], gfp_mask); + if (ret) + return ret; + + d->nr_children++; + d->children[i].start += len; + len += d->children[i].len; + } + + d->len = len; + d->map = bl_map_concat; + return 0; +} + +static int +bl_parse_stripe(struct nfs_server *server, struct pnfs_block_dev *d, + struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask) +{ + struct pnfs_block_volume *v = &volumes[idx]; + u64 len = 0; + int ret, i; + + d->children = kcalloc(v->stripe.volumes_count, + sizeof(struct pnfs_block_dev), GFP_KERNEL); + if (!d->children) + return -ENOMEM; + + for (i = 0; i < v->stripe.volumes_count; i++) { + ret = bl_parse_deviceid(server, &d->children[i], + volumes, v->stripe.volumes[i], gfp_mask); + if (ret) + return ret; + + d->nr_children++; + len += d->children[i].len; + } + + d->len = len; + d->chunk_size = v->stripe.chunk_size; + d->map = bl_map_stripe; + return 0; +} + +static int +bl_parse_deviceid(struct nfs_server *server, struct pnfs_block_dev *d, + struct pnfs_block_volume *volumes, int idx, gfp_t gfp_mask) +{ + switch (volumes[idx].type) { + case PNFS_BLOCK_VOLUME_SIMPLE: + return bl_parse_simple(server, d, volumes, idx, gfp_mask); + case PNFS_BLOCK_VOLUME_SLICE: + return bl_parse_slice(server, d, volumes, idx, gfp_mask); + case PNFS_BLOCK_VOLUME_CONCAT: + return bl_parse_concat(server, d, volumes, idx, gfp_mask); + case PNFS_BLOCK_VOLUME_STRIPE: + return bl_parse_stripe(server, d, volumes, idx, gfp_mask); + default: + dprintk("unsupported volume type: %d\n", volumes[idx].type); + return -EIO; + } +} + +struct nfs4_deviceid_node * +bl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, + gfp_t gfp_mask) +{ + struct nfs4_deviceid_node *node = NULL; + struct pnfs_block_volume *volumes; + struct pnfs_block_dev *top; + struct xdr_stream xdr; + struct xdr_buf buf; + struct page *scratch; + int nr_volumes, ret, i; + __be32 *p; + + scratch = alloc_page(gfp_mask); + if (!scratch) + goto out; + + xdr_init_decode_pages(&xdr, &buf, pdev->pages, pdev->pglen); + xdr_set_scratch_buffer(&xdr, page_address(scratch), PAGE_SIZE); + + p = xdr_inline_decode(&xdr, sizeof(__be32)); + if (!p) + goto out_free_scratch; + nr_volumes = be32_to_cpup(p++); + + volumes = kcalloc(nr_volumes, sizeof(struct pnfs_block_volume), + gfp_mask); + if (!volumes) + goto out_free_scratch; + + for (i = 0; i < nr_volumes; i++) { + ret = nfs4_block_decode_volume(&xdr, &volumes[i]); + if (ret < 0) + goto out_free_volumes; + } + + top = kzalloc(sizeof(*top), gfp_mask); + if (!top) + goto out_free_volumes; + + ret = bl_parse_deviceid(server, top, volumes, nr_volumes - 1, gfp_mask); + if (ret) { + bl_free_device(top); + kfree(top); + goto out_free_volumes; + } + + node = &top->node; + nfs4_init_deviceid_node(node, server, &pdev->dev_id); + +out_free_volumes: + kfree(volumes); +out_free_scratch: + __free_page(scratch); +out: + return node; +} diff --git a/fs/nfs/blocklayout/extent_tree.c b/fs/nfs/blocklayout/extent_tree.c new file mode 100644 index 000000000000..31d0b5e53dfd --- /dev/null +++ b/fs/nfs/blocklayout/extent_tree.c @@ -0,0 +1,602 @@ +/* + * Copyright (c) 2014 Christoph Hellwig. + */ + +#include <linux/vmalloc.h> + +#include "blocklayout.h" + +#define NFSDBG_FACILITY NFSDBG_PNFS_LD + +static inline struct pnfs_block_extent * +ext_node(struct rb_node *node) +{ + return rb_entry(node, struct pnfs_block_extent, be_node); +} + +static struct pnfs_block_extent * +ext_tree_first(struct rb_root *root) +{ + struct rb_node *node = rb_first(root); + return node ? ext_node(node) : NULL; +} + +static struct pnfs_block_extent * +ext_tree_prev(struct pnfs_block_extent *be) +{ + struct rb_node *node = rb_prev(&be->be_node); + return node ? ext_node(node) : NULL; +} + +static struct pnfs_block_extent * +ext_tree_next(struct pnfs_block_extent *be) +{ + struct rb_node *node = rb_next(&be->be_node); + return node ? ext_node(node) : NULL; +} + +static inline sector_t +ext_f_end(struct pnfs_block_extent *be) +{ + return be->be_f_offset + be->be_length; +} + +static struct pnfs_block_extent * +__ext_tree_search(struct rb_root *root, sector_t start) +{ + struct rb_node *node = root->rb_node; + struct pnfs_block_extent *be = NULL; + + while (node) { + be = ext_node(node); + if (start < be->be_f_offset) + node = node->rb_left; + else if (start >= ext_f_end(be)) + node = node->rb_right; + else + return be; + } + + if (be) { + if (start < be->be_f_offset) + return be; + + if (start >= ext_f_end(be)) + return ext_tree_next(be); + } + + return NULL; +} + +static bool +ext_can_merge(struct pnfs_block_extent *be1, struct pnfs_block_extent *be2) +{ + if (be1->be_state != be2->be_state) + return false; + if (be1->be_device != be2->be_device) + return false; + + if (be1->be_f_offset + be1->be_length != be2->be_f_offset) + return false; + + if (be1->be_state != PNFS_BLOCK_NONE_DATA && + (be1->be_v_offset + be1->be_length != be2->be_v_offset)) + return false; + + if (be1->be_state == PNFS_BLOCK_INVALID_DATA && + be1->be_tag != be2->be_tag) + return false; + + return true; +} + +static struct pnfs_block_extent * +ext_try_to_merge_left(struct rb_root *root, struct pnfs_block_extent *be) +{ + struct pnfs_block_extent *left = ext_tree_prev(be); + + if (left && ext_can_merge(left, be)) { + left->be_length += be->be_length; + rb_erase(&be->be_node, root); + nfs4_put_deviceid_node(be->be_device); + kfree(be); + return left; + } + + return be; +} + +static struct pnfs_block_extent * +ext_try_to_merge_right(struct rb_root *root, struct pnfs_block_extent *be) +{ + struct pnfs_block_extent *right = ext_tree_next(be); + + if (right && ext_can_merge(be, right)) { + be->be_length += right->be_length; + rb_erase(&right->be_node, root); + nfs4_put_deviceid_node(right->be_device); + kfree(right); + } + + return be; +} + +static void +__ext_tree_insert(struct rb_root *root, + struct pnfs_block_extent *new, bool merge_ok) +{ + struct rb_node **p = &root->rb_node, *parent = NULL; + struct pnfs_block_extent *be; + + while (*p) { + parent = *p; + be = ext_node(parent); + + if (new->be_f_offset < be->be_f_offset) { + if (merge_ok && ext_can_merge(new, be)) { + be->be_f_offset = new->be_f_offset; + if (be->be_state != PNFS_BLOCK_NONE_DATA) + be->be_v_offset = new->be_v_offset; + be->be_length += new->be_length; + be = ext_try_to_merge_left(root, be); + goto free_new; + } + p = &(*p)->rb_left; + } else if (new->be_f_offset >= ext_f_end(be)) { + if (merge_ok && ext_can_merge(be, new)) { + be->be_length += new->be_length; + be = ext_try_to_merge_right(root, be); + goto free_new; + } + p = &(*p)->rb_right; + } else { + BUG(); + } + } + + rb_link_node(&new->be_node, parent, p); + rb_insert_color(&new->be_node, root); + return; +free_new: + nfs4_put_deviceid_node(new->be_device); + kfree(new); +} + +static int +__ext_tree_remove(struct rb_root *root, sector_t start, sector_t end) +{ + struct pnfs_block_extent *be; + sector_t len1 = 0, len2 = 0; + sector_t orig_v_offset; + sector_t orig_len; + + be = __ext_tree_search(root, start); + if (!be) + return 0; + if (be->be_f_offset >= end) + return 0; + + orig_v_offset = be->be_v_offset; + orig_len = be->be_length; + + if (start > be->be_f_offset) + len1 = start - be->be_f_offset; + if (ext_f_end(be) > end) + len2 = ext_f_end(be) - end; + + if (len2 > 0) { + if (len1 > 0) { + struct pnfs_block_extent *new; + + new = kzalloc(sizeof(*new), GFP_ATOMIC); + if (!new) + return -ENOMEM; + + be->be_length = len1; + + new->be_f_offset = end; + if (be->be_state != PNFS_BLOCK_NONE_DATA) { + new->be_v_offset = + orig_v_offset + orig_len - len2; + } + new->be_length = len2; + new->be_state = be->be_state; + new->be_tag = be->be_tag; + new->be_device = nfs4_get_deviceid(be->be_device); + + __ext_tree_insert(root, new, true); + } else { + be->be_f_offset = end; + if (be->be_state != PNFS_BLOCK_NONE_DATA) { + be->be_v_offset = + orig_v_offset + orig_len - len2; + } + be->be_length = len2; + } + } else { + if (len1 > 0) { + be->be_length = len1; + be = ext_tree_next(be); + } + + while (be && ext_f_end(be) <= end) { + struct pnfs_block_extent *next = ext_tree_next(be); + + rb_erase(&be->be_node, root); + nfs4_put_deviceid_node(be->be_device); + kfree(be); + be = next; + } + + if (be && be->be_f_offset < end) { + len1 = ext_f_end(be) - end; + be->be_f_offset = end; + if (be->be_state != PNFS_BLOCK_NONE_DATA) + be->be_v_offset += be->be_length - len1; + be->be_length = len1; + } + } + + return 0; +} + +int +ext_tree_insert(struct pnfs_block_layout *bl, struct pnfs_block_extent *new) +{ + struct pnfs_block_extent *be; + struct rb_root *root; + int err = 0; + + switch (new->be_state) { + case PNFS_BLOCK_READWRITE_DATA: + case PNFS_BLOCK_INVALID_DATA: + root = &bl->bl_ext_rw; + break; + case PNFS_BLOCK_READ_DATA: + case PNFS_BLOCK_NONE_DATA: + root = &bl->bl_ext_ro; + break; + default: + dprintk("invalid extent type\n"); + return -EINVAL; + } + + spin_lock(&bl->bl_ext_lock); +retry: + be = __ext_tree_search(root, new->be_f_offset); + if (!be || be->be_f_offset >= ext_f_end(new)) { + __ext_tree_insert(root, new, true); + } else if (new->be_f_offset >= be->be_f_offset) { + if (ext_f_end(new) <= ext_f_end(be)) { + nfs4_put_deviceid_node(new->be_device); + kfree(new); + } else { + sector_t new_len = ext_f_end(new) - ext_f_end(be); + sector_t diff = new->be_length - new_len; + + new->be_f_offset += diff; + new->be_v_offset += diff; + new->be_length = new_len; + goto retry; + } + } else if (ext_f_end(new) <= ext_f_end(be)) { + new->be_length = be->be_f_offset - new->be_f_offset; + __ext_tree_insert(root, new, true); + } else { + struct pnfs_block_extent *split; + sector_t new_len = ext_f_end(new) - ext_f_end(be); + sector_t diff = new->be_length - new_len; + + split = kmemdup(new, sizeof(*new), GFP_ATOMIC); + if (!split) { + err = -EINVAL; + goto out; + } + + split->be_length = be->be_f_offset - split->be_f_offset; + split->be_device = nfs4_get_deviceid(new->be_device); + __ext_tree_insert(root, split, true); + + new->be_f_offset += diff; + new->be_v_offset += diff; + new->be_length = new_len; + goto retry; + } +out: + spin_unlock(&bl->bl_ext_lock); + return err; +} + +static bool +__ext_tree_lookup(struct rb_root *root, sector_t isect, + struct pnfs_block_extent *ret) +{ + struct rb_node *node; + struct pnfs_block_extent *be; + + node = root->rb_node; + while (node) { + be = ext_node(node); + if (isect < be->be_f_offset) + node = node->rb_left; + else if (isect >= ext_f_end(be)) + node = node->rb_right; + else { + *ret = *be; + return true; + } + } + + return false; +} + +bool +ext_tree_lookup(struct pnfs_block_layout *bl, sector_t isect, + struct pnfs_block_extent *ret, bool rw) +{ + bool found = false; + + spin_lock(&bl->bl_ext_lock); + if (!rw) + found = __ext_tree_lookup(&bl->bl_ext_ro, isect, ret); + if (!found) + found = __ext_tree_lookup(&bl->bl_ext_rw, isect, ret); + spin_unlock(&bl->bl_ext_lock); + + return found; +} + +int ext_tree_remove(struct pnfs_block_layout *bl, bool rw, + sector_t start, sector_t end) +{ + int err, err2; + + spin_lock(&bl->bl_ext_lock); + err = __ext_tree_remove(&bl->bl_ext_ro, start, end); + if (rw) { + err2 = __ext_tree_remove(&bl->bl_ext_rw, start, end); + if (!err) + err = err2; + } + spin_unlock(&bl->bl_ext_lock); + + return err; +} + +static int +ext_tree_split(struct rb_root *root, struct pnfs_block_extent *be, + sector_t split) +{ + struct pnfs_block_extent *new; + sector_t orig_len = be->be_length; + + new = kzalloc(sizeof(*new), GFP_ATOMIC); + if (!new) + return -ENOMEM; + + be->be_length = split - be->be_f_offset; + + new->be_f_offset = split; + if (be->be_state != PNFS_BLOCK_NONE_DATA) + new->be_v_offset = be->be_v_offset + be->be_length; + new->be_length = orig_len - be->be_length; + new->be_state = be->be_state; + new->be_tag = be->be_tag; + new->be_device = nfs4_get_deviceid(be->be_device); + + __ext_tree_insert(root, new, false); + return 0; +} + +int +ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start, + sector_t len) +{ + struct rb_root *root = &bl->bl_ext_rw; + sector_t end = start + len; + struct pnfs_block_extent *be; + int err = 0; + + spin_lock(&bl->bl_ext_lock); + /* + * First remove all COW extents or holes from written to range. + */ + err = __ext_tree_remove(&bl->bl_ext_ro, start, end); + if (err) + goto out; + + /* + * Then mark all invalid extents in the range as written to. + */ + for (be = __ext_tree_search(root, start); be; be = ext_tree_next(be)) { + if (be->be_f_offset >= end) + break; + + if (be->be_state != PNFS_BLOCK_INVALID_DATA || be->be_tag) + continue; + + if (be->be_f_offset < start) { + struct pnfs_block_extent *left = ext_tree_prev(be); + + if (left && ext_can_merge(left, be)) { + sector_t diff = start - be->be_f_offset; + + left->be_length += diff; + + be->be_f_offset += diff; + be->be_v_offset += diff; + be->be_length -= diff; + } else { + err = ext_tree_split(root, be, start); + if (err) + goto out; + } + } + + if (ext_f_end(be) > end) { + struct pnfs_block_extent *right = ext_tree_next(be); + + if (right && ext_can_merge(be, right)) { + sector_t diff = end - be->be_f_offset; + + be->be_length -= diff; + + right->be_f_offset -= diff; + right->be_v_offset -= diff; + right->be_length += diff; + } else { + err = ext_tree_split(root, be, end); + if (err) + goto out; + } + } + + if (be->be_f_offset >= start && ext_f_end(be) <= end) { + be->be_tag = EXTENT_WRITTEN; + be = ext_try_to_merge_left(root, be); + be = ext_try_to_merge_right(root, be); + } + } +out: + spin_unlock(&bl->bl_ext_lock); + return err; +} + +static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg, + size_t buffer_size) +{ + if (arg->layoutupdate_pages != &arg->layoutupdate_page) { + int nr_pages = DIV_ROUND_UP(buffer_size, PAGE_SIZE), i; + + for (i = 0; i < nr_pages; i++) + put_page(arg->layoutupdate_pages[i]); + kfree(arg->layoutupdate_pages); + } else { + put_page(arg->layoutupdate_page); + } +} + +static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p, + size_t buffer_size, size_t *count) +{ + struct pnfs_block_extent *be; + int ret = 0; + + spin_lock(&bl->bl_ext_lock); + for (be = ext_tree_first(&bl->bl_ext_rw); be; be = ext_tree_next(be)) { + if (be->be_state != PNFS_BLOCK_INVALID_DATA || + be->be_tag != EXTENT_WRITTEN) + continue; + + (*count)++; + if (*count * BL_EXTENT_SIZE > buffer_size) { + /* keep counting.. */ + ret = -ENOSPC; + continue; + } + + p = xdr_encode_opaque_fixed(p, be->be_device->deviceid.data, + NFS4_DEVICEID4_SIZE); + p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT); + p = xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT); + p = xdr_encode_hyper(p, 0LL); + *p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA); + + be->be_tag = EXTENT_COMMITTING; + } + spin_unlock(&bl->bl_ext_lock); + + return ret; +} + +int +ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg) +{ + struct pnfs_block_layout *bl = BLK_LO2EXT(NFS_I(arg->inode)->layout); + size_t count = 0, buffer_size = PAGE_SIZE; + __be32 *start_p; + int ret; + + dprintk("%s enter\n", __func__); + + arg->layoutupdate_page = alloc_page(GFP_NOFS); + if (!arg->layoutupdate_page) + return -ENOMEM; + start_p = page_address(arg->layoutupdate_page); + arg->layoutupdate_pages = &arg->layoutupdate_page; + +retry: + ret = ext_tree_encode_commit(bl, start_p + 1, buffer_size, &count); + if (unlikely(ret)) { + ext_tree_free_commitdata(arg, buffer_size); + + buffer_size = sizeof(__be32) + BL_EXTENT_SIZE * count; + count = 0; + + arg->layoutupdate_pages = + kcalloc(DIV_ROUND_UP(buffer_size, PAGE_SIZE), + sizeof(struct page *), GFP_NOFS); + if (!arg->layoutupdate_pages) + return -ENOMEM; + + start_p = __vmalloc(buffer_size, GFP_NOFS, PAGE_KERNEL); + if (!start_p) { + kfree(arg->layoutupdate_pages); + return -ENOMEM; + } + + goto retry; + } + + *start_p = cpu_to_be32(count); + arg->layoutupdate_len = sizeof(__be32) + BL_EXTENT_SIZE * count; + + if (unlikely(arg->layoutupdate_pages != &arg->layoutupdate_page)) { + __be32 *p = start_p; + int i = 0; + + for (p = start_p; + p < start_p + arg->layoutupdate_len; + p += PAGE_SIZE) { + arg->layoutupdate_pages[i++] = vmalloc_to_page(p); + } + } + + dprintk("%s found %zu ranges\n", __func__, count); + return 0; +} + +void +ext_tree_mark_committed(struct nfs4_layoutcommit_args *arg, int status) +{ + struct pnfs_block_layout *bl = BLK_LO2EXT(NFS_I(arg->inode)->layout); + struct rb_root *root = &bl->bl_ext_rw; + struct pnfs_block_extent *be; + + dprintk("%s status %d\n", __func__, status); + + ext_tree_free_commitdata(arg, arg->layoutupdate_len); + + spin_lock(&bl->bl_ext_lock); + for (be = ext_tree_first(root); be; be = ext_tree_next(be)) { + if (be->be_state != PNFS_BLOCK_INVALID_DATA || + be->be_tag != EXTENT_COMMITTING) + continue; + + if (status) { + /* + * Mark as written and try again. + * + * XXX: some real error handling here wouldn't hurt.. + */ + be->be_tag = EXTENT_WRITTEN; + } else { + be->be_state = PNFS_BLOCK_READWRITE_DATA; + be->be_tag = 0; + } + + be = ext_try_to_merge_left(root, be); + be = ext_try_to_merge_right(root, be); + } + spin_unlock(&bl->bl_ext_lock); +} diff --git a/fs/nfs/blocklayout/extents.c b/fs/nfs/blocklayout/extents.c deleted file mode 100644 index 4d0161442565..000000000000 --- a/fs/nfs/blocklayout/extents.c +++ /dev/null @@ -1,908 +0,0 @@ -/* - * linux/fs/nfs/blocklayout/blocklayout.h - * - * Module for the NFSv4.1 pNFS block layout driver. - * - * Copyright (c) 2006 The Regents of the University of Michigan. - * All rights reserved. - * - * Andy Adamson <andros@citi.umich.edu> - * Fred Isaman <iisaman@umich.edu> - * - * permission is granted to use, copy, create derivative works and - * redistribute this software and such derivative works for any purpose, - * so long as the name of the university of michigan is not used in - * any advertising or publicity pertaining to the use or distribution - * of this software without specific, written prior authorization. if - * the above copyright notice or any other identification of the - * university of michigan is included in any copy of any portion of - * this software, then the disclaimer below must also be included. - * - * this software is provided as is, without representation from the - * university of michigan as to its fitness for any purpose, and without - * warranty by the university of michigan of any kind, either express - * or implied, including without limitation the implied warranties of - * merchantability and fitness for a particular purpose. the regents - * of the university of michigan shall not be liable for any damages, - * including special, indirect, incidental, or consequential damages, - * with respect to any claim arising out or in connection with the use - * of the software, even if it has been or is hereafter advised of the - * possibility of such damages. - */ - -#include "blocklayout.h" -#define NFSDBG_FACILITY NFSDBG_PNFS_LD - -/* Bit numbers */ -#define EXTENT_INITIALIZED 0 -#define EXTENT_WRITTEN 1 -#define EXTENT_IN_COMMIT 2 -#define INTERNAL_EXISTS MY_MAX_TAGS -#define INTERNAL_MASK ((1 << INTERNAL_EXISTS) - 1) - -/* Returns largest t<=s s.t. t%base==0 */ -static inline sector_t normalize(sector_t s, int base) -{ - sector_t tmp = s; /* Since do_div modifies its argument */ - return s - sector_div(tmp, base); -} - -static inline sector_t normalize_up(sector_t s, int base) -{ - return normalize(s + base - 1, base); -} - -/* Complete stub using list while determine API wanted */ - -/* Returns tags, or negative */ -static int32_t _find_entry(struct my_tree *tree, u64 s) -{ - struct pnfs_inval_tracking *pos; - - dprintk("%s(%llu) enter\n", __func__, s); - list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) { - if (pos->it_sector > s) - continue; - else if (pos->it_sector == s) - return pos->it_tags & INTERNAL_MASK; - else - break; - } - return -ENOENT; -} - -static inline -int _has_tag(struct my_tree *tree, u64 s, int32_t tag) -{ - int32_t tags; - - dprintk("%s(%llu, %i) enter\n", __func__, s, tag); - s = normalize(s, tree->mtt_step_size); - tags = _find_entry(tree, s); - if ((tags < 0) || !(tags & (1 << tag))) - return 0; - else - return 1; -} - -/* Creates entry with tag, or if entry already exists, unions tag to it. - * If storage is not NULL, newly created entry will use it. - * Returns number of entries added, or negative on error. - */ -static int _add_entry(struct my_tree *tree, u64 s, int32_t tag, - struct pnfs_inval_tracking *storage) -{ - int found = 0; - struct pnfs_inval_tracking *pos; - - dprintk("%s(%llu, %i, %p) enter\n", __func__, s, tag, storage); - list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) { - if (pos->it_sector > s) - continue; - else if (pos->it_sector == s) { - found = 1; - break; - } else - break; - } - if (found) { - pos->it_tags |= (1 << tag); - return 0; - } else { - struct pnfs_inval_tracking *new; - new = storage; - new->it_sector = s; - new->it_tags = (1 << tag); - list_add(&new->it_link, &pos->it_link); - return 1; - } -} - -/* XXXX Really want option to not create */ -/* Over range, unions tag with existing entries, else creates entry with tag */ -static int _set_range(struct my_tree *tree, int32_t tag, u64 s, u64 length) -{ - u64 i; - - dprintk("%s(%i, %llu, %llu) enter\n", __func__, tag, s, length); - for (i = normalize(s, tree->mtt_step_size); i < s + length; - i += tree->mtt_step_size) - if (_add_entry(tree, i, tag, NULL)) - return -ENOMEM; - return 0; -} - -/* Ensure that future operations on given range of tree will not malloc */ -static int _preload_range(struct pnfs_inval_markings *marks, - u64 offset, u64 length) -{ - u64 start, end, s; - int count, i, used = 0, status = -ENOMEM; - struct pnfs_inval_tracking **storage; - struct my_tree *tree = &marks->im_tree; - - dprintk("%s(%llu, %llu) enter\n", __func__, offset, length); - start = normalize(offset, tree->mtt_step_size); - end = normalize_up(offset + length, tree->mtt_step_size); - count = (int)(end - start) / (int)tree->mtt_step_size; - - /* Pre-malloc what memory we might need */ - storage = kcalloc(count, sizeof(*storage), GFP_NOFS); - if (!storage) - return -ENOMEM; - for (i = 0; i < count; i++) { - storage[i] = kmalloc(sizeof(struct pnfs_inval_tracking), - GFP_NOFS); - if (!storage[i]) - goto out_cleanup; - } - - spin_lock_bh(&marks->im_lock); - for (s = start; s < end; s += tree->mtt_step_size) - used += _add_entry(tree, s, INTERNAL_EXISTS, storage[used]); - spin_unlock_bh(&marks->im_lock); - - status = 0; - - out_cleanup: - for (i = used; i < count; i++) { - if (!storage[i]) - break; - kfree(storage[i]); - } - kfree(storage); - return status; -} - -/* We are relying on page lock to serialize this */ -int bl_is_sector_init(struct pnfs_inval_markings *marks, sector_t isect) -{ - int rv; - - spin_lock_bh(&marks->im_lock); - rv = _has_tag(&marks->im_tree, isect, EXTENT_INITIALIZED); - spin_unlock_bh(&marks->im_lock); - return rv; -} - -/* Assume start, end already sector aligned */ -static int -_range_has_tag(struct my_tree *tree, u64 start, u64 end, int32_t tag) -{ - struct pnfs_inval_tracking *pos; - u64 expect = 0; - - dprintk("%s(%llu, %llu, %i) enter\n", __func__, start, end, tag); - list_for_each_entry_reverse(pos, &tree->mtt_stub, it_link) { - if (pos->it_sector >= end) - continue; - if (!expect) { - if ((pos->it_sector == end - tree->mtt_step_size) && - (pos->it_tags & (1 << tag))) { - expect = pos->it_sector - tree->mtt_step_size; - if (pos->it_sector < tree->mtt_step_size || expect < start) - return 1; - continue; - } else { - return 0; - } - } - if (pos->it_sector != expect || !(pos->it_tags & (1 << tag))) - return 0; - expect -= tree->mtt_step_size; - if (expect < start) - return 1; - } - return 0; -} - -static int is_range_written(struct pnfs_inval_markings *marks, - sector_t start, sector_t end) -{ - int rv; - - spin_lock_bh(&marks->im_lock); - rv = _range_has_tag(&marks->im_tree, start, end, EXTENT_WRITTEN); - spin_unlock_bh(&marks->im_lock); - return rv; -} - -/* Marks sectors in [offest, offset_length) as having been initialized. - * All lengths are step-aligned, where step is min(pagesize, blocksize). - * Currently assumes offset is page-aligned - */ -int bl_mark_sectors_init(struct pnfs_inval_markings *marks, - sector_t offset, sector_t length) -{ - sector_t start, end; - - dprintk("%s(offset=%llu,len=%llu) enter\n", - __func__, (u64)offset, (u64)length); - - start = normalize(offset, marks->im_block_size); - end = normalize_up(offset + length, marks->im_block_size); - if (_preload_range(marks, start, end - start)) - goto outerr; - - spin_lock_bh(&marks->im_lock); - if (_set_range(&marks->im_tree, EXTENT_INITIALIZED, offset, length)) - goto out_unlock; - spin_unlock_bh(&marks->im_lock); - - return 0; - -out_unlock: - spin_unlock_bh(&marks->im_lock); -outerr: - return -ENOMEM; -} - -/* Marks sectors in [offest, offset+length) as having been written to disk. - * All lengths should be block aligned. - */ -static int mark_written_sectors(struct pnfs_inval_markings *marks, - sector_t offset, sector_t length) -{ - int status; - - dprintk("%s(offset=%llu,len=%llu) enter\n", __func__, - (u64)offset, (u64)length); - spin_lock_bh(&marks->im_lock); - status = _set_range(&marks->im_tree, EXTENT_WRITTEN, offset, length); - spin_unlock_bh(&marks->im_lock); - return status; -} - -static void print_short_extent(struct pnfs_block_short_extent *be) -{ - dprintk("PRINT SHORT EXTENT extent %p\n", be); - if (be) { - dprintk(" be_f_offset %llu\n", (u64)be->bse_f_offset); - dprintk(" be_length %llu\n", (u64)be->bse_length); - } -} - -static void print_clist(struct list_head *list, unsigned int count) -{ - struct pnfs_block_short_extent *be; - unsigned int i = 0; - - ifdebug(FACILITY) { - printk(KERN_DEBUG "****************\n"); - printk(KERN_DEBUG "Extent list looks like:\n"); - list_for_each_entry(be, list, bse_node) { - i++; - print_short_extent(be); - } - if (i != count) - printk(KERN_DEBUG "\n\nExpected %u entries\n\n\n", count); - printk(KERN_DEBUG "****************\n"); - } -} - -/* Note: In theory, we should do more checking that devid's match between - * old and new, but if they don't, the lists are too corrupt to salvage anyway. - */ -/* Note this is very similar to bl_add_merge_extent */ -static void add_to_commitlist(struct pnfs_block_layout *bl, - struct pnfs_block_short_extent *new) -{ - struct list_head *clist = &bl->bl_commit; - struct pnfs_block_short_extent *old, *save; - sector_t end = new->bse_f_offset + new->bse_length; - - dprintk("%s enter\n", __func__); - print_short_extent(new); - print_clist(clist, bl->bl_count); - bl->bl_count++; - /* Scan for proper place to insert, extending new to the left - * as much as possible. - */ - list_for_each_entry_safe(old, save, clist, bse_node) { - if (new->bse_f_offset < old->bse_f_offset) - break; - if (end <= old->bse_f_offset + old->bse_length) { - /* Range is already in list */ - bl->bl_count--; - kfree(new); - return; - } else if (new->bse_f_offset <= - old->bse_f_offset + old->bse_length) { - /* new overlaps or abuts existing be */ - if (new->bse_mdev == old->bse_mdev) { - /* extend new to fully replace old */ - new->bse_length += new->bse_f_offset - - old->bse_f_offset; - new->bse_f_offset = old->bse_f_offset; - list_del(&old->bse_node); - bl->bl_count--; - kfree(old); - } - } - } - /* Note that if we never hit the above break, old will not point to a - * valid extent. However, in that case &old->bse_node==list. - */ - list_add_tail(&new->bse_node, &old->bse_node); - /* Scan forward for overlaps. If we find any, extend new and - * remove the overlapped extent. - */ - old = list_prepare_entry(new, clist, bse_node); - list_for_each_entry_safe_continue(old, save, clist, bse_node) { - if (end < old->bse_f_offset) - break; - /* new overlaps or abuts old */ - if (new->bse_mdev == old->bse_mdev) { - if (end < old->bse_f_offset + old->bse_length) { - /* extend new to fully cover old */ - end = old->bse_f_offset + old->bse_length; - new->bse_length = end - new->bse_f_offset; - } - list_del(&old->bse_node); - bl->bl_count--; - kfree(old); - } - } - dprintk("%s: after merging\n", __func__); - print_clist(clist, bl->bl_count); -} - -/* Note the range described by offset, length is guaranteed to be contained - * within be. - * new will be freed, either by this function or add_to_commitlist if they - * decide not to use it, or after LAYOUTCOMMIT uses it in the commitlist. - */ -int bl_mark_for_commit(struct pnfs_block_extent *be, - sector_t offset, sector_t length, - struct pnfs_block_short_extent *new) -{ - sector_t new_end, end = offset + length; - struct pnfs_block_layout *bl = container_of(be->be_inval, - struct pnfs_block_layout, - bl_inval); - - mark_written_sectors(be->be_inval, offset, length); - /* We want to add the range to commit list, but it must be - * block-normalized, and verified that the normalized range has - * been entirely written to disk. - */ - new->bse_f_offset = offset; - offset = normalize(offset, bl->bl_blocksize); - if (offset < new->bse_f_offset) { - if (is_range_written(be->be_inval, offset, new->bse_f_offset)) - new->bse_f_offset = offset; - else - new->bse_f_offset = offset + bl->bl_blocksize; - } - new_end = normalize_up(end, bl->bl_blocksize); - if (end < new_end) { - if (is_range_written(be->be_inval, end, new_end)) - end = new_end; - else - end = new_end - bl->bl_blocksize; - } - if (end <= new->bse_f_offset) { - kfree(new); - return 0; - } - new->bse_length = end - new->bse_f_offset; - new->bse_devid = be->be_devid; - new->bse_mdev = be->be_mdev; - - spin_lock(&bl->bl_ext_lock); - add_to_commitlist(bl, new); - spin_unlock(&bl->bl_ext_lock); - return 0; -} - -static void print_bl_extent(struct pnfs_block_extent *be) -{ - dprintk("PRINT EXTENT extent %p\n", be); - if (be) { - dprintk(" be_f_offset %llu\n", (u64)be->be_f_offset); - dprintk(" be_length %llu\n", (u64)be->be_length); - dprintk(" be_v_offset %llu\n", (u64)be->be_v_offset); - dprintk(" be_state %d\n", be->be_state); - } -} - -static void -destroy_extent(struct kref *kref) -{ - struct pnfs_block_extent *be; - - be = container_of(kref, struct pnfs_block_extent, be_refcnt); - dprintk("%s be=%p\n", __func__, be); - kfree(be); -} - -void -bl_put_extent(struct pnfs_block_extent *be) -{ - if (be) { - dprintk("%s enter %p (%i)\n", __func__, be, - atomic_read(&be->be_refcnt.refcount)); - kref_put(&be->be_refcnt, destroy_extent); - } -} - -struct pnfs_block_extent *bl_alloc_extent(void) -{ - struct pnfs_block_extent *be; - - be = kmalloc(sizeof(struct pnfs_block_extent), GFP_NOFS); - if (!be) - return NULL; - INIT_LIST_HEAD(&be->be_node); - kref_init(&be->be_refcnt); - be->be_inval = NULL; - return be; -} - -static void print_elist(struct list_head *list) -{ - struct pnfs_block_extent *be; - dprintk("****************\n"); - dprintk("Extent list looks like:\n"); - list_for_each_entry(be, list, be_node) { - print_bl_extent(be); - } - dprintk("****************\n"); -} - -static inline int -extents_consistent(struct pnfs_block_extent *old, struct pnfs_block_extent *new) -{ - /* Note this assumes new->be_f_offset >= old->be_f_offset */ - return (new->be_state == old->be_state) && - ((new->be_state == PNFS_BLOCK_NONE_DATA) || - ((new->be_v_offset - old->be_v_offset == - new->be_f_offset - old->be_f_offset) && - new->be_mdev == old->be_mdev)); -} - -/* Adds new to appropriate list in bl, modifying new and removing existing - * extents as appropriate to deal with overlaps. - * - * See bl_find_get_extent for list constraints. - * - * Refcount on new is already set. If end up not using it, or error out, - * need to put the reference. - * - * bl->bl_ext_lock is held by caller. - */ -int -bl_add_merge_extent(struct pnfs_block_layout *bl, - struct pnfs_block_extent *new) -{ - struct pnfs_block_extent *be, *tmp; - sector_t end = new->be_f_offset + new->be_length; - struct list_head *list; - - dprintk("%s enter with be=%p\n", __func__, new); - print_bl_extent(new); - list = &bl->bl_extents[bl_choose_list(new->be_state)]; - print_elist(list); - - /* Scan for proper place to insert, extending new to the left - * as much as possible. - */ - list_for_each_entry_safe_reverse(be, tmp, list, be_node) { - if (new->be_f_offset >= be->be_f_offset + be->be_length) - break; - if (new->be_f_offset >= be->be_f_offset) { - if (end <= be->be_f_offset + be->be_length) { - /* new is a subset of existing be*/ - if (extents_consistent(be, new)) { - dprintk("%s: new is subset, ignoring\n", - __func__); - bl_put_extent(new); - return 0; - } else { - goto out_err; - } - } else { - /* |<-- be -->| - * |<-- new -->| */ - if (extents_consistent(be, new)) { - /* extend new to fully replace be */ - new->be_length += new->be_f_offset - - be->be_f_offset; - new->be_f_offset = be->be_f_offset; - new->be_v_offset = be->be_v_offset; - dprintk("%s: removing %p\n", __func__, be); - list_del(&be->be_node); - bl_put_extent(be); - } else { - goto out_err; - } - } - } else if (end >= be->be_f_offset + be->be_length) { - /* new extent overlap existing be */ - if (extents_consistent(be, new)) { - /* extend new to fully replace be */ - dprintk("%s: removing %p\n", __func__, be); - list_del(&be->be_node); - bl_put_extent(be); - } else { - goto out_err; - } - } else if (end > be->be_f_offset) { - /* |<-- be -->| - *|<-- new -->| */ - if (extents_consistent(new, be)) { - /* extend new to fully replace be */ - new->be_length += be->be_f_offset + be->be_length - - new->be_f_offset - new->be_length; - dprintk("%s: removing %p\n", __func__, be); - list_del(&be->be_node); - bl_put_extent(be); - } else { - goto out_err; - } - } - } - /* Note that if we never hit the above break, be will not point to a - * valid extent. However, in that case &be->be_node==list. - */ - list_add(&new->be_node, &be->be_node); - dprintk("%s: inserting new\n", __func__); - print_elist(list); - /* FIXME - The per-list consistency checks have all been done, - * should now check cross-list consistency. - */ - return 0; - - out_err: - bl_put_extent(new); - return -EIO; -} - -/* Returns extent, or NULL. If a second READ extent exists, it is returned - * in cow_read, if given. - * - * The extents are kept in two seperate ordered lists, one for READ and NONE, - * one for READWRITE and INVALID. Within each list, we assume: - * 1. Extents are ordered by file offset. - * 2. For any given isect, there is at most one extents that matches. - */ -struct pnfs_block_extent * -bl_find_get_extent(struct pnfs_block_layout *bl, sector_t isect, - struct pnfs_block_extent **cow_read) -{ - struct pnfs_block_extent *be, *cow, *ret; - int i; - - dprintk("%s enter with isect %llu\n", __func__, (u64)isect); - cow = ret = NULL; - spin_lock(&bl->bl_ext_lock); - for (i = 0; i < EXTENT_LISTS; i++) { - list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) { - if (isect >= be->be_f_offset + be->be_length) - break; - if (isect >= be->be_f_offset) { - /* We have found an extent */ - dprintk("%s Get %p (%i)\n", __func__, be, - atomic_read(&be->be_refcnt.refcount)); - kref_get(&be->be_refcnt); - if (!ret) - ret = be; - else if (be->be_state != PNFS_BLOCK_READ_DATA) - bl_put_extent(be); - else - cow = be; - break; - } - } - if (ret && - (!cow_read || ret->be_state != PNFS_BLOCK_INVALID_DATA)) - break; - } - spin_unlock(&bl->bl_ext_lock); - if (cow_read) - *cow_read = cow; - print_bl_extent(ret); - return ret; -} - -/* Similar to bl_find_get_extent, but called with lock held, and ignores cow */ -static struct pnfs_block_extent * -bl_find_get_extent_locked(struct pnfs_block_layout *bl, sector_t isect) -{ - struct pnfs_block_extent *be, *ret = NULL; - int i; - - dprintk("%s enter with isect %llu\n", __func__, (u64)isect); - for (i = 0; i < EXTENT_LISTS; i++) { - if (ret) - break; - list_for_each_entry_reverse(be, &bl->bl_extents[i], be_node) { - if (isect >= be->be_f_offset + be->be_length) - break; - if (isect >= be->be_f_offset) { - /* We have found an extent */ - dprintk("%s Get %p (%i)\n", __func__, be, - atomic_read(&be->be_refcnt.refcount)); - kref_get(&be->be_refcnt); - ret = be; - break; - } - } - } - print_bl_extent(ret); - return ret; -} - -int -encode_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, - struct xdr_stream *xdr, - const struct nfs4_layoutcommit_args *arg) -{ - struct pnfs_block_short_extent *lce, *save; - unsigned int count = 0; - __be32 *p, *xdr_start; - - dprintk("%s enter\n", __func__); - /* BUG - creation of bl_commit is buggy - need to wait for - * entire block to be marked WRITTEN before it can be added. - */ - spin_lock(&bl->bl_ext_lock); - /* Want to adjust for possible truncate */ - /* We now want to adjust argument range */ - - /* XDR encode the ranges found */ - xdr_start = xdr_reserve_space(xdr, 8); - if (!xdr_start) - goto out; - list_for_each_entry_safe(lce, save, &bl->bl_commit, bse_node) { - p = xdr_reserve_space(xdr, 7 * 4 + sizeof(lce->bse_devid.data)); - if (!p) - break; - p = xdr_encode_opaque_fixed(p, lce->bse_devid.data, NFS4_DEVICEID4_SIZE); - p = xdr_encode_hyper(p, lce->bse_f_offset << SECTOR_SHIFT); - p = xdr_encode_hyper(p, lce->bse_length << SECTOR_SHIFT); - p = xdr_encode_hyper(p, 0LL); - *p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA); - list_move_tail(&lce->bse_node, &bl->bl_committing); - bl->bl_count--; - count++; - } - xdr_start[0] = cpu_to_be32((xdr->p - xdr_start - 1) * 4); - xdr_start[1] = cpu_to_be32(count); -out: - spin_unlock(&bl->bl_ext_lock); - dprintk("%s found %i ranges\n", __func__, count); - return 0; -} - -/* Helper function to set_to_rw that initialize a new extent */ -static void -_prep_new_extent(struct pnfs_block_extent *new, - struct pnfs_block_extent *orig, - sector_t offset, sector_t length, int state) -{ - kref_init(&new->be_refcnt); - /* don't need to INIT_LIST_HEAD(&new->be_node) */ - memcpy(&new->be_devid, &orig->be_devid, sizeof(struct nfs4_deviceid)); - new->be_mdev = orig->be_mdev; - new->be_f_offset = offset; - new->be_length = length; - new->be_v_offset = orig->be_v_offset - orig->be_f_offset + offset; - new->be_state = state; - new->be_inval = orig->be_inval; -} - -/* Tries to merge be with extent in front of it in list. - * Frees storage if not used. - */ -static struct pnfs_block_extent * -_front_merge(struct pnfs_block_extent *be, struct list_head *head, - struct pnfs_block_extent *storage) -{ - struct pnfs_block_extent *prev; - - if (!storage) - goto no_merge; - if (&be->be_node == head || be->be_node.prev == head) - goto no_merge; - prev = list_entry(be->be_node.prev, struct pnfs_block_extent, be_node); - if ((prev->be_f_offset + prev->be_length != be->be_f_offset) || - !extents_consistent(prev, be)) - goto no_merge; - _prep_new_extent(storage, prev, prev->be_f_offset, - prev->be_length + be->be_length, prev->be_state); - list_replace(&prev->be_node, &storage->be_node); - bl_put_extent(prev); - list_del(&be->be_node); - bl_put_extent(be); - return storage; - - no_merge: - kfree(storage); - return be; -} - -static u64 -set_to_rw(struct pnfs_block_layout *bl, u64 offset, u64 length) -{ - u64 rv = offset + length; - struct pnfs_block_extent *be, *e1, *e2, *e3, *new, *old; - struct pnfs_block_extent *children[3]; - struct pnfs_block_extent *merge1 = NULL, *merge2 = NULL; - int i = 0, j; - - dprintk("%s(%llu, %llu)\n", __func__, offset, length); - /* Create storage for up to three new extents e1, e2, e3 */ - e1 = kmalloc(sizeof(*e1), GFP_ATOMIC); - e2 = kmalloc(sizeof(*e2), GFP_ATOMIC); - e3 = kmalloc(sizeof(*e3), GFP_ATOMIC); - /* BUG - we are ignoring any failure */ - if (!e1 || !e2 || !e3) - goto out_nosplit; - - spin_lock(&bl->bl_ext_lock); - be = bl_find_get_extent_locked(bl, offset); - rv = be->be_f_offset + be->be_length; - if (be->be_state != PNFS_BLOCK_INVALID_DATA) { - spin_unlock(&bl->bl_ext_lock); - goto out_nosplit; - } - /* Add e* to children, bumping e*'s krefs */ - if (be->be_f_offset != offset) { - _prep_new_extent(e1, be, be->be_f_offset, - offset - be->be_f_offset, - PNFS_BLOCK_INVALID_DATA); - children[i++] = e1; - print_bl_extent(e1); - } else - merge1 = e1; - _prep_new_extent(e2, be, offset, - min(length, be->be_f_offset + be->be_length - offset), - PNFS_BLOCK_READWRITE_DATA); - children[i++] = e2; - print_bl_extent(e2); - if (offset + length < be->be_f_offset + be->be_length) { - _prep_new_extent(e3, be, e2->be_f_offset + e2->be_length, - be->be_f_offset + be->be_length - - offset - length, - PNFS_BLOCK_INVALID_DATA); - children[i++] = e3; - print_bl_extent(e3); - } else - merge2 = e3; - - /* Remove be from list, and insert the e* */ - /* We don't get refs on e*, since this list is the base reference - * set when init'ed. - */ - if (i < 3) - children[i] = NULL; - new = children[0]; - list_replace(&be->be_node, &new->be_node); - bl_put_extent(be); - new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge1); - for (j = 1; j < i; j++) { - old = new; - new = children[j]; - list_add(&new->be_node, &old->be_node); - } - if (merge2) { - /* This is a HACK, should just create a _back_merge function */ - new = list_entry(new->be_node.next, - struct pnfs_block_extent, be_node); - new = _front_merge(new, &bl->bl_extents[RW_EXTENT], merge2); - } - spin_unlock(&bl->bl_ext_lock); - - /* Since we removed the base reference above, be is now scheduled for - * destruction. - */ - bl_put_extent(be); - dprintk("%s returns %llu after split\n", __func__, rv); - return rv; - - out_nosplit: - kfree(e1); - kfree(e2); - kfree(e3); - dprintk("%s returns %llu without splitting\n", __func__, rv); - return rv; -} - -void -clean_pnfs_block_layoutupdate(struct pnfs_block_layout *bl, - const struct nfs4_layoutcommit_args *arg, - int status) -{ - struct pnfs_block_short_extent *lce, *save; - - dprintk("%s status %d\n", __func__, status); - list_for_each_entry_safe(lce, save, &bl->bl_committing, bse_node) { - if (likely(!status)) { - u64 offset = lce->bse_f_offset; - u64 end = offset + lce->bse_length; - - do { - offset = set_to_rw(bl, offset, end - offset); - } while (offset < end); - list_del(&lce->bse_node); - - kfree(lce); - } else { - list_del(&lce->bse_node); - spin_lock(&bl->bl_ext_lock); - add_to_commitlist(bl, lce); - spin_unlock(&bl->bl_ext_lock); - } - } -} - -int bl_push_one_short_extent(struct pnfs_inval_markings *marks) -{ - struct pnfs_block_short_extent *new; - - new = kmalloc(sizeof(*new), GFP_NOFS); - if (unlikely(!new)) - return -ENOMEM; - - spin_lock_bh(&marks->im_lock); - list_add(&new->bse_node, &marks->im_extents); - spin_unlock_bh(&marks->im_lock); - - return 0; -} - -struct pnfs_block_short_extent * -bl_pop_one_short_extent(struct pnfs_inval_markings *marks) -{ - struct pnfs_block_short_extent *rv = NULL; - - spin_lock_bh(&marks->im_lock); - if (!list_empty(&marks->im_extents)) { - rv = list_entry((&marks->im_extents)->next, - struct pnfs_block_short_extent, bse_node); - list_del_init(&rv->bse_node); - } - spin_unlock_bh(&marks->im_lock); - - return rv; -} - -void bl_free_short_extents(struct pnfs_inval_markings *marks, int num_to_free) -{ - struct pnfs_block_short_extent *se = NULL, *tmp; - - if (num_to_free <= 0) - return; - - spin_lock(&marks->im_lock); - list_for_each_entry_safe(se, tmp, &marks->im_extents, bse_node) { - list_del(&se->bse_node); - kfree(se); - if (--num_to_free == 0) - break; - } - spin_unlock(&marks->im_lock); - - BUG_ON(num_to_free > 0); -} diff --git a/fs/nfs/blocklayout/rpc_pipefs.c b/fs/nfs/blocklayout/rpc_pipefs.c new file mode 100644 index 000000000000..e966c023b1b7 --- /dev/null +++ b/fs/nfs/blocklayout/rpc_pipefs.c @@ -0,0 +1,284 @@ +/* + * Copyright (c) 2006,2007 The Regents of the University of Michigan. + * All rights reserved. + * + * Andy Adamson <andros@citi.umich.edu> + * Fred Isaman <iisaman@umich.edu> + * + * permission is granted to use, copy, create derivative works and + * redistribute this software and such derivative works for any purpose, + * so long as the name of the university of michigan is not used in + * any advertising or publicity pertaining to the use or distribution + * of this software without specific, written prior authorization. if + * the above copyright notice or any other identification of the + * university of michigan is included in any copy of any portion of + * this software, then the disclaimer below must also be included. + * + * this software is provided as is, without representation from the + * university of michigan as to its fitness for any purpose, and without + * warranty by the university of michigan of any kind, either express + * or implied, including without limitation the implied warranties of + * merchantability and fitness for a particular purpose. the regents + * of the university of michigan shall not be liable for any damages, + * including special, indirect, incidental, or consequential damages, + * with respect to any claim arising out or in connection with the use + * of the software, even if it has been or is hereafter advised of the + * possibility of such damages. + */ + +#include <linux/module.h> +#include <linux/genhd.h> +#include <linux/blkdev.h> + +#include "blocklayout.h" + +#define NFSDBG_FACILITY NFSDBG_PNFS_LD + +static void +nfs4_encode_simple(__be32 *p, struct pnfs_block_volume *b) +{ + int i; + + *p++ = cpu_to_be32(1); + *p++ = cpu_to_be32(b->type); + *p++ = cpu_to_be32(b->simple.nr_sigs); + for (i = 0; i < b->simple.nr_sigs; i++) { + p = xdr_encode_hyper(p, b->simple.sigs[i].offset); + p = xdr_encode_opaque(p, b->simple.sigs[i].sig, + b->simple.sigs[i].sig_len); + } +} + +dev_t +bl_resolve_deviceid(struct nfs_server *server, struct pnfs_block_volume *b, + gfp_t gfp_mask) +{ + struct net *net = server->nfs_client->cl_net; + struct nfs_net *nn = net_generic(net, nfs_net_id); + struct bl_dev_msg *reply = &nn->bl_mount_reply; + struct bl_pipe_msg bl_pipe_msg; + struct rpc_pipe_msg *msg = &bl_pipe_msg.msg; + struct bl_msg_hdr *bl_msg; + DECLARE_WAITQUEUE(wq, current); + dev_t dev = 0; + int rc; + + dprintk("%s CREATING PIPEFS MESSAGE\n", __func__); + + bl_pipe_msg.bl_wq = &nn->bl_wq; + + b->simple.len += 4; /* single volume */ + if (b->simple.len > PAGE_SIZE) + return -EIO; + + memset(msg, 0, sizeof(*msg)); + msg->len = sizeof(*bl_msg) + b->simple.len; + msg->data = kzalloc(msg->len, gfp_mask); + if (!msg->data) + goto out; + + bl_msg = msg->data; + bl_msg->type = BL_DEVICE_MOUNT, + bl_msg->totallen = b->simple.len; + nfs4_encode_simple(msg->data + sizeof(*bl_msg), b); + + dprintk("%s CALLING USERSPACE DAEMON\n", __func__); + add_wait_queue(&nn->bl_wq, &wq); + rc = rpc_queue_upcall(nn->bl_device_pipe, msg); + if (rc < 0) { + remove_wait_queue(&nn->bl_wq, &wq); + goto out; + } + + set_current_state(TASK_UNINTERRUPTIBLE); + schedule(); + remove_wait_queue(&nn->bl_wq, &wq); + + if (reply->status != BL_DEVICE_REQUEST_PROC) { + printk(KERN_WARNING "%s failed to decode device: %d\n", + __func__, reply->status); + goto out; + } + + dev = MKDEV(reply->major, reply->minor); +out: + kfree(msg->data); + return dev; +} + +static ssize_t bl_pipe_downcall(struct file *filp, const char __user *src, + size_t mlen) +{ + struct nfs_net *nn = net_generic(filp->f_dentry->d_sb->s_fs_info, + nfs_net_id); + + if (mlen != sizeof (struct bl_dev_msg)) + return -EINVAL; + + if (copy_from_user(&nn->bl_mount_reply, src, mlen) != 0) + return -EFAULT; + + wake_up(&nn->bl_wq); + + return mlen; +} + +static void bl_pipe_destroy_msg(struct rpc_pipe_msg *msg) +{ + struct bl_pipe_msg *bl_pipe_msg = + container_of(msg, struct bl_pipe_msg, msg); + + if (msg->errno >= 0) + return; + wake_up(bl_pipe_msg->bl_wq); +} + +static const struct rpc_pipe_ops bl_upcall_ops = { + .upcall = rpc_pipe_generic_upcall, + .downcall = bl_pipe_downcall, + .destroy_msg = bl_pipe_destroy_msg, +}; + +static struct dentry *nfs4blocklayout_register_sb(struct super_block *sb, + struct rpc_pipe *pipe) +{ + struct dentry *dir, *dentry; + + dir = rpc_d_lookup_sb(sb, NFS_PIPE_DIRNAME); + if (dir == NULL) + return ERR_PTR(-ENOENT); + dentry = rpc_mkpipe_dentry(dir, "blocklayout", NULL, pipe); + dput(dir); + return dentry; +} + +static void nfs4blocklayout_unregister_sb(struct super_block *sb, + struct rpc_pipe *pipe) +{ + if (pipe->dentry) + rpc_unlink(pipe->dentry); +} + +static int rpc_pipefs_event(struct notifier_block *nb, unsigned long event, + void *ptr) +{ + struct super_block *sb = ptr; + struct net *net = sb->s_fs_info; + struct nfs_net *nn = net_generic(net, nfs_net_id); + struct dentry *dentry; + int ret = 0; + + if (!try_module_get(THIS_MODULE)) + return 0; + + if (nn->bl_device_pipe == NULL) { + module_put(THIS_MODULE); + return 0; + } + + switch (event) { + case RPC_PIPEFS_MOUNT: + dentry = nfs4blocklayout_register_sb(sb, nn->bl_device_pipe); + if (IS_ERR(dentry)) { + ret = PTR_ERR(dentry); + break; + } + nn->bl_device_pipe->dentry = dentry; + break; + case RPC_PIPEFS_UMOUNT: + if (nn->bl_device_pipe->dentry) + nfs4blocklayout_unregister_sb(sb, nn->bl_device_pipe); + break; + default: + ret = -ENOTSUPP; + break; + } + module_put(THIS_MODULE); + return ret; +} + +static struct notifier_block nfs4blocklayout_block = { + .notifier_call = rpc_pipefs_event, +}; + +static struct dentry *nfs4blocklayout_register_net(struct net *net, + struct rpc_pipe *pipe) +{ + struct super_block *pipefs_sb; + struct dentry *dentry; + + pipefs_sb = rpc_get_sb_net(net); + if (!pipefs_sb) + return NULL; + dentry = nfs4blocklayout_register_sb(pipefs_sb, pipe); + rpc_put_sb_net(net); + return dentry; +} + +static void nfs4blocklayout_unregister_net(struct net *net, + struct rpc_pipe *pipe) +{ + struct super_block *pipefs_sb; + + pipefs_sb = rpc_get_sb_net(net); + if (pipefs_sb) { + nfs4blocklayout_unregister_sb(pipefs_sb, pipe); + rpc_put_sb_net(net); + } +} + +static int nfs4blocklayout_net_init(struct net *net) +{ + struct nfs_net *nn = net_generic(net, nfs_net_id); + struct dentry *dentry; + + init_waitqueue_head(&nn->bl_wq); + nn->bl_device_pipe = rpc_mkpipe_data(&bl_upcall_ops, 0); + if (IS_ERR(nn->bl_device_pipe)) + return PTR_ERR(nn->bl_device_pipe); + dentry = nfs4blocklayout_register_net(net, nn->bl_device_pipe); + if (IS_ERR(dentry)) { + rpc_destroy_pipe_data(nn->bl_device_pipe); + return PTR_ERR(dentry); + } + nn->bl_device_pipe->dentry = dentry; + return 0; +} + +static void nfs4blocklayout_net_exit(struct net *net) +{ + struct nfs_net *nn = net_generic(net, nfs_net_id); + + nfs4blocklayout_unregister_net(net, nn->bl_device_pipe); + rpc_destroy_pipe_data(nn->bl_device_pipe); + nn->bl_device_pipe = NULL; +} + +static struct pernet_operations nfs4blocklayout_net_ops = { + .init = nfs4blocklayout_net_init, + .exit = nfs4blocklayout_net_exit, +}; + +int __init bl_init_pipefs(void) +{ + int ret; + + ret = rpc_pipefs_notifier_register(&nfs4blocklayout_block); + if (ret) + goto out; + ret = register_pernet_subsys(&nfs4blocklayout_net_ops); + if (ret) + goto out_unregister_notifier; + return 0; + +out_unregister_notifier: + rpc_pipefs_notifier_unregister(&nfs4blocklayout_block); +out: + return ret; +} + +void __exit bl_cleanup_pipefs(void) +{ + rpc_pipefs_notifier_unregister(&nfs4blocklayout_block); + unregister_pernet_subsys(&nfs4blocklayout_net_ops); +} diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 54de482143cc..b8fb3a4ef649 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -235,7 +235,7 @@ static int nfs_callback_start_svc(int minorversion, struct rpc_xprt *xprt, cb_info->serv = serv; cb_info->rqst = rqstp; - cb_info->task = kthread_run(callback_svc, cb_info->rqst, + cb_info->task = kthread_create(callback_svc, cb_info->rqst, "nfsv4.%u-svc", minorversion); if (IS_ERR(cb_info->task)) { ret = PTR_ERR(cb_info->task); @@ -244,6 +244,8 @@ static int nfs_callback_start_svc(int minorversion, struct rpc_xprt *xprt, cb_info->task = NULL; return ret; } + rqstp->rq_task = cb_info->task; + wake_up_process(cb_info->task); dprintk("nfs_callback_up: service started\n"); return 0; } diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index 41db5258e7a7..73466b934090 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -171,14 +171,26 @@ static u32 initiate_file_draining(struct nfs_client *clp, goto out; ino = lo->plh_inode; + + spin_lock(&ino->i_lock); + pnfs_set_layout_stateid(lo, &args->cbl_stateid, true); + spin_unlock(&ino->i_lock); + + pnfs_layoutcommit_inode(ino, false); + spin_lock(&ino->i_lock); if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags) || pnfs_mark_matching_lsegs_invalid(lo, &free_me_list, - &args->cbl_range)) + &args->cbl_range)) { rv = NFS4ERR_DELAY; - else - rv = NFS4ERR_NOMATCHING_LAYOUT; - pnfs_set_layout_stateid(lo, &args->cbl_stateid, true); + goto unlock; + } + + if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) { + NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo, + &args->cbl_range); + } +unlock: spin_unlock(&ino->i_lock); pnfs_free_lseg_list(&free_me_list); pnfs_put_layout_hdr(lo); @@ -277,9 +289,6 @@ __be32 nfs4_callback_devicenotify(struct cb_devicenotifyargs *args, } found: - if (dev->cbd_notify_type == NOTIFY_DEVICEID4_CHANGE) - dprintk("%s: NOTIFY_DEVICEID4_CHANGE not supported, " - "deleting instead\n", __func__); nfs4_delete_deviceid(server->pnfs_curr_ld, clp, &dev->cbd_dev_id); } diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 6a4f3666e273..f9f4845db989 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -1252,6 +1252,7 @@ static int nfs_server_list_open(struct inode *inode, struct file *file) * set up the iterator to start reading from the server list and return the first item */ static void *nfs_server_list_start(struct seq_file *m, loff_t *_pos) + __acquires(&nn->nfs_client_lock) { struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id); @@ -1274,6 +1275,7 @@ static void *nfs_server_list_next(struct seq_file *p, void *v, loff_t *pos) * clean up after reading from the transports list */ static void nfs_server_list_stop(struct seq_file *p, void *v) + __releases(&nn->nfs_client_lock) { struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id); @@ -1318,7 +1320,7 @@ static int nfs_server_list_show(struct seq_file *m, void *v) */ static int nfs_volume_list_open(struct inode *inode, struct file *file) { - return seq_open_net(inode, file, &nfs_server_list_ops, + return seq_open_net(inode, file, &nfs_volume_list_ops, sizeof(struct seq_net_private)); } @@ -1326,6 +1328,7 @@ static int nfs_volume_list_open(struct inode *inode, struct file *file) * set up the iterator to start reading from the volume list and return the first item */ static void *nfs_volume_list_start(struct seq_file *m, loff_t *_pos) + __acquires(&nn->nfs_client_lock) { struct nfs_net *nn = net_generic(seq_file_net(m), nfs_net_id); @@ -1348,6 +1351,7 @@ static void *nfs_volume_list_next(struct seq_file *p, void *v, loff_t *pos) * clean up after reading from the transports list */ static void nfs_volume_list_stop(struct seq_file *p, void *v) + __releases(&nn->nfs_client_lock) { struct nfs_net *nn = net_generic(seq_file_net(p), nfs_net_id); diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 36d921f0c602..06e8cfcbb670 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -486,8 +486,7 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry) nfs_setsecurity(dentry->d_inode, entry->fattr, entry->label); goto out; } else { - if (d_invalidate(dentry) != 0) - goto out; + d_invalidate(dentry); dput(dentry); } } @@ -1211,10 +1210,6 @@ out_zap_parent: if (IS_ROOT(dentry)) goto out_valid; } - /* If we have submounts, don't unhash ! */ - if (check_submounts_and_drop(dentry) != 0) - goto out_valid; - dput(parent); dfprintk(LOOKUPCACHE, "NFS: %s(%pd2) is invalid\n", __func__, dentry); diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 65ef6e00deee..20cffc830468 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -178,7 +178,6 @@ static int nfs_direct_set_or_cmp_hdr_verf(struct nfs_direct_req *dreq, return memcmp(verfp, &hdr->verf, sizeof(struct nfs_writeverf)); } -#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4) /* * nfs_direct_cmp_commit_data_verf - compare verifier for commit data * @dreq - direct request possibly spanning multiple servers @@ -197,7 +196,6 @@ static int nfs_direct_cmp_commit_data_verf(struct nfs_direct_req *dreq, WARN_ON_ONCE(verfp->committed < 0); return memcmp(verfp, &data->verf, sizeof(struct nfs_writeverf)); } -#endif /** * nfs_direct_IO - NFS address space operation for direct I/O @@ -222,11 +220,9 @@ ssize_t nfs_direct_IO(int rw, struct kiocb *iocb, struct iov_iter *iter, loff_t #else VM_BUG_ON(iocb->ki_nbytes != PAGE_SIZE); - if (rw == READ || rw == KERNEL_READ) - return nfs_file_direct_read(iocb, iter, pos, - rw == READ ? true : false); - return nfs_file_direct_write(iocb, iter, pos, - rw == WRITE ? true : false); + if (rw == READ) + return nfs_file_direct_read(iocb, iter, pos); + return nfs_file_direct_write(iocb, iter, pos); #endif /* CONFIG_NFS_SWAP */ } @@ -512,7 +508,7 @@ static ssize_t nfs_direct_read_schedule_iovec(struct nfs_direct_req *dreq, * cache. */ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter, - loff_t pos, bool uio) + loff_t pos) { struct file *file = iocb->ki_filp; struct address_space *mapping = file->f_mapping; @@ -576,7 +572,6 @@ out: return result; } -#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4) static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq) { struct nfs_pageio_descriptor desc; @@ -700,17 +695,6 @@ static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode schedule_work(&dreq->work); /* Calls nfs_direct_write_schedule_work */ } -#else -static void nfs_direct_write_schedule_work(struct work_struct *work) -{ -} - -static void nfs_direct_write_complete(struct nfs_direct_req *dreq, struct inode *inode) -{ - nfs_direct_complete(dreq, true); -} -#endif - static void nfs_direct_write_completion(struct nfs_pgio_header *hdr) { struct nfs_direct_req *dreq = hdr->dreq; @@ -893,7 +877,7 @@ static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq, * is no atomic O_APPEND write facility in the NFS protocol. */ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter, - loff_t pos, bool uio) + loff_t pos) { ssize_t result = -EINVAL; struct file *file = iocb->ki_filp; diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 524dd80d1898..2ab6f00dba5b 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -36,6 +36,7 @@ #include "internal.h" #include "iostat.h" #include "fscache.h" +#include "pnfs.h" #include "nfstrace.h" @@ -171,7 +172,7 @@ nfs_file_read(struct kiocb *iocb, struct iov_iter *to) ssize_t result; if (iocb->ki_filp->f_flags & O_DIRECT) - return nfs_file_direct_read(iocb, to, iocb->ki_pos, true); + return nfs_file_direct_read(iocb, to, iocb->ki_pos); dprintk("NFS: read(%pD2, %zu@%lu)\n", iocb->ki_filp, @@ -327,6 +328,12 @@ static int nfs_want_read_modify_write(struct file *file, struct page *page, unsigned int offset = pos & (PAGE_CACHE_SIZE - 1); unsigned int end = offset + len; + if (pnfs_ld_read_whole_page(file->f_mapping->host)) { + if (!PageUptodate(page)) + return 1; + return 0; + } + if ((file->f_mode & FMODE_READ) && /* open for read? */ !PageUptodate(page) && /* Uptodate? */ !PagePrivate(page) && /* i/o request already? */ @@ -468,17 +475,26 @@ static int nfs_release_page(struct page *page, gfp_t gfp) dfprintk(PAGECACHE, "NFS: release_page(%p)\n", page); - /* Only do I/O if gfp is a superset of GFP_KERNEL, and we're not - * doing this memory reclaim for a fs-related allocation. + /* Always try to initiate a 'commit' if relevant, but only + * wait for it if __GFP_WAIT is set. Even then, only wait 1 + * second and only if the 'bdi' is not congested. + * Waiting indefinitely can cause deadlocks when the NFS + * server is on this machine, when a new TCP connection is + * needed and in other rare cases. There is no particular + * need to wait extensively here. A short wait has the + * benefit that someone else can worry about the freezer. */ - if (mapping && (gfp & GFP_KERNEL) == GFP_KERNEL && - !(current->flags & PF_FSTRANS)) { - int how = FLUSH_SYNC; - - /* Don't let kswapd deadlock waiting for OOM RPC calls */ - if (current_is_kswapd()) - how = 0; - nfs_commit_inode(mapping->host, how); + if (mapping) { + struct nfs_server *nfss = NFS_SERVER(mapping->host); + nfs_commit_inode(mapping->host, 0); + if ((gfp & __GFP_WAIT) && + !bdi_write_congested(&nfss->backing_dev_info)) { + wait_on_page_bit_killable_timeout(page, PG_private, + HZ); + if (PagePrivate(page)) + set_bdi_congested(&nfss->backing_dev_info, + BLK_RW_ASYNC); + } } /* If PagePrivate() is set, then the page is not freeable */ if (PagePrivate(page)) @@ -539,13 +555,25 @@ static int nfs_launder_page(struct page *page) static int nfs_swap_activate(struct swap_info_struct *sis, struct file *file, sector_t *span) { + int ret; + struct rpc_clnt *clnt = NFS_CLIENT(file->f_mapping->host); + *span = sis->pages; - return xs_swapper(NFS_CLIENT(file->f_mapping->host)->cl_xprt, 1); + + rcu_read_lock(); + ret = xs_swapper(rcu_dereference(clnt->cl_xprt), 1); + rcu_read_unlock(); + + return ret; } static void nfs_swap_deactivate(struct file *file) { - xs_swapper(NFS_CLIENT(file->f_mapping->host)->cl_xprt, 0); + struct rpc_clnt *clnt = NFS_CLIENT(file->f_mapping->host); + + rcu_read_lock(); + xs_swapper(rcu_dereference(clnt->cl_xprt), 0); + rcu_read_unlock(); } #endif @@ -648,7 +676,7 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) return result; if (file->f_flags & O_DIRECT) - return nfs_file_direct_write(iocb, from, pos, true); + return nfs_file_direct_write(iocb, from, pos); dprintk("NFS: write(%pD2, %zu@%Ld)\n", file, count, (long long) pos); @@ -891,17 +919,6 @@ int nfs_flock(struct file *filp, int cmd, struct file_lock *fl) } EXPORT_SYMBOL_GPL(nfs_flock); -/* - * There is no protocol support for leases, so we have no way to implement - * them correctly in the face of opens by other clients. - */ -int nfs_setlease(struct file *file, long arg, struct file_lock **fl) -{ - dprintk("NFS: setlease(%pD2, arg=%ld)\n", file, arg); - return -EINVAL; -} -EXPORT_SYMBOL_GPL(nfs_setlease); - const struct file_operations nfs_file_operations = { .llseek = nfs_file_llseek, .read = new_sync_read, @@ -918,6 +935,6 @@ const struct file_operations nfs_file_operations = { .splice_read = nfs_file_splice_read, .splice_write = iter_file_splice_write, .check_flags = nfs_check_flags, - .setlease = nfs_setlease, + .setlease = simple_nosetlease, }; EXPORT_SYMBOL_GPL(nfs_file_operations); diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index 90978075f730..46fab1cb455a 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -265,7 +265,7 @@ filelayout_set_layoutcommit(struct nfs_pgio_header *hdr) { if (FILELAYOUT_LSEG(hdr->lseg)->commit_through_mds || - hdr->res.verf->committed == NFS_FILE_SYNC) + hdr->res.verf->committed != NFS_DATA_SYNC) return; pnfs_set_layoutcommit(hdr); @@ -403,6 +403,9 @@ static int filelayout_commit_done_cb(struct rpc_task *task, return -EAGAIN; } + if (data->verf.committed == NFS_UNSTABLE) + pnfs_commit_set_layoutcommit(data); + return 0; } @@ -646,18 +649,15 @@ filelayout_check_layout(struct pnfs_layout_hdr *lo, } /* find and reference the deviceid */ - d = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode)->pnfs_curr_ld, - NFS_SERVER(lo->plh_inode)->nfs_client, id); - if (d == NULL) { - dsaddr = filelayout_get_device_info(lo->plh_inode, id, - lo->plh_lc_cred, gfp_flags); - if (dsaddr == NULL) - goto out; - } else - dsaddr = container_of(d, struct nfs4_file_layout_dsaddr, id_node); + d = nfs4_find_get_deviceid(NFS_SERVER(lo->plh_inode), id, + lo->plh_lc_cred, gfp_flags); + if (d == NULL) + goto out; + + dsaddr = container_of(d, struct nfs4_file_layout_dsaddr, id_node); /* Found deviceid is unavailable */ if (filelayout_test_devid_unavailable(&dsaddr->id_node)) - goto out_put; + goto out_put; fl->dsaddr = dsaddr; @@ -1031,7 +1031,7 @@ filelayout_clear_request_commit(struct nfs_page *req, } out: nfs_request_remove_commit_list(req, cinfo); - pnfs_put_lseg_async(freeme); + pnfs_put_lseg_locked(freeme); } static void @@ -1368,6 +1368,17 @@ out: cinfo->ds->ncommitting = 0; return PNFS_ATTEMPTED; } +static struct nfs4_deviceid_node * +filelayout_alloc_deviceid_node(struct nfs_server *server, + struct pnfs_device *pdev, gfp_t gfp_flags) +{ + struct nfs4_file_layout_dsaddr *dsaddr; + + dsaddr = nfs4_fl_alloc_deviceid_node(server, pdev, gfp_flags); + if (!dsaddr) + return NULL; + return &dsaddr->id_node; +} static void filelayout_free_deveiceid_node(struct nfs4_deviceid_node *d) @@ -1420,6 +1431,7 @@ static struct pnfs_layoutdriver_type filelayout_type = { .commit_pagelist = filelayout_commit_pagelist, .read_pagelist = filelayout_read_pagelist, .write_pagelist = filelayout_write_pagelist, + .alloc_deviceid_node = filelayout_alloc_deviceid_node, .free_deviceid_node = filelayout_free_deveiceid_node, }; diff --git a/fs/nfs/filelayout/filelayout.h b/fs/nfs/filelayout/filelayout.h index ffbddf2219ea..7c9f800c49d7 100644 --- a/fs/nfs/filelayout/filelayout.h +++ b/fs/nfs/filelayout/filelayout.h @@ -147,10 +147,11 @@ u32 nfs4_fl_calc_j_index(struct pnfs_layout_segment *lseg, loff_t offset); u32 nfs4_fl_calc_ds_index(struct pnfs_layout_segment *lseg, u32 j); struct nfs4_pnfs_ds *nfs4_fl_prepare_ds(struct pnfs_layout_segment *lseg, u32 ds_idx); + +extern struct nfs4_file_layout_dsaddr * +nfs4_fl_alloc_deviceid_node(struct nfs_server *server, + struct pnfs_device *pdev, gfp_t gfp_flags); extern void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr); extern void nfs4_fl_free_deviceid(struct nfs4_file_layout_dsaddr *dsaddr); -struct nfs4_file_layout_dsaddr * -filelayout_get_device_info(struct inode *inode, struct nfs4_deviceid *dev_id, - struct rpc_cred *cred, gfp_t gfp_flags); #endif /* FS_NFS_NFS4FILELAYOUT_H */ diff --git a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c index 8540516f4d71..9bb806a76d99 100644 --- a/fs/nfs/filelayout/filelayoutdev.c +++ b/fs/nfs/filelayout/filelayoutdev.c @@ -484,8 +484,9 @@ out_err: } /* Decode opaque device data and return the result */ -static struct nfs4_file_layout_dsaddr* -decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags) +struct nfs4_file_layout_dsaddr * +nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, + gfp_t gfp_flags) { int i; u32 cnt, num; @@ -570,10 +571,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags) dsaddr->stripe_indices = stripe_indices; stripe_indices = NULL; dsaddr->ds_num = num; - nfs4_init_deviceid_node(&dsaddr->id_node, - NFS_SERVER(ino)->pnfs_curr_ld, - NFS_SERVER(ino)->nfs_client, - &pdev->dev_id); + nfs4_init_deviceid_node(&dsaddr->id_node, server, &pdev->dev_id); INIT_LIST_HEAD(&dsaddrs); @@ -587,7 +585,7 @@ decode_device(struct inode *ino, struct pnfs_device *pdev, gfp_t gfp_flags) mp_count = be32_to_cpup(p); /* multipath count */ for (j = 0; j < mp_count; j++) { - da = decode_ds_addr(NFS_SERVER(ino)->nfs_client->cl_net, + da = decode_ds_addr(server->nfs_client->cl_net, &stream, gfp_flags); if (da) list_add_tail(&da->da_node, &dsaddrs); @@ -637,102 +635,6 @@ out_err: return NULL; } -/* - * Decode the opaque device specified in 'dev' and add it to the cache of - * available devices. - */ -static struct nfs4_file_layout_dsaddr * -decode_and_add_device(struct inode *inode, struct pnfs_device *dev, gfp_t gfp_flags) -{ - struct nfs4_deviceid_node *d; - struct nfs4_file_layout_dsaddr *n, *new; - - new = decode_device(inode, dev, gfp_flags); - if (!new) { - printk(KERN_WARNING "NFS: %s: Could not decode or add device\n", - __func__); - return NULL; - } - - d = nfs4_insert_deviceid_node(&new->id_node); - n = container_of(d, struct nfs4_file_layout_dsaddr, id_node); - if (n != new) { - nfs4_fl_free_deviceid(new); - return n; - } - - return new; -} - -/* - * Retrieve the information for dev_id, add it to the list - * of available devices, and return it. - */ -struct nfs4_file_layout_dsaddr * -filelayout_get_device_info(struct inode *inode, - struct nfs4_deviceid *dev_id, - struct rpc_cred *cred, - gfp_t gfp_flags) -{ - struct pnfs_device *pdev = NULL; - u32 max_resp_sz; - int max_pages; - struct page **pages = NULL; - struct nfs4_file_layout_dsaddr *dsaddr = NULL; - int rc, i; - struct nfs_server *server = NFS_SERVER(inode); - - /* - * Use the session max response size as the basis for setting - * GETDEVICEINFO's maxcount - */ - max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz; - max_pages = nfs_page_array_len(0, max_resp_sz); - dprintk("%s inode %p max_resp_sz %u max_pages %d\n", - __func__, inode, max_resp_sz, max_pages); - - pdev = kzalloc(sizeof(struct pnfs_device), gfp_flags); - if (pdev == NULL) - return NULL; - - pages = kcalloc(max_pages, sizeof(struct page *), gfp_flags); - if (pages == NULL) { - kfree(pdev); - return NULL; - } - for (i = 0; i < max_pages; i++) { - pages[i] = alloc_page(gfp_flags); - if (!pages[i]) - goto out_free; - } - - memcpy(&pdev->dev_id, dev_id, sizeof(*dev_id)); - pdev->layout_type = LAYOUT_NFSV4_1_FILES; - pdev->pages = pages; - pdev->pgbase = 0; - pdev->pglen = max_resp_sz; - pdev->mincount = 0; - pdev->maxcount = max_resp_sz - nfs41_maxgetdevinfo_overhead; - - rc = nfs4_proc_getdeviceinfo(server, pdev, cred); - dprintk("%s getdevice info returns %d\n", __func__, rc); - if (rc) - goto out_free; - - /* - * Found new device, need to decode it and then add it to the - * list of known devices for this mountpoint. - */ - dsaddr = decode_and_add_device(inode, pdev, gfp_flags); -out_free: - for (i = 0; i < max_pages; i++) - __free_page(pages[i]); - kfree(pages); - kfree(pdev); - dprintk("<-- %s dsaddr %p\n", __func__, dsaddr); - return dsaddr; -} - void nfs4_fl_put_deviceid(struct nfs4_file_layout_dsaddr *dsaddr) { diff --git a/fs/nfs/fscache-index.c b/fs/nfs/fscache-index.c index 7cf2c4699b08..777b055063f6 100644 --- a/fs/nfs/fscache-index.c +++ b/fs/nfs/fscache-index.c @@ -74,11 +74,10 @@ static uint16_t nfs_server_get_key(const void *cookie_netfs_data, struct nfs_server_key *key = buffer; uint16_t len = sizeof(struct nfs_server_key); + memset(key, 0, len); key->nfsversion = clp->rpc_ops->version; key->family = clp->cl_addr.ss_family; - memset(key, 0, len); - switch (clp->cl_addr.ss_family) { case AF_INET: key->port = sin->sin_port; diff --git a/fs/nfs/idmap.c b/fs/nfs/idmap.c index 7dd55b745c4d..2f5db844c172 100644 --- a/fs/nfs/idmap.c +++ b/fs/nfs/idmap.c @@ -177,7 +177,6 @@ static struct key_type key_type_id_resolver = { .preparse = user_preparse, .free_preparse = user_free_preparse, .instantiate = generic_key_instantiate, - .match = user_match, .revoke = user_revoke, .destroy = user_destroy, .describe = user_describe, @@ -401,7 +400,6 @@ static struct key_type key_type_id_resolver_legacy = { .preparse = user_preparse, .free_preparse = user_free_preparse, .instantiate = generic_key_instantiate, - .match = user_match, .revoke = user_revoke, .destroy = user_destroy, .describe = user_describe, diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 577a36f0a510..6388a59f2add 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -505,7 +505,9 @@ nfs_setattr(struct dentry *dentry, struct iattr *attr) attr->ia_valid &= ~ATTR_MODE; if (attr->ia_valid & ATTR_SIZE) { - if (!S_ISREG(inode->i_mode) || attr->ia_size == i_size_read(inode)) + BUG_ON(!S_ISREG(inode->i_mode)); + + if (attr->ia_size == i_size_read(inode)) attr->ia_valid &= ~ATTR_SIZE; } @@ -716,6 +718,7 @@ struct nfs_lock_context *nfs_get_lock_context(struct nfs_open_context *ctx) kfree(new); return res; } +EXPORT_SYMBOL_GPL(nfs_get_lock_context); void nfs_put_lock_context(struct nfs_lock_context *l_ctx) { @@ -728,6 +731,7 @@ void nfs_put_lock_context(struct nfs_lock_context *l_ctx) spin_unlock(&inode->i_lock); kfree(l_ctx); } +EXPORT_SYMBOL_GPL(nfs_put_lock_context); /** * nfs_close_context - Common close_context() routine NFSv2/v3 diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 9056622d2230..efaa31c70fbe 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -218,13 +218,6 @@ static inline void nfs_fs_proc_exit(void) int nfs_sockaddr_match_ipaddr(const struct sockaddr *, const struct sockaddr *); #endif -/* nfs3client.c */ -#if IS_ENABLED(CONFIG_NFS_V3) -struct nfs_server *nfs3_create_server(struct nfs_mount_info *, struct nfs_subversion *); -struct nfs_server *nfs3_clone_server(struct nfs_server *, struct nfs_fh *, - struct nfs_fattr *, rpc_authflavor_t); -#endif - /* callback_xdr.c */ extern struct svc_version nfs4_callback_version1; extern struct svc_version nfs4_callback_version4; @@ -346,7 +339,6 @@ int nfs_file_release(struct inode *, struct file *); int nfs_lock(struct file *, int, struct file_lock *); int nfs_flock(struct file *, int, struct file_lock *); int nfs_check_flags(int); -int nfs_setlease(struct file *, long, struct file_lock **); /* inode.c */ extern struct workqueue_struct *nfsiod_workqueue; diff --git a/fs/nfs/nfs3_fs.h b/fs/nfs/nfs3_fs.h new file mode 100644 index 000000000000..333ae4068506 --- /dev/null +++ b/fs/nfs/nfs3_fs.h @@ -0,0 +1,34 @@ +/* + * Copyright (C) 2014 Anna Schumaker. + * + * NFSv3-specific filesystem definitions and declarations + */ +#ifndef __LINUX_FS_NFS_NFS3_FS_H +#define __LINUX_FS_NFS_NFS3_FS_H + +/* + * nfs3acl.c + */ +#ifdef CONFIG_NFS_V3_ACL +extern struct posix_acl *nfs3_get_acl(struct inode *inode, int type); +extern int nfs3_set_acl(struct inode *inode, struct posix_acl *acl, int type); +extern int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, + struct posix_acl *dfacl); +extern ssize_t nfs3_listxattr(struct dentry *, char *, size_t); +extern const struct xattr_handler *nfs3_xattr_handlers[]; +#else +static inline int nfs3_proc_setacls(struct inode *inode, struct posix_acl *acl, + struct posix_acl *dfacl) +{ + return 0; +} +#define nfs3_listxattr NULL +#endif /* CONFIG_NFS_V3_ACL */ + +/* nfs3client.c */ +struct nfs_server *nfs3_create_server(struct nfs_mount_info *, struct nfs_subversion *); +struct nfs_server *nfs3_clone_server(struct nfs_server *, struct nfs_fh *, + struct nfs_fattr *, rpc_authflavor_t); + + +#endif /* __LINUX_FS_NFS_NFS3_FS_H */ diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c index 24c6898159cc..658e586ca438 100644 --- a/fs/nfs/nfs3acl.c +++ b/fs/nfs/nfs3acl.c @@ -7,6 +7,7 @@ #include <linux/nfsacl.h> #include "internal.h" +#include "nfs3_fs.h" #define NFSDBG_FACILITY NFSDBG_PROC diff --git a/fs/nfs/nfs3client.c b/fs/nfs/nfs3client.c index b3fc65ef39ca..8c1b437c5403 100644 --- a/fs/nfs/nfs3client.c +++ b/fs/nfs/nfs3client.c @@ -1,6 +1,7 @@ #include <linux/nfs_fs.h> #include <linux/nfs_mount.h> #include "internal.h" +#include "nfs3_fs.h" #ifdef CONFIG_NFS_V3_ACL static struct rpc_stat nfsacl_rpcstat = { &nfsacl_program }; diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c index 809670eba52a..524f9f837408 100644 --- a/fs/nfs/nfs3proc.c +++ b/fs/nfs/nfs3proc.c @@ -22,6 +22,7 @@ #include "iostat.h" #include "internal.h" +#include "nfs3_fs.h" #define NFSDBG_FACILITY NFSDBG_PROC diff --git a/fs/nfs/nfs3super.c b/fs/nfs/nfs3super.c index d6a98949af19..6af29c2da352 100644 --- a/fs/nfs/nfs3super.c +++ b/fs/nfs/nfs3super.c @@ -4,6 +4,7 @@ #include <linux/module.h> #include <linux/nfs_fs.h> #include "internal.h" +#include "nfs3_fs.h" #include "nfs.h" static struct nfs_subversion nfs_v3 = { diff --git a/fs/nfs/nfs42.h b/fs/nfs/nfs42.h new file mode 100644 index 000000000000..d10333a197bf --- /dev/null +++ b/fs/nfs/nfs42.h @@ -0,0 +1,14 @@ +/* + * Copyright (c) 2014 Anna Schumaker <Anna.Schumaker@Netapp.com> + */ + +#ifndef __LINUX_FS_NFS_NFS4_2_H +#define __LINUX_FS_NFS_NFS4_2_H + +/* nfs4.2proc.c */ +loff_t nfs42_proc_llseek(struct file *, loff_t, int); + +/* nfs4.2xdr.h */ +extern struct rpc_procinfo nfs4_2_procedures[]; + +#endif /* __LINUX_FS_NFS_NFS4_2_H */ diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c new file mode 100644 index 000000000000..0886f1db5917 --- /dev/null +++ b/fs/nfs/nfs42proc.c @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2014 Anna Schumaker <Anna.Schumaker@Netapp.com> + */ +#include <linux/fs.h> +#include <linux/sunrpc/sched.h> +#include <linux/nfs.h> +#include <linux/nfs3.h> +#include <linux/nfs4.h> +#include <linux/nfs_xdr.h> +#include <linux/nfs_fs.h> +#include "nfs4_fs.h" +#include "nfs42.h" + +static int nfs42_set_rw_stateid(nfs4_stateid *dst, struct file *file, + fmode_t fmode) +{ + struct nfs_open_context *open; + struct nfs_lock_context *lock; + int ret; + + open = get_nfs_open_context(nfs_file_open_context(file)); + lock = nfs_get_lock_context(open); + if (IS_ERR(lock)) { + put_nfs_open_context(open); + return PTR_ERR(lock); + } + + ret = nfs4_set_rw_stateid(dst, open, lock, fmode); + + nfs_put_lock_context(lock); + put_nfs_open_context(open); + return ret; +} + +loff_t nfs42_proc_llseek(struct file *filep, loff_t offset, int whence) +{ + struct inode *inode = file_inode(filep); + struct nfs42_seek_args args = { + .sa_fh = NFS_FH(inode), + .sa_offset = offset, + .sa_what = (whence == SEEK_HOLE) ? + NFS4_CONTENT_HOLE : NFS4_CONTENT_DATA, + }; + struct nfs42_seek_res res; + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_SEEK], + .rpc_argp = &args, + .rpc_resp = &res, + }; + struct nfs_server *server = NFS_SERVER(inode); + int status; + + if (!(server->caps & NFS_CAP_SEEK)) + return -ENOTSUPP; + + status = nfs42_set_rw_stateid(&args.sa_stateid, filep, FMODE_READ); + if (status) + return status; + + nfs_wb_all(inode); + status = nfs4_call_sync(server->client, server, &msg, + &args.seq_args, &res.seq_res, 0); + if (status == -ENOTSUPP) + server->caps &= ~NFS_CAP_SEEK; + if (status) + return status; + + return vfs_setpos(filep, res.sr_offset, inode->i_sb->s_maxbytes); +} diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c new file mode 100644 index 000000000000..c90469b604b8 --- /dev/null +++ b/fs/nfs/nfs42xdr.c @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2014 Anna Schumaker <Anna.Schumaker@Netapp.com> + */ +#ifndef __LINUX_FS_NFS_NFS4_2XDR_H +#define __LINUX_FS_NFS_NFS4_2XDR_H + +#define encode_seek_maxsz (op_encode_hdr_maxsz + \ + encode_stateid_maxsz + \ + 2 /* offset */ + \ + 1 /* whence */) +#define decode_seek_maxsz (op_decode_hdr_maxsz + \ + 1 /* eof */ + \ + 1 /* whence */ + \ + 2 /* offset */ + \ + 2 /* length */) + +#define NFS4_enc_seek_sz (compound_encode_hdr_maxsz + \ + encode_putfh_maxsz + \ + encode_seek_maxsz) +#define NFS4_dec_seek_sz (compound_decode_hdr_maxsz + \ + decode_putfh_maxsz + \ + decode_seek_maxsz) + + +static void encode_seek(struct xdr_stream *xdr, + struct nfs42_seek_args *args, + struct compound_hdr *hdr) +{ + encode_op_hdr(xdr, OP_SEEK, decode_seek_maxsz, hdr); + encode_nfs4_stateid(xdr, &args->sa_stateid); + encode_uint64(xdr, args->sa_offset); + encode_uint32(xdr, args->sa_what); +} + +/* + * Encode SEEK request + */ +static void nfs4_xdr_enc_seek(struct rpc_rqst *req, + struct xdr_stream *xdr, + struct nfs42_seek_args *args) +{ + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->sa_fh, &hdr); + encode_seek(xdr, args, &hdr); + encode_nops(&hdr); +} + +static int decode_seek(struct xdr_stream *xdr, struct nfs42_seek_res *res) +{ + int status; + __be32 *p; + + status = decode_op_hdr(xdr, OP_SEEK); + if (status) + return status; + + p = xdr_inline_decode(xdr, 4 + 8); + if (unlikely(!p)) + goto out_overflow; + + res->sr_eof = be32_to_cpup(p++); + p = xdr_decode_hyper(p, &res->sr_offset); + return 0; + +out_overflow: + print_overflow_msg(__func__, xdr); + return -EIO; +} + +/* + * Decode SEEK request + */ +static int nfs4_xdr_dec_seek(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + struct nfs42_seek_res *res) +{ + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_seek(xdr, res); +out: + return status; +} +#endif /* __LINUX_FS_NFS_NFS4_2XDR_H */ diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index a8b855ab4e22..be6cac37ea10 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -226,6 +226,9 @@ int nfs4_replace_transport(struct nfs_server *server, const struct nfs4_fs_locations *locations); /* nfs4proc.c */ +extern int nfs4_call_sync(struct rpc_clnt *, struct nfs_server *, + struct rpc_message *, struct nfs4_sequence_args *, + struct nfs4_sequence_res *, int); extern int nfs4_proc_setclientid(struct nfs_client *, u32, unsigned short, struct rpc_cred *, struct nfs4_setclientid_res *); extern int nfs4_proc_setclientid_confirm(struct nfs_client *, struct nfs4_setclientid_res *arg, struct rpc_cred *); extern int nfs4_proc_get_rootfh(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *, bool); diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index a816f0627a6c..c51fb4db9bfe 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -8,6 +8,10 @@ #include "fscache.h" #include "pnfs.h" +#ifdef CONFIG_NFS_V4_2 +#include "nfs42.h" +#endif + #define NFSDBG_FACILITY NFSDBG_FILE static int @@ -115,8 +119,29 @@ nfs4_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) return ret; } +#ifdef CONFIG_NFS_V4_2 +static loff_t nfs4_file_llseek(struct file *filep, loff_t offset, int whence) +{ + loff_t ret; + + switch (whence) { + case SEEK_HOLE: + case SEEK_DATA: + ret = nfs42_proc_llseek(filep, offset, whence); + if (ret != -ENOTSUPP) + return ret; + default: + return nfs_file_llseek(filep, offset, whence); + } +} +#endif /* CONFIG_NFS_V4_2 */ + const struct file_operations nfs4_file_operations = { +#ifdef CONFIG_NFS_V4_2 + .llseek = nfs4_file_llseek, +#else .llseek = nfs_file_llseek, +#endif .read = new_sync_read, .write = new_sync_write, .read_iter = nfs_file_read, @@ -131,5 +156,5 @@ const struct file_operations nfs4_file_operations = { .splice_read = nfs_file_splice_read, .splice_write = iter_file_splice_write, .check_flags = nfs_check_flags, - .setlease = nfs_setlease, + .setlease = simple_nosetlease, }; diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 6ca0c8e7a945..405bd95c1f58 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -77,7 +77,7 @@ struct nfs4_opendata; static int _nfs4_proc_open(struct nfs4_opendata *data); static int _nfs4_recover_proc_open(struct nfs4_opendata *data); static int nfs4_do_fsinfo(struct nfs_server *, struct nfs_fh *, struct nfs_fsinfo *); -static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *); +static int nfs4_async_handle_error(struct rpc_task *, const struct nfs_server *, struct nfs4_state *, long *); static void nfs_fixup_referral_attributes(struct nfs_fattr *fattr); static int nfs4_proc_getattr(struct nfs_server *, struct nfs_fh *, struct nfs_fattr *, struct nfs4_label *label); static int _nfs4_proc_getattr(struct nfs_server *server, struct nfs_fh *fhandle, struct nfs_fattr *fattr, struct nfs4_label *label); @@ -314,20 +314,30 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent kunmap_atomic(start); } +static long nfs4_update_delay(long *timeout) +{ + long ret; + if (!timeout) + return NFS4_POLL_RETRY_MAX; + if (*timeout <= 0) + *timeout = NFS4_POLL_RETRY_MIN; + if (*timeout > NFS4_POLL_RETRY_MAX) + *timeout = NFS4_POLL_RETRY_MAX; + ret = *timeout; + *timeout <<= 1; + return ret; +} + static int nfs4_delay(struct rpc_clnt *clnt, long *timeout) { int res = 0; might_sleep(); - if (*timeout <= 0) - *timeout = NFS4_POLL_RETRY_MIN; - if (*timeout > NFS4_POLL_RETRY_MAX) - *timeout = NFS4_POLL_RETRY_MAX; - freezable_schedule_timeout_killable_unsafe(*timeout); + freezable_schedule_timeout_killable_unsafe( + nfs4_update_delay(timeout)); if (fatal_signal_pending(current)) res = -ERESTARTSYS; - *timeout <<= 1; return res; } @@ -875,7 +885,6 @@ static int nfs4_call_sync_sequence(struct rpc_clnt *clnt, return ret; } -static int nfs4_call_sync(struct rpc_clnt *clnt, struct nfs_server *server, struct rpc_message *msg, @@ -1307,15 +1316,13 @@ static struct nfs4_state *nfs4_try_open_cached(struct nfs4_opendata *opendata) int ret = -EAGAIN; for (;;) { + spin_lock(&state->owner->so_lock); if (can_open_cached(state, fmode, open_mode)) { - spin_lock(&state->owner->so_lock); - if (can_open_cached(state, fmode, open_mode)) { - update_open_stateflags(state, fmode); - spin_unlock(&state->owner->so_lock); - goto out_return_state; - } + update_open_stateflags(state, fmode); spin_unlock(&state->owner->so_lock); + goto out_return_state; } + spin_unlock(&state->owner->so_lock); rcu_read_lock(); delegation = rcu_dereference(nfsi->delegation); if (!can_open_delegated(delegation, fmode)) { @@ -2589,7 +2596,7 @@ static void nfs4_close_done(struct rpc_task *task, void *data) if (calldata->arg.fmode == 0) break; default: - if (nfs4_async_handle_error(task, server, state) == -EAGAIN) { + if (nfs4_async_handle_error(task, server, state, NULL) == -EAGAIN) { rpc_restart_call_prepare(task); goto out_release; } @@ -3217,7 +3224,9 @@ nfs4_proc_setattr(struct dentry *dentry, struct nfs_fattr *fattr, struct nfs4_label *label = NULL; int status; - if (pnfs_ld_layoutret_on_setattr(inode)) + if (pnfs_ld_layoutret_on_setattr(inode) && + sattr->ia_valid & ATTR_SIZE && + sattr->ia_size < i_size_read(inode)) pnfs_commit_and_return_layout(inode); nfs_fattr_init(fattr); @@ -3576,7 +3585,8 @@ static int nfs4_proc_unlink_done(struct rpc_task *task, struct inode *dir) if (!nfs4_sequence_done(task, &res->seq_res)) return 0; - if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN) + if (nfs4_async_handle_error(task, res->server, NULL, + &data->timeout) == -EAGAIN) return 0; update_changeattr(dir, &res->cinfo); return 1; @@ -3609,7 +3619,7 @@ static int nfs4_proc_rename_done(struct rpc_task *task, struct inode *old_dir, if (!nfs4_sequence_done(task, &res->seq_res)) return 0; - if (nfs4_async_handle_error(task, res->server, NULL) == -EAGAIN) + if (nfs4_async_handle_error(task, res->server, NULL, &data->timeout) == -EAGAIN) return 0; update_changeattr(old_dir, &res->old_cinfo); @@ -4113,7 +4123,8 @@ static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_pgio_header *hdr) trace_nfs4_read(hdr, task->tk_status); if (nfs4_async_handle_error(task, server, - hdr->args.context->state) == -EAGAIN) { + hdr->args.context->state, + NULL) == -EAGAIN) { rpc_restart_call_prepare(task); return -EAGAIN; } @@ -4181,10 +4192,11 @@ static int nfs4_write_done_cb(struct rpc_task *task, struct nfs_pgio_header *hdr) { struct inode *inode = hdr->inode; - + trace_nfs4_write(hdr, task->tk_status); if (nfs4_async_handle_error(task, NFS_SERVER(inode), - hdr->args.context->state) == -EAGAIN) { + hdr->args.context->state, + NULL) == -EAGAIN) { rpc_restart_call_prepare(task); return -EAGAIN; } @@ -4264,7 +4276,8 @@ static int nfs4_commit_done_cb(struct rpc_task *task, struct nfs_commit_data *da struct inode *inode = data->inode; trace_nfs4_commit(data, task->tk_status); - if (nfs4_async_handle_error(task, NFS_SERVER(inode), NULL) == -EAGAIN) { + if (nfs4_async_handle_error(task, NFS_SERVER(inode), + NULL, NULL) == -EAGAIN) { rpc_restart_call_prepare(task); return -EAGAIN; } @@ -4817,7 +4830,8 @@ out: static int -nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, struct nfs4_state *state) +nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, + struct nfs4_state *state, long *timeout) { struct nfs_client *clp = server->nfs_client; @@ -4867,6 +4881,8 @@ nfs4_async_handle_error(struct rpc_task *task, const struct nfs_server *server, #endif /* CONFIG_NFS_V4_1 */ case -NFS4ERR_DELAY: nfs_inc_server_stats(server, NFSIOS_DELAY); + rpc_delay(task, nfs4_update_delay(timeout)); + goto restart_call; case -NFS4ERR_GRACE: rpc_delay(task, NFS4_POLL_RETRY_MAX); case -NFS4ERR_RETRY_UNCACHED_REP: @@ -5107,8 +5123,8 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata) pnfs_roc_set_barrier(data->inode, data->roc_barrier); break; default: - if (nfs4_async_handle_error(task, data->res.server, NULL) == - -EAGAIN) { + if (nfs4_async_handle_error(task, data->res.server, + NULL, NULL) == -EAGAIN) { rpc_restart_call_prepare(task); return; } @@ -5372,7 +5388,8 @@ static void nfs4_locku_done(struct rpc_task *task, void *data) case -NFS4ERR_EXPIRED: break; default: - if (nfs4_async_handle_error(task, calldata->server, NULL) == -EAGAIN) + if (nfs4_async_handle_error(task, calldata->server, + NULL, NULL) == -EAGAIN) rpc_restart_call_prepare(task); } nfs_release_seqid(calldata->arg.seqid); @@ -5978,7 +5995,8 @@ static void nfs4_release_lockowner_done(struct rpc_task *task, void *calldata) break; case -NFS4ERR_LEASE_MOVED: case -NFS4ERR_DELAY: - if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) + if (nfs4_async_handle_error(task, server, + NULL, NULL) == -EAGAIN) rpc_restart_call_prepare(task); } } @@ -7353,7 +7371,7 @@ static int nfs41_proc_async_sequence(struct nfs_client *clp, struct rpc_cred *cr int ret = 0; if ((renew_flags & NFS4_RENEW_TIMEOUT) == 0) - return 0; + return -EAGAIN; task = _nfs41_proc_sequence(clp, cred, false); if (IS_ERR(task)) ret = PTR_ERR(task); @@ -7583,14 +7601,19 @@ static void nfs4_layoutget_done(struct rpc_task *task, void *calldata) } else { LIST_HEAD(head); + /* + * Mark the bad layout state as invalid, then retry + * with the current stateid. + */ pnfs_mark_matching_lsegs_invalid(lo, &head, NULL); spin_unlock(&inode->i_lock); - /* Mark the bad layout state as invalid, then - * retry using the open stateid. */ pnfs_free_lseg_list(&head); + + task->tk_status = 0; + rpc_restart_call_prepare(task); } } - if (nfs4_async_handle_error(task, server, state) == -EAGAIN) + if (nfs4_async_handle_error(task, server, state, NULL) == -EAGAIN) rpc_restart_call_prepare(task); out: dprintk("<-- %s\n", __func__); @@ -7750,7 +7773,7 @@ static void nfs4_layoutreturn_done(struct rpc_task *task, void *calldata) case 0: break; case -NFS4ERR_DELAY: - if (nfs4_async_handle_error(task, server, NULL) != -EAGAIN) + if (nfs4_async_handle_error(task, server, NULL, NULL) != -EAGAIN) break; rpc_restart_call_prepare(task); return; @@ -7809,54 +7832,6 @@ int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp) return status; } -/* - * Retrieve the list of Data Server devices from the MDS. - */ -static int _nfs4_getdevicelist(struct nfs_server *server, - const struct nfs_fh *fh, - struct pnfs_devicelist *devlist) -{ - struct nfs4_getdevicelist_args args = { - .fh = fh, - .layoutclass = server->pnfs_curr_ld->id, - }; - struct nfs4_getdevicelist_res res = { - .devlist = devlist, - }; - struct rpc_message msg = { - .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_GETDEVICELIST], - .rpc_argp = &args, - .rpc_resp = &res, - }; - int status; - - dprintk("--> %s\n", __func__); - status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, - &res.seq_res, 0); - dprintk("<-- %s status=%d\n", __func__, status); - return status; -} - -int nfs4_proc_getdevicelist(struct nfs_server *server, - const struct nfs_fh *fh, - struct pnfs_devicelist *devlist) -{ - struct nfs4_exception exception = { }; - int err; - - do { - err = nfs4_handle_exception(server, - _nfs4_getdevicelist(server, fh, devlist), - &exception); - } while (exception.retry); - - dprintk("%s: err=%d, num_devs=%u\n", __func__, - err, devlist->num_devs); - - return err; -} -EXPORT_SYMBOL_GPL(nfs4_proc_getdevicelist); - static int _nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *pdev, @@ -7929,7 +7904,7 @@ nfs4_layoutcommit_done(struct rpc_task *task, void *calldata) case 0: break; default: - if (nfs4_async_handle_error(task, server, NULL) == -EAGAIN) { + if (nfs4_async_handle_error(task, server, NULL, NULL) == -EAGAIN) { rpc_restart_call_prepare(task); return; } @@ -8225,7 +8200,7 @@ static void nfs41_free_stateid_done(struct rpc_task *task, void *calldata) switch (task->tk_status) { case -NFS4ERR_DELAY: - if (nfs4_async_handle_error(task, data->server, NULL) == -EAGAIN) + if (nfs4_async_handle_error(task, data->server, NULL, NULL) == -EAGAIN) rpc_restart_call_prepare(task); } } @@ -8433,7 +8408,8 @@ static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = { | NFS_CAP_CHANGE_ATTR | NFS_CAP_POSIX_LOCK | NFS_CAP_STATEID_NFSV41 - | NFS_CAP_ATOMIC_OPEN_V1, + | NFS_CAP_ATOMIC_OPEN_V1 + | NFS_CAP_SEEK, .init_client = nfs41_init_client, .shutdown_client = nfs41_shutdown_client, .match_stateid = nfs41_match_stateid, diff --git a/fs/nfs/nfs4renewd.c b/fs/nfs/nfs4renewd.c index 1720d32ffa54..e1ba58c3d1ad 100644 --- a/fs/nfs/nfs4renewd.c +++ b/fs/nfs/nfs4renewd.c @@ -88,10 +88,18 @@ nfs4_renew_state(struct work_struct *work) } nfs_expire_all_delegations(clp); } else { + int ret; + /* Queue an asynchronous RENEW. */ - ops->sched_state_renewal(clp, cred, renew_flags); + ret = ops->sched_state_renewal(clp, cred, renew_flags); put_rpccred(cred); - goto out_exp; + switch (ret) { + default: + goto out_exp; + case -EAGAIN: + case -ENOMEM: + break; + } } } else { dprintk("%s: failed to call renewd. Reason: lease not expired \n", diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index 22fe35104c0c..5194933ed419 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1705,7 +1705,8 @@ restart: if (status < 0) { set_bit(ops->owner_flag_bit, &sp->so_flags); nfs4_put_state_owner(sp); - return nfs4_recovery_handle_error(clp, status); + status = nfs4_recovery_handle_error(clp, status); + return (status != 0) ? status : -EAGAIN; } nfs4_put_state_owner(sp); @@ -1714,7 +1715,7 @@ restart: spin_unlock(&clp->cl_lock); } rcu_read_unlock(); - return status; + return 0; } static int nfs4_check_lease(struct nfs_client *clp) @@ -1761,7 +1762,6 @@ static int nfs4_handle_reclaim_lease_error(struct nfs_client *clp, int status) break; case -NFS4ERR_STALE_CLIENTID: clear_bit(NFS4CLNT_LEASE_CONFIRM, &clp->cl_state); - nfs4_state_clear_reclaim_reboot(clp); nfs4_state_start_reclaim_reboot(clp); break; case -NFS4ERR_CLID_INUSE: @@ -2345,6 +2345,7 @@ static void nfs4_state_manager(struct nfs_client *clp) status = nfs4_check_lease(clp); if (status < 0) goto out_error; + continue; } if (test_and_clear_bit(NFS4CLNT_MOVED, &clp->cl_state)) { @@ -2366,14 +2367,11 @@ static void nfs4_state_manager(struct nfs_client *clp) section = "reclaim reboot"; status = nfs4_do_reclaim(clp, clp->cl_mvops->reboot_recovery_ops); - if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) || - test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state)) - continue; - nfs4_state_end_reclaim_reboot(clp); - if (test_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state)) + if (status == -EAGAIN) continue; if (status < 0) goto out_error; + nfs4_state_end_reclaim_reboot(clp); } /* Now recover expired state... */ @@ -2381,9 +2379,7 @@ static void nfs4_state_manager(struct nfs_client *clp) section = "reclaim nograce"; status = nfs4_do_reclaim(clp, clp->cl_mvops->nograce_recovery_ops); - if (test_bit(NFS4CLNT_LEASE_EXPIRED, &clp->cl_state) || - test_bit(NFS4CLNT_SESSION_RESET, &clp->cl_state) || - test_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state)) + if (status == -EAGAIN) continue; if (status < 0) goto out_error; diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index e13b59d8d9aa..206c08a60c7f 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -362,25 +362,19 @@ static int nfs4_stat_to_errno(int); XDR_QUADLEN(NFS4_MAX_SESSIONID_LEN) + 5) #define encode_reclaim_complete_maxsz (op_encode_hdr_maxsz + 4) #define decode_reclaim_complete_maxsz (op_decode_hdr_maxsz + 4) -#define encode_getdevicelist_maxsz (op_encode_hdr_maxsz + 4 + \ - encode_verifier_maxsz) -#define decode_getdevicelist_maxsz (op_decode_hdr_maxsz + \ - 2 /* nfs_cookie4 gdlr_cookie */ + \ - decode_verifier_maxsz \ - /* verifier4 gdlr_verifier */ + \ - 1 /* gdlr_deviceid_list count */ + \ - XDR_QUADLEN(NFS4_PNFS_GETDEVLIST_MAXNUM * \ - NFS4_DEVICEID4_SIZE) \ - /* gdlr_deviceid_list */ + \ - 1 /* bool gdlr_eof */) -#define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + 4 + \ - XDR_QUADLEN(NFS4_DEVICEID4_SIZE)) +#define encode_getdeviceinfo_maxsz (op_encode_hdr_maxsz + \ + XDR_QUADLEN(NFS4_DEVICEID4_SIZE) + \ + 1 /* layout type */ + \ + 1 /* maxcount */ + \ + 1 /* bitmap size */ + \ + 1 /* notification bitmap length */ + \ + 1 /* notification bitmap, word 0 */) #define decode_getdeviceinfo_maxsz (op_decode_hdr_maxsz + \ 1 /* layout type */ + \ 1 /* opaque devaddr4 length */ + \ /* devaddr4 payload is read into page */ \ 1 /* notification bitmap length */ + \ - 1 /* notification bitmap */) + 1 /* notification bitmap, word 0 */) #define encode_layoutget_maxsz (op_encode_hdr_maxsz + 10 + \ encode_stateid_maxsz) #define decode_layoutget_maxsz (op_decode_hdr_maxsz + 8 + \ @@ -395,7 +389,10 @@ static int nfs4_stat_to_errno(int); 2 /* last byte written */ + \ 1 /* nt_timechanged (false) */ + \ 1 /* layoutupdate4 layout type */ + \ - 1 /* NULL filelayout layoutupdate4 payload */) + 1 /* layoutupdate4 opaqueue len */) + /* the actual content of layoutupdate4 should + be allocated by drivers and spliced in + using xdr_write_pages */ #define decode_layoutcommit_maxsz (op_decode_hdr_maxsz + 3) #define encode_layoutreturn_maxsz (8 + op_encode_hdr_maxsz + \ encode_stateid_maxsz + \ @@ -809,14 +806,6 @@ static int nfs4_stat_to_errno(int); #define NFS4_dec_reclaim_complete_sz (compound_decode_hdr_maxsz + \ decode_sequence_maxsz + \ decode_reclaim_complete_maxsz) -#define NFS4_enc_getdevicelist_sz (compound_encode_hdr_maxsz + \ - encode_sequence_maxsz + \ - encode_putfh_maxsz + \ - encode_getdevicelist_maxsz) -#define NFS4_dec_getdevicelist_sz (compound_decode_hdr_maxsz + \ - decode_sequence_maxsz + \ - decode_putfh_maxsz + \ - decode_getdevicelist_maxsz) #define NFS4_enc_getdeviceinfo_sz (compound_encode_hdr_maxsz + \ encode_sequence_maxsz +\ encode_getdeviceinfo_maxsz) @@ -1927,24 +1916,6 @@ static void encode_sequence(struct xdr_stream *xdr, #ifdef CONFIG_NFS_V4_1 static void -encode_getdevicelist(struct xdr_stream *xdr, - const struct nfs4_getdevicelist_args *args, - struct compound_hdr *hdr) -{ - __be32 *p; - nfs4_verifier dummy = { - .data = "dummmmmy", - }; - - encode_op_hdr(xdr, OP_GETDEVICELIST, decode_getdevicelist_maxsz, hdr); - p = reserve_space(xdr, 16); - *p++ = cpu_to_be32(args->layoutclass); - *p++ = cpu_to_be32(NFS4_PNFS_GETDEVLIST_MAXNUM); - xdr_encode_hyper(p, 0ULL); /* cookie */ - encode_nfs4_verifier(xdr, &dummy); -} - -static void encode_getdeviceinfo(struct xdr_stream *xdr, const struct nfs4_getdeviceinfo_args *args, struct compound_hdr *hdr) @@ -1952,12 +1923,15 @@ encode_getdeviceinfo(struct xdr_stream *xdr, __be32 *p; encode_op_hdr(xdr, OP_GETDEVICEINFO, decode_getdeviceinfo_maxsz, hdr); - p = reserve_space(xdr, 12 + NFS4_DEVICEID4_SIZE); + p = reserve_space(xdr, NFS4_DEVICEID4_SIZE + 4 + 4); p = xdr_encode_opaque_fixed(p, args->pdev->dev_id.data, NFS4_DEVICEID4_SIZE); *p++ = cpu_to_be32(args->pdev->layout_type); *p++ = cpu_to_be32(args->pdev->maxcount); /* gdia_maxcount */ - *p++ = cpu_to_be32(0); /* bitmap length 0 */ + + p = reserve_space(xdr, 4 + 4); + *p++ = cpu_to_be32(1); /* bitmap length */ + *p++ = cpu_to_be32(NOTIFY_DEVICEID4_CHANGE | NOTIFY_DEVICEID4_DELETE); } static void @@ -1990,7 +1964,7 @@ encode_layoutget(struct xdr_stream *xdr, static int encode_layoutcommit(struct xdr_stream *xdr, struct inode *inode, - const struct nfs4_layoutcommit_args *args, + struct nfs4_layoutcommit_args *args, struct compound_hdr *hdr) { __be32 *p; @@ -2011,11 +1985,16 @@ encode_layoutcommit(struct xdr_stream *xdr, *p++ = cpu_to_be32(0); /* Never send time_modify_changed */ *p++ = cpu_to_be32(NFS_SERVER(args->inode)->pnfs_curr_ld->id);/* type */ - if (NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit) + if (NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit) { NFS_SERVER(inode)->pnfs_curr_ld->encode_layoutcommit( NFS_I(inode)->layout, xdr, args); - else - encode_uint32(xdr, 0); /* no layout-type payload */ + } else { + encode_uint32(xdr, args->layoutupdate_len); + if (args->layoutupdate_pages) { + xdr_write_pages(xdr, args->layoutupdate_pages, 0, + args->layoutupdate_len); + } + } return 0; } @@ -2893,24 +2872,6 @@ static void nfs4_xdr_enc_reclaim_complete(struct rpc_rqst *req, } /* - * Encode GETDEVICELIST request - */ -static void nfs4_xdr_enc_getdevicelist(struct rpc_rqst *req, - struct xdr_stream *xdr, - struct nfs4_getdevicelist_args *args) -{ - struct compound_hdr hdr = { - .minorversion = nfs4_xdr_minorversion(&args->seq_args), - }; - - encode_compound_hdr(xdr, req, &hdr); - encode_sequence(xdr, &args->seq_args, &hdr); - encode_putfh(xdr, args->fh, &hdr); - encode_getdevicelist(xdr, args, &hdr); - encode_nops(&hdr); -} - -/* * Encode GETDEVICEINFO request */ static void nfs4_xdr_enc_getdeviceinfo(struct rpc_rqst *req, @@ -5765,54 +5726,6 @@ out_overflow: } #if defined(CONFIG_NFS_V4_1) -/* - * TODO: Need to handle case when EOF != true; - */ -static int decode_getdevicelist(struct xdr_stream *xdr, - struct pnfs_devicelist *res) -{ - __be32 *p; - int status, i; - nfs4_verifier verftemp; - - status = decode_op_hdr(xdr, OP_GETDEVICELIST); - if (status) - return status; - - p = xdr_inline_decode(xdr, 8 + 8 + 4); - if (unlikely(!p)) - goto out_overflow; - - /* TODO: Skip cookie for now */ - p += 2; - - /* Read verifier */ - p = xdr_decode_opaque_fixed(p, verftemp.data, NFS4_VERIFIER_SIZE); - - res->num_devs = be32_to_cpup(p); - - dprintk("%s: num_dev %d\n", __func__, res->num_devs); - - if (res->num_devs > NFS4_PNFS_GETDEVLIST_MAXNUM) { - printk(KERN_ERR "NFS: %s too many result dev_num %u\n", - __func__, res->num_devs); - return -EIO; - } - - p = xdr_inline_decode(xdr, - res->num_devs * NFS4_DEVICEID4_SIZE + 4); - if (unlikely(!p)) - goto out_overflow; - for (i = 0; i < res->num_devs; i++) - p = xdr_decode_opaque_fixed(p, res->dev_id[i].data, - NFS4_DEVICEID4_SIZE); - res->eof = be32_to_cpup(p); - return 0; -out_overflow: - print_overflow_msg(__func__, xdr); - return -EIO; -} - static int decode_getdeviceinfo(struct xdr_stream *xdr, struct pnfs_device *pdev) { @@ -5862,9 +5775,16 @@ static int decode_getdeviceinfo(struct xdr_stream *xdr, p = xdr_inline_decode(xdr, 4 * len); if (unlikely(!p)) goto out_overflow; - for (i = 0; i < len; i++, p++) { - if (be32_to_cpup(p)) { - dprintk("%s: notifications not supported\n", + + if (be32_to_cpup(p++) & + ~(NOTIFY_DEVICEID4_CHANGE | NOTIFY_DEVICEID4_DELETE)) { + dprintk("%s: unsupported notification\n", + __func__); + } + + for (i = 1; i < len; i++) { + if (be32_to_cpup(p++)) { + dprintk("%s: unsupported notification\n", __func__); return -EIO; } @@ -7097,32 +7017,6 @@ static int nfs4_xdr_dec_reclaim_complete(struct rpc_rqst *rqstp, } /* - * Decode GETDEVICELIST response - */ -static int nfs4_xdr_dec_getdevicelist(struct rpc_rqst *rqstp, - struct xdr_stream *xdr, - struct nfs4_getdevicelist_res *res) -{ - struct compound_hdr hdr; - int status; - - dprintk("encoding getdevicelist!\n"); - - status = decode_compound_hdr(xdr, &hdr); - if (status != 0) - goto out; - status = decode_sequence(xdr, &res->seq_res, rqstp); - if (status != 0) - goto out; - status = decode_putfh(xdr); - if (status != 0) - goto out; - status = decode_getdevicelist(xdr, res->devlist); -out: - return status; -} - -/* * Decode GETDEVINFO response */ static int nfs4_xdr_dec_getdeviceinfo(struct rpc_rqst *rqstp, @@ -7427,6 +7321,10 @@ nfs4_stat_to_errno(int stat) return -stat; } +#ifdef CONFIG_NFS_V4_2 +#include "nfs42xdr.c" +#endif /* CONFIG_NFS_V4_2 */ + #define PROC(proc, argtype, restype) \ [NFSPROC4_CLNT_##proc] = { \ .p_proc = NFSPROC4_COMPOUND, \ @@ -7490,11 +7388,13 @@ struct rpc_procinfo nfs4_procedures[] = { PROC(SECINFO_NO_NAME, enc_secinfo_no_name, dec_secinfo_no_name), PROC(TEST_STATEID, enc_test_stateid, dec_test_stateid), PROC(FREE_STATEID, enc_free_stateid, dec_free_stateid), - PROC(GETDEVICELIST, enc_getdevicelist, dec_getdevicelist), PROC(BIND_CONN_TO_SESSION, enc_bind_conn_to_session, dec_bind_conn_to_session), PROC(DESTROY_CLIENTID, enc_destroy_clientid, dec_destroy_clientid), #endif /* CONFIG_NFS_V4_1 */ +#ifdef CONFIG_NFS_V4_2 + PROC(SEEK, enc_seek, dec_seek), +#endif /* CONFIG_NFS_V4_2 */ }; const struct rpc_version nfs_version4 = { diff --git a/fs/nfs/objlayout/objio_osd.c b/fs/nfs/objlayout/objio_osd.c index ae05278b3761..9e5bc42180e4 100644 --- a/fs/nfs/objlayout/objio_osd.c +++ b/fs/nfs/objlayout/objio_osd.c @@ -5,7 +5,7 @@ * All rights reserved. * * Benny Halevy <bhalevy@panasas.com> - * Boaz Harrosh <bharrosh@panasas.com> + * Boaz Harrosh <ooo@electrozaur.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 @@ -60,52 +60,6 @@ objio_free_deviceid_node(struct nfs4_deviceid_node *d) kfree(de); } -static struct objio_dev_ent *_dev_list_find(const struct nfs_server *nfss, - const struct nfs4_deviceid *d_id) -{ - struct nfs4_deviceid_node *d; - struct objio_dev_ent *de; - - d = nfs4_find_get_deviceid(nfss->pnfs_curr_ld, nfss->nfs_client, d_id); - if (!d) - return NULL; - - de = container_of(d, struct objio_dev_ent, id_node); - return de; -} - -static struct objio_dev_ent * -_dev_list_add(const struct nfs_server *nfss, - const struct nfs4_deviceid *d_id, struct osd_dev *od, - gfp_t gfp_flags) -{ - struct nfs4_deviceid_node *d; - struct objio_dev_ent *de = kzalloc(sizeof(*de), gfp_flags); - struct objio_dev_ent *n; - - if (!de) { - dprintk("%s: -ENOMEM od=%p\n", __func__, od); - return NULL; - } - - dprintk("%s: Adding od=%p\n", __func__, od); - nfs4_init_deviceid_node(&de->id_node, - nfss->pnfs_curr_ld, - nfss->nfs_client, - d_id); - de->od.od = od; - - d = nfs4_insert_deviceid_node(&de->id_node); - n = container_of(d, struct objio_dev_ent, id_node); - if (n != de) { - dprintk("%s: Race with other n->od=%p\n", __func__, n->od.od); - objio_free_deviceid_node(&de->id_node); - de = n; - } - - return de; -} - struct objio_segment { struct pnfs_layout_segment lseg; @@ -130,29 +84,24 @@ struct objio_state { /* Send and wait for a get_device_info of devices in the layout, then look them up with the osd_initiator library */ -static int objio_devices_lookup(struct pnfs_layout_hdr *pnfslay, - struct objio_segment *objio_seg, unsigned c, struct nfs4_deviceid *d_id, - gfp_t gfp_flags) +struct nfs4_deviceid_node * +objio_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev, + gfp_t gfp_flags) { struct pnfs_osd_deviceaddr *deviceaddr; - struct objio_dev_ent *ode; + struct objio_dev_ent *ode = NULL; struct osd_dev *od; struct osd_dev_info odi; bool retry_flag = true; + __be32 *p; int err; - ode = _dev_list_find(NFS_SERVER(pnfslay->plh_inode), d_id); - if (ode) { - objio_seg->oc.ods[c] = &ode->od; /* must use container_of */ - return 0; - } + deviceaddr = kzalloc(sizeof(*deviceaddr), gfp_flags); + if (!deviceaddr) + return NULL; - err = objlayout_get_deviceinfo(pnfslay, d_id, &deviceaddr, gfp_flags); - if (unlikely(err)) { - dprintk("%s: objlayout_get_deviceinfo dev(%llx:%llx) =>%d\n", - __func__, _DEVID_LO(d_id), _DEVID_HI(d_id), err); - return err; - } + p = page_address(pdev->pages[0]); + pnfs_osd_xdr_decode_deviceaddr(deviceaddr, p); odi.systemid_len = deviceaddr->oda_systemid.len; if (odi.systemid_len > sizeof(odi.systemid)) { @@ -188,14 +137,24 @@ retry_lookup: goto out; } - ode = _dev_list_add(NFS_SERVER(pnfslay->plh_inode), d_id, od, - gfp_flags); - objio_seg->oc.ods[c] = &ode->od; /* must use container_of */ dprintk("Adding new dev_id(%llx:%llx)\n", - _DEVID_LO(d_id), _DEVID_HI(d_id)); + _DEVID_LO(&pdev->dev_id), _DEVID_HI(&pdev->dev_id)); + + ode = kzalloc(sizeof(*ode), gfp_flags); + if (!ode) { + dprintk("%s: -ENOMEM od=%p\n", __func__, od); + goto out; + } + + nfs4_init_deviceid_node(&ode->id_node, server, &pdev->dev_id); + kfree(deviceaddr); + + ode->od.od = od; + return &ode->id_node; + out: - objlayout_put_deviceinfo(deviceaddr); - return err; + kfree(deviceaddr); + return NULL; } static void copy_single_comp(struct ore_components *oc, unsigned c, @@ -254,6 +213,7 @@ int objio_alloc_lseg(struct pnfs_layout_segment **outp, struct xdr_stream *xdr, gfp_t gfp_flags) { + struct nfs_server *server = NFS_SERVER(pnfslay->plh_inode); struct objio_segment *objio_seg; struct pnfs_osd_xdr_decode_layout_iter iter; struct pnfs_osd_layout layout; @@ -283,13 +243,21 @@ int objio_alloc_lseg(struct pnfs_layout_segment **outp, objio_seg->oc.first_dev = layout.olo_comps_index; cur_comp = 0; while (pnfs_osd_xdr_decode_layout_comp(&src_comp, &iter, xdr, &err)) { + struct nfs4_deviceid_node *d; + struct objio_dev_ent *ode; + copy_single_comp(&objio_seg->oc, cur_comp, &src_comp); - err = objio_devices_lookup(pnfslay, objio_seg, cur_comp, - &src_comp.oc_object_id.oid_device_id, - gfp_flags); - if (err) + + d = nfs4_find_get_deviceid(server, + &src_comp.oc_object_id.oid_device_id, + pnfslay->plh_lc_cred, gfp_flags); + if (!d) { + err = -ENXIO; goto err; - ++cur_comp; + } + + ode = container_of(d, struct objio_dev_ent, id_node); + objio_seg->oc.ods[cur_comp++] = &ode->od; } /* pnfs_osd_xdr_decode_layout_comp returns false on error */ if (unlikely(err)) @@ -653,6 +621,7 @@ static struct pnfs_layoutdriver_type objlayout_type = { .flags = PNFS_LAYOUTRET_ON_SETATTR | PNFS_LAYOUTRET_ON_ERROR, + .max_deviceinfo_size = PAGE_SIZE, .owner = THIS_MODULE, .alloc_layout_hdr = objlayout_alloc_layout_hdr, .free_layout_hdr = objlayout_free_layout_hdr, diff --git a/fs/nfs/objlayout/objlayout.c b/fs/nfs/objlayout/objlayout.c index 697a16d11fac..919efd4a1a23 100644 --- a/fs/nfs/objlayout/objlayout.c +++ b/fs/nfs/objlayout/objlayout.c @@ -5,7 +5,7 @@ * All rights reserved. * * Benny Halevy <bhalevy@panasas.com> - * Boaz Harrosh <bharrosh@panasas.com> + * Boaz Harrosh <ooo@electrozaur.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 @@ -574,76 +574,6 @@ loop_done: dprintk("%s: Return\n", __func__); } - -/* - * Get Device Info API for io engines - */ -struct objlayout_deviceinfo { - struct page *page; - struct pnfs_osd_deviceaddr da; /* This must be last */ -}; - -/* Initialize and call nfs_getdeviceinfo, then decode and return a - * "struct pnfs_osd_deviceaddr *" Eventually objlayout_put_deviceinfo() - * should be called. - */ -int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay, - struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr, - gfp_t gfp_flags) -{ - struct objlayout_deviceinfo *odi; - struct pnfs_device pd; - struct page *page, **pages; - u32 *p; - int err; - - page = alloc_page(gfp_flags); - if (!page) - return -ENOMEM; - - pages = &page; - pd.pages = pages; - - memcpy(&pd.dev_id, d_id, sizeof(*d_id)); - pd.layout_type = LAYOUT_OSD2_OBJECTS; - pd.pages = &page; - pd.pgbase = 0; - pd.pglen = PAGE_SIZE; - pd.mincount = 0; - pd.maxcount = PAGE_SIZE; - - err = nfs4_proc_getdeviceinfo(NFS_SERVER(pnfslay->plh_inode), &pd, - pnfslay->plh_lc_cred); - dprintk("%s nfs_getdeviceinfo returned %d\n", __func__, err); - if (err) - goto err_out; - - p = page_address(page); - odi = kzalloc(sizeof(*odi), gfp_flags); - if (!odi) { - err = -ENOMEM; - goto err_out; - } - pnfs_osd_xdr_decode_deviceaddr(&odi->da, p); - odi->page = page; - *deviceaddr = &odi->da; - return 0; - -err_out: - __free_page(page); - return err; -} - -void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr) -{ - struct objlayout_deviceinfo *odi = container_of(deviceaddr, - struct objlayout_deviceinfo, - da); - - __free_page(odi->page); - kfree(odi); -} - enum { OBJLAYOUT_MAX_URI_LEN = 256, OBJLAYOUT_MAX_OSDNAME_LEN = 64, OBJLAYOUT_MAX_SYSID_HEX_LEN = OSD_SYSTEMID_LEN * 2 + 1, diff --git a/fs/nfs/objlayout/objlayout.h b/fs/nfs/objlayout/objlayout.h index fd13f1d2f136..2641dbad345c 100644 --- a/fs/nfs/objlayout/objlayout.h +++ b/fs/nfs/objlayout/objlayout.h @@ -6,7 +6,7 @@ * All rights reserved. * * Benny Halevy <bhalevy@panasas.com> - * Boaz Harrosh <bharrosh@panasas.com> + * Boaz Harrosh <ooo@electrozaur.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 @@ -149,11 +149,6 @@ extern void objlayout_read_done(struct objlayout_io_res *oir, extern void objlayout_write_done(struct objlayout_io_res *oir, ssize_t status, bool sync); -extern int objlayout_get_deviceinfo(struct pnfs_layout_hdr *pnfslay, - struct nfs4_deviceid *d_id, struct pnfs_osd_deviceaddr **deviceaddr, - gfp_t gfp_flags); -extern void objlayout_put_deviceinfo(struct pnfs_osd_deviceaddr *deviceaddr); - /* * exported generic objects function vectors */ diff --git a/fs/nfs/objlayout/pnfs_osd_xdr_cli.c b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c index b3918f7ac34d..f093c7ec983b 100644 --- a/fs/nfs/objlayout/pnfs_osd_xdr_cli.c +++ b/fs/nfs/objlayout/pnfs_osd_xdr_cli.c @@ -5,7 +5,7 @@ * All rights reserved. * * Benny Halevy <bhalevy@panasas.com> - * Boaz Harrosh <bharrosh@panasas.com> + * Boaz Harrosh <ooo@electrozaur.com> * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index be7cbce6e4c7..ed0db61f8543 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -481,6 +481,14 @@ size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc, return 0; } + /* + * Limit the request size so that we can still allocate a page array + * for it without upsetting the slab allocator. + */ + if (((desc->pg_count + req->wb_bytes) >> PAGE_SHIFT) * + sizeof(struct page) > PAGE_SIZE) + return 0; + return min(desc->pg_bsize - desc->pg_count, (size_t)req->wb_bytes); } EXPORT_SYMBOL_GPL(nfs_generic_pg_test); @@ -518,7 +526,8 @@ EXPORT_SYMBOL_GPL(nfs_pgio_header_free); */ void nfs_pgio_data_destroy(struct nfs_pgio_header *hdr) { - put_nfs_open_context(hdr->args.context); + if (hdr->args.context) + put_nfs_open_context(hdr->args.context); if (hdr->page_array.pagevec != hdr->page_array.page_array) kfree(hdr->page_array.pagevec); } @@ -743,12 +752,11 @@ int nfs_generic_pgio(struct nfs_pageio_descriptor *desc, nfs_list_remove_request(req); nfs_list_add_request(req, &hdr->pages); - if (WARN_ON_ONCE(pageused >= pagecount)) - return nfs_pgio_error(desc, hdr); - if (!last_page || last_page != req->wb_page) { - *pages++ = last_page = req->wb_page; pageused++; + if (pageused > pagecount) + break; + *pages++ = last_page = req->wb_page; } } if (WARN_ON_ONCE(pageused != pagecount)) diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index a3851debf8a2..0a5dda4d85c2 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -361,22 +361,43 @@ pnfs_put_lseg(struct pnfs_layout_segment *lseg) } EXPORT_SYMBOL_GPL(pnfs_put_lseg); -static void pnfs_put_lseg_async_work(struct work_struct *work) +static void pnfs_free_lseg_async_work(struct work_struct *work) { struct pnfs_layout_segment *lseg; + struct pnfs_layout_hdr *lo; lseg = container_of(work, struct pnfs_layout_segment, pls_work); + lo = lseg->pls_layout; - pnfs_put_lseg(lseg); + pnfs_free_lseg(lseg); + pnfs_put_layout_hdr(lo); } -void -pnfs_put_lseg_async(struct pnfs_layout_segment *lseg) +static void pnfs_free_lseg_async(struct pnfs_layout_segment *lseg) { - INIT_WORK(&lseg->pls_work, pnfs_put_lseg_async_work); + INIT_WORK(&lseg->pls_work, pnfs_free_lseg_async_work); schedule_work(&lseg->pls_work); } -EXPORT_SYMBOL_GPL(pnfs_put_lseg_async); + +void +pnfs_put_lseg_locked(struct pnfs_layout_segment *lseg) +{ + if (!lseg) + return; + + assert_spin_locked(&lseg->pls_layout->plh_inode->i_lock); + + dprintk("%s: lseg %p ref %d valid %d\n", __func__, lseg, + atomic_read(&lseg->pls_refcount), + test_bit(NFS_LSEG_VALID, &lseg->pls_flags)); + if (atomic_dec_and_test(&lseg->pls_refcount)) { + struct pnfs_layout_hdr *lo = lseg->pls_layout; + pnfs_get_layout_hdr(lo); + pnfs_layout_remove_lseg(lo, lseg); + pnfs_free_lseg_async(lseg); + } +} +EXPORT_SYMBOL_GPL(pnfs_put_lseg_locked); static u64 end_offset(u64 start, u64 len) @@ -594,6 +615,9 @@ pnfs_layout_free_bulk_destroy_list(struct list_head *layout_list, dprintk("%s freeing layout for inode %lu\n", __func__, lo->plh_inode->i_ino); inode = lo->plh_inode; + + pnfs_layoutcommit_inode(inode, false); + spin_lock(&inode->i_lock); list_del_init(&lo->plh_bulk_destroy); lo->plh_block_lgets++; /* permanently block new LAYOUTGETs */ @@ -682,17 +706,6 @@ static bool pnfs_seqid_is_newer(u32 s1, u32 s2) return (s32)(s1 - s2) > 0; } -static void -pnfs_verify_layout_stateid(struct pnfs_layout_hdr *lo, - const nfs4_stateid *new, - struct list_head *free_me_list) -{ - if (nfs4_stateid_match_other(&lo->plh_stateid, new)) - return; - /* Layout is new! Kill existing layout segments */ - pnfs_mark_matching_lsegs_invalid(lo, free_me_list, NULL); -} - /* update lo->plh_stateid with new if is more recent */ void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new, @@ -749,7 +762,8 @@ pnfs_choose_layoutget_stateid(nfs4_stateid *dst, struct pnfs_layout_hdr *lo, status = -EAGAIN; } else if (!nfs4_valid_open_stateid(open_state)) { status = -EBADF; - } else if (list_empty(&lo->plh_segs)) { + } else if (list_empty(&lo->plh_segs) || + test_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags)) { int seq; do { @@ -864,6 +878,16 @@ _pnfs_return_layout(struct inode *ino) empty = list_empty(&lo->plh_segs); pnfs_clear_layoutcommit(ino, &tmp_list); pnfs_mark_matching_lsegs_invalid(lo, &tmp_list, NULL); + + if (NFS_SERVER(ino)->pnfs_curr_ld->return_range) { + struct pnfs_layout_range range = { + .iomode = IOMODE_ANY, + .offset = 0, + .length = NFS4_MAX_UINT64, + }; + NFS_SERVER(ino)->pnfs_curr_ld->return_range(lo, &range); + } + /* Don't send a LAYOUTRETURN if list was initially empty */ if (empty) { spin_unlock(&ino->i_lock); @@ -871,6 +895,8 @@ _pnfs_return_layout(struct inode *ino) dprintk("NFS: %s no layout segments to return\n", __func__); goto out; } + + set_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); lo->plh_block_lgets++; spin_unlock(&ino->i_lock); pnfs_free_lseg_list(&tmp_list); @@ -1358,25 +1384,41 @@ pnfs_layout_process(struct nfs4_layoutget *lgp) goto out; } + init_lseg(lo, lseg); + lseg->pls_range = res->range; + spin_lock(&ino->i_lock); if (test_bit(NFS_LAYOUT_BULK_RECALL, &lo->plh_flags)) { dprintk("%s forget reply due to recall\n", __func__); goto out_forget_reply; } - if (pnfs_layoutgets_blocked(lo, 1) || - pnfs_layout_stateid_blocked(lo, &res->stateid)) { + if (pnfs_layoutgets_blocked(lo, 1)) { dprintk("%s forget reply due to state\n", __func__); goto out_forget_reply; } - /* Check that the new stateid matches the old stateid */ - pnfs_verify_layout_stateid(lo, &res->stateid, &free_me); - /* Done processing layoutget. Set the layout stateid */ - pnfs_set_layout_stateid(lo, &res->stateid, false); + if (nfs4_stateid_match_other(&lo->plh_stateid, &res->stateid)) { + /* existing state ID, make sure the sequence number matches. */ + if (pnfs_layout_stateid_blocked(lo, &res->stateid)) { + dprintk("%s forget reply due to sequence\n", __func__); + goto out_forget_reply; + } + pnfs_set_layout_stateid(lo, &res->stateid, false); + } else { + /* + * We got an entirely new state ID. Mark all segments for the + * inode invalid, and don't bother validating the stateid + * sequence number. + */ + pnfs_mark_matching_lsegs_invalid(lo, &free_me, NULL); + + nfs4_stateid_copy(&lo->plh_stateid, &res->stateid); + lo->plh_barrier = be32_to_cpu(res->stateid.seqid); + } + + clear_bit(NFS_LAYOUT_INVALID_STID, &lo->plh_flags); - init_lseg(lo, lseg); - lseg->pls_range = res->range; pnfs_get_lseg(lseg); pnfs_layout_insert_lseg(lo, lseg); @@ -1797,6 +1839,35 @@ pnfs_set_layoutcommit(struct nfs_pgio_header *hdr) } EXPORT_SYMBOL_GPL(pnfs_set_layoutcommit); +void pnfs_commit_set_layoutcommit(struct nfs_commit_data *data) +{ + struct inode *inode = data->inode; + struct nfs_inode *nfsi = NFS_I(inode); + bool mark_as_dirty = false; + + spin_lock(&inode->i_lock); + if (!test_and_set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags)) { + mark_as_dirty = true; + dprintk("%s: Set layoutcommit for inode %lu ", + __func__, inode->i_ino); + } + if (!test_and_set_bit(NFS_LSEG_LAYOUTCOMMIT, &data->lseg->pls_flags)) { + /* references matched in nfs4_layoutcommit_release */ + pnfs_get_lseg(data->lseg); + } + if (data->lwb > nfsi->layout->plh_lwb) + nfsi->layout->plh_lwb = data->lwb; + spin_unlock(&inode->i_lock); + dprintk("%s: lseg %p end_pos %llu\n", + __func__, data->lseg, nfsi->layout->plh_lwb); + + /* if pnfs_layoutcommit_inode() runs between inode locks, the next one + * will be a noop because NFS_INO_LAYOUTCOMMIT will not be set */ + if (mark_as_dirty) + mark_inode_dirty_sync(inode); +} +EXPORT_SYMBOL_GPL(pnfs_commit_set_layoutcommit); + void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data) { struct nfs_server *nfss = NFS_SERVER(data->args.inode); @@ -1817,6 +1888,7 @@ void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data) int pnfs_layoutcommit_inode(struct inode *inode, bool sync) { + struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld; struct nfs4_layoutcommit_data *data; struct nfs_inode *nfsi = NFS_I(inode); loff_t end_pos; @@ -1867,6 +1939,20 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync) data->args.lastbytewritten = end_pos - 1; data->res.server = NFS_SERVER(inode); + if (ld->prepare_layoutcommit) { + status = ld->prepare_layoutcommit(&data->args); + if (status) { + spin_lock(&inode->i_lock); + if (end_pos < nfsi->layout->plh_lwb) + nfsi->layout->plh_lwb = end_pos; + spin_unlock(&inode->i_lock); + put_rpccred(data->cred); + set_bit(NFS_INO_LAYOUTCOMMIT, &nfsi->flags); + goto clear_layoutcommitting; + } + } + + status = nfs4_proc_layoutcommit(data, sync); out: if (status) diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index aca3dff5dae6..9ae5b765b073 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -65,12 +65,15 @@ enum { NFS_LAYOUT_BULK_RECALL, /* bulk recall affecting layout */ NFS_LAYOUT_ROC, /* some lseg had roc bit set */ NFS_LAYOUT_RETURN, /* Return this layout ASAP */ + NFS_LAYOUT_INVALID_STID, /* layout stateid id is invalid */ }; enum layoutdriver_policy_flags { - /* Should the pNFS client commit and return the layout upon a setattr */ + /* Should the pNFS client commit and return the layout upon truncate to + * a smaller size */ PNFS_LAYOUTRET_ON_SETATTR = 1 << 0, PNFS_LAYOUTRET_ON_ERROR = 1 << 1, + PNFS_READ_WHOLE_PAGE = 1 << 2, }; struct nfs4_deviceid_node; @@ -82,6 +85,7 @@ struct pnfs_layoutdriver_type { const char *name; struct module *owner; unsigned flags; + unsigned max_deviceinfo_size; int (*set_layoutdriver) (struct nfs_server *, const struct nfs_fh *); int (*clear_layoutdriver) (struct nfs_server *); @@ -92,6 +96,9 @@ struct pnfs_layoutdriver_type { struct pnfs_layout_segment * (*alloc_lseg) (struct pnfs_layout_hdr *layoutid, struct nfs4_layoutget_res *lgr, gfp_t gfp_flags); void (*free_lseg) (struct pnfs_layout_segment *lseg); + void (*return_range) (struct pnfs_layout_hdr *lo, + struct pnfs_layout_range *range); + /* test for nfs page cache coalescing */ const struct nfs_pageio_ops *pg_read_ops; const struct nfs_pageio_ops *pg_write_ops; @@ -121,14 +128,17 @@ struct pnfs_layoutdriver_type { enum pnfs_try_status (*write_pagelist)(struct nfs_pgio_header *, int); void (*free_deviceid_node) (struct nfs4_deviceid_node *); + struct nfs4_deviceid_node * (*alloc_deviceid_node) + (struct nfs_server *server, struct pnfs_device *pdev, + gfp_t gfp_flags); void (*encode_layoutreturn) (struct pnfs_layout_hdr *layoutid, struct xdr_stream *xdr, const struct nfs4_layoutreturn_args *args); void (*cleanup_layoutcommit) (struct nfs4_layoutcommit_data *data); - - void (*encode_layoutcommit) (struct pnfs_layout_hdr *layoutid, + int (*prepare_layoutcommit) (struct nfs4_layoutcommit_args *args); + void (*encode_layoutcommit) (struct pnfs_layout_hdr *lo, struct xdr_stream *xdr, const struct nfs4_layoutcommit_args *args); }; @@ -171,9 +181,6 @@ extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *); extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *); /* nfs4proc.c */ -extern int nfs4_proc_getdevicelist(struct nfs_server *server, - const struct nfs_fh *fh, - struct pnfs_devicelist *devlist); extern int nfs4_proc_getdeviceinfo(struct nfs_server *server, struct pnfs_device *dev, struct rpc_cred *cred); @@ -183,7 +190,7 @@ extern int nfs4_proc_layoutreturn(struct nfs4_layoutreturn *lrp); /* pnfs.c */ void pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo); void pnfs_put_lseg(struct pnfs_layout_segment *lseg); -void pnfs_put_lseg_async(struct pnfs_layout_segment *lseg); +void pnfs_put_lseg_locked(struct pnfs_layout_segment *lseg); void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, u32); void unset_pnfs_layoutdriver(struct nfs_server *); @@ -219,6 +226,7 @@ void pnfs_roc_release(struct inode *ino); void pnfs_roc_set_barrier(struct inode *ino, u32 barrier); bool pnfs_roc_drain(struct inode *ino, u32 *barrier, struct rpc_task *task); void pnfs_set_layoutcommit(struct nfs_pgio_header *); +void pnfs_commit_set_layoutcommit(struct nfs_commit_data *data); void pnfs_cleanup_layoutcommit(struct nfs4_layoutcommit_data *data); int pnfs_layoutcommit_inode(struct inode *inode, bool sync); int _pnfs_return_layout(struct inode *); @@ -255,18 +263,25 @@ struct nfs4_deviceid_node { atomic_t ref; }; -struct nfs4_deviceid_node *nfs4_find_get_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *); +struct nfs4_deviceid_node * +nfs4_find_get_deviceid(struct nfs_server *server, + const struct nfs4_deviceid *id, struct rpc_cred *cred, + gfp_t gfp_mask); void nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *, const struct nfs_client *, const struct nfs4_deviceid *); -void nfs4_init_deviceid_node(struct nfs4_deviceid_node *, - const struct pnfs_layoutdriver_type *, - const struct nfs_client *, +void nfs4_init_deviceid_node(struct nfs4_deviceid_node *, struct nfs_server *, const struct nfs4_deviceid *); -struct nfs4_deviceid_node *nfs4_insert_deviceid_node(struct nfs4_deviceid_node *); bool nfs4_put_deviceid_node(struct nfs4_deviceid_node *); void nfs4_mark_deviceid_unavailable(struct nfs4_deviceid_node *node); bool nfs4_test_deviceid_unavailable(struct nfs4_deviceid_node *node); void nfs4_deviceid_purge_client(const struct nfs_client *); +static inline struct nfs4_deviceid_node * +nfs4_get_deviceid(struct nfs4_deviceid_node *d) +{ + atomic_inc(&d->ref); + return d; +} + static inline struct pnfs_layout_segment * pnfs_get_lseg(struct pnfs_layout_segment *lseg) { @@ -368,6 +383,14 @@ pnfs_ld_layoutret_on_setattr(struct inode *inode) } static inline bool +pnfs_ld_read_whole_page(struct inode *inode) +{ + if (!pnfs_enabled_sb(NFS_SERVER(inode))) + return false; + return NFS_SERVER(inode)->pnfs_curr_ld->flags & PNFS_READ_WHOLE_PAGE; +} + +static inline bool pnfs_layoutcommit_outstanding(struct inode *inode) { struct nfs_inode *nfsi = NFS_I(inode); @@ -422,10 +445,6 @@ static inline void pnfs_put_lseg(struct pnfs_layout_segment *lseg) { } -static inline void pnfs_put_lseg_async(struct pnfs_layout_segment *lseg) -{ -} - static inline int pnfs_return_layout(struct inode *ino) { return 0; @@ -443,6 +462,12 @@ pnfs_ld_layoutret_on_setattr(struct inode *inode) } static inline bool +pnfs_ld_read_whole_page(struct inode *inode) +{ + return false; +} + +static inline bool pnfs_roc(struct inode *ino) { return false; diff --git a/fs/nfs/pnfs_dev.c b/fs/nfs/pnfs_dev.c index 6da209bd9408..aa2ec0015183 100644 --- a/fs/nfs/pnfs_dev.c +++ b/fs/nfs/pnfs_dev.c @@ -29,6 +29,9 @@ */ #include <linux/export.h> +#include <linux/nfs_fs.h> +#include "nfs4session.h" +#include "internal.h" #include "pnfs.h" #define NFSDBG_FACILITY NFSDBG_PNFS @@ -89,6 +92,74 @@ _lookup_deviceid(const struct pnfs_layoutdriver_type *ld, return NULL; } +static struct nfs4_deviceid_node * +nfs4_get_device_info(struct nfs_server *server, + const struct nfs4_deviceid *dev_id, + struct rpc_cred *cred, gfp_t gfp_flags) +{ + struct nfs4_deviceid_node *d = NULL; + struct pnfs_device *pdev = NULL; + struct page **pages = NULL; + u32 max_resp_sz; + int max_pages; + int rc, i; + + /* + * Use the session max response size as the basis for setting + * GETDEVICEINFO's maxcount + */ + max_resp_sz = server->nfs_client->cl_session->fc_attrs.max_resp_sz; + if (server->pnfs_curr_ld->max_deviceinfo_size && + server->pnfs_curr_ld->max_deviceinfo_size < max_resp_sz) + max_resp_sz = server->pnfs_curr_ld->max_deviceinfo_size; + max_pages = nfs_page_array_len(0, max_resp_sz); + dprintk("%s: server %p max_resp_sz %u max_pages %d\n", + __func__, server, max_resp_sz, max_pages); + + pdev = kzalloc(sizeof(*pdev), gfp_flags); + if (!pdev) + return NULL; + + pages = kcalloc(max_pages, sizeof(struct page *), gfp_flags); + if (!pages) + goto out_free_pdev; + + for (i = 0; i < max_pages; i++) { + pages[i] = alloc_page(gfp_flags); + if (!pages[i]) + goto out_free_pages; + } + + memcpy(&pdev->dev_id, dev_id, sizeof(*dev_id)); + pdev->layout_type = server->pnfs_curr_ld->id; + pdev->pages = pages; + pdev->pgbase = 0; + pdev->pglen = max_resp_sz; + pdev->mincount = 0; + pdev->maxcount = max_resp_sz - nfs41_maxgetdevinfo_overhead; + + rc = nfs4_proc_getdeviceinfo(server, pdev, cred); + dprintk("%s getdevice info returns %d\n", __func__, rc); + if (rc) + goto out_free_pages; + + /* + * Found new device, need to decode it and then add it to the + * list of known devices for this mountpoint. + */ + d = server->pnfs_curr_ld->alloc_deviceid_node(server, pdev, + gfp_flags); + +out_free_pages: + for (i = 0; i < max_pages; i++) + __free_page(pages[i]); + kfree(pages); +out_free_pdev: + kfree(pdev); + dprintk("<-- %s d %p\n", __func__, d); + return d; +} + /* * Lookup a deviceid in cache and get a reference count on it if found * @@ -96,14 +167,14 @@ _lookup_deviceid(const struct pnfs_layoutdriver_type *ld, * @id deviceid to look up */ static struct nfs4_deviceid_node * -_find_get_deviceid(const struct pnfs_layoutdriver_type *ld, - const struct nfs_client *clp, const struct nfs4_deviceid *id, - long hash) +__nfs4_find_get_deviceid(struct nfs_server *server, + const struct nfs4_deviceid *id, long hash) { struct nfs4_deviceid_node *d; rcu_read_lock(); - d = _lookup_deviceid(ld, clp, id, hash); + d = _lookup_deviceid(server->pnfs_curr_ld, server->nfs_client, id, + hash); if (d != NULL) atomic_inc(&d->ref); rcu_read_unlock(); @@ -111,10 +182,33 @@ _find_get_deviceid(const struct pnfs_layoutdriver_type *ld, } struct nfs4_deviceid_node * -nfs4_find_get_deviceid(const struct pnfs_layoutdriver_type *ld, - const struct nfs_client *clp, const struct nfs4_deviceid *id) +nfs4_find_get_deviceid(struct nfs_server *server, + const struct nfs4_deviceid *id, struct rpc_cred *cred, + gfp_t gfp_mask) { - return _find_get_deviceid(ld, clp, id, nfs4_deviceid_hash(id)); + long hash = nfs4_deviceid_hash(id); + struct nfs4_deviceid_node *d, *new; + + d = __nfs4_find_get_deviceid(server, id, hash); + if (d) + return d; + + new = nfs4_get_device_info(server, id, cred, gfp_mask); + if (!new) + return new; + + spin_lock(&nfs4_deviceid_lock); + d = __nfs4_find_get_deviceid(server, id, hash); + if (d) { + spin_unlock(&nfs4_deviceid_lock); + server->pnfs_curr_ld->free_deviceid_node(new); + return d; + } + hlist_add_head_rcu(&new->node, &nfs4_deviceid_cache[hash]); + atomic_inc(&new->ref); + spin_unlock(&nfs4_deviceid_lock); + + return new; } EXPORT_SYMBOL_GPL(nfs4_find_get_deviceid); @@ -151,15 +245,13 @@ nfs4_delete_deviceid(const struct pnfs_layoutdriver_type *ld, EXPORT_SYMBOL_GPL(nfs4_delete_deviceid); void -nfs4_init_deviceid_node(struct nfs4_deviceid_node *d, - const struct pnfs_layoutdriver_type *ld, - const struct nfs_client *nfs_client, +nfs4_init_deviceid_node(struct nfs4_deviceid_node *d, struct nfs_server *server, const struct nfs4_deviceid *id) { INIT_HLIST_NODE(&d->node); INIT_HLIST_NODE(&d->tmpnode); - d->ld = ld; - d->nfs_client = nfs_client; + d->ld = server->pnfs_curr_ld; + d->nfs_client = server->nfs_client; d->flags = 0; d->deviceid = *id; atomic_set(&d->ref, 1); @@ -167,39 +259,6 @@ nfs4_init_deviceid_node(struct nfs4_deviceid_node *d, EXPORT_SYMBOL_GPL(nfs4_init_deviceid_node); /* - * Uniquely initialize and insert a deviceid node into cache - * - * @new new deviceid node - * Note that the caller must set up the following members: - * new->ld - * new->nfs_client - * new->deviceid - * - * @ret the inserted node, if none found, otherwise, the found entry. - */ -struct nfs4_deviceid_node * -nfs4_insert_deviceid_node(struct nfs4_deviceid_node *new) -{ - struct nfs4_deviceid_node *d; - long hash; - - spin_lock(&nfs4_deviceid_lock); - hash = nfs4_deviceid_hash(&new->deviceid); - d = _find_get_deviceid(new->ld, new->nfs_client, &new->deviceid, hash); - if (d) { - spin_unlock(&nfs4_deviceid_lock); - return d; - } - - hlist_add_head_rcu(&new->node, &nfs4_deviceid_cache[hash]); - spin_unlock(&nfs4_deviceid_lock); - atomic_inc(&new->ref); - - return new; -} -EXPORT_SYMBOL_GPL(nfs4_insert_deviceid_node); - -/* * Dereference a deviceid node and delete it when its reference count drops * to zero. * @@ -299,4 +358,3 @@ nfs4_deviceid_mark_client_invalid(struct nfs_client *clp) } rcu_read_unlock(); } - diff --git a/fs/nfs/super.c b/fs/nfs/super.c index e4499d5b51e8..31a11b0e885d 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -2065,11 +2065,6 @@ static int nfs23_validate_mount_data(void *options, return NFS_TEXT_DATA; } -#if !IS_ENABLED(CONFIG_NFS_V3) - if (args->version == 3) - goto out_v3_not_compiled; -#endif /* !CONFIG_NFS_V3 */ - return 0; out_no_data: @@ -2085,12 +2080,6 @@ out_no_sec: dfprintk(MOUNT, "NFS: nfs_mount_data version supports only AUTH_SYS\n"); return -EINVAL; -#if !IS_ENABLED(CONFIG_NFS_V3) -out_v3_not_compiled: - dfprintk(MOUNT, "NFS: NFSv3 is not compiled into kernel\n"); - return -EPROTONOSUPPORT; -#endif /* !CONFIG_NFS_V3 */ - out_nomem: dfprintk(MOUNT, "NFS: not enough memory to handle mount options\n"); return -ENOMEM; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 175d5d073ccf..12493846a2d3 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -49,6 +49,9 @@ static const struct nfs_rw_ops nfs_rw_write_ops; static void nfs_clear_request_commit(struct nfs_page *req); static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo, struct inode *inode); +static struct nfs_page * +nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi, + struct page *page); static struct kmem_cache *nfs_wdata_cachep; static mempool_t *nfs_wdata_mempool; @@ -95,38 +98,6 @@ static void nfs_context_set_write_error(struct nfs_open_context *ctx, int error) } /* - * nfs_page_search_commits_for_head_request_locked - * - * Search through commit lists on @inode for the head request for @page. - * Must be called while holding the inode (which is cinfo) lock. - * - * Returns the head request if found, or NULL if not found. - */ -static struct nfs_page * -nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi, - struct page *page) -{ - struct nfs_page *freq, *t; - struct nfs_commit_info cinfo; - struct inode *inode = &nfsi->vfs_inode; - - nfs_init_cinfo_from_inode(&cinfo, inode); - - /* search through pnfs commit lists */ - freq = pnfs_search_commit_reqs(inode, &cinfo, page); - if (freq) - return freq->wb_head; - - /* Linearly search the commit list for the correct request */ - list_for_each_entry_safe(freq, t, &cinfo.mds->list, wb_list) { - if (freq->wb_page == page) - return freq->wb_head; - } - - return NULL; -} - -/* * nfs_page_find_head_request_locked - find head request associated with @page * * must be called while holding the inode lock. @@ -271,11 +242,14 @@ static void nfs_mark_uptodate(struct nfs_page *req) static int wb_priority(struct writeback_control *wbc) { + int ret = 0; if (wbc->for_reclaim) return FLUSH_HIGHPRI | FLUSH_STABLE; + if (wbc->sync_mode == WB_SYNC_ALL) + ret = FLUSH_COND_STABLE; if (wbc->for_kupdate || wbc->for_background) - return FLUSH_LOWPRI | FLUSH_COND_STABLE; - return FLUSH_COND_STABLE; + ret |= FLUSH_LOWPRI; + return ret; } /* @@ -731,6 +705,8 @@ static void nfs_inode_remove_request(struct nfs_page *req) if (likely(!PageSwapCache(head->wb_page))) { set_page_private(head->wb_page, 0); ClearPagePrivate(head->wb_page); + smp_mb__after_atomic(); + wake_up_page(head->wb_page, PG_private); clear_bit(PG_MAPPED, &head->wb_flags); } nfsi->npages--; @@ -749,7 +725,38 @@ nfs_mark_request_dirty(struct nfs_page *req) __set_page_dirty_nobuffers(req->wb_page); } -#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4) +/* + * nfs_page_search_commits_for_head_request_locked + * + * Search through commit lists on @inode for the head request for @page. + * Must be called while holding the inode (which is cinfo) lock. + * + * Returns the head request if found, or NULL if not found. + */ +static struct nfs_page * +nfs_page_search_commits_for_head_request_locked(struct nfs_inode *nfsi, + struct page *page) +{ + struct nfs_page *freq, *t; + struct nfs_commit_info cinfo; + struct inode *inode = &nfsi->vfs_inode; + + nfs_init_cinfo_from_inode(&cinfo, inode); + + /* search through pnfs commit lists */ + freq = pnfs_search_commit_reqs(inode, &cinfo, page); + if (freq) + return freq->wb_head; + + /* Linearly search the commit list for the correct request */ + list_for_each_entry_safe(freq, t, &cinfo.mds->list, wb_list) { + if (freq->wb_page == page) + return freq->wb_head; + } + + return NULL; +} + /** * nfs_request_add_commit_list - add request to a commit list * @req: pointer to a struct nfs_page @@ -867,36 +874,6 @@ int nfs_write_need_commit(struct nfs_pgio_header *hdr) return hdr->verf.committed != NFS_FILE_SYNC; } -#else -static void nfs_init_cinfo_from_inode(struct nfs_commit_info *cinfo, - struct inode *inode) -{ -} - -void nfs_init_cinfo(struct nfs_commit_info *cinfo, - struct inode *inode, - struct nfs_direct_req *dreq) -{ -} - -void -nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg, - struct nfs_commit_info *cinfo) -{ -} - -static void -nfs_clear_request_commit(struct nfs_page *req) -{ -} - -int nfs_write_need_commit(struct nfs_pgio_header *hdr) -{ - return 0; -} - -#endif - static void nfs_write_completion(struct nfs_pgio_header *hdr) { struct nfs_commit_info cinfo; @@ -932,7 +909,6 @@ out: hdr->release(hdr); } -#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4) unsigned long nfs_reqs_to_commit(struct nfs_commit_info *cinfo) { @@ -989,19 +965,6 @@ nfs_scan_commit(struct inode *inode, struct list_head *dst, return ret; } -#else -unsigned long nfs_reqs_to_commit(struct nfs_commit_info *cinfo) -{ - return 0; -} - -int nfs_scan_commit(struct inode *inode, struct list_head *dst, - struct nfs_commit_info *cinfo) -{ - return 0; -} -#endif - /* * Search for an existing write request, and attempt to update * it to reflect a new dirty region on a given page. @@ -1394,7 +1357,6 @@ static int nfs_writeback_done(struct rpc_task *task, return status; nfs_add_stats(inode, NFSIOS_SERVERWRITTENBYTES, hdr->res.count); -#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4) if (hdr->res.verf->committed < hdr->args.stable && task->tk_status >= 0) { /* We tried a write call, but the server did not @@ -1416,7 +1378,6 @@ static int nfs_writeback_done(struct rpc_task *task, complain = jiffies + 300 * HZ; } } -#endif /* Deal with the suid/sgid bit corner case */ if (nfs_should_remove_suid(inode)) @@ -1469,7 +1430,6 @@ static void nfs_writeback_result(struct rpc_task *task, } -#if IS_ENABLED(CONFIG_NFS_V3) || IS_ENABLED(CONFIG_NFS_V4) static int nfs_commit_set_lock(struct nfs_inode *nfsi, int may_wait) { int ret; @@ -1538,6 +1498,18 @@ int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data, } EXPORT_SYMBOL_GPL(nfs_initiate_commit); +static loff_t nfs_get_lwb(struct list_head *head) +{ + loff_t lwb = 0; + struct nfs_page *req; + + list_for_each_entry(req, head, wb_list) + if (lwb < (req_offset(req) + req->wb_bytes)) + lwb = req_offset(req) + req->wb_bytes; + + return lwb; +} + /* * Set up the argument/result storage required for the RPC call. */ @@ -1557,6 +1529,9 @@ void nfs_init_commit(struct nfs_commit_data *data, data->inode = inode; data->cred = first->wb_context->cred; data->lseg = lseg; /* reference transferred */ + /* only set lwb for pnfs commit */ + if (lseg) + data->lwb = nfs_get_lwb(&data->pages); data->mds_ops = &nfs_commit_ops; data->completion_ops = cinfo->completion_ops; data->dreq = cinfo->dreq; @@ -1636,6 +1611,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) struct nfs_page *req; int status = data->task.tk_status; struct nfs_commit_info cinfo; + struct nfs_server *nfss; while (!list_empty(&data->pages)) { req = nfs_list_entry(data->pages.next); @@ -1669,6 +1645,10 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data) next: nfs_unlock_and_release_request(req); } + nfss = NFS_SERVER(data->inode); + if (atomic_long_read(&nfss->writeback) < NFS_CONGESTION_OFF_THRESH) + clear_bdi_congested(&nfss->backing_dev_info, BLK_RW_ASYNC); + nfs_init_cinfo(&cinfo, data->inode, data->dreq); if (atomic_dec_and_test(&cinfo.mds->rpcs_out)) nfs_commit_clear_lock(NFS_I(data->inode)); @@ -1778,12 +1758,6 @@ out_mark_dirty: __mark_inode_dirty(inode, I_DIRTY_DATASYNC); return ret; } -#else -static int nfs_commit_unstable_pages(struct inode *inode, struct writeback_control *wbc) -{ - return 0; -} -#endif int nfs_write_inode(struct inode *inode, struct writeback_control *wbc) { diff --git a/fs/nfs_common/Makefile b/fs/nfs_common/Makefile index f689ed82af3a..d153ca3ea577 100644 --- a/fs/nfs_common/Makefile +++ b/fs/nfs_common/Makefile @@ -3,5 +3,6 @@ # obj-$(CONFIG_NFS_ACL_SUPPORT) += nfs_acl.o - nfs_acl-objs := nfsacl.o + +obj-$(CONFIG_GRACE_PERIOD) += grace.o diff --git a/fs/lockd/grace.c b/fs/nfs_common/grace.c index 6d1ee7204c88..ae6e58ea4de5 100644 --- a/fs/lockd/grace.c +++ b/fs/nfs_common/grace.c @@ -1,17 +1,20 @@ /* * Common code for control of lockd and nfsv4 grace periods. + * + * Transplanted from lockd code */ #include <linux/module.h> -#include <linux/lockd/bind.h> #include <net/net_namespace.h> +#include <net/netns/generic.h> +#include <linux/fs.h> -#include "netns.h" - +static int grace_net_id; static DEFINE_SPINLOCK(grace_lock); /** * locks_start_grace + * @net: net namespace that this lock manager belongs to * @lm: who this grace period is for * * A grace period is a period during which locks should not be given @@ -21,18 +24,20 @@ static DEFINE_SPINLOCK(grace_lock); * * This function is called to start a grace period. */ -void locks_start_grace(struct net *net, struct lock_manager *lm) +void +locks_start_grace(struct net *net, struct lock_manager *lm) { - struct lockd_net *ln = net_generic(net, lockd_net_id); + struct list_head *grace_list = net_generic(net, grace_net_id); spin_lock(&grace_lock); - list_add(&lm->list, &ln->grace_list); + list_add(&lm->list, grace_list); spin_unlock(&grace_lock); } EXPORT_SYMBOL_GPL(locks_start_grace); /** * locks_end_grace + * @net: net namespace that this lock manager belongs to * @lm: who this grace period is for * * Call this function to state that the given lock manager is ready to @@ -41,7 +46,8 @@ EXPORT_SYMBOL_GPL(locks_start_grace); * Note that callers count on it being safe to call this more than once, * and the second call should be a no-op. */ -void locks_end_grace(struct lock_manager *lm) +void +locks_end_grace(struct lock_manager *lm) { spin_lock(&grace_lock); list_del_init(&lm->list); @@ -56,10 +62,52 @@ EXPORT_SYMBOL_GPL(locks_end_grace); * to answer ordinary lock requests, and when they should accept only * lock reclaims. */ -int locks_in_grace(struct net *net) +int +locks_in_grace(struct net *net) { - struct lockd_net *ln = net_generic(net, lockd_net_id); + struct list_head *grace_list = net_generic(net, grace_net_id); - return !list_empty(&ln->grace_list); + return !list_empty(grace_list); } EXPORT_SYMBOL_GPL(locks_in_grace); + +static int __net_init +grace_init_net(struct net *net) +{ + struct list_head *grace_list = net_generic(net, grace_net_id); + + INIT_LIST_HEAD(grace_list); + return 0; +} + +static void __net_exit +grace_exit_net(struct net *net) +{ + struct list_head *grace_list = net_generic(net, grace_net_id); + + BUG_ON(!list_empty(grace_list)); +} + +static struct pernet_operations grace_net_ops = { + .init = grace_init_net, + .exit = grace_exit_net, + .id = &grace_net_id, + .size = sizeof(struct list_head), +}; + +static int __init +init_grace(void) +{ + return register_pernet_subsys(&grace_net_ops); +} + +static void __exit +exit_grace(void) +{ + unregister_pernet_subsys(&grace_net_ops); +} + +MODULE_AUTHOR("Jeff Layton <jlayton@primarydata.com>"); +MODULE_LICENSE("GPL"); +module_init(init_grace) +module_exit(exit_grace) diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index f994e750e0d1..73395156bdb4 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -71,6 +71,7 @@ config NFSD_V4 select FS_POSIX_ACL select SUNRPC_GSS select CRYPTO + select GRACE_PERIOD help This option enables support in your system's NFS server for version 4 of the NFS protocol (RFC 3530). @@ -94,9 +95,6 @@ config NFSD_V4_SECURITY_LABEL If you do not wish to enable fine-grained security labels SELinux or Smack policies on NFSv4 files, say N. - WARNING: there is still a chance of backwards-incompatible protocol changes. - For now we recommend "Y" only for developers and testers. - config NFSD_FAULT_INJECTION bool "NFS server manual fault injection" depends on NFSD_V4 && DEBUG_KERNEL diff --git a/fs/nfsd/cache.h b/fs/nfsd/cache.h index b582f9ab6b2a..dd96a3830004 100644 --- a/fs/nfsd/cache.h +++ b/fs/nfsd/cache.h @@ -18,7 +18,6 @@ * is much larger than a sockaddr_in6. */ struct svc_cacherep { - struct hlist_node c_hash; struct list_head c_lru; unsigned char c_state, /* unused, inprog, done */ diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 72ffd7cce3c3..30a739d896ff 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -1145,6 +1145,7 @@ static struct flags { { NFSEXP_ALLSQUASH, {"all_squash", ""}}, { NFSEXP_ASYNC, {"async", "sync"}}, { NFSEXP_GATHERED_WRITES, {"wdelay", "no_wdelay"}}, + { NFSEXP_NOREADDIRPLUS, {"nordirplus", ""}}, { NFSEXP_NOHIDE, {"nohide", ""}}, { NFSEXP_CROSSMOUNT, {"crossmnt", ""}}, { NFSEXP_NOSUBTREECHECK, {"no_subtree_check", ""}}, diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index fa2525b2e9d7..12f2aab4f614 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -223,11 +223,6 @@ nfsd3_proc_create(struct svc_rqst *rqstp, struct nfsd3_createargs *argp, newfhp = fh_init(&resp->fh, NFS3_FHSIZE); attr = &argp->attrs; - /* Get the directory inode */ - nfserr = fh_verify(rqstp, dirfhp, S_IFDIR, NFSD_MAY_CREATE); - if (nfserr) - RETURN_STATUS(nfserr); - /* Unfudge the mode bits */ attr->ia_mode &= ~S_IFMT; if (!(attr->ia_valid & ATTR_MODE)) { @@ -471,6 +466,14 @@ nfsd3_proc_readdirplus(struct svc_rqst *rqstp, struct nfsd3_readdirargs *argp, resp->buflen = resp->count; resp->rqstp = rqstp; offset = argp->cookie; + + nfserr = fh_verify(rqstp, &resp->fh, S_IFDIR, NFSD_MAY_NOP); + if (nfserr) + RETURN_STATUS(nfserr); + + if (resp->fh.fh_export->ex_flags & NFSEXP_NOREADDIRPLUS) + RETURN_STATUS(nfserr_notsupp); + nfserr = nfsd_readdir(rqstp, &resp->fh, &offset, &resp->common, diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index e0be57b0f79b..ed2b1151b171 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -49,12 +49,6 @@ static void nfsd4_mark_cb_fault(struct nfs4_client *, int reason); /* Index of predefined Linux callback client operations */ -enum { - NFSPROC4_CLNT_CB_NULL = 0, - NFSPROC4_CLNT_CB_RECALL, - NFSPROC4_CLNT_CB_SEQUENCE, -}; - struct nfs4_cb_compound_hdr { /* args */ u32 ident; /* minorversion 0 only */ @@ -494,7 +488,7 @@ static void nfs4_xdr_enc_cb_null(struct rpc_rqst *req, struct xdr_stream *xdr, static void nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, struct xdr_stream *xdr, const struct nfsd4_callback *cb) { - const struct nfs4_delegation *args = cb->cb_op; + const struct nfs4_delegation *dp = cb_to_delegation(cb); struct nfs4_cb_compound_hdr hdr = { .ident = cb->cb_clp->cl_cb_ident, .minorversion = cb->cb_minorversion, @@ -502,7 +496,7 @@ static void nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, struct xdr_stream *xdr, encode_cb_compound4args(xdr, &hdr); encode_cb_sequence4args(xdr, cb, &hdr); - encode_cb_recall4args(xdr, args, &hdr); + encode_cb_recall4args(xdr, dp, &hdr); encode_cb_nops(&hdr); } @@ -746,27 +740,6 @@ static const struct rpc_call_ops nfsd4_cb_probe_ops = { static struct workqueue_struct *callback_wq; -static void run_nfsd4_cb(struct nfsd4_callback *cb) -{ - queue_work(callback_wq, &cb->cb_work); -} - -static void do_probe_callback(struct nfs4_client *clp) -{ - struct nfsd4_callback *cb = &clp->cl_cb_null; - - cb->cb_op = NULL; - cb->cb_clp = clp; - - cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_NULL]; - cb->cb_msg.rpc_argp = NULL; - cb->cb_msg.rpc_resp = NULL; - - cb->cb_ops = &nfsd4_cb_probe_ops; - - run_nfsd4_cb(cb); -} - /* * Poke the callback thread to process any updates to the callback * parameters, and send a null probe. @@ -775,7 +748,7 @@ void nfsd4_probe_callback(struct nfs4_client *clp) { clp->cl_cb_state = NFSD4_CB_UNKNOWN; set_bit(NFSD4_CLIENT_CB_UPDATE, &clp->cl_flags); - do_probe_callback(clp); + nfsd4_run_cb(&clp->cl_cb_null); } void nfsd4_probe_callback_sync(struct nfs4_client *clp) @@ -847,23 +820,9 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata) rpc_wake_up_next(&clp->cl_cb_waitq); dprintk("%s: freed slot, new seqid=%d\n", __func__, clp->cl_cb_session->se_cb_seq_nr); - - /* We're done looking into the sequence information */ - task->tk_msg.rpc_resp = NULL; } -} - - -static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata) -{ - struct nfsd4_callback *cb = calldata; - struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall); - struct nfs4_client *clp = cb->cb_clp; - struct rpc_clnt *current_rpc_client = clp->cl_cb_client; - - nfsd4_cb_done(task, calldata); - if (current_rpc_client != task->tk_client) { + if (clp->cl_cb_client != task->tk_client) { /* We're shutting down or changing cl_cb_client; leave * it to nfsd4_process_cb_update to restart the call if * necessary. */ @@ -872,47 +831,42 @@ static void nfsd4_cb_recall_done(struct rpc_task *task, void *calldata) if (cb->cb_done) return; - switch (task->tk_status) { + + switch (cb->cb_ops->done(cb, task)) { case 0: - cb->cb_done = true; + task->tk_status = 0; + rpc_restart_call_prepare(task); return; - case -EBADHANDLE: - case -NFS4ERR_BAD_STATEID: - /* Race: client probably got cb_recall - * before open reply granting delegation */ + case 1: break; - default: + case -1: /* Network partition? */ nfsd4_mark_cb_down(clp, task->tk_status); + break; + default: + BUG(); } - if (dp->dl_retries--) { - rpc_delay(task, 2*HZ); - task->tk_status = 0; - rpc_restart_call_prepare(task); - return; - } - nfsd4_mark_cb_down(clp, task->tk_status); cb->cb_done = true; } -static void nfsd4_cb_recall_release(void *calldata) +static void nfsd4_cb_release(void *calldata) { struct nfsd4_callback *cb = calldata; struct nfs4_client *clp = cb->cb_clp; - struct nfs4_delegation *dp = container_of(cb, struct nfs4_delegation, dl_recall); if (cb->cb_done) { spin_lock(&clp->cl_lock); list_del(&cb->cb_per_client); spin_unlock(&clp->cl_lock); - nfs4_put_stid(&dp->dl_stid); + + cb->cb_ops->release(cb); } } -static const struct rpc_call_ops nfsd4_cb_recall_ops = { +static const struct rpc_call_ops nfsd4_cb_ops = { .rpc_call_prepare = nfsd4_cb_prepare, - .rpc_call_done = nfsd4_cb_recall_done, - .rpc_release = nfsd4_cb_recall_release, + .rpc_call_done = nfsd4_cb_done, + .rpc_release = nfsd4_cb_release, }; int nfsd4_create_callback_queue(void) @@ -937,16 +891,10 @@ void nfsd4_shutdown_callback(struct nfs4_client *clp) * instead, nfsd4_run_cb_null() will detect the killed * client, destroy the rpc client, and stop: */ - do_probe_callback(clp); + nfsd4_run_cb(&clp->cl_cb_null); flush_workqueue(callback_wq); } -static void nfsd4_release_cb(struct nfsd4_callback *cb) -{ - if (cb->cb_ops->rpc_release) - cb->cb_ops->rpc_release(cb); -} - /* requires cl_lock: */ static struct nfsd4_conn * __nfsd4_find_backchannel(struct nfs4_client *clp) { @@ -1009,63 +957,49 @@ static void nfsd4_process_cb_update(struct nfsd4_callback *cb) } /* Yay, the callback channel's back! Restart any callbacks: */ list_for_each_entry(cb, &clp->cl_callbacks, cb_per_client) - run_nfsd4_cb(cb); + queue_work(callback_wq, &cb->cb_work); } static void -nfsd4_run_callback_rpc(struct nfsd4_callback *cb) +nfsd4_run_cb_work(struct work_struct *work) { + struct nfsd4_callback *cb = + container_of(work, struct nfsd4_callback, cb_work); struct nfs4_client *clp = cb->cb_clp; struct rpc_clnt *clnt; + if (cb->cb_ops && cb->cb_ops->prepare) + cb->cb_ops->prepare(cb); + if (clp->cl_flags & NFSD4_CLIENT_CB_FLAG_MASK) nfsd4_process_cb_update(cb); clnt = clp->cl_cb_client; if (!clnt) { /* Callback channel broken, or client killed; give up: */ - nfsd4_release_cb(cb); + if (cb->cb_ops && cb->cb_ops->release) + cb->cb_ops->release(cb); return; } cb->cb_msg.rpc_cred = clp->cl_cb_cred; rpc_call_async(clnt, &cb->cb_msg, RPC_TASK_SOFT | RPC_TASK_SOFTCONN, - cb->cb_ops, cb); + cb->cb_ops ? &nfsd4_cb_ops : &nfsd4_cb_probe_ops, cb); } -void -nfsd4_run_cb_null(struct work_struct *w) +void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp, + struct nfsd4_callback_ops *ops, enum nfsd4_cb_op op) { - struct nfsd4_callback *cb = container_of(w, struct nfsd4_callback, - cb_work); - nfsd4_run_callback_rpc(cb); -} - -void -nfsd4_run_cb_recall(struct work_struct *w) -{ - struct nfsd4_callback *cb = container_of(w, struct nfsd4_callback, - cb_work); - - nfsd4_prepare_cb_recall(cb->cb_op); - nfsd4_run_callback_rpc(cb); -} - -void nfsd4_cb_recall(struct nfs4_delegation *dp) -{ - struct nfsd4_callback *cb = &dp->dl_recall; - struct nfs4_client *clp = dp->dl_stid.sc_client; - - dp->dl_retries = 1; - cb->cb_op = dp; cb->cb_clp = clp; - cb->cb_msg.rpc_proc = &nfs4_cb_procedures[NFSPROC4_CLNT_CB_RECALL]; + cb->cb_msg.rpc_proc = &nfs4_cb_procedures[op]; cb->cb_msg.rpc_argp = cb; cb->cb_msg.rpc_resp = cb; - - cb->cb_ops = &nfsd4_cb_recall_ops; - + cb->cb_ops = ops; + INIT_WORK(&cb->cb_work, nfsd4_run_cb_work); INIT_LIST_HEAD(&cb->cb_per_client); cb->cb_done = true; +} - run_nfsd4_cb(&dp->dl_recall); +void nfsd4_run_cb(struct nfsd4_callback *cb) +{ + queue_work(callback_wq, &cb->cb_work); } diff --git a/fs/nfsd/nfs4idmap.c b/fs/nfsd/nfs4idmap.c index a0ab0a847d69..e1b3d3d472da 100644 --- a/fs/nfsd/nfs4idmap.c +++ b/fs/nfsd/nfs4idmap.c @@ -215,7 +215,8 @@ idtoname_parse(struct cache_detail *cd, char *buf, int buflen) memset(&ent, 0, sizeof(ent)); /* Authentication name */ - if (qword_get(&buf, buf1, PAGE_SIZE) <= 0) + len = qword_get(&buf, buf1, PAGE_SIZE); + if (len <= 0 || len >= IDMAP_NAMESZ) goto out; memcpy(ent.authname, buf1, sizeof(ent.authname)); @@ -245,12 +246,10 @@ idtoname_parse(struct cache_detail *cd, char *buf, int buflen) /* Name */ error = -EINVAL; len = qword_get(&buf, buf1, PAGE_SIZE); - if (len < 0) + if (len < 0 || len >= IDMAP_NAMESZ) goto out; if (len == 0) set_bit(CACHE_NEGATIVE, &ent.h.flags); - else if (len >= IDMAP_NAMESZ) - goto out; else memcpy(ent.name, buf1, sizeof(ent.name)); error = -ENOMEM; @@ -259,15 +258,12 @@ idtoname_parse(struct cache_detail *cd, char *buf, int buflen) goto out; cache_put(&res->h, cd); - error = 0; out: kfree(buf1); - return error; } - static struct ent * idtoname_lookup(struct cache_detail *cd, struct ent *item) { @@ -368,7 +364,7 @@ nametoid_parse(struct cache_detail *cd, char *buf, int buflen) { struct ent ent, *res; char *buf1; - int error = -EINVAL; + int len, error = -EINVAL; if (buf[buflen - 1] != '\n') return (-EINVAL); @@ -381,7 +377,8 @@ nametoid_parse(struct cache_detail *cd, char *buf, int buflen) memset(&ent, 0, sizeof(ent)); /* Authentication name */ - if (qword_get(&buf, buf1, PAGE_SIZE) <= 0) + len = qword_get(&buf, buf1, PAGE_SIZE); + if (len <= 0 || len >= IDMAP_NAMESZ) goto out; memcpy(ent.authname, buf1, sizeof(ent.authname)); @@ -392,8 +389,8 @@ nametoid_parse(struct cache_detail *cd, char *buf, int buflen) IDMAP_TYPE_USER : IDMAP_TYPE_GROUP; /* Name */ - error = qword_get(&buf, buf1, PAGE_SIZE); - if (error <= 0 || error >= IDMAP_NAMESZ) + len = qword_get(&buf, buf1, PAGE_SIZE); + if (len <= 0 || len >= IDMAP_NAMESZ) goto out; memcpy(ent.name, buf1, sizeof(ent.name)); @@ -421,7 +418,6 @@ nametoid_parse(struct cache_detail *cd, char *buf, int buflen) error = 0; out: kfree(buf1); - return (error); } diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 5e0dc528a0e8..cdeb3cfd6f32 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -1013,6 +1013,49 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return status; } +static __be32 +nfsd4_seek(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, + struct nfsd4_seek *seek) +{ + int whence; + __be32 status; + struct file *file; + + status = nfs4_preprocess_stateid_op(SVC_NET(rqstp), cstate, + &seek->seek_stateid, + RD_STATE, &file); + if (status) { + dprintk("NFSD: nfsd4_seek: couldn't process stateid!\n"); + return status; + } + + switch (seek->seek_whence) { + case NFS4_CONTENT_DATA: + whence = SEEK_DATA; + break; + case NFS4_CONTENT_HOLE: + whence = SEEK_HOLE; + break; + default: + status = nfserr_union_notsupp; + goto out; + } + + /* + * Note: This call does change file->f_pos, but nothing in NFSD + * should ever file->f_pos. + */ + seek->seek_pos = vfs_llseek(file, seek->seek_offset, whence); + if (seek->seek_pos < 0) + status = nfserrno(seek->seek_pos); + else if (seek->seek_pos >= i_size_read(file_inode(file))) + seek->seek_eof = true; + +out: + fput(file); + return status; +} + /* This routine never returns NFS_OK! If there are no other errors, it * will return NFSERR_SAME or NFSERR_NOT_SAME depending on whether the * attributes matched. VERIFY is implemented by mapping NFSERR_SAME @@ -1881,6 +1924,12 @@ static struct nfsd4_operation nfsd4_ops[] = { .op_get_currentstateid = (stateid_getter)nfsd4_get_freestateid, .op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize, }, + + /* NFSv4.2 operations */ + [OP_SEEK] = { + .op_func = (nfsd4op_func)nfsd4_seek, + .op_name = "OP_SEEK", + }, }; int nfsd4_max_reply(struct svc_rqst *rqstp, struct nfsd4_op *op) diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 9c271f42604a..a25490ae6c62 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -58,7 +58,7 @@ struct nfsd4_client_tracking_ops { void (*create)(struct nfs4_client *); void (*remove)(struct nfs4_client *); int (*check)(struct nfs4_client *); - void (*grace_done)(struct nfsd_net *, time_t); + void (*grace_done)(struct nfsd_net *); }; /* Globals */ @@ -188,7 +188,7 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) status = mnt_want_write_file(nn->rec_file); if (status) - return; + goto out_creds; dir = nn->rec_file->f_path.dentry; /* lock the parent */ @@ -228,6 +228,7 @@ out_unlock: user_recovery_dirname); } mnt_drop_write_file(nn->rec_file); +out_creds: nfs4_reset_creds(original_cred); } @@ -392,7 +393,7 @@ purge_old(struct dentry *parent, struct dentry *child, struct nfsd_net *nn) } static void -nfsd4_recdir_purge_old(struct nfsd_net *nn, time_t boot_time) +nfsd4_recdir_purge_old(struct nfsd_net *nn) { int status; @@ -479,6 +480,16 @@ nfsd4_init_recdir(struct net *net) return status; } +static void +nfsd4_shutdown_recdir(struct net *net) +{ + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + if (!nn->rec_file) + return; + fput(nn->rec_file); + nn->rec_file = NULL; +} static int nfs4_legacy_state_init(struct net *net) @@ -512,10 +523,13 @@ nfsd4_load_reboot_recovery_data(struct net *net) int status; status = nfsd4_init_recdir(net); - if (!status) - status = nfsd4_recdir_load(net); if (status) - printk(KERN_ERR "NFSD: Failure reading reboot recovery data\n"); + return status; + + status = nfsd4_recdir_load(net); + if (status) + nfsd4_shutdown_recdir(net); + return status; } @@ -546,21 +560,12 @@ err: } static void -nfsd4_shutdown_recdir(struct nfsd_net *nn) -{ - if (!nn->rec_file) - return; - fput(nn->rec_file); - nn->rec_file = NULL; -} - -static void nfsd4_legacy_tracking_exit(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); nfs4_release_reclaim(nn); - nfsd4_shutdown_recdir(nn); + nfsd4_shutdown_recdir(net); nfs4_legacy_state_shutdown(net); } @@ -670,7 +675,6 @@ __cld_pipe_upcall(struct rpc_pipe *pipe, struct cld_msg *cmsg) } schedule(); - set_current_state(TASK_RUNNING); if (msg.errno < 0) ret = msg.errno; @@ -1016,7 +1020,7 @@ nfsd4_cld_check(struct nfs4_client *clp) } static void -nfsd4_cld_grace_done(struct nfsd_net *nn, time_t boot_time) +nfsd4_cld_grace_done(struct nfsd_net *nn) { int ret; struct cld_upcall *cup; @@ -1029,7 +1033,7 @@ nfsd4_cld_grace_done(struct nfsd_net *nn, time_t boot_time) } cup->cu_msg.cm_cmd = Cld_GraceDone; - cup->cu_msg.cm_u.cm_gracetime = (int64_t)boot_time; + cup->cu_msg.cm_u.cm_gracetime = (int64_t)nn->boot_time; ret = cld_pipe_upcall(cn->cn_pipe, &cup->cu_msg); if (!ret) ret = cup->cu_msg.cm_status; @@ -1062,6 +1066,8 @@ MODULE_PARM_DESC(cltrack_legacy_disable, #define LEGACY_TOPDIR_ENV_PREFIX "NFSDCLTRACK_LEGACY_TOPDIR=" #define LEGACY_RECDIR_ENV_PREFIX "NFSDCLTRACK_LEGACY_RECDIR=" +#define HAS_SESSION_ENV_PREFIX "NFSDCLTRACK_CLIENT_HAS_SESSION=" +#define GRACE_START_ENV_PREFIX "NFSDCLTRACK_GRACE_START=" static char * nfsd4_cltrack_legacy_topdir(void) @@ -1126,10 +1132,60 @@ nfsd4_cltrack_legacy_recdir(const struct xdr_netobj *name) return result; } +static char * +nfsd4_cltrack_client_has_session(struct nfs4_client *clp) +{ + int copied; + size_t len; + char *result; + + /* prefix + Y/N character + terminating NULL */ + len = strlen(HAS_SESSION_ENV_PREFIX) + 1 + 1; + + result = kmalloc(len, GFP_KERNEL); + if (!result) + return result; + + copied = snprintf(result, len, HAS_SESSION_ENV_PREFIX "%c", + clp->cl_minorversion ? 'Y' : 'N'); + if (copied >= len) { + /* just return nothing if output was truncated */ + kfree(result); + return NULL; + } + + return result; +} + +static char * +nfsd4_cltrack_grace_start(time_t grace_start) +{ + int copied; + size_t len; + char *result; + + /* prefix + max width of int64_t string + terminating NULL */ + len = strlen(GRACE_START_ENV_PREFIX) + 22 + 1; + + result = kmalloc(len, GFP_KERNEL); + if (!result) + return result; + + copied = snprintf(result, len, GRACE_START_ENV_PREFIX "%ld", + grace_start); + if (copied >= len) { + /* just return nothing if output was truncated */ + kfree(result); + return NULL; + } + + return result; +} + static int -nfsd4_umh_cltrack_upcall(char *cmd, char *arg, char *legacy) +nfsd4_umh_cltrack_upcall(char *cmd, char *arg, char *env0, char *env1) { - char *envp[2]; + char *envp[3]; char *argv[4]; int ret; @@ -1140,10 +1196,12 @@ nfsd4_umh_cltrack_upcall(char *cmd, char *arg, char *legacy) dprintk("%s: cmd: %s\n", __func__, cmd); dprintk("%s: arg: %s\n", __func__, arg ? arg : "(null)"); - dprintk("%s: legacy: %s\n", __func__, legacy ? legacy : "(null)"); + dprintk("%s: env0: %s\n", __func__, env0 ? env0 : "(null)"); + dprintk("%s: env1: %s\n", __func__, env1 ? env1 : "(null)"); - envp[0] = legacy; - envp[1] = NULL; + envp[0] = env0; + envp[1] = env1; + envp[2] = NULL; argv[0] = (char *)cltrack_prog; argv[1] = cmd; @@ -1187,28 +1245,78 @@ bin_to_hex_dup(const unsigned char *src, int srclen) } static int -nfsd4_umh_cltrack_init(struct net __attribute__((unused)) *net) +nfsd4_umh_cltrack_init(struct net *net) { + int ret; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + char *grace_start = nfsd4_cltrack_grace_start(nn->boot_time); + /* XXX: The usermode helper s not working in container yet. */ if (net != &init_net) { WARN(1, KERN_ERR "NFSD: attempt to initialize umh client " "tracking in a container!\n"); return -EINVAL; } - return nfsd4_umh_cltrack_upcall("init", NULL, NULL); + + ret = nfsd4_umh_cltrack_upcall("init", NULL, grace_start, NULL); + kfree(grace_start); + return ret; +} + +static void +nfsd4_cltrack_upcall_lock(struct nfs4_client *clp) +{ + wait_on_bit_lock(&clp->cl_flags, NFSD4_CLIENT_UPCALL_LOCK, + TASK_UNINTERRUPTIBLE); +} + +static void +nfsd4_cltrack_upcall_unlock(struct nfs4_client *clp) +{ + smp_mb__before_atomic(); + clear_bit(NFSD4_CLIENT_UPCALL_LOCK, &clp->cl_flags); + smp_mb__after_atomic(); + wake_up_bit(&clp->cl_flags, NFSD4_CLIENT_UPCALL_LOCK); } static void nfsd4_umh_cltrack_create(struct nfs4_client *clp) { - char *hexid; + char *hexid, *has_session, *grace_start; + struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); + + /* + * With v4.0 clients, there's little difference in outcome between a + * create and check operation, and we can end up calling into this + * function multiple times per client (once for each openowner). So, + * for v4.0 clients skip upcalling once the client has been recorded + * on stable storage. + * + * For v4.1+ clients, the outcome of the two operations is different, + * so we must ensure that we upcall for the create operation. v4.1+ + * clients call this on RECLAIM_COMPLETE though, so we should only end + * up doing a single create upcall per client. + */ + if (clp->cl_minorversion == 0 && + test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) + return; hexid = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len); if (!hexid) { dprintk("%s: can't allocate memory for upcall!\n", __func__); return; } - nfsd4_umh_cltrack_upcall("create", hexid, NULL); + + has_session = nfsd4_cltrack_client_has_session(clp); + grace_start = nfsd4_cltrack_grace_start(nn->boot_time); + + nfsd4_cltrack_upcall_lock(clp); + if (!nfsd4_umh_cltrack_upcall("create", hexid, has_session, grace_start)) + set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); + nfsd4_cltrack_upcall_unlock(clp); + + kfree(has_session); + kfree(grace_start); kfree(hexid); } @@ -1217,12 +1325,21 @@ nfsd4_umh_cltrack_remove(struct nfs4_client *clp) { char *hexid; + if (!test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) + return; + hexid = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len); if (!hexid) { dprintk("%s: can't allocate memory for upcall!\n", __func__); return; } - nfsd4_umh_cltrack_upcall("remove", hexid, NULL); + + nfsd4_cltrack_upcall_lock(clp); + if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags) && + nfsd4_umh_cltrack_upcall("remove", hexid, NULL, NULL) == 0) + clear_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); + nfsd4_cltrack_upcall_unlock(clp); + kfree(hexid); } @@ -1230,30 +1347,45 @@ static int nfsd4_umh_cltrack_check(struct nfs4_client *clp) { int ret; - char *hexid, *legacy; + char *hexid, *has_session, *legacy; + + if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) + return 0; hexid = bin_to_hex_dup(clp->cl_name.data, clp->cl_name.len); if (!hexid) { dprintk("%s: can't allocate memory for upcall!\n", __func__); return -ENOMEM; } + + has_session = nfsd4_cltrack_client_has_session(clp); legacy = nfsd4_cltrack_legacy_recdir(&clp->cl_name); - ret = nfsd4_umh_cltrack_upcall("check", hexid, legacy); + + nfsd4_cltrack_upcall_lock(clp); + if (test_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags)) { + ret = 0; + } else { + ret = nfsd4_umh_cltrack_upcall("check", hexid, has_session, legacy); + if (ret == 0) + set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); + } + nfsd4_cltrack_upcall_unlock(clp); + kfree(has_session); kfree(legacy); kfree(hexid); + return ret; } static void -nfsd4_umh_cltrack_grace_done(struct nfsd_net __attribute__((unused)) *nn, - time_t boot_time) +nfsd4_umh_cltrack_grace_done(struct nfsd_net *nn) { char *legacy; char timestr[22]; /* FIXME: better way to determine max size? */ - sprintf(timestr, "%ld", boot_time); + sprintf(timestr, "%ld", nn->boot_time); legacy = nfsd4_cltrack_legacy_topdir(); - nfsd4_umh_cltrack_upcall("gracedone", timestr, legacy); + nfsd4_umh_cltrack_upcall("gracedone", timestr, legacy, NULL); kfree(legacy); } @@ -1356,10 +1488,10 @@ nfsd4_client_record_check(struct nfs4_client *clp) } void -nfsd4_record_grace_done(struct nfsd_net *nn, time_t boot_time) +nfsd4_record_grace_done(struct nfsd_net *nn) { if (nn->client_tracking_ops) - nn->client_tracking_ops->grace_done(nn, boot_time); + nn->client_tracking_ops->grace_done(nn); } static int diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 2e80a59e7e91..e9c3afe4b5d3 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -96,6 +96,8 @@ static struct kmem_cache *deleg_slab; static void free_session(struct nfsd4_session *); +static struct nfsd4_callback_ops nfsd4_cb_recall_ops; + static bool is_session_dead(struct nfsd4_session *ses) { return ses->se_flags & NFS4_SESSION_DEAD; @@ -216,6 +218,13 @@ static void nfsd4_put_session(struct nfsd4_session *ses) spin_unlock(&nn->client_lock); } +static inline struct nfs4_stateowner * +nfs4_get_stateowner(struct nfs4_stateowner *sop) +{ + atomic_inc(&sop->so_count); + return sop; +} + static int same_owner_str(struct nfs4_stateowner *sop, struct xdr_netobj *owner) { @@ -235,10 +244,8 @@ find_openstateowner_str_locked(unsigned int hashval, struct nfsd4_open *open, so_strhash) { if (!so->so_is_open_owner) continue; - if (same_owner_str(so, &open->op_owner)) { - atomic_inc(&so->so_count); - return openowner(so); - } + if (same_owner_str(so, &open->op_owner)) + return openowner(nfs4_get_stateowner(so)); } return NULL; } @@ -645,7 +652,9 @@ alloc_init_deleg(struct nfs4_client *clp, struct svc_fh *current_fh) INIT_LIST_HEAD(&dp->dl_perclnt); INIT_LIST_HEAD(&dp->dl_recall_lru); dp->dl_type = NFS4_OPEN_DELEGATE_READ; - INIT_WORK(&dp->dl_recall.cb_work, nfsd4_run_cb_recall); + dp->dl_retries = 1; + nfsd4_init_cb(&dp->dl_recall, dp->dl_stid.sc_client, + &nfsd4_cb_recall_ops, NFSPROC4_CLNT_CB_RECALL); return dp; out_dec: atomic_long_dec(&num_delegations); @@ -673,15 +682,16 @@ nfs4_put_stid(struct nfs4_stid *s) static void nfs4_put_deleg_lease(struct nfs4_file *fp) { - lockdep_assert_held(&state_lock); + struct file *filp = NULL; - if (!fp->fi_lease) - return; - if (atomic_dec_and_test(&fp->fi_delegees)) { - vfs_setlease(fp->fi_deleg_file, F_UNLCK, &fp->fi_lease); - fp->fi_lease = NULL; - fput(fp->fi_deleg_file); - fp->fi_deleg_file = NULL; + spin_lock(&fp->fi_lock); + if (fp->fi_deleg_file && atomic_dec_and_test(&fp->fi_delegees)) + swap(filp, fp->fi_deleg_file); + spin_unlock(&fp->fi_lock); + + if (filp) { + vfs_setlease(filp, F_UNLCK, NULL, NULL); + fput(filp); } } @@ -717,8 +727,6 @@ unhash_delegation_locked(struct nfs4_delegation *dp) list_del_init(&dp->dl_recall_lru); list_del_init(&dp->dl_perfile); spin_unlock(&fp->fi_lock); - if (fp) - nfs4_put_deleg_lease(fp); } static void destroy_delegation(struct nfs4_delegation *dp) @@ -726,6 +734,7 @@ static void destroy_delegation(struct nfs4_delegation *dp) spin_lock(&state_lock); unhash_delegation_locked(dp); spin_unlock(&state_lock); + nfs4_put_deleg_lease(dp->dl_stid.sc_file); nfs4_put_stid(&dp->dl_stid); } @@ -735,6 +744,8 @@ static void revoke_delegation(struct nfs4_delegation *dp) WARN_ON(!list_empty(&dp->dl_recall_lru)); + nfs4_put_deleg_lease(dp->dl_stid.sc_file); + if (clp->cl_minorversion == 0) nfs4_put_stid(&dp->dl_stid); else { @@ -1635,6 +1646,7 @@ __destroy_client(struct nfs4_client *clp) while (!list_empty(&reaplist)) { dp = list_entry(reaplist.next, struct nfs4_delegation, dl_recall_lru); list_del_init(&dp->dl_recall_lru); + nfs4_put_deleg_lease(dp->dl_stid.sc_file); nfs4_put_stid(&dp->dl_stid); } while (!list_empty(&clp->cl_revoked)) { @@ -1644,7 +1656,7 @@ __destroy_client(struct nfs4_client *clp) } while (!list_empty(&clp->cl_openowners)) { oo = list_entry(clp->cl_openowners.next, struct nfs4_openowner, oo_perclient); - atomic_inc(&oo->oo_owner.so_count); + nfs4_get_stateowner(&oo->oo_owner); release_openowner(oo); } nfsd4_shutdown_callback(clp); @@ -1862,7 +1874,7 @@ static struct nfs4_client *create_client(struct xdr_netobj name, free_client(clp); return NULL; } - INIT_WORK(&clp->cl_cb_null.cb_work, nfsd4_run_cb_null); + nfsd4_init_cb(&clp->cl_cb_null, clp, NULL, NFSPROC4_CLNT_CB_NULL); clp->cl_time = get_seconds(); clear_bit(0, &clp->cl_cb_slot_busy); copy_verf(clp, verf); @@ -3056,8 +3068,8 @@ static void nfsd4_init_file(struct nfs4_file *fp, struct knfsd_fh *fh) INIT_LIST_HEAD(&fp->fi_stateids); INIT_LIST_HEAD(&fp->fi_delegations); fh_copy_shallow(&fp->fi_fhandle, fh); + fp->fi_deleg_file = NULL; fp->fi_had_conflict = false; - fp->fi_lease = NULL; fp->fi_share_deny = 0; memset(fp->fi_fds, 0, sizeof(fp->fi_fds)); memset(fp->fi_access, 0, sizeof(fp->fi_access)); @@ -3125,8 +3137,7 @@ static void nfsd4_cstate_assign_replay(struct nfsd4_compound_state *cstate, { if (!nfsd4_has_session(cstate)) { mutex_lock(&so->so_replay.rp_mutex); - cstate->replay_owner = so; - atomic_inc(&so->so_count); + cstate->replay_owner = nfs4_get_stateowner(so); } } @@ -3225,8 +3236,7 @@ static void init_open_stateid(struct nfs4_ol_stateid *stp, struct nfs4_file *fp, atomic_inc(&stp->st_stid.sc_count); stp->st_stid.sc_type = NFS4_OPEN_STID; INIT_LIST_HEAD(&stp->st_locks); - stp->st_stateowner = &oo->oo_owner; - atomic_inc(&stp->st_stateowner->so_count); + stp->st_stateowner = nfs4_get_stateowner(&oo->oo_owner); get_nfs4_file(fp); stp->st_stid.sc_file = fp; stp->st_access_bmap = 0; @@ -3349,8 +3359,9 @@ nfs4_share_conflict(struct svc_fh *current_fh, unsigned int deny_type) return ret; } -void nfsd4_prepare_cb_recall(struct nfs4_delegation *dp) +static void nfsd4_cb_recall_prepare(struct nfsd4_callback *cb) { + struct nfs4_delegation *dp = cb_to_delegation(cb); struct nfsd_net *nn = net_generic(dp->dl_stid.sc_client->net, nfsd_net_id); @@ -3371,6 +3382,43 @@ void nfsd4_prepare_cb_recall(struct nfs4_delegation *dp) spin_unlock(&state_lock); } +static int nfsd4_cb_recall_done(struct nfsd4_callback *cb, + struct rpc_task *task) +{ + struct nfs4_delegation *dp = cb_to_delegation(cb); + + switch (task->tk_status) { + case 0: + return 1; + case -EBADHANDLE: + case -NFS4ERR_BAD_STATEID: + /* + * Race: client probably got cb_recall before open reply + * granting delegation. + */ + if (dp->dl_retries--) { + rpc_delay(task, 2 * HZ); + return 0; + } + /*FALLTHRU*/ + default: + return -1; + } +} + +static void nfsd4_cb_recall_release(struct nfsd4_callback *cb) +{ + struct nfs4_delegation *dp = cb_to_delegation(cb); + + nfs4_put_stid(&dp->dl_stid); +} + +static struct nfsd4_callback_ops nfsd4_cb_recall_ops = { + .prepare = nfsd4_cb_recall_prepare, + .done = nfsd4_cb_recall_done, + .release = nfsd4_cb_recall_release, +}; + static void nfsd_break_one_deleg(struct nfs4_delegation *dp) { /* @@ -3381,22 +3429,24 @@ static void nfsd_break_one_deleg(struct nfs4_delegation *dp) * it's safe to take a reference. */ atomic_inc(&dp->dl_stid.sc_count); - nfsd4_cb_recall(dp); + nfsd4_run_cb(&dp->dl_recall); } /* Called from break_lease() with i_lock held. */ -static void nfsd_break_deleg_cb(struct file_lock *fl) +static bool +nfsd_break_deleg_cb(struct file_lock *fl) { + bool ret = false; struct nfs4_file *fp = (struct nfs4_file *)fl->fl_owner; struct nfs4_delegation *dp; if (!fp) { WARN(1, "(%p)->fl_owner NULL\n", fl); - return; + return ret; } if (fp->fi_had_conflict) { WARN(1, "duplicate break on %p\n", fp); - return; + return ret; } /* * We don't want the locks code to timeout the lease for us; @@ -3408,24 +3458,23 @@ static void nfsd_break_deleg_cb(struct file_lock *fl) spin_lock(&fp->fi_lock); fp->fi_had_conflict = true; /* - * If there are no delegations on the list, then we can't count on this - * lease ever being cleaned up. Set the fl_break_time to jiffies so that - * time_out_leases will do it ASAP. The fact that fi_had_conflict is now - * true should keep any new delegations from being hashed. + * If there are no delegations on the list, then return true + * so that the lease code will go ahead and delete it. */ if (list_empty(&fp->fi_delegations)) - fl->fl_break_time = jiffies; + ret = true; else list_for_each_entry(dp, &fp->fi_delegations, dl_perfile) nfsd_break_one_deleg(dp); spin_unlock(&fp->fi_lock); + return ret; } -static -int nfsd_change_deleg_cb(struct file_lock **onlist, int arg) +static int +nfsd_change_deleg_cb(struct file_lock **onlist, int arg, struct list_head *dispose) { if (arg & F_UNLCK) - return lease_modify(onlist, arg); + return lease_modify(onlist, arg, dispose); else return -EAGAIN; } @@ -3759,7 +3808,6 @@ static struct file_lock *nfs4_alloc_init_lease(struct nfs4_file *fp, int flag) fl = locks_alloc_lock(); if (!fl) return NULL; - locks_init_lock(fl); fl->fl_lmops = &nfsd_lease_mng_ops; fl->fl_flags = FL_DELEG; fl->fl_type = flag == NFS4_OPEN_DELEGATE_READ? F_RDLCK: F_WRLCK; @@ -3772,7 +3820,7 @@ static struct file_lock *nfs4_alloc_init_lease(struct nfs4_file *fp, int flag) static int nfs4_setlease(struct nfs4_delegation *dp) { struct nfs4_file *fp = dp->dl_stid.sc_file; - struct file_lock *fl; + struct file_lock *fl, *ret; struct file *filp; int status = 0; @@ -3786,11 +3834,12 @@ static int nfs4_setlease(struct nfs4_delegation *dp) return -EBADF; } fl->fl_file = filp; - status = vfs_setlease(filp, fl->fl_type, &fl); - if (status) { + ret = fl; + status = vfs_setlease(filp, fl->fl_type, &fl, NULL); + if (fl) locks_free_lock(fl); + if (status) goto out_fput; - } spin_lock(&state_lock); spin_lock(&fp->fi_lock); /* Did the lease get broken before we took the lock? */ @@ -3798,13 +3847,12 @@ static int nfs4_setlease(struct nfs4_delegation *dp) if (fp->fi_had_conflict) goto out_unlock; /* Race breaker */ - if (fp->fi_lease) { + if (fp->fi_deleg_file) { status = 0; atomic_inc(&fp->fi_delegees); hash_delegation_locked(dp, fp); goto out_unlock; } - fp->fi_lease = fl; fp->fi_deleg_file = filp; atomic_set(&fp->fi_delegees, 1); hash_delegation_locked(dp, fp); @@ -3837,7 +3885,7 @@ nfs4_set_delegation(struct nfs4_client *clp, struct svc_fh *fh, spin_lock(&state_lock); spin_lock(&fp->fi_lock); dp->dl_stid.sc_file = fp; - if (!fp->fi_lease) { + if (!fp->fi_deleg_file) { spin_unlock(&fp->fi_lock); spin_unlock(&state_lock); status = nfs4_setlease(dp); @@ -4107,7 +4155,7 @@ out: return status; } -static void +void nfsd4_end_grace(struct nfsd_net *nn) { /* do nothing if grace period already ended */ @@ -4116,14 +4164,28 @@ nfsd4_end_grace(struct nfsd_net *nn) dprintk("NFSD: end of grace period\n"); nn->grace_ended = true; - nfsd4_record_grace_done(nn, nn->boot_time); + /* + * If the server goes down again right now, an NFSv4 + * client will still be allowed to reclaim after it comes back up, + * even if it hasn't yet had a chance to reclaim state this time. + * + */ + nfsd4_record_grace_done(nn); + /* + * At this point, NFSv4 clients can still reclaim. But if the + * server crashes, any that have not yet reclaimed will be out + * of luck on the next boot. + * + * (NFSv4.1+ clients are considered to have reclaimed once they + * call RECLAIM_COMPLETE. NFSv4.0 clients are considered to + * have reclaimed after their first OPEN.) + */ locks_end_grace(&nn->nfsd4_manager); /* - * Now that every NFSv4 client has had the chance to recover and - * to see the (possibly new, possibly shorter) lease time, we - * can safely set the next grace time to the current lease time: + * At this point, and once lockd and/or any other containers + * exit their grace period, further reclaims will fail and + * regular locking can resume. */ - nn->nfsd4_grace = nn->nfsd4_lease; } static time_t @@ -4867,9 +4929,25 @@ nfs4_transform_lock_offset(struct file_lock *lock) lock->fl_end = OFFSET_MAX; } -/* Hack!: For now, we're defining this just so we can use a pointer to it - * as a unique cookie to identify our (NFSv4's) posix locks. */ +static void nfsd4_fl_get_owner(struct file_lock *dst, struct file_lock *src) +{ + struct nfs4_lockowner *lo = (struct nfs4_lockowner *)src->fl_owner; + dst->fl_owner = (fl_owner_t)lockowner(nfs4_get_stateowner(&lo->lo_owner)); +} + +static void nfsd4_fl_put_owner(struct file_lock *fl) +{ + struct nfs4_lockowner *lo = (struct nfs4_lockowner *)fl->fl_owner; + + if (lo) { + nfs4_put_stateowner(&lo->lo_owner); + fl->fl_owner = NULL; + } +} + static const struct lock_manager_operations nfsd_posix_mng_ops = { + .lm_get_owner = nfsd4_fl_get_owner, + .lm_put_owner = nfsd4_fl_put_owner, }; static inline void @@ -4915,10 +4993,8 @@ find_lockowner_str_locked(clientid_t *clid, struct xdr_netobj *owner, so_strhash) { if (so->so_is_open_owner) continue; - if (!same_owner_str(so, owner)) - continue; - atomic_inc(&so->so_count); - return lockowner(so); + if (same_owner_str(so, owner)) + return lockowner(nfs4_get_stateowner(so)); } return NULL; } @@ -4997,8 +5073,7 @@ init_lock_stateid(struct nfs4_ol_stateid *stp, struct nfs4_lockowner *lo, atomic_inc(&stp->st_stid.sc_count); stp->st_stid.sc_type = NFS4_LOCK_STID; - stp->st_stateowner = &lo->lo_owner; - atomic_inc(&lo->lo_owner.so_count); + stp->st_stateowner = nfs4_get_stateowner(&lo->lo_owner); get_nfs4_file(fp); stp->st_stid.sc_file = fp; stp->st_stid.sc_free = nfs4_free_lock_stateid; @@ -5210,7 +5285,6 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, } fp = lock_stp->st_stid.sc_file; - locks_init_lock(file_lock); switch (lock->lk_type) { case NFS4_READ_LT: case NFS4_READW_LT: @@ -5238,7 +5312,8 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = nfserr_openmode; goto out; } - file_lock->fl_owner = (fl_owner_t)lock_sop; + + file_lock->fl_owner = (fl_owner_t)lockowner(nfs4_get_stateowner(&lock_sop->lo_owner)); file_lock->fl_pid = current->tgid; file_lock->fl_file = filp; file_lock->fl_flags = FL_POSIX; @@ -5354,7 +5429,7 @@ nfsd4_lockt(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = nfserr_jukebox; goto out; } - locks_init_lock(file_lock); + switch (lockt->lt_type) { case NFS4_READ_LT: case NFS4_READW_LT: @@ -5432,9 +5507,9 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = nfserr_jukebox; goto fput; } - locks_init_lock(file_lock); + file_lock->fl_type = F_UNLCK; - file_lock->fl_owner = (fl_owner_t)lockowner(stp->st_stateowner); + file_lock->fl_owner = (fl_owner_t)lockowner(nfs4_get_stateowner(stp->st_stateowner)); file_lock->fl_pid = current->tgid; file_lock->fl_file = filp; file_lock->fl_flags = FL_POSIX; @@ -5541,7 +5616,7 @@ nfsd4_release_lockowner(struct svc_rqst *rqstp, } } - atomic_inc(&sop->so_count); + nfs4_get_stateowner(sop); break; } spin_unlock(&clp->cl_lock); @@ -5645,6 +5720,9 @@ nfs4_check_open_reclaim(clientid_t *clid, if (status) return nfserr_reclaim_bad; + if (test_bit(NFSD4_CLIENT_RECLAIM_COMPLETE, &cstate->clp->cl_flags)) + return nfserr_no_grace; + if (nfsd4_client_record_check(cstate->clp)) return nfserr_reclaim_bad; @@ -6342,10 +6420,10 @@ nfs4_state_start_net(struct net *net) ret = nfs4_state_create_net(net); if (ret) return ret; - nfsd4_client_tracking_init(net); nn->boot_time = get_seconds(); - locks_start_grace(net, &nn->nfsd4_manager); nn->grace_ended = false; + locks_start_grace(net, &nn->nfsd4_manager); + nfsd4_client_tracking_init(net); printk(KERN_INFO "NFSD: starting %ld-second grace period (net %p)\n", nn->nfsd4_grace, net); queue_delayed_work(laundry_wq, &nn->laundromat_work, nn->nfsd4_grace * HZ); @@ -6402,6 +6480,7 @@ nfs4_state_shutdown_net(struct net *net) list_for_each_safe(pos, next, &reaplist) { dp = list_entry (pos, struct nfs4_delegation, dl_recall_lru); list_del_init(&dp->dl_recall_lru); + nfs4_put_deleg_lease(dp->dl_stid.sc_file); nfs4_put_stid(&dp->dl_stid); } diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index b01f6e100ee8..eeea7a90eb87 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -31,13 +31,6 @@ * LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. - * - * TODO: Neil Brown made the following observation: We currently - * initially reserve NFSD_BUFSIZE space on the transmit queue and - * never release any of that until the request is complete. - * It would be good to calculate a new maximum response size while - * decoding the COMPOUND, and call svc_reserve with this number - * at the end of nfs4svc_decode_compoundargs. */ #include <linux/slab.h> @@ -1521,6 +1514,22 @@ static __be32 nfsd4_decode_reclaim_complete(struct nfsd4_compoundargs *argp, str } static __be32 +nfsd4_decode_seek(struct nfsd4_compoundargs *argp, struct nfsd4_seek *seek) +{ + DECODE_HEAD; + + status = nfsd4_decode_stateid(argp, &seek->seek_stateid); + if (status) + return status; + + READ_BUF(8 + 4); + p = xdr_decode_hyper(p, &seek->seek_offset); + seek->seek_whence = be32_to_cpup(p); + + DECODE_TAIL; +} + +static __be32 nfsd4_decode_noop(struct nfsd4_compoundargs *argp, void *p) { return nfs_ok; @@ -1593,6 +1602,20 @@ static nfsd4_dec nfsd4_dec_ops[] = { [OP_WANT_DELEGATION] = (nfsd4_dec)nfsd4_decode_notsupp, [OP_DESTROY_CLIENTID] = (nfsd4_dec)nfsd4_decode_destroy_clientid, [OP_RECLAIM_COMPLETE] = (nfsd4_dec)nfsd4_decode_reclaim_complete, + + /* new operations for NFSv4.2 */ + [OP_ALLOCATE] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_COPY] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_COPY_NOTIFY] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_DEALLOCATE] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_IO_ADVISE] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_LAYOUTERROR] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_LAYOUTSTATS] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_OFFLOAD_CANCEL] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_OFFLOAD_STATUS] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_READ_PLUS] = (nfsd4_dec)nfsd4_decode_notsupp, + [OP_SEEK] = (nfsd4_dec)nfsd4_decode_seek, + [OP_WRITE_SAME] = (nfsd4_dec)nfsd4_decode_notsupp, }; static inline bool @@ -1670,6 +1693,14 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) readbytes += nfsd4_max_reply(argp->rqstp, op); } else max_reply += nfsd4_max_reply(argp->rqstp, op); + /* + * OP_LOCK may return a conflicting lock. (Special case + * because it will just skip encoding this if it runs + * out of xdr buffer space, and it is the only operation + * that behaves this way.) + */ + if (op->opnum == OP_LOCK) + max_reply += NFS4_OPAQUE_LIMIT; if (op->status) { argp->opcnt = i+1; @@ -3764,6 +3795,22 @@ nfsd4_encode_test_stateid(struct nfsd4_compoundres *resp, __be32 nfserr, } static __be32 +nfsd4_encode_seek(struct nfsd4_compoundres *resp, __be32 nfserr, + struct nfsd4_seek *seek) +{ + __be32 *p; + + if (nfserr) + return nfserr; + + p = xdr_reserve_space(&resp->xdr, 4 + 8); + *p++ = cpu_to_be32(seek->seek_eof); + p = xdr_encode_hyper(p, seek->seek_pos); + + return nfserr; +} + +static __be32 nfsd4_encode_noop(struct nfsd4_compoundres *resp, __be32 nfserr, void *p) { return nfserr; @@ -3835,6 +3882,20 @@ static nfsd4_enc nfsd4_enc_ops[] = { [OP_WANT_DELEGATION] = (nfsd4_enc)nfsd4_encode_noop, [OP_DESTROY_CLIENTID] = (nfsd4_enc)nfsd4_encode_noop, [OP_RECLAIM_COMPLETE] = (nfsd4_enc)nfsd4_encode_noop, + + /* NFSv4.2 operations */ + [OP_ALLOCATE] = (nfsd4_enc)nfsd4_encode_noop, + [OP_COPY] = (nfsd4_enc)nfsd4_encode_noop, + [OP_COPY_NOTIFY] = (nfsd4_enc)nfsd4_encode_noop, + [OP_DEALLOCATE] = (nfsd4_enc)nfsd4_encode_noop, + [OP_IO_ADVISE] = (nfsd4_enc)nfsd4_encode_noop, + [OP_LAYOUTERROR] = (nfsd4_enc)nfsd4_encode_noop, + [OP_LAYOUTSTATS] = (nfsd4_enc)nfsd4_encode_noop, + [OP_OFFLOAD_CANCEL] = (nfsd4_enc)nfsd4_encode_noop, + [OP_OFFLOAD_STATUS] = (nfsd4_enc)nfsd4_encode_noop, + [OP_READ_PLUS] = (nfsd4_enc)nfsd4_encode_noop, + [OP_SEEK] = (nfsd4_enc)nfsd4_encode_seek, + [OP_WRITE_SAME] = (nfsd4_enc)nfsd4_encode_noop, }; /* diff --git a/fs/nfsd/nfscache.c b/fs/nfsd/nfscache.c index ff9567633245..122f69185ef5 100644 --- a/fs/nfsd/nfscache.c +++ b/fs/nfsd/nfscache.c @@ -27,8 +27,12 @@ */ #define TARGET_BUCKET_SIZE 64 -static struct hlist_head * cache_hash; -static struct list_head lru_head; +struct nfsd_drc_bucket { + struct list_head lru_head; + spinlock_t cache_lock; +}; + +static struct nfsd_drc_bucket *drc_hashtbl; static struct kmem_cache *drc_slab; /* max number of entries allowed in the cache */ @@ -36,6 +40,7 @@ static unsigned int max_drc_entries; /* number of significant bits in the hash value */ static unsigned int maskbits; +static unsigned int drc_hashsize; /* * Stats and other tracking of on the duplicate reply cache. All of these and @@ -43,7 +48,7 @@ static unsigned int maskbits; */ /* total number of entries */ -static unsigned int num_drc_entries; +static atomic_t num_drc_entries; /* cache misses due only to checksum comparison failures */ static unsigned int payload_misses; @@ -75,7 +80,6 @@ static struct shrinker nfsd_reply_cache_shrinker = { * A cache entry is "single use" if c_state == RC_INPROG * Otherwise, it when accessing _prev or _next, the lock must be held. */ -static DEFINE_SPINLOCK(cache_lock); static DECLARE_DELAYED_WORK(cache_cleaner, cache_cleaner_func); /* @@ -116,6 +120,12 @@ nfsd_hashsize(unsigned int limit) return roundup_pow_of_two(limit / TARGET_BUCKET_SIZE); } +static u32 +nfsd_cache_hash(__be32 xid) +{ + return hash_32(be32_to_cpu(xid), maskbits); +} + static struct svc_cacherep * nfsd_reply_cache_alloc(void) { @@ -126,7 +136,6 @@ nfsd_reply_cache_alloc(void) rp->c_state = RC_UNUSED; rp->c_type = RC_NOCACHE; INIT_LIST_HEAD(&rp->c_lru); - INIT_HLIST_NODE(&rp->c_hash); } return rp; } @@ -138,29 +147,27 @@ nfsd_reply_cache_free_locked(struct svc_cacherep *rp) drc_mem_usage -= rp->c_replvec.iov_len; kfree(rp->c_replvec.iov_base); } - if (!hlist_unhashed(&rp->c_hash)) - hlist_del(&rp->c_hash); list_del(&rp->c_lru); - --num_drc_entries; + atomic_dec(&num_drc_entries); drc_mem_usage -= sizeof(*rp); kmem_cache_free(drc_slab, rp); } static void -nfsd_reply_cache_free(struct svc_cacherep *rp) +nfsd_reply_cache_free(struct nfsd_drc_bucket *b, struct svc_cacherep *rp) { - spin_lock(&cache_lock); + spin_lock(&b->cache_lock); nfsd_reply_cache_free_locked(rp); - spin_unlock(&cache_lock); + spin_unlock(&b->cache_lock); } int nfsd_reply_cache_init(void) { unsigned int hashsize; + unsigned int i; - INIT_LIST_HEAD(&lru_head); max_drc_entries = nfsd_cache_size_limit(); - num_drc_entries = 0; + atomic_set(&num_drc_entries, 0); hashsize = nfsd_hashsize(max_drc_entries); maskbits = ilog2(hashsize); @@ -170,9 +177,14 @@ int nfsd_reply_cache_init(void) if (!drc_slab) goto out_nomem; - cache_hash = kcalloc(hashsize, sizeof(struct hlist_head), GFP_KERNEL); - if (!cache_hash) + drc_hashtbl = kcalloc(hashsize, sizeof(*drc_hashtbl), GFP_KERNEL); + if (!drc_hashtbl) goto out_nomem; + for (i = 0; i < hashsize; i++) { + INIT_LIST_HEAD(&drc_hashtbl[i].lru_head); + spin_lock_init(&drc_hashtbl[i].cache_lock); + } + drc_hashsize = hashsize; return 0; out_nomem: @@ -184,17 +196,22 @@ out_nomem: void nfsd_reply_cache_shutdown(void) { struct svc_cacherep *rp; + unsigned int i; unregister_shrinker(&nfsd_reply_cache_shrinker); cancel_delayed_work_sync(&cache_cleaner); - while (!list_empty(&lru_head)) { - rp = list_entry(lru_head.next, struct svc_cacherep, c_lru); - nfsd_reply_cache_free_locked(rp); + for (i = 0; i < drc_hashsize; i++) { + struct list_head *head = &drc_hashtbl[i].lru_head; + while (!list_empty(head)) { + rp = list_first_entry(head, struct svc_cacherep, c_lru); + nfsd_reply_cache_free_locked(rp); + } } - kfree (cache_hash); - cache_hash = NULL; + kfree (drc_hashtbl); + drc_hashtbl = NULL; + drc_hashsize = 0; if (drc_slab) { kmem_cache_destroy(drc_slab); @@ -207,61 +224,63 @@ void nfsd_reply_cache_shutdown(void) * not already scheduled. */ static void -lru_put_end(struct svc_cacherep *rp) +lru_put_end(struct nfsd_drc_bucket *b, struct svc_cacherep *rp) { rp->c_timestamp = jiffies; - list_move_tail(&rp->c_lru, &lru_head); + list_move_tail(&rp->c_lru, &b->lru_head); schedule_delayed_work(&cache_cleaner, RC_EXPIRE); } -/* - * Move a cache entry from one hash list to another - */ -static void -hash_refile(struct svc_cacherep *rp) -{ - hlist_del_init(&rp->c_hash); - /* - * No point in byte swapping c_xid since we're just using it to pick - * a hash bucket. - */ - hlist_add_head(&rp->c_hash, cache_hash + - hash_32((__force u32)rp->c_xid, maskbits)); -} - -/* - * Walk the LRU list and prune off entries that are older than RC_EXPIRE. - * Also prune the oldest ones when the total exceeds the max number of entries. - */ static long -prune_cache_entries(void) +prune_bucket(struct nfsd_drc_bucket *b) { struct svc_cacherep *rp, *tmp; long freed = 0; - list_for_each_entry_safe(rp, tmp, &lru_head, c_lru) { + list_for_each_entry_safe(rp, tmp, &b->lru_head, c_lru) { /* * Don't free entries attached to calls that are still * in-progress, but do keep scanning the list. */ if (rp->c_state == RC_INPROG) continue; - if (num_drc_entries <= max_drc_entries && + if (atomic_read(&num_drc_entries) <= max_drc_entries && time_before(jiffies, rp->c_timestamp + RC_EXPIRE)) break; nfsd_reply_cache_free_locked(rp); freed++; } + return freed; +} + +/* + * Walk the LRU list and prune off entries that are older than RC_EXPIRE. + * Also prune the oldest ones when the total exceeds the max number of entries. + */ +static long +prune_cache_entries(void) +{ + unsigned int i; + long freed = 0; + bool cancel = true; + + for (i = 0; i < drc_hashsize; i++) { + struct nfsd_drc_bucket *b = &drc_hashtbl[i]; + + if (list_empty(&b->lru_head)) + continue; + spin_lock(&b->cache_lock); + freed += prune_bucket(b); + if (!list_empty(&b->lru_head)) + cancel = false; + spin_unlock(&b->cache_lock); + } /* - * Conditionally rearm the job. If we cleaned out the list, then - * cancel any pending run (since there won't be any work to do). - * Otherwise, we rearm the job or modify the existing one to run in - * RC_EXPIRE since we just ran the pruner. + * Conditionally rearm the job to run in RC_EXPIRE since we just + * ran the pruner. */ - if (list_empty(&lru_head)) - cancel_delayed_work(&cache_cleaner); - else + if (!cancel) mod_delayed_work(system_wq, &cache_cleaner, RC_EXPIRE); return freed; } @@ -269,32 +288,19 @@ prune_cache_entries(void) static void cache_cleaner_func(struct work_struct *unused) { - spin_lock(&cache_lock); prune_cache_entries(); - spin_unlock(&cache_lock); } static unsigned long nfsd_reply_cache_count(struct shrinker *shrink, struct shrink_control *sc) { - unsigned long num; - - spin_lock(&cache_lock); - num = num_drc_entries; - spin_unlock(&cache_lock); - - return num; + return atomic_read(&num_drc_entries); } static unsigned long nfsd_reply_cache_scan(struct shrinker *shrink, struct shrink_control *sc) { - unsigned long freed; - - spin_lock(&cache_lock); - freed = prune_cache_entries(); - spin_unlock(&cache_lock); - return freed; + return prune_cache_entries(); } /* * Walk an xdr_buf and get a CRC for at most the first RC_CSUMLEN bytes @@ -332,20 +338,24 @@ nfsd_cache_csum(struct svc_rqst *rqstp) static bool nfsd_cache_match(struct svc_rqst *rqstp, __wsum csum, struct svc_cacherep *rp) { - /* Check RPC header info first */ - if (rqstp->rq_xid != rp->c_xid || rqstp->rq_proc != rp->c_proc || - rqstp->rq_prot != rp->c_prot || rqstp->rq_vers != rp->c_vers || - rqstp->rq_arg.len != rp->c_len || - !rpc_cmp_addr(svc_addr(rqstp), (struct sockaddr *)&rp->c_addr) || - rpc_get_port(svc_addr(rqstp)) != rpc_get_port((struct sockaddr *)&rp->c_addr)) + /* Check RPC XID first */ + if (rqstp->rq_xid != rp->c_xid) return false; - /* compare checksum of NFS data */ if (csum != rp->c_csum) { ++payload_misses; return false; } + /* Other discriminators */ + if (rqstp->rq_proc != rp->c_proc || + rqstp->rq_prot != rp->c_prot || + rqstp->rq_vers != rp->c_vers || + rqstp->rq_arg.len != rp->c_len || + !rpc_cmp_addr(svc_addr(rqstp), (struct sockaddr *)&rp->c_addr) || + rpc_get_port(svc_addr(rqstp)) != rpc_get_port((struct sockaddr *)&rp->c_addr)) + return false; + return true; } @@ -355,18 +365,14 @@ nfsd_cache_match(struct svc_rqst *rqstp, __wsum csum, struct svc_cacherep *rp) * NULL on failure. */ static struct svc_cacherep * -nfsd_cache_search(struct svc_rqst *rqstp, __wsum csum) +nfsd_cache_search(struct nfsd_drc_bucket *b, struct svc_rqst *rqstp, + __wsum csum) { struct svc_cacherep *rp, *ret = NULL; - struct hlist_head *rh; + struct list_head *rh = &b->lru_head; unsigned int entries = 0; - /* - * No point in byte swapping rq_xid since we're just using it to pick - * a hash bucket. - */ - rh = &cache_hash[hash_32((__force u32)rqstp->rq_xid, maskbits)]; - hlist_for_each_entry(rp, rh, c_hash) { + list_for_each_entry(rp, rh, c_lru) { ++entries; if (nfsd_cache_match(rqstp, csum, rp)) { ret = rp; @@ -377,11 +383,12 @@ nfsd_cache_search(struct svc_rqst *rqstp, __wsum csum) /* tally hash chain length stats */ if (entries > longest_chain) { longest_chain = entries; - longest_chain_cachesize = num_drc_entries; + longest_chain_cachesize = atomic_read(&num_drc_entries); } else if (entries == longest_chain) { /* prefer to keep the smallest cachesize possible here */ - longest_chain_cachesize = min(longest_chain_cachesize, - num_drc_entries); + longest_chain_cachesize = min_t(unsigned int, + longest_chain_cachesize, + atomic_read(&num_drc_entries)); } return ret; @@ -403,6 +410,8 @@ nfsd_cache_lookup(struct svc_rqst *rqstp) vers = rqstp->rq_vers, proc = rqstp->rq_proc; __wsum csum; + u32 hash = nfsd_cache_hash(xid); + struct nfsd_drc_bucket *b = &drc_hashtbl[hash]; unsigned long age; int type = rqstp->rq_cachetype; int rtn = RC_DOIT; @@ -420,16 +429,16 @@ nfsd_cache_lookup(struct svc_rqst *rqstp) * preallocate an entry. */ rp = nfsd_reply_cache_alloc(); - spin_lock(&cache_lock); + spin_lock(&b->cache_lock); if (likely(rp)) { - ++num_drc_entries; + atomic_inc(&num_drc_entries); drc_mem_usage += sizeof(*rp); } /* go ahead and prune the cache */ - prune_cache_entries(); + prune_bucket(b); - found = nfsd_cache_search(rqstp, csum); + found = nfsd_cache_search(b, rqstp, csum); if (found) { if (likely(rp)) nfsd_reply_cache_free_locked(rp); @@ -454,8 +463,7 @@ nfsd_cache_lookup(struct svc_rqst *rqstp) rp->c_len = rqstp->rq_arg.len; rp->c_csum = csum; - hash_refile(rp); - lru_put_end(rp); + lru_put_end(b, rp); /* release any buffer */ if (rp->c_type == RC_REPLBUFF) { @@ -465,14 +473,14 @@ nfsd_cache_lookup(struct svc_rqst *rqstp) } rp->c_type = RC_NOCACHE; out: - spin_unlock(&cache_lock); + spin_unlock(&b->cache_lock); return rtn; found_entry: nfsdstats.rchits++; /* We found a matching entry which is either in progress or done. */ age = jiffies - rp->c_timestamp; - lru_put_end(rp); + lru_put_end(b, rp); rtn = RC_DROPIT; /* Request being processed or excessive rexmits */ @@ -527,18 +535,23 @@ nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp) { struct svc_cacherep *rp = rqstp->rq_cacherep; struct kvec *resv = &rqstp->rq_res.head[0], *cachv; + u32 hash; + struct nfsd_drc_bucket *b; int len; size_t bufsize = 0; if (!rp) return; + hash = nfsd_cache_hash(rp->c_xid); + b = &drc_hashtbl[hash]; + len = resv->iov_len - ((char*)statp - (char*)resv->iov_base); len >>= 2; /* Don't cache excessive amounts of data and XDR failures */ if (!statp || len > (256 >> 2)) { - nfsd_reply_cache_free(rp); + nfsd_reply_cache_free(b, rp); return; } @@ -553,23 +566,23 @@ nfsd_cache_update(struct svc_rqst *rqstp, int cachetype, __be32 *statp) bufsize = len << 2; cachv->iov_base = kmalloc(bufsize, GFP_KERNEL); if (!cachv->iov_base) { - nfsd_reply_cache_free(rp); + nfsd_reply_cache_free(b, rp); return; } cachv->iov_len = bufsize; memcpy(cachv->iov_base, statp, bufsize); break; case RC_NOCACHE: - nfsd_reply_cache_free(rp); + nfsd_reply_cache_free(b, rp); return; } - spin_lock(&cache_lock); + spin_lock(&b->cache_lock); drc_mem_usage += bufsize; - lru_put_end(rp); + lru_put_end(b, rp); rp->c_secure = rqstp->rq_secure; rp->c_type = cachetype; rp->c_state = RC_DONE; - spin_unlock(&cache_lock); + spin_unlock(&b->cache_lock); return; } @@ -600,9 +613,9 @@ nfsd_cache_append(struct svc_rqst *rqstp, struct kvec *data) */ static int nfsd_reply_cache_stats_show(struct seq_file *m, void *v) { - spin_lock(&cache_lock); seq_printf(m, "max entries: %u\n", max_drc_entries); - seq_printf(m, "num entries: %u\n", num_drc_entries); + seq_printf(m, "num entries: %u\n", + atomic_read(&num_drc_entries)); seq_printf(m, "hash buckets: %u\n", 1 << maskbits); seq_printf(m, "mem usage: %u\n", drc_mem_usage); seq_printf(m, "cache hits: %u\n", nfsdstats.rchits); @@ -611,7 +624,6 @@ static int nfsd_reply_cache_stats_show(struct seq_file *m, void *v) seq_printf(m, "payload misses: %u\n", payload_misses); seq_printf(m, "longest chain len: %u\n", longest_chain); seq_printf(m, "cachesize at longest: %u\n", longest_chain_cachesize); - spin_unlock(&cache_lock); return 0; } diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index 4e042105fb6e..ca73ca79a0ee 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -49,6 +49,7 @@ enum { NFSD_Leasetime, NFSD_Gracetime, NFSD_RecoveryDir, + NFSD_V4EndGrace, #endif }; @@ -68,6 +69,7 @@ static ssize_t write_maxconn(struct file *file, char *buf, size_t size); static ssize_t write_leasetime(struct file *file, char *buf, size_t size); static ssize_t write_gracetime(struct file *file, char *buf, size_t size); static ssize_t write_recoverydir(struct file *file, char *buf, size_t size); +static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size); #endif static ssize_t (*write_op[])(struct file *, char *, size_t) = { @@ -84,6 +86,7 @@ static ssize_t (*write_op[])(struct file *, char *, size_t) = { [NFSD_Leasetime] = write_leasetime, [NFSD_Gracetime] = write_gracetime, [NFSD_RecoveryDir] = write_recoverydir, + [NFSD_V4EndGrace] = write_v4_end_grace, #endif }; @@ -1077,6 +1080,47 @@ static ssize_t write_recoverydir(struct file *file, char *buf, size_t size) return rv; } +/** + * write_v4_end_grace - release grace period for nfsd's v4.x lock manager + * + * Input: + * buf: ignored + * size: zero + * OR + * + * Input: + * buf: any value + * size: non-zero length of C string in @buf + * Output: + * passed-in buffer filled with "Y" or "N" with a newline + * and NULL-terminated C string. This indicates whether + * the grace period has ended in the current net + * namespace. Return code is the size in bytes of the + * string. Writing a string that starts with 'Y', 'y', or + * '1' to the file will end the grace period for nfsd's v4 + * lock manager. + */ +static ssize_t write_v4_end_grace(struct file *file, char *buf, size_t size) +{ + struct net *net = file->f_dentry->d_sb->s_fs_info; + struct nfsd_net *nn = net_generic(net, nfsd_net_id); + + if (size > 0) { + switch(buf[0]) { + case 'Y': + case 'y': + case '1': + nfsd4_end_grace(nn); + break; + default: + return -EINVAL; + } + } + + return scnprintf(buf, SIMPLE_TRANSACTION_LIMIT, "%c\n", + nn->grace_ended ? 'Y' : 'N'); +} + #endif /*----------------------------------------------------------------------------*/ @@ -1110,6 +1154,7 @@ static int nfsd_fill_super(struct super_block * sb, void * data, int silent) [NFSD_Leasetime] = {"nfsv4leasetime", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_Gracetime] = {"nfsv4gracetime", &transaction_ops, S_IWUSR|S_IRUSR}, [NFSD_RecoveryDir] = {"nfsv4recoverydir", &transaction_ops, S_IWUSR|S_IRUSR}, + [NFSD_V4EndGrace] = {"v4_end_grace", &transaction_ops, S_IWUSR|S_IRUGO}, #endif /* last one */ {""} }; diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index 847daf37e566..747f3b95bd11 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -251,7 +251,7 @@ void nfsd_lockd_shutdown(void); #define nfserr_deleg_revoked cpu_to_be32(NFS4ERR_DELEG_REVOKED) #define nfserr_partner_notsupp cpu_to_be32(NFS4ERR_PARTNER_NOTSUPP) #define nfserr_partner_no_auth cpu_to_be32(NFS4ERR_PARTNER_NO_AUTH) -#define nfserr_metadata_notsupp cpu_to_be32(NFS4ERR_METADATA_NOTSUPP) +#define nfserr_union_notsupp cpu_to_be32(NFS4ERR_UNION_NOTSUPP) #define nfserr_offload_denied cpu_to_be32(NFS4ERR_OFFLOAD_DENIED) #define nfserr_wrong_lfs cpu_to_be32(NFS4ERR_WRONG_LFS) #define nfserr_badlabel cpu_to_be32(NFS4ERR_BADLABEL) diff --git a/fs/nfsd/nfsfh.c b/fs/nfsd/nfsfh.c index e883a5868be6..88026fc6a981 100644 --- a/fs/nfsd/nfsfh.c +++ b/fs/nfsd/nfsfh.c @@ -209,8 +209,10 @@ static __be32 nfsd_set_fh_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp) * fix that case easily. */ struct cred *new = prepare_creds(); - if (!new) - return nfserrno(-ENOMEM); + if (!new) { + error = nfserrno(-ENOMEM); + goto out; + } new->cap_effective = cap_raise_nfsd_set(new->cap_effective, new->cap_permitted); diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 4a89e00d7461..2712042a66b1 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -62,16 +62,21 @@ typedef struct { (s)->si_generation struct nfsd4_callback { - void *cb_op; struct nfs4_client *cb_clp; struct list_head cb_per_client; u32 cb_minorversion; struct rpc_message cb_msg; - const struct rpc_call_ops *cb_ops; + struct nfsd4_callback_ops *cb_ops; struct work_struct cb_work; bool cb_done; }; +struct nfsd4_callback_ops { + void (*prepare)(struct nfsd4_callback *); + int (*done)(struct nfsd4_callback *, struct rpc_task *); + void (*release)(struct nfsd4_callback *); +}; + /* * A core object that represents a "common" stateid. These are generally * embedded within the different (more specific) stateid objects and contain @@ -127,6 +132,9 @@ struct nfs4_delegation { struct nfsd4_callback dl_recall; }; +#define cb_to_delegation(cb) \ + container_of(cb, struct nfs4_delegation, dl_recall) + /* client delegation callback info */ struct nfs4_cb_conn { /* SETCLIENTID info */ @@ -306,6 +314,7 @@ struct nfs4_client { #define NFSD4_CLIENT_STABLE (2) /* client on stable storage */ #define NFSD4_CLIENT_RECLAIM_COMPLETE (3) /* reclaim_complete done */ #define NFSD4_CLIENT_CONFIRMED (4) /* client is confirmed */ +#define NFSD4_CLIENT_UPCALL_LOCK (5) /* upcall serialization */ #define NFSD4_CLIENT_CB_FLAG_MASK (1 << NFSD4_CLIENT_CB_UPDATE | \ 1 << NFSD4_CLIENT_CB_KILL) unsigned long cl_flags; @@ -477,7 +486,6 @@ struct nfs4_file { atomic_t fi_access[2]; u32 fi_share_deny; struct file *fi_deleg_file; - struct file_lock *fi_lease; atomic_t fi_delegees; struct knfsd_fh fi_fhandle; bool fi_had_conflict; @@ -517,6 +525,13 @@ static inline struct nfs4_ol_stateid *openlockstateid(struct nfs4_stid *s) #define RD_STATE 0x00000010 #define WR_STATE 0x00000020 +enum nfsd4_cb_op { + NFSPROC4_CLNT_CB_NULL = 0, + NFSPROC4_CLNT_CB_RECALL, + NFSPROC4_CLNT_CB_SEQUENCE, +}; + + struct nfsd4_compound_state; struct nfsd_net; @@ -531,12 +546,12 @@ extern struct nfs4_client_reclaim *nfsd4_find_reclaim_client(const char *recdir, extern __be32 nfs4_check_open_reclaim(clientid_t *clid, struct nfsd4_compound_state *cstate, struct nfsd_net *nn); extern int set_callback_cred(void); -void nfsd4_run_cb_null(struct work_struct *w); -void nfsd4_run_cb_recall(struct work_struct *w); extern void nfsd4_probe_callback(struct nfs4_client *clp); extern void nfsd4_probe_callback_sync(struct nfs4_client *clp); extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *); -extern void nfsd4_cb_recall(struct nfs4_delegation *dp); +extern void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp, + struct nfsd4_callback_ops *ops, enum nfsd4_cb_op op); +extern void nfsd4_run_cb(struct nfsd4_callback *cb); extern int nfsd4_create_callback_queue(void); extern void nfsd4_destroy_callback_queue(void); extern void nfsd4_shutdown_callback(struct nfs4_client *); @@ -545,13 +560,16 @@ extern struct nfs4_client_reclaim *nfs4_client_to_reclaim(const char *name, struct nfsd_net *nn); extern bool nfs4_has_reclaimed_state(const char *name, struct nfsd_net *nn); +/* grace period management */ +void nfsd4_end_grace(struct nfsd_net *nn); + /* nfs4recover operations */ extern int nfsd4_client_tracking_init(struct net *net); extern void nfsd4_client_tracking_exit(struct net *net); extern void nfsd4_client_record_create(struct nfs4_client *clp); extern void nfsd4_client_record_remove(struct nfs4_client *clp); extern int nfsd4_client_record_check(struct nfs4_client *clp); -extern void nfsd4_record_grace_done(struct nfsd_net *nn, time_t boot_time); +extern void nfsd4_record_grace_done(struct nfsd_net *nn); /* nfs fault injection functions */ #ifdef CONFIG_NFSD_FAULT_INJECTION diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index f501a9b5c9df..989129e2d6ea 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -445,6 +445,16 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, struct iattr *iap, if (err) goto out; size_change = 1; + + /* + * RFC5661, Section 18.30.4: + * Changing the size of a file with SETATTR indirectly + * changes the time_modify and change attributes. + * + * (and similar for the older RFCs) + */ + if (iap->ia_size != i_size_read(inode)) + iap->ia_valid |= ATTR_MTIME; } iap->ia_valid |= ATTR_CTIME; @@ -649,6 +659,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, { struct path path; struct inode *inode; + struct file *file; int flags = O_RDONLY|O_LARGEFILE; __be32 err; int host_err = 0; @@ -703,19 +714,25 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, else flags = O_WRONLY|O_LARGEFILE; } - *filp = dentry_open(&path, flags, current_cred()); - if (IS_ERR(*filp)) { - host_err = PTR_ERR(*filp); - *filp = NULL; - } else { - host_err = ima_file_check(*filp, may_flags); - if (may_flags & NFSD_MAY_64BIT_COOKIE) - (*filp)->f_mode |= FMODE_64BITHASH; - else - (*filp)->f_mode |= FMODE_32BITHASH; + file = dentry_open(&path, flags, current_cred()); + if (IS_ERR(file)) { + host_err = PTR_ERR(file); + goto out_nfserr; } + host_err = ima_file_check(file, may_flags, 0); + if (host_err) { + nfsd_close(file); + goto out_nfserr; + } + + if (may_flags & NFSD_MAY_64BIT_COOKIE) + file->f_mode |= FMODE_64BITHASH; + else + file->f_mode |= FMODE_32BITHASH; + + *filp = file; out_nfserr: err = nfserrno(host_err); out: diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index 465e7799742a..5720e9457f33 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -428,6 +428,17 @@ struct nfsd4_reclaim_complete { u32 rca_one_fs; }; +struct nfsd4_seek { + /* request */ + stateid_t seek_stateid; + loff_t seek_offset; + u32 seek_whence; + + /* response */ + u32 seek_eof; + loff_t seek_pos; +}; + struct nfsd4_op { int opnum; __be32 status; @@ -473,6 +484,9 @@ struct nfsd4_op { struct nfsd4_reclaim_complete reclaim_complete; struct nfsd4_test_stateid test_stateid; struct nfsd4_free_stateid free_stateid; + + /* NFSv4.2 */ + struct nfsd4_seek seek; } u; struct nfs4_replay * replay; }; diff --git a/fs/nilfs2/file.c b/fs/nilfs2/file.c index 24978153c0c4..e9e3325f29f3 100644 --- a/fs/nilfs2/file.c +++ b/fs/nilfs2/file.c @@ -56,11 +56,9 @@ int nilfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) mutex_unlock(&inode->i_mutex); nilfs = inode->i_sb->s_fs_info; - if (!err && nilfs_test_opt(nilfs, BARRIER)) { - err = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); - if (err != -EIO) - err = 0; - } + if (!err) + err = nilfs_flush_device(nilfs); + return err; } diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index d071e7f23de2..e1fa69b341b9 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -126,7 +126,7 @@ int nilfs_get_block(struct inode *inode, sector_t blkoff, nilfs_transaction_abort(inode->i_sb); goto out; } - nilfs_mark_inode_dirty(inode); + nilfs_mark_inode_dirty_sync(inode); nilfs_transaction_commit(inode->i_sb); /* never fails */ /* Error handling should be detailed */ set_buffer_new(bh_result); @@ -672,7 +672,7 @@ void nilfs_write_inode_common(struct inode *inode, for substitutions of appended fields */ } -void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh) +void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh, int flags) { ino_t ino = inode->i_ino; struct nilfs_inode_info *ii = NILFS_I(inode); @@ -683,7 +683,8 @@ void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh) if (test_and_clear_bit(NILFS_I_NEW, &ii->i_state)) memset(raw_inode, 0, NILFS_MDT(ifile)->mi_entry_size); - set_bit(NILFS_I_INODE_DIRTY, &ii->i_state); + if (flags & I_DIRTY_DATASYNC) + set_bit(NILFS_I_INODE_SYNC, &ii->i_state); nilfs_write_inode_common(inode, raw_inode, 0); /* XXX: call with has_bmap = 0 is a workaround to avoid @@ -939,7 +940,7 @@ int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty) return 0; } -int nilfs_mark_inode_dirty(struct inode *inode) +int __nilfs_mark_inode_dirty(struct inode *inode, int flags) { struct buffer_head *ibh; int err; @@ -950,7 +951,7 @@ int nilfs_mark_inode_dirty(struct inode *inode) "failed to reget inode block.\n"); return err; } - nilfs_update_inode(inode, ibh); + nilfs_update_inode(inode, ibh, flags); mark_buffer_dirty(ibh); nilfs_mdt_mark_dirty(NILFS_I(inode)->i_root->ifile); brelse(ibh); @@ -983,7 +984,7 @@ void nilfs_dirty_inode(struct inode *inode, int flags) return; } nilfs_transaction_begin(inode->i_sb, &ti, 0); - nilfs_mark_inode_dirty(inode); + __nilfs_mark_inode_dirty(inode, flags); nilfs_transaction_commit(inode->i_sb); /* never fails */ } diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index 422fb54b7377..9a20e513d7eb 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -1022,11 +1022,9 @@ static int nilfs_ioctl_sync(struct inode *inode, struct file *filp, return ret; nilfs = inode->i_sb->s_fs_info; - if (nilfs_test_opt(nilfs, BARRIER)) { - ret = blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); - if (ret == -EIO) - return ret; - } + ret = nilfs_flush_device(nilfs); + if (ret < 0) + return ret; if (argp != NULL) { down_read(&nilfs->ns_segctor_sem); diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h index 0696161bf59d..91093cd74f0d 100644 --- a/fs/nilfs2/nilfs.h +++ b/fs/nilfs2/nilfs.h @@ -104,7 +104,7 @@ enum { constructor */ NILFS_I_COLLECTED, /* All dirty blocks are collected */ NILFS_I_UPDATED, /* The file has been written back */ - NILFS_I_INODE_DIRTY, /* write_inode is requested */ + NILFS_I_INODE_SYNC, /* dsync is not allowed for inode */ NILFS_I_BMAP, /* has bmap and btnode_cache */ NILFS_I_GCINODE, /* inode for GC, on memory only */ }; @@ -273,7 +273,7 @@ struct inode *nilfs_iget(struct super_block *sb, struct nilfs_root *root, unsigned long ino); extern struct inode *nilfs_iget_for_gc(struct super_block *sb, unsigned long ino, __u64 cno); -extern void nilfs_update_inode(struct inode *, struct buffer_head *); +extern void nilfs_update_inode(struct inode *, struct buffer_head *, int); extern void nilfs_truncate(struct inode *); extern void nilfs_evict_inode(struct inode *); extern int nilfs_setattr(struct dentry *, struct iattr *); @@ -282,10 +282,18 @@ int nilfs_permission(struct inode *inode, int mask); int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh); extern int nilfs_inode_dirty(struct inode *); int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty); -extern int nilfs_mark_inode_dirty(struct inode *); +extern int __nilfs_mark_inode_dirty(struct inode *, int); extern void nilfs_dirty_inode(struct inode *, int flags); int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, __u64 start, __u64 len); +static inline int nilfs_mark_inode_dirty(struct inode *inode) +{ + return __nilfs_mark_inode_dirty(inode, I_DIRTY); +} +static inline int nilfs_mark_inode_dirty_sync(struct inode *inode) +{ + return __nilfs_mark_inode_dirty(inode, I_DIRTY_SYNC); +} /* super.c */ extern struct inode *nilfs_alloc_inode(struct super_block *); diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index a1a191634abc..7ef18fc656c2 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -930,7 +930,7 @@ static void nilfs_drop_collected_inodes(struct list_head *head) if (!test_and_clear_bit(NILFS_I_COLLECTED, &ii->i_state)) continue; - clear_bit(NILFS_I_INODE_DIRTY, &ii->i_state); + clear_bit(NILFS_I_INODE_SYNC, &ii->i_state); set_bit(NILFS_I_UPDATED, &ii->i_state); } } @@ -1833,6 +1833,7 @@ static void nilfs_segctor_complete_write(struct nilfs_sc_info *sci) nilfs_set_next_segment(nilfs, segbuf); if (update_sr) { + nilfs->ns_flushed_device = 0; nilfs_set_last_segment(nilfs, segbuf->sb_pseg_start, segbuf->sb_sum.seg_seq, nilfs->ns_cno++); @@ -2194,7 +2195,7 @@ int nilfs_construct_dsync_segment(struct super_block *sb, struct inode *inode, nilfs_transaction_lock(sb, &ti, 0); ii = NILFS_I(inode); - if (test_bit(NILFS_I_INODE_DIRTY, &ii->i_state) || + if (test_bit(NILFS_I_INODE_SYNC, &ii->i_state) || nilfs_test_opt(nilfs, STRICT_ORDER) || test_bit(NILFS_SC_UNCLOSED, &sci->sc_flags) || nilfs_discontinued(nilfs)) { @@ -2216,6 +2217,8 @@ int nilfs_construct_dsync_segment(struct super_block *sb, struct inode *inode, sci->sc_dsync_end = end; err = nilfs_segctor_do_construct(sci, SC_LSEG_DSYNC); + if (!err) + nilfs->ns_flushed_device = 0; nilfs_transaction_unlock(sb); return err; diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c index 228f5bdf0772..2e5b3ec85b8f 100644 --- a/fs/nilfs2/super.c +++ b/fs/nilfs2/super.c @@ -310,6 +310,9 @@ int nilfs_commit_super(struct super_block *sb, int flag) nilfs->ns_sbsize)); } clear_nilfs_sb_dirty(nilfs); + nilfs->ns_flushed_device = 1; + /* make sure store to ns_flushed_device cannot be reordered */ + smp_wmb(); return nilfs_sync_super(sb, flag); } @@ -514,6 +517,9 @@ static int nilfs_sync_fs(struct super_block *sb, int wait) } up_write(&nilfs->ns_sem); + if (!err) + err = nilfs_flush_device(nilfs); + return err; } diff --git a/fs/nilfs2/the_nilfs.h b/fs/nilfs2/the_nilfs.h index d01ead1bea9a..23778d385836 100644 --- a/fs/nilfs2/the_nilfs.h +++ b/fs/nilfs2/the_nilfs.h @@ -46,6 +46,7 @@ enum { /** * struct the_nilfs - struct to supervise multiple nilfs mount points * @ns_flags: flags + * @ns_flushed_device: flag indicating if all volatile data was flushed * @ns_bdev: block device * @ns_sem: semaphore for shared states * @ns_snapshot_mount_mutex: mutex to protect snapshot mounts @@ -103,6 +104,7 @@ enum { */ struct the_nilfs { unsigned long ns_flags; + int ns_flushed_device; struct block_device *ns_bdev; struct rw_semaphore ns_sem; @@ -371,4 +373,24 @@ static inline int nilfs_segment_is_active(struct the_nilfs *nilfs, __u64 n) return n == nilfs->ns_segnum || n == nilfs->ns_nextnum; } +static inline int nilfs_flush_device(struct the_nilfs *nilfs) +{ + int err; + + if (!nilfs_test_opt(nilfs, BARRIER) || nilfs->ns_flushed_device) + return 0; + + nilfs->ns_flushed_device = 1; + /* + * the store to ns_flushed_device must not be reordered after + * blkdev_issue_flush(). + */ + smp_wmb(); + + err = blkdev_issue_flush(nilfs->ns_bdev, GFP_KERNEL, NULL); + if (err != -EIO) + err = 0; + return err; +} + #endif /* _THE_NILFS_H */ diff --git a/fs/notify/dnotify/dnotify.c b/fs/notify/dnotify/dnotify.c index abc8cbcfe90e..caaaf9dfe353 100644 --- a/fs/notify/dnotify/dnotify.c +++ b/fs/notify/dnotify/dnotify.c @@ -346,13 +346,7 @@ int fcntl_dirnotify(int fd, struct file *filp, unsigned long arg) goto out; } - error = __f_setown(filp, task_pid(current), PIDTYPE_PID, 0); - if (error) { - /* if we added, we must shoot */ - if (dn_mark == new_dn_mark) - destroy = 1; - goto out; - } + __f_setown(filp, task_pid(current), PIDTYPE_PID, 0); error = attach_dn(dn, dn_mark, id, fd, filp, mask); /* !error means that we attached the dn to the dn_mark, so don't free it */ diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index b13992a41bd9..c991616acca9 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -78,7 +78,7 @@ static int create_fd(struct fsnotify_group *group, pr_debug("%s: group=%p event=%p\n", __func__, group, event); - client_fd = get_unused_fd(); + client_fd = get_unused_fd_flags(group->fanotify_data.f_flags); if (client_fd < 0) return client_fd; diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h index 85e7d2b431d9..9c0898c4cfe1 100644 --- a/fs/notify/fsnotify.h +++ b/fs/notify/fsnotify.h @@ -23,9 +23,6 @@ extern int fsnotify_add_vfsmount_mark(struct fsnotify_mark *mark, struct fsnotify_group *group, struct vfsmount *mnt, int allow_dups); -/* final kfree of a group */ -extern void fsnotify_final_destroy_group(struct fsnotify_group *group); - /* vfsmount specific destruction of a mark */ extern void fsnotify_destroy_vfsmount_mark(struct fsnotify_mark *mark); /* inode specific destruction of a mark */ diff --git a/fs/notify/group.c b/fs/notify/group.c index ad1995980456..d16b62cb2854 100644 --- a/fs/notify/group.c +++ b/fs/notify/group.c @@ -31,7 +31,7 @@ /* * Final freeing of a group */ -void fsnotify_final_destroy_group(struct fsnotify_group *group) +static void fsnotify_final_destroy_group(struct fsnotify_group *group) { if (group->ops->free_group_priv) group->ops->free_group_priv(group); diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c index 0f88bc0b4e6c..7d888d77d59a 100644 --- a/fs/notify/inotify/inotify_fsnotify.c +++ b/fs/notify/inotify/inotify_fsnotify.c @@ -165,8 +165,10 @@ static void inotify_free_group_priv(struct fsnotify_group *group) /* ideally the idr is empty and we won't hit the BUG in the callback */ idr_for_each(&group->inotify_data.idr, idr_callback, group); idr_destroy(&group->inotify_data.idr); - atomic_dec(&group->inotify_data.user->inotify_devs); - free_uid(group->inotify_data.user); + if (group->inotify_data.user) { + atomic_dec(&group->inotify_data.user->inotify_devs); + free_uid(group->inotify_data.user); + } } static void inotify_free_event(struct fsnotify_event *fsn_event) diff --git a/fs/ntfs/Makefile b/fs/ntfs/Makefile index 30206b238433..36ae529511c4 100644 --- a/fs/ntfs/Makefile +++ b/fs/ntfs/Makefile @@ -8,7 +8,7 @@ ntfs-y := aops.o attrib.o collate.o compress.o debug.o dir.o file.o \ ntfs-$(CONFIG_NTFS_RW) += bitmap.o lcnalloc.o logfile.o quota.o usnjrnl.o -ccflags-y := -DNTFS_VERSION=\"2.1.30\" +ccflags-y := -DNTFS_VERSION=\"2.1.31\" ccflags-$(CONFIG_NTFS_DEBUG) += -DDEBUG ccflags-$(CONFIG_NTFS_RW) += -DNTFS_RW diff --git a/fs/ntfs/aops.c b/fs/ntfs/aops.c index d267ea6aa1a0..7521e11db728 100644 --- a/fs/ntfs/aops.c +++ b/fs/ntfs/aops.c @@ -1,8 +1,7 @@ /** * aops.c - NTFS kernel address space operations and page cache handling. - * Part of the Linux-NTFS project. * - * Copyright (c) 2001-2007 Anton Altaparmakov + * Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc. * Copyright (c) 2002 Richard Russon * * This program/include file is free software; you can redistribute it and/or @@ -1539,16 +1538,157 @@ err_out: #endif /* NTFS_RW */ /** - * ntfs_aops - general address space operations for inodes and attributes + * ntfs_bmap - map logical file block to physical device block + * @mapping: address space mapping to which the block to be mapped belongs + * @block: logical block to map to its physical device block + * + * For regular, non-resident files (i.e. not compressed and not encrypted), map + * the logical @block belonging to the file described by the address space + * mapping @mapping to its physical device block. + * + * The size of the block is equal to the @s_blocksize field of the super block + * of the mounted file system which is guaranteed to be smaller than or equal + * to the cluster size thus the block is guaranteed to fit entirely inside the + * cluster which means we do not need to care how many contiguous bytes are + * available after the beginning of the block. + * + * Return the physical device block if the mapping succeeded or 0 if the block + * is sparse or there was an error. + * + * Note: This is a problem if someone tries to run bmap() on $Boot system file + * as that really is in block zero but there is nothing we can do. bmap() is + * just broken in that respect (just like it cannot distinguish sparse from + * not available or error). */ -const struct address_space_operations ntfs_aops = { - .readpage = ntfs_readpage, /* Fill page with data. */ +static sector_t ntfs_bmap(struct address_space *mapping, sector_t block) +{ + s64 ofs, size; + loff_t i_size; + LCN lcn; + unsigned long blocksize, flags; + ntfs_inode *ni = NTFS_I(mapping->host); + ntfs_volume *vol = ni->vol; + unsigned delta; + unsigned char blocksize_bits, cluster_size_shift; + + ntfs_debug("Entering for mft_no 0x%lx, logical block 0x%llx.", + ni->mft_no, (unsigned long long)block); + if (ni->type != AT_DATA || !NInoNonResident(ni) || NInoEncrypted(ni)) { + ntfs_error(vol->sb, "BMAP does not make sense for %s " + "attributes, returning 0.", + (ni->type != AT_DATA) ? "non-data" : + (!NInoNonResident(ni) ? "resident" : + "encrypted")); + return 0; + } + /* None of these can happen. */ + BUG_ON(NInoCompressed(ni)); + BUG_ON(NInoMstProtected(ni)); + blocksize = vol->sb->s_blocksize; + blocksize_bits = vol->sb->s_blocksize_bits; + ofs = (s64)block << blocksize_bits; + read_lock_irqsave(&ni->size_lock, flags); + size = ni->initialized_size; + i_size = i_size_read(VFS_I(ni)); + read_unlock_irqrestore(&ni->size_lock, flags); + /* + * If the offset is outside the initialized size or the block straddles + * the initialized size then pretend it is a hole unless the + * initialized size equals the file size. + */ + if (unlikely(ofs >= size || (ofs + blocksize > size && size < i_size))) + goto hole; + cluster_size_shift = vol->cluster_size_bits; + down_read(&ni->runlist.lock); + lcn = ntfs_attr_vcn_to_lcn_nolock(ni, ofs >> cluster_size_shift, false); + up_read(&ni->runlist.lock); + if (unlikely(lcn < LCN_HOLE)) { + /* + * Step down to an integer to avoid gcc doing a long long + * comparision in the switch when we know @lcn is between + * LCN_HOLE and LCN_EIO (i.e. -1 to -5). + * + * Otherwise older gcc (at least on some architectures) will + * try to use __cmpdi2() which is of course not available in + * the kernel. + */ + switch ((int)lcn) { + case LCN_ENOENT: + /* + * If the offset is out of bounds then pretend it is a + * hole. + */ + goto hole; + case LCN_ENOMEM: + ntfs_error(vol->sb, "Not enough memory to complete " + "mapping for inode 0x%lx. " + "Returning 0.", ni->mft_no); + break; + default: + ntfs_error(vol->sb, "Failed to complete mapping for " + "inode 0x%lx. Run chkdsk. " + "Returning 0.", ni->mft_no); + break; + } + return 0; + } + if (lcn < 0) { + /* It is a hole. */ +hole: + ntfs_debug("Done (returning hole)."); + return 0; + } + /* + * The block is really allocated and fullfils all our criteria. + * Convert the cluster to units of block size and return the result. + */ + delta = ofs & vol->cluster_size_mask; + if (unlikely(sizeof(block) < sizeof(lcn))) { + block = lcn = ((lcn << cluster_size_shift) + delta) >> + blocksize_bits; + /* If the block number was truncated return 0. */ + if (unlikely(block != lcn)) { + ntfs_error(vol->sb, "Physical block 0x%llx is too " + "large to be returned, returning 0.", + (long long)lcn); + return 0; + } + } else + block = ((lcn << cluster_size_shift) + delta) >> + blocksize_bits; + ntfs_debug("Done (returning block 0x%llx).", (unsigned long long)lcn); + return block; +} + +/** + * ntfs_normal_aops - address space operations for normal inodes and attributes + * + * Note these are not used for compressed or mst protected inodes and + * attributes. + */ +const struct address_space_operations ntfs_normal_aops = { + .readpage = ntfs_readpage, #ifdef NTFS_RW - .writepage = ntfs_writepage, /* Write dirty page to disk. */ + .writepage = ntfs_writepage, + .set_page_dirty = __set_page_dirty_buffers, +#endif /* NTFS_RW */ + .bmap = ntfs_bmap, + .migratepage = buffer_migrate_page, + .is_partially_uptodate = block_is_partially_uptodate, + .error_remove_page = generic_error_remove_page, +}; + +/** + * ntfs_compressed_aops - address space operations for compressed inodes + */ +const struct address_space_operations ntfs_compressed_aops = { + .readpage = ntfs_readpage, +#ifdef NTFS_RW + .writepage = ntfs_writepage, + .set_page_dirty = __set_page_dirty_buffers, #endif /* NTFS_RW */ - .migratepage = buffer_migrate_page, /* Move a page cache page from - one physical page to an - other. */ + .migratepage = buffer_migrate_page, + .is_partially_uptodate = block_is_partially_uptodate, .error_remove_page = generic_error_remove_page, }; @@ -1564,9 +1704,8 @@ const struct address_space_operations ntfs_mst_aops = { without touching the buffers belonging to the page. */ #endif /* NTFS_RW */ - .migratepage = buffer_migrate_page, /* Move a page cache page from - one physical page to an - other. */ + .migratepage = buffer_migrate_page, + .is_partially_uptodate = block_is_partially_uptodate, .error_remove_page = generic_error_remove_page, }; diff --git a/fs/ntfs/debug.c b/fs/ntfs/debug.c index dd6103cc93c1..825a54e8f490 100644 --- a/fs/ntfs/debug.c +++ b/fs/ntfs/debug.c @@ -112,7 +112,7 @@ void __ntfs_error(const char *function, const struct super_block *sb, /* If 1, output debug messages, and if 0, don't. */ int debug_msgs = 0; -void __ntfs_debug (const char *file, int line, const char *function, +void __ntfs_debug(const char *file, int line, const char *function, const char *fmt, ...) { struct va_format vaf; diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index f5ec1ce7a532..643faa44f22b 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -1,7 +1,7 @@ /* * file.c - NTFS kernel file operations. Part of the Linux-NTFS project. * - * Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc. + * Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc. * * This program/include file is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as published @@ -410,7 +410,8 @@ static inline int __ntfs_grab_cache_pages(struct address_space *mapping, BUG_ON(!nr_pages); err = nr = 0; do { - pages[nr] = find_lock_page(mapping, index); + pages[nr] = find_get_page_flags(mapping, index, FGP_LOCK | + FGP_ACCESSED); if (!pages[nr]) { if (!*cached_page) { *cached_page = page_cache_alloc(mapping); diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c index f47af5e6e230..898b9949d363 100644 --- a/fs/ntfs/inode.c +++ b/fs/ntfs/inode.c @@ -1,7 +1,7 @@ /** - * inode.c - NTFS kernel inode handling. Part of the Linux-NTFS project. + * inode.c - NTFS kernel inode handling. * - * Copyright (c) 2001-2007 Anton Altaparmakov + * Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc. * * This program/include file is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License as published @@ -1012,6 +1012,7 @@ skip_large_dir_stuff: /* Setup the operations for this inode. */ vi->i_op = &ntfs_dir_inode_ops; vi->i_fop = &ntfs_dir_ops; + vi->i_mapping->a_ops = &ntfs_mst_aops; } else { /* It is a file. */ ntfs_attr_reinit_search_ctx(ctx); @@ -1160,11 +1161,12 @@ no_data_attr_special_case: /* Setup the operations for this inode. */ vi->i_op = &ntfs_file_inode_ops; vi->i_fop = &ntfs_file_ops; + vi->i_mapping->a_ops = &ntfs_normal_aops; + if (NInoMstProtected(ni)) + vi->i_mapping->a_ops = &ntfs_mst_aops; + else if (NInoCompressed(ni)) + vi->i_mapping->a_ops = &ntfs_compressed_aops; } - if (NInoMstProtected(ni)) - vi->i_mapping->a_ops = &ntfs_mst_aops; - else - vi->i_mapping->a_ops = &ntfs_aops; /* * The number of 512-byte blocks used on disk (for stat). This is in so * far inaccurate as it doesn't account for any named streams or other @@ -1414,10 +1416,11 @@ static int ntfs_read_locked_attr_inode(struct inode *base_vi, struct inode *vi) ni->allocated_size = sle64_to_cpu( a->data.non_resident.allocated_size); } + vi->i_mapping->a_ops = &ntfs_normal_aops; if (NInoMstProtected(ni)) vi->i_mapping->a_ops = &ntfs_mst_aops; - else - vi->i_mapping->a_ops = &ntfs_aops; + else if (NInoCompressed(ni)) + vi->i_mapping->a_ops = &ntfs_compressed_aops; if ((NInoCompressed(ni) || NInoSparse(ni)) && ni->type != AT_INDEX_ROOT) vi->i_blocks = ni->itype.compressed.size >> 9; else diff --git a/fs/ntfs/ntfs.h b/fs/ntfs/ntfs.h index d6a340bf80fc..c581e26a350d 100644 --- a/fs/ntfs/ntfs.h +++ b/fs/ntfs/ntfs.h @@ -1,8 +1,7 @@ /* - * ntfs.h - Defines for NTFS Linux kernel driver. Part of the Linux-NTFS - * project. + * ntfs.h - Defines for NTFS Linux kernel driver. * - * Copyright (c) 2001-2005 Anton Altaparmakov + * Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc. * Copyright (C) 2002 Richard Russon * * This program/include file is free software; you can redistribute it and/or @@ -57,7 +56,8 @@ extern struct kmem_cache *ntfs_attr_ctx_cache; extern struct kmem_cache *ntfs_index_ctx_cache; /* The various operations structs defined throughout the driver files. */ -extern const struct address_space_operations ntfs_aops; +extern const struct address_space_operations ntfs_normal_aops; +extern const struct address_space_operations ntfs_compressed_aops; extern const struct address_space_operations ntfs_mst_aops; extern const struct file_operations ntfs_file_ops; diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index 6c3296e546c3..9e1e112074fb 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -3208,7 +3208,7 @@ static void __exit exit_ntfs_fs(void) } MODULE_AUTHOR("Anton Altaparmakov <anton@tuxera.com>"); -MODULE_DESCRIPTION("NTFS 1.2/3.x driver - Copyright (c) 2001-2011 Anton Altaparmakov and Tuxera Inc."); +MODULE_DESCRIPTION("NTFS 1.2/3.x driver - Copyright (c) 2001-2014 Anton Altaparmakov and Tuxera Inc."); MODULE_VERSION(NTFS_VERSION); MODULE_LICENSE("GPL"); #ifdef DEBUG diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 4a231a166cf8..1ef547e49373 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -1481,8 +1481,16 @@ static int ocfs2_write_begin_inline(struct address_space *mapping, handle_t *handle; struct ocfs2_dinode *di = (struct ocfs2_dinode *)wc->w_di_bh->b_data; + handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out; + } + page = find_or_create_page(mapping, 0, GFP_NOFS); if (!page) { + ocfs2_commit_trans(osb, handle); ret = -ENOMEM; mlog_errno(ret); goto out; @@ -1494,13 +1502,6 @@ static int ocfs2_write_begin_inline(struct address_space *mapping, wc->w_pages[0] = wc->w_target_page = page; wc->w_num_pages = 1; - handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - mlog_errno(ret); - goto out; - } - ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), wc->w_di_bh, OCFS2_JOURNAL_ACCESS_WRITE); if (ret) { diff --git a/fs/ocfs2/cluster/heartbeat.c b/fs/ocfs2/cluster/heartbeat.c index 73039295d0d1..eb9d48746ab4 100644 --- a/fs/ocfs2/cluster/heartbeat.c +++ b/fs/ocfs2/cluster/heartbeat.c @@ -2244,7 +2244,7 @@ ssize_t o2hb_heartbeat_group_mode_store(struct o2hb_heartbeat_group *group, return -EINVAL; for (i = 0; i < O2HB_HEARTBEAT_NUM_MODES; ++i) { - if (strnicmp(page, o2hb_heartbeat_mode_desc[i], len)) + if (strncasecmp(page, o2hb_heartbeat_mode_desc[i], len)) continue; ret = o2hb_global_heartbeat_mode_set(i); @@ -2572,6 +2572,25 @@ int o2hb_check_node_heartbeating(u8 node_num) } EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating); +int o2hb_check_node_heartbeating_no_sem(u8 node_num) +{ + unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; + unsigned long flags; + + spin_lock_irqsave(&o2hb_live_lock, flags); + o2hb_fill_node_map_from_callback(testing_map, sizeof(testing_map)); + spin_unlock_irqrestore(&o2hb_live_lock, flags); + if (!test_bit(node_num, testing_map)) { + mlog(ML_HEARTBEAT, + "node (%u) does not have heartbeating enabled.\n", + node_num); + return 0; + } + + return 1; +} +EXPORT_SYMBOL_GPL(o2hb_check_node_heartbeating_no_sem); + int o2hb_check_node_heartbeating_from_callback(u8 node_num) { unsigned long testing_map[BITS_TO_LONGS(O2NM_MAX_NODES)]; diff --git a/fs/ocfs2/cluster/heartbeat.h b/fs/ocfs2/cluster/heartbeat.h index 00ad8e8fea51..3ef5137dc362 100644 --- a/fs/ocfs2/cluster/heartbeat.h +++ b/fs/ocfs2/cluster/heartbeat.h @@ -80,6 +80,7 @@ void o2hb_fill_node_map(unsigned long *map, void o2hb_exit(void); int o2hb_init(void); int o2hb_check_node_heartbeating(u8 node_num); +int o2hb_check_node_heartbeating_no_sem(u8 node_num); int o2hb_check_node_heartbeating_from_callback(u8 node_num); int o2hb_check_local_node_heartbeating(void); void o2hb_stop_all_regions(void); diff --git a/fs/ocfs2/cluster/masklog.c b/fs/ocfs2/cluster/masklog.c index 07ac24fd9252..af7598bff1b5 100644 --- a/fs/ocfs2/cluster/masklog.c +++ b/fs/ocfs2/cluster/masklog.c @@ -49,13 +49,13 @@ static ssize_t mlog_mask_show(u64 mask, char *buf) static ssize_t mlog_mask_store(u64 mask, const char *buf, size_t count) { - if (!strnicmp(buf, "allow", 5)) { + if (!strncasecmp(buf, "allow", 5)) { __mlog_set_u64(mask, mlog_and_bits); __mlog_clear_u64(mask, mlog_not_bits); - } else if (!strnicmp(buf, "deny", 4)) { + } else if (!strncasecmp(buf, "deny", 4)) { __mlog_set_u64(mask, mlog_not_bits); __mlog_clear_u64(mask, mlog_and_bits); - } else if (!strnicmp(buf, "off", 3)) { + } else if (!strncasecmp(buf, "off", 3)) { __mlog_clear_u64(mask, mlog_not_bits); __mlog_clear_u64(mask, mlog_and_bits); } else diff --git a/fs/ocfs2/cluster/netdebug.c b/fs/ocfs2/cluster/netdebug.c index 73ba81928bce..27d1242c8383 100644 --- a/fs/ocfs2/cluster/netdebug.c +++ b/fs/ocfs2/cluster/netdebug.c @@ -185,29 +185,13 @@ static const struct seq_operations nst_seq_ops = { static int nst_fop_open(struct inode *inode, struct file *file) { struct o2net_send_tracking *dummy_nst; - struct seq_file *seq; - int ret; - dummy_nst = kmalloc(sizeof(struct o2net_send_tracking), GFP_KERNEL); - if (dummy_nst == NULL) { - ret = -ENOMEM; - goto out; - } - dummy_nst->st_task = NULL; - - ret = seq_open(file, &nst_seq_ops); - if (ret) - goto out; - - seq = file->private_data; - seq->private = dummy_nst; + dummy_nst = __seq_open_private(file, &nst_seq_ops, sizeof(*dummy_nst)); + if (!dummy_nst) + return -ENOMEM; o2net_debug_add_nst(dummy_nst); - dummy_nst = NULL; - -out: - kfree(dummy_nst); - return ret; + return 0; } static int nst_fop_release(struct inode *inode, struct file *file) @@ -412,33 +396,27 @@ static const struct seq_operations sc_seq_ops = { .show = sc_seq_show, }; -static int sc_common_open(struct file *file, struct o2net_sock_debug *sd) +static int sc_common_open(struct file *file, int ctxt) { + struct o2net_sock_debug *sd; struct o2net_sock_container *dummy_sc; - struct seq_file *seq; - int ret; - dummy_sc = kmalloc(sizeof(struct o2net_sock_container), GFP_KERNEL); - if (dummy_sc == NULL) { - ret = -ENOMEM; - goto out; - } - dummy_sc->sc_page = NULL; + dummy_sc = kzalloc(sizeof(*dummy_sc), GFP_KERNEL); + if (!dummy_sc) + return -ENOMEM; - ret = seq_open(file, &sc_seq_ops); - if (ret) - goto out; + sd = __seq_open_private(file, &sc_seq_ops, sizeof(*sd)); + if (!sd) { + kfree(dummy_sc); + return -ENOMEM; + } - seq = file->private_data; - seq->private = sd; + sd->dbg_ctxt = ctxt; sd->dbg_sock = dummy_sc; - o2net_debug_add_sc(dummy_sc); - dummy_sc = NULL; + o2net_debug_add_sc(dummy_sc); -out: - kfree(dummy_sc); - return ret; + return 0; } static int sc_fop_release(struct inode *inode, struct file *file) @@ -453,16 +431,7 @@ static int sc_fop_release(struct inode *inode, struct file *file) static int stats_fop_open(struct inode *inode, struct file *file) { - struct o2net_sock_debug *sd; - - sd = kmalloc(sizeof(struct o2net_sock_debug), GFP_KERNEL); - if (sd == NULL) - return -ENOMEM; - - sd->dbg_ctxt = SHOW_SOCK_STATS; - sd->dbg_sock = NULL; - - return sc_common_open(file, sd); + return sc_common_open(file, SHOW_SOCK_STATS); } static const struct file_operations stats_seq_fops = { @@ -474,16 +443,7 @@ static const struct file_operations stats_seq_fops = { static int sc_fop_open(struct inode *inode, struct file *file) { - struct o2net_sock_debug *sd; - - sd = kmalloc(sizeof(struct o2net_sock_debug), GFP_KERNEL); - if (sd == NULL) - return -ENOMEM; - - sd->dbg_ctxt = SHOW_SOCK_CONTAINERS; - sd->dbg_sock = NULL; - - return sc_common_open(file, sd); + return sc_common_open(file, SHOW_SOCK_CONTAINERS); } static const struct file_operations sc_seq_fops = { diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index ea34952f9496..97de0fbd9f78 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -536,7 +536,7 @@ static void o2net_set_nn_state(struct o2net_node *nn, if (nn->nn_persistent_error || nn->nn_sc_valid) wake_up(&nn->nn_sc_wq); - if (!was_err && nn->nn_persistent_error) { + if (was_valid && !was_err && nn->nn_persistent_error) { o2quo_conn_err(o2net_num_from_nn(nn)); queue_delayed_work(o2net_wq, &nn->nn_still_up, msecs_to_jiffies(O2NET_QUORUM_DELAY_MS)); @@ -1601,7 +1601,15 @@ static void o2net_start_connect(struct work_struct *work) struct sockaddr_in myaddr = {0, }, remoteaddr = {0, }; int ret = 0, stop; unsigned int timeout; + unsigned int noio_flag; + /* + * sock_create allocates the sock with GFP_KERNEL. We must set + * per-process flag PF_MEMALLOC_NOIO so that all allocations done + * by this process are done as if GFP_NOIO was specified. So we + * are not reentering filesystem while doing memory reclaim. + */ + noio_flag = memalloc_noio_save(); /* if we're greater we initiate tx, otherwise we accept */ if (o2nm_this_node() <= o2net_num_from_nn(nn)) goto out; @@ -1710,6 +1718,7 @@ out: if (mynode) o2nm_node_put(mynode); + memalloc_noio_restore(noio_flag); return; } @@ -1721,7 +1730,8 @@ static void o2net_connect_expired(struct work_struct *work) spin_lock(&nn->nn_lock); if (!nn->nn_sc_valid) { printk(KERN_NOTICE "o2net: No connection established with " - "node %u after %u.%u seconds, giving up.\n", + "node %u after %u.%u seconds, check network and" + " cluster configuration.\n", o2net_num_from_nn(nn), o2net_idle_timeout() / 1000, o2net_idle_timeout() % 1000); @@ -1835,6 +1845,15 @@ static int o2net_accept_one(struct socket *sock, int *more) struct o2nm_node *local_node = NULL; struct o2net_sock_container *sc = NULL; struct o2net_node *nn; + unsigned int noio_flag; + + /* + * sock_create_lite allocates the sock with GFP_KERNEL. We must set + * per-process flag PF_MEMALLOC_NOIO so that all allocations done + * by this process are done as if GFP_NOIO was specified. So we + * are not reentering filesystem while doing memory reclaim. + */ + noio_flag = memalloc_noio_save(); BUG_ON(sock == NULL); *more = 0; @@ -1951,6 +1970,8 @@ out: o2nm_node_put(local_node); if (sc) sc_put(sc); + + memalloc_noio_restore(noio_flag); return ret; } @@ -2146,17 +2167,13 @@ int o2net_init(void) o2quo_init(); if (o2net_debugfs_init()) - return -ENOMEM; + goto out; o2net_hand = kzalloc(sizeof(struct o2net_handshake), GFP_KERNEL); o2net_keep_req = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL); o2net_keep_resp = kzalloc(sizeof(struct o2net_msg), GFP_KERNEL); - if (!o2net_hand || !o2net_keep_req || !o2net_keep_resp) { - kfree(o2net_hand); - kfree(o2net_keep_req); - kfree(o2net_keep_resp); - return -ENOMEM; - } + if (!o2net_hand || !o2net_keep_req || !o2net_keep_resp) + goto out; o2net_hand->protocol_version = cpu_to_be64(O2NET_PROTOCOL_VERSION); o2net_hand->connector_id = cpu_to_be64(1); @@ -2181,6 +2198,14 @@ int o2net_init(void) } return 0; + +out: + kfree(o2net_hand); + kfree(o2net_keep_req); + kfree(o2net_keep_resp); + + o2quo_exit(); + return -ENOMEM; } void o2net_exit(void) diff --git a/fs/ocfs2/dlm/dlmdebug.c b/fs/ocfs2/dlm/dlmdebug.c index 18f13c2e4a10..149eb556b8c6 100644 --- a/fs/ocfs2/dlm/dlmdebug.c +++ b/fs/ocfs2/dlm/dlmdebug.c @@ -647,41 +647,30 @@ static const struct seq_operations debug_lockres_ops = { static int debug_lockres_open(struct inode *inode, struct file *file) { struct dlm_ctxt *dlm = inode->i_private; - int ret = -ENOMEM; - struct seq_file *seq; - struct debug_lockres *dl = NULL; + struct debug_lockres *dl; + void *buf; - dl = kzalloc(sizeof(struct debug_lockres), GFP_KERNEL); - if (!dl) { - mlog_errno(ret); + buf = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!buf) goto bail; - } - dl->dl_len = PAGE_SIZE; - dl->dl_buf = kmalloc(dl->dl_len, GFP_KERNEL); - if (!dl->dl_buf) { - mlog_errno(ret); - goto bail; - } + dl = __seq_open_private(file, &debug_lockres_ops, sizeof(*dl)); + if (!dl) + goto bailfree; - ret = seq_open(file, &debug_lockres_ops); - if (ret) { - mlog_errno(ret); - goto bail; - } - - seq = file->private_data; - seq->private = dl; + dl->dl_len = PAGE_SIZE; + dl->dl_buf = buf; dlm_grab(dlm); dl->dl_ctxt = dlm; return 0; + +bailfree: + kfree(buf); bail: - if (dl) - kfree(dl->dl_buf); - kfree(dl); - return ret; + mlog_errno(-ENOMEM); + return -ENOMEM; } static int debug_lockres_release(struct inode *inode, struct file *file) diff --git a/fs/ocfs2/dlm/dlmdomain.c b/fs/ocfs2/dlm/dlmdomain.c index 3fcf205ee900..02d315fef432 100644 --- a/fs/ocfs2/dlm/dlmdomain.c +++ b/fs/ocfs2/dlm/dlmdomain.c @@ -839,7 +839,7 @@ static int dlm_query_join_handler(struct o2net_msg *msg, u32 len, void *data, * to back off and try again. This gives heartbeat a chance * to catch up. */ - if (!o2hb_check_node_heartbeating(query->node_idx)) { + if (!o2hb_check_node_heartbeating_no_sem(query->node_idx)) { mlog(0, "node %u is not in our live map yet\n", query->node_idx); @@ -1975,24 +1975,22 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, dlm = kzalloc(sizeof(*dlm), GFP_KERNEL); if (!dlm) { - mlog_errno(-ENOMEM); + ret = -ENOMEM; + mlog_errno(ret); goto leave; } dlm->name = kstrdup(domain, GFP_KERNEL); if (dlm->name == NULL) { - mlog_errno(-ENOMEM); - kfree(dlm); - dlm = NULL; + ret = -ENOMEM; + mlog_errno(ret); goto leave; } dlm->lockres_hash = (struct hlist_head **)dlm_alloc_pagevec(DLM_HASH_PAGES); if (!dlm->lockres_hash) { - mlog_errno(-ENOMEM); - kfree(dlm->name); - kfree(dlm); - dlm = NULL; + ret = -ENOMEM; + mlog_errno(ret); goto leave; } @@ -2002,11 +2000,8 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, dlm->master_hash = (struct hlist_head **) dlm_alloc_pagevec(DLM_HASH_PAGES); if (!dlm->master_hash) { - mlog_errno(-ENOMEM); - dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES); - kfree(dlm->name); - kfree(dlm); - dlm = NULL; + ret = -ENOMEM; + mlog_errno(ret); goto leave; } @@ -2017,14 +2012,8 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, dlm->node_num = o2nm_this_node(); ret = dlm_create_debugfs_subroot(dlm); - if (ret < 0) { - dlm_free_pagevec((void **)dlm->master_hash, DLM_HASH_PAGES); - dlm_free_pagevec((void **)dlm->lockres_hash, DLM_HASH_PAGES); - kfree(dlm->name); - kfree(dlm); - dlm = NULL; + if (ret < 0) goto leave; - } spin_lock_init(&dlm->spinlock); spin_lock_init(&dlm->master_lock); @@ -2085,6 +2074,19 @@ static struct dlm_ctxt *dlm_alloc_ctxt(const char *domain, atomic_read(&dlm->dlm_refs.refcount)); leave: + if (ret < 0 && dlm) { + if (dlm->master_hash) + dlm_free_pagevec((void **)dlm->master_hash, + DLM_HASH_PAGES); + + if (dlm->lockres_hash) + dlm_free_pagevec((void **)dlm->lockres_hash, + DLM_HASH_PAGES); + + kfree(dlm->name); + kfree(dlm); + dlm = NULL; + } return dlm; } diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 12ba682fc53c..215e41abf101 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -625,9 +625,6 @@ struct dlm_lock_resource *dlm_new_lockres(struct dlm_ctxt *dlm, return res; error: - if (res && res->lockname.name) - kmem_cache_free(dlm_lockname_cache, (void *)res->lockname.name); - if (res) kmem_cache_free(dlm_lockres_cache, res); return NULL; diff --git a/fs/ocfs2/dlm/dlmrecovery.c b/fs/ocfs2/dlm/dlmrecovery.c index 45067faf5695..3365839d2971 100644 --- a/fs/ocfs2/dlm/dlmrecovery.c +++ b/fs/ocfs2/dlm/dlmrecovery.c @@ -1710,9 +1710,12 @@ int dlm_master_requery_handler(struct o2net_msg *msg, u32 len, void *data, BUG(); } else __dlm_lockres_grab_inflight_worker(dlm, res); - } else /* put.. incase we are not the master */ + spin_unlock(&res->spinlock); + } else { + /* put.. incase we are not the master */ + spin_unlock(&res->spinlock); dlm_lockres_put(res); - spin_unlock(&res->spinlock); + } } spin_unlock(&dlm->spinlock); diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 52cfe99ae056..21262f2b1654 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -2892,37 +2892,24 @@ static int ocfs2_dlm_debug_release(struct inode *inode, struct file *file) static int ocfs2_dlm_debug_open(struct inode *inode, struct file *file) { - int ret; struct ocfs2_dlm_seq_priv *priv; - struct seq_file *seq; struct ocfs2_super *osb; - priv = kzalloc(sizeof(struct ocfs2_dlm_seq_priv), GFP_KERNEL); + priv = __seq_open_private(file, &ocfs2_dlm_seq_ops, sizeof(*priv)); if (!priv) { - ret = -ENOMEM; - mlog_errno(ret); - goto out; + mlog_errno(-ENOMEM); + return -ENOMEM; } + osb = inode->i_private; ocfs2_get_dlm_debug(osb->osb_dlm_debug); priv->p_dlm_debug = osb->osb_dlm_debug; INIT_LIST_HEAD(&priv->p_iter_res.l_debug_list); - ret = seq_open(file, &ocfs2_dlm_seq_ops); - if (ret) { - kfree(priv); - mlog_errno(ret); - goto out; - } - - seq = file->private_data; - seq->private = priv; - ocfs2_add_lockres_tracking(&priv->p_iter_res, priv->p_dlm_debug); -out: - return ret; + return 0; } static const struct file_operations ocfs2_dlm_debug_fops = { diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 2930e231f3f9..324dc93ac896 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -760,7 +760,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, struct address_space *mapping = inode->i_mapping; struct page *page; unsigned long index = abs_from >> PAGE_CACHE_SHIFT; - handle_t *handle = NULL; + handle_t *handle; int ret = 0; unsigned zero_from, zero_to, block_start, block_end; struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; @@ -769,11 +769,17 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, BUG_ON(abs_to > (((u64)index + 1) << PAGE_CACHE_SHIFT)); BUG_ON(abs_from & (inode->i_blkbits - 1)); + handle = ocfs2_zero_start_ordered_transaction(inode, di_bh); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + page = find_or_create_page(mapping, index, GFP_NOFS); if (!page) { ret = -ENOMEM; mlog_errno(ret); - goto out; + goto out_commit_trans; } /* Get the offsets within the page that we want to zero */ @@ -805,15 +811,6 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, goto out_unlock; } - if (!handle) { - handle = ocfs2_zero_start_ordered_transaction(inode, - di_bh); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - handle = NULL; - break; - } - } /* must not update i_size! */ ret = block_commit_write(page, block_start + 1, @@ -824,27 +821,29 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, ret = 0; } + /* + * fs-writeback will release the dirty pages without page lock + * whose offset are over inode size, the release happens at + * block_write_full_page(). + */ + i_size_write(inode, abs_to); + inode->i_blocks = ocfs2_inode_sector_count(inode); + di->i_size = cpu_to_le64((u64)i_size_read(inode)); + inode->i_mtime = inode->i_ctime = CURRENT_TIME; + di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); + di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); + di->i_mtime_nsec = di->i_ctime_nsec; if (handle) { - /* - * fs-writeback will release the dirty pages without page lock - * whose offset are over inode size, the release happens at - * block_write_full_page(). - */ - i_size_write(inode, abs_to); - inode->i_blocks = ocfs2_inode_sector_count(inode); - di->i_size = cpu_to_le64((u64)i_size_read(inode)); - inode->i_mtime = inode->i_ctime = CURRENT_TIME; - di->i_mtime = di->i_ctime = cpu_to_le64(inode->i_mtime.tv_sec); - di->i_ctime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec); - di->i_mtime_nsec = di->i_ctime_nsec; ocfs2_journal_dirty(handle, di_bh); ocfs2_update_inode_fsync_trans(handle, inode, 1); - ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); } out_unlock: unlock_page(page); page_cache_release(page); +out_commit_trans: + if (handle) + ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); out: return ret; } @@ -1253,7 +1252,7 @@ bail: brelse(bh); /* Release quota pointers in case we acquired them */ - for (qtype = 0; qtype < MAXQUOTAS; qtype++) + for (qtype = 0; qtype < OCFS2_MAXQUOTAS; qtype++) dqput(transfer_to[qtype]); if (!status && attr->ia_valid & ATTR_MODE) { diff --git a/fs/ocfs2/inode.h b/fs/ocfs2/inode.h index a6c991c0fc98..a9b76de46047 100644 --- a/fs/ocfs2/inode.h +++ b/fs/ocfs2/inode.h @@ -162,7 +162,7 @@ static inline blkcnt_t ocfs2_inode_sector_count(struct inode *inode) { int c_to_s_bits = OCFS2_SB(inode->i_sb)->s_clustersize_bits - 9; - return (blkcnt_t)(OCFS2_I(inode)->ip_clusters << c_to_s_bits); + return (blkcnt_t)OCFS2_I(inode)->ip_clusters << c_to_s_bits; } /* Validate that a bh contains a valid inode */ diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index 6219aaadeb08..74caffeeee1d 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -404,7 +404,7 @@ static int ocfs2_find_victim_alloc_group(struct inode *inode, * 'vict_blkno' was out of the valid range. */ if ((vict_blkno < le64_to_cpu(rec->c_blkno)) || - (vict_blkno >= (le32_to_cpu(ac_dinode->id1.bitmap1.i_total) << + (vict_blkno >= ((u64)le32_to_cpu(ac_dinode->id1.bitmap1.i_total) << bits_per_unit))) { ret = -EINVAL; goto out; diff --git a/fs/ocfs2/quota.h b/fs/ocfs2/quota.h index f266d67df3c6..1eae330193a6 100644 --- a/fs/ocfs2/quota.h +++ b/fs/ocfs2/quota.h @@ -17,6 +17,9 @@ #include "ocfs2.h" +/* Number of quota types we support */ +#define OCFS2_MAXQUOTAS 2 + /* * In-memory structures */ @@ -39,7 +42,7 @@ struct ocfs2_recovery_chunk { }; struct ocfs2_quota_recovery { - struct list_head r_list[MAXQUOTAS]; /* List of chunks to recover */ + struct list_head r_list[OCFS2_MAXQUOTAS]; /* List of chunks to recover */ }; /* In-memory structure with quota header information */ diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index b990a62cff50..c93d67220887 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -336,8 +336,8 @@ void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex) int ocfs2_global_read_info(struct super_block *sb, int type) { struct inode *gqinode = NULL; - unsigned int ino[MAXQUOTAS] = { USER_QUOTA_SYSTEM_INODE, - GROUP_QUOTA_SYSTEM_INODE }; + unsigned int ino[OCFS2_MAXQUOTAS] = { USER_QUOTA_SYSTEM_INODE, + GROUP_QUOTA_SYSTEM_INODE }; struct ocfs2_global_disk_dqinfo dinfo; struct mem_dqinfo *info = sb_dqinfo(sb, type); struct ocfs2_mem_dqinfo *oinfo = info->dqi_priv; diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index 2001862bf2b1..10b653930ee2 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -166,12 +166,12 @@ static int ocfs2_read_quota_block(struct inode *inode, u64 v_block, /* Check whether we understand format of quota files */ static int ocfs2_local_check_quota_file(struct super_block *sb, int type) { - unsigned int lmagics[MAXQUOTAS] = OCFS2_LOCAL_QMAGICS; - unsigned int lversions[MAXQUOTAS] = OCFS2_LOCAL_QVERSIONS; - unsigned int gmagics[MAXQUOTAS] = OCFS2_GLOBAL_QMAGICS; - unsigned int gversions[MAXQUOTAS] = OCFS2_GLOBAL_QVERSIONS; - unsigned int ino[MAXQUOTAS] = { USER_QUOTA_SYSTEM_INODE, - GROUP_QUOTA_SYSTEM_INODE }; + unsigned int lmagics[OCFS2_MAXQUOTAS] = OCFS2_LOCAL_QMAGICS; + unsigned int lversions[OCFS2_MAXQUOTAS] = OCFS2_LOCAL_QVERSIONS; + unsigned int gmagics[OCFS2_MAXQUOTAS] = OCFS2_GLOBAL_QMAGICS; + unsigned int gversions[OCFS2_MAXQUOTAS] = OCFS2_GLOBAL_QVERSIONS; + unsigned int ino[OCFS2_MAXQUOTAS] = { USER_QUOTA_SYSTEM_INODE, + GROUP_QUOTA_SYSTEM_INODE }; struct buffer_head *bh = NULL; struct inode *linode = sb_dqopt(sb)->files[type]; struct inode *ginode = NULL; @@ -336,7 +336,7 @@ void ocfs2_free_quota_recovery(struct ocfs2_quota_recovery *rec) { int type; - for (type = 0; type < MAXQUOTAS; type++) + for (type = 0; type < OCFS2_MAXQUOTAS; type++) free_recovery_list(&(rec->r_list[type])); kfree(rec); } @@ -382,7 +382,7 @@ static struct ocfs2_quota_recovery *ocfs2_alloc_quota_recovery(void) rec = kmalloc(sizeof(struct ocfs2_quota_recovery), GFP_NOFS); if (!rec) return NULL; - for (type = 0; type < MAXQUOTAS; type++) + for (type = 0; type < OCFS2_MAXQUOTAS; type++) INIT_LIST_HEAD(&(rec->r_list[type])); return rec; } @@ -392,10 +392,11 @@ struct ocfs2_quota_recovery *ocfs2_begin_quota_recovery( struct ocfs2_super *osb, int slot_num) { - unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, - OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; - unsigned int ino[MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE, - LOCAL_GROUP_QUOTA_SYSTEM_INODE }; + unsigned int feature[OCFS2_MAXQUOTAS] = { + OCFS2_FEATURE_RO_COMPAT_USRQUOTA, + OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; + unsigned int ino[OCFS2_MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE, + LOCAL_GROUP_QUOTA_SYSTEM_INODE }; struct super_block *sb = osb->sb; struct ocfs2_local_disk_dqinfo *ldinfo; struct inode *lqinode; @@ -412,7 +413,7 @@ struct ocfs2_quota_recovery *ocfs2_begin_quota_recovery( return ERR_PTR(-ENOMEM); /* First init... */ - for (type = 0; type < MAXQUOTAS; type++) { + for (type = 0; type < OCFS2_MAXQUOTAS; type++) { if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type])) continue; /* At this point, journal of the slot is already replayed so @@ -589,8 +590,8 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb, struct ocfs2_quota_recovery *rec, int slot_num) { - unsigned int ino[MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE, - LOCAL_GROUP_QUOTA_SYSTEM_INODE }; + unsigned int ino[OCFS2_MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE, + LOCAL_GROUP_QUOTA_SYSTEM_INODE }; struct super_block *sb = osb->sb; struct ocfs2_local_disk_dqinfo *ldinfo; struct buffer_head *bh; @@ -604,7 +605,7 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb, "slot %u\n", osb->dev_str, slot_num); mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); - for (type = 0; type < MAXQUOTAS; type++) { + for (type = 0; type < OCFS2_MAXQUOTAS; type++) { if (list_empty(&(rec->r_list[type]))) continue; trace_ocfs2_finish_quota_recovery(slot_num); diff --git a/fs/ocfs2/stack_user.c b/fs/ocfs2/stack_user.c index 13a8537d8e8b..720aa389e0ea 100644 --- a/fs/ocfs2/stack_user.c +++ b/fs/ocfs2/stack_user.c @@ -591,7 +591,7 @@ static int ocfs2_control_release(struct inode *inode, struct file *file) */ ocfs2_control_this_node = -1; running_proto.pv_major = 0; - running_proto.pv_major = 0; + running_proto.pv_minor = 0; } out: diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 4142546aedae..93c85bc745e1 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -899,11 +899,12 @@ static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend) { int type; struct super_block *sb = osb->sb; - unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, - OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; + unsigned int feature[OCFS2_MAXQUOTAS] = { + OCFS2_FEATURE_RO_COMPAT_USRQUOTA, + OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; int status = 0; - for (type = 0; type < MAXQUOTAS; type++) { + for (type = 0; type < OCFS2_MAXQUOTAS; type++) { if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type])) continue; if (unsuspend) @@ -927,17 +928,19 @@ static int ocfs2_susp_quotas(struct ocfs2_super *osb, int unsuspend) static int ocfs2_enable_quotas(struct ocfs2_super *osb) { - struct inode *inode[MAXQUOTAS] = { NULL, NULL }; + struct inode *inode[OCFS2_MAXQUOTAS] = { NULL, NULL }; struct super_block *sb = osb->sb; - unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, - OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; - unsigned int ino[MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE, + unsigned int feature[OCFS2_MAXQUOTAS] = { + OCFS2_FEATURE_RO_COMPAT_USRQUOTA, + OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; + unsigned int ino[OCFS2_MAXQUOTAS] = { + LOCAL_USER_QUOTA_SYSTEM_INODE, LOCAL_GROUP_QUOTA_SYSTEM_INODE }; int status; int type; sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE | DQUOT_NEGATIVE_USAGE; - for (type = 0; type < MAXQUOTAS; type++) { + for (type = 0; type < OCFS2_MAXQUOTAS; type++) { if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type])) continue; inode[type] = ocfs2_get_system_file_inode(osb, ino[type], @@ -952,12 +955,12 @@ static int ocfs2_enable_quotas(struct ocfs2_super *osb) goto out_quota_off; } - for (type = 0; type < MAXQUOTAS; type++) + for (type = 0; type < OCFS2_MAXQUOTAS; type++) iput(inode[type]); return 0; out_quota_off: ocfs2_disable_quotas(osb); - for (type = 0; type < MAXQUOTAS; type++) + for (type = 0; type < OCFS2_MAXQUOTAS; type++) iput(inode[type]); mlog_errno(status); return status; @@ -972,7 +975,7 @@ static void ocfs2_disable_quotas(struct ocfs2_super *osb) /* We mostly ignore errors in this function because there's not much * we can do when we see them */ - for (type = 0; type < MAXQUOTAS; type++) { + for (type = 0; type < OCFS2_MAXQUOTAS; type++) { if (!sb_has_quota_loaded(sb, type)) continue; /* Cancel periodic syncing before we grab dqonoff_mutex */ @@ -993,8 +996,9 @@ static void ocfs2_disable_quotas(struct ocfs2_super *osb) /* Handle quota on quotactl */ static int ocfs2_quota_on(struct super_block *sb, int type, int format_id) { - unsigned int feature[MAXQUOTAS] = { OCFS2_FEATURE_RO_COMPAT_USRQUOTA, - OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; + unsigned int feature[OCFS2_MAXQUOTAS] = { + OCFS2_FEATURE_RO_COMPAT_USRQUOTA, + OCFS2_FEATURE_RO_COMPAT_GRPQUOTA}; if (!OCFS2_HAS_RO_COMPAT_FEATURE(sb, feature[type])) return -EINVAL; diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c index ba8819702c56..138321b0c6c2 100644 --- a/fs/omfs/inode.c +++ b/fs/omfs/inode.c @@ -306,9 +306,7 @@ static const struct super_operations omfs_sops = { */ static int omfs_get_imap(struct super_block *sb) { - int bitmap_size; - int array_size; - int count; + unsigned int bitmap_size, count, array_size; struct omfs_sb_info *sbi = OMFS_SB(sb); struct buffer_head *bh; unsigned long **ptr; @@ -473,6 +471,12 @@ static int omfs_fill_super(struct super_block *sb, void *data, int silent) sbi->s_sys_blocksize = be32_to_cpu(omfs_sb->s_sys_blocksize); mutex_init(&sbi->s_bitmap_lock); + if (sbi->s_num_blocks > OMFS_MAX_BLOCKS) { + printk(KERN_ERR "omfs: sysblock number (%llx) is out of range\n", + (unsigned long long)sbi->s_num_blocks); + goto out_brelse_bh; + } + if (sbi->s_sys_blocksize > PAGE_SIZE) { printk(KERN_ERR "omfs: sysblock size (%d) is out of range\n", sbi->s_sys_blocksize); diff --git a/fs/omfs/omfs_fs.h b/fs/omfs/omfs_fs.h index ee5e4327de92..83a98330ed66 100644 --- a/fs/omfs/omfs_fs.h +++ b/fs/omfs/omfs_fs.h @@ -18,6 +18,7 @@ #define OMFS_XOR_COUNT 19 #define OMFS_MAX_BLOCK_SIZE 8192 #define OMFS_MAX_CLUSTER_SIZE 8 +#define OMFS_MAX_BLOCKS (1ul << 31) struct omfs_super_block { char s_fill1[256]; diff --git a/fs/open.c b/fs/open.c index d6fd3acde134..de92c13b58be 100644 --- a/fs/open.c +++ b/fs/open.c @@ -823,8 +823,7 @@ struct file *dentry_open(const struct path *path, int flags, f = get_empty_filp(); if (!IS_ERR(f)) { f->f_flags = flags; - f->f_path = *path; - error = do_dentry_open(f, NULL, cred); + error = vfs_open(path, f, cred); if (!error) { /* from now on we need fput() to dispose of f */ error = open_check_o_direct(f); @@ -841,6 +840,26 @@ struct file *dentry_open(const struct path *path, int flags, } EXPORT_SYMBOL(dentry_open); +/** + * vfs_open - open the file at the given path + * @path: path to open + * @filp: newly allocated file with f_flag initialized + * @cred: credentials to use + */ +int vfs_open(const struct path *path, struct file *filp, + const struct cred *cred) +{ + struct inode *inode = path->dentry->d_inode; + + if (inode->i_op->dentry_open) + return inode->i_op->dentry_open(path->dentry, filp, cred); + else { + filp->f_path = *path; + return do_dentry_open(filp, NULL, cred); + } +} +EXPORT_SYMBOL(vfs_open); + static inline int build_open_flags(int flags, umode_t mode, struct open_flags *op) { int lookup_flags = 0; diff --git a/fs/overlayfs/Kconfig b/fs/overlayfs/Kconfig new file mode 100644 index 000000000000..e60125976873 --- /dev/null +++ b/fs/overlayfs/Kconfig @@ -0,0 +1,10 @@ +config OVERLAYFS_FS + tristate "Overlay filesystem support" + help + An overlay filesystem combines two filesystems - an 'upper' filesystem + and a 'lower' filesystem. When a name exists in both filesystems, the + object in the 'upper' filesystem is visible while the object in the + 'lower' filesystem is either hidden or, in the case of directories, + merged with the 'upper' object. + + For more information see Documentation/filesystems/overlayfs.txt diff --git a/fs/overlayfs/Makefile b/fs/overlayfs/Makefile new file mode 100644 index 000000000000..8f91889480d0 --- /dev/null +++ b/fs/overlayfs/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the overlay filesystem. +# + +obj-$(CONFIG_OVERLAYFS_FS) += overlayfs.o + +overlayfs-objs := super.o inode.o dir.o readdir.o copy_up.o diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c new file mode 100644 index 000000000000..ea10a8719107 --- /dev/null +++ b/fs/overlayfs/copy_up.c @@ -0,0 +1,414 @@ +/* + * + * Copyright (C) 2011 Novell Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + */ + +#include <linux/fs.h> +#include <linux/slab.h> +#include <linux/file.h> +#include <linux/splice.h> +#include <linux/xattr.h> +#include <linux/security.h> +#include <linux/uaccess.h> +#include <linux/sched.h> +#include <linux/namei.h> +#include "overlayfs.h" + +#define OVL_COPY_UP_CHUNK_SIZE (1 << 20) + +int ovl_copy_xattr(struct dentry *old, struct dentry *new) +{ + ssize_t list_size, size; + char *buf, *name, *value; + int error; + + if (!old->d_inode->i_op->getxattr || + !new->d_inode->i_op->getxattr) + return 0; + + list_size = vfs_listxattr(old, NULL, 0); + if (list_size <= 0) { + if (list_size == -EOPNOTSUPP) + return 0; + return list_size; + } + + buf = kzalloc(list_size, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + error = -ENOMEM; + value = kmalloc(XATTR_SIZE_MAX, GFP_KERNEL); + if (!value) + goto out; + + list_size = vfs_listxattr(old, buf, list_size); + if (list_size <= 0) { + error = list_size; + goto out_free_value; + } + + for (name = buf; name < (buf + list_size); name += strlen(name) + 1) { + size = vfs_getxattr(old, name, value, XATTR_SIZE_MAX); + if (size <= 0) { + error = size; + goto out_free_value; + } + error = vfs_setxattr(new, name, value, size, 0); + if (error) + goto out_free_value; + } + +out_free_value: + kfree(value); +out: + kfree(buf); + return error; +} + +static int ovl_copy_up_data(struct path *old, struct path *new, loff_t len) +{ + struct file *old_file; + struct file *new_file; + loff_t old_pos = 0; + loff_t new_pos = 0; + int error = 0; + + if (len == 0) + return 0; + + old_file = ovl_path_open(old, O_RDONLY); + if (IS_ERR(old_file)) + return PTR_ERR(old_file); + + new_file = ovl_path_open(new, O_WRONLY); + if (IS_ERR(new_file)) { + error = PTR_ERR(new_file); + goto out_fput; + } + + /* FIXME: copy up sparse files efficiently */ + while (len) { + size_t this_len = OVL_COPY_UP_CHUNK_SIZE; + long bytes; + + if (len < this_len) + this_len = len; + + if (signal_pending_state(TASK_KILLABLE, current)) { + error = -EINTR; + break; + } + + bytes = do_splice_direct(old_file, &old_pos, + new_file, &new_pos, + this_len, SPLICE_F_MOVE); + if (bytes <= 0) { + error = bytes; + break; + } + WARN_ON(old_pos != new_pos); + + len -= bytes; + } + + fput(new_file); +out_fput: + fput(old_file); + return error; +} + +static char *ovl_read_symlink(struct dentry *realdentry) +{ + int res; + char *buf; + struct inode *inode = realdentry->d_inode; + mm_segment_t old_fs; + + res = -EINVAL; + if (!inode->i_op->readlink) + goto err; + + res = -ENOMEM; + buf = (char *) __get_free_page(GFP_KERNEL); + if (!buf) + goto err; + + old_fs = get_fs(); + set_fs(get_ds()); + /* The cast to a user pointer is valid due to the set_fs() */ + res = inode->i_op->readlink(realdentry, + (char __user *)buf, PAGE_SIZE - 1); + set_fs(old_fs); + if (res < 0) { + free_page((unsigned long) buf); + goto err; + } + buf[res] = '\0'; + + return buf; + +err: + return ERR_PTR(res); +} + +static int ovl_set_timestamps(struct dentry *upperdentry, struct kstat *stat) +{ + struct iattr attr = { + .ia_valid = + ATTR_ATIME | ATTR_MTIME | ATTR_ATIME_SET | ATTR_MTIME_SET, + .ia_atime = stat->atime, + .ia_mtime = stat->mtime, + }; + + return notify_change(upperdentry, &attr, NULL); +} + +int ovl_set_attr(struct dentry *upperdentry, struct kstat *stat) +{ + int err = 0; + + if (!S_ISLNK(stat->mode)) { + struct iattr attr = { + .ia_valid = ATTR_MODE, + .ia_mode = stat->mode, + }; + err = notify_change(upperdentry, &attr, NULL); + } + if (!err) { + struct iattr attr = { + .ia_valid = ATTR_UID | ATTR_GID, + .ia_uid = stat->uid, + .ia_gid = stat->gid, + }; + err = notify_change(upperdentry, &attr, NULL); + } + if (!err) + ovl_set_timestamps(upperdentry, stat); + + return err; + +} + +static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir, + struct dentry *dentry, struct path *lowerpath, + struct kstat *stat, struct iattr *attr, + const char *link) +{ + struct inode *wdir = workdir->d_inode; + struct inode *udir = upperdir->d_inode; + struct dentry *newdentry = NULL; + struct dentry *upper = NULL; + umode_t mode = stat->mode; + int err; + + newdentry = ovl_lookup_temp(workdir, dentry); + err = PTR_ERR(newdentry); + if (IS_ERR(newdentry)) + goto out; + + upper = lookup_one_len(dentry->d_name.name, upperdir, + dentry->d_name.len); + err = PTR_ERR(upper); + if (IS_ERR(upper)) + goto out1; + + /* Can't properly set mode on creation because of the umask */ + stat->mode &= S_IFMT; + err = ovl_create_real(wdir, newdentry, stat, link, NULL, true); + stat->mode = mode; + if (err) + goto out2; + + if (S_ISREG(stat->mode)) { + struct path upperpath; + ovl_path_upper(dentry, &upperpath); + BUG_ON(upperpath.dentry != NULL); + upperpath.dentry = newdentry; + + err = ovl_copy_up_data(lowerpath, &upperpath, stat->size); + if (err) + goto out_cleanup; + } + + err = ovl_copy_xattr(lowerpath->dentry, newdentry); + if (err) + goto out_cleanup; + + mutex_lock(&newdentry->d_inode->i_mutex); + err = ovl_set_attr(newdentry, stat); + if (!err && attr) + err = notify_change(newdentry, attr, NULL); + mutex_unlock(&newdentry->d_inode->i_mutex); + if (err) + goto out_cleanup; + + err = ovl_do_rename(wdir, newdentry, udir, upper, 0); + if (err) + goto out_cleanup; + + ovl_dentry_update(dentry, newdentry); + newdentry = NULL; + + /* + * Non-directores become opaque when copied up. + */ + if (!S_ISDIR(stat->mode)) + ovl_dentry_set_opaque(dentry, true); +out2: + dput(upper); +out1: + dput(newdentry); +out: + return err; + +out_cleanup: + ovl_cleanup(wdir, newdentry); + goto out; +} + +/* + * Copy up a single dentry + * + * Directory renames only allowed on "pure upper" (already created on + * upper filesystem, never copied up). Directories which are on lower or + * are merged may not be renamed. For these -EXDEV is returned and + * userspace has to deal with it. This means, when copying up a + * directory we can rely on it and ancestors being stable. + * + * Non-directory renames start with copy up of source if necessary. The + * actual rename will only proceed once the copy up was successful. Copy + * up uses upper parent i_mutex for exclusion. Since rename can change + * d_parent it is possible that the copy up will lock the old parent. At + * that point the file will have already been copied up anyway. + */ +int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry, + struct path *lowerpath, struct kstat *stat, + struct iattr *attr) +{ + struct dentry *workdir = ovl_workdir(dentry); + int err; + struct kstat pstat; + struct path parentpath; + struct dentry *upperdir; + struct dentry *upperdentry; + const struct cred *old_cred; + struct cred *override_cred; + char *link = NULL; + + ovl_path_upper(parent, &parentpath); + upperdir = parentpath.dentry; + + err = vfs_getattr(&parentpath, &pstat); + if (err) + return err; + + if (S_ISLNK(stat->mode)) { + link = ovl_read_symlink(lowerpath->dentry); + if (IS_ERR(link)) + return PTR_ERR(link); + } + + err = -ENOMEM; + override_cred = prepare_creds(); + if (!override_cred) + goto out_free_link; + + override_cred->fsuid = stat->uid; + override_cred->fsgid = stat->gid; + /* + * CAP_SYS_ADMIN for copying up extended attributes + * CAP_DAC_OVERRIDE for create + * CAP_FOWNER for chmod, timestamp update + * CAP_FSETID for chmod + * CAP_CHOWN for chown + * CAP_MKNOD for mknod + */ + cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN); + cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE); + cap_raise(override_cred->cap_effective, CAP_FOWNER); + cap_raise(override_cred->cap_effective, CAP_FSETID); + cap_raise(override_cred->cap_effective, CAP_CHOWN); + cap_raise(override_cred->cap_effective, CAP_MKNOD); + old_cred = override_creds(override_cred); + + err = -EIO; + if (lock_rename(workdir, upperdir) != NULL) { + pr_err("overlayfs: failed to lock workdir+upperdir\n"); + goto out_unlock; + } + upperdentry = ovl_dentry_upper(dentry); + if (upperdentry) { + unlock_rename(workdir, upperdir); + err = 0; + /* Raced with another copy-up? Do the setattr here */ + if (attr) { + mutex_lock(&upperdentry->d_inode->i_mutex); + err = notify_change(upperdentry, attr, NULL); + mutex_unlock(&upperdentry->d_inode->i_mutex); + } + goto out_put_cred; + } + + err = ovl_copy_up_locked(workdir, upperdir, dentry, lowerpath, + stat, attr, link); + if (!err) { + /* Restore timestamps on parent (best effort) */ + ovl_set_timestamps(upperdir, &pstat); + } +out_unlock: + unlock_rename(workdir, upperdir); +out_put_cred: + revert_creds(old_cred); + put_cred(override_cred); + +out_free_link: + if (link) + free_page((unsigned long) link); + + return err; +} + +int ovl_copy_up(struct dentry *dentry) +{ + int err; + + err = 0; + while (!err) { + struct dentry *next; + struct dentry *parent; + struct path lowerpath; + struct kstat stat; + enum ovl_path_type type = ovl_path_type(dentry); + + if (type != OVL_PATH_LOWER) + break; + + next = dget(dentry); + /* find the topmost dentry not yet copied up */ + for (;;) { + parent = dget_parent(next); + + type = ovl_path_type(parent); + if (type != OVL_PATH_LOWER) + break; + + dput(next); + next = parent; + } + + ovl_path_lower(next, &lowerpath); + err = vfs_getattr(&lowerpath, &stat); + if (!err) + err = ovl_copy_up_one(parent, next, &lowerpath, &stat, NULL); + + dput(parent); + dput(next); + } + + return err; +} diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c new file mode 100644 index 000000000000..15cd91ad9940 --- /dev/null +++ b/fs/overlayfs/dir.c @@ -0,0 +1,921 @@ +/* + * + * Copyright (C) 2011 Novell Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + */ + +#include <linux/fs.h> +#include <linux/namei.h> +#include <linux/xattr.h> +#include <linux/security.h> +#include <linux/cred.h> +#include "overlayfs.h" + +void ovl_cleanup(struct inode *wdir, struct dentry *wdentry) +{ + int err; + + dget(wdentry); + if (S_ISDIR(wdentry->d_inode->i_mode)) + err = ovl_do_rmdir(wdir, wdentry); + else + err = ovl_do_unlink(wdir, wdentry); + dput(wdentry); + + if (err) { + pr_err("overlayfs: cleanup of '%pd2' failed (%i)\n", + wdentry, err); + } +} + +struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry) +{ + struct dentry *temp; + char name[20]; + + snprintf(name, sizeof(name), "#%lx", (unsigned long) dentry); + + temp = lookup_one_len(name, workdir, strlen(name)); + if (!IS_ERR(temp) && temp->d_inode) { + pr_err("overlayfs: workdir/%s already exists\n", name); + dput(temp); + temp = ERR_PTR(-EIO); + } + + return temp; +} + +/* caller holds i_mutex on workdir */ +static struct dentry *ovl_whiteout(struct dentry *workdir, + struct dentry *dentry) +{ + int err; + struct dentry *whiteout; + struct inode *wdir = workdir->d_inode; + + whiteout = ovl_lookup_temp(workdir, dentry); + if (IS_ERR(whiteout)) + return whiteout; + + err = ovl_do_whiteout(wdir, whiteout); + if (err) { + dput(whiteout); + whiteout = ERR_PTR(err); + } + + return whiteout; +} + +int ovl_create_real(struct inode *dir, struct dentry *newdentry, + struct kstat *stat, const char *link, + struct dentry *hardlink, bool debug) +{ + int err; + + if (newdentry->d_inode) + return -ESTALE; + + if (hardlink) { + err = ovl_do_link(hardlink, dir, newdentry, debug); + } else { + switch (stat->mode & S_IFMT) { + case S_IFREG: + err = ovl_do_create(dir, newdentry, stat->mode, debug); + break; + + case S_IFDIR: + err = ovl_do_mkdir(dir, newdentry, stat->mode, debug); + break; + + case S_IFCHR: + case S_IFBLK: + case S_IFIFO: + case S_IFSOCK: + err = ovl_do_mknod(dir, newdentry, + stat->mode, stat->rdev, debug); + break; + + case S_IFLNK: + err = ovl_do_symlink(dir, newdentry, link, debug); + break; + + default: + err = -EPERM; + } + } + if (!err && WARN_ON(!newdentry->d_inode)) { + /* + * Not quite sure if non-instantiated dentry is legal or not. + * VFS doesn't seem to care so check and warn here. + */ + err = -ENOENT; + } + return err; +} + +static int ovl_set_opaque(struct dentry *upperdentry) +{ + return ovl_do_setxattr(upperdentry, ovl_opaque_xattr, "y", 1, 0); +} + +static void ovl_remove_opaque(struct dentry *upperdentry) +{ + int err; + + err = ovl_do_removexattr(upperdentry, ovl_opaque_xattr); + if (err) { + pr_warn("overlayfs: failed to remove opaque from '%s' (%i)\n", + upperdentry->d_name.name, err); + } +} + +static int ovl_dir_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + int err; + enum ovl_path_type type; + struct path realpath; + + type = ovl_path_real(dentry, &realpath); + err = vfs_getattr(&realpath, stat); + if (err) + return err; + + stat->dev = dentry->d_sb->s_dev; + stat->ino = dentry->d_inode->i_ino; + + /* + * It's probably not worth it to count subdirs to get the + * correct link count. nlink=1 seems to pacify 'find' and + * other utilities. + */ + if (type == OVL_PATH_MERGE) + stat->nlink = 1; + + return 0; +} + +static int ovl_create_upper(struct dentry *dentry, struct inode *inode, + struct kstat *stat, const char *link, + struct dentry *hardlink) +{ + struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent); + struct inode *udir = upperdir->d_inode; + struct dentry *newdentry; + int err; + + mutex_lock_nested(&udir->i_mutex, I_MUTEX_PARENT); + newdentry = lookup_one_len(dentry->d_name.name, upperdir, + dentry->d_name.len); + err = PTR_ERR(newdentry); + if (IS_ERR(newdentry)) + goto out_unlock; + err = ovl_create_real(udir, newdentry, stat, link, hardlink, false); + if (err) + goto out_dput; + + ovl_dentry_version_inc(dentry->d_parent); + ovl_dentry_update(dentry, newdentry); + ovl_copyattr(newdentry->d_inode, inode); + d_instantiate(dentry, inode); + newdentry = NULL; +out_dput: + dput(newdentry); +out_unlock: + mutex_unlock(&udir->i_mutex); + return err; +} + +static int ovl_lock_rename_workdir(struct dentry *workdir, + struct dentry *upperdir) +{ + /* Workdir should not be the same as upperdir */ + if (workdir == upperdir) + goto err; + + /* Workdir should not be subdir of upperdir and vice versa */ + if (lock_rename(workdir, upperdir) != NULL) + goto err_unlock; + + return 0; + +err_unlock: + unlock_rename(workdir, upperdir); +err: + pr_err("overlayfs: failed to lock workdir+upperdir\n"); + return -EIO; +} + +static struct dentry *ovl_clear_empty(struct dentry *dentry, + struct list_head *list) +{ + struct dentry *workdir = ovl_workdir(dentry); + struct inode *wdir = workdir->d_inode; + struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent); + struct inode *udir = upperdir->d_inode; + struct path upperpath; + struct dentry *upper; + struct dentry *opaquedir; + struct kstat stat; + int err; + + err = ovl_lock_rename_workdir(workdir, upperdir); + if (err) + goto out; + + ovl_path_upper(dentry, &upperpath); + err = vfs_getattr(&upperpath, &stat); + if (err) + goto out_unlock; + + err = -ESTALE; + if (!S_ISDIR(stat.mode)) + goto out_unlock; + upper = upperpath.dentry; + if (upper->d_parent->d_inode != udir) + goto out_unlock; + + opaquedir = ovl_lookup_temp(workdir, dentry); + err = PTR_ERR(opaquedir); + if (IS_ERR(opaquedir)) + goto out_unlock; + + err = ovl_create_real(wdir, opaquedir, &stat, NULL, NULL, true); + if (err) + goto out_dput; + + err = ovl_copy_xattr(upper, opaquedir); + if (err) + goto out_cleanup; + + err = ovl_set_opaque(opaquedir); + if (err) + goto out_cleanup; + + mutex_lock(&opaquedir->d_inode->i_mutex); + err = ovl_set_attr(opaquedir, &stat); + mutex_unlock(&opaquedir->d_inode->i_mutex); + if (err) + goto out_cleanup; + + err = ovl_do_rename(wdir, opaquedir, udir, upper, RENAME_EXCHANGE); + if (err) + goto out_cleanup; + + ovl_cleanup_whiteouts(upper, list); + ovl_cleanup(wdir, upper); + unlock_rename(workdir, upperdir); + + /* dentry's upper doesn't match now, get rid of it */ + d_drop(dentry); + + return opaquedir; + +out_cleanup: + ovl_cleanup(wdir, opaquedir); +out_dput: + dput(opaquedir); +out_unlock: + unlock_rename(workdir, upperdir); +out: + return ERR_PTR(err); +} + +static struct dentry *ovl_check_empty_and_clear(struct dentry *dentry, + enum ovl_path_type type) +{ + int err; + struct dentry *ret = NULL; + LIST_HEAD(list); + + err = ovl_check_empty_dir(dentry, &list); + if (err) + ret = ERR_PTR(err); + else if (type == OVL_PATH_MERGE) + ret = ovl_clear_empty(dentry, &list); + + ovl_cache_free(&list); + + return ret; +} + +static int ovl_create_over_whiteout(struct dentry *dentry, struct inode *inode, + struct kstat *stat, const char *link, + struct dentry *hardlink) +{ + struct dentry *workdir = ovl_workdir(dentry); + struct inode *wdir = workdir->d_inode; + struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent); + struct inode *udir = upperdir->d_inode; + struct dentry *upper; + struct dentry *newdentry; + int err; + + err = ovl_lock_rename_workdir(workdir, upperdir); + if (err) + goto out; + + newdentry = ovl_lookup_temp(workdir, dentry); + err = PTR_ERR(newdentry); + if (IS_ERR(newdentry)) + goto out_unlock; + + upper = lookup_one_len(dentry->d_name.name, upperdir, + dentry->d_name.len); + err = PTR_ERR(upper); + if (IS_ERR(upper)) + goto out_dput; + + err = ovl_create_real(wdir, newdentry, stat, link, hardlink, true); + if (err) + goto out_dput2; + + if (S_ISDIR(stat->mode)) { + err = ovl_set_opaque(newdentry); + if (err) + goto out_cleanup; + + err = ovl_do_rename(wdir, newdentry, udir, upper, + RENAME_EXCHANGE); + if (err) + goto out_cleanup; + + ovl_cleanup(wdir, upper); + } else { + err = ovl_do_rename(wdir, newdentry, udir, upper, 0); + if (err) + goto out_cleanup; + } + ovl_dentry_version_inc(dentry->d_parent); + ovl_dentry_update(dentry, newdentry); + ovl_copyattr(newdentry->d_inode, inode); + d_instantiate(dentry, inode); + newdentry = NULL; +out_dput2: + dput(upper); +out_dput: + dput(newdentry); +out_unlock: + unlock_rename(workdir, upperdir); +out: + return err; + +out_cleanup: + ovl_cleanup(wdir, newdentry); + goto out_dput2; +} + +static int ovl_create_or_link(struct dentry *dentry, int mode, dev_t rdev, + const char *link, struct dentry *hardlink) +{ + int err; + struct inode *inode; + struct kstat stat = { + .mode = mode, + .rdev = rdev, + }; + + err = -ENOMEM; + inode = ovl_new_inode(dentry->d_sb, mode, dentry->d_fsdata); + if (!inode) + goto out; + + err = ovl_copy_up(dentry->d_parent); + if (err) + goto out_iput; + + if (!ovl_dentry_is_opaque(dentry)) { + err = ovl_create_upper(dentry, inode, &stat, link, hardlink); + } else { + const struct cred *old_cred; + struct cred *override_cred; + + err = -ENOMEM; + override_cred = prepare_creds(); + if (!override_cred) + goto out_iput; + + /* + * CAP_SYS_ADMIN for setting opaque xattr + * CAP_DAC_OVERRIDE for create in workdir, rename + * CAP_FOWNER for removing whiteout from sticky dir + */ + cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN); + cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE); + cap_raise(override_cred->cap_effective, CAP_FOWNER); + old_cred = override_creds(override_cred); + + err = ovl_create_over_whiteout(dentry, inode, &stat, link, + hardlink); + + revert_creds(old_cred); + put_cred(override_cred); + } + + if (!err) + inode = NULL; +out_iput: + iput(inode); +out: + return err; +} + +static int ovl_create_object(struct dentry *dentry, int mode, dev_t rdev, + const char *link) +{ + int err; + + err = ovl_want_write(dentry); + if (!err) { + err = ovl_create_or_link(dentry, mode, rdev, link, NULL); + ovl_drop_write(dentry); + } + + return err; +} + +static int ovl_create(struct inode *dir, struct dentry *dentry, umode_t mode, + bool excl) +{ + return ovl_create_object(dentry, (mode & 07777) | S_IFREG, 0, NULL); +} + +static int ovl_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) +{ + return ovl_create_object(dentry, (mode & 07777) | S_IFDIR, 0, NULL); +} + +static int ovl_mknod(struct inode *dir, struct dentry *dentry, umode_t mode, + dev_t rdev) +{ + /* Don't allow creation of "whiteout" on overlay */ + if (S_ISCHR(mode) && rdev == WHITEOUT_DEV) + return -EPERM; + + return ovl_create_object(dentry, mode, rdev, NULL); +} + +static int ovl_symlink(struct inode *dir, struct dentry *dentry, + const char *link) +{ + return ovl_create_object(dentry, S_IFLNK, 0, link); +} + +static int ovl_link(struct dentry *old, struct inode *newdir, + struct dentry *new) +{ + int err; + struct dentry *upper; + + err = ovl_want_write(old); + if (err) + goto out; + + err = ovl_copy_up(old); + if (err) + goto out_drop_write; + + upper = ovl_dentry_upper(old); + err = ovl_create_or_link(new, upper->d_inode->i_mode, 0, NULL, upper); + +out_drop_write: + ovl_drop_write(old); +out: + return err; +} + +static int ovl_remove_and_whiteout(struct dentry *dentry, + enum ovl_path_type type, bool is_dir) +{ + struct dentry *workdir = ovl_workdir(dentry); + struct inode *wdir = workdir->d_inode; + struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent); + struct inode *udir = upperdir->d_inode; + struct dentry *whiteout; + struct dentry *upper; + struct dentry *opaquedir = NULL; + int err; + + if (is_dir) { + opaquedir = ovl_check_empty_and_clear(dentry, type); + err = PTR_ERR(opaquedir); + if (IS_ERR(opaquedir)) + goto out; + } + + err = ovl_lock_rename_workdir(workdir, upperdir); + if (err) + goto out_dput; + + whiteout = ovl_whiteout(workdir, dentry); + err = PTR_ERR(whiteout); + if (IS_ERR(whiteout)) + goto out_unlock; + + if (type == OVL_PATH_LOWER) { + upper = lookup_one_len(dentry->d_name.name, upperdir, + dentry->d_name.len); + err = PTR_ERR(upper); + if (IS_ERR(upper)) + goto kill_whiteout; + + err = ovl_do_rename(wdir, whiteout, udir, upper, 0); + dput(upper); + if (err) + goto kill_whiteout; + } else { + int flags = 0; + + upper = ovl_dentry_upper(dentry); + if (opaquedir) + upper = opaquedir; + err = -ESTALE; + if (upper->d_parent != upperdir) + goto kill_whiteout; + + if (is_dir) + flags |= RENAME_EXCHANGE; + + err = ovl_do_rename(wdir, whiteout, udir, upper, flags); + if (err) + goto kill_whiteout; + + if (is_dir) + ovl_cleanup(wdir, upper); + } + ovl_dentry_version_inc(dentry->d_parent); +out_d_drop: + d_drop(dentry); + dput(whiteout); +out_unlock: + unlock_rename(workdir, upperdir); +out_dput: + dput(opaquedir); +out: + return err; + +kill_whiteout: + ovl_cleanup(wdir, whiteout); + goto out_d_drop; +} + +static int ovl_remove_upper(struct dentry *dentry, bool is_dir) +{ + struct dentry *upperdir = ovl_dentry_upper(dentry->d_parent); + struct inode *dir = upperdir->d_inode; + struct dentry *upper = ovl_dentry_upper(dentry); + int err; + + mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); + err = -ESTALE; + if (upper->d_parent == upperdir) { + /* Don't let d_delete() think it can reset d_inode */ + dget(upper); + if (is_dir) + err = vfs_rmdir(dir, upper); + else + err = vfs_unlink(dir, upper, NULL); + dput(upper); + ovl_dentry_version_inc(dentry->d_parent); + } + + /* + * Keeping this dentry hashed would mean having to release + * upperpath/lowerpath, which could only be done if we are the + * sole user of this dentry. Too tricky... Just unhash for + * now. + */ + d_drop(dentry); + mutex_unlock(&dir->i_mutex); + + return err; +} + +static inline int ovl_check_sticky(struct dentry *dentry) +{ + struct inode *dir = ovl_dentry_real(dentry->d_parent)->d_inode; + struct inode *inode = ovl_dentry_real(dentry)->d_inode; + + if (check_sticky(dir, inode)) + return -EPERM; + + return 0; +} + +static int ovl_do_remove(struct dentry *dentry, bool is_dir) +{ + enum ovl_path_type type; + int err; + + err = ovl_check_sticky(dentry); + if (err) + goto out; + + err = ovl_want_write(dentry); + if (err) + goto out; + + err = ovl_copy_up(dentry->d_parent); + if (err) + goto out_drop_write; + + type = ovl_path_type(dentry); + if (type == OVL_PATH_PURE_UPPER) { + err = ovl_remove_upper(dentry, is_dir); + } else { + const struct cred *old_cred; + struct cred *override_cred; + + err = -ENOMEM; + override_cred = prepare_creds(); + if (!override_cred) + goto out_drop_write; + + /* + * CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir + * CAP_DAC_OVERRIDE for create in workdir, rename + * CAP_FOWNER for removing whiteout from sticky dir + * CAP_FSETID for chmod of opaque dir + * CAP_CHOWN for chown of opaque dir + */ + cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN); + cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE); + cap_raise(override_cred->cap_effective, CAP_FOWNER); + cap_raise(override_cred->cap_effective, CAP_FSETID); + cap_raise(override_cred->cap_effective, CAP_CHOWN); + old_cred = override_creds(override_cred); + + err = ovl_remove_and_whiteout(dentry, type, is_dir); + + revert_creds(old_cred); + put_cred(override_cred); + } +out_drop_write: + ovl_drop_write(dentry); +out: + return err; +} + +static int ovl_unlink(struct inode *dir, struct dentry *dentry) +{ + return ovl_do_remove(dentry, false); +} + +static int ovl_rmdir(struct inode *dir, struct dentry *dentry) +{ + return ovl_do_remove(dentry, true); +} + +static int ovl_rename2(struct inode *olddir, struct dentry *old, + struct inode *newdir, struct dentry *new, + unsigned int flags) +{ + int err; + enum ovl_path_type old_type; + enum ovl_path_type new_type; + struct dentry *old_upperdir; + struct dentry *new_upperdir; + struct dentry *olddentry; + struct dentry *newdentry; + struct dentry *trap; + bool old_opaque; + bool new_opaque; + bool new_create = false; + bool cleanup_whiteout = false; + bool overwrite = !(flags & RENAME_EXCHANGE); + bool is_dir = S_ISDIR(old->d_inode->i_mode); + bool new_is_dir = false; + struct dentry *opaquedir = NULL; + const struct cred *old_cred = NULL; + struct cred *override_cred = NULL; + + err = -EINVAL; + if (flags & ~(RENAME_EXCHANGE | RENAME_NOREPLACE)) + goto out; + + flags &= ~RENAME_NOREPLACE; + + err = ovl_check_sticky(old); + if (err) + goto out; + + /* Don't copy up directory trees */ + old_type = ovl_path_type(old); + err = -EXDEV; + if ((old_type == OVL_PATH_LOWER || old_type == OVL_PATH_MERGE) && is_dir) + goto out; + + if (new->d_inode) { + err = ovl_check_sticky(new); + if (err) + goto out; + + if (S_ISDIR(new->d_inode->i_mode)) + new_is_dir = true; + + new_type = ovl_path_type(new); + err = -EXDEV; + if (!overwrite && (new_type == OVL_PATH_LOWER || new_type == OVL_PATH_MERGE) && new_is_dir) + goto out; + + err = 0; + if (new_type == OVL_PATH_LOWER && old_type == OVL_PATH_LOWER) { + if (ovl_dentry_lower(old)->d_inode == + ovl_dentry_lower(new)->d_inode) + goto out; + } + if (new_type != OVL_PATH_LOWER && old_type != OVL_PATH_LOWER) { + if (ovl_dentry_upper(old)->d_inode == + ovl_dentry_upper(new)->d_inode) + goto out; + } + } else { + if (ovl_dentry_is_opaque(new)) + new_type = OVL_PATH_UPPER; + else + new_type = OVL_PATH_PURE_UPPER; + } + + err = ovl_want_write(old); + if (err) + goto out; + + err = ovl_copy_up(old); + if (err) + goto out_drop_write; + + err = ovl_copy_up(new->d_parent); + if (err) + goto out_drop_write; + if (!overwrite) { + err = ovl_copy_up(new); + if (err) + goto out_drop_write; + } + + old_opaque = old_type != OVL_PATH_PURE_UPPER; + new_opaque = new_type != OVL_PATH_PURE_UPPER; + + if (old_opaque || new_opaque) { + err = -ENOMEM; + override_cred = prepare_creds(); + if (!override_cred) + goto out_drop_write; + + /* + * CAP_SYS_ADMIN for setting xattr on whiteout, opaque dir + * CAP_DAC_OVERRIDE for create in workdir + * CAP_FOWNER for removing whiteout from sticky dir + * CAP_FSETID for chmod of opaque dir + * CAP_CHOWN for chown of opaque dir + */ + cap_raise(override_cred->cap_effective, CAP_SYS_ADMIN); + cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE); + cap_raise(override_cred->cap_effective, CAP_FOWNER); + cap_raise(override_cred->cap_effective, CAP_FSETID); + cap_raise(override_cred->cap_effective, CAP_CHOWN); + old_cred = override_creds(override_cred); + } + + if (overwrite && (new_type == OVL_PATH_LOWER || new_type == OVL_PATH_MERGE) && new_is_dir) { + opaquedir = ovl_check_empty_and_clear(new, new_type); + err = PTR_ERR(opaquedir); + if (IS_ERR(opaquedir)) { + opaquedir = NULL; + goto out_revert_creds; + } + } + + if (overwrite) { + if (old_opaque) { + if (new->d_inode || !new_opaque) { + /* Whiteout source */ + flags |= RENAME_WHITEOUT; + } else { + /* Switch whiteouts */ + flags |= RENAME_EXCHANGE; + } + } else if (is_dir && !new->d_inode && new_opaque) { + flags |= RENAME_EXCHANGE; + cleanup_whiteout = true; + } + } + + old_upperdir = ovl_dentry_upper(old->d_parent); + new_upperdir = ovl_dentry_upper(new->d_parent); + + trap = lock_rename(new_upperdir, old_upperdir); + + olddentry = ovl_dentry_upper(old); + newdentry = ovl_dentry_upper(new); + if (newdentry) { + if (opaquedir) { + newdentry = opaquedir; + opaquedir = NULL; + } else { + dget(newdentry); + } + } else { + new_create = true; + newdentry = lookup_one_len(new->d_name.name, new_upperdir, + new->d_name.len); + err = PTR_ERR(newdentry); + if (IS_ERR(newdentry)) + goto out_unlock; + } + + err = -ESTALE; + if (olddentry->d_parent != old_upperdir) + goto out_dput; + if (newdentry->d_parent != new_upperdir) + goto out_dput; + if (olddentry == trap) + goto out_dput; + if (newdentry == trap) + goto out_dput; + + if (is_dir && !old_opaque && new_opaque) { + err = ovl_set_opaque(olddentry); + if (err) + goto out_dput; + } + if (!overwrite && new_is_dir && old_opaque && !new_opaque) { + err = ovl_set_opaque(newdentry); + if (err) + goto out_dput; + } + + if (old_opaque || new_opaque) { + err = ovl_do_rename(old_upperdir->d_inode, olddentry, + new_upperdir->d_inode, newdentry, + flags); + } else { + /* No debug for the plain case */ + BUG_ON(flags & ~RENAME_EXCHANGE); + err = vfs_rename(old_upperdir->d_inode, olddentry, + new_upperdir->d_inode, newdentry, + NULL, flags); + } + + if (err) { + if (is_dir && !old_opaque && new_opaque) + ovl_remove_opaque(olddentry); + if (!overwrite && new_is_dir && old_opaque && !new_opaque) + ovl_remove_opaque(newdentry); + goto out_dput; + } + + if (is_dir && old_opaque && !new_opaque) + ovl_remove_opaque(olddentry); + if (!overwrite && new_is_dir && !old_opaque && new_opaque) + ovl_remove_opaque(newdentry); + + if (old_opaque != new_opaque) { + ovl_dentry_set_opaque(old, new_opaque); + if (!overwrite) + ovl_dentry_set_opaque(new, old_opaque); + } + + if (cleanup_whiteout) + ovl_cleanup(old_upperdir->d_inode, newdentry); + + ovl_dentry_version_inc(old->d_parent); + ovl_dentry_version_inc(new->d_parent); + +out_dput: + dput(newdentry); +out_unlock: + unlock_rename(new_upperdir, old_upperdir); +out_revert_creds: + if (old_opaque || new_opaque) { + revert_creds(old_cred); + put_cred(override_cred); + } +out_drop_write: + ovl_drop_write(old); +out: + dput(opaquedir); + return err; +} + +const struct inode_operations ovl_dir_inode_operations = { + .lookup = ovl_lookup, + .mkdir = ovl_mkdir, + .symlink = ovl_symlink, + .unlink = ovl_unlink, + .rmdir = ovl_rmdir, + .rename2 = ovl_rename2, + .link = ovl_link, + .setattr = ovl_setattr, + .create = ovl_create, + .mknod = ovl_mknod, + .permission = ovl_permission, + .getattr = ovl_dir_getattr, + .setxattr = ovl_setxattr, + .getxattr = ovl_getxattr, + .listxattr = ovl_listxattr, + .removexattr = ovl_removexattr, +}; diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c new file mode 100644 index 000000000000..af2d18c9fcee --- /dev/null +++ b/fs/overlayfs/inode.c @@ -0,0 +1,425 @@ +/* + * + * Copyright (C) 2011 Novell Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + */ + +#include <linux/fs.h> +#include <linux/slab.h> +#include <linux/xattr.h> +#include "overlayfs.h" + +static int ovl_copy_up_last(struct dentry *dentry, struct iattr *attr, + bool no_data) +{ + int err; + struct dentry *parent; + struct kstat stat; + struct path lowerpath; + + parent = dget_parent(dentry); + err = ovl_copy_up(parent); + if (err) + goto out_dput_parent; + + ovl_path_lower(dentry, &lowerpath); + err = vfs_getattr(&lowerpath, &stat); + if (err) + goto out_dput_parent; + + if (no_data) + stat.size = 0; + + err = ovl_copy_up_one(parent, dentry, &lowerpath, &stat, attr); + +out_dput_parent: + dput(parent); + return err; +} + +int ovl_setattr(struct dentry *dentry, struct iattr *attr) +{ + int err; + struct dentry *upperdentry; + + err = ovl_want_write(dentry); + if (err) + goto out; + + upperdentry = ovl_dentry_upper(dentry); + if (upperdentry) { + mutex_lock(&upperdentry->d_inode->i_mutex); + err = notify_change(upperdentry, attr, NULL); + mutex_unlock(&upperdentry->d_inode->i_mutex); + } else { + err = ovl_copy_up_last(dentry, attr, false); + } + ovl_drop_write(dentry); +out: + return err; +} + +static int ovl_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat) +{ + struct path realpath; + + ovl_path_real(dentry, &realpath); + return vfs_getattr(&realpath, stat); +} + +int ovl_permission(struct inode *inode, int mask) +{ + struct ovl_entry *oe; + struct dentry *alias = NULL; + struct inode *realinode; + struct dentry *realdentry; + bool is_upper; + int err; + + if (S_ISDIR(inode->i_mode)) { + oe = inode->i_private; + } else if (mask & MAY_NOT_BLOCK) { + return -ECHILD; + } else { + /* + * For non-directories find an alias and get the info + * from there. + */ + alias = d_find_any_alias(inode); + if (WARN_ON(!alias)) + return -ENOENT; + + oe = alias->d_fsdata; + } + + realdentry = ovl_entry_real(oe, &is_upper); + + /* Careful in RCU walk mode */ + realinode = ACCESS_ONCE(realdentry->d_inode); + if (!realinode) { + WARN_ON(!(mask & MAY_NOT_BLOCK)); + err = -ENOENT; + goto out_dput; + } + + if (mask & MAY_WRITE) { + umode_t mode = realinode->i_mode; + + /* + * Writes will always be redirected to upper layer, so + * ignore lower layer being read-only. + * + * If the overlay itself is read-only then proceed + * with the permission check, don't return EROFS. + * This will only happen if this is the lower layer of + * another overlayfs. + * + * If upper fs becomes read-only after the overlay was + * constructed return EROFS to prevent modification of + * upper layer. + */ + err = -EROFS; + if (is_upper && !IS_RDONLY(inode) && IS_RDONLY(realinode) && + (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode))) + goto out_dput; + } + + err = __inode_permission(realinode, mask); +out_dput: + dput(alias); + return err; +} + + +struct ovl_link_data { + struct dentry *realdentry; + void *cookie; +}; + +static void *ovl_follow_link(struct dentry *dentry, struct nameidata *nd) +{ + void *ret; + struct dentry *realdentry; + struct inode *realinode; + + realdentry = ovl_dentry_real(dentry); + realinode = realdentry->d_inode; + + if (WARN_ON(!realinode->i_op->follow_link)) + return ERR_PTR(-EPERM); + + ret = realinode->i_op->follow_link(realdentry, nd); + if (IS_ERR(ret)) + return ret; + + if (realinode->i_op->put_link) { + struct ovl_link_data *data; + + data = kmalloc(sizeof(struct ovl_link_data), GFP_KERNEL); + if (!data) { + realinode->i_op->put_link(realdentry, nd, ret); + return ERR_PTR(-ENOMEM); + } + data->realdentry = realdentry; + data->cookie = ret; + + return data; + } else { + return NULL; + } +} + +static void ovl_put_link(struct dentry *dentry, struct nameidata *nd, void *c) +{ + struct inode *realinode; + struct ovl_link_data *data = c; + + if (!data) + return; + + realinode = data->realdentry->d_inode; + realinode->i_op->put_link(data->realdentry, nd, data->cookie); + kfree(data); +} + +static int ovl_readlink(struct dentry *dentry, char __user *buf, int bufsiz) +{ + struct path realpath; + struct inode *realinode; + + ovl_path_real(dentry, &realpath); + realinode = realpath.dentry->d_inode; + + if (!realinode->i_op->readlink) + return -EINVAL; + + touch_atime(&realpath); + + return realinode->i_op->readlink(realpath.dentry, buf, bufsiz); +} + + +static bool ovl_is_private_xattr(const char *name) +{ + return strncmp(name, "trusted.overlay.", 14) == 0; +} + +int ovl_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) +{ + int err; + struct dentry *upperdentry; + + err = ovl_want_write(dentry); + if (err) + goto out; + + err = -EPERM; + if (ovl_is_private_xattr(name)) + goto out_drop_write; + + err = ovl_copy_up(dentry); + if (err) + goto out_drop_write; + + upperdentry = ovl_dentry_upper(dentry); + err = vfs_setxattr(upperdentry, name, value, size, flags); + +out_drop_write: + ovl_drop_write(dentry); +out: + return err; +} + +ssize_t ovl_getxattr(struct dentry *dentry, const char *name, + void *value, size_t size) +{ + if (ovl_path_type(dentry->d_parent) == OVL_PATH_MERGE && + ovl_is_private_xattr(name)) + return -ENODATA; + + return vfs_getxattr(ovl_dentry_real(dentry), name, value, size); +} + +ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size) +{ + ssize_t res; + int off; + + res = vfs_listxattr(ovl_dentry_real(dentry), list, size); + if (res <= 0 || size == 0) + return res; + + if (ovl_path_type(dentry->d_parent) != OVL_PATH_MERGE) + return res; + + /* filter out private xattrs */ + for (off = 0; off < res;) { + char *s = list + off; + size_t slen = strlen(s) + 1; + + BUG_ON(off + slen > res); + + if (ovl_is_private_xattr(s)) { + res -= slen; + memmove(s, s + slen, res - off); + } else { + off += slen; + } + } + + return res; +} + +int ovl_removexattr(struct dentry *dentry, const char *name) +{ + int err; + struct path realpath; + enum ovl_path_type type; + + err = ovl_want_write(dentry); + if (err) + goto out; + + if (ovl_path_type(dentry->d_parent) == OVL_PATH_MERGE && + ovl_is_private_xattr(name)) + goto out_drop_write; + + type = ovl_path_real(dentry, &realpath); + if (type == OVL_PATH_LOWER) { + err = vfs_getxattr(realpath.dentry, name, NULL, 0); + if (err < 0) + goto out_drop_write; + + err = ovl_copy_up(dentry); + if (err) + goto out_drop_write; + + ovl_path_upper(dentry, &realpath); + } + + err = vfs_removexattr(realpath.dentry, name); +out_drop_write: + ovl_drop_write(dentry); +out: + return err; +} + +static bool ovl_open_need_copy_up(int flags, enum ovl_path_type type, + struct dentry *realdentry) +{ + if (type != OVL_PATH_LOWER) + return false; + + if (special_file(realdentry->d_inode->i_mode)) + return false; + + if (!(OPEN_FMODE(flags) & FMODE_WRITE) && !(flags & O_TRUNC)) + return false; + + return true; +} + +static int ovl_dentry_open(struct dentry *dentry, struct file *file, + const struct cred *cred) +{ + int err; + struct path realpath; + enum ovl_path_type type; + bool want_write = false; + + type = ovl_path_real(dentry, &realpath); + if (ovl_open_need_copy_up(file->f_flags, type, realpath.dentry)) { + want_write = true; + err = ovl_want_write(dentry); + if (err) + goto out; + + if (file->f_flags & O_TRUNC) + err = ovl_copy_up_last(dentry, NULL, true); + else + err = ovl_copy_up(dentry); + if (err) + goto out_drop_write; + + ovl_path_upper(dentry, &realpath); + } + + err = vfs_open(&realpath, file, cred); +out_drop_write: + if (want_write) + ovl_drop_write(dentry); +out: + return err; +} + +static const struct inode_operations ovl_file_inode_operations = { + .setattr = ovl_setattr, + .permission = ovl_permission, + .getattr = ovl_getattr, + .setxattr = ovl_setxattr, + .getxattr = ovl_getxattr, + .listxattr = ovl_listxattr, + .removexattr = ovl_removexattr, + .dentry_open = ovl_dentry_open, +}; + +static const struct inode_operations ovl_symlink_inode_operations = { + .setattr = ovl_setattr, + .follow_link = ovl_follow_link, + .put_link = ovl_put_link, + .readlink = ovl_readlink, + .getattr = ovl_getattr, + .setxattr = ovl_setxattr, + .getxattr = ovl_getxattr, + .listxattr = ovl_listxattr, + .removexattr = ovl_removexattr, +}; + +struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, + struct ovl_entry *oe) +{ + struct inode *inode; + + inode = new_inode(sb); + if (!inode) + return NULL; + + mode &= S_IFMT; + + inode->i_ino = get_next_ino(); + inode->i_mode = mode; + inode->i_flags |= S_NOATIME | S_NOCMTIME; + + switch (mode) { + case S_IFDIR: + inode->i_private = oe; + inode->i_op = &ovl_dir_inode_operations; + inode->i_fop = &ovl_dir_operations; + break; + + case S_IFLNK: + inode->i_op = &ovl_symlink_inode_operations; + break; + + case S_IFREG: + case S_IFSOCK: + case S_IFBLK: + case S_IFCHR: + case S_IFIFO: + inode->i_op = &ovl_file_inode_operations; + break; + + default: + WARN(1, "illegal file type: %i\n", mode); + iput(inode); + inode = NULL; + } + + return inode; + +} diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h new file mode 100644 index 000000000000..814bed33dd07 --- /dev/null +++ b/fs/overlayfs/overlayfs.h @@ -0,0 +1,191 @@ +/* + * + * Copyright (C) 2011 Novell Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + */ + +#include <linux/kernel.h> + +struct ovl_entry; + +enum ovl_path_type { + OVL_PATH_PURE_UPPER, + OVL_PATH_UPPER, + OVL_PATH_MERGE, + OVL_PATH_LOWER, +}; + +extern const char *ovl_opaque_xattr; + +static inline int ovl_do_rmdir(struct inode *dir, struct dentry *dentry) +{ + int err = vfs_rmdir(dir, dentry); + pr_debug("rmdir(%pd2) = %i\n", dentry, err); + return err; +} + +static inline int ovl_do_unlink(struct inode *dir, struct dentry *dentry) +{ + int err = vfs_unlink(dir, dentry, NULL); + pr_debug("unlink(%pd2) = %i\n", dentry, err); + return err; +} + +static inline int ovl_do_link(struct dentry *old_dentry, struct inode *dir, + struct dentry *new_dentry, bool debug) +{ + int err = vfs_link(old_dentry, dir, new_dentry, NULL); + if (debug) { + pr_debug("link(%pd2, %pd2) = %i\n", + old_dentry, new_dentry, err); + } + return err; +} + +static inline int ovl_do_create(struct inode *dir, struct dentry *dentry, + umode_t mode, bool debug) +{ + int err = vfs_create(dir, dentry, mode, true); + if (debug) + pr_debug("create(%pd2, 0%o) = %i\n", dentry, mode, err); + return err; +} + +static inline int ovl_do_mkdir(struct inode *dir, struct dentry *dentry, + umode_t mode, bool debug) +{ + int err = vfs_mkdir(dir, dentry, mode); + if (debug) + pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, err); + return err; +} + +static inline int ovl_do_mknod(struct inode *dir, struct dentry *dentry, + umode_t mode, dev_t dev, bool debug) +{ + int err = vfs_mknod(dir, dentry, mode, dev); + if (debug) { + pr_debug("mknod(%pd2, 0%o, 0%o) = %i\n", + dentry, mode, dev, err); + } + return err; +} + +static inline int ovl_do_symlink(struct inode *dir, struct dentry *dentry, + const char *oldname, bool debug) +{ + int err = vfs_symlink(dir, dentry, oldname); + if (debug) + pr_debug("symlink(\"%s\", %pd2) = %i\n", oldname, dentry, err); + return err; +} + +static inline int ovl_do_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags) +{ + int err = vfs_setxattr(dentry, name, value, size, flags); + pr_debug("setxattr(%pd2, \"%s\", \"%*s\", 0x%x) = %i\n", + dentry, name, (int) size, (char *) value, flags, err); + return err; +} + +static inline int ovl_do_removexattr(struct dentry *dentry, const char *name) +{ + int err = vfs_removexattr(dentry, name); + pr_debug("removexattr(%pd2, \"%s\") = %i\n", dentry, name, err); + return err; +} + +static inline int ovl_do_rename(struct inode *olddir, struct dentry *olddentry, + struct inode *newdir, struct dentry *newdentry, + unsigned int flags) +{ + int err; + + pr_debug("rename2(%pd2, %pd2, 0x%x)\n", + olddentry, newdentry, flags); + + err = vfs_rename(olddir, olddentry, newdir, newdentry, NULL, flags); + + if (err) { + pr_debug("...rename2(%pd2, %pd2, ...) = %i\n", + olddentry, newdentry, err); + } + return err; +} + +static inline int ovl_do_whiteout(struct inode *dir, struct dentry *dentry) +{ + int err = vfs_whiteout(dir, dentry); + pr_debug("whiteout(%pd2) = %i\n", dentry, err); + return err; +} + +enum ovl_path_type ovl_path_type(struct dentry *dentry); +u64 ovl_dentry_version_get(struct dentry *dentry); +void ovl_dentry_version_inc(struct dentry *dentry); +void ovl_path_upper(struct dentry *dentry, struct path *path); +void ovl_path_lower(struct dentry *dentry, struct path *path); +enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path); +struct dentry *ovl_dentry_upper(struct dentry *dentry); +struct dentry *ovl_dentry_lower(struct dentry *dentry); +struct dentry *ovl_dentry_real(struct dentry *dentry); +struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper); +struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry); +void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache); +struct dentry *ovl_workdir(struct dentry *dentry); +int ovl_want_write(struct dentry *dentry); +void ovl_drop_write(struct dentry *dentry); +bool ovl_dentry_is_opaque(struct dentry *dentry); +void ovl_dentry_set_opaque(struct dentry *dentry, bool opaque); +bool ovl_is_whiteout(struct dentry *dentry); +void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry); +struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, + unsigned int flags); +struct file *ovl_path_open(struct path *path, int flags); + +struct dentry *ovl_upper_create(struct dentry *upperdir, struct dentry *dentry, + struct kstat *stat, const char *link); + +/* readdir.c */ +extern const struct file_operations ovl_dir_operations; +int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list); +void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list); +void ovl_cache_free(struct list_head *list); + +/* inode.c */ +int ovl_setattr(struct dentry *dentry, struct iattr *attr); +int ovl_permission(struct inode *inode, int mask); +int ovl_setxattr(struct dentry *dentry, const char *name, + const void *value, size_t size, int flags); +ssize_t ovl_getxattr(struct dentry *dentry, const char *name, + void *value, size_t size); +ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size); +int ovl_removexattr(struct dentry *dentry, const char *name); + +struct inode *ovl_new_inode(struct super_block *sb, umode_t mode, + struct ovl_entry *oe); +static inline void ovl_copyattr(struct inode *from, struct inode *to) +{ + to->i_uid = from->i_uid; + to->i_gid = from->i_gid; +} + +/* dir.c */ +extern const struct inode_operations ovl_dir_inode_operations; +struct dentry *ovl_lookup_temp(struct dentry *workdir, struct dentry *dentry); +int ovl_create_real(struct inode *dir, struct dentry *newdentry, + struct kstat *stat, const char *link, + struct dentry *hardlink, bool debug); +void ovl_cleanup(struct inode *dir, struct dentry *dentry); + +/* copy_up.c */ +int ovl_copy_up(struct dentry *dentry); +int ovl_copy_up_one(struct dentry *parent, struct dentry *dentry, + struct path *lowerpath, struct kstat *stat, + struct iattr *attr); +int ovl_copy_xattr(struct dentry *old, struct dentry *new); +int ovl_set_attr(struct dentry *upper, struct kstat *stat); diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c new file mode 100644 index 000000000000..910553f37aca --- /dev/null +++ b/fs/overlayfs/readdir.c @@ -0,0 +1,590 @@ +/* + * + * Copyright (C) 2011 Novell Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + */ + +#include <linux/fs.h> +#include <linux/slab.h> +#include <linux/namei.h> +#include <linux/file.h> +#include <linux/xattr.h> +#include <linux/rbtree.h> +#include <linux/security.h> +#include <linux/cred.h> +#include "overlayfs.h" + +struct ovl_cache_entry { + unsigned int len; + unsigned int type; + u64 ino; + bool is_whiteout; + struct list_head l_node; + struct rb_node node; + char name[]; +}; + +struct ovl_dir_cache { + long refcount; + u64 version; + struct list_head entries; +}; + +struct ovl_readdir_data { + struct dir_context ctx; + bool is_merge; + struct rb_root root; + struct list_head *list; + struct list_head middle; + int count; + int err; +}; + +struct ovl_dir_file { + bool is_real; + bool is_upper; + struct ovl_dir_cache *cache; + struct ovl_cache_entry cursor; + struct file *realfile; + struct file *upperfile; +}; + +static struct ovl_cache_entry *ovl_cache_entry_from_node(struct rb_node *n) +{ + return container_of(n, struct ovl_cache_entry, node); +} + +static struct ovl_cache_entry *ovl_cache_entry_find(struct rb_root *root, + const char *name, int len) +{ + struct rb_node *node = root->rb_node; + int cmp; + + while (node) { + struct ovl_cache_entry *p = ovl_cache_entry_from_node(node); + + cmp = strncmp(name, p->name, len); + if (cmp > 0) + node = p->node.rb_right; + else if (cmp < 0 || len < p->len) + node = p->node.rb_left; + else + return p; + } + + return NULL; +} + +static struct ovl_cache_entry *ovl_cache_entry_new(const char *name, int len, + u64 ino, unsigned int d_type) +{ + struct ovl_cache_entry *p; + size_t size = offsetof(struct ovl_cache_entry, name[len + 1]); + + p = kmalloc(size, GFP_KERNEL); + if (p) { + memcpy(p->name, name, len); + p->name[len] = '\0'; + p->len = len; + p->type = d_type; + p->ino = ino; + p->is_whiteout = false; + } + + return p; +} + +static int ovl_cache_entry_add_rb(struct ovl_readdir_data *rdd, + const char *name, int len, u64 ino, + unsigned int d_type) +{ + struct rb_node **newp = &rdd->root.rb_node; + struct rb_node *parent = NULL; + struct ovl_cache_entry *p; + + while (*newp) { + int cmp; + struct ovl_cache_entry *tmp; + + parent = *newp; + tmp = ovl_cache_entry_from_node(*newp); + cmp = strncmp(name, tmp->name, len); + if (cmp > 0) + newp = &tmp->node.rb_right; + else if (cmp < 0 || len < tmp->len) + newp = &tmp->node.rb_left; + else + return 0; + } + + p = ovl_cache_entry_new(name, len, ino, d_type); + if (p == NULL) + return -ENOMEM; + + list_add_tail(&p->l_node, rdd->list); + rb_link_node(&p->node, parent, newp); + rb_insert_color(&p->node, &rdd->root); + + return 0; +} + +static int ovl_fill_lower(struct ovl_readdir_data *rdd, + const char *name, int namelen, + loff_t offset, u64 ino, unsigned int d_type) +{ + struct ovl_cache_entry *p; + + p = ovl_cache_entry_find(&rdd->root, name, namelen); + if (p) { + list_move_tail(&p->l_node, &rdd->middle); + } else { + p = ovl_cache_entry_new(name, namelen, ino, d_type); + if (p == NULL) + rdd->err = -ENOMEM; + else + list_add_tail(&p->l_node, &rdd->middle); + } + + return rdd->err; +} + +void ovl_cache_free(struct list_head *list) +{ + struct ovl_cache_entry *p; + struct ovl_cache_entry *n; + + list_for_each_entry_safe(p, n, list, l_node) + kfree(p); + + INIT_LIST_HEAD(list); +} + +static void ovl_cache_put(struct ovl_dir_file *od, struct dentry *dentry) +{ + struct ovl_dir_cache *cache = od->cache; + + list_del(&od->cursor.l_node); + WARN_ON(cache->refcount <= 0); + cache->refcount--; + if (!cache->refcount) { + if (ovl_dir_cache(dentry) == cache) + ovl_set_dir_cache(dentry, NULL); + + ovl_cache_free(&cache->entries); + kfree(cache); + } +} + +static int ovl_fill_merge(void *buf, const char *name, int namelen, + loff_t offset, u64 ino, unsigned int d_type) +{ + struct ovl_readdir_data *rdd = buf; + + rdd->count++; + if (!rdd->is_merge) + return ovl_cache_entry_add_rb(rdd, name, namelen, ino, d_type); + else + return ovl_fill_lower(rdd, name, namelen, offset, ino, d_type); +} + +static inline int ovl_dir_read(struct path *realpath, + struct ovl_readdir_data *rdd) +{ + struct file *realfile; + int err; + + realfile = ovl_path_open(realpath, O_RDONLY | O_DIRECTORY); + if (IS_ERR(realfile)) + return PTR_ERR(realfile); + + rdd->ctx.pos = 0; + do { + rdd->count = 0; + rdd->err = 0; + err = iterate_dir(realfile, &rdd->ctx); + if (err >= 0) + err = rdd->err; + } while (!err && rdd->count); + fput(realfile); + + return err; +} + +static void ovl_dir_reset(struct file *file) +{ + struct ovl_dir_file *od = file->private_data; + struct ovl_dir_cache *cache = od->cache; + struct dentry *dentry = file->f_path.dentry; + enum ovl_path_type type = ovl_path_type(dentry); + + if (cache && ovl_dentry_version_get(dentry) != cache->version) { + ovl_cache_put(od, dentry); + od->cache = NULL; + } + WARN_ON(!od->is_real && type != OVL_PATH_MERGE); + if (od->is_real && type == OVL_PATH_MERGE) + od->is_real = false; +} + +static int ovl_dir_mark_whiteouts(struct dentry *dir, + struct ovl_readdir_data *rdd) +{ + struct ovl_cache_entry *p; + struct dentry *dentry; + const struct cred *old_cred; + struct cred *override_cred; + + override_cred = prepare_creds(); + if (!override_cred) { + ovl_cache_free(rdd->list); + return -ENOMEM; + } + + /* + * CAP_DAC_OVERRIDE for lookup + */ + cap_raise(override_cred->cap_effective, CAP_DAC_OVERRIDE); + old_cred = override_creds(override_cred); + + mutex_lock(&dir->d_inode->i_mutex); + list_for_each_entry(p, rdd->list, l_node) { + if (!p->name) + continue; + + if (p->type != DT_CHR) + continue; + + dentry = lookup_one_len(p->name, dir, p->len); + if (IS_ERR(dentry)) + continue; + + p->is_whiteout = ovl_is_whiteout(dentry); + dput(dentry); + } + mutex_unlock(&dir->d_inode->i_mutex); + + revert_creds(old_cred); + put_cred(override_cred); + + return 0; +} + +static inline int ovl_dir_read_merged(struct path *upperpath, + struct path *lowerpath, + struct list_head *list) +{ + int err; + struct ovl_readdir_data rdd = { + .ctx.actor = ovl_fill_merge, + .list = list, + .root = RB_ROOT, + .is_merge = false, + }; + + if (upperpath->dentry) { + err = ovl_dir_read(upperpath, &rdd); + if (err) + goto out; + + if (lowerpath->dentry) { + err = ovl_dir_mark_whiteouts(upperpath->dentry, &rdd); + if (err) + goto out; + } + } + if (lowerpath->dentry) { + /* + * Insert lowerpath entries before upperpath ones, this allows + * offsets to be reasonably constant + */ + list_add(&rdd.middle, rdd.list); + rdd.is_merge = true; + err = ovl_dir_read(lowerpath, &rdd); + list_del(&rdd.middle); + } +out: + return err; + +} + +static void ovl_seek_cursor(struct ovl_dir_file *od, loff_t pos) +{ + struct ovl_cache_entry *p; + loff_t off = 0; + + list_for_each_entry(p, &od->cache->entries, l_node) { + if (!p->name) + continue; + if (off >= pos) + break; + off++; + } + list_move_tail(&od->cursor.l_node, &p->l_node); +} + +static struct ovl_dir_cache *ovl_cache_get(struct dentry *dentry) +{ + int res; + struct path lowerpath; + struct path upperpath; + struct ovl_dir_cache *cache; + + cache = ovl_dir_cache(dentry); + if (cache && ovl_dentry_version_get(dentry) == cache->version) { + cache->refcount++; + return cache; + } + ovl_set_dir_cache(dentry, NULL); + + cache = kzalloc(sizeof(struct ovl_dir_cache), GFP_KERNEL); + if (!cache) + return ERR_PTR(-ENOMEM); + + cache->refcount = 1; + INIT_LIST_HEAD(&cache->entries); + + ovl_path_lower(dentry, &lowerpath); + ovl_path_upper(dentry, &upperpath); + + res = ovl_dir_read_merged(&upperpath, &lowerpath, &cache->entries); + if (res) { + ovl_cache_free(&cache->entries); + kfree(cache); + return ERR_PTR(res); + } + + cache->version = ovl_dentry_version_get(dentry); + ovl_set_dir_cache(dentry, cache); + + return cache; +} + +static int ovl_iterate(struct file *file, struct dir_context *ctx) +{ + struct ovl_dir_file *od = file->private_data; + struct dentry *dentry = file->f_path.dentry; + + if (!ctx->pos) + ovl_dir_reset(file); + + if (od->is_real) + return iterate_dir(od->realfile, ctx); + + if (!od->cache) { + struct ovl_dir_cache *cache; + + cache = ovl_cache_get(dentry); + if (IS_ERR(cache)) + return PTR_ERR(cache); + + od->cache = cache; + ovl_seek_cursor(od, ctx->pos); + } + + while (od->cursor.l_node.next != &od->cache->entries) { + struct ovl_cache_entry *p; + + p = list_entry(od->cursor.l_node.next, struct ovl_cache_entry, l_node); + /* Skip cursors */ + if (p->name) { + if (!p->is_whiteout) { + if (!dir_emit(ctx, p->name, p->len, p->ino, p->type)) + break; + } + ctx->pos++; + } + list_move(&od->cursor.l_node, &p->l_node); + } + return 0; +} + +static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin) +{ + loff_t res; + struct ovl_dir_file *od = file->private_data; + + mutex_lock(&file_inode(file)->i_mutex); + if (!file->f_pos) + ovl_dir_reset(file); + + if (od->is_real) { + res = vfs_llseek(od->realfile, offset, origin); + file->f_pos = od->realfile->f_pos; + } else { + res = -EINVAL; + + switch (origin) { + case SEEK_CUR: + offset += file->f_pos; + break; + case SEEK_SET: + break; + default: + goto out_unlock; + } + if (offset < 0) + goto out_unlock; + + if (offset != file->f_pos) { + file->f_pos = offset; + if (od->cache) + ovl_seek_cursor(od, offset); + } + res = offset; + } +out_unlock: + mutex_unlock(&file_inode(file)->i_mutex); + + return res; +} + +static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end, + int datasync) +{ + struct ovl_dir_file *od = file->private_data; + struct dentry *dentry = file->f_path.dentry; + struct file *realfile = od->realfile; + + /* + * Need to check if we started out being a lower dir, but got copied up + */ + if (!od->is_upper && ovl_path_type(dentry) == OVL_PATH_MERGE) { + struct inode *inode = file_inode(file); + + realfile = od->upperfile; + if (!realfile) { + struct path upperpath; + + ovl_path_upper(dentry, &upperpath); + realfile = ovl_path_open(&upperpath, O_RDONLY); + mutex_lock(&inode->i_mutex); + if (!od->upperfile) { + if (IS_ERR(realfile)) { + mutex_unlock(&inode->i_mutex); + return PTR_ERR(realfile); + } + od->upperfile = realfile; + } else { + /* somebody has beaten us to it */ + if (!IS_ERR(realfile)) + fput(realfile); + realfile = od->upperfile; + } + mutex_unlock(&inode->i_mutex); + } + } + + return vfs_fsync_range(realfile, start, end, datasync); +} + +static int ovl_dir_release(struct inode *inode, struct file *file) +{ + struct ovl_dir_file *od = file->private_data; + + if (od->cache) { + mutex_lock(&inode->i_mutex); + ovl_cache_put(od, file->f_path.dentry); + mutex_unlock(&inode->i_mutex); + } + fput(od->realfile); + if (od->upperfile) + fput(od->upperfile); + kfree(od); + + return 0; +} + +static int ovl_dir_open(struct inode *inode, struct file *file) +{ + struct path realpath; + struct file *realfile; + struct ovl_dir_file *od; + enum ovl_path_type type; + + od = kzalloc(sizeof(struct ovl_dir_file), GFP_KERNEL); + if (!od) + return -ENOMEM; + + type = ovl_path_real(file->f_path.dentry, &realpath); + realfile = ovl_path_open(&realpath, file->f_flags); + if (IS_ERR(realfile)) { + kfree(od); + return PTR_ERR(realfile); + } + INIT_LIST_HEAD(&od->cursor.l_node); + od->realfile = realfile; + od->is_real = (type != OVL_PATH_MERGE); + od->is_upper = (type != OVL_PATH_LOWER); + file->private_data = od; + + return 0; +} + +const struct file_operations ovl_dir_operations = { + .read = generic_read_dir, + .open = ovl_dir_open, + .iterate = ovl_iterate, + .llseek = ovl_dir_llseek, + .fsync = ovl_dir_fsync, + .release = ovl_dir_release, +}; + +int ovl_check_empty_dir(struct dentry *dentry, struct list_head *list) +{ + int err; + struct path lowerpath; + struct path upperpath; + struct ovl_cache_entry *p; + + ovl_path_upper(dentry, &upperpath); + ovl_path_lower(dentry, &lowerpath); + + err = ovl_dir_read_merged(&upperpath, &lowerpath, list); + if (err) + return err; + + err = 0; + + list_for_each_entry(p, list, l_node) { + if (p->is_whiteout) + continue; + + if (p->name[0] == '.') { + if (p->len == 1) + continue; + if (p->len == 2 && p->name[1] == '.') + continue; + } + err = -ENOTEMPTY; + break; + } + + return err; +} + +void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list) +{ + struct ovl_cache_entry *p; + + mutex_lock_nested(&upper->d_inode->i_mutex, I_MUTEX_PARENT); + list_for_each_entry(p, list, l_node) { + struct dentry *dentry; + + if (!p->is_whiteout) + continue; + + dentry = lookup_one_len(p->name, upper, p->len); + if (IS_ERR(dentry)) { + pr_err("overlayfs: lookup '%s/%.*s' failed (%i)\n", + upper->d_name.name, p->len, p->name, + (int) PTR_ERR(dentry)); + continue; + } + ovl_cleanup(upper->d_inode, dentry); + dput(dentry); + } + mutex_unlock(&upper->d_inode->i_mutex); +} diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c new file mode 100644 index 000000000000..08b704cebfc4 --- /dev/null +++ b/fs/overlayfs/super.c @@ -0,0 +1,796 @@ +/* + * + * Copyright (C) 2011 Novell Inc. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License version 2 as published by + * the Free Software Foundation. + */ + +#include <linux/fs.h> +#include <linux/namei.h> +#include <linux/xattr.h> +#include <linux/security.h> +#include <linux/mount.h> +#include <linux/slab.h> +#include <linux/parser.h> +#include <linux/module.h> +#include <linux/sched.h> +#include <linux/statfs.h> +#include <linux/seq_file.h> +#include "overlayfs.h" + +MODULE_AUTHOR("Miklos Szeredi <miklos@szeredi.hu>"); +MODULE_DESCRIPTION("Overlay filesystem"); +MODULE_LICENSE("GPL"); + +#define OVERLAYFS_SUPER_MAGIC 0x794c764f + +struct ovl_config { + char *lowerdir; + char *upperdir; + char *workdir; +}; + +/* private information held for overlayfs's superblock */ +struct ovl_fs { + struct vfsmount *upper_mnt; + struct vfsmount *lower_mnt; + struct dentry *workdir; + long lower_namelen; + /* pathnames of lower and upper dirs, for show_options */ + struct ovl_config config; +}; + +struct ovl_dir_cache; + +/* private information held for every overlayfs dentry */ +struct ovl_entry { + struct dentry *__upperdentry; + struct dentry *lowerdentry; + struct ovl_dir_cache *cache; + union { + struct { + u64 version; + bool opaque; + }; + struct rcu_head rcu; + }; +}; + +const char *ovl_opaque_xattr = "trusted.overlay.opaque"; + + +enum ovl_path_type ovl_path_type(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + if (oe->__upperdentry) { + if (oe->lowerdentry) { + if (S_ISDIR(dentry->d_inode->i_mode)) + return OVL_PATH_MERGE; + else + return OVL_PATH_UPPER; + } else { + if (oe->opaque) + return OVL_PATH_UPPER; + else + return OVL_PATH_PURE_UPPER; + } + } else { + return OVL_PATH_LOWER; + } +} + +static struct dentry *ovl_upperdentry_dereference(struct ovl_entry *oe) +{ + struct dentry *upperdentry = ACCESS_ONCE(oe->__upperdentry); + /* + * Make sure to order reads to upperdentry wrt ovl_dentry_update() + */ + smp_read_barrier_depends(); + return upperdentry; +} + +void ovl_path_upper(struct dentry *dentry, struct path *path) +{ + struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + struct ovl_entry *oe = dentry->d_fsdata; + + path->mnt = ofs->upper_mnt; + path->dentry = ovl_upperdentry_dereference(oe); +} + +enum ovl_path_type ovl_path_real(struct dentry *dentry, struct path *path) +{ + + enum ovl_path_type type = ovl_path_type(dentry); + + if (type == OVL_PATH_LOWER) + ovl_path_lower(dentry, path); + else + ovl_path_upper(dentry, path); + + return type; +} + +struct dentry *ovl_dentry_upper(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + return ovl_upperdentry_dereference(oe); +} + +struct dentry *ovl_dentry_lower(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + return oe->lowerdentry; +} + +struct dentry *ovl_dentry_real(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + struct dentry *realdentry; + + realdentry = ovl_upperdentry_dereference(oe); + if (!realdentry) + realdentry = oe->lowerdentry; + + return realdentry; +} + +struct dentry *ovl_entry_real(struct ovl_entry *oe, bool *is_upper) +{ + struct dentry *realdentry; + + realdentry = ovl_upperdentry_dereference(oe); + if (realdentry) { + *is_upper = true; + } else { + realdentry = oe->lowerdentry; + *is_upper = false; + } + return realdentry; +} + +struct ovl_dir_cache *ovl_dir_cache(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + return oe->cache; +} + +void ovl_set_dir_cache(struct dentry *dentry, struct ovl_dir_cache *cache) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + oe->cache = cache; +} + +void ovl_path_lower(struct dentry *dentry, struct path *path) +{ + struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + struct ovl_entry *oe = dentry->d_fsdata; + + path->mnt = ofs->lower_mnt; + path->dentry = oe->lowerdentry; +} + +int ovl_want_write(struct dentry *dentry) +{ + struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + return mnt_want_write(ofs->upper_mnt); +} + +void ovl_drop_write(struct dentry *dentry) +{ + struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + mnt_drop_write(ofs->upper_mnt); +} + +struct dentry *ovl_workdir(struct dentry *dentry) +{ + struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + return ofs->workdir; +} + +bool ovl_dentry_is_opaque(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + return oe->opaque; +} + +void ovl_dentry_set_opaque(struct dentry *dentry, bool opaque) +{ + struct ovl_entry *oe = dentry->d_fsdata; + oe->opaque = opaque; +} + +void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + WARN_ON(!mutex_is_locked(&upperdentry->d_parent->d_inode->i_mutex)); + WARN_ON(oe->__upperdentry); + BUG_ON(!upperdentry->d_inode); + /* + * Make sure upperdentry is consistent before making it visible to + * ovl_upperdentry_dereference(). + */ + smp_wmb(); + oe->__upperdentry = upperdentry; +} + +void ovl_dentry_version_inc(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + WARN_ON(!mutex_is_locked(&dentry->d_inode->i_mutex)); + oe->version++; +} + +u64 ovl_dentry_version_get(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + WARN_ON(!mutex_is_locked(&dentry->d_inode->i_mutex)); + return oe->version; +} + +bool ovl_is_whiteout(struct dentry *dentry) +{ + struct inode *inode = dentry->d_inode; + + return inode && IS_WHITEOUT(inode); +} + +static bool ovl_is_opaquedir(struct dentry *dentry) +{ + int res; + char val; + struct inode *inode = dentry->d_inode; + + if (!S_ISDIR(inode->i_mode) || !inode->i_op->getxattr) + return false; + + res = inode->i_op->getxattr(dentry, ovl_opaque_xattr, &val, 1); + if (res == 1 && val == 'y') + return true; + + return false; +} + +static void ovl_dentry_release(struct dentry *dentry) +{ + struct ovl_entry *oe = dentry->d_fsdata; + + if (oe) { + dput(oe->__upperdentry); + dput(oe->lowerdentry); + kfree_rcu(oe, rcu); + } +} + +static const struct dentry_operations ovl_dentry_operations = { + .d_release = ovl_dentry_release, +}; + +static struct ovl_entry *ovl_alloc_entry(void) +{ + return kzalloc(sizeof(struct ovl_entry), GFP_KERNEL); +} + +static inline struct dentry *ovl_lookup_real(struct dentry *dir, + struct qstr *name) +{ + struct dentry *dentry; + + mutex_lock(&dir->d_inode->i_mutex); + dentry = lookup_one_len(name->name, dir, name->len); + mutex_unlock(&dir->d_inode->i_mutex); + + if (IS_ERR(dentry)) { + if (PTR_ERR(dentry) == -ENOENT) + dentry = NULL; + } else if (!dentry->d_inode) { + dput(dentry); + dentry = NULL; + } + return dentry; +} + +struct dentry *ovl_lookup(struct inode *dir, struct dentry *dentry, + unsigned int flags) +{ + struct ovl_entry *oe; + struct dentry *upperdir; + struct dentry *lowerdir; + struct dentry *upperdentry = NULL; + struct dentry *lowerdentry = NULL; + struct inode *inode = NULL; + int err; + + err = -ENOMEM; + oe = ovl_alloc_entry(); + if (!oe) + goto out; + + upperdir = ovl_dentry_upper(dentry->d_parent); + lowerdir = ovl_dentry_lower(dentry->d_parent); + + if (upperdir) { + upperdentry = ovl_lookup_real(upperdir, &dentry->d_name); + err = PTR_ERR(upperdentry); + if (IS_ERR(upperdentry)) + goto out_put_dir; + + if (lowerdir && upperdentry) { + if (ovl_is_whiteout(upperdentry)) { + dput(upperdentry); + upperdentry = NULL; + oe->opaque = true; + } else if (ovl_is_opaquedir(upperdentry)) { + oe->opaque = true; + } + } + } + if (lowerdir && !oe->opaque) { + lowerdentry = ovl_lookup_real(lowerdir, &dentry->d_name); + err = PTR_ERR(lowerdentry); + if (IS_ERR(lowerdentry)) + goto out_dput_upper; + } + + if (lowerdentry && upperdentry && + (!S_ISDIR(upperdentry->d_inode->i_mode) || + !S_ISDIR(lowerdentry->d_inode->i_mode))) { + dput(lowerdentry); + lowerdentry = NULL; + oe->opaque = true; + } + + if (lowerdentry || upperdentry) { + struct dentry *realdentry; + + realdentry = upperdentry ? upperdentry : lowerdentry; + err = -ENOMEM; + inode = ovl_new_inode(dentry->d_sb, realdentry->d_inode->i_mode, + oe); + if (!inode) + goto out_dput; + ovl_copyattr(realdentry->d_inode, inode); + } + + oe->__upperdentry = upperdentry; + oe->lowerdentry = lowerdentry; + + dentry->d_fsdata = oe; + d_add(dentry, inode); + + return NULL; + +out_dput: + dput(lowerdentry); +out_dput_upper: + dput(upperdentry); +out_put_dir: + kfree(oe); +out: + return ERR_PTR(err); +} + +struct file *ovl_path_open(struct path *path, int flags) +{ + return dentry_open(path, flags, current_cred()); +} + +static void ovl_put_super(struct super_block *sb) +{ + struct ovl_fs *ufs = sb->s_fs_info; + + dput(ufs->workdir); + mntput(ufs->upper_mnt); + mntput(ufs->lower_mnt); + + kfree(ufs->config.lowerdir); + kfree(ufs->config.upperdir); + kfree(ufs->config.workdir); + kfree(ufs); +} + +/** + * ovl_statfs + * @sb: The overlayfs super block + * @buf: The struct kstatfs to fill in with stats + * + * Get the filesystem statistics. As writes always target the upper layer + * filesystem pass the statfs to the same filesystem. + */ +static int ovl_statfs(struct dentry *dentry, struct kstatfs *buf) +{ + struct ovl_fs *ofs = dentry->d_sb->s_fs_info; + struct dentry *root_dentry = dentry->d_sb->s_root; + struct path path; + int err; + + ovl_path_upper(root_dentry, &path); + + err = vfs_statfs(&path, buf); + if (!err) { + buf->f_namelen = max(buf->f_namelen, ofs->lower_namelen); + buf->f_type = OVERLAYFS_SUPER_MAGIC; + } + + return err; +} + +/** + * ovl_show_options + * + * Prints the mount options for a given superblock. + * Returns zero; does not fail. + */ +static int ovl_show_options(struct seq_file *m, struct dentry *dentry) +{ + struct super_block *sb = dentry->d_sb; + struct ovl_fs *ufs = sb->s_fs_info; + + seq_printf(m, ",lowerdir=%s", ufs->config.lowerdir); + seq_printf(m, ",upperdir=%s", ufs->config.upperdir); + seq_printf(m, ",workdir=%s", ufs->config.workdir); + return 0; +} + +static const struct super_operations ovl_super_operations = { + .put_super = ovl_put_super, + .statfs = ovl_statfs, + .show_options = ovl_show_options, +}; + +enum { + OPT_LOWERDIR, + OPT_UPPERDIR, + OPT_WORKDIR, + OPT_ERR, +}; + +static const match_table_t ovl_tokens = { + {OPT_LOWERDIR, "lowerdir=%s"}, + {OPT_UPPERDIR, "upperdir=%s"}, + {OPT_WORKDIR, "workdir=%s"}, + {OPT_ERR, NULL} +}; + +static int ovl_parse_opt(char *opt, struct ovl_config *config) +{ + char *p; + + while ((p = strsep(&opt, ",")) != NULL) { + int token; + substring_t args[MAX_OPT_ARGS]; + + if (!*p) + continue; + + token = match_token(p, ovl_tokens, args); + switch (token) { + case OPT_UPPERDIR: + kfree(config->upperdir); + config->upperdir = match_strdup(&args[0]); + if (!config->upperdir) + return -ENOMEM; + break; + + case OPT_LOWERDIR: + kfree(config->lowerdir); + config->lowerdir = match_strdup(&args[0]); + if (!config->lowerdir) + return -ENOMEM; + break; + + case OPT_WORKDIR: + kfree(config->workdir); + config->workdir = match_strdup(&args[0]); + if (!config->workdir) + return -ENOMEM; + break; + + default: + return -EINVAL; + } + } + return 0; +} + +#define OVL_WORKDIR_NAME "work" + +static struct dentry *ovl_workdir_create(struct vfsmount *mnt, + struct dentry *dentry) +{ + struct inode *dir = dentry->d_inode; + struct dentry *work; + int err; + bool retried = false; + + err = mnt_want_write(mnt); + if (err) + return ERR_PTR(err); + + mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); +retry: + work = lookup_one_len(OVL_WORKDIR_NAME, dentry, + strlen(OVL_WORKDIR_NAME)); + + if (!IS_ERR(work)) { + struct kstat stat = { + .mode = S_IFDIR | 0, + }; + + if (work->d_inode) { + err = -EEXIST; + if (retried) + goto out_dput; + + retried = true; + ovl_cleanup(dir, work); + dput(work); + goto retry; + } + + err = ovl_create_real(dir, work, &stat, NULL, NULL, true); + if (err) + goto out_dput; + } +out_unlock: + mutex_unlock(&dir->i_mutex); + mnt_drop_write(mnt); + + return work; + +out_dput: + dput(work); + work = ERR_PTR(err); + goto out_unlock; +} + +static int ovl_mount_dir(const char *name, struct path *path) +{ + int err; + + err = kern_path(name, LOOKUP_FOLLOW, path); + if (err) { + pr_err("overlayfs: failed to resolve '%s': %i\n", name, err); + err = -EINVAL; + } + return err; +} + +static bool ovl_is_allowed_fs_type(struct dentry *root) +{ + const struct dentry_operations *dop = root->d_op; + + /* + * We don't support: + * - automount filesystems + * - filesystems with revalidate (FIXME for lower layer) + * - filesystems with case insensitive names + */ + if (dop && + (dop->d_manage || dop->d_automount || + dop->d_revalidate || dop->d_weak_revalidate || + dop->d_compare || dop->d_hash)) { + return false; + } + return true; +} + +/* Workdir should not be subdir of upperdir and vice versa */ +static bool ovl_workdir_ok(struct dentry *workdir, struct dentry *upperdir) +{ + bool ok = false; + + if (workdir != upperdir) { + ok = (lock_rename(workdir, upperdir) == NULL); + unlock_rename(workdir, upperdir); + } + return ok; +} + +static int ovl_fill_super(struct super_block *sb, void *data, int silent) +{ + struct path lowerpath; + struct path upperpath; + struct path workpath; + struct inode *root_inode; + struct dentry *root_dentry; + struct ovl_entry *oe; + struct ovl_fs *ufs; + struct kstatfs statfs; + int err; + + err = -ENOMEM; + ufs = kzalloc(sizeof(struct ovl_fs), GFP_KERNEL); + if (!ufs) + goto out; + + err = ovl_parse_opt((char *) data, &ufs->config); + if (err) + goto out_free_config; + + /* FIXME: workdir is not needed for a R/O mount */ + err = -EINVAL; + if (!ufs->config.upperdir || !ufs->config.lowerdir || + !ufs->config.workdir) { + pr_err("overlayfs: missing upperdir or lowerdir or workdir\n"); + goto out_free_config; + } + + err = -ENOMEM; + oe = ovl_alloc_entry(); + if (oe == NULL) + goto out_free_config; + + err = ovl_mount_dir(ufs->config.upperdir, &upperpath); + if (err) + goto out_free_oe; + + err = ovl_mount_dir(ufs->config.lowerdir, &lowerpath); + if (err) + goto out_put_upperpath; + + err = ovl_mount_dir(ufs->config.workdir, &workpath); + if (err) + goto out_put_lowerpath; + + err = -EINVAL; + if (!S_ISDIR(upperpath.dentry->d_inode->i_mode) || + !S_ISDIR(lowerpath.dentry->d_inode->i_mode) || + !S_ISDIR(workpath.dentry->d_inode->i_mode)) { + pr_err("overlayfs: upperdir or lowerdir or workdir not a directory\n"); + goto out_put_workpath; + } + + if (upperpath.mnt != workpath.mnt) { + pr_err("overlayfs: workdir and upperdir must reside under the same mount\n"); + goto out_put_workpath; + } + if (!ovl_workdir_ok(workpath.dentry, upperpath.dentry)) { + pr_err("overlayfs: workdir and upperdir must be separate subtrees\n"); + goto out_put_workpath; + } + + if (!ovl_is_allowed_fs_type(upperpath.dentry)) { + pr_err("overlayfs: filesystem of upperdir is not supported\n"); + goto out_put_workpath; + } + + if (!ovl_is_allowed_fs_type(lowerpath.dentry)) { + pr_err("overlayfs: filesystem of lowerdir is not supported\n"); + goto out_put_workpath; + } + + err = vfs_statfs(&lowerpath, &statfs); + if (err) { + pr_err("overlayfs: statfs failed on lowerpath\n"); + goto out_put_workpath; + } + ufs->lower_namelen = statfs.f_namelen; + + sb->s_stack_depth = max(upperpath.mnt->mnt_sb->s_stack_depth, + lowerpath.mnt->mnt_sb->s_stack_depth) + 1; + + err = -EINVAL; + if (sb->s_stack_depth > FILESYSTEM_MAX_STACK_DEPTH) { + pr_err("overlayfs: maximum fs stacking depth exceeded\n"); + goto out_put_workpath; + } + + ufs->upper_mnt = clone_private_mount(&upperpath); + err = PTR_ERR(ufs->upper_mnt); + if (IS_ERR(ufs->upper_mnt)) { + pr_err("overlayfs: failed to clone upperpath\n"); + goto out_put_workpath; + } + + ufs->lower_mnt = clone_private_mount(&lowerpath); + err = PTR_ERR(ufs->lower_mnt); + if (IS_ERR(ufs->lower_mnt)) { + pr_err("overlayfs: failed to clone lowerpath\n"); + goto out_put_upper_mnt; + } + + ufs->workdir = ovl_workdir_create(ufs->upper_mnt, workpath.dentry); + err = PTR_ERR(ufs->workdir); + if (IS_ERR(ufs->workdir)) { + pr_err("overlayfs: failed to create directory %s/%s\n", + ufs->config.workdir, OVL_WORKDIR_NAME); + goto out_put_lower_mnt; + } + + /* + * Make lower_mnt R/O. That way fchmod/fchown on lower file + * will fail instead of modifying lower fs. + */ + ufs->lower_mnt->mnt_flags |= MNT_READONLY; + + /* If the upper fs is r/o, we mark overlayfs r/o too */ + if (ufs->upper_mnt->mnt_sb->s_flags & MS_RDONLY) + sb->s_flags |= MS_RDONLY; + + sb->s_d_op = &ovl_dentry_operations; + + err = -ENOMEM; + root_inode = ovl_new_inode(sb, S_IFDIR, oe); + if (!root_inode) + goto out_put_workdir; + + root_dentry = d_make_root(root_inode); + if (!root_dentry) + goto out_put_workdir; + + mntput(upperpath.mnt); + mntput(lowerpath.mnt); + path_put(&workpath); + + oe->__upperdentry = upperpath.dentry; + oe->lowerdentry = lowerpath.dentry; + + root_dentry->d_fsdata = oe; + + sb->s_magic = OVERLAYFS_SUPER_MAGIC; + sb->s_op = &ovl_super_operations; + sb->s_root = root_dentry; + sb->s_fs_info = ufs; + + return 0; + +out_put_workdir: + dput(ufs->workdir); +out_put_lower_mnt: + mntput(ufs->lower_mnt); +out_put_upper_mnt: + mntput(ufs->upper_mnt); +out_put_workpath: + path_put(&workpath); +out_put_lowerpath: + path_put(&lowerpath); +out_put_upperpath: + path_put(&upperpath); +out_free_oe: + kfree(oe); +out_free_config: + kfree(ufs->config.lowerdir); + kfree(ufs->config.upperdir); + kfree(ufs->config.workdir); + kfree(ufs); +out: + return err; +} + +static struct dentry *ovl_mount(struct file_system_type *fs_type, int flags, + const char *dev_name, void *raw_data) +{ + return mount_nodev(fs_type, flags, raw_data, ovl_fill_super); +} + +static struct file_system_type ovl_fs_type = { + .owner = THIS_MODULE, + .name = "overlayfs", + .mount = ovl_mount, + .kill_sb = kill_anon_super, +}; +MODULE_ALIAS_FS("overlayfs"); + +static int __init ovl_init(void) +{ + return register_filesystem(&ovl_fs_type); +} + +static void __exit ovl_exit(void) +{ + unregister_filesystem(&ovl_fs_type); +} + +module_init(ovl_init); +module_exit(ovl_exit); diff --git a/fs/proc/base.c b/fs/proc/base.c index baf852b648ad..772efa45a452 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -376,37 +376,6 @@ static const struct file_operations proc_lstats_operations = { #endif -#ifdef CONFIG_CGROUPS -static int cgroup_open(struct inode *inode, struct file *file) -{ - struct pid *pid = PROC_I(inode)->pid; - return single_open(file, proc_cgroup_show, pid); -} - -static const struct file_operations proc_cgroup_operations = { - .open = cgroup_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; -#endif - -#ifdef CONFIG_PROC_PID_CPUSET - -static int cpuset_open(struct inode *inode, struct file *file) -{ - struct pid *pid = PROC_I(inode)->pid; - return single_open(file, proc_cpuset_show, pid); -} - -static const struct file_operations proc_cpuset_operations = { - .open = cpuset_open, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, -}; -#endif - static int proc_oom_score(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) { @@ -632,29 +601,35 @@ static const struct file_operations proc_single_file_operations = { .release = single_release, }; -static int __mem_open(struct inode *inode, struct file *file, unsigned int mode) + +struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode) { - struct task_struct *task = get_proc_task(file_inode(file)); - struct mm_struct *mm; + struct task_struct *task = get_proc_task(inode); + struct mm_struct *mm = ERR_PTR(-ESRCH); - if (!task) - return -ESRCH; + if (task) { + mm = mm_access(task, mode); + put_task_struct(task); - mm = mm_access(task, mode); - put_task_struct(task); + if (!IS_ERR_OR_NULL(mm)) { + /* ensure this mm_struct can't be freed */ + atomic_inc(&mm->mm_count); + /* but do not pin its memory */ + mmput(mm); + } + } + + return mm; +} + +static int __mem_open(struct inode *inode, struct file *file, unsigned int mode) +{ + struct mm_struct *mm = proc_mem_open(inode, mode); if (IS_ERR(mm)) return PTR_ERR(mm); - if (mm) { - /* ensure this mm_struct can't be freed */ - atomic_inc(&mm->mm_count); - /* but do not pin its memory */ - mmput(mm); - } - file->private_data = mm; - return 0; } @@ -1590,7 +1565,6 @@ int pid_revalidate(struct dentry *dentry, unsigned int flags) put_task_struct(task); return 1; } - d_drop(dentry); return 0; } @@ -1727,9 +1701,6 @@ out: put_task_struct(task); out_notask: - if (status <= 0) - d_drop(dentry); - return status; } @@ -2573,10 +2544,10 @@ static const struct pid_entry tgid_base_stuff[] = { REG("latency", S_IRUGO, proc_lstats_operations), #endif #ifdef CONFIG_PROC_PID_CPUSET - REG("cpuset", S_IRUGO, proc_cpuset_operations), + ONE("cpuset", S_IRUGO, proc_cpuset_show), #endif #ifdef CONFIG_CGROUPS - REG("cgroup", S_IRUGO, proc_cgroup_operations), + ONE("cgroup", S_IRUGO, proc_cgroup_show), #endif ONE("oom_score", S_IRUGO, proc_oom_score), REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations), @@ -2643,8 +2614,7 @@ static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid) /* no ->d_hash() rejects on procfs */ dentry = d_hash_and_lookup(mnt->mnt_root, &name); if (dentry) { - shrink_dcache_parent(dentry); - d_drop(dentry); + d_invalidate(dentry); dput(dentry); } @@ -2664,8 +2634,7 @@ static void proc_flush_task_mnt(struct vfsmount *mnt, pid_t pid, pid_t tgid) name.len = snprintf(buf, sizeof(buf), "%d", pid); dentry = d_hash_and_lookup(dir, &name); if (dentry) { - shrink_dcache_parent(dentry); - d_drop(dentry); + d_invalidate(dentry); dput(dentry); } @@ -2919,10 +2888,10 @@ static const struct pid_entry tid_base_stuff[] = { REG("latency", S_IRUGO, proc_lstats_operations), #endif #ifdef CONFIG_PROC_PID_CPUSET - REG("cpuset", S_IRUGO, proc_cpuset_operations), + ONE("cpuset", S_IRUGO, proc_cpuset_show), #endif #ifdef CONFIG_CGROUPS - REG("cgroup", S_IRUGO, proc_cgroup_operations), + ONE("cgroup", S_IRUGO, proc_cgroup_show), #endif ONE("oom_score", S_IRUGO, proc_oom_score), REG("oom_adj", S_IRUGO|S_IWUSR, proc_oom_adj_operations), diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 955bb55fab8c..e11d7c590bb0 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -129,8 +129,6 @@ static int tid_fd_revalidate(struct dentry *dentry, unsigned int flags) } put_task_struct(task); } - - d_drop(dentry); return 0; } diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 7da13e49128a..aa7a0ee182e1 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -268,8 +268,9 @@ extern int proc_remount(struct super_block *, int *, char *); * task_[no]mmu.c */ struct proc_maps_private { - struct pid *pid; + struct inode *inode; struct task_struct *task; + struct mm_struct *mm; #ifdef CONFIG_MMU struct vm_area_struct *tail_vma; #endif @@ -278,6 +279,8 @@ struct proc_maps_private { #endif }; +struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode); + extern const struct file_operations proc_pid_maps_operations; extern const struct file_operations proc_tid_maps_operations; extern const struct file_operations proc_pid_numa_maps_operations; diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 6df8d0722c97..91a4e6426321 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -610,8 +610,10 @@ static void __init proc_kcore_text_init(void) struct kcore_list kcore_modules; static void __init add_modules_range(void) { - kclist_add(&kcore_modules, (void *)MODULES_VADDR, + if (MODULES_VADDR != VMALLOC_START && MODULES_END != VMALLOC_END) { + kclist_add(&kcore_modules, (void *)MODULES_VADDR, MODULES_END - MODULES_VADDR, KCORE_VMALLOC); + } } #else static void __init add_modules_range(void) diff --git a/fs/proc/page.c b/fs/proc/page.c index e647c55275d9..1e3187da1fed 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -133,6 +133,9 @@ u64 stable_page_flags(struct page *page) if (PageBuddy(page)) u |= 1 << KPF_BUDDY; + if (PageBalloon(page)) + u |= 1 << KPF_BALLOON; + u |= kpf_copy_bit(k, KPF_LOCKED, PG_locked); u |= kpf_copy_bit(k, KPF_SLAB, PG_slab); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index c34156888d70..4e0388cffe3d 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -87,32 +87,14 @@ unsigned long task_statm(struct mm_struct *mm, #ifdef CONFIG_NUMA /* - * These functions are for numa_maps but called in generic **maps seq_file - * ->start(), ->stop() ops. - * - * numa_maps scans all vmas under mmap_sem and checks their mempolicy. - * Each mempolicy object is controlled by reference counting. The problem here - * is how to avoid accessing dead mempolicy object. - * - * Because we're holding mmap_sem while reading seq_file, it's safe to access - * each vma's mempolicy, no vma objects will never drop refs to mempolicy. - * - * A task's mempolicy (task->mempolicy) has different behavior. task->mempolicy - * is set and replaced under mmap_sem but unrefed and cleared under task_lock(). - * So, without task_lock(), we cannot trust get_vma_policy() because we cannot - * gurantee the task never exits under us. But taking task_lock() around - * get_vma_plicy() causes lock order problem. - * - * To access task->mempolicy without lock, we hold a reference count of an - * object pointed by task->mempolicy and remember it. This will guarantee - * that task->mempolicy points to an alive object or NULL in numa_maps accesses. + * Save get_task_policy() for show_numa_map(). */ static void hold_task_mempolicy(struct proc_maps_private *priv) { struct task_struct *task = priv->task; task_lock(task); - priv->task_mempolicy = task->mempolicy; + priv->task_mempolicy = get_task_policy(task); mpol_get(priv->task_mempolicy); task_unlock(task); } @@ -129,124 +111,154 @@ static void release_task_mempolicy(struct proc_maps_private *priv) } #endif -static void vma_stop(struct proc_maps_private *priv, struct vm_area_struct *vma) +static void vma_stop(struct proc_maps_private *priv) { - if (vma && vma != priv->tail_vma) { - struct mm_struct *mm = vma->vm_mm; - release_task_mempolicy(priv); - up_read(&mm->mmap_sem); - mmput(mm); - } + struct mm_struct *mm = priv->mm; + + release_task_mempolicy(priv); + up_read(&mm->mmap_sem); + mmput(mm); +} + +static struct vm_area_struct * +m_next_vma(struct proc_maps_private *priv, struct vm_area_struct *vma) +{ + if (vma == priv->tail_vma) + return NULL; + return vma->vm_next ?: priv->tail_vma; } -static void *m_start(struct seq_file *m, loff_t *pos) +static void m_cache_vma(struct seq_file *m, struct vm_area_struct *vma) +{ + if (m->count < m->size) /* vma is copied successfully */ + m->version = m_next_vma(m->private, vma) ? vma->vm_start : -1UL; +} + +static void *m_start(struct seq_file *m, loff_t *ppos) { struct proc_maps_private *priv = m->private; unsigned long last_addr = m->version; struct mm_struct *mm; - struct vm_area_struct *vma, *tail_vma = NULL; - loff_t l = *pos; - - /* Clear the per syscall fields in priv */ - priv->task = NULL; - priv->tail_vma = NULL; - - /* - * We remember last_addr rather than next_addr to hit with - * vmacache most of the time. We have zero last_addr at - * the beginning and also after lseek. We will have -1 last_addr - * after the end of the vmas. - */ + struct vm_area_struct *vma; + unsigned int pos = *ppos; + /* See m_cache_vma(). Zero at the start or after lseek. */ if (last_addr == -1UL) return NULL; - priv->task = get_pid_task(priv->pid, PIDTYPE_PID); + priv->task = get_proc_task(priv->inode); if (!priv->task) return ERR_PTR(-ESRCH); - mm = mm_access(priv->task, PTRACE_MODE_READ); - if (!mm || IS_ERR(mm)) - return mm; - down_read(&mm->mmap_sem); + mm = priv->mm; + if (!mm || !atomic_inc_not_zero(&mm->mm_users)) + return NULL; - tail_vma = get_gate_vma(priv->task->mm); - priv->tail_vma = tail_vma; + down_read(&mm->mmap_sem); hold_task_mempolicy(priv); - /* Start with last addr hint */ - vma = find_vma(mm, last_addr); - if (last_addr && vma) { - vma = vma->vm_next; - goto out; + priv->tail_vma = get_gate_vma(mm); + + if (last_addr) { + vma = find_vma(mm, last_addr); + if (vma && (vma = m_next_vma(priv, vma))) + return vma; } - /* - * Check the vma index is within the range and do - * sequential scan until m_index. - */ - vma = NULL; - if ((unsigned long)l < mm->map_count) { - vma = mm->mmap; - while (l-- && vma) + m->version = 0; + if (pos < mm->map_count) { + for (vma = mm->mmap; pos; pos--) { + m->version = vma->vm_start; vma = vma->vm_next; - goto out; + } + return vma; } - if (l != mm->map_count) - tail_vma = NULL; /* After gate vma */ + /* we do not bother to update m->version in this case */ + if (pos == mm->map_count && priv->tail_vma) + return priv->tail_vma; -out: - if (vma) - return vma; - - release_task_mempolicy(priv); - /* End of vmas has been reached */ - m->version = (tail_vma != NULL)? 0: -1UL; - up_read(&mm->mmap_sem); - mmput(mm); - return tail_vma; + vma_stop(priv); + return NULL; } static void *m_next(struct seq_file *m, void *v, loff_t *pos) { struct proc_maps_private *priv = m->private; - struct vm_area_struct *vma = v; - struct vm_area_struct *tail_vma = priv->tail_vma; + struct vm_area_struct *next; (*pos)++; - if (vma && (vma != tail_vma) && vma->vm_next) - return vma->vm_next; - vma_stop(priv, vma); - return (vma != tail_vma)? tail_vma: NULL; + next = m_next_vma(priv, v); + if (!next) + vma_stop(priv); + return next; } static void m_stop(struct seq_file *m, void *v) { struct proc_maps_private *priv = m->private; - struct vm_area_struct *vma = v; - if (!IS_ERR(vma)) - vma_stop(priv, vma); - if (priv->task) + if (!IS_ERR_OR_NULL(v)) + vma_stop(priv); + if (priv->task) { put_task_struct(priv->task); + priv->task = NULL; + } +} + +static int proc_maps_open(struct inode *inode, struct file *file, + const struct seq_operations *ops, int psize) +{ + struct proc_maps_private *priv = __seq_open_private(file, ops, psize); + + if (!priv) + return -ENOMEM; + + priv->inode = inode; + priv->mm = proc_mem_open(inode, PTRACE_MODE_READ); + if (IS_ERR(priv->mm)) { + int err = PTR_ERR(priv->mm); + + seq_release_private(inode, file); + return err; + } + + return 0; +} + +static int proc_map_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + struct proc_maps_private *priv = seq->private; + + if (priv->mm) + mmdrop(priv->mm); + + return seq_release_private(inode, file); } static int do_maps_open(struct inode *inode, struct file *file, const struct seq_operations *ops) { - struct proc_maps_private *priv; - int ret = -ENOMEM; - priv = kzalloc(sizeof(*priv), GFP_KERNEL); - if (priv) { - priv->pid = proc_pid(inode); - ret = seq_open(file, ops); - if (!ret) { - struct seq_file *m = file->private_data; - m->private = priv; - } else { - kfree(priv); - } + return proc_maps_open(inode, file, ops, + sizeof(struct proc_maps_private)); +} + +static pid_t pid_of_stack(struct proc_maps_private *priv, + struct vm_area_struct *vma, bool is_pid) +{ + struct inode *inode = priv->inode; + struct task_struct *task; + pid_t ret = 0; + + rcu_read_lock(); + task = pid_task(proc_pid(inode), PIDTYPE_PID); + if (task) { + task = task_of_stack(task, vma, is_pid); + if (task) + ret = task_pid_nr_ns(task, inode->i_sb->s_fs_info); } + rcu_read_unlock(); + return ret; } @@ -256,7 +268,6 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid) struct mm_struct *mm = vma->vm_mm; struct file *file = vma->vm_file; struct proc_maps_private *priv = m->private; - struct task_struct *task = priv->task; vm_flags_t flags = vma->vm_flags; unsigned long ino = 0; unsigned long long pgoff = 0; @@ -321,8 +332,7 @@ show_map_vma(struct seq_file *m, struct vm_area_struct *vma, int is_pid) goto done; } - tid = vm_is_stack(task, vma, is_pid); - + tid = pid_of_stack(priv, vma, is_pid); if (tid != 0) { /* * Thread stack in /proc/PID/task/TID/maps or @@ -349,15 +359,8 @@ done: static int show_map(struct seq_file *m, void *v, int is_pid) { - struct vm_area_struct *vma = v; - struct proc_maps_private *priv = m->private; - struct task_struct *task = priv->task; - - show_map_vma(m, vma, is_pid); - - if (m->count < m->size) /* vma is copied successfully */ - m->version = (vma != get_gate_vma(task->mm)) - ? vma->vm_start : 0; + show_map_vma(m, v, is_pid); + m_cache_vma(m, v); return 0; } @@ -399,14 +402,14 @@ const struct file_operations proc_pid_maps_operations = { .open = pid_maps_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release_private, + .release = proc_map_release, }; const struct file_operations proc_tid_maps_operations = { .open = tid_maps_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release_private, + .release = proc_map_release, }; /* @@ -583,8 +586,6 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma) static int show_smap(struct seq_file *m, void *v, int is_pid) { - struct proc_maps_private *priv = m->private; - struct task_struct *task = priv->task; struct vm_area_struct *vma = v; struct mem_size_stats mss; struct mm_walk smaps_walk = { @@ -637,10 +638,7 @@ static int show_smap(struct seq_file *m, void *v, int is_pid) mss.nonlinear >> 10); show_smap_vma_flags(m, vma); - - if (m->count < m->size) /* vma is copied successfully */ - m->version = (vma != get_gate_vma(task->mm)) - ? vma->vm_start : 0; + m_cache_vma(m, vma); return 0; } @@ -682,14 +680,14 @@ const struct file_operations proc_pid_smaps_operations = { .open = pid_smaps_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release_private, + .release = proc_map_release, }; const struct file_operations proc_tid_smaps_operations = { .open = tid_smaps_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release_private, + .release = proc_map_release, }; /* @@ -829,8 +827,21 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, .private = &cp, }; down_read(&mm->mmap_sem); - if (type == CLEAR_REFS_SOFT_DIRTY) + if (type == CLEAR_REFS_SOFT_DIRTY) { + for (vma = mm->mmap; vma; vma = vma->vm_next) { + if (!(vma->vm_flags & VM_SOFTDIRTY)) + continue; + up_read(&mm->mmap_sem); + down_write(&mm->mmap_sem); + for (vma = mm->mmap; vma; vma = vma->vm_next) { + vma->vm_flags &= ~VM_SOFTDIRTY; + vma_set_page_prot(vma); + } + downgrade_write(&mm->mmap_sem); + break; + } mmu_notifier_invalidate_range_start(mm, 0, -1); + } for (vma = mm->mmap; vma; vma = vma->vm_next) { cp.vma = vma; if (is_vm_hugetlb_page(vma)) @@ -850,10 +861,6 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, continue; if (type == CLEAR_REFS_MAPPED && !vma->vm_file) continue; - if (type == CLEAR_REFS_SOFT_DIRTY) { - if (vma->vm_flags & VM_SOFTDIRTY) - vma->vm_flags &= ~VM_SOFTDIRTY; - } walk_page_range(vma->vm_start, vma->vm_end, &clear_refs_walk); } @@ -1029,7 +1036,6 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, spinlock_t *ptl; pte_t *pte; int err = 0; - pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2)); /* find the first VMA at or above 'addr' */ vma = find_vma(walk->mm, addr); @@ -1043,6 +1049,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, for (; addr != end; addr += PAGE_SIZE) { unsigned long offset; + pagemap_entry_t pme; offset = (addr & ~PAGEMAP_WALK_MASK) >> PAGE_SHIFT; @@ -1057,32 +1064,51 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, if (pmd_trans_unstable(pmd)) return 0; - for (; addr != end; addr += PAGE_SIZE) { - int flags2; - - /* check to see if we've left 'vma' behind - * and need a new, higher one */ - if (vma && (addr >= vma->vm_end)) { - vma = find_vma(walk->mm, addr); - if (vma && (vma->vm_flags & VM_SOFTDIRTY)) - flags2 = __PM_SOFT_DIRTY; - else - flags2 = 0; - pme = make_pme(PM_NOT_PRESENT(pm->v2) | PM_STATUS2(pm->v2, flags2)); + + while (1) { + /* End of address space hole, which we mark as non-present. */ + unsigned long hole_end; + + if (vma) + hole_end = min(end, vma->vm_start); + else + hole_end = end; + + for (; addr < hole_end; addr += PAGE_SIZE) { + pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2)); + + err = add_to_pagemap(addr, &pme, pm); + if (err) + return err; } - /* check that 'vma' actually covers this address, - * and that it isn't a huge page vma */ - if (vma && (vma->vm_start <= addr) && - !is_vm_hugetlb_page(vma)) { + if (!vma || vma->vm_start >= end) + break; + /* + * We can't possibly be in a hugetlb VMA. In general, + * for a mm_walk with a pmd_entry and a hugetlb_entry, + * the pmd_entry can only be called on addresses in a + * hugetlb if the walk starts in a non-hugetlb VMA and + * spans a hugepage VMA. Since pagemap_read walks are + * PMD-sized and PMD-aligned, this will never be true. + */ + BUG_ON(is_vm_hugetlb_page(vma)); + + /* Addresses in the VMA. */ + for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) { + pagemap_entry_t pme; pte = pte_offset_map(pmd, addr); pte_to_pagemap_entry(&pme, pm, vma, addr, *pte); - /* unmap before userspace copy */ pte_unmap(pte); + err = add_to_pagemap(addr, &pme, pm); + if (err) + return err; } - err = add_to_pagemap(addr, &pme, pm); - if (err) - return err; + + if (addr == end) + break; + + vma = find_vma(walk->mm, addr); } cond_resched(); @@ -1415,7 +1441,6 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid) struct vm_area_struct *vma = v; struct numa_maps *md = &numa_priv->md; struct file *file = vma->vm_file; - struct task_struct *task = proc_priv->task; struct mm_struct *mm = vma->vm_mm; struct mm_walk walk = {}; struct mempolicy *pol; @@ -1435,9 +1460,13 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid) walk.private = md; walk.mm = mm; - pol = get_vma_policy(task, vma, vma->vm_start); - mpol_to_str(buffer, sizeof(buffer), pol); - mpol_cond_put(pol); + pol = __get_vma_policy(vma, vma->vm_start); + if (pol) { + mpol_to_str(buffer, sizeof(buffer), pol); + mpol_cond_put(pol); + } else { + mpol_to_str(buffer, sizeof(buffer), proc_priv->task_mempolicy); + } seq_printf(m, "%08lx %s", vma->vm_start, buffer); @@ -1447,7 +1476,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid) } else if (vma->vm_start <= mm->brk && vma->vm_end >= mm->start_brk) { seq_puts(m, " heap"); } else { - pid_t tid = vm_is_stack(task, vma, is_pid); + pid_t tid = pid_of_stack(proc_priv, vma, is_pid); if (tid != 0) { /* * Thread stack in /proc/PID/task/TID/maps or @@ -1495,9 +1524,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid) seq_printf(m, " N%d=%lu", nid, md->node[nid]); out: seq_putc(m, '\n'); - - if (m->count < m->size) - m->version = (vma != proc_priv->tail_vma) ? vma->vm_start : 0; + m_cache_vma(m, vma); return 0; } @@ -1528,20 +1555,8 @@ static const struct seq_operations proc_tid_numa_maps_op = { static int numa_maps_open(struct inode *inode, struct file *file, const struct seq_operations *ops) { - struct numa_maps_private *priv; - int ret = -ENOMEM; - priv = kzalloc(sizeof(*priv), GFP_KERNEL); - if (priv) { - priv->proc_maps.pid = proc_pid(inode); - ret = seq_open(file, ops); - if (!ret) { - struct seq_file *m = file->private_data; - m->private = priv; - } else { - kfree(priv); - } - } - return ret; + return proc_maps_open(inode, file, ops, + sizeof(struct numa_maps_private)); } static int pid_numa_maps_open(struct inode *inode, struct file *file) @@ -1558,13 +1573,13 @@ const struct file_operations proc_pid_numa_maps_operations = { .open = pid_numa_maps_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release_private, + .release = proc_map_release, }; const struct file_operations proc_tid_numa_maps_operations = { .open = tid_numa_maps_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release_private, + .release = proc_map_release, }; #endif /* CONFIG_NUMA */ diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index 678455d2d683..599ec2e20104 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -123,6 +123,25 @@ unsigned long task_statm(struct mm_struct *mm, return size; } +static pid_t pid_of_stack(struct proc_maps_private *priv, + struct vm_area_struct *vma, bool is_pid) +{ + struct inode *inode = priv->inode; + struct task_struct *task; + pid_t ret = 0; + + rcu_read_lock(); + task = pid_task(proc_pid(inode), PIDTYPE_PID); + if (task) { + task = task_of_stack(task, vma, is_pid); + if (task) + ret = task_pid_nr_ns(task, inode->i_sb->s_fs_info); + } + rcu_read_unlock(); + + return ret; +} + /* * display a single VMA to a sequenced file */ @@ -163,7 +182,7 @@ static int nommu_vma_show(struct seq_file *m, struct vm_area_struct *vma, seq_pad(m, ' '); seq_path(m, &file->f_path, ""); } else if (mm) { - pid_t tid = vm_is_stack(priv->task, vma, is_pid); + pid_t tid = pid_of_stack(priv, vma, is_pid); if (tid != 0) { seq_pad(m, ' '); @@ -212,22 +231,22 @@ static void *m_start(struct seq_file *m, loff_t *pos) loff_t n = *pos; /* pin the task and mm whilst we play with them */ - priv->task = get_pid_task(priv->pid, PIDTYPE_PID); + priv->task = get_proc_task(priv->inode); if (!priv->task) return ERR_PTR(-ESRCH); - mm = mm_access(priv->task, PTRACE_MODE_READ); - if (!mm || IS_ERR(mm)) { - put_task_struct(priv->task); - priv->task = NULL; - return mm; - } - down_read(&mm->mmap_sem); + mm = priv->mm; + if (!mm || !atomic_inc_not_zero(&mm->mm_users)) + return NULL; + down_read(&mm->mmap_sem); /* start from the Nth VMA */ for (p = rb_first(&mm->mm_rb); p; p = rb_next(p)) if (n-- == 0) return p; + + up_read(&mm->mmap_sem); + mmput(mm); return NULL; } @@ -235,11 +254,13 @@ static void m_stop(struct seq_file *m, void *_vml) { struct proc_maps_private *priv = m->private; + if (!IS_ERR_OR_NULL(_vml)) { + up_read(&priv->mm->mmap_sem); + mmput(priv->mm); + } if (priv->task) { - struct mm_struct *mm = priv->task->mm; - up_read(&mm->mmap_sem); - mmput(mm); put_task_struct(priv->task); + priv->task = NULL; } } @@ -269,20 +290,33 @@ static int maps_open(struct inode *inode, struct file *file, const struct seq_operations *ops) { struct proc_maps_private *priv; - int ret = -ENOMEM; - - priv = kzalloc(sizeof(*priv), GFP_KERNEL); - if (priv) { - priv->pid = proc_pid(inode); - ret = seq_open(file, ops); - if (!ret) { - struct seq_file *m = file->private_data; - m->private = priv; - } else { - kfree(priv); - } + + priv = __seq_open_private(file, ops, sizeof(*priv)); + if (!priv) + return -ENOMEM; + + priv->inode = inode; + priv->mm = proc_mem_open(inode, PTRACE_MODE_READ); + if (IS_ERR(priv->mm)) { + int err = PTR_ERR(priv->mm); + + seq_release_private(inode, file); + return err; } - return ret; + + return 0; +} + + +static int map_release(struct inode *inode, struct file *file) +{ + struct seq_file *seq = file->private_data; + struct proc_maps_private *priv = seq->private; + + if (priv->mm) + mmdrop(priv->mm); + + return seq_release_private(inode, file); } static int pid_maps_open(struct inode *inode, struct file *file) @@ -299,13 +333,13 @@ const struct file_operations proc_pid_maps_operations = { .open = pid_maps_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release_private, + .release = map_release, }; const struct file_operations proc_tid_maps_operations = { .open = tid_maps_open, .read = seq_read, .llseek = seq_lseek, - .release = seq_release_private, + .release = map_release, }; diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 192297b0090d..fafb7a02a5d6 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -320,10 +320,10 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count, compressed ? ".enc.z" : ""); break; case PSTORE_TYPE_CONSOLE: - sprintf(name, "console-%s", psname); + sprintf(name, "console-%s-%lld", psname, id); break; case PSTORE_TYPE_FTRACE: - sprintf(name, "ftrace-%s", psname); + sprintf(name, "ftrace-%s-%lld", psname, id); break; case PSTORE_TYPE_MCE: sprintf(name, "mce-%s-%lld", psname, id); diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index f2d0eee9d1f1..8b663b2d9562 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -2725,7 +2725,7 @@ static int __init dquot_init(void) panic("Cannot create dquot hash table"); for (i = 0; i < _DQST_DQSTAT_LAST; i++) { - ret = percpu_counter_init(&dqstats.counter[i], 0); + ret = percpu_counter_init(&dqstats.counter[i], 0, GFP_KERNEL); if (ret) panic("Cannot create dquot stat counters"); } diff --git a/fs/read_write.c b/fs/read_write.c index 009d8542a889..7d9318c3d43c 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -513,6 +513,8 @@ ssize_t __kernel_write(struct file *file, const char *buf, size_t count, loff_t return ret; } +EXPORT_SYMBOL(__kernel_write); + ssize_t vfs_write(struct file *file, const char __user *buf, size_t count, loff_t *pos) { ssize_t ret; diff --git a/fs/reiserfs/journal.c b/fs/reiserfs/journal.c index a88b1b3e7db3..d571e173a990 100644 --- a/fs/reiserfs/journal.c +++ b/fs/reiserfs/journal.c @@ -699,11 +699,13 @@ static int add_to_chunk(struct buffer_chunk *chunk, struct buffer_head *bh, chunk->bh[chunk->nr++] = bh; if (chunk->nr >= CHUNK_SIZE) { ret = 1; - if (lock) + if (lock) { spin_unlock(lock); - fn(chunk); - if (lock) + fn(chunk); spin_lock(lock); + } else { + fn(chunk); + } } return ret; } diff --git a/fs/reiserfs/reiserfs.h b/fs/reiserfs/reiserfs.h index 735c2c2b4536..1894d96ccb7c 100644 --- a/fs/reiserfs/reiserfs.h +++ b/fs/reiserfs/reiserfs.h @@ -506,6 +506,9 @@ typedef struct reiserfs_proc_info_data { } reiserfs_proc_info_data_t; #endif +/* Number of quota types we support */ +#define REISERFS_MAXQUOTAS 2 + /* reiserfs union of in-core super block data */ struct reiserfs_sb_info { /* Buffer containing the super block */ @@ -615,7 +618,7 @@ struct reiserfs_sb_info { spinlock_t old_work_lock; /* protects old_work and work_queued */ #ifdef CONFIG_QUOTA - char *s_qf_names[MAXQUOTAS]; + char *s_qf_names[REISERFS_MAXQUOTAS]; int s_jquota_fmt; #endif char *s_jdev; /* Stored jdev for mount option showing */ diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index d46e88a33b02..f1376c92cf74 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -206,7 +206,7 @@ static int finish_unfinished(struct super_block *s) #ifdef CONFIG_QUOTA int i; int ms_active_set; - int quota_enabled[MAXQUOTAS]; + int quota_enabled[REISERFS_MAXQUOTAS]; #endif /* compose key to look for "save" links */ @@ -227,7 +227,7 @@ static int finish_unfinished(struct super_block *s) s->s_flags |= MS_ACTIVE; } /* Turn on quotas so that they are updated correctly */ - for (i = 0; i < MAXQUOTAS; i++) { + for (i = 0; i < REISERFS_MAXQUOTAS; i++) { quota_enabled[i] = 1; if (REISERFS_SB(s)->s_qf_names[i]) { int ret; @@ -370,7 +370,7 @@ static int finish_unfinished(struct super_block *s) #ifdef CONFIG_QUOTA /* Turn quotas off */ reiserfs_write_unlock(s); - for (i = 0; i < MAXQUOTAS; i++) { + for (i = 0; i < REISERFS_MAXQUOTAS; i++) { if (sb_dqopt(s)->files[i] && quota_enabled[i]) dquot_quota_off(s, i); } @@ -1360,7 +1360,7 @@ static void handle_quota_files(struct super_block *s, char **qf_names, { int i; - for (i = 0; i < MAXQUOTAS; i++) { + for (i = 0; i < REISERFS_MAXQUOTAS; i++) { if (qf_names[i] != REISERFS_SB(s)->s_qf_names[i]) kfree(REISERFS_SB(s)->s_qf_names[i]); REISERFS_SB(s)->s_qf_names[i] = qf_names[i]; @@ -1381,7 +1381,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) struct reiserfs_journal *journal = SB_JOURNAL(s); char *new_opts = kstrdup(arg, GFP_KERNEL); int err; - char *qf_names[MAXQUOTAS]; + char *qf_names[REISERFS_MAXQUOTAS]; unsigned int qfmt = 0; #ifdef CONFIG_QUOTA int i; @@ -1400,7 +1400,7 @@ static int reiserfs_remount(struct super_block *s, int *mount_flags, char *arg) (s, arg, &mount_options, &blocks, NULL, &commit_max_age, qf_names, &qfmt)) { #ifdef CONFIG_QUOTA - for (i = 0; i < MAXQUOTAS; i++) + for (i = 0; i < REISERFS_MAXQUOTAS; i++) if (qf_names[i] != REISERFS_SB(s)->s_qf_names[i]) kfree(qf_names[i]); #endif @@ -1844,7 +1844,7 @@ static int reiserfs_fill_super(struct super_block *s, void *data, int silent) char *jdev_name; struct reiserfs_sb_info *sbi; int errval = -EINVAL; - char *qf_names[MAXQUOTAS] = {}; + char *qf_names[REISERFS_MAXQUOTAS] = {}; unsigned int qfmt = 0; save_mount_options(s, data); @@ -2169,7 +2169,7 @@ error_unlocked: #ifdef CONFIG_QUOTA { int j; - for (j = 0; j < MAXQUOTAS; j++) + for (j = 0; j < REISERFS_MAXQUOTAS; j++) kfree(qf_names[j]); } #endif diff --git a/fs/reiserfs/xattr.h b/fs/reiserfs/xattr.h index 857ec7e3016f..f620e9678dd5 100644 --- a/fs/reiserfs/xattr.h +++ b/fs/reiserfs/xattr.h @@ -7,7 +7,6 @@ struct inode; struct dentry; struct iattr; struct super_block; -struct nameidata; int reiserfs_xattr_register_handlers(void) __init; void reiserfs_xattr_unregister_handlers(void); diff --git a/fs/splice.c b/fs/splice.c index f5cb9ba84510..75c6058eabf2 100644 --- a/fs/splice.c +++ b/fs/splice.c @@ -1330,6 +1330,7 @@ long do_splice_direct(struct file *in, loff_t *ppos, struct file *out, return ret; } +EXPORT_SYMBOL(do_splice_direct); static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, struct pipe_inode_info *opipe, diff --git a/fs/stack.c b/fs/stack.c index 5b5388250e29..a54e33ed10f1 100644 --- a/fs/stack.c +++ b/fs/stack.c @@ -44,7 +44,7 @@ void fsstack_copy_inode_size(struct inode *dst, struct inode *src) * include/linux/fs.h). We don't necessarily hold i_mutex when this * is called, so take i_lock for that case. * - * And if CONFIG_LBADF (on 32-bit), continue our effort to keep the + * And if CONFIG_LBDAF (on 32-bit), continue our effort to keep the * two halves of i_blocks in sync despite SMP or PREEMPT: use i_lock * for that case too, and do both at once by combining the tests. * diff --git a/fs/super.c b/fs/super.c index b9a214d2fe98..eae088f6aaae 100644 --- a/fs/super.c +++ b/fs/super.c @@ -80,6 +80,8 @@ static unsigned long super_cache_scan(struct shrinker *shrink, inodes = list_lru_count_node(&sb->s_inode_lru, sc->nid); dentries = list_lru_count_node(&sb->s_dentry_lru, sc->nid); total_objects = dentries + inodes + fs_objects + 1; + if (!total_objects) + total_objects = 1; /* proportion the scan between the caches */ dentries = mult_frac(sc->nr_to_scan, dentries, total_objects); @@ -175,7 +177,8 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags) goto fail; for (i = 0; i < SB_FREEZE_LEVELS; i++) { - if (percpu_counter_init(&s->s_writers.counter[i], 0) < 0) + if (percpu_counter_init(&s->s_writers.counter[i], 0, + GFP_KERNEL) < 0) goto fail; lockdep_init_map(&s->s_writers.lock_map[i], sb_writers_name[i], &type->s_writers_key[i], 0); diff --git a/fs/timerfd.c b/fs/timerfd.c index 80c350216ea8..b46ffa94372a 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -333,8 +333,7 @@ static long timerfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg spin_lock_irq(&ctx->wqh.lock); if (!timerfd_canceled(ctx)) { ctx->ticks = ticks; - if (ticks) - wake_up_locked(&ctx->wqh); + wake_up_locked(&ctx->wqh); } else ret = -ECANCELED; spin_unlock_irq(&ctx->wqh.lock); diff --git a/fs/ubifs/commit.c b/fs/ubifs/commit.c index aa13ad053b14..26b69b2d4a45 100644 --- a/fs/ubifs/commit.c +++ b/fs/ubifs/commit.c @@ -166,10 +166,6 @@ static int do_commit(struct ubifs_info *c) err = ubifs_orphan_end_commit(c); if (err) goto out; - old_ltail_lnum = c->ltail_lnum; - err = ubifs_log_end_commit(c, new_ltail_lnum); - if (err) - goto out; err = dbg_check_old_index(c, &zroot); if (err) goto out; @@ -202,7 +198,9 @@ static int do_commit(struct ubifs_info *c) c->mst_node->flags |= cpu_to_le32(UBIFS_MST_NO_ORPHS); else c->mst_node->flags &= ~cpu_to_le32(UBIFS_MST_NO_ORPHS); - err = ubifs_write_master(c); + + old_ltail_lnum = c->ltail_lnum; + err = ubifs_log_end_commit(c, new_ltail_lnum); if (err) goto out; diff --git a/fs/ubifs/debug.c b/fs/ubifs/debug.c index 177b0152fef4..7ed13e1e216a 100644 --- a/fs/ubifs/debug.c +++ b/fs/ubifs/debug.c @@ -334,9 +334,9 @@ void ubifs_dump_node(const struct ubifs_info *c, const void *node) pr_err("\tkey_fmt %d (%s)\n", (int)sup->key_fmt, get_key_fmt(sup->key_fmt)); pr_err("\tflags %#x\n", sup_flags); - pr_err("\t big_lpt %u\n", + pr_err("\tbig_lpt %u\n", !!(sup_flags & UBIFS_FLG_BIGLPT)); - pr_err("\t space_fixup %u\n", + pr_err("\tspace_fixup %u\n", !!(sup_flags & UBIFS_FLG_SPACE_FIXUP)); pr_err("\tmin_io_size %u\n", le32_to_cpu(sup->min_io_size)); pr_err("\tleb_size %u\n", le32_to_cpu(sup->leb_size)); @@ -2462,7 +2462,7 @@ static int power_cut_emulated(struct ubifs_info *c, int lnum, int write) if (chance(1, 2)) { d->pc_delay = 1; - /* Fail withing 1 minute */ + /* Fail within 1 minute */ delay = prandom_u32() % 60000; d->pc_timeout = jiffies; d->pc_timeout += msecs_to_jiffies(delay); diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index 0e045e75abd8..fb166e204441 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -546,15 +546,14 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir, int aligned_dlen, aligned_ilen, sync = IS_DIRSYNC(dir); int last_reference = !!(deletion && inode->i_nlink == 0); struct ubifs_inode *ui = ubifs_inode(inode); - struct ubifs_inode *dir_ui = ubifs_inode(dir); + struct ubifs_inode *host_ui = ubifs_inode(dir); struct ubifs_dent_node *dent; struct ubifs_ino_node *ino; union ubifs_key dent_key, ino_key; dbg_jnl("ino %lu, dent '%.*s', data len %d in dir ino %lu", inode->i_ino, nm->len, nm->name, ui->data_len, dir->i_ino); - ubifs_assert(dir_ui->data_len == 0); - ubifs_assert(mutex_is_locked(&dir_ui->ui_mutex)); + ubifs_assert(mutex_is_locked(&host_ui->ui_mutex)); dlen = UBIFS_DENT_NODE_SZ + nm->len + 1; ilen = UBIFS_INO_NODE_SZ; @@ -658,7 +657,7 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir, ui->synced_i_size = ui->ui_size; spin_unlock(&ui->ui_lock); mark_inode_clean(c, ui); - mark_inode_clean(c, dir_ui); + mark_inode_clean(c, host_ui); return 0; out_finish: diff --git a/fs/ubifs/log.c b/fs/ubifs/log.c index a47ddfc9be6b..c14628fbeee2 100644 --- a/fs/ubifs/log.c +++ b/fs/ubifs/log.c @@ -106,10 +106,14 @@ static inline long long empty_log_bytes(const struct ubifs_info *c) h = (long long)c->lhead_lnum * c->leb_size + c->lhead_offs; t = (long long)c->ltail_lnum * c->leb_size; - if (h >= t) + if (h > t) return c->log_bytes - h + t; - else + else if (h != t) return t - h; + else if (c->lhead_lnum != c->ltail_lnum) + return 0; + else + return c->log_bytes; } /** @@ -447,9 +451,9 @@ out: * @ltail_lnum: new log tail LEB number * * This function is called on when the commit operation was finished. It - * moves log tail to new position and unmaps LEBs which contain obsolete data. - * Returns zero in case of success and a negative error code in case of - * failure. + * moves log tail to new position and updates the master node so that it stores + * the new log tail LEB number. Returns zero in case of success and a negative + * error code in case of failure. */ int ubifs_log_end_commit(struct ubifs_info *c, int ltail_lnum) { @@ -477,7 +481,12 @@ int ubifs_log_end_commit(struct ubifs_info *c, int ltail_lnum) spin_unlock(&c->buds_lock); err = dbg_check_bud_bytes(c); + if (err) + goto out; + err = ubifs_write_master(c); + +out: mutex_unlock(&c->log_mutex); return err; } diff --git a/fs/udf/file.c b/fs/udf/file.c index 86c6743ec1fe..bb15771b92ae 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -223,11 +223,18 @@ out: static int udf_release_file(struct inode *inode, struct file *filp) { - if (filp->f_mode & FMODE_WRITE) { + if (filp->f_mode & FMODE_WRITE && + atomic_read(&inode->i_writecount) > 1) { + /* + * Grab i_mutex to avoid races with writes changing i_size + * while we are running. + */ + mutex_lock(&inode->i_mutex); down_write(&UDF_I(inode)->i_data_sem); udf_discard_prealloc(inode); udf_truncate_tail_extent(inode); up_write(&UDF_I(inode)->i_data_sem); + mutex_unlock(&inode->i_mutex); } return 0; } diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 08598843288f..c9b4df5810d5 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -1277,7 +1277,7 @@ update_time: */ #define UDF_MAX_ICB_NESTING 1024 -static int udf_read_inode(struct inode *inode) +static int udf_read_inode(struct inode *inode, bool hidden_inode) { struct buffer_head *bh = NULL; struct fileEntry *fe; @@ -1436,8 +1436,11 @@ reread: link_count = le16_to_cpu(fe->fileLinkCount); if (!link_count) { - ret = -ESTALE; - goto out; + if (!hidden_inode) { + ret = -ESTALE; + goto out; + } + link_count = 1; } set_nlink(inode, link_count); @@ -1826,7 +1829,8 @@ out: return err; } -struct inode *udf_iget(struct super_block *sb, struct kernel_lb_addr *ino) +struct inode *__udf_iget(struct super_block *sb, struct kernel_lb_addr *ino, + bool hidden_inode) { unsigned long block = udf_get_lb_pblock(sb, ino, 0); struct inode *inode = iget_locked(sb, block); @@ -1839,7 +1843,7 @@ struct inode *udf_iget(struct super_block *sb, struct kernel_lb_addr *ino) return inode; memcpy(&UDF_I(inode)->i_location, ino, sizeof(struct kernel_lb_addr)); - err = udf_read_inode(inode); + err = udf_read_inode(inode, hidden_inode); if (err < 0) { iget_failed(inode); return ERR_PTR(err); diff --git a/fs/udf/super.c b/fs/udf/super.c index 5401fc33f5cc..e229315bbf7a 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -959,7 +959,7 @@ struct inode *udf_find_metadata_inode_efe(struct super_block *sb, addr.logicalBlockNum = meta_file_loc; addr.partitionReferenceNum = partition_num; - metadata_fe = udf_iget(sb, &addr); + metadata_fe = udf_iget_special(sb, &addr); if (IS_ERR(metadata_fe)) { udf_warn(sb, "metadata inode efe not found\n"); @@ -1020,7 +1020,7 @@ static int udf_load_metadata_files(struct super_block *sb, int partition) udf_debug("Bitmap file location: block = %d part = %d\n", addr.logicalBlockNum, addr.partitionReferenceNum); - fe = udf_iget(sb, &addr); + fe = udf_iget_special(sb, &addr); if (IS_ERR(fe)) { if (sb->s_flags & MS_RDONLY) udf_warn(sb, "bitmap inode efe not found but it's ok since the disc is mounted read-only\n"); @@ -1119,7 +1119,7 @@ static int udf_fill_partdesc_info(struct super_block *sb, }; struct inode *inode; - inode = udf_iget(sb, &loc); + inode = udf_iget_special(sb, &loc); if (IS_ERR(inode)) { udf_debug("cannot load unallocSpaceTable (part %d)\n", p_index); @@ -1154,7 +1154,7 @@ static int udf_fill_partdesc_info(struct super_block *sb, }; struct inode *inode; - inode = udf_iget(sb, &loc); + inode = udf_iget_special(sb, &loc); if (IS_ERR(inode)) { udf_debug("cannot load freedSpaceTable (part %d)\n", p_index); @@ -1198,7 +1198,7 @@ static void udf_find_vat_block(struct super_block *sb, int p_index, vat_block >= map->s_partition_root && vat_block >= start_block - 3; vat_block--) { ino.logicalBlockNum = vat_block - map->s_partition_root; - inode = udf_iget(sb, &ino); + inode = udf_iget_special(sb, &ino); if (!IS_ERR(inode)) { sbi->s_vat_inode = inode; break; diff --git a/fs/udf/udfdecl.h b/fs/udf/udfdecl.h index 742557be9936..1cc3c993ebd0 100644 --- a/fs/udf/udfdecl.h +++ b/fs/udf/udfdecl.h @@ -138,7 +138,18 @@ extern int udf_write_fi(struct inode *inode, struct fileIdentDesc *, /* file.c */ extern long udf_ioctl(struct file *, unsigned int, unsigned long); /* inode.c */ -extern struct inode *udf_iget(struct super_block *, struct kernel_lb_addr *); +extern struct inode *__udf_iget(struct super_block *, struct kernel_lb_addr *, + bool hidden_inode); +static inline struct inode *udf_iget_special(struct super_block *sb, + struct kernel_lb_addr *ino) +{ + return __udf_iget(sb, ino, true); +} +static inline struct inode *udf_iget(struct super_block *sb, + struct kernel_lb_addr *ino) +{ + return __udf_iget(sb, ino, false); +} extern int udf_expand_file_adinicb(struct inode *); extern struct buffer_head *udf_expand_dir_adinicb(struct inode *, int *, int *); extern struct buffer_head *udf_bread(struct inode *, int, int, int *); diff --git a/fs/udf/udftime.c b/fs/udf/udftime.c index 1f11483eba6a..77c331f1a770 100644 --- a/fs/udf/udftime.c +++ b/fs/udf/udftime.c @@ -81,8 +81,6 @@ static time_t year_seconds[MAX_YEAR_SECONDS] = { /*2038*/ SPY(68, 17, 0) }; -extern struct timezone sys_tz; - #define SECS_PER_HOUR (60 * 60) #define SECS_PER_DAY (SECS_PER_HOUR * 24) diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c index 7bc20809c99e..2c1036080d52 100644 --- a/fs/ufs/balloc.c +++ b/fs/ufs/balloc.c @@ -784,7 +784,6 @@ static u64 ufs_bitmap_search(struct super_block *sb, 0x0, 0x2, 0x6, 0xe, 0x1e, 0x3e, 0x7e, 0xfe, 0x1fe }; struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; - struct ufs_cylinder_group *ucg; unsigned start, length, loc; unsigned pos, want, blockmap, mask, end; u64 result; @@ -792,8 +791,6 @@ static u64 ufs_bitmap_search(struct super_block *sb, UFSD("ENTER, cg %u, goal %llu, count %u\n", ucpi->c_cgx, (unsigned long long)goal, count); - ucg = ubh_get_ucg(UCPI_UBH(ucpi)); - if (goal) start = ufs_dtogd(uspi, goal) >> 3; else diff --git a/fs/xattr.c b/fs/xattr.c index c69e6d43a0d2..64e83efb742d 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -364,13 +364,12 @@ out: return error; } -SYSCALL_DEFINE5(setxattr, const char __user *, pathname, - const char __user *, name, const void __user *, value, - size_t, size, int, flags) +static int path_setxattr(const char __user *pathname, + const char __user *name, const void __user *value, + size_t size, int flags, unsigned int lookup_flags) { struct path path; int error; - unsigned int lookup_flags = LOOKUP_FOLLOW; retry: error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path); if (error) @@ -388,28 +387,18 @@ retry: return error; } +SYSCALL_DEFINE5(setxattr, const char __user *, pathname, + const char __user *, name, const void __user *, value, + size_t, size, int, flags) +{ + return path_setxattr(pathname, name, value, size, flags, LOOKUP_FOLLOW); +} + SYSCALL_DEFINE5(lsetxattr, const char __user *, pathname, const char __user *, name, const void __user *, value, size_t, size, int, flags) { - struct path path; - int error; - unsigned int lookup_flags = 0; -retry: - error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path); - if (error) - return error; - error = mnt_want_write(path.mnt); - if (!error) { - error = setxattr(path.dentry, name, value, size, flags); - mnt_drop_write(path.mnt); - } - path_put(&path); - if (retry_estale(error, lookup_flags)) { - lookup_flags |= LOOKUP_REVAL; - goto retry; - } - return error; + return path_setxattr(pathname, name, value, size, flags, 0); } SYSCALL_DEFINE5(fsetxattr, int, fd, const char __user *, name, @@ -481,12 +470,12 @@ getxattr(struct dentry *d, const char __user *name, void __user *value, return error; } -SYSCALL_DEFINE4(getxattr, const char __user *, pathname, - const char __user *, name, void __user *, value, size_t, size) +static ssize_t path_getxattr(const char __user *pathname, + const char __user *name, void __user *value, + size_t size, unsigned int lookup_flags) { struct path path; ssize_t error; - unsigned int lookup_flags = LOOKUP_FOLLOW; retry: error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path); if (error) @@ -500,23 +489,16 @@ retry: return error; } +SYSCALL_DEFINE4(getxattr, const char __user *, pathname, + const char __user *, name, void __user *, value, size_t, size) +{ + return path_getxattr(pathname, name, value, size, LOOKUP_FOLLOW); +} + SYSCALL_DEFINE4(lgetxattr, const char __user *, pathname, const char __user *, name, void __user *, value, size_t, size) { - struct path path; - ssize_t error; - unsigned int lookup_flags = 0; -retry: - error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path); - if (error) - return error; - error = getxattr(path.dentry, name, value, size); - path_put(&path); - if (retry_estale(error, lookup_flags)) { - lookup_flags |= LOOKUP_REVAL; - goto retry; - } - return error; + return path_getxattr(pathname, name, value, size, 0); } SYSCALL_DEFINE4(fgetxattr, int, fd, const char __user *, name, @@ -571,12 +553,11 @@ listxattr(struct dentry *d, char __user *list, size_t size) return error; } -SYSCALL_DEFINE3(listxattr, const char __user *, pathname, char __user *, list, - size_t, size) +static ssize_t path_listxattr(const char __user *pathname, char __user *list, + size_t size, unsigned int lookup_flags) { struct path path; ssize_t error; - unsigned int lookup_flags = LOOKUP_FOLLOW; retry: error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path); if (error) @@ -590,23 +571,16 @@ retry: return error; } +SYSCALL_DEFINE3(listxattr, const char __user *, pathname, char __user *, list, + size_t, size) +{ + return path_listxattr(pathname, list, size, LOOKUP_FOLLOW); +} + SYSCALL_DEFINE3(llistxattr, const char __user *, pathname, char __user *, list, size_t, size) { - struct path path; - ssize_t error; - unsigned int lookup_flags = 0; -retry: - error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path); - if (error) - return error; - error = listxattr(path.dentry, list, size); - path_put(&path); - if (retry_estale(error, lookup_flags)) { - lookup_flags |= LOOKUP_REVAL; - goto retry; - } - return error; + return path_listxattr(pathname, list, size, 0); } SYSCALL_DEFINE3(flistxattr, int, fd, char __user *, list, size_t, size) @@ -640,12 +614,11 @@ removexattr(struct dentry *d, const char __user *name) return vfs_removexattr(d, kname); } -SYSCALL_DEFINE2(removexattr, const char __user *, pathname, - const char __user *, name) +static int path_removexattr(const char __user *pathname, + const char __user *name, unsigned int lookup_flags) { struct path path; int error; - unsigned int lookup_flags = LOOKUP_FOLLOW; retry: error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path); if (error) @@ -663,27 +636,16 @@ retry: return error; } +SYSCALL_DEFINE2(removexattr, const char __user *, pathname, + const char __user *, name) +{ + return path_removexattr(pathname, name, LOOKUP_FOLLOW); +} + SYSCALL_DEFINE2(lremovexattr, const char __user *, pathname, const char __user *, name) { - struct path path; - int error; - unsigned int lookup_flags = 0; -retry: - error = user_path_at(AT_FDCWD, pathname, lookup_flags, &path); - if (error) - return error; - error = mnt_want_write(path.mnt); - if (!error) { - error = removexattr(path.dentry, name); - mnt_drop_write(path.mnt); - } - path_put(&path); - if (retry_estale(error, lookup_flags)) { - lookup_flags |= LOOKUP_REVAL; - goto retry; - } - return error; + return path_removexattr(pathname, name, 0); } SYSCALL_DEFINE2(fremovexattr, int, fd, const char __user *, name) diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c index 844e288b9576..53e95b2a1369 100644 --- a/fs/xfs/kmem.c +++ b/fs/xfs/kmem.c @@ -21,7 +21,6 @@ #include <linux/swap.h> #include <linux/blkdev.h> #include <linux/backing-dev.h> -#include "time.h" #include "kmem.h" #include "xfs_message.h" diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 4bffffe038a1..eff34218f405 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -2209,6 +2209,10 @@ xfs_agf_verify( be32_to_cpu(agf->agf_flcount) <= XFS_AGFL_SIZE(mp))) return false; + if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) > XFS_BTREE_MAXLEVELS || + be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) > XFS_BTREE_MAXLEVELS) + return false; + /* * during growfs operations, the perag is not fully initialised, * so we can't use it for any useful checking. growfs ensures we can't diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 86df952d3e24..79c981984dca 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -5404,22 +5404,223 @@ error0: } /* + * Determine whether an extent shift can be accomplished by a merge with the + * extent that precedes the target hole of the shift. + */ +STATIC bool +xfs_bmse_can_merge( + struct xfs_bmbt_irec *left, /* preceding extent */ + struct xfs_bmbt_irec *got, /* current extent to shift */ + xfs_fileoff_t shift) /* shift fsb */ +{ + xfs_fileoff_t startoff; + + startoff = got->br_startoff - shift; + + /* + * The extent, once shifted, must be adjacent in-file and on-disk with + * the preceding extent. + */ + if ((left->br_startoff + left->br_blockcount != startoff) || + (left->br_startblock + left->br_blockcount != got->br_startblock) || + (left->br_state != got->br_state) || + (left->br_blockcount + got->br_blockcount > MAXEXTLEN)) + return false; + + return true; +} + +/* + * A bmap extent shift adjusts the file offset of an extent to fill a preceding + * hole in the file. If an extent shift would result in the extent being fully + * adjacent to the extent that currently precedes the hole, we can merge with + * the preceding extent rather than do the shift. + * + * This function assumes the caller has verified a shift-by-merge is possible + * with the provided extents via xfs_bmse_can_merge(). + */ +STATIC int +xfs_bmse_merge( + struct xfs_inode *ip, + int whichfork, + xfs_fileoff_t shift, /* shift fsb */ + int current_ext, /* idx of gotp */ + struct xfs_bmbt_rec_host *gotp, /* extent to shift */ + struct xfs_bmbt_rec_host *leftp, /* preceding extent */ + struct xfs_btree_cur *cur, + int *logflags) /* output */ +{ + struct xfs_ifork *ifp; + struct xfs_bmbt_irec got; + struct xfs_bmbt_irec left; + xfs_filblks_t blockcount; + int error, i; + + ifp = XFS_IFORK_PTR(ip, whichfork); + xfs_bmbt_get_all(gotp, &got); + xfs_bmbt_get_all(leftp, &left); + blockcount = left.br_blockcount + got.br_blockcount; + + ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); + ASSERT(xfs_bmse_can_merge(&left, &got, shift)); + + /* + * Merge the in-core extents. Note that the host record pointers and + * current_ext index are invalid once the extent has been removed via + * xfs_iext_remove(). + */ + xfs_bmbt_set_blockcount(leftp, blockcount); + xfs_iext_remove(ip, current_ext, 1, 0); + + /* + * Update the on-disk extent count, the btree if necessary and log the + * inode. + */ + XFS_IFORK_NEXT_SET(ip, whichfork, + XFS_IFORK_NEXTENTS(ip, whichfork) - 1); + *logflags |= XFS_ILOG_CORE; + if (!cur) { + *logflags |= XFS_ILOG_DEXT; + return 0; + } + + /* lookup and remove the extent to merge */ + error = xfs_bmbt_lookup_eq(cur, got.br_startoff, got.br_startblock, + got.br_blockcount, &i); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(i == 1, out_error); + + error = xfs_btree_delete(cur, &i); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(i == 1, out_error); + + /* lookup and update size of the previous extent */ + error = xfs_bmbt_lookup_eq(cur, left.br_startoff, left.br_startblock, + left.br_blockcount, &i); + if (error) + goto out_error; + XFS_WANT_CORRUPTED_GOTO(i == 1, out_error); + + left.br_blockcount = blockcount; + + error = xfs_bmbt_update(cur, left.br_startoff, left.br_startblock, + left.br_blockcount, left.br_state); + if (error) + goto out_error; + + return 0; + +out_error: + return error; +} + +/* + * Shift a single extent. + */ +STATIC int +xfs_bmse_shift_one( + struct xfs_inode *ip, + int whichfork, + xfs_fileoff_t offset_shift_fsb, + int *current_ext, + struct xfs_bmbt_rec_host *gotp, + struct xfs_btree_cur *cur, + int *logflags) +{ + struct xfs_ifork *ifp; + xfs_fileoff_t startoff; + struct xfs_bmbt_rec_host *leftp; + struct xfs_bmbt_irec got; + struct xfs_bmbt_irec left; + int error; + int i; + + ifp = XFS_IFORK_PTR(ip, whichfork); + + xfs_bmbt_get_all(gotp, &got); + startoff = got.br_startoff - offset_shift_fsb; + + /* delalloc extents should be prevented by caller */ + XFS_WANT_CORRUPTED_GOTO(!isnullstartblock(got.br_startblock), + out_error); + + /* + * If this is the first extent in the file, make sure there's enough + * room at the start of the file and jump right to the shift as there's + * no left extent to merge. + */ + if (*current_ext == 0) { + if (got.br_startoff < offset_shift_fsb) + return -EINVAL; + goto shift_extent; + } + + /* grab the left extent and check for a large enough hole */ + leftp = xfs_iext_get_ext(ifp, *current_ext - 1); + xfs_bmbt_get_all(leftp, &left); + + if (startoff < left.br_startoff + left.br_blockcount) + return -EINVAL; + + /* check whether to merge the extent or shift it down */ + if (!xfs_bmse_can_merge(&left, &got, offset_shift_fsb)) + goto shift_extent; + + return xfs_bmse_merge(ip, whichfork, offset_shift_fsb, *current_ext, + gotp, leftp, cur, logflags); + +shift_extent: + /* + * Increment the extent index for the next iteration, update the start + * offset of the in-core extent and update the btree if applicable. + */ + (*current_ext)++; + xfs_bmbt_set_startoff(gotp, startoff); + *logflags |= XFS_ILOG_CORE; + if (!cur) { + *logflags |= XFS_ILOG_DEXT; + return 0; + } + + error = xfs_bmbt_lookup_eq(cur, got.br_startoff, got.br_startblock, + got.br_blockcount, &i); + if (error) + return error; + XFS_WANT_CORRUPTED_GOTO(i == 1, out_error); + + got.br_startoff = startoff; + error = xfs_bmbt_update(cur, got.br_startoff, got.br_startblock, + got.br_blockcount, got.br_state); + if (error) + return error; + + return 0; + +out_error: + return error; +} + +/* * Shift extent records to the left to cover a hole. * - * The maximum number of extents to be shifted in a single operation - * is @num_exts, and @current_ext keeps track of the current extent - * index we have shifted. @offset_shift_fsb is the length by which each - * extent is shifted. If there is no hole to shift the extents - * into, this will be considered invalid operation and we abort immediately. + * The maximum number of extents to be shifted in a single operation is + * @num_exts. @start_fsb specifies the file offset to start the shift and the + * file offset where we've left off is returned in @next_fsb. @offset_shift_fsb + * is the length by which each extent is shifted. If there is no hole to shift + * the extents into, this will be considered invalid operation and we abort + * immediately. */ int xfs_bmap_shift_extents( struct xfs_trans *tp, struct xfs_inode *ip, - int *done, xfs_fileoff_t start_fsb, xfs_fileoff_t offset_shift_fsb, - xfs_extnum_t *current_ext, + int *done, + xfs_fileoff_t *next_fsb, xfs_fsblock_t *firstblock, struct xfs_bmap_free *flist, int num_exts) @@ -5427,16 +5628,13 @@ xfs_bmap_shift_extents( struct xfs_btree_cur *cur = NULL; struct xfs_bmbt_rec_host *gotp; struct xfs_bmbt_irec got; - struct xfs_bmbt_irec left; struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp; xfs_extnum_t nexts = 0; - xfs_fileoff_t startoff; + xfs_extnum_t current_ext; int error = 0; - int i; int whichfork = XFS_DATA_FORK; int logflags = 0; - xfs_filblks_t blockcount = 0; int total_extents; if (unlikely(XFS_TEST_ERROR( @@ -5451,7 +5649,8 @@ xfs_bmap_shift_extents( if (XFS_FORCED_SHUTDOWN(mp)) return -EIO; - ASSERT(current_ext != NULL); + ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); + ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); ifp = XFS_IFORK_PTR(ip, whichfork); if (!(ifp->if_flags & XFS_IFEXTENTS)) { @@ -5461,23 +5660,6 @@ xfs_bmap_shift_extents( return error; } - /* - * If *current_ext is 0, we would need to lookup the extent - * from where we would start shifting and store it in gotp. - */ - if (!*current_ext) { - gotp = xfs_iext_bno_to_ext(ifp, start_fsb, current_ext); - /* - * gotp can be null in 2 cases: 1) if there are no extents - * or 2) start_fsb lies in a hole beyond which there are - * no extents. Either way, we are done. - */ - if (!gotp) { - *done = 1; - return 0; - } - } - if (ifp->if_flags & XFS_IFBROOT) { cur = xfs_bmbt_init_cursor(mp, tp, ip, whichfork); cur->bc_private.b.firstblock = *firstblock; @@ -5486,112 +5668,46 @@ xfs_bmap_shift_extents( } /* + * Look up the extent index for the fsb where we start shifting. We can + * henceforth iterate with current_ext as extent list changes are locked + * out via ilock. + * + * gotp can be null in 2 cases: 1) if there are no extents or 2) + * start_fsb lies in a hole beyond which there are no extents. Either + * way, we are done. + */ + gotp = xfs_iext_bno_to_ext(ifp, start_fsb, ¤t_ext); + if (!gotp) { + *done = 1; + goto del_cursor; + } + + /* * There may be delalloc extents in the data fork before the range we - * are collapsing out, so we cannot - * use the count of real extents here. Instead we have to calculate it - * from the incore fork. + * are collapsing out, so we cannot use the count of real extents here. + * Instead we have to calculate it from the incore fork. */ total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); - while (nexts++ < num_exts && *current_ext < total_extents) { - - gotp = xfs_iext_get_ext(ifp, *current_ext); - xfs_bmbt_get_all(gotp, &got); - startoff = got.br_startoff - offset_shift_fsb; - - /* - * Before shifting extent into hole, make sure that the hole - * is large enough to accomodate the shift. - */ - if (*current_ext) { - xfs_bmbt_get_all(xfs_iext_get_ext(ifp, - *current_ext - 1), &left); - - if (startoff < left.br_startoff + left.br_blockcount) - error = -EINVAL; - } else if (offset_shift_fsb > got.br_startoff) { - /* - * When first extent is shifted, offset_shift_fsb - * should be less than the stating offset of - * the first extent. - */ - error = -EINVAL; - } - + while (nexts++ < num_exts && current_ext < total_extents) { + error = xfs_bmse_shift_one(ip, whichfork, offset_shift_fsb, + ¤t_ext, gotp, cur, &logflags); if (error) goto del_cursor; - if (cur) { - error = xfs_bmbt_lookup_eq(cur, got.br_startoff, - got.br_startblock, - got.br_blockcount, - &i); - if (error) - goto del_cursor; - XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor); - } - - /* Check if we can merge 2 adjacent extents */ - if (*current_ext && - left.br_startoff + left.br_blockcount == startoff && - left.br_startblock + left.br_blockcount == - got.br_startblock && - left.br_state == got.br_state && - left.br_blockcount + got.br_blockcount <= MAXEXTLEN) { - blockcount = left.br_blockcount + - got.br_blockcount; - xfs_iext_remove(ip, *current_ext, 1, 0); - logflags |= XFS_ILOG_CORE; - if (cur) { - error = xfs_btree_delete(cur, &i); - if (error) - goto del_cursor; - XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor); - } else { - logflags |= XFS_ILOG_DEXT; - } - XFS_IFORK_NEXT_SET(ip, whichfork, - XFS_IFORK_NEXTENTS(ip, whichfork) - 1); - gotp = xfs_iext_get_ext(ifp, --*current_ext); - xfs_bmbt_get_all(gotp, &got); - - /* Make cursor point to the extent we will update */ - if (cur) { - error = xfs_bmbt_lookup_eq(cur, got.br_startoff, - got.br_startblock, - got.br_blockcount, - &i); - if (error) - goto del_cursor; - XFS_WANT_CORRUPTED_GOTO(i == 1, del_cursor); - } - - xfs_bmbt_set_blockcount(gotp, blockcount); - got.br_blockcount = blockcount; - } else { - /* We have to update the startoff */ - xfs_bmbt_set_startoff(gotp, startoff); - got.br_startoff = startoff; - } - - logflags |= XFS_ILOG_CORE; - if (cur) { - error = xfs_bmbt_update(cur, got.br_startoff, - got.br_startblock, - got.br_blockcount, - got.br_state); - if (error) - goto del_cursor; - } else { - logflags |= XFS_ILOG_DEXT; - } - - (*current_ext)++; + /* update total extent count and grab the next record */ total_extents = ifp->if_bytes / sizeof(xfs_bmbt_rec_t); + if (current_ext >= total_extents) + break; + gotp = xfs_iext_get_ext(ifp, current_ext); } /* Check if we are done */ - if (*current_ext == total_extents) + if (current_ext == total_extents) { *done = 1; + } else if (next_fsb) { + xfs_bmbt_get_all(gotp, &got); + *next_fsb = got.br_startoff; + } del_cursor: if (cur) @@ -5600,5 +5716,6 @@ del_cursor: if (logflags) xfs_trans_log_inode(tp, ip, logflags); + return error; } diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index b879ca56a64c..44db6db86402 100644 --- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -178,9 +178,8 @@ int xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx, xfs_extnum_t num); uint xfs_default_attroffset(struct xfs_inode *ip); int xfs_bmap_shift_extents(struct xfs_trans *tp, struct xfs_inode *ip, - int *done, xfs_fileoff_t start_fsb, - xfs_fileoff_t offset_shift_fsb, xfs_extnum_t *current_ext, - xfs_fsblock_t *firstblock, struct xfs_bmap_free *flist, - int num_exts); + xfs_fileoff_t start_fsb, xfs_fileoff_t offset_shift_fsb, + int *done, xfs_fileoff_t *next_fsb, xfs_fsblock_t *firstblock, + struct xfs_bmap_free *flist, int num_exts); #endif /* __XFS_BMAP_H__ */ diff --git a/fs/xfs/libxfs/xfs_da_btree.c b/fs/xfs/libxfs/xfs_da_btree.c index 2c42ae28d027..fd827530afec 100644 --- a/fs/xfs/libxfs/xfs_da_btree.c +++ b/fs/xfs/libxfs/xfs_da_btree.c @@ -2563,7 +2563,8 @@ xfs_da_get_buf( mapp, nmap, 0); error = bp ? bp->b_error : -EIO; if (error) { - xfs_trans_brelse(trans, bp); + if (bp) + xfs_trans_brelse(trans, bp); goto out_free; } diff --git a/fs/xfs/libxfs/xfs_da_format.c b/fs/xfs/libxfs/xfs_da_format.c index c9aee52a37e2..7e42fdfd2f1d 100644 --- a/fs/xfs/libxfs/xfs_da_format.c +++ b/fs/xfs/libxfs/xfs_da_format.c @@ -270,7 +270,6 @@ xfs_dir3_data_get_ftype( { __uint8_t ftype = dep->name[dep->namelen]; - ASSERT(ftype < XFS_DIR3_FT_MAX); if (ftype >= XFS_DIR3_FT_MAX) return XFS_DIR3_FT_UNKNOWN; return ftype; diff --git a/fs/xfs/libxfs/xfs_dir2.c b/fs/xfs/libxfs/xfs_dir2.c index 6cef22152fd6..7075aaf131f4 100644 --- a/fs/xfs/libxfs/xfs_dir2.c +++ b/fs/xfs/libxfs/xfs_dir2.c @@ -237,7 +237,8 @@ xfs_dir_init( } /* - Enter a name in a directory. + * Enter a name in a directory, or check for available space. + * If inum is 0, only the available space test is performed. */ int xfs_dir_createname( @@ -254,10 +255,12 @@ xfs_dir_createname( int v; /* type-checking value */ ASSERT(S_ISDIR(dp->i_d.di_mode)); - rval = xfs_dir_ino_validate(tp->t_mountp, inum); - if (rval) - return rval; - XFS_STATS_INC(xs_dir_create); + if (inum) { + rval = xfs_dir_ino_validate(tp->t_mountp, inum); + if (rval) + return rval; + XFS_STATS_INC(xs_dir_create); + } args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); if (!args) @@ -276,6 +279,8 @@ xfs_dir_createname( args->whichfork = XFS_DATA_FORK; args->trans = tp; args->op_flags = XFS_DA_OP_ADDNAME | XFS_DA_OP_OKNOENT; + if (!inum) + args->op_flags |= XFS_DA_OP_JUSTCHECK; if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) { rval = xfs_dir2_sf_addname(args); @@ -535,62 +540,14 @@ out_free: /* * See if this entry can be added to the directory without allocating space. - * First checks that the caller couldn't reserve enough space (resblks = 0). */ int xfs_dir_canenter( xfs_trans_t *tp, xfs_inode_t *dp, - struct xfs_name *name, /* name of entry to add */ - uint resblks) + struct xfs_name *name) /* name of entry to add */ { - struct xfs_da_args *args; - int rval; - int v; /* type-checking value */ - - if (resblks) - return 0; - - ASSERT(S_ISDIR(dp->i_d.di_mode)); - - args = kmem_zalloc(sizeof(*args), KM_SLEEP | KM_NOFS); - if (!args) - return -ENOMEM; - - args->geo = dp->i_mount->m_dir_geo; - args->name = name->name; - args->namelen = name->len; - args->filetype = name->type; - args->hashval = dp->i_mount->m_dirnameops->hashname(name); - args->dp = dp; - args->whichfork = XFS_DATA_FORK; - args->trans = tp; - args->op_flags = XFS_DA_OP_JUSTCHECK | XFS_DA_OP_ADDNAME | - XFS_DA_OP_OKNOENT; - - if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL) { - rval = xfs_dir2_sf_addname(args); - goto out_free; - } - - rval = xfs_dir2_isblock(args, &v); - if (rval) - goto out_free; - if (v) { - rval = xfs_dir2_block_addname(args); - goto out_free; - } - - rval = xfs_dir2_isleaf(args, &v); - if (rval) - goto out_free; - if (v) - rval = xfs_dir2_leaf_addname(args); - else - rval = xfs_dir2_node_addname(args); -out_free: - kmem_free(args); - return rval; + return xfs_dir_createname(tp, dp, name, 0, NULL, NULL, 0); } /* diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h index c8e86b0b5e99..4dff261e6ed5 100644 --- a/fs/xfs/libxfs/xfs_dir2.h +++ b/fs/xfs/libxfs/xfs_dir2.h @@ -136,7 +136,7 @@ extern int xfs_dir_replace(struct xfs_trans *tp, struct xfs_inode *dp, xfs_fsblock_t *first, struct xfs_bmap_free *flist, xfs_extlen_t tot); extern int xfs_dir_canenter(struct xfs_trans *tp, struct xfs_inode *dp, - struct xfs_name *name, uint resblks); + struct xfs_name *name); /* * Direct call from the bmap code, bypassing the generic directory layer. diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index b62771f1f4b5..23dcb72fc5e6 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -1076,8 +1076,8 @@ xfs_dialloc_ag_finobt_newino( int i; if (agi->agi_newino != cpu_to_be32(NULLAGINO)) { - error = xfs_inobt_lookup(cur, agi->agi_newino, XFS_LOOKUP_EQ, - &i); + error = xfs_inobt_lookup(cur, be32_to_cpu(agi->agi_newino), + XFS_LOOKUP_EQ, &i); if (error) return error; if (i == 1) { @@ -1085,7 +1085,6 @@ xfs_dialloc_ag_finobt_newino( if (error) return error; XFS_WANT_CORRUPTED_RETURN(i == 1); - return 0; } } @@ -2051,6 +2050,8 @@ xfs_agi_verify( if (!XFS_AGI_GOOD_VERSION(be32_to_cpu(agi->agi_versionnum))) return false; + if (be32_to_cpu(agi->agi_level) > XFS_BTREE_MAXLEVELS) + return false; /* * during growfs operations, the perag is not fully initialised, * so we can't use it for any useful checking. growfs ensures we can't diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index f4dd697cac08..7c818f1e4484 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -424,20 +424,24 @@ xfs_rtfind_forw( } /* - * Read and modify the summary information for a given extent size, + * Read and/or modify the summary information for a given extent size, * bitmap block combination. * Keeps track of a current summary block, so we don't keep reading * it from the buffer cache. + * + * Summary information is returned in *sum if specified. + * If no delta is specified, returns summary only. */ int -xfs_rtmodify_summary( - xfs_mount_t *mp, /* file system mount point */ +xfs_rtmodify_summary_int( + xfs_mount_t *mp, /* file system mount structure */ xfs_trans_t *tp, /* transaction pointer */ int log, /* log2 of extent size */ xfs_rtblock_t bbno, /* bitmap block number */ int delta, /* change to make to summary info */ xfs_buf_t **rbpp, /* in/out: summary block buffer */ - xfs_fsblock_t *rsb) /* in/out: summary block number */ + xfs_fsblock_t *rsb, /* in/out: summary block number */ + xfs_suminfo_t *sum) /* out: summary info for this block */ { xfs_buf_t *bp; /* buffer for the summary block */ int error; /* error value */ @@ -456,7 +460,7 @@ xfs_rtmodify_summary( /* * If we have an old buffer, and the block number matches, use that. */ - if (rbpp && *rbpp && *rsb == sb) + if (*rbpp && *rsb == sb) bp = *rbpp; /* * Otherwise we have to get the buffer. @@ -465,7 +469,7 @@ xfs_rtmodify_summary( /* * If there was an old one, get rid of it first. */ - if (rbpp && *rbpp) + if (*rbpp) xfs_trans_brelse(tp, *rbpp); error = xfs_rtbuf_get(mp, tp, sb, 1, &bp); if (error) { @@ -474,21 +478,38 @@ xfs_rtmodify_summary( /* * Remember this buffer and block for the next call. */ - if (rbpp) { - *rbpp = bp; - *rsb = sb; - } + *rbpp = bp; + *rsb = sb; } /* - * Point to the summary information, modify and log it. + * Point to the summary information, modify/log it, and/or copy it out. */ sp = XFS_SUMPTR(mp, bp, so); - *sp += delta; - xfs_trans_log_buf(tp, bp, (uint)((char *)sp - (char *)bp->b_addr), - (uint)((char *)sp - (char *)bp->b_addr + sizeof(*sp) - 1)); + if (delta) { + uint first = (uint)((char *)sp - (char *)bp->b_addr); + + *sp += delta; + xfs_trans_log_buf(tp, bp, first, first + sizeof(*sp) - 1); + } + if (sum) + *sum = *sp; return 0; } +int +xfs_rtmodify_summary( + xfs_mount_t *mp, /* file system mount structure */ + xfs_trans_t *tp, /* transaction pointer */ + int log, /* log2 of extent size */ + xfs_rtblock_t bbno, /* bitmap block number */ + int delta, /* change to make to summary info */ + xfs_buf_t **rbpp, /* in/out: summary block buffer */ + xfs_fsblock_t *rsb) /* in/out: summary block number */ +{ + return xfs_rtmodify_summary_int(mp, tp, log, bbno, + delta, rbpp, rsb, NULL); +} + /* * Set the given range of bitmap bits to the given value. * Do whatever I/O and logging is required. diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index ad525a5623a4..5f902fa7913f 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -279,11 +279,13 @@ xfs_mount_validate_sb( sbp->sb_blocklog < XFS_MIN_BLOCKSIZE_LOG || sbp->sb_blocklog > XFS_MAX_BLOCKSIZE_LOG || sbp->sb_blocksize != (1 << sbp->sb_blocklog) || + sbp->sb_dirblklog > XFS_MAX_BLOCKSIZE_LOG || sbp->sb_inodesize < XFS_DINODE_MIN_SIZE || sbp->sb_inodesize > XFS_DINODE_MAX_SIZE || sbp->sb_inodelog < XFS_DINODE_MIN_LOG || sbp->sb_inodelog > XFS_DINODE_MAX_LOG || sbp->sb_inodesize != (1 << sbp->sb_inodelog) || + sbp->sb_logsunit > XLOG_MAX_RECORD_BSIZE || sbp->sb_inopblock != howmany(sbp->sb_blocksize,sbp->sb_inodesize) || (sbp->sb_blocklog - sbp->sb_inodelog != sbp->sb_inopblog) || (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE) || @@ -443,6 +445,8 @@ __xfs_sb_from_disk( to->sb_features_incompat = be32_to_cpu(from->sb_features_incompat); to->sb_features_log_incompat = be32_to_cpu(from->sb_features_log_incompat); + /* crc is only used on disk, not in memory; just init to 0 here. */ + to->sb_crc = 0; to->sb_pad = 0; to->sb_pquotino = be64_to_cpu(from->sb_pquotino); to->sb_lsn = be64_to_cpu(from->sb_lsn); @@ -548,6 +552,9 @@ xfs_sb_to_disk( if (!fields) return; + /* We should never write the crc here, it's updated in the IO path */ + fields &= ~XFS_SB_CRC; + xfs_sb_quota_to_disk(to, from, &fields); while (fields) { f = (xfs_sb_field_t)xfs_lowbit64((__uint64_t)fields); diff --git a/fs/xfs/time.h b/fs/xfs/time.h deleted file mode 100644 index 387e695a184c..000000000000 --- a/fs/xfs/time.h +++ /dev/null @@ -1,36 +0,0 @@ -/* - * Copyright (c) 2000-2003,2005 Silicon Graphics, Inc. - * All Rights Reserved. - * - * This program is free software; you can redistribute it and/or - * modify it under the terms of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it would be useful, - * but WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the - * GNU General Public License for more details. - * - * You should have received a copy of the GNU General Public License - * along with this program; if not, write the Free Software Foundation, - * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA - */ -#ifndef __XFS_SUPPORT_TIME_H__ -#define __XFS_SUPPORT_TIME_H__ - -#include <linux/sched.h> -#include <linux/time.h> - -typedef struct timespec timespec_t; - -static inline void delay(long ticks) -{ - schedule_timeout_uninterruptible(ticks); -} - -static inline void nanotime(struct timespec *tvp) -{ - *tvp = CURRENT_TIME; -} - -#endif /* __XFS_SUPPORT_TIME_H__ */ diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index b984647c24db..f5b2453a43b2 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -434,10 +434,22 @@ xfs_start_page_writeback( { ASSERT(PageLocked(page)); ASSERT(!PageWriteback(page)); - if (clear_dirty) + + /* + * if the page was not fully cleaned, we need to ensure that the higher + * layers come back to it correctly. That means we need to keep the page + * dirty, and for WB_SYNC_ALL writeback we need to ensure the + * PAGECACHE_TAG_TOWRITE index mark is not removed so another attempt to + * write this page in this writeback sweep will be made. + */ + if (clear_dirty) { clear_page_dirty_for_io(page); - set_page_writeback(page); + set_page_writeback(page); + } else + set_page_writeback_keepwrite(page); + unlock_page(page); + /* If no buffers on the page are to be written, finish it here */ if (!buffers) end_page_writeback(page); @@ -548,6 +560,13 @@ xfs_cancel_ioend( do { next_bh = bh->b_private; clear_buffer_async_write(bh); + /* + * The unwritten flag is cleared when added to the + * ioend. We're not submitting for I/O so mark the + * buffer unwritten again for next time around. + */ + if (ioend->io_type == XFS_IO_UNWRITTEN) + set_buffer_unwritten(bh); unlock_buffer(bh); } while ((bh = next_bh) != NULL); diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 1707980f9a4b..92e8f99a5857 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -1122,14 +1122,6 @@ xfs_zero_remaining_bytes( if (endoff > XFS_ISIZE(ip)) endoff = XFS_ISIZE(ip); - bp = xfs_buf_get_uncached(XFS_IS_REALTIME_INODE(ip) ? - mp->m_rtdev_targp : mp->m_ddev_targp, - BTOBB(mp->m_sb.sb_blocksize), 0); - if (!bp) - return -ENOMEM; - - xfs_buf_unlock(bp); - for (offset = startoff; offset <= endoff; offset = lastoffset + 1) { uint lock_mode; @@ -1152,42 +1144,24 @@ xfs_zero_remaining_bytes( ASSERT(imap.br_startblock != DELAYSTARTBLOCK); if (imap.br_state == XFS_EXT_UNWRITTEN) continue; - XFS_BUF_UNDONE(bp); - XFS_BUF_UNWRITE(bp); - XFS_BUF_READ(bp); - XFS_BUF_SET_ADDR(bp, xfs_fsb_to_db(ip, imap.br_startblock)); - if (XFS_FORCED_SHUTDOWN(mp)) { - error = -EIO; - break; - } - xfs_buf_iorequest(bp); - error = xfs_buf_iowait(bp); - if (error) { - xfs_buf_ioerror_alert(bp, - "xfs_zero_remaining_bytes(read)"); - break; - } + error = xfs_buf_read_uncached(XFS_IS_REALTIME_INODE(ip) ? + mp->m_rtdev_targp : mp->m_ddev_targp, + xfs_fsb_to_db(ip, imap.br_startblock), + BTOBB(mp->m_sb.sb_blocksize), + 0, &bp, NULL); + if (error) + return error; + memset(bp->b_addr + - (offset - XFS_FSB_TO_B(mp, imap.br_startoff)), - 0, lastoffset - offset + 1); - XFS_BUF_UNDONE(bp); - XFS_BUF_UNREAD(bp); - XFS_BUF_WRITE(bp); - - if (XFS_FORCED_SHUTDOWN(mp)) { - error = -EIO; - break; - } - xfs_buf_iorequest(bp); - error = xfs_buf_iowait(bp); - if (error) { - xfs_buf_ioerror_alert(bp, - "xfs_zero_remaining_bytes(write)"); - break; - } + (offset - XFS_FSB_TO_B(mp, imap.br_startoff)), + 0, lastoffset - offset + 1); + + error = xfs_bwrite(bp); + xfs_buf_relse(bp); + if (error) + return error; } - xfs_buf_free(bp); return error; } @@ -1205,6 +1179,7 @@ xfs_free_file_space( xfs_bmap_free_t free_list; xfs_bmbt_irec_t imap; xfs_off_t ioffset; + xfs_off_t iendoffset; xfs_extlen_t mod=0; xfs_mount_t *mp; int nimap; @@ -1233,12 +1208,13 @@ xfs_free_file_space( inode_dio_wait(VFS_I(ip)); rounding = max_t(xfs_off_t, 1 << mp->m_sb.sb_blocklog, PAGE_CACHE_SIZE); - ioffset = offset & ~(rounding - 1); - error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, - ioffset, -1); + ioffset = round_down(offset, rounding); + iendoffset = round_up(offset + len, rounding) - 1; + error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, ioffset, + iendoffset); if (error) goto out; - truncate_pagecache_range(VFS_I(ip), ioffset, -1); + truncate_pagecache_range(VFS_I(ip), ioffset, iendoffset); /* * Need to zero the stuff we're not freeing, on disk. @@ -1392,14 +1368,14 @@ xfs_zero_file_space( if (start_boundary < end_boundary - 1) { /* - * punch out delayed allocation blocks and the page cache over - * the conversion range + * Writeback the range to ensure any inode size updates due to + * appending writes make it to disk (otherwise we could just + * punch out the delalloc blocks). */ - xfs_ilock(ip, XFS_ILOCK_EXCL); - error = xfs_bmap_punch_delalloc_range(ip, - XFS_B_TO_FSBT(mp, start_boundary), - XFS_B_TO_FSB(mp, end_boundary - start_boundary)); - xfs_iunlock(ip, XFS_ILOCK_EXCL); + error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, + start_boundary, end_boundary - 1); + if (error) + goto out; truncate_pagecache_range(VFS_I(ip), start_boundary, end_boundary - 1); @@ -1456,41 +1432,47 @@ xfs_collapse_file_space( struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp; int error; - xfs_extnum_t current_ext = 0; struct xfs_bmap_free free_list; xfs_fsblock_t first_block; int committed; xfs_fileoff_t start_fsb; + xfs_fileoff_t next_fsb; xfs_fileoff_t shift_fsb; ASSERT(xfs_isilocked(ip, XFS_IOLOCK_EXCL)); trace_xfs_collapse_file_space(ip); - start_fsb = XFS_B_TO_FSB(mp, offset + len); + next_fsb = XFS_B_TO_FSB(mp, offset + len); shift_fsb = XFS_B_TO_FSB(mp, len); - /* - * Writeback the entire file and force remove any post-eof blocks. The - * writeback prevents changes to the extent list via concurrent - * writeback and the eofblocks trim prevents the extent shift algorithm - * from running into a post-eof delalloc extent. - * - * XXX: This is a temporary fix until the extent shift loop below is - * converted to use offsets and lookups within the ILOCK rather than - * carrying around the index into the extent list for the next - * iteration. - */ - error = filemap_write_and_wait(VFS_I(ip)->i_mapping); + error = xfs_free_file_space(ip, offset, len); if (error) return error; + + /* + * Trim eofblocks to avoid shifting uninitialized post-eof preallocation + * into the accessible region of the file. + */ if (xfs_can_free_eofblocks(ip, true)) { error = xfs_free_eofblocks(mp, ip, false); if (error) return error; } - error = xfs_free_file_space(ip, offset, len); + /* + * Writeback and invalidate cache for the remainder of the file as we're + * about to shift down every extent from the collapse range to EOF. The + * free of the collapse range above might have already done some of + * this, but we shouldn't rely on it to do anything outside of the range + * that was freed. + */ + error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, + offset + len, -1); + if (error) + return error; + error = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping, + (offset + len) >> PAGE_CACHE_SHIFT, -1); if (error) return error; @@ -1525,10 +1507,10 @@ xfs_collapse_file_space( * We are using the write transaction in which max 2 bmbt * updates are allowed */ - error = xfs_bmap_shift_extents(tp, ip, &done, start_fsb, - shift_fsb, ¤t_ext, - &first_block, &free_list, - XFS_BMAP_MAX_SHIFT_EXTENTS); + start_fsb = next_fsb; + error = xfs_bmap_shift_extents(tp, ip, start_fsb, shift_fsb, + &done, &next_fsb, &first_block, &free_list, + XFS_BMAP_MAX_SHIFT_EXTENTS); if (error) goto out; @@ -1638,7 +1620,7 @@ xfs_swap_extents_check_format( return 0; } -int +static int xfs_swap_extent_flush( struct xfs_inode *ip) { diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index cd7b8ca9b064..24b4ebea0d4d 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -623,10 +623,11 @@ _xfs_buf_read( bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD); bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD); - xfs_buf_iorequest(bp); - if (flags & XBF_ASYNC) + if (flags & XBF_ASYNC) { + xfs_buf_submit(bp); return 0; - return xfs_buf_iowait(bp); + } + return xfs_buf_submit_wait(bp); } xfs_buf_t * @@ -687,34 +688,39 @@ xfs_buf_readahead_map( * Read an uncached buffer from disk. Allocates and returns a locked * buffer containing the disk contents or nothing. */ -struct xfs_buf * +int xfs_buf_read_uncached( struct xfs_buftarg *target, xfs_daddr_t daddr, size_t numblks, int flags, + struct xfs_buf **bpp, const struct xfs_buf_ops *ops) { struct xfs_buf *bp; + *bpp = NULL; + bp = xfs_buf_get_uncached(target, numblks, flags); if (!bp) - return NULL; + return -ENOMEM; /* set up the buffer for a read IO */ ASSERT(bp->b_map_count == 1); - bp->b_bn = daddr; + bp->b_bn = XFS_BUF_DADDR_NULL; /* always null for uncached buffers */ bp->b_maps[0].bm_bn = daddr; bp->b_flags |= XBF_READ; bp->b_ops = ops; - if (XFS_FORCED_SHUTDOWN(target->bt_mount)) { + xfs_buf_submit_wait(bp); + if (bp->b_error) { + int error = bp->b_error; xfs_buf_relse(bp); - return NULL; + return error; } - xfs_buf_iorequest(bp); - xfs_buf_iowait(bp); - return bp; + + *bpp = bp; + return 0; } /* @@ -998,53 +1004,56 @@ xfs_buf_wait_unpin( * Buffer Utility Routines */ -STATIC void -xfs_buf_iodone_work( - struct work_struct *work) +void +xfs_buf_ioend( + struct xfs_buf *bp) { - struct xfs_buf *bp = - container_of(work, xfs_buf_t, b_iodone_work); - bool read = !!(bp->b_flags & XBF_READ); + bool read = bp->b_flags & XBF_READ; + + trace_xfs_buf_iodone(bp, _RET_IP_); bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD); - /* only validate buffers that were read without errors */ - if (read && bp->b_ops && !bp->b_error && (bp->b_flags & XBF_DONE)) + /* + * Pull in IO completion errors now. We are guaranteed to be running + * single threaded, so we don't need the lock to read b_io_error. + */ + if (!bp->b_error && bp->b_io_error) + xfs_buf_ioerror(bp, bp->b_io_error); + + /* Only validate buffers that were read without errors */ + if (read && !bp->b_error && bp->b_ops) { + ASSERT(!bp->b_iodone); bp->b_ops->verify_read(bp); + } + + if (!bp->b_error) + bp->b_flags |= XBF_DONE; if (bp->b_iodone) (*(bp->b_iodone))(bp); else if (bp->b_flags & XBF_ASYNC) xfs_buf_relse(bp); - else { - ASSERT(read && bp->b_ops); + else complete(&bp->b_iowait); - } } -void -xfs_buf_ioend( - struct xfs_buf *bp, - int schedule) +static void +xfs_buf_ioend_work( + struct work_struct *work) { - bool read = !!(bp->b_flags & XBF_READ); - - trace_xfs_buf_iodone(bp, _RET_IP_); + struct xfs_buf *bp = + container_of(work, xfs_buf_t, b_iodone_work); - if (bp->b_error == 0) - bp->b_flags |= XBF_DONE; + xfs_buf_ioend(bp); +} - if (bp->b_iodone || (read && bp->b_ops) || (bp->b_flags & XBF_ASYNC)) { - if (schedule) { - INIT_WORK(&bp->b_iodone_work, xfs_buf_iodone_work); - queue_work(xfslogd_workqueue, &bp->b_iodone_work); - } else { - xfs_buf_iodone_work(&bp->b_iodone_work); - } - } else { - bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD); - complete(&bp->b_iowait); - } +void +xfs_buf_ioend_async( + struct xfs_buf *bp) +{ + INIT_WORK(&bp->b_iodone_work, xfs_buf_ioend_work); + queue_work(xfslogd_workqueue, &bp->b_iodone_work); } void @@ -1067,96 +1076,6 @@ xfs_buf_ioerror_alert( (__uint64_t)XFS_BUF_ADDR(bp), func, -bp->b_error, bp->b_length); } -/* - * Called when we want to stop a buffer from getting written or read. - * We attach the EIO error, muck with its flags, and call xfs_buf_ioend - * so that the proper iodone callbacks get called. - */ -STATIC int -xfs_bioerror( - xfs_buf_t *bp) -{ -#ifdef XFSERRORDEBUG - ASSERT(XFS_BUF_ISREAD(bp) || bp->b_iodone); -#endif - - /* - * No need to wait until the buffer is unpinned, we aren't flushing it. - */ - xfs_buf_ioerror(bp, -EIO); - - /* - * We're calling xfs_buf_ioend, so delete XBF_DONE flag. - */ - XFS_BUF_UNREAD(bp); - XFS_BUF_UNDONE(bp); - xfs_buf_stale(bp); - - xfs_buf_ioend(bp, 0); - - return -EIO; -} - -/* - * Same as xfs_bioerror, except that we are releasing the buffer - * here ourselves, and avoiding the xfs_buf_ioend call. - * This is meant for userdata errors; metadata bufs come with - * iodone functions attached, so that we can track down errors. - */ -int -xfs_bioerror_relse( - struct xfs_buf *bp) -{ - int64_t fl = bp->b_flags; - /* - * No need to wait until the buffer is unpinned. - * We aren't flushing it. - * - * chunkhold expects B_DONE to be set, whether - * we actually finish the I/O or not. We don't want to - * change that interface. - */ - XFS_BUF_UNREAD(bp); - XFS_BUF_DONE(bp); - xfs_buf_stale(bp); - bp->b_iodone = NULL; - if (!(fl & XBF_ASYNC)) { - /* - * Mark b_error and B_ERROR _both_. - * Lot's of chunkcache code assumes that. - * There's no reason to mark error for - * ASYNC buffers. - */ - xfs_buf_ioerror(bp, -EIO); - complete(&bp->b_iowait); - } else { - xfs_buf_relse(bp); - } - - return -EIO; -} - -STATIC int -xfs_bdstrat_cb( - struct xfs_buf *bp) -{ - if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) { - trace_xfs_bdstrat_shut(bp, _RET_IP_); - /* - * Metadata write that didn't get logged but - * written delayed anyway. These aren't associated - * with a transaction, and can be ignored. - */ - if (!bp->b_iodone && !XFS_BUF_ISREAD(bp)) - return xfs_bioerror_relse(bp); - else - return xfs_bioerror(bp); - } - - xfs_buf_iorequest(bp); - return 0; -} - int xfs_bwrite( struct xfs_buf *bp) @@ -1166,11 +1085,10 @@ xfs_bwrite( ASSERT(xfs_buf_islocked(bp)); bp->b_flags |= XBF_WRITE; - bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q | XBF_WRITE_FAIL); - - xfs_bdstrat_cb(bp); + bp->b_flags &= ~(XBF_ASYNC | XBF_READ | _XBF_DELWRI_Q | + XBF_WRITE_FAIL | XBF_DONE); - error = xfs_buf_iowait(bp); + error = xfs_buf_submit_wait(bp); if (error) { xfs_force_shutdown(bp->b_target->bt_mount, SHUTDOWN_META_IO_ERROR); @@ -1179,15 +1097,6 @@ xfs_bwrite( } STATIC void -_xfs_buf_ioend( - xfs_buf_t *bp, - int schedule) -{ - if (atomic_dec_and_test(&bp->b_io_remaining) == 1) - xfs_buf_ioend(bp, schedule); -} - -STATIC void xfs_buf_bio_end_io( struct bio *bio, int error) @@ -1198,13 +1107,18 @@ xfs_buf_bio_end_io( * don't overwrite existing errors - otherwise we can lose errors on * buffers that require multiple bios to complete. */ - if (!bp->b_error) - xfs_buf_ioerror(bp, error); + if (error) { + spin_lock(&bp->b_lock); + if (!bp->b_io_error) + bp->b_io_error = error; + spin_unlock(&bp->b_lock); + } if (!bp->b_error && xfs_buf_is_vmapped(bp) && (bp->b_flags & XBF_READ)) invalidate_kernel_vmap_range(bp->b_addr, xfs_buf_vmap_len(bp)); - _xfs_buf_ioend(bp, 1); + if (atomic_dec_and_test(&bp->b_io_remaining) == 1) + xfs_buf_ioend_async(bp); bio_put(bio); } @@ -1283,7 +1197,7 @@ next_chunk: } else { /* * This is guaranteed not to be the last io reference count - * because the caller (xfs_buf_iorequest) holds a count itself. + * because the caller (xfs_buf_submit) holds a count itself. */ atomic_dec(&bp->b_io_remaining); xfs_buf_ioerror(bp, -EIO); @@ -1373,53 +1287,131 @@ _xfs_buf_ioapply( blk_finish_plug(&plug); } +/* + * Asynchronous IO submission path. This transfers the buffer lock ownership and + * the current reference to the IO. It is not safe to reference the buffer after + * a call to this function unless the caller holds an additional reference + * itself. + */ void -xfs_buf_iorequest( - xfs_buf_t *bp) +xfs_buf_submit( + struct xfs_buf *bp) { - trace_xfs_buf_iorequest(bp, _RET_IP_); + trace_xfs_buf_submit(bp, _RET_IP_); ASSERT(!(bp->b_flags & _XBF_DELWRI_Q)); + ASSERT(bp->b_flags & XBF_ASYNC); + + /* on shutdown we stale and complete the buffer immediately */ + if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) { + xfs_buf_ioerror(bp, -EIO); + bp->b_flags &= ~XBF_DONE; + xfs_buf_stale(bp); + xfs_buf_ioend(bp); + return; + } if (bp->b_flags & XBF_WRITE) xfs_buf_wait_unpin(bp); + + /* clear the internal error state to avoid spurious errors */ + bp->b_io_error = 0; + + /* + * The caller's reference is released during I/O completion. + * This occurs some time after the last b_io_remaining reference is + * released, so after we drop our Io reference we have to have some + * other reference to ensure the buffer doesn't go away from underneath + * us. Take a direct reference to ensure we have safe access to the + * buffer until we are finished with it. + */ xfs_buf_hold(bp); /* - * Set the count to 1 initially, this will stop an I/O - * completion callout which happens before we have started - * all the I/O from calling xfs_buf_ioend too early. + * Set the count to 1 initially, this will stop an I/O completion + * callout which happens before we have started all the I/O from calling + * xfs_buf_ioend too early. */ atomic_set(&bp->b_io_remaining, 1); _xfs_buf_ioapply(bp); + /* - * If _xfs_buf_ioapply failed, we'll get back here with - * only the reference we took above. _xfs_buf_ioend will - * drop it to zero, so we'd better not queue it for later, - * or we'll free it before it's done. + * If _xfs_buf_ioapply failed, we can get back here with only the IO + * reference we took above. If we drop it to zero, run completion so + * that we don't return to the caller with completion still pending. */ - _xfs_buf_ioend(bp, bp->b_error ? 0 : 1); + if (atomic_dec_and_test(&bp->b_io_remaining) == 1) { + if (bp->b_error) + xfs_buf_ioend(bp); + else + xfs_buf_ioend_async(bp); + } xfs_buf_rele(bp); + /* Note: it is not safe to reference bp now we've dropped our ref */ } /* - * Waits for I/O to complete on the buffer supplied. It returns immediately if - * no I/O is pending or there is already a pending error on the buffer, in which - * case nothing will ever complete. It returns the I/O error code, if any, or - * 0 if there was no error. + * Synchronous buffer IO submission path, read or write. */ int -xfs_buf_iowait( - xfs_buf_t *bp) +xfs_buf_submit_wait( + struct xfs_buf *bp) { - trace_xfs_buf_iowait(bp, _RET_IP_); + int error; - if (!bp->b_error) - wait_for_completion(&bp->b_iowait); + trace_xfs_buf_submit_wait(bp, _RET_IP_); + + ASSERT(!(bp->b_flags & (_XBF_DELWRI_Q | XBF_ASYNC))); + + if (XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) { + xfs_buf_ioerror(bp, -EIO); + xfs_buf_stale(bp); + bp->b_flags &= ~XBF_DONE; + return -EIO; + } + + if (bp->b_flags & XBF_WRITE) + xfs_buf_wait_unpin(bp); + + /* clear the internal error state to avoid spurious errors */ + bp->b_io_error = 0; + + /* + * For synchronous IO, the IO does not inherit the submitters reference + * count, nor the buffer lock. Hence we cannot release the reference we + * are about to take until we've waited for all IO completion to occur, + * including any xfs_buf_ioend_async() work that may be pending. + */ + xfs_buf_hold(bp); + + /* + * Set the count to 1 initially, this will stop an I/O completion + * callout which happens before we have started all the I/O from calling + * xfs_buf_ioend too early. + */ + atomic_set(&bp->b_io_remaining, 1); + _xfs_buf_ioapply(bp); + + /* + * make sure we run completion synchronously if it raced with us and is + * already complete. + */ + if (atomic_dec_and_test(&bp->b_io_remaining) == 1) + xfs_buf_ioend(bp); + /* wait for completion before gathering the error from the buffer */ + trace_xfs_buf_iowait(bp, _RET_IP_); + wait_for_completion(&bp->b_iowait); trace_xfs_buf_iowait_done(bp, _RET_IP_); - return bp->b_error; + error = bp->b_error; + + /* + * all done now, we can release the hold that keeps the buffer + * referenced for the entire IO. + */ + xfs_buf_rele(bp); + return error; } xfs_caddr_t @@ -1678,8 +1670,6 @@ xfs_alloc_buftarg( btp->bt_dev = bdev->bd_dev; btp->bt_bdev = bdev; btp->bt_bdi = blk_get_backing_dev_info(bdev); - if (!btp->bt_bdi) - goto error; if (xfs_setsize_buftarg_early(btp, bdev)) goto error; @@ -1813,13 +1803,19 @@ __xfs_buf_delwri_submit( blk_start_plug(&plug); list_for_each_entry_safe(bp, n, io_list, b_list) { bp->b_flags &= ~(_XBF_DELWRI_Q | XBF_ASYNC | XBF_WRITE_FAIL); - bp->b_flags |= XBF_WRITE; + bp->b_flags |= XBF_WRITE | XBF_ASYNC; - if (!wait) { - bp->b_flags |= XBF_ASYNC; + /* + * we do all Io submission async. This means if we need to wait + * for IO completion we need to take an extra reference so the + * buffer is still valid on the other side. + */ + if (wait) + xfs_buf_hold(bp); + else list_del_init(&bp->b_list); - } - xfs_bdstrat_cb(bp); + + xfs_buf_submit(bp); } blk_finish_plug(&plug); @@ -1866,7 +1862,10 @@ xfs_buf_delwri_submit( bp = list_first_entry(&io_list, struct xfs_buf, b_list); list_del_init(&bp->b_list); - error2 = xfs_buf_iowait(bp); + + /* locking the buffer will wait for async IO completion. */ + xfs_buf_lock(bp); + error2 = bp->b_error; xfs_buf_relse(bp); if (!error) error = error2; @@ -1884,7 +1883,7 @@ xfs_buf_init(void) goto out; xfslogd_workqueue = alloc_workqueue("xfslogd", - WQ_MEM_RECLAIM | WQ_HIGHPRI, 1); + WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_FREEZABLE, 1); if (!xfslogd_workqueue) goto out_free_buf_zone; diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index c753183900b3..82002c00af90 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -158,6 +158,7 @@ typedef struct xfs_buf { struct list_head b_lru; /* lru list */ spinlock_t b_lock; /* internal state lock */ unsigned int b_state; /* internal state flags */ + int b_io_error; /* internal IO error state */ wait_queue_head_t b_waiters; /* unpin waiters */ struct list_head b_list; struct xfs_perag *b_pag; /* contains rbtree root */ @@ -268,9 +269,9 @@ int xfs_buf_associate_memory(struct xfs_buf *bp, void *mem, size_t length); struct xfs_buf *xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks, int flags); -struct xfs_buf *xfs_buf_read_uncached(struct xfs_buftarg *target, - xfs_daddr_t daddr, size_t numblks, int flags, - const struct xfs_buf_ops *ops); +int xfs_buf_read_uncached(struct xfs_buftarg *target, xfs_daddr_t daddr, + size_t numblks, int flags, struct xfs_buf **bpp, + const struct xfs_buf_ops *ops); void xfs_buf_hold(struct xfs_buf *bp); /* Releasing Buffers */ @@ -286,18 +287,16 @@ extern void xfs_buf_unlock(xfs_buf_t *); /* Buffer Read and Write Routines */ extern int xfs_bwrite(struct xfs_buf *bp); -extern void xfs_buf_ioend(xfs_buf_t *, int); +extern void xfs_buf_ioend(struct xfs_buf *bp); extern void xfs_buf_ioerror(xfs_buf_t *, int); extern void xfs_buf_ioerror_alert(struct xfs_buf *, const char *func); -extern void xfs_buf_iorequest(xfs_buf_t *); -extern int xfs_buf_iowait(xfs_buf_t *); +extern void xfs_buf_submit(struct xfs_buf *bp); +extern int xfs_buf_submit_wait(struct xfs_buf *bp); extern void xfs_buf_iomove(xfs_buf_t *, size_t, size_t, void *, xfs_buf_rw_t); #define xfs_buf_zero(bp, off, len) \ xfs_buf_iomove((bp), (off), (len), NULL, XBRW_ZERO) -extern int xfs_bioerror_relse(struct xfs_buf *); - /* Buffer Utility Routines */ extern xfs_caddr_t xfs_buf_offset(xfs_buf_t *, size_t); diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 76007deed31f..f15969543326 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -491,7 +491,7 @@ xfs_buf_item_unpin( xfs_buf_ioerror(bp, -EIO); XFS_BUF_UNDONE(bp); xfs_buf_stale(bp); - xfs_buf_ioend(bp, 0); + xfs_buf_ioend(bp); } } @@ -501,7 +501,7 @@ xfs_buf_item_unpin( * buffer being bad.. */ -DEFINE_RATELIMIT_STATE(xfs_buf_write_fail_rl_state, 30 * HZ, 10); +static DEFINE_RATELIMIT_STATE(xfs_buf_write_fail_rl_state, 30 * HZ, 10); STATIC uint xfs_buf_item_push( @@ -1081,7 +1081,7 @@ xfs_buf_iodone_callbacks( * a way to shut the filesystem down if the writes keep failing. * * In practice we'll shut the filesystem down soon as non-transient - * erorrs tend to affect the whole device and a failing log write + * errors tend to affect the whole device and a failing log write * will make us give up. But we really ought to do better here. */ if (XFS_BUF_ISASYNC(bp)) { @@ -1094,7 +1094,7 @@ xfs_buf_iodone_callbacks( if (!(bp->b_flags & (XBF_STALE|XBF_WRITE_FAIL))) { bp->b_flags |= XBF_WRITE | XBF_ASYNC | XBF_DONE | XBF_WRITE_FAIL; - xfs_buf_iorequest(bp); + xfs_buf_submit(bp); } else { xfs_buf_relse(bp); } @@ -1115,7 +1115,7 @@ do_callbacks: xfs_buf_do_callbacks(bp); bp->b_fspriv = NULL; bp->b_iodone = NULL; - xfs_buf_ioend(bp, 0); + xfs_buf_ioend(bp); } /* diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index de5368c803f9..eb596b419942 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -983,7 +983,7 @@ xfs_vm_page_mkwrite( /* * This type is designed to indicate the type of offset we would like - * to search from page cache for either xfs_seek_data() or xfs_seek_hole(). + * to search from page cache for xfs_seek_hole_data(). */ enum { HOLE_OFF = 0, @@ -1040,7 +1040,7 @@ xfs_lookup_buffer_offset( /* * This routine is called to find out and return a data or hole offset * from the page cache for unwritten extents according to the desired - * type for xfs_seek_data() or xfs_seek_hole(). + * type for xfs_seek_hole_data(). * * The argument offset is used to tell where we start to search from the * page cache. Map is used to figure out the end points of the range to @@ -1200,9 +1200,10 @@ out: } STATIC loff_t -xfs_seek_data( +xfs_seek_hole_data( struct file *file, - loff_t start) + loff_t start, + int whence) { struct inode *inode = file->f_mapping->host; struct xfs_inode *ip = XFS_I(inode); @@ -1214,6 +1215,9 @@ xfs_seek_data( uint lock; int error; + if (XFS_FORCED_SHUTDOWN(mp)) + return -EIO; + lock = xfs_ilock_data_map_shared(ip); isize = i_size_read(inode); @@ -1228,6 +1232,7 @@ xfs_seek_data( */ fsbno = XFS_B_TO_FSBT(mp, start); end = XFS_B_TO_FSB(mp, isize); + for (;;) { struct xfs_bmbt_irec map[2]; int nmap = 2; @@ -1248,29 +1253,48 @@ xfs_seek_data( offset = max_t(loff_t, start, XFS_FSB_TO_B(mp, map[i].br_startoff)); - /* Landed in a data extent */ - if (map[i].br_startblock == DELAYSTARTBLOCK || - (map[i].br_state == XFS_EXT_NORM && - !isnullstartblock(map[i].br_startblock))) + /* Landed in the hole we wanted? */ + if (whence == SEEK_HOLE && + map[i].br_startblock == HOLESTARTBLOCK) + goto out; + + /* Landed in the data extent we wanted? */ + if (whence == SEEK_DATA && + (map[i].br_startblock == DELAYSTARTBLOCK || + (map[i].br_state == XFS_EXT_NORM && + !isnullstartblock(map[i].br_startblock)))) goto out; /* - * Landed in an unwritten extent, try to search data - * from page cache. + * Landed in an unwritten extent, try to search + * for hole or data from page cache. */ if (map[i].br_state == XFS_EXT_UNWRITTEN) { if (xfs_find_get_desired_pgoff(inode, &map[i], - DATA_OFF, &offset)) + whence == SEEK_HOLE ? HOLE_OFF : DATA_OFF, + &offset)) goto out; } } /* - * map[0] is hole or its an unwritten extent but - * without data in page cache. Probably means that - * we are reading after EOF if nothing in map[1]. + * We only received one extent out of the two requested. This + * means we've hit EOF and didn't find what we are looking for. */ if (nmap == 1) { + /* + * If we were looking for a hole, set offset to + * the end of the file (i.e., there is an implicit + * hole at the end of any file). + */ + if (whence == SEEK_HOLE) { + offset = isize; + break; + } + /* + * If we were looking for data, it's nowhere to be found + */ + ASSERT(whence == SEEK_DATA); error = -ENXIO; goto out_unlock; } @@ -1279,125 +1303,30 @@ xfs_seek_data( /* * Nothing was found, proceed to the next round of search - * if reading offset not beyond or hit EOF. + * if the next reading offset is not at or beyond EOF. */ fsbno = map[i - 1].br_startoff + map[i - 1].br_blockcount; start = XFS_FSB_TO_B(mp, fsbno); if (start >= isize) { + if (whence == SEEK_HOLE) { + offset = isize; + break; + } + ASSERT(whence == SEEK_DATA); error = -ENXIO; goto out_unlock; } } out: - offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes); - -out_unlock: - xfs_iunlock(ip, lock); - - if (error) - return error; - return offset; -} - -STATIC loff_t -xfs_seek_hole( - struct file *file, - loff_t start) -{ - struct inode *inode = file->f_mapping->host; - struct xfs_inode *ip = XFS_I(inode); - struct xfs_mount *mp = ip->i_mount; - loff_t uninitialized_var(offset); - xfs_fsize_t isize; - xfs_fileoff_t fsbno; - xfs_filblks_t end; - uint lock; - int error; - - if (XFS_FORCED_SHUTDOWN(mp)) - return -EIO; - - lock = xfs_ilock_data_map_shared(ip); - - isize = i_size_read(inode); - if (start >= isize) { - error = -ENXIO; - goto out_unlock; - } - - fsbno = XFS_B_TO_FSBT(mp, start); - end = XFS_B_TO_FSB(mp, isize); - - for (;;) { - struct xfs_bmbt_irec map[2]; - int nmap = 2; - unsigned int i; - - error = xfs_bmapi_read(ip, fsbno, end - fsbno, map, &nmap, - XFS_BMAPI_ENTIRE); - if (error) - goto out_unlock; - - /* No extents at given offset, must be beyond EOF */ - if (nmap == 0) { - error = -ENXIO; - goto out_unlock; - } - - for (i = 0; i < nmap; i++) { - offset = max_t(loff_t, start, - XFS_FSB_TO_B(mp, map[i].br_startoff)); - - /* Landed in a hole */ - if (map[i].br_startblock == HOLESTARTBLOCK) - goto out; - - /* - * Landed in an unwritten extent, try to search hole - * from page cache. - */ - if (map[i].br_state == XFS_EXT_UNWRITTEN) { - if (xfs_find_get_desired_pgoff(inode, &map[i], - HOLE_OFF, &offset)) - goto out; - } - } - - /* - * map[0] contains data or its unwritten but contains - * data in page cache, probably means that we are - * reading after EOF. We should fix offset to point - * to the end of the file(i.e., there is an implicit - * hole at the end of any file). - */ - if (nmap == 1) { - offset = isize; - break; - } - - ASSERT(i > 1); - - /* - * Both mappings contains data, proceed to the next round of - * search if the current reading offset not beyond or hit EOF. - */ - fsbno = map[i - 1].br_startoff + map[i - 1].br_blockcount; - start = XFS_FSB_TO_B(mp, fsbno); - if (start >= isize) { - offset = isize; - break; - } - } - -out: /* - * At this point, we must have found a hole. However, the returned + * If at this point we have found the hole we wanted, the returned * offset may be bigger than the file size as it may be aligned to - * page boundary for unwritten extents, we need to deal with this + * page boundary for unwritten extents. We need to deal with this * situation in particular. */ - offset = min_t(loff_t, offset, isize); + if (whence == SEEK_HOLE) + offset = min_t(loff_t, offset, isize); offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes); out_unlock: @@ -1412,17 +1341,16 @@ STATIC loff_t xfs_file_llseek( struct file *file, loff_t offset, - int origin) + int whence) { - switch (origin) { + switch (whence) { case SEEK_END: case SEEK_CUR: case SEEK_SET: - return generic_file_llseek(file, offset, origin); - case SEEK_DATA: - return xfs_seek_data(file, offset); + return generic_file_llseek(file, offset, whence); case SEEK_HOLE: - return xfs_seek_hole(file, offset); + case SEEK_DATA: + return xfs_seek_hole_data(file, offset, whence); default: return -EINVAL; } diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index f91de1ef05e1..c05ac8b70fa9 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -172,16 +172,11 @@ xfs_growfs_data_private( if ((error = xfs_sb_validate_fsb_count(&mp->m_sb, nb))) return error; dpct = pct - mp->m_sb.sb_imax_pct; - bp = xfs_buf_read_uncached(mp->m_ddev_targp, + error = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1), - XFS_FSS_TO_BB(mp, 1), 0, NULL); - if (!bp) - return -EIO; - if (bp->b_error) { - error = bp->b_error; - xfs_buf_relse(bp); + XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL); + if (error) return error; - } xfs_buf_relse(bp); new = nb; /* use new as a temporary here */ diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c index 5399ef222dd7..4d41b241298f 100644 --- a/fs/xfs/xfs_globals.c +++ b/fs/xfs/xfs_globals.c @@ -43,3 +43,7 @@ xfs_param_t xfs_params = { .fstrm_timer = { 1, 30*100, 3600*100}, .eofb_timer = { 1, 300, 3600*24}, }; + +struct xfs_globals xfs_globals = { + .log_recovery_delay = 0, /* no delay by default */ +}; diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 981b2cf51985..b45f7b27b5df 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -33,7 +33,6 @@ #include "xfs_trace.h" #include "xfs_icache.h" #include "xfs_bmap_util.h" -#include "xfs_quota.h" #include "xfs_dquot_item.h" #include "xfs_dquot.h" diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index fea3c92fb3f0..8ed049d1e332 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -654,7 +654,7 @@ xfs_ialloc( xfs_inode_t *ip; uint flags; int error; - timespec_t tv; + struct timespec tv; /* * Call the space management code to pick @@ -720,7 +720,7 @@ xfs_ialloc( ip->i_d.di_nextents = 0; ASSERT(ip->i_d.di_nblocks == 0); - nanotime(&tv); + tv = current_fs_time(mp->m_super); ip->i_d.di_mtime.t_sec = (__int32_t)tv.tv_sec; ip->i_d.di_mtime.t_nsec = (__int32_t)tv.tv_nsec; ip->i_d.di_atime = ip->i_d.di_mtime; @@ -769,6 +769,8 @@ xfs_ialloc( di_flags |= XFS_DIFLAG_EXTSZINHERIT; ip->i_d.di_extsize = pip->i_d.di_extsize; } + if (pip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) + di_flags |= XFS_DIFLAG_PROJINHERIT; } else if (S_ISREG(mode)) { if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) di_flags |= XFS_DIFLAG_REALTIME; @@ -789,8 +791,6 @@ xfs_ialloc( if ((pip->i_d.di_flags & XFS_DIFLAG_NOSYMLINKS) && xfs_inherit_nosymlinks) di_flags |= XFS_DIFLAG_NOSYMLINKS; - if (pip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT) - di_flags |= XFS_DIFLAG_PROJINHERIT; if ((pip->i_d.di_flags & XFS_DIFLAG_NODEFRAG) && xfs_inherit_nodefrag) di_flags |= XFS_DIFLAG_NODEFRAG; @@ -1153,9 +1153,11 @@ xfs_create( if (error) goto out_trans_cancel; - error = xfs_dir_canenter(tp, dp, name, resblks); - if (error) - goto out_trans_cancel; + if (!resblks) { + error = xfs_dir_canenter(tp, dp, name); + if (error) + goto out_trans_cancel; + } /* * A newly created regular or special file just has one directory @@ -1421,9 +1423,11 @@ xfs_link( goto error_return; } - error = xfs_dir_canenter(tp, tdp, target_name, resblks); - if (error) - goto error_return; + if (!resblks) { + error = xfs_dir_canenter(tp, tdp, target_name); + if (error) + goto error_return; + } xfs_bmap_init(&free_list, &first_block); @@ -2759,9 +2763,11 @@ xfs_rename( * If there's no space reservation, check the entry will * fit before actually inserting it. */ - error = xfs_dir_canenter(tp, target_dp, target_name, spaceres); - if (error) - goto error_return; + if (!spaceres) { + error = xfs_dir_canenter(tp, target_dp, target_name); + if (error) + goto error_return; + } /* * If target does not exist and the rename crosses * directories, adjust the target directory link count @@ -3056,7 +3062,7 @@ cluster_corrupt_out: XFS_BUF_UNDONE(bp); xfs_buf_stale(bp); xfs_buf_ioerror(bp, -EIO); - xfs_buf_ioend(bp, 0); + xfs_buf_ioend(bp); } else { xfs_buf_stale(bp); xfs_buf_relse(bp); diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index c10e3fadd9af..9af2882e1f4c 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -102,7 +102,7 @@ xfs_new_eof(struct xfs_inode *ip, xfs_fsize_t new_size) { xfs_fsize_t i_size = i_size_read(VFS_I(ip)); - if (new_size > i_size) + if (new_size > i_size || new_size < 0) new_size = i_size; return new_size > ip->i_d.di_size ? new_size : 0; } diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index de5a7be36e60..63de0b0acc32 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -615,7 +615,7 @@ xfs_iflush_done( blip = bp->b_fspriv; prev = NULL; while (blip != NULL) { - if (lip->li_cb != xfs_iflush_done) { + if (blip->li_cb != xfs_iflush_done) { prev = blip; blip = blip->li_bio_list; continue; diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index 3799695b9249..24c926b6fe85 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -968,8 +968,6 @@ xfs_set_diflags( di_flags |= XFS_DIFLAG_NOATIME; if (xflags & XFS_XFLAG_NODUMP) di_flags |= XFS_DIFLAG_NODUMP; - if (xflags & XFS_XFLAG_PROJINHERIT) - di_flags |= XFS_DIFLAG_PROJINHERIT; if (xflags & XFS_XFLAG_NODEFRAG) di_flags |= XFS_DIFLAG_NODEFRAG; if (xflags & XFS_XFLAG_FILESTREAM) @@ -981,6 +979,8 @@ xfs_set_diflags( di_flags |= XFS_DIFLAG_NOSYMLINKS; if (xflags & XFS_XFLAG_EXTSZINHERIT) di_flags |= XFS_DIFLAG_EXTSZINHERIT; + if (xflags & XFS_XFLAG_PROJINHERIT) + di_flags |= XFS_DIFLAG_PROJINHERIT; } else if (S_ISREG(ip->i_d.di_mode)) { if (xflags & XFS_XFLAG_REALTIME) di_flags |= XFS_DIFLAG_REALTIME; @@ -1231,13 +1231,25 @@ xfs_ioctl_setattr( } - if (mask & FSX_EXTSIZE) - ip->i_d.di_extsize = fa->fsx_extsize >> mp->m_sb.sb_blocklog; if (mask & FSX_XFLAGS) { xfs_set_diflags(ip, fa->fsx_xflags); xfs_diflags_to_linux(ip); } + /* + * Only set the extent size hint if we've already determined that the + * extent size hint should be set on the inode. If no extent size flags + * are set on the inode then unconditionally clear the extent size hint. + */ + if (mask & FSX_EXTSIZE) { + int extsize = 0; + + if (ip->i_d.di_flags & + (XFS_DIFLAG_EXTSIZE | XFS_DIFLAG_EXTSZINHERIT)) + extsize = fa->fsx_extsize >> mp->m_sb.sb_blocklog; + ip->i_d.di_extsize = extsize; + } + xfs_trans_ichgtime(tp, ip, XFS_ICHGTIME_CHG); xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); @@ -1349,7 +1361,7 @@ xfs_ioc_setxflags( STATIC int xfs_getbmap_format(void **ap, struct getbmapx *bmv, int *full) { - struct getbmap __user *base = *ap; + struct getbmap __user *base = (struct getbmap __user *)*ap; /* copy only getbmap portion (not getbmapx) */ if (copy_to_user(base, bmv, sizeof(struct getbmap))) @@ -1380,7 +1392,7 @@ xfs_ioc_getbmap( bmx.bmv_iflags |= BMV_IF_NO_DMAPI_READ; error = xfs_getbmap(ip, &bmx, xfs_getbmap_format, - (struct getbmap *)arg+1); + (__force struct getbmap *)arg+1); if (error) return error; @@ -1393,7 +1405,7 @@ xfs_ioc_getbmap( STATIC int xfs_getbmapx_format(void **ap, struct getbmapx *bmv, int *full) { - struct getbmapx __user *base = *ap; + struct getbmapx __user *base = (struct getbmapx __user *)*ap; if (copy_to_user(base, bmv, sizeof(struct getbmapx))) return -EFAULT; @@ -1420,7 +1432,7 @@ xfs_ioc_getbmapx( return -EINVAL; error = xfs_getbmap(ip, &bmx, xfs_getbmapx_format, - (struct getbmapx *)arg+1); + (__force struct getbmapx *)arg+1); if (error) return error; diff --git a/fs/xfs/xfs_ioctl32.c b/fs/xfs/xfs_ioctl32.c index a554646ff141..94ce027e28e3 100644 --- a/fs/xfs/xfs_ioctl32.c +++ b/fs/xfs/xfs_ioctl32.c @@ -160,6 +160,7 @@ xfs_ioctl32_bstat_copyin( get_user(bstat->bs_gen, &bstat32->bs_gen) || get_user(bstat->bs_projid_lo, &bstat32->bs_projid_lo) || get_user(bstat->bs_projid_hi, &bstat32->bs_projid_hi) || + get_user(bstat->bs_forkoff, &bstat32->bs_forkoff) || get_user(bstat->bs_dmevmask, &bstat32->bs_dmevmask) || get_user(bstat->bs_dmstate, &bstat32->bs_dmstate) || get_user(bstat->bs_aextents, &bstat32->bs_aextents)) @@ -214,6 +215,7 @@ xfs_bulkstat_one_fmt_compat( put_user(buffer->bs_gen, &p32->bs_gen) || put_user(buffer->bs_projid, &p32->bs_projid) || put_user(buffer->bs_projid_hi, &p32->bs_projid_hi) || + put_user(buffer->bs_forkoff, &p32->bs_forkoff) || put_user(buffer->bs_dmevmask, &p32->bs_dmevmask) || put_user(buffer->bs_dmstate, &p32->bs_dmstate) || put_user(buffer->bs_aextents, &p32->bs_aextents)) diff --git a/fs/xfs/xfs_ioctl32.h b/fs/xfs/xfs_ioctl32.h index 80f4060e8970..b1bb45444df8 100644 --- a/fs/xfs/xfs_ioctl32.h +++ b/fs/xfs/xfs_ioctl32.h @@ -67,8 +67,9 @@ typedef struct compat_xfs_bstat { __u32 bs_gen; /* generation count */ __u16 bs_projid_lo; /* lower part of project id */ #define bs_projid bs_projid_lo /* (previously just bs_projid) */ + __u16 bs_forkoff; /* inode fork offset in bytes */ __u16 bs_projid_hi; /* high part of project id */ - unsigned char bs_pad[12]; /* pad space, unused */ + unsigned char bs_pad[10]; /* pad space, unused */ __u32 bs_dmevmask; /* DMIG event mask */ __u16 bs_dmstate; /* DMIG state info */ __u16 bs_aextents; /* attribute number of extents */ diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index e9c47b6f5e5a..afcf3c926565 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -404,8 +404,8 @@ xfs_quota_calc_throttle( int shift = 0; struct xfs_dquot *dq = xfs_inode_dquot(ip, type); - /* over hi wmark, squash the prealloc completely */ - if (dq->q_res_bcount >= dq->q_prealloc_hi_wmark) { + /* no dq, or over hi wmark, squash the prealloc completely */ + if (!dq || dq->q_res_bcount >= dq->q_prealloc_hi_wmark) { *qblocks = 0; *qfreesp = 0; return; diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 72129493e9d3..ec6dcdc181ee 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -849,6 +849,36 @@ xfs_setattr_size( return error; truncate_setsize(inode, newsize); + /* + * The "we can't serialise against page faults" pain gets worse. + * + * If the file is mapped then we have to clean the page at the old EOF + * when extending the file. Extending the file can expose changes the + * underlying page mapping (e.g. from beyond EOF to a hole or + * unwritten), and so on the next attempt to write to that page we need + * to remap it for write. i.e. we need .page_mkwrite() to be called. + * Hence we need to clean the page to clean the pte and so a new write + * fault will be triggered appropriately. + * + * If we do it before we change the inode size, then we can race with a + * page fault that maps the page with exactly the same problem. If we do + * it after we change the file size, then a new page fault can come in + * and allocate space before we've run the rest of the truncate + * transaction. That's kinda grotesque, but it's better than have data + * over a hole, and so that's the lesser evil that has been chosen here. + * + * The real solution, however, is to have some mechanism for locking out + * page faults while a truncate is in progress. + */ + if (newsize > oldsize && mapping_mapped(VFS_I(ip)->i_mapping)) { + error = filemap_write_and_wait_range( + VFS_I(ip)->i_mapping, + round_down(oldsize, PAGE_CACHE_SIZE), + round_up(oldsize, PAGE_CACHE_SIZE) - 1); + if (error) + return error; + } + tp = xfs_trans_alloc(mp, XFS_TRANS_SETATTR_SIZE); error = xfs_trans_reserve(tp, &M_RES(mp)->tr_itruncate, 0, 0); if (error) diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index f71be9c68017..f1deb961a296 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -639,7 +639,8 @@ next_ag: xfs_buf_relse(agbp); agbp = NULL; agino = 0; - } while (++agno < mp->m_sb.sb_agcount); + agno++; + } while (agno < mp->m_sb.sb_agcount); if (!error) { if (bufidx) { diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index d10dc8f397c9..6a51619d8690 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -56,7 +56,6 @@ typedef __uint64_t __psunsigned_t; #include "kmem.h" #include "mrlock.h" -#include "time.h" #include "uuid.h" #include <linux/semaphore.h> @@ -179,6 +178,11 @@ typedef __uint64_t __psunsigned_t; #define MAX(a,b) (max(a,b)) #define howmany(x, y) (((x)+((y)-1))/(y)) +static inline void delay(long ticks) +{ + schedule_timeout_uninterruptible(ticks); +} + /* * XFS wrapper structure for sysfs support. It depends on external data * structures and is embedded in various internal data structures to implement diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index ca4fd5bd8522..fe88ef67f93a 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -1678,7 +1678,7 @@ xlog_bdstrat( if (iclog->ic_state & XLOG_STATE_IOERROR) { xfs_buf_ioerror(bp, -EIO); xfs_buf_stale(bp); - xfs_buf_ioend(bp, 0); + xfs_buf_ioend(bp); /* * It would seem logical to return EIO here, but we rely on * the log state machine to propagate I/O errors instead of @@ -1688,7 +1688,7 @@ xlog_bdstrat( return 0; } - xfs_buf_iorequest(bp); + xfs_buf_submit(bp); return 0; } @@ -3867,18 +3867,17 @@ xlog_state_ioerror( * This is called from xfs_force_shutdown, when we're forcibly * shutting down the filesystem, typically because of an IO error. * Our main objectives here are to make sure that: - * a. the filesystem gets marked 'SHUTDOWN' for all interested + * a. if !logerror, flush the logs to disk. Anything modified + * after this is ignored. + * b. the filesystem gets marked 'SHUTDOWN' for all interested * parties to find out, 'atomically'. - * b. those who're sleeping on log reservations, pinned objects and + * c. those who're sleeping on log reservations, pinned objects and * other resources get woken up, and be told the bad news. - * c. nothing new gets queued up after (a) and (b) are done. - * d. if !logerror, flush the iclogs to disk, then seal them off - * for business. + * d. nothing new gets queued up after (b) and (c) are done. * - * Note: for delayed logging the !logerror case needs to flush the regions - * held in memory out to the iclogs before flushing them to disk. This needs - * to be done before the log is marked as shutdown, otherwise the flush to the - * iclogs will fail. + * Note: for the !logerror case we need to flush the regions held in memory out + * to disk first. This needs to be done before the log is marked as shutdown, + * otherwise the iclog writes will fail. */ int xfs_log_force_umount( @@ -3910,16 +3909,16 @@ xfs_log_force_umount( ASSERT(XLOG_FORCED_SHUTDOWN(log)); return 1; } - retval = 0; /* - * Flush the in memory commit item list before marking the log as - * being shut down. We need to do it in this order to ensure all the - * completed transactions are flushed to disk with the xfs_log_force() - * call below. + * Flush all the completed transactions to disk before marking the log + * being shut down. We need to do it in this order to ensure that + * completed operations are safely on disk before we shut down, and that + * we don't have to issue any buffer IO after the shutdown flags are set + * to guarantee this. */ if (!logerror) - xlog_cil_force(log); + _xfs_log_force(mp, XFS_LOG_SYNC, NULL); /* * mark the filesystem and the as in a shutdown state and wake @@ -3931,18 +3930,11 @@ xfs_log_force_umount( XFS_BUF_DONE(mp->m_sb_bp); /* - * This flag is sort of redundant because of the mount flag, but - * it's good to maintain the separation between the log and the rest - * of XFS. + * Mark the log and the iclogs with IO error flags to prevent any + * further log IO from being issued or completed. */ log->l_flags |= XLOG_IO_ERROR; - - /* - * If we hit a log error, we want to mark all the iclogs IOERROR - * while we're still holding the loglock. - */ - if (logerror) - retval = xlog_state_ioerror(log); + retval = xlog_state_ioerror(log); spin_unlock(&log->l_icloglock); /* @@ -3955,19 +3947,6 @@ xfs_log_force_umount( xlog_grant_head_wake_all(&log->l_reserve_head); xlog_grant_head_wake_all(&log->l_write_head); - if (!(log->l_iclog->ic_state & XLOG_STATE_IOERROR)) { - ASSERT(!logerror); - /* - * Force the incore logs to disk before shutting the - * log down completely. - */ - _xfs_log_force(mp, XFS_LOG_SYNC, NULL); - - spin_lock(&log->l_icloglock); - retval = xlog_state_ioerror(log); - spin_unlock(&log->l_icloglock); - } - /* * Wake up everybody waiting on xfs_log_force. Wake the CIL push first * as if the log writes were completed. The abort handling in the log diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index f6b79e5325dd..f506c457011e 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -463,12 +463,40 @@ xlog_cil_push( spin_unlock(&cil->xc_push_lock); goto out_skip; } - spin_unlock(&cil->xc_push_lock); /* check for a previously pushed seqeunce */ - if (push_seq < cil->xc_ctx->sequence) + if (push_seq < cil->xc_ctx->sequence) { + spin_unlock(&cil->xc_push_lock); goto out_skip; + } + + /* + * We are now going to push this context, so add it to the committing + * list before we do anything else. This ensures that anyone waiting on + * this push can easily detect the difference between a "push in + * progress" and "CIL is empty, nothing to do". + * + * IOWs, a wait loop can now check for: + * the current sequence not being found on the committing list; + * an empty CIL; and + * an unchanged sequence number + * to detect a push that had nothing to do and therefore does not need + * waiting on. If the CIL is not empty, we get put on the committing + * list before emptying the CIL and bumping the sequence number. Hence + * an empty CIL and an unchanged sequence number means we jumped out + * above after doing nothing. + * + * Hence the waiter will either find the commit sequence on the + * committing list or the sequence number will be unchanged and the CIL + * still dirty. In that latter case, the push has not yet started, and + * so the waiter will have to continue trying to check the CIL + * committing list until it is found. In extreme cases of delay, the + * sequence may fully commit between the attempts the wait makes to wait + * on the commit sequence. + */ + list_add(&ctx->committing, &cil->xc_committing); + spin_unlock(&cil->xc_push_lock); /* * pull all the log vectors off the items in the CIL, and @@ -532,7 +560,6 @@ xlog_cil_push( */ spin_lock(&cil->xc_push_lock); cil->xc_current_sequence = new_ctx->sequence; - list_add(&ctx->committing, &cil->xc_committing); spin_unlock(&cil->xc_push_lock); up_write(&cil->xc_ctx_lock); @@ -855,13 +882,15 @@ restart: * Hence by the time we have got here it our sequence may not have been * pushed yet. This is true if the current sequence still matches the * push sequence after the above wait loop and the CIL still contains - * dirty objects. + * dirty objects. This is guaranteed by the push code first adding the + * context to the committing list before emptying the CIL. * - * When the push occurs, it will empty the CIL and atomically increment - * the currect sequence past the push sequence and move it into the - * committing list. Of course, if the CIL is clean at the time of the - * push, it won't have pushed the CIL at all, so in that case we should - * try the push for this sequence again from the start just in case. + * Hence if we don't find the context in the committing list and the + * current sequence number is unchanged then the CIL contents are + * significant. If the CIL is empty, if means there was nothing to push + * and that means there is nothing to wait for. If the CIL is not empty, + * it means we haven't yet started the push, because if it had started + * we would have found the context on the committing list. */ if (sequence == cil->xc_current_sequence && !list_empty(&cil->xc_cil)) { diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index 1fd5787add99..00cd7f3a8f59 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -193,12 +193,8 @@ xlog_bread_noalign( bp->b_io_length = nbblks; bp->b_error = 0; - if (XFS_FORCED_SHUTDOWN(log->l_mp)) - return -EIO; - - xfs_buf_iorequest(bp); - error = xfs_buf_iowait(bp); - if (error) + error = xfs_buf_submit_wait(bp); + if (error && !XFS_FORCED_SHUTDOWN(log->l_mp)) xfs_buf_ioerror_alert(bp, __func__); return error; } @@ -378,12 +374,14 @@ xlog_recover_iodone( * We're not going to bother about retrying * this during recovery. One strike! */ - xfs_buf_ioerror_alert(bp, __func__); - xfs_force_shutdown(bp->b_target->bt_mount, - SHUTDOWN_META_IO_ERROR); + if (!XFS_FORCED_SHUTDOWN(bp->b_target->bt_mount)) { + xfs_buf_ioerror_alert(bp, __func__); + xfs_force_shutdown(bp->b_target->bt_mount, + SHUTDOWN_META_IO_ERROR); + } } bp->b_iodone = NULL; - xfs_buf_ioend(bp, 0); + xfs_buf_ioend(bp); } /* @@ -1445,160 +1443,6 @@ xlog_clear_stale_blocks( ****************************************************************************** */ -STATIC xlog_recover_t * -xlog_recover_find_tid( - struct hlist_head *head, - xlog_tid_t tid) -{ - xlog_recover_t *trans; - - hlist_for_each_entry(trans, head, r_list) { - if (trans->r_log_tid == tid) - return trans; - } - return NULL; -} - -STATIC void -xlog_recover_new_tid( - struct hlist_head *head, - xlog_tid_t tid, - xfs_lsn_t lsn) -{ - xlog_recover_t *trans; - - trans = kmem_zalloc(sizeof(xlog_recover_t), KM_SLEEP); - trans->r_log_tid = tid; - trans->r_lsn = lsn; - INIT_LIST_HEAD(&trans->r_itemq); - - INIT_HLIST_NODE(&trans->r_list); - hlist_add_head(&trans->r_list, head); -} - -STATIC void -xlog_recover_add_item( - struct list_head *head) -{ - xlog_recover_item_t *item; - - item = kmem_zalloc(sizeof(xlog_recover_item_t), KM_SLEEP); - INIT_LIST_HEAD(&item->ri_list); - list_add_tail(&item->ri_list, head); -} - -STATIC int -xlog_recover_add_to_cont_trans( - struct xlog *log, - struct xlog_recover *trans, - xfs_caddr_t dp, - int len) -{ - xlog_recover_item_t *item; - xfs_caddr_t ptr, old_ptr; - int old_len; - - if (list_empty(&trans->r_itemq)) { - /* finish copying rest of trans header */ - xlog_recover_add_item(&trans->r_itemq); - ptr = (xfs_caddr_t) &trans->r_theader + - sizeof(xfs_trans_header_t) - len; - memcpy(ptr, dp, len); /* d, s, l */ - return 0; - } - /* take the tail entry */ - item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list); - - old_ptr = item->ri_buf[item->ri_cnt-1].i_addr; - old_len = item->ri_buf[item->ri_cnt-1].i_len; - - ptr = kmem_realloc(old_ptr, len+old_len, old_len, KM_SLEEP); - memcpy(&ptr[old_len], dp, len); /* d, s, l */ - item->ri_buf[item->ri_cnt-1].i_len += len; - item->ri_buf[item->ri_cnt-1].i_addr = ptr; - trace_xfs_log_recover_item_add_cont(log, trans, item, 0); - return 0; -} - -/* - * The next region to add is the start of a new region. It could be - * a whole region or it could be the first part of a new region. Because - * of this, the assumption here is that the type and size fields of all - * format structures fit into the first 32 bits of the structure. - * - * This works because all regions must be 32 bit aligned. Therefore, we - * either have both fields or we have neither field. In the case we have - * neither field, the data part of the region is zero length. We only have - * a log_op_header and can throw away the header since a new one will appear - * later. If we have at least 4 bytes, then we can determine how many regions - * will appear in the current log item. - */ -STATIC int -xlog_recover_add_to_trans( - struct xlog *log, - struct xlog_recover *trans, - xfs_caddr_t dp, - int len) -{ - xfs_inode_log_format_t *in_f; /* any will do */ - xlog_recover_item_t *item; - xfs_caddr_t ptr; - - if (!len) - return 0; - if (list_empty(&trans->r_itemq)) { - /* we need to catch log corruptions here */ - if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) { - xfs_warn(log->l_mp, "%s: bad header magic number", - __func__); - ASSERT(0); - return -EIO; - } - if (len == sizeof(xfs_trans_header_t)) - xlog_recover_add_item(&trans->r_itemq); - memcpy(&trans->r_theader, dp, len); /* d, s, l */ - return 0; - } - - ptr = kmem_alloc(len, KM_SLEEP); - memcpy(ptr, dp, len); - in_f = (xfs_inode_log_format_t *)ptr; - - /* take the tail entry */ - item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list); - if (item->ri_total != 0 && - item->ri_total == item->ri_cnt) { - /* tail item is in use, get a new one */ - xlog_recover_add_item(&trans->r_itemq); - item = list_entry(trans->r_itemq.prev, - xlog_recover_item_t, ri_list); - } - - if (item->ri_total == 0) { /* first region to be added */ - if (in_f->ilf_size == 0 || - in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) { - xfs_warn(log->l_mp, - "bad number of regions (%d) in inode log format", - in_f->ilf_size); - ASSERT(0); - kmem_free(ptr); - return -EIO; - } - - item->ri_total = in_f->ilf_size; - item->ri_buf = - kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t), - KM_SLEEP); - } - ASSERT(item->ri_total > item->ri_cnt); - /* Description region is ri_buf[0] */ - item->ri_buf[item->ri_cnt].i_addr = ptr; - item->ri_buf[item->ri_cnt].i_len = len; - item->ri_cnt++; - trace_xfs_log_recover_item_add(log, trans, item, 0); - return 0; -} - /* * Sort the log items in the transaction. * @@ -3254,31 +3098,6 @@ xlog_recover_do_icreate_pass2( return 0; } -/* - * Free up any resources allocated by the transaction - * - * Remember that EFIs, EFDs, and IUNLINKs are handled later. - */ -STATIC void -xlog_recover_free_trans( - struct xlog_recover *trans) -{ - xlog_recover_item_t *item, *n; - int i; - - list_for_each_entry_safe(item, n, &trans->r_itemq, ri_list) { - /* Free the regions in the item. */ - list_del(&item->ri_list); - for (i = 0; i < item->ri_cnt; i++) - kmem_free(item->ri_buf[i].i_addr); - /* Free the item itself */ - kmem_free(item->ri_buf); - kmem_free(item); - } - /* Free the transaction recover structure */ - kmem_free(trans); -} - STATIC void xlog_recover_buffer_ra_pass2( struct xlog *log, @@ -3528,22 +3347,309 @@ out: if (!list_empty(&done_list)) list_splice_init(&done_list, &trans->r_itemq); - xlog_recover_free_trans(trans); - error2 = xfs_buf_delwri_submit(&buffer_list); return error ? error : error2; } +STATIC void +xlog_recover_add_item( + struct list_head *head) +{ + xlog_recover_item_t *item; + + item = kmem_zalloc(sizeof(xlog_recover_item_t), KM_SLEEP); + INIT_LIST_HEAD(&item->ri_list); + list_add_tail(&item->ri_list, head); +} + STATIC int -xlog_recover_unmount_trans( - struct xlog *log) +xlog_recover_add_to_cont_trans( + struct xlog *log, + struct xlog_recover *trans, + xfs_caddr_t dp, + int len) { - /* Do nothing now */ - xfs_warn(log->l_mp, "%s: Unmount LR", __func__); + xlog_recover_item_t *item; + xfs_caddr_t ptr, old_ptr; + int old_len; + + if (list_empty(&trans->r_itemq)) { + /* finish copying rest of trans header */ + xlog_recover_add_item(&trans->r_itemq); + ptr = (xfs_caddr_t) &trans->r_theader + + sizeof(xfs_trans_header_t) - len; + memcpy(ptr, dp, len); + return 0; + } + /* take the tail entry */ + item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list); + + old_ptr = item->ri_buf[item->ri_cnt-1].i_addr; + old_len = item->ri_buf[item->ri_cnt-1].i_len; + + ptr = kmem_realloc(old_ptr, len+old_len, old_len, KM_SLEEP); + memcpy(&ptr[old_len], dp, len); + item->ri_buf[item->ri_cnt-1].i_len += len; + item->ri_buf[item->ri_cnt-1].i_addr = ptr; + trace_xfs_log_recover_item_add_cont(log, trans, item, 0); + return 0; +} + +/* + * The next region to add is the start of a new region. It could be + * a whole region or it could be the first part of a new region. Because + * of this, the assumption here is that the type and size fields of all + * format structures fit into the first 32 bits of the structure. + * + * This works because all regions must be 32 bit aligned. Therefore, we + * either have both fields or we have neither field. In the case we have + * neither field, the data part of the region is zero length. We only have + * a log_op_header and can throw away the header since a new one will appear + * later. If we have at least 4 bytes, then we can determine how many regions + * will appear in the current log item. + */ +STATIC int +xlog_recover_add_to_trans( + struct xlog *log, + struct xlog_recover *trans, + xfs_caddr_t dp, + int len) +{ + xfs_inode_log_format_t *in_f; /* any will do */ + xlog_recover_item_t *item; + xfs_caddr_t ptr; + + if (!len) + return 0; + if (list_empty(&trans->r_itemq)) { + /* we need to catch log corruptions here */ + if (*(uint *)dp != XFS_TRANS_HEADER_MAGIC) { + xfs_warn(log->l_mp, "%s: bad header magic number", + __func__); + ASSERT(0); + return -EIO; + } + if (len == sizeof(xfs_trans_header_t)) + xlog_recover_add_item(&trans->r_itemq); + memcpy(&trans->r_theader, dp, len); + return 0; + } + + ptr = kmem_alloc(len, KM_SLEEP); + memcpy(ptr, dp, len); + in_f = (xfs_inode_log_format_t *)ptr; + + /* take the tail entry */ + item = list_entry(trans->r_itemq.prev, xlog_recover_item_t, ri_list); + if (item->ri_total != 0 && + item->ri_total == item->ri_cnt) { + /* tail item is in use, get a new one */ + xlog_recover_add_item(&trans->r_itemq); + item = list_entry(trans->r_itemq.prev, + xlog_recover_item_t, ri_list); + } + + if (item->ri_total == 0) { /* first region to be added */ + if (in_f->ilf_size == 0 || + in_f->ilf_size > XLOG_MAX_REGIONS_IN_ITEM) { + xfs_warn(log->l_mp, + "bad number of regions (%d) in inode log format", + in_f->ilf_size); + ASSERT(0); + kmem_free(ptr); + return -EIO; + } + + item->ri_total = in_f->ilf_size; + item->ri_buf = + kmem_zalloc(item->ri_total * sizeof(xfs_log_iovec_t), + KM_SLEEP); + } + ASSERT(item->ri_total > item->ri_cnt); + /* Description region is ri_buf[0] */ + item->ri_buf[item->ri_cnt].i_addr = ptr; + item->ri_buf[item->ri_cnt].i_len = len; + item->ri_cnt++; + trace_xfs_log_recover_item_add(log, trans, item, 0); return 0; } /* + * Free up any resources allocated by the transaction + * + * Remember that EFIs, EFDs, and IUNLINKs are handled later. + */ +STATIC void +xlog_recover_free_trans( + struct xlog_recover *trans) +{ + xlog_recover_item_t *item, *n; + int i; + + list_for_each_entry_safe(item, n, &trans->r_itemq, ri_list) { + /* Free the regions in the item. */ + list_del(&item->ri_list); + for (i = 0; i < item->ri_cnt; i++) + kmem_free(item->ri_buf[i].i_addr); + /* Free the item itself */ + kmem_free(item->ri_buf); + kmem_free(item); + } + /* Free the transaction recover structure */ + kmem_free(trans); +} + +/* + * On error or completion, trans is freed. + */ +STATIC int +xlog_recovery_process_trans( + struct xlog *log, + struct xlog_recover *trans, + xfs_caddr_t dp, + unsigned int len, + unsigned int flags, + int pass) +{ + int error = 0; + bool freeit = false; + + /* mask off ophdr transaction container flags */ + flags &= ~XLOG_END_TRANS; + if (flags & XLOG_WAS_CONT_TRANS) + flags &= ~XLOG_CONTINUE_TRANS; + + /* + * Callees must not free the trans structure. We'll decide if we need to + * free it or not based on the operation being done and it's result. + */ + switch (flags) { + /* expected flag values */ + case 0: + case XLOG_CONTINUE_TRANS: + error = xlog_recover_add_to_trans(log, trans, dp, len); + break; + case XLOG_WAS_CONT_TRANS: + error = xlog_recover_add_to_cont_trans(log, trans, dp, len); + break; + case XLOG_COMMIT_TRANS: + error = xlog_recover_commit_trans(log, trans, pass); + /* success or fail, we are now done with this transaction. */ + freeit = true; + break; + + /* unexpected flag values */ + case XLOG_UNMOUNT_TRANS: + /* just skip trans */ + xfs_warn(log->l_mp, "%s: Unmount LR", __func__); + freeit = true; + break; + case XLOG_START_TRANS: + default: + xfs_warn(log->l_mp, "%s: bad flag 0x%x", __func__, flags); + ASSERT(0); + error = -EIO; + break; + } + if (error || freeit) + xlog_recover_free_trans(trans); + return error; +} + +/* + * Lookup the transaction recovery structure associated with the ID in the + * current ophdr. If the transaction doesn't exist and the start flag is set in + * the ophdr, then allocate a new transaction for future ID matches to find. + * Either way, return what we found during the lookup - an existing transaction + * or nothing. + */ +STATIC struct xlog_recover * +xlog_recover_ophdr_to_trans( + struct hlist_head rhash[], + struct xlog_rec_header *rhead, + struct xlog_op_header *ohead) +{ + struct xlog_recover *trans; + xlog_tid_t tid; + struct hlist_head *rhp; + + tid = be32_to_cpu(ohead->oh_tid); + rhp = &rhash[XLOG_RHASH(tid)]; + hlist_for_each_entry(trans, rhp, r_list) { + if (trans->r_log_tid == tid) + return trans; + } + + /* + * skip over non-start transaction headers - we could be + * processing slack space before the next transaction starts + */ + if (!(ohead->oh_flags & XLOG_START_TRANS)) + return NULL; + + ASSERT(be32_to_cpu(ohead->oh_len) == 0); + + /* + * This is a new transaction so allocate a new recovery container to + * hold the recovery ops that will follow. + */ + trans = kmem_zalloc(sizeof(struct xlog_recover), KM_SLEEP); + trans->r_log_tid = tid; + trans->r_lsn = be64_to_cpu(rhead->h_lsn); + INIT_LIST_HEAD(&trans->r_itemq); + INIT_HLIST_NODE(&trans->r_list); + hlist_add_head(&trans->r_list, rhp); + + /* + * Nothing more to do for this ophdr. Items to be added to this new + * transaction will be in subsequent ophdr containers. + */ + return NULL; +} + +STATIC int +xlog_recover_process_ophdr( + struct xlog *log, + struct hlist_head rhash[], + struct xlog_rec_header *rhead, + struct xlog_op_header *ohead, + xfs_caddr_t dp, + xfs_caddr_t end, + int pass) +{ + struct xlog_recover *trans; + unsigned int len; + + /* Do we understand who wrote this op? */ + if (ohead->oh_clientid != XFS_TRANSACTION && + ohead->oh_clientid != XFS_LOG) { + xfs_warn(log->l_mp, "%s: bad clientid 0x%x", + __func__, ohead->oh_clientid); + ASSERT(0); + return -EIO; + } + + /* + * Check the ophdr contains all the data it is supposed to contain. + */ + len = be32_to_cpu(ohead->oh_len); + if (dp + len > end) { + xfs_warn(log->l_mp, "%s: bad length 0x%x", __func__, len); + WARN_ON(1); + return -EIO; + } + + trans = xlog_recover_ophdr_to_trans(rhash, rhead, ohead); + if (!trans) { + /* nothing to do, so skip over this ophdr */ + return 0; + } + + return xlog_recovery_process_trans(log, trans, dp, len, + ohead->oh_flags, pass); +} + +/* * There are two valid states of the r_state field. 0 indicates that the * transaction structure is in a normal state. We have either seen the * start of the transaction or the last operation we added was not a partial @@ -3560,86 +3666,30 @@ xlog_recover_process_data( xfs_caddr_t dp, int pass) { - xfs_caddr_t lp; + struct xlog_op_header *ohead; + xfs_caddr_t end; int num_logops; - xlog_op_header_t *ohead; - xlog_recover_t *trans; - xlog_tid_t tid; int error; - unsigned long hash; - uint flags; - lp = dp + be32_to_cpu(rhead->h_len); + end = dp + be32_to_cpu(rhead->h_len); num_logops = be32_to_cpu(rhead->h_num_logops); /* check the log format matches our own - else we can't recover */ if (xlog_header_check_recover(log->l_mp, rhead)) return -EIO; - while ((dp < lp) && num_logops) { - ASSERT(dp + sizeof(xlog_op_header_t) <= lp); - ohead = (xlog_op_header_t *)dp; - dp += sizeof(xlog_op_header_t); - if (ohead->oh_clientid != XFS_TRANSACTION && - ohead->oh_clientid != XFS_LOG) { - xfs_warn(log->l_mp, "%s: bad clientid 0x%x", - __func__, ohead->oh_clientid); - ASSERT(0); - return -EIO; - } - tid = be32_to_cpu(ohead->oh_tid); - hash = XLOG_RHASH(tid); - trans = xlog_recover_find_tid(&rhash[hash], tid); - if (trans == NULL) { /* not found; add new tid */ - if (ohead->oh_flags & XLOG_START_TRANS) - xlog_recover_new_tid(&rhash[hash], tid, - be64_to_cpu(rhead->h_lsn)); - } else { - if (dp + be32_to_cpu(ohead->oh_len) > lp) { - xfs_warn(log->l_mp, "%s: bad length 0x%x", - __func__, be32_to_cpu(ohead->oh_len)); - WARN_ON(1); - return -EIO; - } - flags = ohead->oh_flags & ~XLOG_END_TRANS; - if (flags & XLOG_WAS_CONT_TRANS) - flags &= ~XLOG_CONTINUE_TRANS; - switch (flags) { - case XLOG_COMMIT_TRANS: - error = xlog_recover_commit_trans(log, - trans, pass); - break; - case XLOG_UNMOUNT_TRANS: - error = xlog_recover_unmount_trans(log); - break; - case XLOG_WAS_CONT_TRANS: - error = xlog_recover_add_to_cont_trans(log, - trans, dp, - be32_to_cpu(ohead->oh_len)); - break; - case XLOG_START_TRANS: - xfs_warn(log->l_mp, "%s: bad transaction", - __func__); - ASSERT(0); - error = -EIO; - break; - case 0: - case XLOG_CONTINUE_TRANS: - error = xlog_recover_add_to_trans(log, trans, - dp, be32_to_cpu(ohead->oh_len)); - break; - default: - xfs_warn(log->l_mp, "%s: bad flag 0x%x", - __func__, flags); - ASSERT(0); - error = -EIO; - break; - } - if (error) { - xlog_recover_free_trans(trans); - return error; - } - } + while ((dp < end) && num_logops) { + + ohead = (struct xlog_op_header *)dp; + dp += sizeof(*ohead); + ASSERT(dp <= end); + + /* errors will abort recovery */ + error = xlog_recover_process_ophdr(log, rhash, rhead, ohead, + dp, end, pass); + if (error) + return error; + dp += be32_to_cpu(ohead->oh_len); num_logops--; } @@ -4132,41 +4182,13 @@ xlog_do_recovery_pass( } memset(rhash, 0, sizeof(rhash)); - if (tail_blk <= head_blk) { - for (blk_no = tail_blk; blk_no < head_blk; ) { - error = xlog_bread(log, blk_no, hblks, hbp, &offset); - if (error) - goto bread_err2; - - rhead = (xlog_rec_header_t *)offset; - error = xlog_valid_rec_header(log, rhead, blk_no); - if (error) - goto bread_err2; - - /* blocks in data section */ - bblks = (int)BTOBB(be32_to_cpu(rhead->h_len)); - error = xlog_bread(log, blk_no + hblks, bblks, dbp, - &offset); - if (error) - goto bread_err2; - - error = xlog_unpack_data(rhead, offset, log); - if (error) - goto bread_err2; - - error = xlog_recover_process_data(log, - rhash, rhead, offset, pass); - if (error) - goto bread_err2; - blk_no += bblks + hblks; - } - } else { + blk_no = tail_blk; + if (tail_blk > head_blk) { /* * Perform recovery around the end of the physical log. * When the head is not on the same cycle number as the tail, - * we can't do a sequential recovery as above. + * we can't do a sequential recovery. */ - blk_no = tail_blk; while (blk_no < log->l_logBBsize) { /* * Check for header wrapping around physical end-of-log @@ -4280,34 +4302,35 @@ xlog_do_recovery_pass( ASSERT(blk_no >= log->l_logBBsize); blk_no -= log->l_logBBsize; + } - /* read first part of physical log */ - while (blk_no < head_blk) { - error = xlog_bread(log, blk_no, hblks, hbp, &offset); - if (error) - goto bread_err2; + /* read first part of physical log */ + while (blk_no < head_blk) { + error = xlog_bread(log, blk_no, hblks, hbp, &offset); + if (error) + goto bread_err2; - rhead = (xlog_rec_header_t *)offset; - error = xlog_valid_rec_header(log, rhead, blk_no); - if (error) - goto bread_err2; + rhead = (xlog_rec_header_t *)offset; + error = xlog_valid_rec_header(log, rhead, blk_no); + if (error) + goto bread_err2; - bblks = (int)BTOBB(be32_to_cpu(rhead->h_len)); - error = xlog_bread(log, blk_no+hblks, bblks, dbp, - &offset); - if (error) - goto bread_err2; + /* blocks in data section */ + bblks = (int)BTOBB(be32_to_cpu(rhead->h_len)); + error = xlog_bread(log, blk_no+hblks, bblks, dbp, + &offset); + if (error) + goto bread_err2; - error = xlog_unpack_data(rhead, offset, log); - if (error) - goto bread_err2; + error = xlog_unpack_data(rhead, offset, log); + if (error) + goto bread_err2; - error = xlog_recover_process_data(log, rhash, - rhead, offset, pass); - if (error) - goto bread_err2; - blk_no += bblks + hblks; - } + error = xlog_recover_process_data(log, rhash, + rhead, offset, pass); + if (error) + goto bread_err2; + blk_no += bblks + hblks; } bread_err2: @@ -4427,16 +4450,12 @@ xlog_do_recover( XFS_BUF_UNASYNC(bp); bp->b_ops = &xfs_sb_buf_ops; - if (XFS_FORCED_SHUTDOWN(log->l_mp)) { - xfs_buf_relse(bp); - return -EIO; - } - - xfs_buf_iorequest(bp); - error = xfs_buf_iowait(bp); + error = xfs_buf_submit_wait(bp); if (error) { - xfs_buf_ioerror_alert(bp, __func__); - ASSERT(0); + if (!XFS_FORCED_SHUTDOWN(log->l_mp)) { + xfs_buf_ioerror_alert(bp, __func__); + ASSERT(0); + } xfs_buf_relse(bp); return error; } @@ -4509,6 +4528,18 @@ xlog_recover( return -EINVAL; } + /* + * Delay log recovery if the debug hook is set. This is debug + * instrumention to coordinate simulation of I/O failures with + * log recovery. + */ + if (xfs_globals.log_recovery_delay) { + xfs_notice(log->l_mp, + "Delaying log recovery for %d seconds.", + xfs_globals.log_recovery_delay); + msleep(xfs_globals.log_recovery_delay * 1000); + } + xfs_notice(log->l_mp, "Starting recovery (logdev: %s)", log->l_mp->m_logname ? log->l_mp->m_logname : "internal"); diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index fbf0384a466f..51435dbce9c4 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -61,8 +61,6 @@ static DEFINE_MUTEX(xfs_uuid_table_mutex); static int xfs_uuid_table_size; static uuid_t *xfs_uuid_table; -extern struct kset *xfs_kset; - /* * See if the UUID is unique among mounted XFS filesystems. * Mount fails if UUID is nil or a FS with the same UUID is already mounted. @@ -302,21 +300,15 @@ xfs_readsb( * access to the superblock. */ reread: - bp = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR, - BTOBB(sector_size), 0, buf_ops); - if (!bp) { - if (loud) - xfs_warn(mp, "SB buffer read failed"); - return -EIO; - } - if (bp->b_error) { - error = bp->b_error; + error = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR, + BTOBB(sector_size), 0, &bp, buf_ops); + if (error) { if (loud) xfs_warn(mp, "SB validate failed with error %d.", error); /* bad CRC means corrupted metadata */ if (error == -EFSBADCRC) error = -EFSCORRUPTED; - goto release_buf; + return error; } /* @@ -546,40 +538,43 @@ xfs_set_inoalignment(xfs_mount_t *mp) * Check that the data (and log if separate) is an ok size. */ STATIC int -xfs_check_sizes(xfs_mount_t *mp) +xfs_check_sizes( + struct xfs_mount *mp) { - xfs_buf_t *bp; + struct xfs_buf *bp; xfs_daddr_t d; + int error; d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks); if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_dblocks) { xfs_warn(mp, "filesystem size mismatch detected"); return -EFBIG; } - bp = xfs_buf_read_uncached(mp->m_ddev_targp, + error = xfs_buf_read_uncached(mp->m_ddev_targp, d - XFS_FSS_TO_BB(mp, 1), - XFS_FSS_TO_BB(mp, 1), 0, NULL); - if (!bp) { + XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL); + if (error) { xfs_warn(mp, "last sector read failed"); - return -EIO; + return error; } xfs_buf_relse(bp); - if (mp->m_logdev_targp != mp->m_ddev_targp) { - d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks); - if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) { - xfs_warn(mp, "log size mismatch detected"); - return -EFBIG; - } - bp = xfs_buf_read_uncached(mp->m_logdev_targp, + if (mp->m_logdev_targp == mp->m_ddev_targp) + return 0; + + d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks); + if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_logblocks) { + xfs_warn(mp, "log size mismatch detected"); + return -EFBIG; + } + error = xfs_buf_read_uncached(mp->m_logdev_targp, d - XFS_FSB_TO_BB(mp, 1), - XFS_FSB_TO_BB(mp, 1), 0, NULL); - if (!bp) { - xfs_warn(mp, "log device read failed"); - return -EIO; - } - xfs_buf_relse(bp); + XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL); + if (error) { + xfs_warn(mp, "log device read failed"); + return error; } + xfs_buf_relse(bp); return 0; } @@ -729,7 +724,6 @@ xfs_mountfs( xfs_set_maxicount(mp); - mp->m_kobj.kobject.kset = xfs_kset; error = xfs_sysfs_init(&mp->m_kobj, &xfs_mp_ktype, NULL, mp->m_fsname); if (error) goto out; diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c index 1eb6f3df698c..30ecca3037e3 100644 --- a/fs/xfs/xfs_mru_cache.c +++ b/fs/xfs/xfs_mru_cache.c @@ -304,7 +304,8 @@ _xfs_mru_cache_reap( int xfs_mru_cache_init(void) { - xfs_mru_reap_wq = alloc_workqueue("xfs_mru_cache", WQ_MEM_RECLAIM, 1); + xfs_mru_reap_wq = alloc_workqueue("xfs_mru_cache", + WQ_MEM_RECLAIM|WQ_FREEZABLE, 1); if (!xfs_mru_reap_wq) return -ENOMEM; return 0; diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index 10232102b4a6..d68f23021af3 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -434,6 +434,7 @@ xfs_qm_dquot_isolate( struct list_head *item, spinlock_t *lru_lock, void *arg) + __releases(lru_lock) __acquires(lru_lock) { struct xfs_dquot *dqp = container_of(item, struct xfs_dquot, q_lru); diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index 909e143b87ae..e1175ea9b551 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -46,7 +46,7 @@ * Keeps track of a current summary block, so we don't keep reading * it from the buffer cache. */ -STATIC int /* error */ +static int xfs_rtget_summary( xfs_mount_t *mp, /* file system mount structure */ xfs_trans_t *tp, /* transaction pointer */ @@ -56,60 +56,9 @@ xfs_rtget_summary( xfs_fsblock_t *rsb, /* in/out: summary block number */ xfs_suminfo_t *sum) /* out: summary info for this block */ { - xfs_buf_t *bp; /* buffer for summary block */ - int error; /* error value */ - xfs_fsblock_t sb; /* summary fsblock */ - int so; /* index into the summary file */ - xfs_suminfo_t *sp; /* pointer to returned data */ - - /* - * Compute entry number in the summary file. - */ - so = XFS_SUMOFFS(mp, log, bbno); - /* - * Compute the block number in the summary file. - */ - sb = XFS_SUMOFFSTOBLOCK(mp, so); - /* - * If we have an old buffer, and the block number matches, use that. - */ - if (rbpp && *rbpp && *rsb == sb) - bp = *rbpp; - /* - * Otherwise we have to get the buffer. - */ - else { - /* - * If there was an old one, get rid of it first. - */ - if (rbpp && *rbpp) - xfs_trans_brelse(tp, *rbpp); - error = xfs_rtbuf_get(mp, tp, sb, 1, &bp); - if (error) { - return error; - } - /* - * Remember this buffer and block for the next call. - */ - if (rbpp) { - *rbpp = bp; - *rsb = sb; - } - } - /* - * Point to the summary information & copy it out. - */ - sp = XFS_SUMPTR(mp, bp, so); - *sum = *sp; - /* - * Drop the buffer if we're not asked to remember it. - */ - if (!rbpp) - xfs_trans_brelse(tp, bp); - return 0; + return xfs_rtmodify_summary_int(mp, tp, log, bbno, 0, rbpp, rsb, sum); } - /* * Return whether there are any free extents in the size range given * by low and high, for the bitmap block bbno. @@ -972,16 +921,11 @@ xfs_growfs_rt( /* * Read in the last block of the device, make sure it exists. */ - bp = xfs_buf_read_uncached(mp->m_rtdev_targp, + error = xfs_buf_read_uncached(mp->m_rtdev_targp, XFS_FSB_TO_BB(mp, nrblocks - 1), - XFS_FSB_TO_BB(mp, 1), 0, NULL); - if (!bp) - return -EIO; - if (bp->b_error) { - error = bp->b_error; - xfs_buf_relse(bp); + XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL); + if (error) return error; - } xfs_buf_relse(bp); /* @@ -1235,11 +1179,12 @@ xfs_rtallocate_extent( */ int /* error */ xfs_rtmount_init( - xfs_mount_t *mp) /* file system mount structure */ + struct xfs_mount *mp) /* file system mount structure */ { - xfs_buf_t *bp; /* buffer for last block of subvolume */ - xfs_daddr_t d; /* address of last block of subvolume */ - xfs_sb_t *sbp; /* filesystem superblock copy in mount */ + struct xfs_buf *bp; /* buffer for last block of subvolume */ + struct xfs_sb *sbp; /* filesystem superblock copy in mount */ + xfs_daddr_t d; /* address of last block of subvolume */ + int error; sbp = &mp->m_sb; if (sbp->sb_rblocks == 0) @@ -1265,14 +1210,12 @@ xfs_rtmount_init( (unsigned long long) mp->m_sb.sb_rblocks); return -EFBIG; } - bp = xfs_buf_read_uncached(mp->m_rtdev_targp, + error = xfs_buf_read_uncached(mp->m_rtdev_targp, d - XFS_FSB_TO_BB(mp, 1), - XFS_FSB_TO_BB(mp, 1), 0, NULL); - if (!bp || bp->b_error) { + XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL); + if (error) { xfs_warn(mp, "realtime device size check failed"); - if (bp) - xfs_buf_relse(bp); - return -EIO; + return error; } xfs_buf_relse(bp); return 0; diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h index c642795324af..76c0a4a9bb17 100644 --- a/fs/xfs/xfs_rtalloc.h +++ b/fs/xfs/xfs_rtalloc.h @@ -111,6 +111,10 @@ int xfs_rtfind_forw(struct xfs_mount *mp, struct xfs_trans *tp, xfs_rtblock_t *rtblock); int xfs_rtmodify_range(struct xfs_mount *mp, struct xfs_trans *tp, xfs_rtblock_t start, xfs_extlen_t len, int val); +int xfs_rtmodify_summary_int(struct xfs_mount *mp, struct xfs_trans *tp, + int log, xfs_rtblock_t bbno, int delta, + xfs_buf_t **rbpp, xfs_fsblock_t *rsb, + xfs_suminfo_t *sum); int xfs_rtmodify_summary(struct xfs_mount *mp, struct xfs_trans *tp, int log, xfs_rtblock_t bbno, int delta, xfs_buf_t **rbpp, xfs_fsblock_t *rsb); diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index b194652033cd..9f622feda6a4 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -47,6 +47,7 @@ #include "xfs_dinode.h" #include "xfs_filestream.h" #include "xfs_quota.h" +#include "xfs_sysfs.h" #include <linux/namei.h> #include <linux/init.h> @@ -61,7 +62,11 @@ static const struct super_operations xfs_super_operations; static kmem_zone_t *xfs_ioend_zone; mempool_t *xfs_ioend_pool; -struct kset *xfs_kset; + +static struct kset *xfs_kset; /* top-level xfs sysfs dir */ +#ifdef DEBUG +static struct xfs_kobj xfs_dbg_kobj; /* global debug sysfs attrs */ +#endif #define MNTOPT_LOGBUFS "logbufs" /* number of XFS log buffers */ #define MNTOPT_LOGBSIZE "logbsize" /* size of XFS log buffers */ @@ -838,32 +843,32 @@ xfs_init_mount_workqueues( struct xfs_mount *mp) { mp->m_data_workqueue = alloc_workqueue("xfs-data/%s", - WQ_MEM_RECLAIM, 0, mp->m_fsname); + WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname); if (!mp->m_data_workqueue) goto out; mp->m_unwritten_workqueue = alloc_workqueue("xfs-conv/%s", - WQ_MEM_RECLAIM, 0, mp->m_fsname); + WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname); if (!mp->m_unwritten_workqueue) goto out_destroy_data_iodone_queue; mp->m_cil_workqueue = alloc_workqueue("xfs-cil/%s", - WQ_MEM_RECLAIM, 0, mp->m_fsname); + WQ_MEM_RECLAIM|WQ_FREEZABLE, 0, mp->m_fsname); if (!mp->m_cil_workqueue) goto out_destroy_unwritten; mp->m_reclaim_workqueue = alloc_workqueue("xfs-reclaim/%s", - 0, 0, mp->m_fsname); + WQ_FREEZABLE, 0, mp->m_fsname); if (!mp->m_reclaim_workqueue) goto out_destroy_cil; mp->m_log_workqueue = alloc_workqueue("xfs-log/%s", - 0, 0, mp->m_fsname); + WQ_FREEZABLE, 0, mp->m_fsname); if (!mp->m_log_workqueue) goto out_destroy_reclaim; mp->m_eofblocks_workqueue = alloc_workqueue("xfs-eofblocks/%s", - 0, 0, mp->m_fsname); + WQ_FREEZABLE, 0, mp->m_fsname); if (!mp->m_eofblocks_workqueue) goto out_destroy_log; @@ -1406,6 +1411,7 @@ xfs_fs_fill_super( atomic_set(&mp->m_active_trans, 0); INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker); + mp->m_kobj.kobject.kset = xfs_kset; mp->m_super = sb; sb->s_fs_info = mp; @@ -1715,7 +1721,8 @@ xfs_init_workqueues(void) * AGs in all the filesystems mounted. Hence use the default large * max_active value for this workqueue. */ - xfs_alloc_wq = alloc_workqueue("xfsalloc", WQ_MEM_RECLAIM, 0); + xfs_alloc_wq = alloc_workqueue("xfsalloc", + WQ_MEM_RECLAIM|WQ_FREEZABLE, 0); if (!xfs_alloc_wq) return -ENOMEM; @@ -1768,9 +1775,16 @@ init_xfs_fs(void) goto out_sysctl_unregister;; } - error = xfs_qm_init(); +#ifdef DEBUG + xfs_dbg_kobj.kobject.kset = xfs_kset; + error = xfs_sysfs_init(&xfs_dbg_kobj, &xfs_dbg_ktype, NULL, "debug"); if (error) goto out_kset_unregister; +#endif + + error = xfs_qm_init(); + if (error) + goto out_remove_kobj; error = register_filesystem(&xfs_fs_type); if (error) @@ -1779,7 +1793,11 @@ init_xfs_fs(void) out_qm_exit: xfs_qm_exit(); + out_remove_kobj: +#ifdef DEBUG + xfs_sysfs_del(&xfs_dbg_kobj); out_kset_unregister: +#endif kset_unregister(xfs_kset); out_sysctl_unregister: xfs_sysctl_unregister(); @@ -1802,6 +1820,9 @@ exit_xfs_fs(void) { xfs_qm_exit(); unregister_filesystem(&xfs_fs_type); +#ifdef DEBUG + xfs_sysfs_del(&xfs_dbg_kobj); +#endif kset_unregister(xfs_kset); xfs_sysctl_unregister(); xfs_cleanup_procfs(); diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c index 6a944a2cd36f..02ae62a998e0 100644 --- a/fs/xfs/xfs_symlink.c +++ b/fs/xfs/xfs_symlink.c @@ -269,9 +269,11 @@ xfs_symlink( /* * Check for ability to enter directory entry, if no space reserved. */ - error = xfs_dir_canenter(tp, dp, link_name, resblks); - if (error) - goto error_return; + if (!resblks) { + error = xfs_dir_canenter(tp, dp, link_name); + if (error) + goto error_return; + } /* * Initialize the bmap freelist prior to calling either * bmapi or the directory create code. diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h index bd8e157c20ef..ffef45375754 100644 --- a/fs/xfs/xfs_sysctl.h +++ b/fs/xfs/xfs_sysctl.h @@ -92,6 +92,11 @@ enum { extern xfs_param_t xfs_params; +struct xfs_globals { + int log_recovery_delay; /* log recovery delay (secs) */ +}; +extern struct xfs_globals xfs_globals; + #ifdef CONFIG_SYSCTL extern int xfs_sysctl_register(void); extern void xfs_sysctl_unregister(void); diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c index 9835139ce1ec..aa03670851d8 100644 --- a/fs/xfs/xfs_sysfs.c +++ b/fs/xfs/xfs_sysfs.c @@ -51,6 +51,80 @@ struct kobj_type xfs_mp_ktype = { .release = xfs_sysfs_release, }; +#ifdef DEBUG +/* debug */ + +STATIC ssize_t +log_recovery_delay_store( + const char *buf, + size_t count, + void *data) +{ + int ret; + int val; + + ret = kstrtoint(buf, 0, &val); + if (ret) + return ret; + + if (val < 0 || val > 60) + return -EINVAL; + + xfs_globals.log_recovery_delay = val; + + return count; +} + +STATIC ssize_t +log_recovery_delay_show( + char *buf, + void *data) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", xfs_globals.log_recovery_delay); +} +XFS_SYSFS_ATTR_RW(log_recovery_delay); + +static struct attribute *xfs_dbg_attrs[] = { + ATTR_LIST(log_recovery_delay), + NULL, +}; + +STATIC ssize_t +xfs_dbg_show( + struct kobject *kobject, + struct attribute *attr, + char *buf) +{ + struct xfs_sysfs_attr *xfs_attr = to_attr(attr); + + return xfs_attr->show ? xfs_attr->show(buf, NULL) : 0; +} + +STATIC ssize_t +xfs_dbg_store( + struct kobject *kobject, + struct attribute *attr, + const char *buf, + size_t count) +{ + struct xfs_sysfs_attr *xfs_attr = to_attr(attr); + + return xfs_attr->store ? xfs_attr->store(buf, count, NULL) : 0; +} + +static struct sysfs_ops xfs_dbg_ops = { + .show = xfs_dbg_show, + .store = xfs_dbg_store, +}; + +struct kobj_type xfs_dbg_ktype = { + .release = xfs_sysfs_release, + .sysfs_ops = &xfs_dbg_ops, + .default_attrs = xfs_dbg_attrs, +}; + +#endif /* DEBUG */ + /* xlog */ STATIC ssize_t diff --git a/fs/xfs/xfs_sysfs.h b/fs/xfs/xfs_sysfs.h index 54a2091183c0..240eee35f342 100644 --- a/fs/xfs/xfs_sysfs.h +++ b/fs/xfs/xfs_sysfs.h @@ -20,6 +20,7 @@ #define __XFS_SYSFS_H__ extern struct kobj_type xfs_mp_ktype; /* xfs_mount */ +extern struct kobj_type xfs_dbg_ktype; /* debug */ extern struct kobj_type xfs_log_ktype; /* xlog */ static inline struct xfs_kobj * diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index 152f82782630..51372e34d988 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -349,7 +349,8 @@ DEFINE_BUF_EVENT(xfs_buf_free); DEFINE_BUF_EVENT(xfs_buf_hold); DEFINE_BUF_EVENT(xfs_buf_rele); DEFINE_BUF_EVENT(xfs_buf_iodone); -DEFINE_BUF_EVENT(xfs_buf_iorequest); +DEFINE_BUF_EVENT(xfs_buf_submit); +DEFINE_BUF_EVENT(xfs_buf_submit_wait); DEFINE_BUF_EVENT(xfs_buf_bawrite); DEFINE_BUF_EVENT(xfs_buf_lock); DEFINE_BUF_EVENT(xfs_buf_lock_done); diff --git a/fs/xfs/xfs_trans_buf.c b/fs/xfs/xfs_trans_buf.c index 96c898e7ac9a..e2b2216b1635 100644 --- a/fs/xfs/xfs_trans_buf.c +++ b/fs/xfs/xfs_trans_buf.c @@ -318,20 +318,10 @@ xfs_trans_read_buf_map( XFS_BUF_READ(bp); bp->b_ops = ops; - /* - * XXX(hch): clean up the error handling here to be less - * of a mess.. - */ - if (XFS_FORCED_SHUTDOWN(mp)) { - trace_xfs_bdstrat_shut(bp, _RET_IP_); - xfs_bioerror_relse(bp); - } else { - xfs_buf_iorequest(bp); - } - - error = xfs_buf_iowait(bp); + error = xfs_buf_submit_wait(bp); if (error) { - xfs_buf_ioerror_alert(bp, __func__); + if (!XFS_FORCED_SHUTDOWN(mp)) + xfs_buf_ioerror_alert(bp, __func__); xfs_buf_relse(bp); /* * We can gracefully recover from most read diff --git a/fs/xfs/xfs_trans_inode.c b/fs/xfs/xfs_trans_inode.c index 50c3f5614288..cdb4d86520e1 100644 --- a/fs/xfs/xfs_trans_inode.c +++ b/fs/xfs/xfs_trans_inode.c @@ -70,7 +70,7 @@ xfs_trans_ichgtime( int flags) { struct inode *inode = VFS_I(ip); - timespec_t tv; + struct timespec tv; ASSERT(tp); ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL)); |
