summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/9p/v9fs.c2
-rw-r--r--fs/9p/vfs_file.c4
-rw-r--r--fs/9p/vfs_inode.c4
-rw-r--r--fs/9p/vfs_inode_dotl.c14
-rw-r--r--fs/Kconfig10
-rw-r--r--fs/Kconfig.binfmt3
-rw-r--r--fs/Makefile1
-rw-r--r--fs/afs/Kconfig1
-rw-r--r--fs/afs/cmservice.c5
-rw-r--r--fs/afs/dir.c236
-rw-r--r--fs/afs/dir_silly.c3
-rw-r--r--fs/afs/file.c483
-rw-r--r--fs/afs/fs_operation.c10
-rw-r--r--fs/afs/fsclient.c112
-rw-r--r--fs/afs/inode.c19
-rw-r--r--fs/afs/internal.h61
-rw-r--r--fs/afs/rxrpc.c150
-rw-r--r--fs/afs/vlclient.c1
-rw-r--r--fs/afs/write.c659
-rw-r--r--fs/afs/yfsclient.c82
-rw-r--r--fs/aio.c5
-rw-r--r--fs/autofs/autofs_i.h1
-rw-r--r--fs/autofs/expire.c2
-rw-r--r--fs/autofs/waitq.c72
-rw-r--r--fs/befs/TODO14
-rw-r--r--fs/binfmt_elf.c4
-rw-r--r--fs/binfmt_elf_fdpic.c3
-rw-r--r--fs/binfmt_flat.c18
-rw-r--r--fs/block_dev.c65
-rw-r--r--fs/btrfs/Makefile12
-rw-r--r--fs/btrfs/backref.c33
-rw-r--r--fs/btrfs/block-group.c207
-rw-r--r--fs/btrfs/block-group.h3
-rw-r--r--fs/btrfs/btrfs_inode.h33
-rw-r--r--fs/btrfs/check-integrity.c14
-rw-r--r--fs/btrfs/compression.c69
-rw-r--r--fs/btrfs/ctree.c984
-rw-r--r--fs/btrfs/ctree.h85
-rw-r--r--fs/btrfs/delayed-inode.c35
-rw-r--r--fs/btrfs/delayed-ref.c31
-rw-r--r--fs/btrfs/dev-replace.c3
-rw-r--r--fs/btrfs/disk-io.c181
-rw-r--r--fs/btrfs/extent-tree.c29
-rw-r--r--fs/btrfs/extent_io.c470
-rw-r--r--fs/btrfs/extent_io.h4
-rw-r--r--fs/btrfs/file-item.c109
-rw-r--r--fs/btrfs/file.c153
-rw-r--r--fs/btrfs/free-space-cache.c11
-rw-r--r--fs/btrfs/inode.c206
-rw-r--r--fs/btrfs/ioctl.c273
-rw-r--r--fs/btrfs/lzo.c9
-rw-r--r--fs/btrfs/ordered-data.c21
-rw-r--r--fs/btrfs/ordered-data.h4
-rw-r--r--fs/btrfs/qgroup.c61
-rw-r--r--fs/btrfs/raid56.c73
-rw-r--r--fs/btrfs/reflink.c104
-rw-r--r--fs/btrfs/relocation.c448
-rw-r--r--fs/btrfs/scrub.c13
-rw-r--r--fs/btrfs/send.c47
-rw-r--r--fs/btrfs/space-info.c4
-rw-r--r--fs/btrfs/subpage.c140
-rw-r--r--fs/btrfs/subpage.h7
-rw-r--r--fs/btrfs/super.c26
-rw-r--r--fs/btrfs/sysfs.c50
-rw-r--r--fs/btrfs/transaction.c59
-rw-r--r--fs/btrfs/transaction.h9
-rw-r--r--fs/btrfs/tree-checker.c5
-rw-r--r--fs/btrfs/tree-log.c68
-rw-r--r--fs/btrfs/tree-mod-log.c929
-rw-r--r--fs/btrfs/tree-mod-log.h53
-rw-r--r--fs/btrfs/volumes.c127
-rw-r--r--fs/btrfs/volumes.h1
-rw-r--r--fs/btrfs/zlib.c5
-rw-r--r--fs/btrfs/zoned.c69
-rw-r--r--fs/btrfs/zoned.h11
-rw-r--r--fs/btrfs/zstd.c5
-rw-r--r--fs/buffer.c42
-rw-r--r--fs/cachefiles/Makefile1
-rw-r--r--fs/cachefiles/bind.c6
-rw-r--r--fs/cachefiles/interface.c5
-rw-r--r--fs/cachefiles/internal.h9
-rw-r--r--fs/cachefiles/io.c420
-rw-r--r--fs/cachefiles/rdwr.c7
-rw-r--r--fs/ceph/Kconfig1
-rw-r--r--fs/ceph/addr.c626
-rw-r--r--fs/ceph/cache.c125
-rw-r--r--fs/ceph/cache.h101
-rw-r--r--fs/ceph/caps.c35
-rw-r--r--fs/ceph/debugfs.c12
-rw-r--r--fs/ceph/dir.c32
-rw-r--r--fs/ceph/export.c21
-rw-r--r--fs/ceph/file.c52
-rw-r--r--fs/ceph/inode.c77
-rw-r--r--fs/ceph/io.c2
-rw-r--r--fs/ceph/mds_client.c20
-rw-r--r--fs/ceph/mds_client.h1
-rw-r--r--fs/ceph/metric.c62
-rw-r--r--fs/ceph/metric.h56
-rw-r--r--fs/ceph/snap.c2
-rw-r--r--fs/ceph/super.h32
-rw-r--r--fs/ceph/xattr.c7
-rw-r--r--fs/cifs/Kconfig3
-rw-r--r--fs/cifs/Makefile5
-rw-r--r--fs/cifs/cifs_debug.c58
-rw-r--r--fs/cifs/cifs_dfs_ref.c14
-rw-r--r--fs/cifs/cifs_fs_sb.h5
-rw-r--r--fs/cifs/cifs_ioctl.h48
-rw-r--r--fs/cifs/cifs_swn.h27
-rw-r--r--fs/cifs/cifsacl.c7
-rw-r--r--fs/cifs/cifsfs.c63
-rw-r--r--fs/cifs/cifsfs.h2
-rw-r--r--fs/cifs/cifsglob.h53
-rw-r--r--fs/cifs/cifspdu.h8
-rw-r--r--fs/cifs/cifsproto.h45
-rw-r--r--fs/cifs/cifssmb.c52
-rw-r--r--fs/cifs/connect.c86
-rw-r--r--fs/cifs/dfs_cache.c41
-rw-r--r--fs/cifs/dir.c179
-rw-r--r--fs/cifs/file.c188
-rw-r--r--fs/cifs/fs_context.c153
-rw-r--r--fs/cifs/fs_context.h13
-rw-r--r--fs/cifs/inode.c224
-rw-r--r--fs/cifs/ioctl.c192
-rw-r--r--fs/cifs/link.c57
-rw-r--r--fs/cifs/misc.c96
-rw-r--r--fs/cifs/readdir.c19
-rw-r--r--fs/cifs/sess.c6
-rw-r--r--fs/cifs/smb1ops.c6
-rw-r--r--fs/cifs/smb2glob.h1
-rw-r--r--fs/cifs/smb2inode.c10
-rw-r--r--fs/cifs/smb2misc.c5
-rw-r--r--fs/cifs/smb2ops.c226
-rw-r--r--fs/cifs/smb2pdu.c20
-rw-r--r--fs/cifs/smb2pdu.h49
-rw-r--r--fs/cifs/smb2proto.h16
-rw-r--r--fs/cifs/smb2transport.c37
-rw-r--r--fs/cifs/trace.h29
-rw-r--r--fs/cifs/unc.c4
-rw-r--r--fs/cifs/xattr.c44
-rw-r--r--fs/coda/file.c6
-rw-r--r--fs/configfs/configfs_internal.h4
-rw-r--r--fs/configfs/dir.c4
-rw-r--r--fs/configfs/file.c4
-rw-r--r--fs/configfs/inode.c4
-rw-r--r--fs/configfs/item.c4
-rw-r--r--fs/configfs/mount.c4
-rw-r--r--fs/configfs/symlink.c4
-rw-r--r--fs/coredump.c72
-rw-r--r--fs/crypto/Kconfig30
-rw-r--r--fs/d_path.c10
-rw-r--r--fs/dax.c43
-rw-r--r--fs/dcache.c2
-rw-r--r--fs/debugfs/file.c94
-rw-r--r--fs/debugfs/inode.c11
-rw-r--r--fs/direct-io.c5
-rw-r--r--fs/dlm/config.c86
-rw-r--r--fs/dlm/config.h1
-rw-r--r--fs/dlm/debug_fs.c1
-rw-r--r--fs/dlm/lock.c2
-rw-r--r--fs/dlm/lockspace.c20
-rw-r--r--fs/dlm/lowcomms.c194
-rw-r--r--fs/dlm/lowcomms.h5
-rw-r--r--fs/dlm/midcomms.c33
-rw-r--r--fs/dlm/rcom.c2
-rw-r--r--fs/ecryptfs/crypto.c29
-rw-r--r--fs/ecryptfs/debug.c4
-rw-r--r--fs/ecryptfs/dentry.c2
-rw-r--r--fs/ecryptfs/ecryptfs_kernel.h19
-rw-r--r--fs/ecryptfs/file.c4
-rw-r--r--fs/ecryptfs/inode.c196
-rw-r--r--fs/ecryptfs/keystore.c15
-rw-r--r--fs/ecryptfs/kthread.c3
-rw-r--r--fs/ecryptfs/main.c30
-rw-r--r--fs/ecryptfs/messaging.c14
-rw-r--r--fs/ecryptfs/miscdev.c3
-rw-r--r--fs/ecryptfs/mmap.c11
-rw-r--r--fs/ecryptfs/read_write.c4
-rw-r--r--fs/ecryptfs/super.c8
-rw-r--r--fs/efivarfs/file.c77
-rw-r--r--fs/efivarfs/inode.c44
-rw-r--r--fs/erofs/Kconfig14
-rw-r--r--fs/erofs/Makefile2
-rw-r--r--fs/erofs/data.c19
-rw-r--r--fs/erofs/decompressor.c272
-rw-r--r--fs/erofs/erofs_fs.h54
-rw-r--r--fs/erofs/inode.c7
-rw-r--r--fs/erofs/internal.h86
-rw-r--r--fs/erofs/pcpubuf.c148
-rw-r--r--fs/erofs/super.c148
-rw-r--r--fs/erofs/utils.c12
-rw-r--r--fs/erofs/zdata.c254
-rw-r--r--fs/erofs/zdata.h14
-rw-r--r--fs/erofs/zmap.c181
-rw-r--r--fs/eventpoll.c58
-rw-r--r--fs/exfat/balloc.c95
-rw-r--r--fs/exfat/dir.c26
-rw-r--r--fs/exfat/exfat_fs.h11
-rw-r--r--fs/exfat/fatent.c41
-rw-r--r--fs/exfat/file.c53
-rw-r--r--fs/exfat/inode.c3
-rw-r--r--fs/exfat/namei.c11
-rw-r--r--fs/exfat/super.c1
-rw-r--r--fs/ext2/dir.c94
-rw-r--r--fs/ext2/ext2.h19
-rw-r--r--fs/ext2/file.c2
-rw-r--r--fs/ext2/ioctl.c88
-rw-r--r--fs/ext2/namei.c39
-rw-r--r--fs/ext2/super.c7
-rw-r--r--fs/ext4/ext4.h12
-rw-r--r--fs/ext4/file.c2
-rw-r--r--fs/ext4/fsmap.c4
-rw-r--r--fs/ext4/ialloc.c2
-rw-r--r--fs/ext4/ioctl.c208
-rw-r--r--fs/ext4/namei.c5
-rw-r--r--fs/ext4/super.c5
-rw-r--r--fs/ext4/verity.c2
-rw-r--r--fs/f2fs/Kconfig16
-rw-r--r--fs/f2fs/acl.c1
-rw-r--r--fs/f2fs/checkpoint.c9
-rw-r--r--fs/f2fs/compress.c70
-rw-r--r--fs/f2fs/compress.h0
-rw-r--r--fs/f2fs/data.c146
-rw-r--r--fs/f2fs/debug.c3
-rw-r--r--fs/f2fs/dir.c5
-rw-r--r--fs/f2fs/f2fs.h60
-rw-r--r--fs/f2fs/file.c272
-rw-r--r--fs/f2fs/gc.c95
-rw-r--r--fs/f2fs/gc.h6
-rw-r--r--fs/f2fs/inline.c3
-rw-r--r--fs/f2fs/inode.c3
-rw-r--r--fs/f2fs/namei.c8
-rw-r--r--fs/f2fs/node.c19
-rw-r--r--fs/f2fs/node.h1
-rw-r--r--fs/f2fs/recovery.c3
-rw-r--r--fs/f2fs/segment.c184
-rw-r--r--fs/f2fs/segment.h16
-rw-r--r--fs/f2fs/super.c102
-rw-r--r--fs/f2fs/sysfs.c47
-rw-r--r--fs/f2fs/verity.c77
-rw-r--r--fs/f2fs/xattr.c1
-rw-r--r--fs/fat/fatent.c2
-rw-r--r--fs/file.c60
-rw-r--r--fs/fs_parser.c2
-rw-r--r--fs/fscache/Kconfig1
-rw-r--r--fs/fscache/Makefile1
-rw-r--r--fs/fscache/internal.h4
-rw-r--r--fs/fscache/io.c116
-rw-r--r--fs/fscache/page.c2
-rw-r--r--fs/fscache/stats.c1
-rw-r--r--fs/fuse/Makefile2
-rw-r--r--fs/fuse/acl.c7
-rw-r--r--fs/fuse/cuse.c12
-rw-r--r--fs/fuse/dev.c7
-rw-r--r--fs/fuse/dir.c12
-rw-r--r--fs/fuse/file.c506
-rw-r--r--fs/fuse/fuse_i.h53
-rw-r--r--fs/fuse/inode.c12
-rw-r--r--fs/fuse/ioctl.c490
-rw-r--r--fs/fuse/readdir.c2
-rw-r--r--fs/fuse/virtio_fs.c28
-rw-r--r--fs/fuse/xattr.c9
-rw-r--r--fs/gfs2/aops.c5
-rw-r--r--fs/gfs2/bmap.c153
-rw-r--r--fs/gfs2/bmap.h13
-rw-r--r--fs/gfs2/dir.c52
-rw-r--r--fs/gfs2/file.c80
-rw-r--r--fs/gfs2/glock.c46
-rw-r--r--fs/gfs2/glops.c38
-rw-r--r--fs/gfs2/incore.h3
-rw-r--r--fs/gfs2/inode.c36
-rw-r--r--fs/gfs2/inode.h4
-rw-r--r--fs/gfs2/lock_dlm.c37
-rw-r--r--fs/gfs2/log.c31
-rw-r--r--fs/gfs2/log.h1
-rw-r--r--fs/gfs2/lops.c23
-rw-r--r--fs/gfs2/lops.h1
-rw-r--r--fs/gfs2/meta_io.c20
-rw-r--r--fs/gfs2/meta_io.h6
-rw-r--r--fs/gfs2/ops_fstype.c8
-rw-r--r--fs/gfs2/quota.c6
-rw-r--r--fs/gfs2/recovery.c8
-rw-r--r--fs/gfs2/rgrp.c8
-rw-r--r--fs/gfs2/super.c26
-rw-r--r--fs/gfs2/sys.c67
-rw-r--r--fs/gfs2/util.c20
-rw-r--r--fs/gfs2/xattr.c29
-rw-r--r--fs/hfsplus/dir.c2
-rw-r--r--fs/hfsplus/extents.c7
-rw-r--r--fs/hfsplus/hfsplus_fs.h14
-rw-r--r--fs/hfsplus/inode.c54
-rw-r--r--fs/hfsplus/ioctl.c84
-rw-r--r--fs/hostfs/hostfs_kern.c10
-rw-r--r--fs/hpfs/hpfs.h3
-rw-r--r--fs/hugetlbfs/inode.c16
-rw-r--r--fs/inode.c100
-rw-r--r--fs/io-wq.c375
-rw-r--r--fs/io-wq.h3
-rw-r--r--fs/io_uring.c2797
-rw-r--r--fs/ioctl.c325
-rw-r--r--fs/iomap/buffered-io.c14
-rw-r--r--fs/iomap/direct-io.c24
-rw-r--r--fs/iomap/swapfile.c38
-rw-r--r--fs/isofs/rock.c1
-rw-r--r--fs/jffs2/TODO37
-rw-r--r--fs/jffs2/file.c1
-rw-r--r--fs/jffs2/scan.c2
-rw-r--r--fs/jffs2/summary.h16
-rw-r--r--fs/jfs/file.c6
-rw-r--r--fs/jfs/ioctl.c111
-rw-r--r--fs/jfs/jfs_dinode.h7
-rw-r--r--fs/jfs/jfs_inode.h4
-rw-r--r--fs/jfs/namei.c6
-rw-r--r--fs/libfs.c1
-rw-r--r--fs/locks.c69
-rw-r--r--fs/namei.c39
-rw-r--r--fs/namespace.c20
-rw-r--r--fs/netfs/Kconfig23
-rw-r--r--fs/netfs/Makefile5
-rw-r--r--fs/netfs/internal.h97
-rw-r--r--fs/netfs/read_helper.c1185
-rw-r--r--fs/netfs/stats.c59
-rw-r--r--fs/nfs/callback_proc.c17
-rw-r--r--fs/nfs/client.c20
-rw-r--r--fs/nfs/delegation.c29
-rw-r--r--fs/nfs/delegation.h3
-rw-r--r--fs/nfs/dir.c36
-rw-r--r--fs/nfs/export.c15
-rw-r--r--fs/nfs/file.c2
-rw-r--r--fs/nfs/filelayout/filelayout.c2
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.c2
-rw-r--r--fs/nfs/fs_context.c69
-rw-r--r--fs/nfs/inode.c424
-rw-r--r--fs/nfs/internal.h3
-rw-r--r--fs/nfs/io.c2
-rw-r--r--fs/nfs/mount_clnt.c14
-rw-r--r--fs/nfs/namespace.c2
-rw-r--r--fs/nfs/nfs3acl.c2
-rw-r--r--fs/nfs/nfs3xdr.c5
-rw-r--r--fs/nfs/nfs42proc.c77
-rw-r--r--fs/nfs/nfs42xattr.c2
-rw-r--r--fs/nfs/nfs4file.c6
-rw-r--r--fs/nfs/nfs4proc.c268
-rw-r--r--fs/nfs/nfs4renewd.c6
-rw-r--r--fs/nfs/nfs4state.c8
-rw-r--r--fs/nfs/nfs4trace.h47
-rw-r--r--fs/nfs/nfs4xdr.c66
-rw-r--r--fs/nfs/nfstrace.c1
-rw-r--r--fs/nfs/nfstrace.h22
-rw-r--r--fs/nfs/pagelist.c24
-rw-r--r--fs/nfs/pnfs.c26
-rw-r--r--fs/nfs/proc.c1
-rw-r--r--fs/nfs/super.c14
-rw-r--r--fs/nfs/write.c7
-rw-r--r--fs/nfs_common/nfsacl.c71
-rw-r--r--fs/nfsd/Kconfig8
-rw-r--r--fs/nfsd/netns.h6
-rw-r--r--fs/nfsd/nfs2acl.c87
-rw-r--r--fs/nfsd/nfs3acl.c39
-rw-r--r--fs/nfsd/nfs3proc.c97
-rw-r--r--fs/nfsd/nfs3xdr.c1043
-rw-r--r--fs/nfsd/nfs4proc.c46
-rw-r--r--fs/nfsd/nfs4recover.c8
-rw-r--r--fs/nfsd/nfs4state.c497
-rw-r--r--fs/nfsd/nfs4xdr.c116
-rw-r--r--fs/nfsd/nfsctl.c29
-rw-r--r--fs/nfsd/nfsd.h7
-rw-r--r--fs/nfsd/nfsfh.c2
-rw-r--r--fs/nfsd/nfsfh.h2
-rw-r--r--fs/nfsd/nfsproc.c55
-rw-r--r--fs/nfsd/nfssvc.c50
-rw-r--r--fs/nfsd/nfsxdr.c413
-rw-r--r--fs/nfsd/state.h7
-rw-r--r--fs/nfsd/trace.h24
-rw-r--r--fs/nfsd/vfs.c9
-rw-r--r--fs/nfsd/vfs.h2
-rw-r--r--fs/nfsd/xdr.h23
-rw-r--r--fs/nfsd/xdr3.h37
-rw-r--r--fs/nfsd/xdr4.h8
-rw-r--r--fs/nilfs2/cpfile.c2
-rw-r--r--fs/nilfs2/file.c2
-rw-r--r--fs/nilfs2/ioctl.c65
-rw-r--r--fs/nilfs2/namei.c5
-rw-r--r--fs/nilfs2/nilfs.h3
-rw-r--r--fs/nilfs2/segment.c4
-rw-r--r--fs/nilfs2/the_nilfs.c2
-rw-r--r--fs/notify/fanotify/fanotify.c166
-rw-r--r--fs/notify/fanotify/fanotify.h46
-rw-r--r--fs/notify/fanotify/fanotify_user.c241
-rw-r--r--fs/notify/fdinfo.c3
-rw-r--r--fs/notify/group.c1
-rw-r--r--fs/notify/inotify/inotify_fsnotify.c9
-rw-r--r--fs/notify/inotify/inotify_user.c7
-rw-r--r--fs/notify/mark.c4
-rw-r--r--fs/notify/notification.c64
-rw-r--r--fs/ocfs2/acl.c4
-rw-r--r--fs/ocfs2/acl.h4
-rw-r--r--fs/ocfs2/alloc.c4
-rw-r--r--fs/ocfs2/alloc.h4
-rw-r--r--fs/ocfs2/aops.c15
-rw-r--r--fs/ocfs2/aops.h4
-rw-r--r--fs/ocfs2/blockcheck.c6
-rw-r--r--fs/ocfs2/blockcheck.h4
-rw-r--r--fs/ocfs2/buffer_head_io.c4
-rw-r--r--fs/ocfs2/buffer_head_io.h4
-rw-r--r--fs/ocfs2/cluster/heartbeat.c4
-rw-r--r--fs/ocfs2/cluster/heartbeat.h4
-rw-r--r--fs/ocfs2/cluster/masklog.c4
-rw-r--r--fs/ocfs2/cluster/masklog.h4
-rw-r--r--fs/ocfs2/cluster/netdebug.c4
-rw-r--r--fs/ocfs2/cluster/nodemanager.c4
-rw-r--r--fs/ocfs2/cluster/nodemanager.h4
-rw-r--r--fs/ocfs2/cluster/ocfs2_heartbeat.h4
-rw-r--r--fs/ocfs2/cluster/ocfs2_nodemanager.h4
-rw-r--r--fs/ocfs2/cluster/quorum.c4
-rw-r--r--fs/ocfs2/cluster/quorum.h4
-rw-r--r--fs/ocfs2/cluster/sys.c4
-rw-r--r--fs/ocfs2/cluster/sys.h4
-rw-r--r--fs/ocfs2/cluster/tcp.c4
-rw-r--r--fs/ocfs2/cluster/tcp.h4
-rw-r--r--fs/ocfs2/cluster/tcp_internal.h4
-rw-r--r--fs/ocfs2/dcache.c4
-rw-r--r--fs/ocfs2/dcache.h4
-rw-r--r--fs/ocfs2/dir.c4
-rw-r--r--fs/ocfs2/dir.h4
-rw-r--r--fs/ocfs2/dlm/dlmapi.h4
-rw-r--r--fs/ocfs2/dlm/dlmast.c4
-rw-r--r--fs/ocfs2/dlm/dlmcommon.h4
-rw-r--r--fs/ocfs2/dlm/dlmconvert.c4
-rw-r--r--fs/ocfs2/dlm/dlmconvert.h4
-rw-r--r--fs/ocfs2/dlm/dlmdebug.c4
-rw-r--r--fs/ocfs2/dlm/dlmdebug.h4
-rw-r--r--fs/ocfs2/dlm/dlmdomain.c4
-rw-r--r--fs/ocfs2/dlm/dlmdomain.h4
-rw-r--r--fs/ocfs2/dlm/dlmlock.c4
-rw-r--r--fs/ocfs2/dlm/dlmmaster.c4
-rw-r--r--fs/ocfs2/dlm/dlmrecovery.c11
-rw-r--r--fs/ocfs2/dlm/dlmthread.c4
-rw-r--r--fs/ocfs2/dlm/dlmunlock.c4
-rw-r--r--fs/ocfs2/dlmfs/dlmfs.c4
-rw-r--r--fs/ocfs2/dlmfs/userdlm.c4
-rw-r--r--fs/ocfs2/dlmfs/userdlm.h4
-rw-r--r--fs/ocfs2/dlmglue.c16
-rw-r--r--fs/ocfs2/dlmglue.h4
-rw-r--r--fs/ocfs2/export.c4
-rw-r--r--fs/ocfs2/export.h4
-rw-r--r--fs/ocfs2/extent_map.c4
-rw-r--r--fs/ocfs2/extent_map.h4
-rw-r--r--fs/ocfs2/file.c69
-rw-r--r--fs/ocfs2/file.h4
-rw-r--r--fs/ocfs2/filecheck.c4
-rw-r--r--fs/ocfs2/filecheck.h4
-rw-r--r--fs/ocfs2/heartbeat.c4
-rw-r--r--fs/ocfs2/heartbeat.h4
-rw-r--r--fs/ocfs2/inode.c4
-rw-r--r--fs/ocfs2/inode.h4
-rw-r--r--fs/ocfs2/ioctl.c59
-rw-r--r--fs/ocfs2/ioctl.h3
-rw-r--r--fs/ocfs2/journal.c4
-rw-r--r--fs/ocfs2/journal.h4
-rw-r--r--fs/ocfs2/localalloc.c4
-rw-r--r--fs/ocfs2/localalloc.h4
-rw-r--r--fs/ocfs2/locks.c4
-rw-r--r--fs/ocfs2/locks.h4
-rw-r--r--fs/ocfs2/mmap.c4
-rw-r--r--fs/ocfs2/move_extents.c4
-rw-r--r--fs/ocfs2/move_extents.h4
-rw-r--r--fs/ocfs2/namei.c7
-rw-r--r--fs/ocfs2/namei.h4
-rw-r--r--fs/ocfs2/ocfs1_fs_compat.h4
-rw-r--r--fs/ocfs2/ocfs2.h4
-rw-r--r--fs/ocfs2/ocfs2_fs.h4
-rw-r--r--fs/ocfs2/ocfs2_ioctl.h12
-rw-r--r--fs/ocfs2/ocfs2_lockid.h4
-rw-r--r--fs/ocfs2/ocfs2_lockingver.h4
-rw-r--r--fs/ocfs2/refcounttree.c4
-rw-r--r--fs/ocfs2/refcounttree.h4
-rw-r--r--fs/ocfs2/reservations.c4
-rw-r--r--fs/ocfs2/reservations.h4
-rw-r--r--fs/ocfs2/resize.c4
-rw-r--r--fs/ocfs2/resize.h4
-rw-r--r--fs/ocfs2/slot_map.c4
-rw-r--r--fs/ocfs2/slot_map.h4
-rw-r--r--fs/ocfs2/stack_o2cb.c40
-rw-r--r--fs/ocfs2/stack_user.c4
-rw-r--r--fs/ocfs2/stackglue.c6
-rw-r--r--fs/ocfs2/stackglue.h4
-rw-r--r--fs/ocfs2/suballoc.c4
-rw-r--r--fs/ocfs2/suballoc.h4
-rw-r--r--fs/ocfs2/super.c4
-rw-r--r--fs/ocfs2/super.h4
-rw-r--r--fs/ocfs2/symlink.c4
-rw-r--r--fs/ocfs2/symlink.h4
-rw-r--r--fs/ocfs2/sysfile.c4
-rw-r--r--fs/ocfs2/sysfile.h4
-rw-r--r--fs/ocfs2/uptodate.c4
-rw-r--r--fs/ocfs2/uptodate.h4
-rw-r--r--fs/ocfs2/xattr.c4
-rw-r--r--fs/ocfs2/xattr.h4
-rw-r--r--fs/openpromfs/inode.c67
-rw-r--r--fs/orangefs/file.c113
-rw-r--r--fs/orangefs/inode.c172
-rw-r--r--fs/orangefs/orangefs-mod.c2
-rw-r--r--fs/orangefs/orangefs-utils.c2
-rw-r--r--fs/overlayfs/copy_up.c3
-rw-r--r--fs/overlayfs/dir.c2
-rw-r--r--fs/overlayfs/file.c142
-rw-r--r--fs/overlayfs/inode.c95
-rw-r--r--fs/overlayfs/namei.c5
-rw-r--r--fs/overlayfs/overlayfs.h42
-rw-r--r--fs/overlayfs/readdir.c16
-rw-r--r--fs/overlayfs/super.c66
-rw-r--r--fs/overlayfs/util.c33
-rw-r--r--fs/proc/array.c2
-rw-r--r--fs/proc/base.c4
-rw-r--r--fs/proc/generic.c13
-rw-r--r--fs/proc/inode.c18
-rw-r--r--fs/proc/proc_sysctl.c15
-rw-r--r--fs/proc/task_mmu.c3
-rw-r--r--fs/pstore/platform.c5
-rw-r--r--fs/pstore/ram.c7
-rw-r--r--fs/pstore/ram_core.c18
-rw-r--r--fs/quota/dquot.c6
-rw-r--r--fs/quota/quota.c50
-rw-r--r--fs/readdir.c6
-rw-r--r--fs/reiserfs/file.c2
-rw-r--r--fs/reiserfs/ioctl.c121
-rw-r--r--fs/reiserfs/journal.c6
-rw-r--r--fs/reiserfs/namei.c2
-rw-r--r--fs/reiserfs/procfs.c10
-rw-r--r--fs/reiserfs/reiserfs.h7
-rw-r--r--fs/reiserfs/super.c2
-rw-r--r--fs/reiserfs/xattr.h2
-rw-r--r--fs/seq_file.c18
-rw-r--r--fs/signalfd.c19
-rw-r--r--fs/squashfs/export.c8
-rw-r--r--fs/squashfs/file.c6
-rw-r--r--fs/squashfs/id.c6
-rw-r--r--fs/squashfs/squashfs_fs.h1
-rw-r--r--fs/squashfs/xattr_id.c6
-rw-r--r--fs/super.c1
-rw-r--r--fs/tracefs/inode.c2
-rw-r--r--fs/ubifs/dir.c2
-rw-r--r--fs/ubifs/file.c2
-rw-r--r--fs/ubifs/gc.c7
-rw-r--r--fs/ubifs/ioctl.c78
-rw-r--r--fs/ubifs/replay.c7
-rw-r--r--fs/ubifs/sb.c3
-rw-r--r--fs/ubifs/super.c6
-rw-r--r--fs/ubifs/ubifs.h3
-rw-r--r--fs/udf/namei.c3
-rw-r--r--fs/ufs/super.c3
-rw-r--r--fs/unicode/.gitignore4
-rw-r--r--fs/userfaultfd.c149
-rw-r--r--fs/vboxsf/dir.c4
-rw-r--r--fs/vboxsf/super.c4
-rw-r--r--fs/vboxsf/utils.c68
-rw-r--r--fs/vboxsf/vfsmod.h4
-rw-r--r--fs/verity/Kconfig8
-rw-r--r--fs/xattr.c14
-rw-r--r--fs/xfs/libxfs/xfs_ag.c115
-rw-r--r--fs/xfs/libxfs/xfs_ag.h2
-rw-r--r--fs/xfs/libxfs/xfs_ag_resv.c52
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c25
-rw-r--r--fs/xfs/libxfs/xfs_alloc_btree.c4
-rw-r--r--fs/xfs/libxfs/xfs_attr.c54
-rw-r--r--fs/xfs/libxfs/xfs_attr.h1
-rw-r--r--fs/xfs/libxfs/xfs_attr_leaf.c35
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c239
-rw-r--r--fs/xfs/libxfs/xfs_bmap.h2
-rw-r--r--fs/xfs/libxfs/xfs_bmap_btree.c6
-rw-r--r--fs/xfs/libxfs/xfs_btree_staging.c1
-rw-r--r--fs/xfs/libxfs/xfs_da_btree.c4
-rw-r--r--fs/xfs/libxfs/xfs_dir2.c14
-rw-r--r--fs/xfs/libxfs/xfs_dir2_block.c12
-rw-r--r--fs/xfs/libxfs/xfs_dir2_data.c2
-rw-r--r--fs/xfs/libxfs/xfs_dir2_leaf.c12
-rw-r--r--fs/xfs/libxfs/xfs_dir2_node.c4
-rw-r--r--fs/xfs/libxfs/xfs_dir2_priv.h3
-rw-r--r--fs/xfs/libxfs/xfs_dir2_sf.c58
-rw-r--r--fs/xfs/libxfs/xfs_errortag.h4
-rw-r--r--fs/xfs/libxfs/xfs_format.h5
-rw-r--r--fs/xfs/libxfs/xfs_fs.h2
-rw-r--r--fs/xfs/libxfs/xfs_ialloc.c4
-rw-r--r--fs/xfs/libxfs/xfs_iext_tree.c2
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.c127
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.h33
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.c48
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.h20
-rw-r--r--fs/xfs/libxfs/xfs_log_format.h12
-rw-r--r--fs/xfs/libxfs/xfs_rmap_btree.c2
-rw-r--r--fs/xfs/libxfs/xfs_rtbitmap.c4
-rw-r--r--fs/xfs/libxfs/xfs_sb.c16
-rw-r--r--fs/xfs/libxfs/xfs_shared.h4
-rw-r--r--fs/xfs/libxfs/xfs_trans_inode.c24
-rw-r--r--fs/xfs/libxfs/xfs_types.c18
-rw-r--r--fs/xfs/scrub/agheader.c40
-rw-r--r--fs/xfs/scrub/alloc.c5
-rw-r--r--fs/xfs/scrub/attr.c5
-rw-r--r--fs/xfs/scrub/bitmap.c4
-rw-r--r--fs/xfs/scrub/bmap.c20
-rw-r--r--fs/xfs/scrub/btree.c30
-rw-r--r--fs/xfs/scrub/common.c42
-rw-r--r--fs/xfs/scrub/common.h58
-rw-r--r--fs/xfs/scrub/dir.c20
-rw-r--r--fs/xfs/scrub/fscounters.c43
-rw-r--r--fs/xfs/scrub/health.c3
-rw-r--r--fs/xfs/scrub/ialloc.c8
-rw-r--r--fs/xfs/scrub/inode.c5
-rw-r--r--fs/xfs/scrub/parent.c7
-rw-r--r--fs/xfs/scrub/quota.c11
-rw-r--r--fs/xfs/scrub/refcount.c5
-rw-r--r--fs/xfs/scrub/repair.c11
-rw-r--r--fs/xfs/scrub/repair.h6
-rw-r--r--fs/xfs/scrub/rmap.c5
-rw-r--r--fs/xfs/scrub/rtbitmap.c7
-rw-r--r--fs/xfs/scrub/scrub.c42
-rw-r--r--fs/xfs/scrub/scrub.h14
-rw-r--r--fs/xfs/scrub/symlink.c9
-rw-r--r--fs/xfs/scrub/xfs_scrub.h4
-rw-r--r--fs/xfs/xfs_aops.c138
-rw-r--r--fs/xfs/xfs_attr_list.c2
-rw-r--r--fs/xfs/xfs_bmap_item.c4
-rw-r--r--fs/xfs/xfs_bmap_util.c305
-rw-r--r--fs/xfs/xfs_buf.c6
-rw-r--r--fs/xfs/xfs_buf_item.c141
-rw-r--r--fs/xfs/xfs_dir2_readdir.c12
-rw-r--r--fs/xfs/xfs_dquot.c10
-rw-r--r--fs/xfs/xfs_error.c5
-rw-r--r--fs/xfs/xfs_extent_busy.c4
-rw-r--r--fs/xfs/xfs_extent_busy.h3
-rw-r--r--fs/xfs/xfs_extfree_item.c4
-rw-r--r--fs/xfs/xfs_file.c12
-rw-r--r--fs/xfs/xfs_filestream.h2
-rw-r--r--fs/xfs/xfs_fsmap.c14
-rw-r--r--fs/xfs/xfs_fsops.c197
-rw-r--r--fs/xfs/xfs_icache.c35
-rw-r--r--fs/xfs/xfs_inode.c301
-rw-r--r--fs/xfs/xfs_inode.h42
-rw-r--r--fs/xfs/xfs_inode_item.c64
-rw-r--r--fs/xfs/xfs_inode_item_recover.c6
-rw-r--r--fs/xfs/xfs_ioctl.c424
-rw-r--r--fs/xfs/xfs_ioctl.h11
-rw-r--r--fs/xfs/xfs_ioctl32.c2
-rw-r--r--fs/xfs/xfs_ioctl32.h2
-rw-r--r--fs/xfs/xfs_iomap.c27
-rw-r--r--fs/xfs/xfs_iops.c72
-rw-r--r--fs/xfs/xfs_itable.c19
-rw-r--r--fs/xfs/xfs_linux.h2
-rw-r--r--fs/xfs/xfs_log.c10
-rw-r--r--fs/xfs/xfs_log_recover.c13
-rw-r--r--fs/xfs/xfs_message.h2
-rw-r--r--fs/xfs/xfs_mount.c29
-rw-r--r--fs/xfs/xfs_mount.h8
-rw-r--r--fs/xfs/xfs_ondisk.h4
-rw-r--r--fs/xfs/xfs_pnfs.c2
-rw-r--r--fs/xfs/xfs_qm.c22
-rw-r--r--fs/xfs/xfs_qm_bhv.c2
-rw-r--r--fs/xfs/xfs_qm_syscalls.c2
-rw-r--r--fs/xfs/xfs_quotaops.c2
-rw-r--r--fs/xfs/xfs_refcount_item.c4
-rw-r--r--fs/xfs/xfs_reflink.c25
-rw-r--r--fs/xfs/xfs_rmap_item.c4
-rw-r--r--fs/xfs/xfs_rtalloc.c16
-rw-r--r--fs/xfs/xfs_super.c132
-rw-r--r--fs/xfs/xfs_super.h1
-rw-r--r--fs/xfs/xfs_symlink.c32
-rw-r--r--fs/xfs/xfs_trace.h16
-rw-r--r--fs/xfs/xfs_trans.c24
-rw-r--r--fs/xfs/xfs_trans.h15
-rw-r--r--fs/xfs/xfs_xattr.c2
-rw-r--r--fs/zonefs/super.c5
671 files changed, 20003 insertions, 13293 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c
index 39def020a074..cdb99507ef33 100644
--- a/fs/9p/v9fs.c
+++ b/fs/9p/v9fs.c
@@ -583,7 +583,7 @@ static struct attribute *v9fs_attrs[] = {
NULL,
};
-static struct attribute_group v9fs_attr_group = {
+static const struct attribute_group v9fs_attr_group = {
.attrs = v9fs_attrs,
};
diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 649f04f112dc..59c32c9b799f 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -86,8 +86,8 @@ int v9fs_file_open(struct inode *inode, struct file *file)
* to work.
*/
writeback_fid = v9fs_writeback_fid(file_dentry(file));
- if (IS_ERR(fid)) {
- err = PTR_ERR(fid);
+ if (IS_ERR(writeback_fid)) {
+ err = PTR_ERR(writeback_fid);
mutex_unlock(&v9inode->v_mutex);
goto out_error;
}
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 8d97f0b45e9c..795706520b5e 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -399,7 +399,7 @@ static int v9fs_test_inode(struct inode *inode, void *data)
umode = p9mode2unixmode(v9ses, st, &rdev);
/* don't match inode of different type */
- if ((inode->i_mode & S_IFMT) != (umode & S_IFMT))
+ if (inode_wrong_type(inode, umode))
return 0;
/* compare qid details */
@@ -1390,7 +1390,7 @@ int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode)
* Don't update inode if the file type is different
*/
umode = p9mode2unixmode(v9ses, st, &rdev);
- if ((inode->i_mode & S_IFMT) != (umode & S_IFMT))
+ if (inode_wrong_type(inode, umode))
goto out;
/*
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 1dc7af046615..e1c0240b51c0 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -59,7 +59,7 @@ static int v9fs_test_inode_dotl(struct inode *inode, void *data)
struct p9_stat_dotl *st = (struct p9_stat_dotl *)data;
/* don't match inode of different type */
- if ((inode->i_mode & S_IFMT) != (st->st_mode & S_IFMT))
+ if (inode_wrong_type(inode, st->st_mode))
return 0;
if (inode->i_generation != st->st_gen)
@@ -663,14 +663,10 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode,
if (stat->st_result_mask & P9_STATS_NLINK)
set_nlink(inode, stat->st_nlink);
if (stat->st_result_mask & P9_STATS_MODE) {
- inode->i_mode = stat->st_mode;
- if ((S_ISBLK(inode->i_mode)) ||
- (S_ISCHR(inode->i_mode)))
- init_special_inode(inode, inode->i_mode,
- inode->i_rdev);
+ mode = stat->st_mode & S_IALLUGO;
+ mode |= inode->i_mode & ~S_IALLUGO;
+ inode->i_mode = mode;
}
- if (stat->st_result_mask & P9_STATS_RDEV)
- inode->i_rdev = new_decode_dev(stat->st_rdev);
if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE) &&
stat->st_result_mask & P9_STATS_SIZE)
v9fs_i_size_write(inode, stat->st_size);
@@ -959,7 +955,7 @@ int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode)
/*
* Don't update inode if the file type is different
*/
- if ((inode->i_mode & S_IFMT) != (st->st_mode & S_IFMT))
+ if (inode_wrong_type(inode, st->st_mode))
goto out;
/*
diff --git a/fs/Kconfig b/fs/Kconfig
index a55bda4233bb..141a856c50e7 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -125,6 +125,7 @@ source "fs/overlayfs/Kconfig"
menu "Caches"
+source "fs/netfs/Kconfig"
source "fs/fscache/Kconfig"
source "fs/cachefiles/Kconfig"
@@ -222,10 +223,13 @@ config TMPFS_INODE64
If unsure, say N.
+config ARCH_SUPPORTS_HUGETLBFS
+ def_bool n
+
config HUGETLBFS
bool "HugeTLB file system support"
depends on X86 || IA64 || SPARC64 || (S390 && 64BIT) || \
- SYS_SUPPORTS_HUGETLBFS || BROKEN
+ ARCH_SUPPORTS_HUGETLBFS || BROKEN
help
hugetlbfs is a filesystem backing for HugeTLB pages, based on
ramfs. For architectures that support it, say Y here and read
@@ -334,8 +338,8 @@ config NFS_COMMON
default y
config NFS_V4_2_SSC_HELPER
- tristate
- default y if NFS_V4=y || NFS_FS=y
+ bool
+ default y if NFS_V4_2
source "net/sunrpc/Kconfig"
source "fs/ceph/Kconfig"
diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt
index c6f1c8c1934e..06fb7a93a1bd 100644
--- a/fs/Kconfig.binfmt
+++ b/fs/Kconfig.binfmt
@@ -112,6 +112,9 @@ config BINFMT_FLAT_ARGVP_ENVP_ON_STACK
config BINFMT_FLAT_OLD_ALWAYS_RAM
bool
+config BINFMT_FLAT_NO_DATA_START_OFFSET
+ bool
+
config BINFMT_FLAT_OLD
bool "Enable support for very old legacy flat binaries"
depends on BINFMT_FLAT
diff --git a/fs/Makefile b/fs/Makefile
index 3215fe205256..9c708e1fbe8f 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -67,6 +67,7 @@ obj-y += devpts/
obj-$(CONFIG_DLM) += dlm/
# Do not add any filesystems before this line
+obj-$(CONFIG_NETFS_SUPPORT) += netfs/
obj-$(CONFIG_FSCACHE) += fscache/
obj-$(CONFIG_REISERFS_FS) += reiserfs/
obj-$(CONFIG_EXT4_FS) += ext4/
diff --git a/fs/afs/Kconfig b/fs/afs/Kconfig
index 1ad211d72b3b..fc8ba9142f2f 100644
--- a/fs/afs/Kconfig
+++ b/fs/afs/Kconfig
@@ -4,6 +4,7 @@ config AFS_FS
depends on INET
select AF_RXRPC
select DNS_RESOLVER
+ select NETFS_SUPPORT
help
If you say Y here, you will get an experimental Andrew File System
driver. It currently only supports unsecured read-only AFS access.
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index a4e9e6e07e93..d3c6bb22c5f4 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -322,6 +322,8 @@ static int afs_deliver_cb_callback(struct afs_call *call)
return ret;
call->unmarshall++;
+ fallthrough;
+
case 5:
break;
}
@@ -418,6 +420,7 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call)
r->node[loop] = ntohl(b[loop + 5]);
call->unmarshall++;
+ fallthrough;
case 2:
break;
@@ -530,6 +533,7 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call)
r->node[loop] = ntohl(b[loop + 5]);
call->unmarshall++;
+ fallthrough;
case 2:
break;
@@ -663,6 +667,7 @@ static int afs_deliver_yfs_cb_callback(struct afs_call *call)
afs_extract_to_tmp(call);
call->unmarshall++;
+ fallthrough;
case 3:
break;
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 17548c1faf02..78719f2f567e 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -103,6 +103,35 @@ struct afs_lookup_cookie {
};
/*
+ * Drop the refs that we're holding on the pages we were reading into. We've
+ * got refs on the first nr_pages pages.
+ */
+static void afs_dir_read_cleanup(struct afs_read *req)
+{
+ struct address_space *mapping = req->vnode->vfs_inode.i_mapping;
+ struct page *page;
+ pgoff_t last = req->nr_pages - 1;
+
+ XA_STATE(xas, &mapping->i_pages, 0);
+
+ if (unlikely(!req->nr_pages))
+ return;
+
+ rcu_read_lock();
+ xas_for_each(&xas, page, last) {
+ if (xas_retry(&xas, page))
+ continue;
+ BUG_ON(xa_is_value(page));
+ BUG_ON(PageCompound(page));
+ ASSERTCMP(page->mapping, ==, mapping);
+
+ put_page(page);
+ }
+
+ rcu_read_unlock();
+}
+
+/*
* check that a directory page is valid
*/
static bool afs_dir_check_page(struct afs_vnode *dvnode, struct page *page,
@@ -127,7 +156,7 @@ static bool afs_dir_check_page(struct afs_vnode *dvnode, struct page *page,
qty /= sizeof(union afs_xdr_dir_block);
/* check them */
- dbuf = kmap(page);
+ dbuf = kmap_atomic(page);
for (tmp = 0; tmp < qty; tmp++) {
if (dbuf->blocks[tmp].hdr.magic != AFS_DIR_MAGIC) {
printk("kAFS: %s(%lx): bad magic %d/%d is %04hx\n",
@@ -146,7 +175,7 @@ static bool afs_dir_check_page(struct afs_vnode *dvnode, struct page *page,
((u8 *)&dbuf->blocks[tmp])[AFS_DIR_BLOCK_SIZE - 1] = 0;
}
- kunmap(page);
+ kunmap_atomic(dbuf);
checked:
afs_stat_v(dvnode, n_read_dir);
@@ -157,35 +186,74 @@ error:
}
/*
- * Check the contents of a directory that we've just read.
+ * Dump the contents of a directory.
*/
-static bool afs_dir_check_pages(struct afs_vnode *dvnode, struct afs_read *req)
+static void afs_dir_dump(struct afs_vnode *dvnode, struct afs_read *req)
{
struct afs_xdr_dir_page *dbuf;
- unsigned int i, j, qty = PAGE_SIZE / sizeof(union afs_xdr_dir_block);
+ struct address_space *mapping = dvnode->vfs_inode.i_mapping;
+ struct page *page;
+ unsigned int i, qty = PAGE_SIZE / sizeof(union afs_xdr_dir_block);
+ pgoff_t last = req->nr_pages - 1;
- for (i = 0; i < req->nr_pages; i++)
- if (!afs_dir_check_page(dvnode, req->pages[i], req->actual_len))
- goto bad;
- return true;
+ XA_STATE(xas, &mapping->i_pages, 0);
-bad:
- pr_warn("DIR %llx:%llx f=%llx l=%llx al=%llx r=%llx\n",
+ pr_warn("DIR %llx:%llx f=%llx l=%llx al=%llx\n",
dvnode->fid.vid, dvnode->fid.vnode,
- req->file_size, req->len, req->actual_len, req->remain);
- pr_warn("DIR %llx %x %x %x\n",
- req->pos, req->index, req->nr_pages, req->offset);
+ req->file_size, req->len, req->actual_len);
+ pr_warn("DIR %llx %x %zx %zx\n",
+ req->pos, req->nr_pages,
+ req->iter->iov_offset, iov_iter_count(req->iter));
- for (i = 0; i < req->nr_pages; i++) {
- dbuf = kmap(req->pages[i]);
- for (j = 0; j < qty; j++) {
- union afs_xdr_dir_block *block = &dbuf->blocks[j];
+ xas_for_each(&xas, page, last) {
+ if (xas_retry(&xas, page))
+ continue;
+
+ BUG_ON(PageCompound(page));
+ BUG_ON(page->mapping != mapping);
+
+ dbuf = kmap_atomic(page);
+ for (i = 0; i < qty; i++) {
+ union afs_xdr_dir_block *block = &dbuf->blocks[i];
- pr_warn("[%02x] %32phN\n", i * qty + j, block);
+ pr_warn("[%02lx] %32phN\n", page->index * qty + i, block);
}
- kunmap(req->pages[i]);
+ kunmap_atomic(dbuf);
}
- return false;
+}
+
+/*
+ * Check all the pages in a directory. All the pages are held pinned.
+ */
+static int afs_dir_check(struct afs_vnode *dvnode, struct afs_read *req)
+{
+ struct address_space *mapping = dvnode->vfs_inode.i_mapping;
+ struct page *page;
+ pgoff_t last = req->nr_pages - 1;
+ int ret = 0;
+
+ XA_STATE(xas, &mapping->i_pages, 0);
+
+ if (unlikely(!req->nr_pages))
+ return 0;
+
+ rcu_read_lock();
+ xas_for_each(&xas, page, last) {
+ if (xas_retry(&xas, page))
+ continue;
+
+ BUG_ON(PageCompound(page));
+ BUG_ON(page->mapping != mapping);
+
+ if (!afs_dir_check_page(dvnode, page, req->file_size)) {
+ afs_dir_dump(dvnode, req);
+ ret = -EIO;
+ break;
+ }
+ }
+
+ rcu_read_unlock();
+ return ret;
}
/*
@@ -214,57 +282,57 @@ static struct afs_read *afs_read_dir(struct afs_vnode *dvnode, struct key *key)
{
struct afs_read *req;
loff_t i_size;
- int nr_pages, nr_inline, i, n;
- int ret = -ENOMEM;
+ int nr_pages, i, n;
+ int ret;
+
+ _enter("");
-retry:
+ req = kzalloc(sizeof(*req), GFP_KERNEL);
+ if (!req)
+ return ERR_PTR(-ENOMEM);
+
+ refcount_set(&req->usage, 1);
+ req->vnode = dvnode;
+ req->key = key_get(key);
+ req->cleanup = afs_dir_read_cleanup;
+
+expand:
i_size = i_size_read(&dvnode->vfs_inode);
- if (i_size < 2048)
- return ERR_PTR(afs_bad(dvnode, afs_file_error_dir_small));
+ if (i_size < 2048) {
+ ret = afs_bad(dvnode, afs_file_error_dir_small);
+ goto error;
+ }
if (i_size > 2048 * 1024) {
trace_afs_file_error(dvnode, -EFBIG, afs_file_error_dir_big);
- return ERR_PTR(-EFBIG);
+ ret = -EFBIG;
+ goto error;
}
_enter("%llu", i_size);
- /* Get a request record to hold the page list. We want to hold it
- * inline if we can, but we don't want to make an order 1 allocation.
- */
nr_pages = (i_size + PAGE_SIZE - 1) / PAGE_SIZE;
- nr_inline = nr_pages;
- if (nr_inline > (PAGE_SIZE - sizeof(*req)) / sizeof(struct page *))
- nr_inline = 0;
- req = kzalloc(struct_size(req, array, nr_inline), GFP_KERNEL);
- if (!req)
- return ERR_PTR(-ENOMEM);
-
- refcount_set(&req->usage, 1);
- req->nr_pages = nr_pages;
req->actual_len = i_size; /* May change */
req->len = nr_pages * PAGE_SIZE; /* We can ask for more than there is */
req->data_version = dvnode->status.data_version; /* May change */
- if (nr_inline > 0) {
- req->pages = req->array;
- } else {
- req->pages = kcalloc(nr_pages, sizeof(struct page *),
- GFP_KERNEL);
- if (!req->pages)
- goto error;
- }
+ iov_iter_xarray(&req->def_iter, READ, &dvnode->vfs_inode.i_mapping->i_pages,
+ 0, i_size);
+ req->iter = &req->def_iter;
- /* Get a list of all the pages that hold or will hold the directory
- * content. We need to fill in any gaps that we might find where the
- * memory reclaimer has been at work. If there are any gaps, we will
+ /* Fill in any gaps that we might find where the memory reclaimer has
+ * been at work and pin all the pages. If there are any gaps, we will
* need to reread the entire directory contents.
*/
- i = 0;
- do {
+ i = req->nr_pages;
+ while (i < nr_pages) {
+ struct page *pages[8], *page;
+
n = find_get_pages_contig(dvnode->vfs_inode.i_mapping, i,
- req->nr_pages - i,
- req->pages + i);
- _debug("find %u at %u/%u", n, i, req->nr_pages);
+ min_t(unsigned int, nr_pages - i,
+ ARRAY_SIZE(pages)),
+ pages);
+ _debug("find %u at %u/%u", n, i, nr_pages);
+
if (n == 0) {
gfp_t gfp = dvnode->vfs_inode.i_mapping->gfp_mask;
@@ -272,22 +340,24 @@ retry:
afs_stat_v(dvnode, n_inval);
ret = -ENOMEM;
- req->pages[i] = __page_cache_alloc(gfp);
- if (!req->pages[i])
+ page = __page_cache_alloc(gfp);
+ if (!page)
goto error;
- ret = add_to_page_cache_lru(req->pages[i],
+ ret = add_to_page_cache_lru(page,
dvnode->vfs_inode.i_mapping,
i, gfp);
if (ret < 0)
goto error;
- attach_page_private(req->pages[i], (void *)1);
- unlock_page(req->pages[i]);
+ attach_page_private(page, (void *)1);
+ unlock_page(page);
+ req->nr_pages++;
i++;
} else {
+ req->nr_pages += n;
i += n;
}
- } while (i < req->nr_pages);
+ }
/* If we're going to reload, we need to lock all the pages to prevent
* races.
@@ -305,18 +375,23 @@ retry:
if (!test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) {
trace_afs_reload_dir(dvnode);
- ret = afs_fetch_data(dvnode, key, req);
+ ret = afs_fetch_data(dvnode, req);
if (ret < 0)
goto error_unlock;
task_io_account_read(PAGE_SIZE * req->nr_pages);
- if (req->len < req->file_size)
- goto content_has_grown;
+ if (req->len < req->file_size) {
+ /* The content has grown, so we need to expand the
+ * buffer.
+ */
+ up_write(&dvnode->validate_lock);
+ goto expand;
+ }
/* Validate the data we just read. */
- ret = -EIO;
- if (!afs_dir_check_pages(dvnode, req))
+ ret = afs_dir_check(dvnode, req);
+ if (ret < 0)
goto error_unlock;
// TODO: Trim excess pages
@@ -334,11 +409,6 @@ error:
afs_put_read(req);
_leave(" = %d", ret);
return ERR_PTR(ret);
-
-content_has_grown:
- up_write(&dvnode->validate_lock);
- afs_put_read(req);
- goto retry;
}
/*
@@ -448,6 +518,7 @@ static int afs_dir_iterate(struct inode *dir, struct dir_context *ctx,
struct afs_read *req;
struct page *page;
unsigned blkoff, limit;
+ void __rcu **slot;
int ret;
_enter("{%lu},%u,,", dir->i_ino, (unsigned)ctx->pos);
@@ -472,9 +543,15 @@ static int afs_dir_iterate(struct inode *dir, struct dir_context *ctx,
blkoff = ctx->pos & ~(sizeof(union afs_xdr_dir_block) - 1);
/* Fetch the appropriate page from the directory and re-add it
- * to the LRU.
+ * to the LRU. We have all the pages pinned with an extra ref.
*/
- page = req->pages[blkoff / PAGE_SIZE];
+ rcu_read_lock();
+ page = NULL;
+ slot = radix_tree_lookup_slot(&dvnode->vfs_inode.i_mapping->i_pages,
+ blkoff / PAGE_SIZE);
+ if (slot)
+ page = radix_tree_deref_slot(slot);
+ rcu_read_unlock();
if (!page) {
ret = afs_bad(dvnode, afs_file_error_dir_missing_page);
break;
@@ -1342,6 +1419,7 @@ static int afs_mkdir(struct user_namespace *mnt_userns, struct inode *dir,
afs_op_set_vnode(op, 0, dvnode);
op->file[0].dv_delta = 1;
+ op->file[0].modification = true;
op->file[0].update_ctime = true;
op->dentry = dentry;
op->create.mode = S_IFDIR | mode;
@@ -1423,6 +1501,7 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry)
afs_op_set_vnode(op, 0, dvnode);
op->file[0].dv_delta = 1;
+ op->file[0].modification = true;
op->file[0].update_ctime = true;
op->dentry = dentry;
@@ -1559,6 +1638,7 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry)
afs_op_set_vnode(op, 0, dvnode);
op->file[0].dv_delta = 1;
+ op->file[0].modification = true;
op->file[0].update_ctime = true;
/* Try to make sure we have a callback promise on the victim. */
@@ -1641,6 +1721,7 @@ static int afs_create(struct user_namespace *mnt_userns, struct inode *dir,
afs_op_set_vnode(op, 0, dvnode);
op->file[0].dv_delta = 1;
+ op->file[0].modification = true;
op->file[0].update_ctime = true;
op->dentry = dentry;
@@ -1715,6 +1796,7 @@ static int afs_link(struct dentry *from, struct inode *dir,
afs_op_set_vnode(op, 0, dvnode);
afs_op_set_vnode(op, 1, vnode);
op->file[0].dv_delta = 1;
+ op->file[0].modification = true;
op->file[0].update_ctime = true;
op->file[1].update_ctime = true;
@@ -1837,7 +1919,9 @@ static void afs_rename_edit_dir(struct afs_operation *op)
new_inode = d_inode(new_dentry);
if (new_inode) {
spin_lock(&new_inode->i_lock);
- if (new_inode->i_nlink > 0)
+ if (S_ISDIR(new_inode->i_mode))
+ clear_nlink(new_inode);
+ else if (new_inode->i_nlink > 0)
drop_nlink(new_inode);
spin_unlock(&new_inode->i_lock);
}
@@ -1910,6 +1994,8 @@ static int afs_rename(struct user_namespace *mnt_userns, struct inode *old_dir,
afs_op_set_vnode(op, 1, new_dvnode); /* May be same as orig_dvnode */
op->file[0].dv_delta = 1;
op->file[1].dv_delta = 1;
+ op->file[0].modification = true;
+ op->file[1].modification = true;
op->file[0].update_ctime = true;
op->file[1].update_ctime = true;
@@ -2006,6 +2092,6 @@ static void afs_dir_invalidatepage(struct page *page, unsigned int offset,
afs_stat_v(dvnode, n_inval);
/* we clean up only if the entire page is being invalidated */
- if (offset == 0 && length == PAGE_SIZE)
+ if (offset == 0 && length == thp_size(page))
detach_page_private(page);
}
diff --git a/fs/afs/dir_silly.c b/fs/afs/dir_silly.c
index 04f75a44f243..dae9a57d7ec0 100644
--- a/fs/afs/dir_silly.c
+++ b/fs/afs/dir_silly.c
@@ -73,6 +73,8 @@ static int afs_do_silly_rename(struct afs_vnode *dvnode, struct afs_vnode *vnode
afs_op_set_vnode(op, 1, dvnode);
op->file[0].dv_delta = 1;
op->file[1].dv_delta = 1;
+ op->file[0].modification = true;
+ op->file[1].modification = true;
op->file[0].update_ctime = true;
op->file[1].update_ctime = true;
@@ -201,6 +203,7 @@ static int afs_do_silly_unlink(struct afs_vnode *dvnode, struct afs_vnode *vnode
afs_op_set_vnode(op, 0, dvnode);
afs_op_set_vnode(op, 1, vnode);
op->file[0].dv_delta = 1;
+ op->file[0].modification = true;
op->file[0].update_ctime = true;
op->file[1].op_unlinked = true;
op->file[1].update_ctime = true;
diff --git a/fs/afs/file.c b/fs/afs/file.c
index 960b64268623..db035ae2a134 100644
--- a/fs/afs/file.c
+++ b/fs/afs/file.c
@@ -14,6 +14,7 @@
#include <linux/gfp.h>
#include <linux/task_io_accounting_ops.h>
#include <linux/mm.h>
+#include <linux/netfs.h>
#include "internal.h"
static int afs_file_mmap(struct file *file, struct vm_area_struct *vma);
@@ -22,8 +23,7 @@ static void afs_invalidatepage(struct page *page, unsigned int offset,
unsigned int length);
static int afs_releasepage(struct page *page, gfp_t gfp_flags);
-static int afs_readpages(struct file *filp, struct address_space *mapping,
- struct list_head *pages, unsigned nr_pages);
+static void afs_readahead(struct readahead_control *ractl);
const struct file_operations afs_file_operations = {
.open = afs_open,
@@ -47,7 +47,7 @@ const struct inode_operations afs_file_inode_operations = {
const struct address_space_operations afs_fs_aops = {
.readpage = afs_readpage,
- .readpages = afs_readpages,
+ .readahead = afs_readahead,
.set_page_dirty = afs_set_page_dirty,
.launder_page = afs_launder_page,
.releasepage = afs_releasepage,
@@ -184,41 +184,50 @@ int afs_release(struct inode *inode, struct file *file)
}
/*
+ * Allocate a new read record.
+ */
+struct afs_read *afs_alloc_read(gfp_t gfp)
+{
+ struct afs_read *req;
+
+ req = kzalloc(sizeof(struct afs_read), gfp);
+ if (req)
+ refcount_set(&req->usage, 1);
+
+ return req;
+}
+
+/*
* Dispose of a ref to a read record.
*/
void afs_put_read(struct afs_read *req)
{
- int i;
-
if (refcount_dec_and_test(&req->usage)) {
- if (req->pages) {
- for (i = 0; i < req->nr_pages; i++)
- if (req->pages[i])
- put_page(req->pages[i]);
- if (req->pages != req->array)
- kfree(req->pages);
- }
+ if (req->cleanup)
+ req->cleanup(req);
+ key_put(req->key);
kfree(req);
}
}
-#ifdef CONFIG_AFS_FSCACHE
-/*
- * deal with notification that a page was read from the cache
- */
-static void afs_file_readpage_read_complete(struct page *page,
- void *data,
- int error)
+static void afs_fetch_data_notify(struct afs_operation *op)
{
- _enter("%p,%p,%d", page, data, error);
-
- /* if the read completes with an error, we just unlock the page and let
- * the VM reissue the readpage */
- if (!error)
- SetPageUptodate(page);
- unlock_page(page);
+ struct afs_read *req = op->fetch.req;
+ struct netfs_read_subrequest *subreq = req->subreq;
+ int error = op->error;
+
+ if (error == -ECONNABORTED)
+ error = afs_abort_to_error(op->ac.abort_code);
+ req->error = error;
+
+ if (subreq) {
+ __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
+ netfs_subreq_terminated(subreq, error ?: req->actual_len, false);
+ req->subreq = NULL;
+ } else if (req->done) {
+ req->done(req);
+ }
}
-#endif
static void afs_fetch_data_success(struct afs_operation *op)
{
@@ -228,10 +237,12 @@ static void afs_fetch_data_success(struct afs_operation *op)
afs_vnode_commit_status(op, &op->file[0]);
afs_stat_v(vnode, n_fetches);
atomic_long_add(op->fetch.req->actual_len, &op->net->n_fetch_bytes);
+ afs_fetch_data_notify(op);
}
static void afs_fetch_data_put(struct afs_operation *op)
{
+ op->fetch.req->error = op->error;
afs_put_read(op->fetch.req);
}
@@ -240,13 +251,14 @@ static const struct afs_operation_ops afs_fetch_data_operation = {
.issue_yfs_rpc = yfs_fs_fetch_data,
.success = afs_fetch_data_success,
.aborted = afs_check_for_remote_deletion,
+ .failed = afs_fetch_data_notify,
.put = afs_fetch_data_put,
};
/*
* Fetch file data from the volume.
*/
-int afs_fetch_data(struct afs_vnode *vnode, struct key *key, struct afs_read *req)
+int afs_fetch_data(struct afs_vnode *vnode, struct afs_read *req)
{
struct afs_operation *op;
@@ -255,11 +267,14 @@ int afs_fetch_data(struct afs_vnode *vnode, struct key *key, struct afs_read *re
vnode->fid.vid,
vnode->fid.vnode,
vnode->fid.unique,
- key_serial(key));
+ key_serial(req->key));
- op = afs_alloc_operation(key, vnode->volume);
- if (IS_ERR(op))
+ op = afs_alloc_operation(req->key, vnode->volume);
+ if (IS_ERR(op)) {
+ if (req->subreq)
+ netfs_subreq_terminated(req->subreq, PTR_ERR(op), false);
return PTR_ERR(op);
+ }
afs_op_set_vnode(op, 0, vnode);
@@ -268,336 +283,103 @@ int afs_fetch_data(struct afs_vnode *vnode, struct key *key, struct afs_read *re
return afs_do_sync_operation(op);
}
-/*
- * read page from file, directory or symlink, given a key to use
- */
-int afs_page_filler(void *data, struct page *page)
+static void afs_req_issue_op(struct netfs_read_subrequest *subreq)
{
- struct inode *inode = page->mapping->host;
- struct afs_vnode *vnode = AFS_FS_I(inode);
- struct afs_read *req;
- struct key *key = data;
- int ret;
-
- _enter("{%x},{%lu},{%lu}", key_serial(key), inode->i_ino, page->index);
+ struct afs_vnode *vnode = AFS_FS_I(subreq->rreq->inode);
+ struct afs_read *fsreq;
- BUG_ON(!PageLocked(page));
+ fsreq = afs_alloc_read(GFP_NOFS);
+ if (!fsreq)
+ return netfs_subreq_terminated(subreq, -ENOMEM, false);
- ret = -ESTALE;
- if (test_bit(AFS_VNODE_DELETED, &vnode->flags))
- goto error;
+ fsreq->subreq = subreq;
+ fsreq->pos = subreq->start + subreq->transferred;
+ fsreq->len = subreq->len - subreq->transferred;
+ fsreq->key = subreq->rreq->netfs_priv;
+ fsreq->vnode = vnode;
+ fsreq->iter = &fsreq->def_iter;
- /* is it cached? */
-#ifdef CONFIG_AFS_FSCACHE
- ret = fscache_read_or_alloc_page(vnode->cache,
- page,
- afs_file_readpage_read_complete,
- NULL,
- GFP_KERNEL);
-#else
- ret = -ENOBUFS;
-#endif
- switch (ret) {
- /* read BIO submitted (page in cache) */
- case 0:
- break;
-
- /* page not yet cached */
- case -ENODATA:
- _debug("cache said ENODATA");
- goto go_on;
-
- /* page will not be cached */
- case -ENOBUFS:
- _debug("cache said ENOBUFS");
-
- fallthrough;
- default:
- go_on:
- req = kzalloc(struct_size(req, array, 1), GFP_KERNEL);
- if (!req)
- goto enomem;
-
- /* We request a full page. If the page is a partial one at the
- * end of the file, the server will return a short read and the
- * unmarshalling code will clear the unfilled space.
- */
- refcount_set(&req->usage, 1);
- req->pos = (loff_t)page->index << PAGE_SHIFT;
- req->len = PAGE_SIZE;
- req->nr_pages = 1;
- req->pages = req->array;
- req->pages[0] = page;
- get_page(page);
-
- /* read the contents of the file from the server into the
- * page */
- ret = afs_fetch_data(vnode, key, req);
- afs_put_read(req);
-
- if (ret < 0) {
- if (ret == -ENOENT) {
- _debug("got NOENT from server"
- " - marking file deleted and stale");
- set_bit(AFS_VNODE_DELETED, &vnode->flags);
- ret = -ESTALE;
- }
+ iov_iter_xarray(&fsreq->def_iter, READ,
+ &fsreq->vnode->vfs_inode.i_mapping->i_pages,
+ fsreq->pos, fsreq->len);
-#ifdef CONFIG_AFS_FSCACHE
- fscache_uncache_page(vnode->cache, page);
-#endif
- BUG_ON(PageFsCache(page));
-
- if (ret == -EINTR ||
- ret == -ENOMEM ||
- ret == -ERESTARTSYS ||
- ret == -EAGAIN)
- goto error;
- goto io_error;
- }
+ afs_fetch_data(fsreq->vnode, fsreq);
+}
- SetPageUptodate(page);
+static int afs_symlink_readpage(struct page *page)
+{
+ struct afs_vnode *vnode = AFS_FS_I(page->mapping->host);
+ struct afs_read *fsreq;
+ int ret;
- /* send the page to the cache */
-#ifdef CONFIG_AFS_FSCACHE
- if (PageFsCache(page) &&
- fscache_write_page(vnode->cache, page, vnode->status.size,
- GFP_KERNEL) != 0) {
- fscache_uncache_page(vnode->cache, page);
- BUG_ON(PageFsCache(page));
- }
-#endif
- unlock_page(page);
- }
+ fsreq = afs_alloc_read(GFP_NOFS);
+ if (!fsreq)
+ return -ENOMEM;
- _leave(" = 0");
- return 0;
+ fsreq->pos = page->index * PAGE_SIZE;
+ fsreq->len = PAGE_SIZE;
+ fsreq->vnode = vnode;
+ fsreq->iter = &fsreq->def_iter;
+ iov_iter_xarray(&fsreq->def_iter, READ, &page->mapping->i_pages,
+ fsreq->pos, fsreq->len);
-io_error:
- SetPageError(page);
- goto error;
-enomem:
- ret = -ENOMEM;
-error:
- unlock_page(page);
- _leave(" = %d", ret);
+ ret = afs_fetch_data(fsreq->vnode, fsreq);
+ page_endio(page, false, ret);
return ret;
}
-/*
- * read page from file, directory or symlink, given a file to nominate the key
- * to be used
- */
-static int afs_readpage(struct file *file, struct page *page)
+static void afs_init_rreq(struct netfs_read_request *rreq, struct file *file)
{
- struct key *key;
- int ret;
-
- if (file) {
- key = afs_file_key(file);
- ASSERT(key != NULL);
- ret = afs_page_filler(key, page);
- } else {
- struct inode *inode = page->mapping->host;
- key = afs_request_key(AFS_FS_S(inode->i_sb)->cell);
- if (IS_ERR(key)) {
- ret = PTR_ERR(key);
- } else {
- ret = afs_page_filler(key, page);
- key_put(key);
- }
- }
- return ret;
+ rreq->netfs_priv = key_get(afs_file_key(file));
}
-/*
- * Make pages available as they're filled.
- */
-static void afs_readpages_page_done(struct afs_read *req)
+static bool afs_is_cache_enabled(struct inode *inode)
{
-#ifdef CONFIG_AFS_FSCACHE
- struct afs_vnode *vnode = req->vnode;
-#endif
- struct page *page = req->pages[req->index];
+ struct fscache_cookie *cookie = afs_vnode_cache(AFS_FS_I(inode));
- req->pages[req->index] = NULL;
- SetPageUptodate(page);
-
- /* send the page to the cache */
-#ifdef CONFIG_AFS_FSCACHE
- if (PageFsCache(page) &&
- fscache_write_page(vnode->cache, page, vnode->status.size,
- GFP_KERNEL) != 0) {
- fscache_uncache_page(vnode->cache, page);
- BUG_ON(PageFsCache(page));
- }
-#endif
- unlock_page(page);
- put_page(page);
+ return fscache_cookie_enabled(cookie) && !hlist_empty(&cookie->backing_objects);
}
-/*
- * Read a contiguous set of pages.
- */
-static int afs_readpages_one(struct file *file, struct address_space *mapping,
- struct list_head *pages)
+static int afs_begin_cache_operation(struct netfs_read_request *rreq)
{
- struct afs_vnode *vnode = AFS_FS_I(mapping->host);
- struct afs_read *req;
- struct list_head *p;
- struct page *first, *page;
- struct key *key = afs_file_key(file);
- pgoff_t index;
- int ret, n, i;
-
- /* Count the number of contiguous pages at the front of the list. Note
- * that the list goes prev-wards rather than next-wards.
- */
- first = lru_to_page(pages);
- index = first->index + 1;
- n = 1;
- for (p = first->lru.prev; p != pages; p = p->prev) {
- page = list_entry(p, struct page, lru);
- if (page->index != index)
- break;
- index++;
- n++;
- }
-
- req = kzalloc(struct_size(req, array, n), GFP_NOFS);
- if (!req)
- return -ENOMEM;
-
- refcount_set(&req->usage, 1);
- req->vnode = vnode;
- req->page_done = afs_readpages_page_done;
- req->pos = first->index;
- req->pos <<= PAGE_SHIFT;
- req->pages = req->array;
-
- /* Transfer the pages to the request. We add them in until one fails
- * to add to the LRU and then we stop (as that'll make a hole in the
- * contiguous run.
- *
- * Note that it's possible for the file size to change whilst we're
- * doing this, but we rely on the server returning less than we asked
- * for if the file shrank. We also rely on this to deal with a partial
- * page at the end of the file.
- */
- do {
- page = lru_to_page(pages);
- list_del(&page->lru);
- index = page->index;
- if (add_to_page_cache_lru(page, mapping, index,
- readahead_gfp_mask(mapping))) {
-#ifdef CONFIG_AFS_FSCACHE
- fscache_uncache_page(vnode->cache, page);
-#endif
- put_page(page);
- break;
- }
-
- req->pages[req->nr_pages++] = page;
- req->len += PAGE_SIZE;
- } while (req->nr_pages < n);
+ struct afs_vnode *vnode = AFS_FS_I(rreq->inode);
- if (req->nr_pages == 0) {
- kfree(req);
- return 0;
- }
-
- ret = afs_fetch_data(vnode, key, req);
- if (ret < 0)
- goto error;
-
- task_io_account_read(PAGE_SIZE * req->nr_pages);
- afs_put_read(req);
- return 0;
-
-error:
- if (ret == -ENOENT) {
- _debug("got NOENT from server"
- " - marking file deleted and stale");
- set_bit(AFS_VNODE_DELETED, &vnode->flags);
- ret = -ESTALE;
- }
-
- for (i = 0; i < req->nr_pages; i++) {
- page = req->pages[i];
- if (page) {
-#ifdef CONFIG_AFS_FSCACHE
- fscache_uncache_page(vnode->cache, page);
-#endif
- SetPageError(page);
- unlock_page(page);
- }
- }
-
- afs_put_read(req);
- return ret;
+ return fscache_begin_read_operation(rreq, afs_vnode_cache(vnode));
}
-/*
- * read a set of pages
- */
-static int afs_readpages(struct file *file, struct address_space *mapping,
- struct list_head *pages, unsigned nr_pages)
+static int afs_check_write_begin(struct file *file, loff_t pos, unsigned len,
+ struct page *page, void **_fsdata)
{
- struct key *key = afs_file_key(file);
- struct afs_vnode *vnode;
- int ret = 0;
-
- _enter("{%d},{%lu},,%d",
- key_serial(key), mapping->host->i_ino, nr_pages);
+ struct afs_vnode *vnode = AFS_FS_I(file_inode(file));
- ASSERT(key != NULL);
+ return test_bit(AFS_VNODE_DELETED, &vnode->flags) ? -ESTALE : 0;
+}
- vnode = AFS_FS_I(mapping->host);
- if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) {
- _leave(" = -ESTALE");
- return -ESTALE;
- }
+static void afs_priv_cleanup(struct address_space *mapping, void *netfs_priv)
+{
+ key_put(netfs_priv);
+}
- /* attempt to read as many of the pages as possible */
-#ifdef CONFIG_AFS_FSCACHE
- ret = fscache_read_or_alloc_pages(vnode->cache,
- mapping,
- pages,
- &nr_pages,
- afs_file_readpage_read_complete,
- NULL,
- mapping_gfp_mask(mapping));
-#else
- ret = -ENOBUFS;
-#endif
+const struct netfs_read_request_ops afs_req_ops = {
+ .init_rreq = afs_init_rreq,
+ .is_cache_enabled = afs_is_cache_enabled,
+ .begin_cache_operation = afs_begin_cache_operation,
+ .check_write_begin = afs_check_write_begin,
+ .issue_op = afs_req_issue_op,
+ .cleanup = afs_priv_cleanup,
+};
- switch (ret) {
- /* all pages are being read from the cache */
- case 0:
- BUG_ON(!list_empty(pages));
- BUG_ON(nr_pages != 0);
- _leave(" = 0 [reading all]");
- return 0;
-
- /* there were pages that couldn't be read from the cache */
- case -ENODATA:
- case -ENOBUFS:
- break;
-
- /* other error */
- default:
- _leave(" = %d", ret);
- return ret;
- }
+static int afs_readpage(struct file *file, struct page *page)
+{
+ if (!file)
+ return afs_symlink_readpage(page);
- while (!list_empty(pages)) {
- ret = afs_readpages_one(file, mapping, pages);
- if (ret < 0)
- break;
- }
+ return netfs_readpage(file, page, &afs_req_ops, NULL);
+}
- _leave(" = %d [netting]", ret);
- return ret;
+static void afs_readahead(struct readahead_control *ractl)
+{
+ netfs_readahead(ractl, &afs_req_ops, NULL);
}
/*
@@ -625,8 +407,8 @@ static void afs_invalidate_dirty(struct page *page, unsigned int offset,
return;
/* We may need to shorten the dirty region */
- f = afs_page_dirty_from(priv);
- t = afs_page_dirty_to(priv);
+ f = afs_page_dirty_from(page, priv);
+ t = afs_page_dirty_to(page, priv);
if (t <= offset || f >= end)
return; /* Doesn't overlap */
@@ -644,17 +426,17 @@ static void afs_invalidate_dirty(struct page *page, unsigned int offset,
if (f == t)
goto undirty;
- priv = afs_page_dirty(f, t);
+ priv = afs_page_dirty(page, f, t);
set_page_private(page, priv);
- trace_afs_page_dirty(vnode, tracepoint_string("trunc"), page->index, priv);
+ trace_afs_page_dirty(vnode, tracepoint_string("trunc"), page);
return;
undirty:
- trace_afs_page_dirty(vnode, tracepoint_string("undirty"), page->index, priv);
+ trace_afs_page_dirty(vnode, tracepoint_string("undirty"), page);
clear_page_dirty_for_io(page);
full_invalidate:
- priv = (unsigned long)detach_page_private(page);
- trace_afs_page_dirty(vnode, tracepoint_string("inval"), page->index, priv);
+ trace_afs_page_dirty(vnode, tracepoint_string("inval"), page);
+ detach_page_private(page);
}
/*
@@ -669,20 +451,10 @@ static void afs_invalidatepage(struct page *page, unsigned int offset,
BUG_ON(!PageLocked(page));
-#ifdef CONFIG_AFS_FSCACHE
- /* we clean up only if the entire page is being invalidated */
- if (offset == 0 && length == PAGE_SIZE) {
- if (PageFsCache(page)) {
- struct afs_vnode *vnode = AFS_FS_I(page->mapping->host);
- fscache_wait_on_page_write(vnode->cache, page);
- fscache_uncache_page(vnode->cache, page);
- }
- }
-#endif
-
if (PagePrivate(page))
afs_invalidate_dirty(page, offset, length);
+ wait_on_page_fscache(page);
_leave("");
}
@@ -693,7 +465,6 @@ static void afs_invalidatepage(struct page *page, unsigned int offset,
static int afs_releasepage(struct page *page, gfp_t gfp_flags)
{
struct afs_vnode *vnode = AFS_FS_I(page->mapping->host);
- unsigned long priv;
_enter("{{%llx:%llu}[%lu],%lx},%x",
vnode->fid.vid, vnode->fid.vnode, page->index, page->flags,
@@ -702,16 +473,16 @@ static int afs_releasepage(struct page *page, gfp_t gfp_flags)
/* deny if page is being written to the cache and the caller hasn't
* elected to wait */
#ifdef CONFIG_AFS_FSCACHE
- if (!fscache_maybe_release_page(vnode->cache, page, gfp_flags)) {
- _leave(" = F [cache busy]");
- return 0;
+ if (PageFsCache(page)) {
+ if (!(gfp_flags & __GFP_DIRECT_RECLAIM) || !(gfp_flags & __GFP_FS))
+ return false;
+ wait_on_page_fscache(page);
}
#endif
if (PagePrivate(page)) {
- priv = (unsigned long)detach_page_private(page);
- trace_afs_page_dirty(vnode, tracepoint_string("rel"),
- page->index, priv);
+ trace_afs_page_dirty(vnode, tracepoint_string("rel"), page);
+ detach_page_private(page);
}
/* indicate that the page can be released */
diff --git a/fs/afs/fs_operation.c b/fs/afs/fs_operation.c
index 71c58723763d..d222dfbe976b 100644
--- a/fs/afs/fs_operation.c
+++ b/fs/afs/fs_operation.c
@@ -118,6 +118,8 @@ static void afs_prepare_vnode(struct afs_operation *op, struct afs_vnode_param *
vp->cb_break_before = afs_calc_vnode_cb_break(vnode);
if (vnode->lock_state != AFS_VNODE_LOCK_NONE)
op->flags |= AFS_OPERATION_CUR_ONLY;
+ if (vp->modification)
+ set_bit(AFS_VNODE_MODIFYING, &vnode->flags);
}
if (vp->fid.vnode)
@@ -198,8 +200,10 @@ void afs_wait_for_operation(struct afs_operation *op)
case -ECONNABORTED:
if (op->ops->aborted)
op->ops->aborted(op);
- break;
+ fallthrough;
default:
+ if (op->ops->failed)
+ op->ops->failed(op);
break;
}
@@ -223,6 +227,10 @@ int afs_put_operation(struct afs_operation *op)
if (op->ops && op->ops->put)
op->ops->put(op);
+ if (op->file[0].modification)
+ clear_bit(AFS_VNODE_MODIFYING, &op->file[0].vnode->flags);
+ if (op->file[1].modification && op->file[1].vnode != op->file[0].vnode)
+ clear_bit(AFS_VNODE_MODIFYING, &op->file[1].vnode->flags);
if (op->file[0].put_vnode)
iput(&op->file[0].vnode->vfs_inode);
if (op->file[1].put_vnode)
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index 1d95ed9dd86e..dd3f45d906d2 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -10,6 +10,7 @@
#include <linux/sched.h>
#include <linux/circ_buf.h>
#include <linux/iversion.h>
+#include <linux/netfs.h>
#include "internal.h"
#include "afs_fs.h"
#include "xdr_fs.h"
@@ -302,17 +303,15 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)
struct afs_vnode_param *vp = &op->file[0];
struct afs_read *req = op->fetch.req;
const __be32 *bp;
- unsigned int size;
int ret;
- _enter("{%u,%zu/%llu}",
- call->unmarshall, iov_iter_count(call->iter), req->actual_len);
+ _enter("{%u,%zu,%zu/%llu}",
+ call->unmarshall, call->iov_len, iov_iter_count(call->iter),
+ req->actual_len);
switch (call->unmarshall) {
case 0:
req->actual_len = 0;
- req->index = 0;
- req->offset = req->pos & (PAGE_SIZE - 1);
call->unmarshall++;
if (call->operation_ID == FSFETCHDATA64) {
afs_extract_to_tmp64(call);
@@ -322,7 +321,10 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)
}
fallthrough;
- /* extract the returned data length */
+ /* Extract the returned data length into
+ * ->actual_len. This may indicate more or less data than was
+ * requested will be returned.
+ */
case 1:
_debug("extract data length");
ret = afs_extract_data(call, true);
@@ -331,44 +333,25 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)
req->actual_len = be64_to_cpu(call->tmp64);
_debug("DATA length: %llu", req->actual_len);
- req->remain = min(req->len, req->actual_len);
- if (req->remain == 0)
+
+ if (req->actual_len == 0)
goto no_more_data;
+ call->iter = req->iter;
+ call->iov_len = min(req->actual_len, req->len);
call->unmarshall++;
-
- begin_page:
- ASSERTCMP(req->index, <, req->nr_pages);
- if (req->remain > PAGE_SIZE - req->offset)
- size = PAGE_SIZE - req->offset;
- else
- size = req->remain;
- call->bvec[0].bv_len = size;
- call->bvec[0].bv_offset = req->offset;
- call->bvec[0].bv_page = req->pages[req->index];
- iov_iter_bvec(&call->def_iter, READ, call->bvec, 1, size);
- ASSERTCMP(size, <=, PAGE_SIZE);
fallthrough;
/* extract the returned data */
case 2:
_debug("extract data %zu/%llu",
- iov_iter_count(call->iter), req->remain);
+ iov_iter_count(call->iter), req->actual_len);
ret = afs_extract_data(call, true);
if (ret < 0)
return ret;
- req->remain -= call->bvec[0].bv_len;
- req->offset += call->bvec[0].bv_len;
- ASSERTCMP(req->offset, <=, PAGE_SIZE);
- if (req->offset == PAGE_SIZE) {
- req->offset = 0;
- req->index++;
- if (req->remain > 0)
- goto begin_page;
- }
- ASSERTCMP(req->remain, ==, 0);
+ call->iter = &call->def_iter;
if (req->actual_len <= req->len)
goto no_more_data;
@@ -405,22 +388,12 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call)
req->file_size = vp->scb.status.size;
call->unmarshall++;
+ fallthrough;
case 5:
break;
}
- for (; req->index < req->nr_pages; req->index++) {
- if (req->offset < PAGE_SIZE)
- zero_user_segment(req->pages[req->index],
- req->offset, PAGE_SIZE);
- req->offset = 0;
- }
-
- if (req->page_done)
- for (req->index = 0; req->index < req->nr_pages; req->index++)
- req->page_done(req);
-
_leave(" = 0 [done]");
return 0;
}
@@ -494,6 +467,8 @@ void afs_fs_fetch_data(struct afs_operation *op)
if (!call)
return afs_op_nomem(op);
+ req->call_debug_id = call->debug_id;
+
/* marshall the parameters */
bp = call->request;
bp[0] = htonl(FSFETCHDATA);
@@ -1079,8 +1054,7 @@ static const struct afs_call_type afs_RXFSStoreData64 = {
/*
* store a set of pages to a very large file
*/
-static void afs_fs_store_data64(struct afs_operation *op,
- loff_t pos, loff_t size, loff_t i_size)
+static void afs_fs_store_data64(struct afs_operation *op)
{
struct afs_vnode_param *vp = &op->file[0];
struct afs_call *call;
@@ -1095,7 +1069,7 @@ static void afs_fs_store_data64(struct afs_operation *op,
if (!call)
return afs_op_nomem(op);
- call->send_pages = true;
+ call->write_iter = op->store.write_iter;
/* marshall the parameters */
bp = call->request;
@@ -1111,47 +1085,38 @@ static void afs_fs_store_data64(struct afs_operation *op,
*bp++ = 0; /* unix mode */
*bp++ = 0; /* segment size */
- *bp++ = htonl(upper_32_bits(pos));
- *bp++ = htonl(lower_32_bits(pos));
- *bp++ = htonl(upper_32_bits(size));
- *bp++ = htonl(lower_32_bits(size));
- *bp++ = htonl(upper_32_bits(i_size));
- *bp++ = htonl(lower_32_bits(i_size));
+ *bp++ = htonl(upper_32_bits(op->store.pos));
+ *bp++ = htonl(lower_32_bits(op->store.pos));
+ *bp++ = htonl(upper_32_bits(op->store.size));
+ *bp++ = htonl(lower_32_bits(op->store.size));
+ *bp++ = htonl(upper_32_bits(op->store.i_size));
+ *bp++ = htonl(lower_32_bits(op->store.i_size));
trace_afs_make_fs_call(call, &vp->fid);
afs_make_op_call(op, call, GFP_NOFS);
}
/*
- * store a set of pages
+ * Write data to a file on the server.
*/
void afs_fs_store_data(struct afs_operation *op)
{
struct afs_vnode_param *vp = &op->file[0];
struct afs_call *call;
- loff_t size, pos, i_size;
__be32 *bp;
_enter(",%x,{%llx:%llu},,",
key_serial(op->key), vp->fid.vid, vp->fid.vnode);
- size = (loff_t)op->store.last_to - (loff_t)op->store.first_offset;
- if (op->store.first != op->store.last)
- size += (loff_t)(op->store.last - op->store.first) << PAGE_SHIFT;
- pos = (loff_t)op->store.first << PAGE_SHIFT;
- pos += op->store.first_offset;
-
- i_size = i_size_read(&vp->vnode->vfs_inode);
- if (pos + size > i_size)
- i_size = size + pos;
-
_debug("size %llx, at %llx, i_size %llx",
- (unsigned long long) size, (unsigned long long) pos,
- (unsigned long long) i_size);
+ (unsigned long long)op->store.size,
+ (unsigned long long)op->store.pos,
+ (unsigned long long)op->store.i_size);
- if (upper_32_bits(pos) || upper_32_bits(i_size) || upper_32_bits(size) ||
- upper_32_bits(pos + size))
- return afs_fs_store_data64(op, pos, size, i_size);
+ if (upper_32_bits(op->store.pos) ||
+ upper_32_bits(op->store.size) ||
+ upper_32_bits(op->store.i_size))
+ return afs_fs_store_data64(op);
call = afs_alloc_flat_call(op->net, &afs_RXFSStoreData,
(4 + 6 + 3) * 4,
@@ -1159,7 +1124,7 @@ void afs_fs_store_data(struct afs_operation *op)
if (!call)
return afs_op_nomem(op);
- call->send_pages = true;
+ call->write_iter = op->store.write_iter;
/* marshall the parameters */
bp = call->request;
@@ -1175,9 +1140,9 @@ void afs_fs_store_data(struct afs_operation *op)
*bp++ = 0; /* unix mode */
*bp++ = 0; /* segment size */
- *bp++ = htonl(lower_32_bits(pos));
- *bp++ = htonl(lower_32_bits(size));
- *bp++ = htonl(lower_32_bits(i_size));
+ *bp++ = htonl(lower_32_bits(op->store.pos));
+ *bp++ = htonl(lower_32_bits(op->store.size));
+ *bp++ = htonl(lower_32_bits(op->store.i_size));
trace_afs_make_fs_call(call, &vp->fid);
afs_make_op_call(op, call, GFP_NOFS);
@@ -1444,6 +1409,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call)
_debug("motd '%s'", p);
call->unmarshall++;
+ fallthrough;
case 8:
break;
@@ -1881,6 +1847,7 @@ static int afs_deliver_fs_inline_bulk_status(struct afs_call *call)
xdr_decode_AFSVolSync(&bp, &op->volsync);
call->unmarshall++;
+ fallthrough;
case 6:
break;
@@ -2015,6 +1982,7 @@ static int afs_deliver_fs_fetch_acl(struct afs_call *call)
xdr_decode_AFSVolSync(&bp, &op->volsync);
call->unmarshall++;
+ fallthrough;
case 4:
break;
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index 12be88716e4c..80b6c8d967d5 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -102,13 +102,13 @@ static int afs_inode_init_from_status(struct afs_operation *op,
switch (status->type) {
case AFS_FTYPE_FILE:
- inode->i_mode = S_IFREG | status->mode;
+ inode->i_mode = S_IFREG | (status->mode & S_IALLUGO);
inode->i_op = &afs_file_inode_operations;
inode->i_fop = &afs_file_operations;
inode->i_mapping->a_ops = &afs_fs_aops;
break;
case AFS_FTYPE_DIR:
- inode->i_mode = S_IFDIR | status->mode;
+ inode->i_mode = S_IFDIR | (status->mode & S_IALLUGO);
inode->i_op = &afs_dir_inode_operations;
inode->i_fop = &afs_dir_file_operations;
inode->i_mapping->a_ops = &afs_dir_aops;
@@ -198,7 +198,7 @@ static void afs_apply_status(struct afs_operation *op,
if (status->mode != vnode->status.mode) {
mode = inode->i_mode;
mode &= ~S_IALLUGO;
- mode |= status->mode;
+ mode |= status->mode & S_IALLUGO;
WRITE_ONCE(inode->i_mode, mode);
}
@@ -214,11 +214,12 @@ static void afs_apply_status(struct afs_operation *op,
if (vp->dv_before + vp->dv_delta != status->data_version) {
if (test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags))
- pr_warn("kAFS: vnode modified {%llx:%llu} %llx->%llx %s\n",
+ pr_warn("kAFS: vnode modified {%llx:%llu} %llx->%llx %s (op=%x)\n",
vnode->fid.vid, vnode->fid.vnode,
(unsigned long long)vp->dv_before + vp->dv_delta,
(unsigned long long)status->data_version,
- op->type ? op->type->name : "???");
+ op->type ? op->type->name : "???",
+ op->debug_id);
vnode->invalid_before = status->data_version;
if (vnode->status.type == AFS_FTYPE_DIR) {
@@ -293,8 +294,9 @@ void afs_vnode_commit_status(struct afs_operation *op, struct afs_vnode_param *v
op->flags &= ~AFS_OPERATION_DIR_CONFLICT;
}
} else if (vp->scb.have_status) {
- if (vp->dv_before + vp->dv_delta != vp->scb.status.data_version &&
- vp->speculative)
+ if (vp->speculative &&
+ (test_bit(AFS_VNODE_MODIFYING, &vnode->flags) ||
+ vp->dv_before != vnode->status.data_version))
/* Ignore the result of a speculative bulk status fetch
* if it splits around a modification op, thereby
* appearing to regress the data version.
@@ -427,7 +429,7 @@ static void afs_get_inode_cache(struct afs_vnode *vnode)
} __packed key;
struct afs_vnode_cache_aux aux;
- if (vnode->status.type == AFS_FTYPE_DIR) {
+ if (vnode->status.type != AFS_FTYPE_FILE) {
vnode->cache = NULL;
return;
}
@@ -910,6 +912,7 @@ int afs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
}
op->ctime = attr->ia_ctime;
op->file[0].update_ctime = 1;
+ op->file[0].modification = true;
op->ops = &afs_setattr_operation;
ret = afs_do_sync_operation(op);
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 1627b1872812..5ed416f4ff33 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -14,6 +14,7 @@
#include <linux/key.h>
#include <linux/workqueue.h>
#include <linux/sched.h>
+#define FSCACHE_USE_NEW_IO_API
#include <linux/fscache.h>
#include <linux/backing-dev.h>
#include <linux/uuid.h>
@@ -31,6 +32,7 @@
struct pagevec;
struct afs_call;
+struct afs_vnode;
/*
* Partial file-locking emulation mode. (The problem being that AFS3 only
@@ -104,7 +106,9 @@ struct afs_call {
struct afs_server *server; /* The fileserver record if fs op (pins ref) */
struct afs_vlserver *vlserver; /* The vlserver record if vl op */
void *request; /* request data (first part) */
+ size_t iov_len; /* Size of *iter to be used */
struct iov_iter def_iter; /* Default buffer/data iterator */
+ struct iov_iter *write_iter; /* Iterator defining write to be made */
struct iov_iter *iter; /* Iterator currently in use */
union { /* Convenience for ->def_iter */
struct kvec kvec[1];
@@ -131,7 +135,6 @@ struct afs_call {
unsigned char unmarshall; /* unmarshalling phase */
unsigned char addr_ix; /* Address in ->alist */
bool drop_ref; /* T if need to drop ref for incoming call */
- bool send_pages; /* T if data from mapping should be sent */
bool need_attention; /* T if RxRPC poked us */
bool async; /* T if asynchronous */
bool upgrade; /* T to request service upgrade */
@@ -202,17 +205,19 @@ struct afs_read {
loff_t pos; /* Where to start reading */
loff_t len; /* How much we're asking for */
loff_t actual_len; /* How much we're actually getting */
- loff_t remain; /* Amount remaining */
loff_t file_size; /* File size returned by server */
+ struct key *key; /* The key to use to reissue the read */
+ struct afs_vnode *vnode; /* The file being read into. */
+ struct netfs_read_subrequest *subreq; /* Fscache helper read request this belongs to */
afs_dataversion_t data_version; /* Version number returned by server */
refcount_t usage;
- unsigned int index; /* Which page we're reading into */
+ unsigned int call_debug_id;
unsigned int nr_pages;
- unsigned int offset; /* offset into current page */
- struct afs_vnode *vnode;
- void (*page_done)(struct afs_read *);
- struct page **pages;
- struct page *array[];
+ int error;
+ void (*done)(struct afs_read *);
+ void (*cleanup)(struct afs_read *);
+ struct iov_iter *iter; /* Iterator representing the buffer */
+ struct iov_iter def_iter; /* Default iterator */
};
/*
@@ -640,6 +645,7 @@ struct afs_vnode {
#define AFS_VNODE_PSEUDODIR 7 /* set if Vnode is a pseudo directory */
#define AFS_VNODE_NEW_CONTENT 8 /* Set if file has new content (create/trunc-0) */
#define AFS_VNODE_SILLY_DELETED 9 /* Set if file has been silly-deleted */
+#define AFS_VNODE_MODIFYING 10 /* Set if we're performing a modification op */
struct list_head wb_keys; /* List of keys available for writeback */
struct list_head pending_locks; /* locks waiting to be granted */
@@ -739,6 +745,7 @@ struct afs_operation_ops {
void (*issue_yfs_rpc)(struct afs_operation *op);
void (*success)(struct afs_operation *op);
void (*aborted)(struct afs_operation *op);
+ void (*failed)(struct afs_operation *op);
void (*edit_dir)(struct afs_operation *op);
void (*put)(struct afs_operation *op);
};
@@ -756,6 +763,7 @@ struct afs_vnode_param {
bool set_size:1; /* Must update i_size */
bool op_unlinked:1; /* True if file was unlinked by op */
bool speculative:1; /* T if speculative status fetch (no vnode lock) */
+ bool modification:1; /* Set if the content gets modified */
};
/*
@@ -808,12 +816,11 @@ struct afs_operation {
afs_lock_type_t type;
} lock;
struct {
- struct address_space *mapping; /* Pages being written from */
- pgoff_t first; /* first page in mapping to deal with */
- pgoff_t last; /* last page in mapping to deal with */
- unsigned first_offset; /* offset into mapping[first] */
- unsigned last_to; /* amount of mapping[last] */
- bool laundering; /* Laundering page, PG_writeback not set */
+ struct iov_iter *write_iter;
+ loff_t pos;
+ loff_t size;
+ loff_t i_size;
+ bool laundering; /* Laundering page, PG_writeback not set */
} store;
struct {
struct iattr *attr;
@@ -875,31 +882,31 @@ struct afs_vnode_cache_aux {
#define __AFS_PAGE_PRIV_MMAPPED 0x8000UL
#endif
-static inline unsigned int afs_page_dirty_resolution(void)
+static inline unsigned int afs_page_dirty_resolution(struct page *page)
{
- int shift = PAGE_SHIFT - (__AFS_PAGE_PRIV_SHIFT - 1);
+ int shift = thp_order(page) + PAGE_SHIFT - (__AFS_PAGE_PRIV_SHIFT - 1);
return (shift > 0) ? shift : 0;
}
-static inline size_t afs_page_dirty_from(unsigned long priv)
+static inline size_t afs_page_dirty_from(struct page *page, unsigned long priv)
{
unsigned long x = priv & __AFS_PAGE_PRIV_MASK;
/* The lower bound is inclusive */
- return x << afs_page_dirty_resolution();
+ return x << afs_page_dirty_resolution(page);
}
-static inline size_t afs_page_dirty_to(unsigned long priv)
+static inline size_t afs_page_dirty_to(struct page *page, unsigned long priv)
{
unsigned long x = (priv >> __AFS_PAGE_PRIV_SHIFT) & __AFS_PAGE_PRIV_MASK;
/* The upper bound is immediately beyond the region */
- return (x + 1) << afs_page_dirty_resolution();
+ return (x + 1) << afs_page_dirty_resolution(page);
}
-static inline unsigned long afs_page_dirty(size_t from, size_t to)
+static inline unsigned long afs_page_dirty(struct page *page, size_t from, size_t to)
{
- unsigned int res = afs_page_dirty_resolution();
+ unsigned int res = afs_page_dirty_resolution(page);
from >>= res;
to = (to - 1) >> res;
return (to << __AFS_PAGE_PRIV_SHIFT) | from;
@@ -1040,13 +1047,14 @@ extern void afs_dynroot_depopulate(struct super_block *);
extern const struct address_space_operations afs_fs_aops;
extern const struct inode_operations afs_file_inode_operations;
extern const struct file_operations afs_file_operations;
+extern const struct netfs_read_request_ops afs_req_ops;
extern int afs_cache_wb_key(struct afs_vnode *, struct afs_file *);
extern void afs_put_wb_key(struct afs_wb_key *);
extern int afs_open(struct inode *, struct file *);
extern int afs_release(struct inode *, struct file *);
-extern int afs_fetch_data(struct afs_vnode *, struct key *, struct afs_read *);
-extern int afs_page_filler(void *, struct page *);
+extern int afs_fetch_data(struct afs_vnode *, struct afs_read *);
+extern struct afs_read *afs_alloc_read(gfp_t);
extern void afs_put_read(struct afs_read *);
static inline struct afs_read *afs_get_read(struct afs_read *req)
@@ -1270,6 +1278,7 @@ static inline void afs_make_op_call(struct afs_operation *op, struct afs_call *c
static inline void afs_extract_begin(struct afs_call *call, void *buf, size_t size)
{
+ call->iov_len = size;
call->kvec[0].iov_base = buf;
call->kvec[0].iov_len = size;
iov_iter_kvec(&call->def_iter, READ, call->kvec, 1, size);
@@ -1277,21 +1286,25 @@ static inline void afs_extract_begin(struct afs_call *call, void *buf, size_t si
static inline void afs_extract_to_tmp(struct afs_call *call)
{
+ call->iov_len = sizeof(call->tmp);
afs_extract_begin(call, &call->tmp, sizeof(call->tmp));
}
static inline void afs_extract_to_tmp64(struct afs_call *call)
{
+ call->iov_len = sizeof(call->tmp64);
afs_extract_begin(call, &call->tmp64, sizeof(call->tmp64));
}
static inline void afs_extract_discard(struct afs_call *call, size_t size)
{
+ call->iov_len = size;
iov_iter_discard(&call->def_iter, READ, size);
}
static inline void afs_extract_to_buf(struct afs_call *call, size_t size)
{
+ call->iov_len = size;
afs_extract_begin(call, call->buffer, size);
}
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 8be709cb8542..23a1a92d64bb 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -271,40 +271,6 @@ void afs_flat_call_destructor(struct afs_call *call)
call->buffer = NULL;
}
-#define AFS_BVEC_MAX 8
-
-/*
- * Load the given bvec with the next few pages.
- */
-static void afs_load_bvec(struct afs_call *call, struct msghdr *msg,
- struct bio_vec *bv, pgoff_t first, pgoff_t last,
- unsigned offset)
-{
- struct afs_operation *op = call->op;
- struct page *pages[AFS_BVEC_MAX];
- unsigned int nr, n, i, to, bytes = 0;
-
- nr = min_t(pgoff_t, last - first + 1, AFS_BVEC_MAX);
- n = find_get_pages_contig(op->store.mapping, first, nr, pages);
- ASSERTCMP(n, ==, nr);
-
- msg->msg_flags |= MSG_MORE;
- for (i = 0; i < nr; i++) {
- to = PAGE_SIZE;
- if (first + i >= last) {
- to = op->store.last_to;
- msg->msg_flags &= ~MSG_MORE;
- }
- bv[i].bv_page = pages[i];
- bv[i].bv_len = to - offset;
- bv[i].bv_offset = offset;
- bytes += to - offset;
- offset = 0;
- }
-
- iov_iter_bvec(&msg->msg_iter, WRITE, bv, nr, bytes);
-}
-
/*
* Advance the AFS call state when the RxRPC call ends the transmit phase.
*/
@@ -318,42 +284,6 @@ static void afs_notify_end_request_tx(struct sock *sock,
}
/*
- * attach the data from a bunch of pages on an inode to a call
- */
-static int afs_send_pages(struct afs_call *call, struct msghdr *msg)
-{
- struct afs_operation *op = call->op;
- struct bio_vec bv[AFS_BVEC_MAX];
- unsigned int bytes, nr, loop, offset;
- pgoff_t first = op->store.first, last = op->store.last;
- int ret;
-
- offset = op->store.first_offset;
- op->store.first_offset = 0;
-
- do {
- afs_load_bvec(call, msg, bv, first, last, offset);
- trace_afs_send_pages(call, msg, first, last, offset);
-
- offset = 0;
- bytes = msg->msg_iter.count;
- nr = msg->msg_iter.nr_segs;
-
- ret = rxrpc_kernel_send_data(op->net->socket, call->rxcall, msg,
- bytes, afs_notify_end_request_tx);
- for (loop = 0; loop < nr; loop++)
- put_page(bv[loop].bv_page);
- if (ret < 0)
- break;
-
- first += nr;
- } while (first <= last);
-
- trace_afs_sent_pages(call, op->store.first, last, first, ret);
- return ret;
-}
-
-/*
* Initiate a call and synchronously queue up the parameters for dispatch. Any
* error is stored into the call struct, which the caller must check for.
*/
@@ -363,6 +293,7 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp)
struct rxrpc_call *rxcall;
struct msghdr msg;
struct kvec iov[1];
+ size_t len;
s64 tx_total_len;
int ret;
@@ -383,21 +314,8 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp)
* after the initial fixed part.
*/
tx_total_len = call->request_size;
- if (call->send_pages) {
- struct afs_operation *op = call->op;
-
- if (op->store.last == op->store.first) {
- tx_total_len += op->store.last_to - op->store.first_offset;
- } else {
- /* It looks mathematically like you should be able to
- * combine the following lines with the ones above, but
- * unsigned arithmetic is fun when it wraps...
- */
- tx_total_len += PAGE_SIZE - op->store.first_offset;
- tx_total_len += op->store.last_to;
- tx_total_len += (op->store.last - op->store.first - 1) * PAGE_SIZE;
- }
- }
+ if (call->write_iter)
+ tx_total_len += iov_iter_count(call->write_iter);
/* If the call is going to be asynchronous, we need an extra ref for
* the call to hold itself so the caller need not hang on to its ref.
@@ -439,7 +357,7 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp)
iov_iter_kvec(&msg.msg_iter, WRITE, iov, 1, call->request_size);
msg.msg_control = NULL;
msg.msg_controllen = 0;
- msg.msg_flags = MSG_WAITALL | (call->send_pages ? MSG_MORE : 0);
+ msg.msg_flags = MSG_WAITALL | (call->write_iter ? MSG_MORE : 0);
ret = rxrpc_kernel_send_data(call->net->socket, rxcall,
&msg, call->request_size,
@@ -447,8 +365,18 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp)
if (ret < 0)
goto error_do_abort;
- if (call->send_pages) {
- ret = afs_send_pages(call, &msg);
+ if (call->write_iter) {
+ msg.msg_iter = *call->write_iter;
+ msg.msg_flags &= ~MSG_MORE;
+ trace_afs_send_data(call, &msg);
+
+ ret = rxrpc_kernel_send_data(call->net->socket,
+ call->rxcall, &msg,
+ iov_iter_count(&msg.msg_iter),
+ afs_notify_end_request_tx);
+ *call->write_iter = msg.msg_iter;
+
+ trace_afs_sent_data(call, &msg, ret);
if (ret < 0)
goto error_do_abort;
}
@@ -466,9 +394,10 @@ error_do_abort:
rxrpc_kernel_abort_call(call->net->socket, rxcall,
RX_USER_ABORT, ret, "KSD");
} else {
+ len = 0;
iov_iter_kvec(&msg.msg_iter, READ, NULL, 0, 0);
rxrpc_kernel_recv_data(call->net->socket, rxcall,
- &msg.msg_iter, false,
+ &msg.msg_iter, &len, false,
&call->abort_code, &call->service_id);
ac->abort_code = call->abort_code;
ac->responded = true;
@@ -499,11 +428,45 @@ error_kill_call:
}
/*
+ * Log remote abort codes that indicate that we have a protocol disagreement
+ * with the server.
+ */
+static void afs_log_error(struct afs_call *call, s32 remote_abort)
+{
+ static int max = 0;
+ const char *msg;
+ int m;
+
+ switch (remote_abort) {
+ case RX_EOF: msg = "unexpected EOF"; break;
+ case RXGEN_CC_MARSHAL: msg = "client marshalling"; break;
+ case RXGEN_CC_UNMARSHAL: msg = "client unmarshalling"; break;
+ case RXGEN_SS_MARSHAL: msg = "server marshalling"; break;
+ case RXGEN_SS_UNMARSHAL: msg = "server unmarshalling"; break;
+ case RXGEN_DECODE: msg = "opcode decode"; break;
+ case RXGEN_SS_XDRFREE: msg = "server XDR cleanup"; break;
+ case RXGEN_CC_XDRFREE: msg = "client XDR cleanup"; break;
+ case -32: msg = "insufficient data"; break;
+ default:
+ return;
+ }
+
+ m = max;
+ if (m < 3) {
+ max = m + 1;
+ pr_notice("kAFS: Peer reported %s failure on %s [%pISp]\n",
+ msg, call->type->name,
+ &call->alist->addrs[call->addr_ix].transport);
+ }
+}
+
+/*
* deliver messages to a call
*/
static void afs_deliver_to_call(struct afs_call *call)
{
enum afs_call_state state;
+ size_t len;
u32 abort_code, remote_abort = 0;
int ret;
@@ -516,10 +479,11 @@ static void afs_deliver_to_call(struct afs_call *call)
state == AFS_CALL_SV_AWAIT_ACK
) {
if (state == AFS_CALL_SV_AWAIT_ACK) {
+ len = 0;
iov_iter_kvec(&call->def_iter, READ, NULL, 0, 0);
ret = rxrpc_kernel_recv_data(call->net->socket,
call->rxcall, &call->def_iter,
- false, &remote_abort,
+ &len, false, &remote_abort,
&call->service_id);
trace_afs_receive_data(call, &call->def_iter, false, ret);
@@ -559,6 +523,7 @@ static void afs_deliver_to_call(struct afs_call *call)
goto out;
case -ECONNABORTED:
ASSERTCMP(state, ==, AFS_CALL_COMPLETE);
+ afs_log_error(call, call->abort_code);
goto done;
case -ENOTSUPP:
abort_code = RXGEN_OPCODE;
@@ -929,10 +894,11 @@ int afs_extract_data(struct afs_call *call, bool want_more)
u32 remote_abort = 0;
int ret;
- _enter("{%s,%zu},%d", call->type->name, iov_iter_count(iter), want_more);
+ _enter("{%s,%zu,%zu},%d",
+ call->type->name, call->iov_len, iov_iter_count(iter), want_more);
ret = rxrpc_kernel_recv_data(net->socket, call->rxcall, iter,
- want_more, &remote_abort,
+ &call->iov_len, want_more, &remote_abort,
&call->service_id);
if (ret == 0 || ret == -EAGAIN)
return ret;
diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c
index dc9327332f06..00fca3c66ba6 100644
--- a/fs/afs/vlclient.c
+++ b/fs/afs/vlclient.c
@@ -593,6 +593,7 @@ static int afs_deliver_yfsvl_get_endpoints(struct afs_call *call)
if (ret < 0)
return ret;
call->unmarshall = 6;
+ fallthrough;
case 6:
break;
diff --git a/fs/afs/write.c b/fs/afs/write.c
index c9195fc67fd8..3edb6204b937 100644
--- a/fs/afs/write.c
+++ b/fs/afs/write.c
@@ -11,6 +11,8 @@
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
+#include <linux/netfs.h>
+#include <linux/fscache.h>
#include "internal.h"
/*
@@ -23,55 +25,6 @@ int afs_set_page_dirty(struct page *page)
}
/*
- * partly or wholly fill a page that's under preparation for writing
- */
-static int afs_fill_page(struct afs_vnode *vnode, struct key *key,
- loff_t pos, unsigned int len, struct page *page)
-{
- struct afs_read *req;
- size_t p;
- void *data;
- int ret;
-
- _enter(",,%llu", (unsigned long long)pos);
-
- if (pos >= vnode->vfs_inode.i_size) {
- p = pos & ~PAGE_MASK;
- ASSERTCMP(p + len, <=, PAGE_SIZE);
- data = kmap(page);
- memset(data + p, 0, len);
- kunmap(page);
- return 0;
- }
-
- req = kzalloc(struct_size(req, array, 1), GFP_KERNEL);
- if (!req)
- return -ENOMEM;
-
- refcount_set(&req->usage, 1);
- req->pos = pos;
- req->len = len;
- req->nr_pages = 1;
- req->pages = req->array;
- req->pages[0] = page;
- get_page(page);
-
- ret = afs_fetch_data(vnode, key, req);
- afs_put_read(req);
- if (ret < 0) {
- if (ret == -ENOENT) {
- _debug("got NOENT from server"
- " - marking file deleted and stale");
- set_bit(AFS_VNODE_DELETED, &vnode->flags);
- ret = -ESTALE;
- }
- }
-
- _leave(" = %d", ret);
- return ret;
-}
-
-/*
* prepare to perform part of a write to a page
*/
int afs_write_begin(struct file *file, struct address_space *mapping,
@@ -80,47 +33,40 @@ int afs_write_begin(struct file *file, struct address_space *mapping,
{
struct afs_vnode *vnode = AFS_FS_I(file_inode(file));
struct page *page;
- struct key *key = afs_file_key(file);
unsigned long priv;
- unsigned f, from = pos & (PAGE_SIZE - 1);
- unsigned t, to = from + len;
- pgoff_t index = pos >> PAGE_SHIFT;
+ unsigned f, from;
+ unsigned t, to;
+ pgoff_t index;
int ret;
- _enter("{%llx:%llu},{%lx},%u,%u",
- vnode->fid.vid, vnode->fid.vnode, index, from, to);
+ _enter("{%llx:%llu},%llx,%x",
+ vnode->fid.vid, vnode->fid.vnode, pos, len);
- page = grab_cache_page_write_begin(mapping, index, flags);
- if (!page)
- return -ENOMEM;
+ /* Prefetch area to be written into the cache if we're caching this
+ * file. We need to do this before we get a lock on the page in case
+ * there's more than one writer competing for the same cache block.
+ */
+ ret = netfs_write_begin(file, mapping, pos, len, flags, &page, fsdata,
+ &afs_req_ops, NULL);
+ if (ret < 0)
+ return ret;
- if (!PageUptodate(page) && len != PAGE_SIZE) {
- ret = afs_fill_page(vnode, key, pos & PAGE_MASK, PAGE_SIZE, page);
- if (ret < 0) {
- unlock_page(page);
- put_page(page);
- _leave(" = %d [prep]", ret);
- return ret;
- }
- SetPageUptodate(page);
- }
+ index = page->index;
+ from = pos - index * PAGE_SIZE;
+ to = from + len;
try_again:
/* See if this page is already partially written in a way that we can
* merge the new write with.
*/
- t = f = 0;
if (PagePrivate(page)) {
priv = page_private(page);
- f = afs_page_dirty_from(priv);
- t = afs_page_dirty_to(priv);
+ f = afs_page_dirty_from(page, priv);
+ t = afs_page_dirty_to(page, priv);
ASSERTCMP(f, <=, t);
- }
- if (f != t) {
if (PageWriteback(page)) {
- trace_afs_page_dirty(vnode, tracepoint_string("alrdy"),
- page->index, priv);
+ trace_afs_page_dirty(vnode, tracepoint_string("alrdy"), page);
goto flush_conflicting_write;
}
/* If the file is being filled locally, allow inter-write
@@ -164,12 +110,10 @@ int afs_write_end(struct file *file, struct address_space *mapping,
struct page *page, void *fsdata)
{
struct afs_vnode *vnode = AFS_FS_I(file_inode(file));
- struct key *key = afs_file_key(file);
unsigned long priv;
- unsigned int f, from = pos & (PAGE_SIZE - 1);
+ unsigned int f, from = pos & (thp_size(page) - 1);
unsigned int t, to = from + copied;
loff_t i_size, maybe_i_size;
- int ret = 0;
_enter("{%llx:%llu},{%lx}",
vnode->fid.vid, vnode->fid.vnode, page->index);
@@ -188,88 +132,75 @@ int afs_write_end(struct file *file, struct address_space *mapping,
write_sequnlock(&vnode->cb_lock);
}
- if (!PageUptodate(page)) {
- if (copied < len) {
- /* Try and load any missing data from the server. The
- * unmarshalling routine will take care of clearing any
- * bits that are beyond the EOF.
- */
- ret = afs_fill_page(vnode, key, pos + copied,
- len - copied, page);
- if (ret < 0)
- goto out;
- }
- SetPageUptodate(page);
- }
+ ASSERT(PageUptodate(page));
if (PagePrivate(page)) {
priv = page_private(page);
- f = afs_page_dirty_from(priv);
- t = afs_page_dirty_to(priv);
+ f = afs_page_dirty_from(page, priv);
+ t = afs_page_dirty_to(page, priv);
if (from < f)
f = from;
if (to > t)
t = to;
- priv = afs_page_dirty(f, t);
+ priv = afs_page_dirty(page, f, t);
set_page_private(page, priv);
- trace_afs_page_dirty(vnode, tracepoint_string("dirty+"),
- page->index, priv);
+ trace_afs_page_dirty(vnode, tracepoint_string("dirty+"), page);
} else {
- priv = afs_page_dirty(from, to);
+ priv = afs_page_dirty(page, from, to);
attach_page_private(page, (void *)priv);
- trace_afs_page_dirty(vnode, tracepoint_string("dirty"),
- page->index, priv);
+ trace_afs_page_dirty(vnode, tracepoint_string("dirty"), page);
}
- set_page_dirty(page);
- if (PageDirty(page))
- _debug("dirtied");
- ret = copied;
+ if (set_page_dirty(page))
+ _debug("dirtied %lx", page->index);
out:
unlock_page(page);
put_page(page);
- return ret;
+ return copied;
}
/*
* kill all the pages in the given range
*/
static void afs_kill_pages(struct address_space *mapping,
- pgoff_t first, pgoff_t last)
+ loff_t start, loff_t len)
{
struct afs_vnode *vnode = AFS_FS_I(mapping->host);
struct pagevec pv;
- unsigned count, loop;
+ unsigned int loop, psize;
- _enter("{%llx:%llu},%lx-%lx",
- vnode->fid.vid, vnode->fid.vnode, first, last);
+ _enter("{%llx:%llu},%llx @%llx",
+ vnode->fid.vid, vnode->fid.vnode, len, start);
pagevec_init(&pv);
do {
- _debug("kill %lx-%lx", first, last);
+ _debug("kill %llx @%llx", len, start);
- count = last - first + 1;
- if (count > PAGEVEC_SIZE)
- count = PAGEVEC_SIZE;
- pv.nr = find_get_pages_contig(mapping, first, count, pv.pages);
- ASSERTCMP(pv.nr, ==, count);
+ pv.nr = find_get_pages_contig(mapping, start / PAGE_SIZE,
+ PAGEVEC_SIZE, pv.pages);
+ if (pv.nr == 0)
+ break;
- for (loop = 0; loop < count; loop++) {
+ for (loop = 0; loop < pv.nr; loop++) {
struct page *page = pv.pages[loop];
+
+ if (page->index * PAGE_SIZE >= start + len)
+ break;
+
+ psize = thp_size(page);
+ start += psize;
+ len -= psize;
ClearPageUptodate(page);
- SetPageError(page);
end_page_writeback(page);
- if (page->index >= first)
- first = page->index + 1;
lock_page(page);
generic_error_remove_page(mapping, page);
unlock_page(page);
}
__pagevec_release(&pv);
- } while (first <= last);
+ } while (len > 0);
_leave("");
}
@@ -279,37 +210,40 @@ static void afs_kill_pages(struct address_space *mapping,
*/
static void afs_redirty_pages(struct writeback_control *wbc,
struct address_space *mapping,
- pgoff_t first, pgoff_t last)
+ loff_t start, loff_t len)
{
struct afs_vnode *vnode = AFS_FS_I(mapping->host);
struct pagevec pv;
- unsigned count, loop;
+ unsigned int loop, psize;
- _enter("{%llx:%llu},%lx-%lx",
- vnode->fid.vid, vnode->fid.vnode, first, last);
+ _enter("{%llx:%llu},%llx @%llx",
+ vnode->fid.vid, vnode->fid.vnode, len, start);
pagevec_init(&pv);
do {
- _debug("redirty %lx-%lx", first, last);
+ _debug("redirty %llx @%llx", len, start);
- count = last - first + 1;
- if (count > PAGEVEC_SIZE)
- count = PAGEVEC_SIZE;
- pv.nr = find_get_pages_contig(mapping, first, count, pv.pages);
- ASSERTCMP(pv.nr, ==, count);
+ pv.nr = find_get_pages_contig(mapping, start / PAGE_SIZE,
+ PAGEVEC_SIZE, pv.pages);
+ if (pv.nr == 0)
+ break;
- for (loop = 0; loop < count; loop++) {
+ for (loop = 0; loop < pv.nr; loop++) {
struct page *page = pv.pages[loop];
+ if (page->index * PAGE_SIZE >= start + len)
+ break;
+
+ psize = thp_size(page);
+ start += psize;
+ len -= psize;
redirty_page_for_writepage(wbc, page);
end_page_writeback(page);
- if (page->index >= first)
- first = page->index + 1;
}
__pagevec_release(&pv);
- } while (first <= last);
+ } while (len > 0);
_leave("");
}
@@ -317,37 +251,32 @@ static void afs_redirty_pages(struct writeback_control *wbc,
/*
* completion of write to server
*/
-static void afs_pages_written_back(struct afs_vnode *vnode,
- pgoff_t first, pgoff_t last)
+static void afs_pages_written_back(struct afs_vnode *vnode, loff_t start, unsigned int len)
{
- struct pagevec pv;
- unsigned long priv;
- unsigned count, loop;
+ struct address_space *mapping = vnode->vfs_inode.i_mapping;
+ struct page *page;
+ pgoff_t end;
- _enter("{%llx:%llu},{%lx-%lx}",
- vnode->fid.vid, vnode->fid.vnode, first, last);
+ XA_STATE(xas, &mapping->i_pages, start / PAGE_SIZE);
- pagevec_init(&pv);
+ _enter("{%llx:%llu},{%x @%llx}",
+ vnode->fid.vid, vnode->fid.vnode, len, start);
- do {
- _debug("done %lx-%lx", first, last);
-
- count = last - first + 1;
- if (count > PAGEVEC_SIZE)
- count = PAGEVEC_SIZE;
- pv.nr = find_get_pages_contig(vnode->vfs_inode.i_mapping,
- first, count, pv.pages);
- ASSERTCMP(pv.nr, ==, count);
-
- for (loop = 0; loop < count; loop++) {
- priv = (unsigned long)detach_page_private(pv.pages[loop]);
- trace_afs_page_dirty(vnode, tracepoint_string("clear"),
- pv.pages[loop]->index, priv);
- end_page_writeback(pv.pages[loop]);
+ rcu_read_lock();
+
+ end = (start + len - 1) / PAGE_SIZE;
+ xas_for_each(&xas, page, end) {
+ if (!PageWriteback(page)) {
+ kdebug("bad %x @%llx page %lx %lx", len, start, page->index, end);
+ ASSERT(PageWriteback(page));
}
- first += count;
- __pagevec_release(&pv);
- } while (first <= last);
+
+ trace_afs_page_dirty(vnode, tracepoint_string("clear"), page);
+ detach_page_private(page);
+ page_endio(page, true, 0);
+ }
+
+ rcu_read_unlock();
afs_prune_wb_keys(vnode);
_leave("");
@@ -402,11 +331,9 @@ static void afs_store_data_success(struct afs_operation *op)
afs_vnode_commit_status(op, &op->file[0]);
if (op->error == 0) {
if (!op->store.laundering)
- afs_pages_written_back(vnode, op->store.first, op->store.last);
+ afs_pages_written_back(vnode, op->store.pos, op->store.size);
afs_stat_v(vnode, n_stores);
- atomic_long_add((op->store.last * PAGE_SIZE + op->store.last_to) -
- (op->store.first * PAGE_SIZE + op->store.first_offset),
- &afs_v2net(vnode)->n_store_bytes);
+ atomic_long_add(op->store.size, &afs_v2net(vnode)->n_store_bytes);
}
}
@@ -419,21 +346,20 @@ static const struct afs_operation_ops afs_store_data_operation = {
/*
* write to a file
*/
-static int afs_store_data(struct address_space *mapping,
- pgoff_t first, pgoff_t last,
- unsigned offset, unsigned to, bool laundering)
+static int afs_store_data(struct afs_vnode *vnode, struct iov_iter *iter, loff_t pos,
+ bool laundering)
{
- struct afs_vnode *vnode = AFS_FS_I(mapping->host);
struct afs_operation *op;
struct afs_wb_key *wbk = NULL;
- int ret;
+ loff_t size = iov_iter_count(iter), i_size;
+ int ret = -ENOKEY;
- _enter("%s{%llx:%llu.%u},%lx,%lx,%x,%x",
+ _enter("%s{%llx:%llu.%u},%llx,%llx",
vnode->volume->name,
vnode->fid.vid,
vnode->fid.vnode,
vnode->fid.unique,
- first, last, offset, to);
+ size, pos);
ret = afs_get_writeback_key(vnode, &wbk);
if (ret) {
@@ -447,13 +373,15 @@ static int afs_store_data(struct address_space *mapping,
return -ENOMEM;
}
+ i_size = i_size_read(&vnode->vfs_inode);
+
afs_op_set_vnode(op, 0, vnode);
op->file[0].dv_delta = 1;
- op->store.mapping = mapping;
- op->store.first = first;
- op->store.last = last;
- op->store.first_offset = offset;
- op->store.last_to = to;
+ op->file[0].modification = true;
+ op->store.write_iter = iter;
+ op->store.pos = pos;
+ op->store.size = size;
+ op->store.i_size = max(pos + size, i_size);
op->store.laundering = laundering;
op->mtime = vnode->vfs_inode.i_mtime;
op->flags |= AFS_OPERATION_UNINTR;
@@ -487,73 +415,58 @@ try_next_key:
}
/*
- * Synchronously write back the locked page and any subsequent non-locked dirty
- * pages.
+ * Extend the region to be written back to include subsequent contiguously
+ * dirty pages if possible, but don't sleep while doing so.
+ *
+ * If this page holds new content, then we can include filler zeros in the
+ * writeback.
*/
-static int afs_write_back_from_locked_page(struct address_space *mapping,
- struct writeback_control *wbc,
- struct page *primary_page,
- pgoff_t final_page)
+static void afs_extend_writeback(struct address_space *mapping,
+ struct afs_vnode *vnode,
+ long *_count,
+ loff_t start,
+ loff_t max_len,
+ bool new_content,
+ unsigned int *_len)
{
- struct afs_vnode *vnode = AFS_FS_I(mapping->host);
- struct page *pages[8], *page;
- unsigned long count, priv;
- unsigned n, offset, to, f, t;
- pgoff_t start, first, last;
- loff_t i_size, end;
- int loop, ret;
-
- _enter(",%lx", primary_page->index);
+ struct pagevec pvec;
+ struct page *page;
+ unsigned long priv;
+ unsigned int psize, filler = 0;
+ unsigned int f, t;
+ loff_t len = *_len;
+ pgoff_t index = (start + len) / PAGE_SIZE;
+ bool stop = true;
+ unsigned int i;
- count = 1;
- if (test_set_page_writeback(primary_page))
- BUG();
+ XA_STATE(xas, &mapping->i_pages, index);
+ pagevec_init(&pvec);
- /* Find all consecutive lockable dirty pages that have contiguous
- * written regions, stopping when we find a page that is not
- * immediately lockable, is not dirty or is missing, or we reach the
- * end of the range.
- */
- start = primary_page->index;
- priv = page_private(primary_page);
- offset = afs_page_dirty_from(priv);
- to = afs_page_dirty_to(priv);
- trace_afs_page_dirty(vnode, tracepoint_string("store"),
- primary_page->index, priv);
-
- WARN_ON(offset == to);
- if (offset == to)
- trace_afs_page_dirty(vnode, tracepoint_string("WARN"),
- primary_page->index, priv);
-
- if (start >= final_page ||
- (to < PAGE_SIZE && !test_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags)))
- goto no_more;
-
- start++;
do {
- _debug("more %lx [%lx]", start, count);
- n = final_page - start + 1;
- if (n > ARRAY_SIZE(pages))
- n = ARRAY_SIZE(pages);
- n = find_get_pages_contig(mapping, start, ARRAY_SIZE(pages), pages);
- _debug("fgpc %u", n);
- if (n == 0)
- goto no_more;
- if (pages[0]->index != start) {
- do {
- put_page(pages[--n]);
- } while (n > 0);
- goto no_more;
- }
+ /* Firstly, we gather up a batch of contiguous dirty pages
+ * under the RCU read lock - but we can't clear the dirty flags
+ * there if any of those pages are mapped.
+ */
+ rcu_read_lock();
- for (loop = 0; loop < n; loop++) {
- page = pages[loop];
- if (to != PAGE_SIZE &&
- !test_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags))
+ xas_for_each(&xas, page, ULONG_MAX) {
+ stop = true;
+ if (xas_retry(&xas, page))
+ continue;
+ if (xa_is_value(page))
break;
- if (page->index > final_page)
+ if (page->index != index)
break;
+
+ if (!page_cache_get_speculative(page)) {
+ xas_reset(&xas);
+ continue;
+ }
+
+ /* Has the page moved or been split? */
+ if (unlikely(page != xas_reload(&xas)))
+ break;
+
if (!trylock_page(page))
break;
if (!PageDirty(page) || PageWriteback(page)) {
@@ -561,57 +474,134 @@ static int afs_write_back_from_locked_page(struct address_space *mapping,
break;
}
+ psize = thp_size(page);
priv = page_private(page);
- f = afs_page_dirty_from(priv);
- t = afs_page_dirty_to(priv);
- if (f != 0 &&
- !test_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags)) {
+ f = afs_page_dirty_from(page, priv);
+ t = afs_page_dirty_to(page, priv);
+ if (f != 0 && !new_content) {
unlock_page(page);
break;
}
- to = t;
- trace_afs_page_dirty(vnode, tracepoint_string("store+"),
- page->index, priv);
+ len += filler + t;
+ filler = psize - t;
+ if (len >= max_len || *_count <= 0)
+ stop = true;
+ else if (t == psize || new_content)
+ stop = false;
+
+ index += thp_nr_pages(page);
+ if (!pagevec_add(&pvec, page))
+ break;
+ if (stop)
+ break;
+ }
+
+ if (!stop)
+ xas_pause(&xas);
+ rcu_read_unlock();
+
+ /* Now, if we obtained any pages, we can shift them to being
+ * writable and mark them for caching.
+ */
+ if (!pagevec_count(&pvec))
+ break;
+
+ for (i = 0; i < pagevec_count(&pvec); i++) {
+ page = pvec.pages[i];
+ trace_afs_page_dirty(vnode, tracepoint_string("store+"), page);
if (!clear_page_dirty_for_io(page))
BUG();
if (test_set_page_writeback(page))
BUG();
+
+ *_count -= thp_nr_pages(page);
unlock_page(page);
- put_page(page);
- }
- count += loop;
- if (loop < n) {
- for (; loop < n; loop++)
- put_page(pages[loop]);
- goto no_more;
}
- start += loop;
- } while (start <= final_page && count < 65536);
+ pagevec_release(&pvec);
+ cond_resched();
+ } while (!stop);
+
+ *_len = len;
+}
+
+/*
+ * Synchronously write back the locked page and any subsequent non-locked dirty
+ * pages.
+ */
+static ssize_t afs_write_back_from_locked_page(struct address_space *mapping,
+ struct writeback_control *wbc,
+ struct page *page,
+ loff_t start, loff_t end)
+{
+ struct afs_vnode *vnode = AFS_FS_I(mapping->host);
+ struct iov_iter iter;
+ unsigned long priv;
+ unsigned int offset, to, len, max_len;
+ loff_t i_size = i_size_read(&vnode->vfs_inode);
+ bool new_content = test_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags);
+ long count = wbc->nr_to_write;
+ int ret;
+
+ _enter(",%lx,%llx-%llx", page->index, start, end);
+
+ if (test_set_page_writeback(page))
+ BUG();
+
+ count -= thp_nr_pages(page);
+
+ /* Find all consecutive lockable dirty pages that have contiguous
+ * written regions, stopping when we find a page that is not
+ * immediately lockable, is not dirty or is missing, or we reach the
+ * end of the range.
+ */
+ priv = page_private(page);
+ offset = afs_page_dirty_from(page, priv);
+ to = afs_page_dirty_to(page, priv);
+ trace_afs_page_dirty(vnode, tracepoint_string("store"), page);
+
+ len = to - offset;
+ start += offset;
+ if (start < i_size) {
+ /* Trim the write to the EOF; the extra data is ignored. Also
+ * put an upper limit on the size of a single storedata op.
+ */
+ max_len = 65536 * 4096;
+ max_len = min_t(unsigned long long, max_len, end - start + 1);
+ max_len = min_t(unsigned long long, max_len, i_size - start);
+
+ if (len < max_len &&
+ (to == thp_size(page) || new_content))
+ afs_extend_writeback(mapping, vnode, &count,
+ start, max_len, new_content, &len);
+ len = min_t(loff_t, len, max_len);
+ }
-no_more:
/* We now have a contiguous set of dirty pages, each with writeback
* set; the first page is still locked at this point, but all the rest
* have been unlocked.
*/
- unlock_page(primary_page);
+ unlock_page(page);
- first = primary_page->index;
- last = first + count - 1;
+ if (start < i_size) {
+ _debug("write back %x @%llx [%llx]", len, start, i_size);
- end = (loff_t)last * PAGE_SIZE + to;
- i_size = i_size_read(&vnode->vfs_inode);
+ iov_iter_xarray(&iter, WRITE, &mapping->i_pages, start, len);
+ ret = afs_store_data(vnode, &iter, start, false);
+ } else {
+ _debug("write discard %x @%llx [%llx]", len, start, i_size);
- _debug("write back %lx[%u..] to %lx[..%u]", first, offset, last, to);
- if (end > i_size)
- to = i_size & ~PAGE_MASK;
+ /* The dirty region was entirely beyond the EOF. */
+ afs_pages_written_back(vnode, start, len);
+ ret = 0;
+ }
- ret = afs_store_data(mapping, first, last, offset, to, false);
switch (ret) {
case 0:
- ret = count;
+ wbc->nr_to_write = count;
+ ret = len;
break;
default:
@@ -623,13 +613,13 @@ no_more:
case -EKEYEXPIRED:
case -EKEYREJECTED:
case -EKEYREVOKED:
- afs_redirty_pages(wbc, mapping, first, last);
+ afs_redirty_pages(wbc, mapping, start, len);
mapping_set_error(mapping, ret);
break;
case -EDQUOT:
case -ENOSPC:
- afs_redirty_pages(wbc, mapping, first, last);
+ afs_redirty_pages(wbc, mapping, start, len);
mapping_set_error(mapping, -ENOSPC);
break;
@@ -641,7 +631,7 @@ no_more:
case -ENOMEDIUM:
case -ENXIO:
trace_afs_file_error(vnode, ret, afs_file_error_writeback_fail);
- afs_kill_pages(mapping, first, last);
+ afs_kill_pages(mapping, start, len);
mapping_set_error(mapping, ret);
break;
}
@@ -656,19 +646,19 @@ no_more:
*/
int afs_writepage(struct page *page, struct writeback_control *wbc)
{
- int ret;
+ ssize_t ret;
+ loff_t start;
_enter("{%lx},", page->index);
+ start = page->index * PAGE_SIZE;
ret = afs_write_back_from_locked_page(page->mapping, wbc, page,
- wbc->range_end >> PAGE_SHIFT);
+ start, LLONG_MAX - start);
if (ret < 0) {
- _leave(" = %d", ret);
- return 0;
+ _leave(" = %zd", ret);
+ return ret;
}
- wbc->nr_to_write -= ret;
-
_leave(" = 0");
return 0;
}
@@ -678,35 +668,46 @@ int afs_writepage(struct page *page, struct writeback_control *wbc)
*/
static int afs_writepages_region(struct address_space *mapping,
struct writeback_control *wbc,
- pgoff_t index, pgoff_t end, pgoff_t *_next)
+ loff_t start, loff_t end, loff_t *_next)
{
struct page *page;
- int ret, n;
+ ssize_t ret;
+ int n;
- _enter(",,%lx,%lx,", index, end);
+ _enter("%llx,%llx,", start, end);
do {
- n = find_get_pages_range_tag(mapping, &index, end,
- PAGECACHE_TAG_DIRTY, 1, &page);
+ pgoff_t index = start / PAGE_SIZE;
+
+ n = find_get_pages_range_tag(mapping, &index, end / PAGE_SIZE,
+ PAGECACHE_TAG_DIRTY, 1, &page);
if (!n)
break;
+ start = (loff_t)page->index * PAGE_SIZE; /* May regress with THPs */
+
_debug("wback %lx", page->index);
- /*
- * at this point we hold neither the i_pages lock nor the
+ /* At this point we hold neither the i_pages lock nor the
* page lock: the page may be truncated or invalidated
* (changing page->mapping to NULL), or even swizzled
* back from swapper_space to tmpfs file mapping
*/
- ret = lock_page_killable(page);
- if (ret < 0) {
- put_page(page);
- _leave(" = %d", ret);
- return ret;
+ if (wbc->sync_mode != WB_SYNC_NONE) {
+ ret = lock_page_killable(page);
+ if (ret < 0) {
+ put_page(page);
+ return ret;
+ }
+ } else {
+ if (!trylock_page(page)) {
+ put_page(page);
+ return 0;
+ }
}
if (page->mapping != mapping || !PageDirty(page)) {
+ start += thp_size(page);
unlock_page(page);
put_page(page);
continue;
@@ -722,20 +723,20 @@ static int afs_writepages_region(struct address_space *mapping,
if (!clear_page_dirty_for_io(page))
BUG();
- ret = afs_write_back_from_locked_page(mapping, wbc, page, end);
+ ret = afs_write_back_from_locked_page(mapping, wbc, page, start, end);
put_page(page);
if (ret < 0) {
- _leave(" = %d", ret);
+ _leave(" = %zd", ret);
return ret;
}
- wbc->nr_to_write -= ret;
+ start += ret * PAGE_SIZE;
cond_resched();
- } while (index < end && wbc->nr_to_write > 0);
+ } while (wbc->nr_to_write > 0);
- *_next = index;
- _leave(" = 0 [%lx]", *_next);
+ *_next = start;
+ _leave(" = 0 [%llx]", *_next);
return 0;
}
@@ -746,7 +747,7 @@ int afs_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
struct afs_vnode *vnode = AFS_FS_I(mapping->host);
- pgoff_t start, end, next;
+ loff_t start, next;
int ret;
_enter("");
@@ -761,22 +762,19 @@ int afs_writepages(struct address_space *mapping,
return 0;
if (wbc->range_cyclic) {
- start = mapping->writeback_index;
- end = -1;
- ret = afs_writepages_region(mapping, wbc, start, end, &next);
+ start = mapping->writeback_index * PAGE_SIZE;
+ ret = afs_writepages_region(mapping, wbc, start, LLONG_MAX, &next);
if (start > 0 && wbc->nr_to_write > 0 && ret == 0)
ret = afs_writepages_region(mapping, wbc, 0, start,
&next);
- mapping->writeback_index = next;
+ mapping->writeback_index = next / PAGE_SIZE;
} else if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) {
- end = (pgoff_t)(LLONG_MAX >> PAGE_SHIFT);
- ret = afs_writepages_region(mapping, wbc, 0, end, &next);
+ ret = afs_writepages_region(mapping, wbc, 0, LLONG_MAX, &next);
if (wbc->nr_to_write > 0)
mapping->writeback_index = next;
} else {
- start = wbc->range_start >> PAGE_SHIFT;
- end = wbc->range_end >> PAGE_SHIFT;
- ret = afs_writepages_region(mapping, wbc, start, end, &next);
+ ret = afs_writepages_region(mapping, wbc,
+ wbc->range_start, wbc->range_end, &next);
}
up_read(&vnode->validate_lock);
@@ -834,13 +832,13 @@ int afs_fsync(struct file *file, loff_t start, loff_t end, int datasync)
*/
vm_fault_t afs_page_mkwrite(struct vm_fault *vmf)
{
+ struct page *page = thp_head(vmf->page);
struct file *file = vmf->vma->vm_file;
struct inode *inode = file_inode(file);
struct afs_vnode *vnode = AFS_FS_I(inode);
unsigned long priv;
- _enter("{{%llx:%llu}},{%lx}",
- vnode->fid.vid, vnode->fid.vnode, vmf->page->index);
+ _enter("{{%llx:%llu}},{%lx}", vnode->fid.vid, vnode->fid.vnode, page->index);
sb_start_pagefault(inode->i_sb);
@@ -848,30 +846,35 @@ vm_fault_t afs_page_mkwrite(struct vm_fault *vmf)
* be modified. We then assume the entire page will need writing back.
*/
#ifdef CONFIG_AFS_FSCACHE
- fscache_wait_on_page_write(vnode->cache, vmf->page);
+ if (PageFsCache(page) &&
+ wait_on_page_fscache_killable(page) < 0)
+ return VM_FAULT_RETRY;
#endif
- if (PageWriteback(vmf->page) &&
- wait_on_page_bit_killable(vmf->page, PG_writeback) < 0)
+ if (wait_on_page_writeback_killable(page))
return VM_FAULT_RETRY;
- if (lock_page_killable(vmf->page) < 0)
+ if (lock_page_killable(page) < 0)
return VM_FAULT_RETRY;
/* We mustn't change page->private until writeback is complete as that
* details the portion of the page we need to write back and we might
* need to redirty the page if there's a problem.
*/
- wait_on_page_writeback(vmf->page);
+ if (wait_on_page_writeback_killable(page) < 0) {
+ unlock_page(page);
+ return VM_FAULT_RETRY;
+ }
- priv = afs_page_dirty(0, PAGE_SIZE);
+ priv = afs_page_dirty(page, 0, thp_size(page));
priv = afs_page_dirty_mmapped(priv);
- trace_afs_page_dirty(vnode, tracepoint_string("mkwrite"),
- vmf->page->index, priv);
- if (PagePrivate(vmf->page))
- set_page_private(vmf->page, priv);
- else
- attach_page_private(vmf->page, (void *)priv);
+ if (PagePrivate(page)) {
+ set_page_private(page, priv);
+ trace_afs_page_dirty(vnode, tracepoint_string("mkwrite+"), page);
+ } else {
+ attach_page_private(page, (void *)priv);
+ trace_afs_page_dirty(vnode, tracepoint_string("mkwrite"), page);
+ }
file_update_time(file);
sb_end_pagefault(inode->i_sb);
@@ -913,6 +916,8 @@ int afs_launder_page(struct page *page)
{
struct address_space *mapping = page->mapping;
struct afs_vnode *vnode = AFS_FS_I(mapping->host);
+ struct iov_iter iter;
+ struct bio_vec bv[1];
unsigned long priv;
unsigned int f, t;
int ret = 0;
@@ -922,26 +927,24 @@ int afs_launder_page(struct page *page)
priv = page_private(page);
if (clear_page_dirty_for_io(page)) {
f = 0;
- t = PAGE_SIZE;
+ t = thp_size(page);
if (PagePrivate(page)) {
- f = afs_page_dirty_from(priv);
- t = afs_page_dirty_to(priv);
+ f = afs_page_dirty_from(page, priv);
+ t = afs_page_dirty_to(page, priv);
}
- trace_afs_page_dirty(vnode, tracepoint_string("launder"),
- page->index, priv);
- ret = afs_store_data(mapping, page->index, page->index, t, f, true);
- }
-
- priv = (unsigned long)detach_page_private(page);
- trace_afs_page_dirty(vnode, tracepoint_string("laundered"),
- page->index, priv);
+ bv[0].bv_page = page;
+ bv[0].bv_offset = f;
+ bv[0].bv_len = t - f;
+ iov_iter_bvec(&iter, WRITE, bv, 1, bv[0].bv_len);
-#ifdef CONFIG_AFS_FSCACHE
- if (PageFsCache(page)) {
- fscache_wait_on_page_write(vnode->cache, page);
- fscache_uncache_page(vnode->cache, page);
+ trace_afs_page_dirty(vnode, tracepoint_string("launder"), page);
+ ret = afs_store_data(vnode, &iter, (loff_t)page->index * PAGE_SIZE,
+ true);
}
-#endif
+
+ trace_afs_page_dirty(vnode, tracepoint_string("laundered"), page);
+ detach_page_private(page);
+ wait_on_page_fscache(page);
return ret;
}
diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c
index bd787e71a657..2b35cba8ad62 100644
--- a/fs/afs/yfsclient.c
+++ b/fs/afs/yfsclient.c
@@ -360,22 +360,23 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call)
struct afs_vnode_param *vp = &op->file[0];
struct afs_read *req = op->fetch.req;
const __be32 *bp;
- unsigned int size;
int ret;
- _enter("{%u,%zu/%llu}",
- call->unmarshall, iov_iter_count(call->iter), req->actual_len);
+ _enter("{%u,%zu, %zu/%llu}",
+ call->unmarshall, call->iov_len, iov_iter_count(call->iter),
+ req->actual_len);
switch (call->unmarshall) {
case 0:
req->actual_len = 0;
- req->index = 0;
- req->offset = req->pos & (PAGE_SIZE - 1);
afs_extract_to_tmp64(call);
call->unmarshall++;
fallthrough;
- /* extract the returned data length */
+ /* Extract the returned data length into ->actual_len. This
+ * may indicate more or less data than was requested will be
+ * returned.
+ */
case 1:
_debug("extract data length");
ret = afs_extract_data(call, true);
@@ -384,44 +385,25 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call)
req->actual_len = be64_to_cpu(call->tmp64);
_debug("DATA length: %llu", req->actual_len);
- req->remain = min(req->len, req->actual_len);
- if (req->remain == 0)
+
+ if (req->actual_len == 0)
goto no_more_data;
+ call->iter = req->iter;
+ call->iov_len = min(req->actual_len, req->len);
call->unmarshall++;
-
- begin_page:
- ASSERTCMP(req->index, <, req->nr_pages);
- if (req->remain > PAGE_SIZE - req->offset)
- size = PAGE_SIZE - req->offset;
- else
- size = req->remain;
- call->bvec[0].bv_len = size;
- call->bvec[0].bv_offset = req->offset;
- call->bvec[0].bv_page = req->pages[req->index];
- iov_iter_bvec(&call->def_iter, READ, call->bvec, 1, size);
- ASSERTCMP(size, <=, PAGE_SIZE);
fallthrough;
/* extract the returned data */
case 2:
_debug("extract data %zu/%llu",
- iov_iter_count(call->iter), req->remain);
+ iov_iter_count(call->iter), req->actual_len);
ret = afs_extract_data(call, true);
if (ret < 0)
return ret;
- req->remain -= call->bvec[0].bv_len;
- req->offset += call->bvec[0].bv_len;
- ASSERTCMP(req->offset, <=, PAGE_SIZE);
- if (req->offset == PAGE_SIZE) {
- req->offset = 0;
- req->index++;
- if (req->remain > 0)
- goto begin_page;
- }
- ASSERTCMP(req->remain, ==, 0);
+ call->iter = &call->def_iter;
if (req->actual_len <= req->len)
goto no_more_data;
@@ -467,17 +449,6 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call)
break;
}
- for (; req->index < req->nr_pages; req->index++) {
- if (req->offset < PAGE_SIZE)
- zero_user_segment(req->pages[req->index],
- req->offset, PAGE_SIZE);
- req->offset = 0;
- }
-
- if (req->page_done)
- for (req->index = 0; req->index < req->nr_pages; req->index++)
- req->page_done(req);
-
_leave(" = 0 [done]");
return 0;
}
@@ -516,6 +487,8 @@ void yfs_fs_fetch_data(struct afs_operation *op)
if (!call)
return afs_op_nomem(op);
+ req->call_debug_id = call->debug_id;
+
/* marshall the parameters */
bp = call->request;
bp = xdr_encode_u32(bp, YFSFETCHDATA64);
@@ -1102,25 +1075,15 @@ void yfs_fs_store_data(struct afs_operation *op)
{
struct afs_vnode_param *vp = &op->file[0];
struct afs_call *call;
- loff_t size, pos, i_size;
__be32 *bp;
_enter(",%x,{%llx:%llu},,",
key_serial(op->key), vp->fid.vid, vp->fid.vnode);
- size = (loff_t)op->store.last_to - (loff_t)op->store.first_offset;
- if (op->store.first != op->store.last)
- size += (loff_t)(op->store.last - op->store.first) << PAGE_SHIFT;
- pos = (loff_t)op->store.first << PAGE_SHIFT;
- pos += op->store.first_offset;
-
- i_size = i_size_read(&vp->vnode->vfs_inode);
- if (pos + size > i_size)
- i_size = size + pos;
-
_debug("size %llx, at %llx, i_size %llx",
- (unsigned long long)size, (unsigned long long)pos,
- (unsigned long long)i_size);
+ (unsigned long long)op->store.size,
+ (unsigned long long)op->store.pos,
+ (unsigned long long)op->store.i_size);
call = afs_alloc_flat_call(op->net, &yfs_RXYFSStoreData64,
sizeof(__be32) +
@@ -1133,8 +1096,7 @@ void yfs_fs_store_data(struct afs_operation *op)
if (!call)
return afs_op_nomem(op);
- call->key = op->key;
- call->send_pages = true;
+ call->write_iter = op->store.write_iter;
/* marshall the parameters */
bp = call->request;
@@ -1142,9 +1104,9 @@ void yfs_fs_store_data(struct afs_operation *op)
bp = xdr_encode_u32(bp, 0); /* RPC flags */
bp = xdr_encode_YFSFid(bp, &vp->fid);
bp = xdr_encode_YFSStoreStatus_mtime(bp, &op->mtime);
- bp = xdr_encode_u64(bp, pos);
- bp = xdr_encode_u64(bp, size);
- bp = xdr_encode_u64(bp, i_size);
+ bp = xdr_encode_u64(bp, op->store.pos);
+ bp = xdr_encode_u64(bp, op->store.size);
+ bp = xdr_encode_u64(bp, op->store.i_size);
yfs_check_req(call, bp);
trace_afs_make_fs_call(call, &vp->fid);
diff --git a/fs/aio.c b/fs/aio.c
index 1f32da13d39e..76ce0cc3ee4e 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -323,16 +323,13 @@ static void aio_free_ring(struct kioctx *ctx)
}
}
-static int aio_ring_mremap(struct vm_area_struct *vma, unsigned long flags)
+static int aio_ring_mremap(struct vm_area_struct *vma)
{
struct file *file = vma->vm_file;
struct mm_struct *mm = vma->vm_mm;
struct kioctx_table *table;
int i, res = -EINVAL;
- if (flags & MREMAP_DONTUNMAP)
- return -EINVAL;
-
spin_lock(&mm->ioctx_lock);
rcu_read_lock();
table = rcu_dereference(mm->ioctx_table);
diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h
index 054f97b07754..918826eaceea 100644
--- a/fs/autofs/autofs_i.h
+++ b/fs/autofs/autofs_i.h
@@ -87,6 +87,7 @@ struct autofs_wait_queue {
autofs_wqt_t wait_queue_token;
/* We use the following to see what we are waiting for */
struct qstr name;
+ u32 offset;
u32 dev;
u64 ino;
kuid_t uid;
diff --git a/fs/autofs/expire.c b/fs/autofs/expire.c
index a1c7701007e7..b3fefd6237c3 100644
--- a/fs/autofs/expire.c
+++ b/fs/autofs/expire.c
@@ -355,7 +355,7 @@ static struct dentry *should_expire(struct dentry *dentry,
return NULL;
}
- if (d_really_is_positive(dentry) && d_is_symlink(dentry)) {
+ if (d_is_symlink(dentry)) {
pr_debug("checking symlink %p %pd\n", dentry, dentry);
/* Forced expire, user space handles busy mounts */
diff --git a/fs/autofs/waitq.c b/fs/autofs/waitq.c
index 5ced859dac53..16b5fca0626e 100644
--- a/fs/autofs/waitq.c
+++ b/fs/autofs/waitq.c
@@ -30,7 +30,7 @@ void autofs_catatonic_mode(struct autofs_sb_info *sbi)
while (wq) {
nwq = wq->next;
wq->status = -ENOENT; /* Magic is gone - report failure */
- kfree(wq->name.name);
+ kfree(wq->name.name - wq->offset);
wq->name.name = NULL;
wq->wait_ctr--;
wake_up_interruptible(&wq->queue);
@@ -175,51 +175,6 @@ static void autofs_notify_daemon(struct autofs_sb_info *sbi,
fput(pipe);
}
-static int autofs_getpath(struct autofs_sb_info *sbi,
- struct dentry *dentry, char *name)
-{
- struct dentry *root = sbi->sb->s_root;
- struct dentry *tmp;
- char *buf;
- char *p;
- int len;
- unsigned seq;
-
-rename_retry:
- buf = name;
- len = 0;
-
- seq = read_seqbegin(&rename_lock);
- rcu_read_lock();
- spin_lock(&sbi->fs_lock);
- for (tmp = dentry ; tmp != root ; tmp = tmp->d_parent)
- len += tmp->d_name.len + 1;
-
- if (!len || --len > NAME_MAX) {
- spin_unlock(&sbi->fs_lock);
- rcu_read_unlock();
- if (read_seqretry(&rename_lock, seq))
- goto rename_retry;
- return 0;
- }
-
- *(buf + len) = '\0';
- p = buf + len - dentry->d_name.len;
- strncpy(p, dentry->d_name.name, dentry->d_name.len);
-
- for (tmp = dentry->d_parent; tmp != root ; tmp = tmp->d_parent) {
- *(--p) = '/';
- p -= tmp->d_name.len;
- strncpy(p, tmp->d_name.name, tmp->d_name.len);
- }
- spin_unlock(&sbi->fs_lock);
- rcu_read_unlock();
- if (read_seqretry(&rename_lock, seq))
- goto rename_retry;
-
- return len;
-}
-
static struct autofs_wait_queue *
autofs_find_wait(struct autofs_sb_info *sbi, const struct qstr *qstr)
{
@@ -352,6 +307,7 @@ int autofs_wait(struct autofs_sb_info *sbi,
struct qstr qstr;
char *name;
int status, ret, type;
+ unsigned int offset = 0;
pid_t pid;
pid_t tgid;
@@ -389,20 +345,23 @@ int autofs_wait(struct autofs_sb_info *sbi,
return -ENOMEM;
/* If this is a direct mount request create a dummy name */
- if (IS_ROOT(dentry) && autofs_type_trigger(sbi->type))
+ if (IS_ROOT(dentry) && autofs_type_trigger(sbi->type)) {
+ qstr.name = name;
qstr.len = sprintf(name, "%p", dentry);
- else {
- qstr.len = autofs_getpath(sbi, dentry, name);
- if (!qstr.len) {
+ } else {
+ char *p = dentry_path_raw(dentry, name, NAME_MAX);
+ if (IS_ERR(p)) {
kfree(name);
return -ENOENT;
}
+ qstr.name = ++p; // skip the leading slash
+ qstr.len = strlen(p);
+ offset = p - name;
}
- qstr.name = name;
qstr.hash = full_name_hash(dentry, name, qstr.len);
if (mutex_lock_interruptible(&sbi->wq_mutex)) {
- kfree(qstr.name);
+ kfree(name);
return -EINTR;
}
@@ -410,7 +369,7 @@ int autofs_wait(struct autofs_sb_info *sbi,
if (ret <= 0) {
if (ret != -EINTR)
mutex_unlock(&sbi->wq_mutex);
- kfree(qstr.name);
+ kfree(name);
return ret;
}
@@ -418,7 +377,7 @@ int autofs_wait(struct autofs_sb_info *sbi,
/* Create a new wait queue */
wq = kmalloc(sizeof(struct autofs_wait_queue), GFP_KERNEL);
if (!wq) {
- kfree(qstr.name);
+ kfree(name);
mutex_unlock(&sbi->wq_mutex);
return -ENOMEM;
}
@@ -430,6 +389,7 @@ int autofs_wait(struct autofs_sb_info *sbi,
sbi->queues = wq;
init_waitqueue_head(&wq->queue);
memcpy(&wq->name, &qstr, sizeof(struct qstr));
+ wq->offset = offset;
wq->dev = autofs_get_dev(sbi);
wq->ino = autofs_get_ino(sbi);
wq->uid = current_uid();
@@ -469,7 +429,7 @@ int autofs_wait(struct autofs_sb_info *sbi,
(unsigned long) wq->wait_queue_token, wq->name.len,
wq->name.name, notify);
mutex_unlock(&sbi->wq_mutex);
- kfree(qstr.name);
+ kfree(name);
}
/*
@@ -540,7 +500,7 @@ int autofs_wait_release(struct autofs_sb_info *sbi,
}
*wql = wq->next; /* Unlink from chain */
- kfree(wq->name.name);
+ kfree(wq->name.name - wq->offset);
wq->name.name = NULL; /* Do not wait on this queue */
wq->status = status;
wake_up(&wq->queue);
diff --git a/fs/befs/TODO b/fs/befs/TODO
deleted file mode 100644
index 3250921aa2e6..000000000000
--- a/fs/befs/TODO
+++ /dev/null
@@ -1,14 +0,0 @@
-TODO
-==========
-
-* Convert comments to the Kernel-Doc format.
-
-* Befs_fs.h has gotten big and messy. No reason not to break it up into
- smaller peices.
-
-* See if Alexander Viro's option parser made it into the kernel tree.
- Use that if we can. (include/linux/parser.h)
-
-* See if we really need separate types for on-disk and in-memory
- representations of the superblock and inode.
-
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index b12ba98ae9f5..187b3f2b9202 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -2267,8 +2267,7 @@ static int elf_core_dump(struct coredump_params *cprm)
goto end_coredump;
/* Align to page */
- if (!dump_skip(cprm, dataoff - cprm->pos))
- goto end_coredump;
+ dump_skip_to(cprm, dataoff);
for (i = 0; i < vma_count; i++) {
struct core_vma_metadata *meta = vma_meta + i;
@@ -2276,7 +2275,6 @@ static int elf_core_dump(struct coredump_params *cprm)
if (!dump_user_range(cprm, meta->start, meta->dump_size))
goto end_coredump;
}
- dump_truncate(cprm);
if (!elf_core_write_extra_data(cprm))
goto end_coredump;
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index 3cfd6cd46f26..2c99b102c860 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -1631,8 +1631,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm)
goto end_coredump;
}
- if (!dump_skip(cprm, dataoff - cprm->pos))
- goto end_coredump;
+ dump_skip_to(cprm, dataoff);
if (!elf_fdpic_dump_segments(cprm, vma_meta, vma_count))
goto end_coredump;
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index b9c658e0548e..a1072c6a2341 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -74,6 +74,12 @@
#define MAX_SHARED_LIBS (1)
#endif
+#ifdef CONFIG_BINFMT_FLAT_NO_DATA_START_OFFSET
+#define DATA_START_OFFSET_WORDS (0)
+#else
+#define DATA_START_OFFSET_WORDS (MAX_SHARED_LIBS)
+#endif
+
struct lib_info {
struct {
unsigned long start_code; /* Start of text segment */
@@ -576,7 +582,8 @@ static int load_flat_file(struct linux_binprm *bprm,
goto err;
}
- len = data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long);
+ len = data_len + extra +
+ DATA_START_OFFSET_WORDS * sizeof(unsigned long);
len = PAGE_ALIGN(len);
realdatastart = vm_mmap(NULL, 0, len,
PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, 0);
@@ -591,7 +598,7 @@ static int load_flat_file(struct linux_binprm *bprm,
goto err;
}
datapos = ALIGN(realdatastart +
- MAX_SHARED_LIBS * sizeof(unsigned long),
+ DATA_START_OFFSET_WORDS * sizeof(unsigned long),
FLAT_DATA_ALIGN);
pr_debug("Allocated data+bss+stack (%u bytes): %lx\n",
@@ -622,7 +629,8 @@ static int load_flat_file(struct linux_binprm *bprm,
memp_size = len;
} else {
- len = text_len + data_len + extra + MAX_SHARED_LIBS * sizeof(u32);
+ len = text_len + data_len + extra +
+ DATA_START_OFFSET_WORDS * sizeof(u32);
len = PAGE_ALIGN(len);
textpos = vm_mmap(NULL, 0, len,
PROT_READ | PROT_EXEC | PROT_WRITE, MAP_PRIVATE, 0);
@@ -638,7 +646,7 @@ static int load_flat_file(struct linux_binprm *bprm,
realdatastart = textpos + ntohl(hdr->data_start);
datapos = ALIGN(realdatastart +
- MAX_SHARED_LIBS * sizeof(u32),
+ DATA_START_OFFSET_WORDS * sizeof(u32),
FLAT_DATA_ALIGN);
reloc = (__be32 __user *)
@@ -714,7 +722,7 @@ static int load_flat_file(struct linux_binprm *bprm,
ret = result;
pr_err("Unable to read code+data+bss, errno %d\n", ret);
vm_munmap(textpos, text_len + data_len + extra +
- MAX_SHARED_LIBS * sizeof(u32));
+ DATA_START_OFFSET_WORDS * sizeof(u32));
goto err;
}
}
diff --git a/fs/block_dev.c b/fs/block_dev.c
index 92ed7d5df677..6cc4d4cfe0c2 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -79,7 +79,7 @@ static void kill_bdev(struct block_device *bdev)
{
struct address_space *mapping = bdev->bd_inode->i_mapping;
- if (mapping->nrpages == 0 && mapping->nrexceptional == 0)
+ if (mapping_empty(mapping))
return;
invalidate_bh_lrus();
@@ -275,6 +275,8 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter,
bio.bi_opf = dio_bio_write_op(iocb);
task_io_account_write(ret);
}
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ bio.bi_opf |= REQ_NOWAIT;
if (iocb->ki_flags & IOCB_HIPRI)
bio_set_polled(&bio, iocb);
@@ -428,6 +430,8 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter,
bio->bi_opf = dio_bio_write_op(iocb);
task_io_account_write(bio->bi_iter.bi_size);
}
+ if (iocb->ki_flags & IOCB_NOWAIT)
+ bio->bi_opf |= REQ_NOWAIT;
dio->size += bio->bi_iter.bi_size;
pos += bio->bi_iter.bi_size;
@@ -1236,16 +1240,21 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part);
int bdev_disk_changed(struct block_device *bdev, bool invalidate)
{
struct gendisk *disk = bdev->bd_disk;
- int ret;
+ int ret = 0;
lockdep_assert_held(&bdev->bd_mutex);
- clear_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state);
+ if (!(disk->flags & GENHD_FL_UP))
+ return -ENXIO;
rescan:
- ret = blk_drop_partitions(bdev);
- if (ret)
- return ret;
+ if (bdev->bd_part_count)
+ return -EBUSY;
+ sync_blockdev(bdev);
+ invalidate_bdev(bdev);
+ blk_drop_partitions(disk);
+
+ clear_bit(GD_NEED_PART_SCAN, &disk->state);
/*
* Historically we only set the capacity to zero for devices that
@@ -1259,9 +1268,6 @@ rescan:
if (disk_part_scan_enabled(disk) ||
!(disk->flags & GENHD_FL_REMOVABLE))
set_capacity(disk, 0);
- } else {
- if (disk->fops->revalidate_disk)
- disk->fops->revalidate_disk(disk);
}
if (get_capacity(disk)) {
@@ -1295,6 +1301,9 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode)
struct gendisk *disk = bdev->bd_disk;
int ret = 0;
+ if (!(disk->flags & GENHD_FL_UP))
+ return -ENXIO;
+
if (!bdev->bd_openers) {
if (!bdev_is_partition(bdev)) {
ret = 0;
@@ -1329,8 +1338,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode)
whole->bd_part_count++;
mutex_unlock(&whole->bd_mutex);
- if (!(disk->flags & GENHD_FL_UP) ||
- !bdev_nr_sectors(bdev)) {
+ if (!bdev_nr_sectors(bdev)) {
__blkdev_put(whole, mode, 1);
bdput(whole);
return -ENXIO;
@@ -1361,16 +1369,12 @@ struct block_device *blkdev_get_no_open(dev_t dev)
struct block_device *bdev;
struct gendisk *disk;
- down_read(&bdev_lookup_sem);
bdev = bdget(dev);
if (!bdev) {
- up_read(&bdev_lookup_sem);
blk_request_module(dev);
- down_read(&bdev_lookup_sem);
-
bdev = bdget(dev);
if (!bdev)
- goto unlock;
+ return NULL;
}
disk = bdev->bd_disk;
@@ -1380,14 +1384,11 @@ struct block_device *blkdev_get_no_open(dev_t dev)
goto put_disk;
if (!try_module_get(bdev->bd_disk->fops->owner))
goto put_disk;
- up_read(&bdev_lookup_sem);
return bdev;
put_disk:
put_disk(disk);
bdput:
bdput(bdev);
-unlock:
- up_read(&bdev_lookup_sem);
return NULL;
}
@@ -1433,10 +1434,6 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder)
if (ret)
return ERR_PTR(ret);
- /*
- * If we lost a race with 'disk' being deleted, try again. See md.c.
- */
-retry:
bdev = blkdev_get_no_open(dev);
if (!bdev)
return ERR_PTR(-ENXIO);
@@ -1483,8 +1480,6 @@ abort_claiming:
disk_unblock_events(disk);
put_blkdev:
blkdev_put_no_open(bdev);
- if (ret == -ERESTARTSYS)
- goto retry;
return ERR_PTR(ret);
}
EXPORT_SYMBOL(blkdev_get_by_dev);
@@ -1680,6 +1675,7 @@ ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
struct inode *bd_inode = bdev_file_inode(file);
loff_t size = i_size_read(bd_inode);
struct blk_plug plug;
+ size_t shorted = 0;
ssize_t ret;
if (bdev_read_only(I_BDEV(bd_inode)))
@@ -1697,12 +1693,17 @@ ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from)
if ((iocb->ki_flags & (IOCB_NOWAIT | IOCB_DIRECT)) == IOCB_NOWAIT)
return -EOPNOTSUPP;
- iov_iter_truncate(from, size - iocb->ki_pos);
+ size -= iocb->ki_pos;
+ if (iov_iter_count(from) > size) {
+ shorted = iov_iter_count(from) - size;
+ iov_iter_truncate(from, size);
+ }
blk_start_plug(&plug);
ret = __generic_file_write_iter(iocb, from);
if (ret > 0)
ret = generic_write_sync(iocb, ret);
+ iov_iter_reexpand(from, iov_iter_count(from) + shorted);
blk_finish_plug(&plug);
return ret;
}
@@ -1714,13 +1715,21 @@ ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to)
struct inode *bd_inode = bdev_file_inode(file);
loff_t size = i_size_read(bd_inode);
loff_t pos = iocb->ki_pos;
+ size_t shorted = 0;
+ ssize_t ret;
if (pos >= size)
return 0;
size -= pos;
- iov_iter_truncate(to, size);
- return generic_file_read_iter(iocb, to);
+ if (iov_iter_count(to) > size) {
+ shorted = iov_iter_count(to) - size;
+ iov_iter_truncate(to, size);
+ }
+
+ ret = generic_file_read_iter(iocb, to);
+ iov_iter_reexpand(to, iov_iter_count(to) + shorted);
+ return ret;
}
EXPORT_SYMBOL_GPL(blkdev_read_iter);
diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile
index b634c42115ea..cec88a66bd6c 100644
--- a/fs/btrfs/Makefile
+++ b/fs/btrfs/Makefile
@@ -7,10 +7,12 @@ subdir-ccflags-y += -Wmissing-format-attribute
subdir-ccflags-y += -Wmissing-prototypes
subdir-ccflags-y += -Wold-style-definition
subdir-ccflags-y += -Wmissing-include-dirs
-subdir-ccflags-y += $(call cc-option, -Wunused-but-set-variable)
-subdir-ccflags-y += $(call cc-option, -Wunused-const-variable)
-subdir-ccflags-y += $(call cc-option, -Wpacked-not-aligned)
-subdir-ccflags-y += $(call cc-option, -Wstringop-truncation)
+condflags := \
+ $(call cc-option, -Wunused-but-set-variable) \
+ $(call cc-option, -Wunused-const-variable) \
+ $(call cc-option, -Wpacked-not-aligned) \
+ $(call cc-option, -Wstringop-truncation)
+subdir-ccflags-y += $(condflags)
# The following turn off the warnings enabled by -Wextra
subdir-ccflags-y += -Wno-missing-field-initializers
subdir-ccflags-y += -Wno-sign-compare
@@ -28,7 +30,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \
block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \
- subpage.o
+ subpage.o tree-mod-log.o
btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index f47c1528eb9a..117d423fdb93 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -14,6 +14,7 @@
#include "delayed-ref.h"
#include "locking.h"
#include "misc.h"
+#include "tree-mod-log.h"
/* Just an arbitrary number so we can be sure this happened */
#define BACKREF_FOUND_SHARED 6
@@ -452,7 +453,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
if (path->slots[0] >= btrfs_header_nritems(eb) ||
is_shared_data_backref(preftrees, eb->start) ||
ref->root_id != btrfs_header_owner(eb)) {
- if (time_seq == SEQ_LAST)
+ if (time_seq == BTRFS_SEQ_LAST)
ret = btrfs_next_leaf(root, path);
else
ret = btrfs_next_old_leaf(root, path, time_seq);
@@ -476,7 +477,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
if (slot == 0 &&
(is_shared_data_backref(preftrees, eb->start) ||
ref->root_id != btrfs_header_owner(eb))) {
- if (time_seq == SEQ_LAST)
+ if (time_seq == BTRFS_SEQ_LAST)
ret = btrfs_next_leaf(root, path);
else
ret = btrfs_next_old_leaf(root, path, time_seq);
@@ -514,7 +515,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path,
eie = NULL;
}
next:
- if (time_seq == SEQ_LAST)
+ if (time_seq == BTRFS_SEQ_LAST)
ret = btrfs_next_item(root, path);
else
ret = btrfs_next_old_item(root, path, time_seq);
@@ -574,7 +575,7 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info,
if (path->search_commit_root)
root_level = btrfs_header_level(root->commit_root);
- else if (time_seq == SEQ_LAST)
+ else if (time_seq == BTRFS_SEQ_LAST)
root_level = btrfs_header_level(root->node);
else
root_level = btrfs_old_root_level(root, time_seq);
@@ -605,7 +606,7 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info,
search_key.offset >= LLONG_MAX)
search_key.offset = 0;
path->lowest_level = level;
- if (time_seq == SEQ_LAST)
+ if (time_seq == BTRFS_SEQ_LAST)
ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0);
else
ret = btrfs_search_old_slot(root, &search_key, path, time_seq);
@@ -1147,8 +1148,8 @@ static int add_keyed_refs(struct btrfs_fs_info *fs_info,
* indirect refs to their parent bytenr.
* When roots are found, they're added to the roots list
*
- * If time_seq is set to SEQ_LAST, it will not search delayed_refs, and behave
- * much like trans == NULL case, the difference only lies in it will not
+ * If time_seq is set to BTRFS_SEQ_LAST, it will not search delayed_refs, and
+ * behave much like trans == NULL case, the difference only lies in it will not
* commit root.
* The special case is for qgroup to search roots in commit_transaction().
*
@@ -1199,7 +1200,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans,
path->skip_locking = 1;
}
- if (time_seq == SEQ_LAST)
+ if (time_seq == BTRFS_SEQ_LAST)
path->skip_locking = 1;
/*
@@ -1217,9 +1218,9 @@ again:
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
if (trans && likely(trans->type != __TRANS_DUMMY) &&
- time_seq != SEQ_LAST) {
+ time_seq != BTRFS_SEQ_LAST) {
#else
- if (trans && time_seq != SEQ_LAST) {
+ if (trans && time_seq != BTRFS_SEQ_LAST) {
#endif
/*
* look if there are updates for this ref queued and lock the
@@ -1527,7 +1528,7 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
struct btrfs_trans_handle *trans;
struct ulist_iterator uiter;
struct ulist_node *node;
- struct seq_list elem = SEQ_LIST_INIT(elem);
+ struct btrfs_seq_list elem = BTRFS_SEQ_LIST_INIT(elem);
int ret = 0;
struct share_check shared = {
.root_objectid = root->root_key.objectid,
@@ -1953,7 +1954,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
struct ulist *roots = NULL;
struct ulist_node *ref_node = NULL;
struct ulist_node *root_node = NULL;
- struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem);
+ struct btrfs_seq_list seq_elem = BTRFS_SEQ_LIST_INIT(seq_elem);
struct ulist_iterator ref_uiter;
struct ulist_iterator root_uiter;
@@ -1971,12 +1972,12 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
}
if (trans)
- btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem);
+ btrfs_get_tree_mod_seq(fs_info, &seq_elem);
else
down_read(&fs_info->commit_root_sem);
ret = btrfs_find_all_leafs(trans, fs_info, extent_item_objectid,
- tree_mod_seq_elem.seq, &refs,
+ seq_elem.seq, &refs,
&extent_item_pos, ignore_offset);
if (ret)
goto out;
@@ -1984,7 +1985,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
ULIST_ITER_INIT(&ref_uiter);
while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) {
ret = btrfs_find_all_roots_safe(trans, fs_info, ref_node->val,
- tree_mod_seq_elem.seq, &roots,
+ seq_elem.seq, &roots,
ignore_offset);
if (ret)
break;
@@ -2007,7 +2008,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
free_leaf_list(refs);
out:
if (trans) {
- btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem);
+ btrfs_put_tree_mod_seq(fs_info, &seq_elem);
btrfs_end_transaction(trans);
} else {
up_read(&fs_info->commit_root_sem);
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index 744b99ddc28c..aa57bdc8fc89 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -1289,7 +1289,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
* Long running balances can keep us blocked here for eternity, so
* simply skip deletion if we're unable to get the mutex.
*/
- if (!mutex_trylock(&fs_info->delete_unused_bgs_mutex))
+ if (!mutex_trylock(&fs_info->reclaim_bgs_lock))
return;
spin_lock(&fs_info->unused_bgs_lock);
@@ -1462,12 +1462,12 @@ next:
spin_lock(&fs_info->unused_bgs_lock);
}
spin_unlock(&fs_info->unused_bgs_lock);
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
return;
flip_async:
btrfs_end_transaction(trans);
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
btrfs_put_block_group(block_group);
btrfs_discard_punt_unused_bgs_list(fs_info);
}
@@ -1485,6 +1485,97 @@ void btrfs_mark_bg_unused(struct btrfs_block_group *bg)
spin_unlock(&fs_info->unused_bgs_lock);
}
+void btrfs_reclaim_bgs_work(struct work_struct *work)
+{
+ struct btrfs_fs_info *fs_info =
+ container_of(work, struct btrfs_fs_info, reclaim_bgs_work);
+ struct btrfs_block_group *bg;
+ struct btrfs_space_info *space_info;
+ int ret;
+
+ if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
+ return;
+
+ if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE))
+ return;
+
+ mutex_lock(&fs_info->reclaim_bgs_lock);
+ spin_lock(&fs_info->unused_bgs_lock);
+ while (!list_empty(&fs_info->reclaim_bgs)) {
+ bg = list_first_entry(&fs_info->reclaim_bgs,
+ struct btrfs_block_group,
+ bg_list);
+ list_del_init(&bg->bg_list);
+
+ space_info = bg->space_info;
+ spin_unlock(&fs_info->unused_bgs_lock);
+
+ /* Don't race with allocators so take the groups_sem */
+ down_write(&space_info->groups_sem);
+
+ spin_lock(&bg->lock);
+ if (bg->reserved || bg->pinned || bg->ro) {
+ /*
+ * We want to bail if we made new allocations or have
+ * outstanding allocations in this block group. We do
+ * the ro check in case balance is currently acting on
+ * this block group.
+ */
+ spin_unlock(&bg->lock);
+ up_write(&space_info->groups_sem);
+ goto next;
+ }
+ spin_unlock(&bg->lock);
+
+ /* Get out fast, in case we're unmounting the filesystem */
+ if (btrfs_fs_closing(fs_info)) {
+ up_write(&space_info->groups_sem);
+ goto next;
+ }
+
+ ret = inc_block_group_ro(bg, 0);
+ up_write(&space_info->groups_sem);
+ if (ret < 0)
+ goto next;
+
+ btrfs_info(fs_info, "reclaiming chunk %llu with %llu%% used",
+ bg->start, div_u64(bg->used * 100, bg->length));
+ trace_btrfs_reclaim_block_group(bg);
+ ret = btrfs_relocate_chunk(fs_info, bg->start);
+ if (ret)
+ btrfs_err(fs_info, "error relocating chunk %llu",
+ bg->start);
+
+next:
+ btrfs_put_block_group(bg);
+ spin_lock(&fs_info->unused_bgs_lock);
+ }
+ spin_unlock(&fs_info->unused_bgs_lock);
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
+ btrfs_exclop_finish(fs_info);
+}
+
+void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info)
+{
+ spin_lock(&fs_info->unused_bgs_lock);
+ if (!list_empty(&fs_info->reclaim_bgs))
+ queue_work(system_unbound_wq, &fs_info->reclaim_bgs_work);
+ spin_unlock(&fs_info->unused_bgs_lock);
+}
+
+void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg)
+{
+ struct btrfs_fs_info *fs_info = bg->fs_info;
+
+ spin_lock(&fs_info->unused_bgs_lock);
+ if (list_empty(&bg->bg_list)) {
+ btrfs_get_block_group(bg);
+ trace_btrfs_add_reclaim_block_group(bg);
+ list_add_tail(&bg->bg_list, &fs_info->reclaim_bgs);
+ }
+ spin_unlock(&fs_info->unused_bgs_lock);
+}
+
static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key *key,
struct btrfs_path *path)
{
@@ -2267,29 +2358,33 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache,
struct btrfs_trans_handle *trans;
u64 alloc_flags;
int ret;
+ bool dirty_bg_running;
-again:
- trans = btrfs_join_transaction(fs_info->extent_root);
- if (IS_ERR(trans))
- return PTR_ERR(trans);
+ do {
+ trans = btrfs_join_transaction(fs_info->extent_root);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
- /*
- * we're not allowed to set block groups readonly after the dirty
- * block groups cache has started writing. If it already started,
- * back off and let this transaction commit
- */
- mutex_lock(&fs_info->ro_block_group_mutex);
- if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
- u64 transid = trans->transid;
+ dirty_bg_running = false;
- mutex_unlock(&fs_info->ro_block_group_mutex);
- btrfs_end_transaction(trans);
+ /*
+ * We're not allowed to set block groups readonly after the dirty
+ * block group cache has started writing. If it already started,
+ * back off and let this transaction commit.
+ */
+ mutex_lock(&fs_info->ro_block_group_mutex);
+ if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) {
+ u64 transid = trans->transid;
- ret = btrfs_wait_for_commit(fs_info, transid);
- if (ret)
- return ret;
- goto again;
- }
+ mutex_unlock(&fs_info->ro_block_group_mutex);
+ btrfs_end_transaction(trans);
+
+ ret = btrfs_wait_for_commit(fs_info, transid);
+ if (ret)
+ return ret;
+ dirty_bg_running = true;
+ }
+ } while (dirty_bg_running);
if (do_chunk_alloc) {
/*
@@ -3269,6 +3364,7 @@ static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type)
*/
void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
{
+ struct btrfs_transaction *cur_trans = trans->transaction;
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_space_info *info;
u64 left;
@@ -3283,6 +3379,7 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
lockdep_assert_held(&fs_info->chunk_mutex);
info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM);
+again:
spin_lock(&info->lock);
left = info->total_bytes - btrfs_space_info_used(info, true);
spin_unlock(&info->lock);
@@ -3301,6 +3398,58 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
if (left < thresh) {
u64 flags = btrfs_system_alloc_profile(fs_info);
+ u64 reserved = atomic64_read(&cur_trans->chunk_bytes_reserved);
+
+ /*
+ * If there's not available space for the chunk tree (system
+ * space) and there are other tasks that reserved space for
+ * creating a new system block group, wait for them to complete
+ * the creation of their system block group and release excess
+ * reserved space. We do this because:
+ *
+ * *) We can end up allocating more system chunks than necessary
+ * when there are multiple tasks that are concurrently
+ * allocating block groups, which can lead to exhaustion of
+ * the system array in the superblock;
+ *
+ * *) If we allocate extra and unnecessary system block groups,
+ * despite being empty for a long time, and possibly forever,
+ * they end not being added to the list of unused block groups
+ * because that typically happens only when deallocating the
+ * last extent from a block group - which never happens since
+ * we never allocate from them in the first place. The few
+ * exceptions are when mounting a filesystem or running scrub,
+ * which add unused block groups to the list of unused block
+ * groups, to be deleted by the cleaner kthread.
+ * And even when they are added to the list of unused block
+ * groups, it can take a long time until they get deleted,
+ * since the cleaner kthread might be sleeping or busy with
+ * other work (deleting subvolumes, running delayed iputs,
+ * defrag scheduling, etc);
+ *
+ * This is rare in practice, but can happen when too many tasks
+ * are allocating blocks groups in parallel (via fallocate())
+ * and before the one that reserved space for a new system block
+ * group finishes the block group creation and releases the space
+ * reserved in excess (at btrfs_create_pending_block_groups()),
+ * other tasks end up here and see free system space temporarily
+ * not enough for updating the chunk tree.
+ *
+ * We unlock the chunk mutex before waiting for such tasks and
+ * lock it again after the wait, otherwise we would deadlock.
+ * It is safe to do so because allocating a system chunk is the
+ * first thing done while allocating a new block group.
+ */
+ if (reserved > trans->chunk_bytes_reserved) {
+ const u64 min_needed = reserved - thresh;
+
+ mutex_unlock(&fs_info->chunk_mutex);
+ wait_event(cur_trans->chunk_reserve_wait,
+ atomic64_read(&cur_trans->chunk_bytes_reserved) <=
+ min_needed);
+ mutex_lock(&fs_info->chunk_mutex);
+ goto again;
+ }
/*
* Ignore failure to create system chunk. We might end up not
@@ -3315,8 +3464,10 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type)
ret = btrfs_block_rsv_add(fs_info->chunk_root,
&fs_info->chunk_block_rsv,
thresh, BTRFS_RESERVE_NO_FLUSH);
- if (!ret)
+ if (!ret) {
+ atomic64_add(thresh, &cur_trans->chunk_bytes_reserved);
trans->chunk_bytes_reserved += thresh;
+ }
}
}
@@ -3386,6 +3537,16 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
}
spin_unlock(&info->unused_bgs_lock);
+ spin_lock(&info->unused_bgs_lock);
+ while (!list_empty(&info->reclaim_bgs)) {
+ block_group = list_first_entry(&info->reclaim_bgs,
+ struct btrfs_block_group,
+ bg_list);
+ list_del_init(&block_group->bg_list);
+ btrfs_put_block_group(block_group);
+ }
+ spin_unlock(&info->unused_bgs_lock);
+
spin_lock(&info->block_group_cache_lock);
while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
block_group = rb_entry(n, struct btrfs_block_group,
diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h
index 3ecc3372a5ce..7b927425dc71 100644
--- a/fs/btrfs/block-group.h
+++ b/fs/btrfs/block-group.h
@@ -264,6 +264,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
u64 group_start, struct extent_map *em);
void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info);
void btrfs_mark_bg_unused(struct btrfs_block_group *bg);
+void btrfs_reclaim_bgs_work(struct work_struct *work);
+void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info);
+void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg);
int btrfs_read_block_groups(struct btrfs_fs_info *info);
int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used,
u64 type, u64 chunk_offset, u64 size);
diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h
index 28e202e89660..c652e19ad74e 100644
--- a/fs/btrfs/btrfs_inode.h
+++ b/fs/btrfs/btrfs_inode.h
@@ -220,6 +220,7 @@ struct btrfs_inode {
/* Hook into fs_info->delayed_iputs */
struct list_head delayed_iput;
+ struct rw_semaphore i_mmap_lock;
struct inode vfs_inode;
};
@@ -299,24 +300,30 @@ static inline void btrfs_mod_outstanding_extents(struct btrfs_inode *inode,
mod);
}
-static inline int btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation)
+/*
+ * Called every time after doing a buffered, direct IO or memory mapped write.
+ *
+ * This is to ensure that if we write to a file that was previously fsynced in
+ * the current transaction, then try to fsync it again in the same transaction,
+ * we will know that there were changes in the file and that it needs to be
+ * logged.
+ */
+static inline void btrfs_set_inode_last_sub_trans(struct btrfs_inode *inode)
+{
+ spin_lock(&inode->lock);
+ inode->last_sub_trans = inode->root->log_transid;
+ spin_unlock(&inode->lock);
+}
+
+static inline bool btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation)
{
- int ret = 0;
+ bool ret = false;
spin_lock(&inode->lock);
if (inode->logged_trans == generation &&
inode->last_sub_trans <= inode->last_log_commit &&
- inode->last_sub_trans <= inode->root->last_log_commit) {
- /*
- * After a ranged fsync we might have left some extent maps
- * (that fall outside the fsync's range). So return false
- * here if the list isn't empty, to make sure btrfs_log_inode()
- * will be called and process those extent maps.
- */
- smp_mb();
- if (list_empty(&inode->extent_tree.modified_extents))
- ret = 1;
- }
+ inode->last_sub_trans <= inode->root->last_log_commit)
+ ret = true;
spin_unlock(&inode->lock);
return ret;
}
diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c
index 113cb85c1fd4..169508609324 100644
--- a/fs/btrfs/check-integrity.c
+++ b/fs/btrfs/check-integrity.c
@@ -1555,10 +1555,11 @@ static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx)
BUG_ON(!block_ctx->pagev);
num_pages = (block_ctx->len + (u64)PAGE_SIZE - 1) >>
PAGE_SHIFT;
+ /* Pages must be unmapped in reverse order */
while (num_pages > 0) {
num_pages--;
if (block_ctx->datav[num_pages]) {
- kunmap(block_ctx->pagev[num_pages]);
+ kunmap_local(block_ctx->datav[num_pages]);
block_ctx->datav[num_pages] = NULL;
}
if (block_ctx->pagev[num_pages]) {
@@ -1637,7 +1638,7 @@ static int btrfsic_read_block(struct btrfsic_state *state,
i = j;
}
for (i = 0; i < num_pages; i++)
- block_ctx->datav[i] = kmap(block_ctx->pagev[i]);
+ block_ctx->datav[i] = kmap_local_page(block_ctx->pagev[i]);
return block_ctx->len;
}
@@ -2677,7 +2678,7 @@ static void __btrfsic_submit_bio(struct bio *bio)
dev_state = btrfsic_dev_state_lookup(bio->bi_bdev->bd_dev);
if (NULL != dev_state &&
(bio_op(bio) == REQ_OP_WRITE) && bio_has_data(bio)) {
- unsigned int i = 0;
+ int i = 0;
u64 dev_bytenr;
u64 cur_bytenr;
struct bio_vec bvec;
@@ -2702,7 +2703,7 @@ static void __btrfsic_submit_bio(struct bio *bio)
bio_for_each_segment(bvec, bio, iter) {
BUG_ON(bvec.bv_len != PAGE_SIZE);
- mapped_datav[i] = kmap(bvec.bv_page);
+ mapped_datav[i] = kmap_local_page(bvec.bv_page);
i++;
if (dev_state->state->print_mask &
@@ -2715,8 +2716,9 @@ static void __btrfsic_submit_bio(struct bio *bio)
mapped_datav, segs,
bio, &bio_is_patched,
bio->bi_opf);
- bio_for_each_segment(bvec, bio, iter)
- kunmap(bvec.bv_page);
+ /* Unmap in reverse order */
+ for (--i; i >= 0; i--)
+ kunmap_local(mapped_datav[i]);
kfree(mapped_datav);
} else if (NULL != dev_state && (bio->bi_opf & REQ_PREFLUSH)) {
if (dev_state->state->print_mask &
diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c
index 3f4c832abfed..1346d698463a 100644
--- a/fs/btrfs/compression.c
+++ b/fs/btrfs/compression.c
@@ -28,6 +28,7 @@
#include "compression.h"
#include "extent_io.h"
#include "extent_map.h"
+#include "zoned.h"
static const char* const btrfs_compress_types[] = { "", "zlib", "lzo", "zstd" };
@@ -80,10 +81,15 @@ static int compression_compress_pages(int type, struct list_head *ws,
case BTRFS_COMPRESS_NONE:
default:
/*
- * This can't happen, the type is validated several times
- * before we get here. As a sane fallback, return what the
- * callers will understand as 'no compression happened'.
+ * This can happen when compression races with remount setting
+ * it to 'no compress', while caller doesn't call
+ * inode_need_compress() to check if we really need to
+ * compress.
+ *
+ * Not a big deal, just need to inform caller that we
+ * haven't allocated any pages yet.
*/
+ *out_pages = 0;
return -E2BIG;
}
}
@@ -344,6 +350,7 @@ static void end_compressed_bio_write(struct bio *bio)
*/
inode = cb->inode;
cb->compressed_pages[0]->mapping = cb->inode->i_mapping;
+ btrfs_record_physical_zoned(inode, cb->start, bio);
btrfs_writepage_endio_finish_ordered(cb->compressed_pages[0],
cb->start, cb->start + cb->len - 1,
bio->bi_status == BLK_STS_OK);
@@ -396,6 +403,8 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
u64 first_byte = disk_start;
blk_status_t ret;
int skip_sum = inode->flags & BTRFS_INODE_NODATASUM;
+ const bool use_append = btrfs_use_zone_append(inode, disk_start);
+ const unsigned int bio_op = use_append ? REQ_OP_ZONE_APPEND : REQ_OP_WRITE;
WARN_ON(!PAGE_ALIGNED(start));
cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS);
@@ -413,10 +422,31 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
cb->nr_pages = nr_pages;
bio = btrfs_bio_alloc(first_byte);
- bio->bi_opf = REQ_OP_WRITE | write_flags;
+ bio->bi_opf = bio_op | write_flags;
bio->bi_private = cb;
bio->bi_end_io = end_compressed_bio_write;
+ if (use_append) {
+ struct extent_map *em;
+ struct map_lookup *map;
+ struct block_device *bdev;
+
+ em = btrfs_get_chunk_map(fs_info, disk_start, PAGE_SIZE);
+ if (IS_ERR(em)) {
+ kfree(cb);
+ bio_put(bio);
+ return BLK_STS_NOTSUPP;
+ }
+
+ map = em->map_lookup;
+ /* We only support single profile for now */
+ ASSERT(map->num_stripes == 1);
+ bdev = map->stripes[0].dev->bdev;
+
+ bio_set_dev(bio, bdev);
+ free_extent_map(em);
+ }
+
if (blkcg_css) {
bio->bi_opf |= REQ_CGROUP_PUNT;
kthread_associate_blkcg(blkcg_css);
@@ -427,6 +457,7 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
bytes_left = compressed_len;
for (pg_index = 0; pg_index < cb->nr_pages; pg_index++) {
int submit = 0;
+ int len = 0;
page = compressed_pages[pg_index];
page->mapping = inode->vfs_inode.i_mapping;
@@ -434,9 +465,20 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
submit = btrfs_bio_fits_in_stripe(page, PAGE_SIZE, bio,
0);
+ /*
+ * Page can only be added to bio if the current bio fits in
+ * stripe.
+ */
+ if (!submit) {
+ if (pg_index == 0 && use_append)
+ len = bio_add_zone_append_page(bio, page,
+ PAGE_SIZE, 0);
+ else
+ len = bio_add_page(bio, page, PAGE_SIZE, 0);
+ }
+
page->mapping = NULL;
- if (submit || bio_add_page(bio, page, PAGE_SIZE, 0) <
- PAGE_SIZE) {
+ if (submit || len < PAGE_SIZE) {
/*
* inc the count before we submit the bio so
* we know the end IO handler won't happen before
@@ -460,11 +502,15 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
}
bio = btrfs_bio_alloc(first_byte);
- bio->bi_opf = REQ_OP_WRITE | write_flags;
+ bio->bi_opf = bio_op | write_flags;
bio->bi_private = cb;
bio->bi_end_io = end_compressed_bio_write;
if (blkcg_css)
bio->bi_opf |= REQ_CGROUP_PUNT;
+ /*
+ * Use bio_add_page() to ensure the bio has at least one
+ * page.
+ */
bio_add_page(bio, page, PAGE_SIZE, 0);
}
if (bytes_left < PAGE_SIZE) {
@@ -586,16 +632,13 @@ static noinline int add_ra_bio_pages(struct inode *inode,
free_extent_map(em);
if (page->index == end_index) {
- char *userpage;
size_t zero_offset = offset_in_page(isize);
if (zero_offset) {
int zeros;
zeros = PAGE_SIZE - zero_offset;
- userpage = kmap_atomic(page);
- memset(userpage + zero_offset, 0, zeros);
+ memzero_page(page, zero_offset, zeros);
flush_dcache_page(page);
- kunmap_atomic(userpage);
}
}
@@ -1611,7 +1654,7 @@ static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end,
curr_sample_pos = 0;
while (index < index_end) {
page = find_get_page(inode->i_mapping, index);
- in_data = kmap(page);
+ in_data = kmap_local_page(page);
/* Handle case where the start is not aligned to PAGE_SIZE */
i = start % PAGE_SIZE;
while (i < PAGE_SIZE - SAMPLING_READ_SIZE) {
@@ -1624,7 +1667,7 @@ static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end,
start += SAMPLING_INTERVAL;
curr_sample_pos += SAMPLING_READ_SIZE;
}
- kunmap(page);
+ kunmap_local(in_data);
put_page(page);
index++;
diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c
index 34b929bd5c1a..a484fb72a01f 100644
--- a/fs/btrfs/ctree.c
+++ b/fs/btrfs/ctree.c
@@ -14,6 +14,7 @@
#include "locking.h"
#include "volumes.h"
#include "qgroup.h"
+#include "tree-mod-log.h"
static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
*root, struct btrfs_path *path, int level);
@@ -233,597 +234,6 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans,
return 0;
}
-enum mod_log_op {
- MOD_LOG_KEY_REPLACE,
- MOD_LOG_KEY_ADD,
- MOD_LOG_KEY_REMOVE,
- MOD_LOG_KEY_REMOVE_WHILE_FREEING,
- MOD_LOG_KEY_REMOVE_WHILE_MOVING,
- MOD_LOG_MOVE_KEYS,
- MOD_LOG_ROOT_REPLACE,
-};
-
-struct tree_mod_root {
- u64 logical;
- u8 level;
-};
-
-struct tree_mod_elem {
- struct rb_node node;
- u64 logical;
- u64 seq;
- enum mod_log_op op;
-
- /* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
- int slot;
-
- /* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */
- u64 generation;
-
- /* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */
- struct btrfs_disk_key key;
- u64 blockptr;
-
- /* this is used for op == MOD_LOG_MOVE_KEYS */
- struct {
- int dst_slot;
- int nr_items;
- } move;
-
- /* this is used for op == MOD_LOG_ROOT_REPLACE */
- struct tree_mod_root old_root;
-};
-
-/*
- * Pull a new tree mod seq number for our operation.
- */
-static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
-{
- return atomic64_inc_return(&fs_info->tree_mod_seq);
-}
-
-/*
- * This adds a new blocker to the tree mod log's blocker list if the @elem
- * passed does not already have a sequence number set. So when a caller expects
- * to record tree modifications, it should ensure to set elem->seq to zero
- * before calling btrfs_get_tree_mod_seq.
- * Returns a fresh, unused tree log modification sequence number, even if no new
- * blocker was added.
- */
-u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
- struct seq_list *elem)
-{
- write_lock(&fs_info->tree_mod_log_lock);
- if (!elem->seq) {
- elem->seq = btrfs_inc_tree_mod_seq(fs_info);
- list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
- }
- write_unlock(&fs_info->tree_mod_log_lock);
-
- return elem->seq;
-}
-
-void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
- struct seq_list *elem)
-{
- struct rb_root *tm_root;
- struct rb_node *node;
- struct rb_node *next;
- struct tree_mod_elem *tm;
- u64 min_seq = (u64)-1;
- u64 seq_putting = elem->seq;
-
- if (!seq_putting)
- return;
-
- write_lock(&fs_info->tree_mod_log_lock);
- list_del(&elem->list);
- elem->seq = 0;
-
- if (!list_empty(&fs_info->tree_mod_seq_list)) {
- struct seq_list *first;
-
- first = list_first_entry(&fs_info->tree_mod_seq_list,
- struct seq_list, list);
- if (seq_putting > first->seq) {
- /*
- * Blocker with lower sequence number exists, we
- * cannot remove anything from the log.
- */
- write_unlock(&fs_info->tree_mod_log_lock);
- return;
- }
- min_seq = first->seq;
- }
-
- /*
- * anything that's lower than the lowest existing (read: blocked)
- * sequence number can be removed from the tree.
- */
- tm_root = &fs_info->tree_mod_log;
- for (node = rb_first(tm_root); node; node = next) {
- next = rb_next(node);
- tm = rb_entry(node, struct tree_mod_elem, node);
- if (tm->seq >= min_seq)
- continue;
- rb_erase(node, tm_root);
- kfree(tm);
- }
- write_unlock(&fs_info->tree_mod_log_lock);
-}
-
-/*
- * key order of the log:
- * node/leaf start address -> sequence
- *
- * The 'start address' is the logical address of the *new* root node
- * for root replace operations, or the logical address of the affected
- * block for all other operations.
- */
-static noinline int
-__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
-{
- struct rb_root *tm_root;
- struct rb_node **new;
- struct rb_node *parent = NULL;
- struct tree_mod_elem *cur;
-
- lockdep_assert_held_write(&fs_info->tree_mod_log_lock);
-
- tm->seq = btrfs_inc_tree_mod_seq(fs_info);
-
- tm_root = &fs_info->tree_mod_log;
- new = &tm_root->rb_node;
- while (*new) {
- cur = rb_entry(*new, struct tree_mod_elem, node);
- parent = *new;
- if (cur->logical < tm->logical)
- new = &((*new)->rb_left);
- else if (cur->logical > tm->logical)
- new = &((*new)->rb_right);
- else if (cur->seq < tm->seq)
- new = &((*new)->rb_left);
- else if (cur->seq > tm->seq)
- new = &((*new)->rb_right);
- else
- return -EEXIST;
- }
-
- rb_link_node(&tm->node, parent, new);
- rb_insert_color(&tm->node, tm_root);
- return 0;
-}
-
-/*
- * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it
- * returns zero with the tree_mod_log_lock acquired. The caller must hold
- * this until all tree mod log insertions are recorded in the rb tree and then
- * write unlock fs_info::tree_mod_log_lock.
- */
-static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
- struct extent_buffer *eb) {
- smp_mb();
- if (list_empty(&(fs_info)->tree_mod_seq_list))
- return 1;
- if (eb && btrfs_header_level(eb) == 0)
- return 1;
-
- write_lock(&fs_info->tree_mod_log_lock);
- if (list_empty(&(fs_info)->tree_mod_seq_list)) {
- write_unlock(&fs_info->tree_mod_log_lock);
- return 1;
- }
-
- return 0;
-}
-
-/* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
-static inline int tree_mod_need_log(const struct btrfs_fs_info *fs_info,
- struct extent_buffer *eb)
-{
- smp_mb();
- if (list_empty(&(fs_info)->tree_mod_seq_list))
- return 0;
- if (eb && btrfs_header_level(eb) == 0)
- return 0;
-
- return 1;
-}
-
-static struct tree_mod_elem *
-alloc_tree_mod_elem(struct extent_buffer *eb, int slot,
- enum mod_log_op op, gfp_t flags)
-{
- struct tree_mod_elem *tm;
-
- tm = kzalloc(sizeof(*tm), flags);
- if (!tm)
- return NULL;
-
- tm->logical = eb->start;
- if (op != MOD_LOG_KEY_ADD) {
- btrfs_node_key(eb, &tm->key, slot);
- tm->blockptr = btrfs_node_blockptr(eb, slot);
- }
- tm->op = op;
- tm->slot = slot;
- tm->generation = btrfs_node_ptr_generation(eb, slot);
- RB_CLEAR_NODE(&tm->node);
-
- return tm;
-}
-
-static noinline int tree_mod_log_insert_key(struct extent_buffer *eb, int slot,
- enum mod_log_op op, gfp_t flags)
-{
- struct tree_mod_elem *tm;
- int ret;
-
- if (!tree_mod_need_log(eb->fs_info, eb))
- return 0;
-
- tm = alloc_tree_mod_elem(eb, slot, op, flags);
- if (!tm)
- return -ENOMEM;
-
- if (tree_mod_dont_log(eb->fs_info, eb)) {
- kfree(tm);
- return 0;
- }
-
- ret = __tree_mod_log_insert(eb->fs_info, tm);
- write_unlock(&eb->fs_info->tree_mod_log_lock);
- if (ret)
- kfree(tm);
-
- return ret;
-}
-
-static noinline int tree_mod_log_insert_move(struct extent_buffer *eb,
- int dst_slot, int src_slot, int nr_items)
-{
- struct tree_mod_elem *tm = NULL;
- struct tree_mod_elem **tm_list = NULL;
- int ret = 0;
- int i;
- int locked = 0;
-
- if (!tree_mod_need_log(eb->fs_info, eb))
- return 0;
-
- tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS);
- if (!tm_list)
- return -ENOMEM;
-
- tm = kzalloc(sizeof(*tm), GFP_NOFS);
- if (!tm) {
- ret = -ENOMEM;
- goto free_tms;
- }
-
- tm->logical = eb->start;
- tm->slot = src_slot;
- tm->move.dst_slot = dst_slot;
- tm->move.nr_items = nr_items;
- tm->op = MOD_LOG_MOVE_KEYS;
-
- for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
- tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
- MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS);
- if (!tm_list[i]) {
- ret = -ENOMEM;
- goto free_tms;
- }
- }
-
- if (tree_mod_dont_log(eb->fs_info, eb))
- goto free_tms;
- locked = 1;
-
- /*
- * When we override something during the move, we log these removals.
- * This can only happen when we move towards the beginning of the
- * buffer, i.e. dst_slot < src_slot.
- */
- for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
- ret = __tree_mod_log_insert(eb->fs_info, tm_list[i]);
- if (ret)
- goto free_tms;
- }
-
- ret = __tree_mod_log_insert(eb->fs_info, tm);
- if (ret)
- goto free_tms;
- write_unlock(&eb->fs_info->tree_mod_log_lock);
- kfree(tm_list);
-
- return 0;
-free_tms:
- for (i = 0; i < nr_items; i++) {
- if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
- rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log);
- kfree(tm_list[i]);
- }
- if (locked)
- write_unlock(&eb->fs_info->tree_mod_log_lock);
- kfree(tm_list);
- kfree(tm);
-
- return ret;
-}
-
-static inline int
-__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
- struct tree_mod_elem **tm_list,
- int nritems)
-{
- int i, j;
- int ret;
-
- for (i = nritems - 1; i >= 0; i--) {
- ret = __tree_mod_log_insert(fs_info, tm_list[i]);
- if (ret) {
- for (j = nritems - 1; j > i; j--)
- rb_erase(&tm_list[j]->node,
- &fs_info->tree_mod_log);
- return ret;
- }
- }
-
- return 0;
-}
-
-static noinline int tree_mod_log_insert_root(struct extent_buffer *old_root,
- struct extent_buffer *new_root, int log_removal)
-{
- struct btrfs_fs_info *fs_info = old_root->fs_info;
- struct tree_mod_elem *tm = NULL;
- struct tree_mod_elem **tm_list = NULL;
- int nritems = 0;
- int ret = 0;
- int i;
-
- if (!tree_mod_need_log(fs_info, NULL))
- return 0;
-
- if (log_removal && btrfs_header_level(old_root) > 0) {
- nritems = btrfs_header_nritems(old_root);
- tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
- GFP_NOFS);
- if (!tm_list) {
- ret = -ENOMEM;
- goto free_tms;
- }
- for (i = 0; i < nritems; i++) {
- tm_list[i] = alloc_tree_mod_elem(old_root, i,
- MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
- if (!tm_list[i]) {
- ret = -ENOMEM;
- goto free_tms;
- }
- }
- }
-
- tm = kzalloc(sizeof(*tm), GFP_NOFS);
- if (!tm) {
- ret = -ENOMEM;
- goto free_tms;
- }
-
- tm->logical = new_root->start;
- tm->old_root.logical = old_root->start;
- tm->old_root.level = btrfs_header_level(old_root);
- tm->generation = btrfs_header_generation(old_root);
- tm->op = MOD_LOG_ROOT_REPLACE;
-
- if (tree_mod_dont_log(fs_info, NULL))
- goto free_tms;
-
- if (tm_list)
- ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
- if (!ret)
- ret = __tree_mod_log_insert(fs_info, tm);
-
- write_unlock(&fs_info->tree_mod_log_lock);
- if (ret)
- goto free_tms;
- kfree(tm_list);
-
- return ret;
-
-free_tms:
- if (tm_list) {
- for (i = 0; i < nritems; i++)
- kfree(tm_list[i]);
- kfree(tm_list);
- }
- kfree(tm);
-
- return ret;
-}
-
-static struct tree_mod_elem *
-__tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
- int smallest)
-{
- struct rb_root *tm_root;
- struct rb_node *node;
- struct tree_mod_elem *cur = NULL;
- struct tree_mod_elem *found = NULL;
-
- read_lock(&fs_info->tree_mod_log_lock);
- tm_root = &fs_info->tree_mod_log;
- node = tm_root->rb_node;
- while (node) {
- cur = rb_entry(node, struct tree_mod_elem, node);
- if (cur->logical < start) {
- node = node->rb_left;
- } else if (cur->logical > start) {
- node = node->rb_right;
- } else if (cur->seq < min_seq) {
- node = node->rb_left;
- } else if (!smallest) {
- /* we want the node with the highest seq */
- if (found)
- BUG_ON(found->seq > cur->seq);
- found = cur;
- node = node->rb_left;
- } else if (cur->seq > min_seq) {
- /* we want the node with the smallest seq */
- if (found)
- BUG_ON(found->seq < cur->seq);
- found = cur;
- node = node->rb_right;
- } else {
- found = cur;
- break;
- }
- }
- read_unlock(&fs_info->tree_mod_log_lock);
-
- return found;
-}
-
-/*
- * this returns the element from the log with the smallest time sequence
- * value that's in the log (the oldest log item). any element with a time
- * sequence lower than min_seq will be ignored.
- */
-static struct tree_mod_elem *
-tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start,
- u64 min_seq)
-{
- return __tree_mod_log_search(fs_info, start, min_seq, 1);
-}
-
-/*
- * this returns the element from the log with the largest time sequence
- * value that's in the log (the most recent log item). any element with
- * a time sequence lower than min_seq will be ignored.
- */
-static struct tree_mod_elem *
-tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
-{
- return __tree_mod_log_search(fs_info, start, min_seq, 0);
-}
-
-static noinline int tree_mod_log_eb_copy(struct extent_buffer *dst,
- struct extent_buffer *src, unsigned long dst_offset,
- unsigned long src_offset, int nr_items)
-{
- struct btrfs_fs_info *fs_info = dst->fs_info;
- int ret = 0;
- struct tree_mod_elem **tm_list = NULL;
- struct tree_mod_elem **tm_list_add, **tm_list_rem;
- int i;
- int locked = 0;
-
- if (!tree_mod_need_log(fs_info, NULL))
- return 0;
-
- if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
- return 0;
-
- tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *),
- GFP_NOFS);
- if (!tm_list)
- return -ENOMEM;
-
- tm_list_add = tm_list;
- tm_list_rem = tm_list + nr_items;
- for (i = 0; i < nr_items; i++) {
- tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
- MOD_LOG_KEY_REMOVE, GFP_NOFS);
- if (!tm_list_rem[i]) {
- ret = -ENOMEM;
- goto free_tms;
- }
-
- tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
- MOD_LOG_KEY_ADD, GFP_NOFS);
- if (!tm_list_add[i]) {
- ret = -ENOMEM;
- goto free_tms;
- }
- }
-
- if (tree_mod_dont_log(fs_info, NULL))
- goto free_tms;
- locked = 1;
-
- for (i = 0; i < nr_items; i++) {
- ret = __tree_mod_log_insert(fs_info, tm_list_rem[i]);
- if (ret)
- goto free_tms;
- ret = __tree_mod_log_insert(fs_info, tm_list_add[i]);
- if (ret)
- goto free_tms;
- }
-
- write_unlock(&fs_info->tree_mod_log_lock);
- kfree(tm_list);
-
- return 0;
-
-free_tms:
- for (i = 0; i < nr_items * 2; i++) {
- if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
- rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
- kfree(tm_list[i]);
- }
- if (locked)
- write_unlock(&fs_info->tree_mod_log_lock);
- kfree(tm_list);
-
- return ret;
-}
-
-static noinline int tree_mod_log_free_eb(struct extent_buffer *eb)
-{
- struct tree_mod_elem **tm_list = NULL;
- int nritems = 0;
- int i;
- int ret = 0;
-
- if (btrfs_header_level(eb) == 0)
- return 0;
-
- if (!tree_mod_need_log(eb->fs_info, NULL))
- return 0;
-
- nritems = btrfs_header_nritems(eb);
- tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS);
- if (!tm_list)
- return -ENOMEM;
-
- for (i = 0; i < nritems; i++) {
- tm_list[i] = alloc_tree_mod_elem(eb, i,
- MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
- if (!tm_list[i]) {
- ret = -ENOMEM;
- goto free_tms;
- }
- }
-
- if (tree_mod_dont_log(eb->fs_info, eb))
- goto free_tms;
-
- ret = __tree_mod_log_free_eb(eb->fs_info, tm_list, nritems);
- write_unlock(&eb->fs_info->tree_mod_log_lock);
- if (ret)
- goto free_tms;
- kfree(tm_list);
-
- return 0;
-
-free_tms:
- for (i = 0; i < nritems; i++)
- kfree(tm_list[i]);
- kfree(tm_list);
-
- return ret;
-}
-
/*
* check if the tree block can be shared by multiple trees
*/
@@ -1090,7 +500,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
parent_start = buf->start;
atomic_inc(&cow->refs);
- ret = tree_mod_log_insert_root(root->node, cow, 1);
+ ret = btrfs_tree_mod_log_insert_root(root->node, cow, true);
BUG_ON(ret < 0);
rcu_assign_pointer(root->node, cow);
@@ -1100,15 +510,15 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
add_root_to_dirty_list(root);
} else {
WARN_ON(trans->transid != btrfs_header_generation(parent));
- tree_mod_log_insert_key(parent, parent_slot,
- MOD_LOG_KEY_REPLACE, GFP_NOFS);
+ btrfs_tree_mod_log_insert_key(parent, parent_slot,
+ BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
btrfs_set_node_blockptr(parent, parent_slot,
cow->start);
btrfs_set_node_ptr_generation(parent, parent_slot,
trans->transid);
btrfs_mark_buffer_dirty(parent);
if (last_ref) {
- ret = tree_mod_log_free_eb(buf);
+ ret = btrfs_tree_mod_log_free_eb(buf);
if (ret) {
btrfs_tree_unlock(cow);
free_extent_buffer(cow);
@@ -1127,298 +537,6 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
return 0;
}
-/*
- * returns the logical address of the oldest predecessor of the given root.
- * entries older than time_seq are ignored.
- */
-static struct tree_mod_elem *__tree_mod_log_oldest_root(
- struct extent_buffer *eb_root, u64 time_seq)
-{
- struct tree_mod_elem *tm;
- struct tree_mod_elem *found = NULL;
- u64 root_logical = eb_root->start;
- int looped = 0;
-
- if (!time_seq)
- return NULL;
-
- /*
- * the very last operation that's logged for a root is the
- * replacement operation (if it is replaced at all). this has
- * the logical address of the *new* root, making it the very
- * first operation that's logged for this root.
- */
- while (1) {
- tm = tree_mod_log_search_oldest(eb_root->fs_info, root_logical,
- time_seq);
- if (!looped && !tm)
- return NULL;
- /*
- * if there are no tree operation for the oldest root, we simply
- * return it. this should only happen if that (old) root is at
- * level 0.
- */
- if (!tm)
- break;
-
- /*
- * if there's an operation that's not a root replacement, we
- * found the oldest version of our root. normally, we'll find a
- * MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
- */
- if (tm->op != MOD_LOG_ROOT_REPLACE)
- break;
-
- found = tm;
- root_logical = tm->old_root.logical;
- looped = 1;
- }
-
- /* if there's no old root to return, return what we found instead */
- if (!found)
- found = tm;
-
- return found;
-}
-
-/*
- * tm is a pointer to the first operation to rewind within eb. then, all
- * previous operations will be rewound (until we reach something older than
- * time_seq).
- */
-static void
-__tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
- u64 time_seq, struct tree_mod_elem *first_tm)
-{
- u32 n;
- struct rb_node *next;
- struct tree_mod_elem *tm = first_tm;
- unsigned long o_dst;
- unsigned long o_src;
- unsigned long p_size = sizeof(struct btrfs_key_ptr);
-
- n = btrfs_header_nritems(eb);
- read_lock(&fs_info->tree_mod_log_lock);
- while (tm && tm->seq >= time_seq) {
- /*
- * all the operations are recorded with the operator used for
- * the modification. as we're going backwards, we do the
- * opposite of each operation here.
- */
- switch (tm->op) {
- case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
- BUG_ON(tm->slot < n);
- fallthrough;
- case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
- case MOD_LOG_KEY_REMOVE:
- btrfs_set_node_key(eb, &tm->key, tm->slot);
- btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
- btrfs_set_node_ptr_generation(eb, tm->slot,
- tm->generation);
- n++;
- break;
- case MOD_LOG_KEY_REPLACE:
- BUG_ON(tm->slot >= n);
- btrfs_set_node_key(eb, &tm->key, tm->slot);
- btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
- btrfs_set_node_ptr_generation(eb, tm->slot,
- tm->generation);
- break;
- case MOD_LOG_KEY_ADD:
- /* if a move operation is needed it's in the log */
- n--;
- break;
- case MOD_LOG_MOVE_KEYS:
- o_dst = btrfs_node_key_ptr_offset(tm->slot);
- o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
- memmove_extent_buffer(eb, o_dst, o_src,
- tm->move.nr_items * p_size);
- break;
- case MOD_LOG_ROOT_REPLACE:
- /*
- * this operation is special. for roots, this must be
- * handled explicitly before rewinding.
- * for non-roots, this operation may exist if the node
- * was a root: root A -> child B; then A gets empty and
- * B is promoted to the new root. in the mod log, we'll
- * have a root-replace operation for B, a tree block
- * that is no root. we simply ignore that operation.
- */
- break;
- }
- next = rb_next(&tm->node);
- if (!next)
- break;
- tm = rb_entry(next, struct tree_mod_elem, node);
- if (tm->logical != first_tm->logical)
- break;
- }
- read_unlock(&fs_info->tree_mod_log_lock);
- btrfs_set_header_nritems(eb, n);
-}
-
-/*
- * Called with eb read locked. If the buffer cannot be rewound, the same buffer
- * is returned. If rewind operations happen, a fresh buffer is returned. The
- * returned buffer is always read-locked. If the returned buffer is not the
- * input buffer, the lock on the input buffer is released and the input buffer
- * is freed (its refcount is decremented).
- */
-static struct extent_buffer *
-tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
- struct extent_buffer *eb, u64 time_seq)
-{
- struct extent_buffer *eb_rewin;
- struct tree_mod_elem *tm;
-
- if (!time_seq)
- return eb;
-
- if (btrfs_header_level(eb) == 0)
- return eb;
-
- tm = tree_mod_log_search(fs_info, eb->start, time_seq);
- if (!tm)
- return eb;
-
- if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
- BUG_ON(tm->slot != 0);
- eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
- if (!eb_rewin) {
- btrfs_tree_read_unlock(eb);
- free_extent_buffer(eb);
- return NULL;
- }
- btrfs_set_header_bytenr(eb_rewin, eb->start);
- btrfs_set_header_backref_rev(eb_rewin,
- btrfs_header_backref_rev(eb));
- btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
- btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
- } else {
- eb_rewin = btrfs_clone_extent_buffer(eb);
- if (!eb_rewin) {
- btrfs_tree_read_unlock(eb);
- free_extent_buffer(eb);
- return NULL;
- }
- }
-
- btrfs_tree_read_unlock(eb);
- free_extent_buffer(eb);
-
- btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb_rewin),
- eb_rewin, btrfs_header_level(eb_rewin));
- btrfs_tree_read_lock(eb_rewin);
- __tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
- WARN_ON(btrfs_header_nritems(eb_rewin) >
- BTRFS_NODEPTRS_PER_BLOCK(fs_info));
-
- return eb_rewin;
-}
-
-/*
- * get_old_root() rewinds the state of @root's root node to the given @time_seq
- * value. If there are no changes, the current root->root_node is returned. If
- * anything changed in between, there's a fresh buffer allocated on which the
- * rewind operations are done. In any case, the returned buffer is read locked.
- * Returns NULL on error (with no locks held).
- */
-static inline struct extent_buffer *
-get_old_root(struct btrfs_root *root, u64 time_seq)
-{
- struct btrfs_fs_info *fs_info = root->fs_info;
- struct tree_mod_elem *tm;
- struct extent_buffer *eb = NULL;
- struct extent_buffer *eb_root;
- u64 eb_root_owner = 0;
- struct extent_buffer *old;
- struct tree_mod_root *old_root = NULL;
- u64 old_generation = 0;
- u64 logical;
- int level;
-
- eb_root = btrfs_read_lock_root_node(root);
- tm = __tree_mod_log_oldest_root(eb_root, time_seq);
- if (!tm)
- return eb_root;
-
- if (tm->op == MOD_LOG_ROOT_REPLACE) {
- old_root = &tm->old_root;
- old_generation = tm->generation;
- logical = old_root->logical;
- level = old_root->level;
- } else {
- logical = eb_root->start;
- level = btrfs_header_level(eb_root);
- }
-
- tm = tree_mod_log_search(fs_info, logical, time_seq);
- if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
- btrfs_tree_read_unlock(eb_root);
- free_extent_buffer(eb_root);
- old = read_tree_block(fs_info, logical, root->root_key.objectid,
- 0, level, NULL);
- if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
- if (!IS_ERR(old))
- free_extent_buffer(old);
- btrfs_warn(fs_info,
- "failed to read tree block %llu from get_old_root",
- logical);
- } else {
- btrfs_tree_read_lock(old);
- eb = btrfs_clone_extent_buffer(old);
- btrfs_tree_read_unlock(old);
- free_extent_buffer(old);
- }
- } else if (old_root) {
- eb_root_owner = btrfs_header_owner(eb_root);
- btrfs_tree_read_unlock(eb_root);
- free_extent_buffer(eb_root);
- eb = alloc_dummy_extent_buffer(fs_info, logical);
- } else {
- eb = btrfs_clone_extent_buffer(eb_root);
- btrfs_tree_read_unlock(eb_root);
- free_extent_buffer(eb_root);
- }
-
- if (!eb)
- return NULL;
- if (old_root) {
- btrfs_set_header_bytenr(eb, eb->start);
- btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
- btrfs_set_header_owner(eb, eb_root_owner);
- btrfs_set_header_level(eb, old_root->level);
- btrfs_set_header_generation(eb, old_generation);
- }
- btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb,
- btrfs_header_level(eb));
- btrfs_tree_read_lock(eb);
- if (tm)
- __tree_mod_log_rewind(fs_info, eb, time_seq, tm);
- else
- WARN_ON(btrfs_header_level(eb) != 0);
- WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(fs_info));
-
- return eb;
-}
-
-int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
-{
- struct tree_mod_elem *tm;
- int level;
- struct extent_buffer *eb_root = btrfs_root_node(root);
-
- tm = __tree_mod_log_oldest_root(eb_root, time_seq);
- if (tm && tm->op == MOD_LOG_ROOT_REPLACE) {
- level = tm->old_root.level;
- } else {
- level = btrfs_header_level(eb_root);
- }
- free_extent_buffer(eb_root);
-
- return level;
-}
-
static inline int should_cow_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *buf)
@@ -1840,7 +958,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
goto enospc;
}
- ret = tree_mod_log_insert_root(root->node, child, 1);
+ ret = btrfs_tree_mod_log_insert_root(root->node, child, true);
BUG_ON(ret < 0);
rcu_assign_pointer(root->node, child);
@@ -1920,8 +1038,8 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
} else {
struct btrfs_disk_key right_key;
btrfs_node_key(right, &right_key, 0);
- ret = tree_mod_log_insert_key(parent, pslot + 1,
- MOD_LOG_KEY_REPLACE, GFP_NOFS);
+ ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
+ BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
BUG_ON(ret < 0);
btrfs_set_node_key(parent, &right_key, pslot + 1);
btrfs_mark_buffer_dirty(parent);
@@ -1966,8 +1084,8 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
/* update the parent key to reflect our changes */
struct btrfs_disk_key mid_key;
btrfs_node_key(mid, &mid_key, 0);
- ret = tree_mod_log_insert_key(parent, pslot,
- MOD_LOG_KEY_REPLACE, GFP_NOFS);
+ ret = btrfs_tree_mod_log_insert_key(parent, pslot,
+ BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
BUG_ON(ret < 0);
btrfs_set_node_key(parent, &mid_key, pslot);
btrfs_mark_buffer_dirty(parent);
@@ -2068,8 +1186,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
struct btrfs_disk_key disk_key;
orig_slot += left_nr;
btrfs_node_key(mid, &disk_key, 0);
- ret = tree_mod_log_insert_key(parent, pslot,
- MOD_LOG_KEY_REPLACE, GFP_NOFS);
+ ret = btrfs_tree_mod_log_insert_key(parent, pslot,
+ BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
BUG_ON(ret < 0);
btrfs_set_node_key(parent, &disk_key, pslot);
btrfs_mark_buffer_dirty(parent);
@@ -2122,8 +1240,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
struct btrfs_disk_key disk_key;
btrfs_node_key(right, &disk_key, 0);
- ret = tree_mod_log_insert_key(parent, pslot + 1,
- MOD_LOG_KEY_REPLACE, GFP_NOFS);
+ ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
+ BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
BUG_ON(ret < 0);
btrfs_set_node_key(parent, &disk_key, pslot + 1);
btrfs_mark_buffer_dirty(parent);
@@ -2161,12 +1279,13 @@ static void reada_for_search(struct btrfs_fs_info *fs_info,
u64 search;
u64 target;
u64 nread = 0;
+ u64 nread_max;
struct extent_buffer *eb;
u32 nr;
u32 blocksize;
u32 nscan = 0;
- if (level != 1)
+ if (level != 1 && path->reada != READA_FORWARD_ALWAYS)
return;
if (!path->nodes[level])
@@ -2174,6 +1293,20 @@ static void reada_for_search(struct btrfs_fs_info *fs_info,
node = path->nodes[level];
+ /*
+ * Since the time between visiting leaves is much shorter than the time
+ * between visiting nodes, limit read ahead of nodes to 1, to avoid too
+ * much IO at once (possibly random).
+ */
+ if (path->reada == READA_FORWARD_ALWAYS) {
+ if (level > 1)
+ nread_max = node->fs_info->nodesize;
+ else
+ nread_max = SZ_128K;
+ } else {
+ nread_max = SZ_64K;
+ }
+
search = btrfs_node_blockptr(node, slot);
blocksize = fs_info->nodesize;
eb = find_extent_buffer(fs_info, search);
@@ -2192,7 +1325,8 @@ static void reada_for_search(struct btrfs_fs_info *fs_info,
if (nr == 0)
break;
nr--;
- } else if (path->reada == READA_FORWARD) {
+ } else if (path->reada == READA_FORWARD ||
+ path->reada == READA_FORWARD_ALWAYS) {
nr++;
if (nr >= nritems)
break;
@@ -2203,13 +1337,14 @@ static void reada_for_search(struct btrfs_fs_info *fs_info,
break;
}
search = btrfs_node_blockptr(node, nr);
- if ((search <= target && target - search <= 65536) ||
+ if (path->reada == READA_FORWARD_ALWAYS ||
+ (search <= target && target - search <= 65536) ||
(search > target && search - target <= 65536)) {
btrfs_readahead_node_child(node, nr);
nread += blocksize;
}
nscan++;
- if ((nread > 65536 || nscan > 32))
+ if (nread > nread_max || nscan > 32)
break;
}
}
@@ -2318,6 +1453,9 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
tmp = find_extent_buffer(fs_info, blocknr);
if (tmp) {
+ if (p->reada == READA_FORWARD_ALWAYS)
+ reada_for_search(fs_info, p, level, slot, key->objectid);
+
/* first we do an atomic uptodate check */
if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
/*
@@ -2861,7 +1999,7 @@ int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
}
again:
- b = get_old_root(root, time_seq);
+ b = btrfs_get_old_root(root, time_seq);
if (!b) {
ret = -EIO;
goto done;
@@ -2916,7 +2054,7 @@ again:
level = btrfs_header_level(b);
btrfs_tree_read_lock(b);
- b = tree_mod_log_rewind(fs_info, p, b, time_seq);
+ b = btrfs_tree_mod_log_rewind(fs_info, p, b, time_seq);
if (!b) {
ret = -ENOMEM;
goto done;
@@ -3030,8 +2168,8 @@ static void fixup_low_keys(struct btrfs_path *path,
if (!path->nodes[i])
break;
t = path->nodes[i];
- ret = tree_mod_log_insert_key(t, tslot, MOD_LOG_KEY_REPLACE,
- GFP_ATOMIC);
+ ret = btrfs_tree_mod_log_insert_key(t, tslot,
+ BTRFS_MOD_LOG_KEY_REPLACE, GFP_ATOMIC);
BUG_ON(ret < 0);
btrfs_set_node_key(t, key, tslot);
btrfs_mark_buffer_dirty(path->nodes[i]);
@@ -3194,7 +2332,7 @@ static int push_node_left(struct btrfs_trans_handle *trans,
btrfs_abort_transaction(trans, ret);
return ret;
}
- ret = tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items);
+ ret = btrfs_tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items);
if (ret) {
btrfs_abort_transaction(trans, ret);
return ret;
@@ -3206,8 +2344,8 @@ static int push_node_left(struct btrfs_trans_handle *trans,
if (push_items < src_nritems) {
/*
- * Don't call tree_mod_log_insert_move here, key removal was
- * already fully logged by tree_mod_log_eb_copy above.
+ * Don't call btrfs_tree_mod_log_insert_move() here, key removal
+ * was already fully logged by btrfs_tree_mod_log_eb_copy() above.
*/
memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
btrfs_node_key_ptr_offset(push_items),
@@ -3268,15 +2406,15 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
btrfs_abort_transaction(trans, ret);
return ret;
}
- ret = tree_mod_log_insert_move(dst, push_items, 0, dst_nritems);
+ ret = btrfs_tree_mod_log_insert_move(dst, push_items, 0, dst_nritems);
BUG_ON(ret < 0);
memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
btrfs_node_key_ptr_offset(0),
(dst_nritems) *
sizeof(struct btrfs_key_ptr));
- ret = tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items,
- push_items);
+ ret = btrfs_tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items,
+ push_items);
if (ret) {
btrfs_abort_transaction(trans, ret);
return ret;
@@ -3342,7 +2480,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(c);
old = root->node;
- ret = tree_mod_log_insert_root(root->node, c, 0);
+ ret = btrfs_tree_mod_log_insert_root(root->node, c, false);
BUG_ON(ret < 0);
rcu_assign_pointer(root->node, c);
@@ -3381,8 +2519,8 @@ static void insert_ptr(struct btrfs_trans_handle *trans,
BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(trans->fs_info));
if (slot != nritems) {
if (level) {
- ret = tree_mod_log_insert_move(lower, slot + 1, slot,
- nritems - slot);
+ ret = btrfs_tree_mod_log_insert_move(lower, slot + 1,
+ slot, nritems - slot);
BUG_ON(ret < 0);
}
memmove_extent_buffer(lower,
@@ -3391,8 +2529,8 @@ static void insert_ptr(struct btrfs_trans_handle *trans,
(nritems - slot) * sizeof(struct btrfs_key_ptr));
}
if (level) {
- ret = tree_mod_log_insert_key(lower, slot, MOD_LOG_KEY_ADD,
- GFP_NOFS);
+ ret = btrfs_tree_mod_log_insert_key(lower, slot,
+ BTRFS_MOD_LOG_KEY_ADD, GFP_NOFS);
BUG_ON(ret < 0);
}
btrfs_set_node_key(lower, key, slot);
@@ -3433,9 +2571,9 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
* tree mod log: We don't log_removal old root in
* insert_new_root, because that root buffer will be kept as a
* normal node. We are going to log removal of half of the
- * elements below with tree_mod_log_eb_copy. We're holding a
- * tree lock on the buffer, which is why we cannot race with
- * other tree_mod_log users.
+ * elements below with btrfs_tree_mod_log_eb_copy(). We're
+ * holding a tree lock on the buffer, which is why we cannot
+ * race with other tree_mod_log users.
*/
ret = insert_new_root(trans, root, path, level + 1);
if (ret)
@@ -3462,7 +2600,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
root_add_used(root, fs_info->nodesize);
ASSERT(btrfs_header_level(c) == level);
- ret = tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid);
+ ret = btrfs_tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid);
if (ret) {
btrfs_abort_transaction(trans, ret);
return ret;
@@ -4844,8 +3982,8 @@ static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
nritems = btrfs_header_nritems(parent);
if (slot != nritems - 1) {
if (level) {
- ret = tree_mod_log_insert_move(parent, slot, slot + 1,
- nritems - slot - 1);
+ ret = btrfs_tree_mod_log_insert_move(parent, slot,
+ slot + 1, nritems - slot - 1);
BUG_ON(ret < 0);
}
memmove_extent_buffer(parent,
@@ -4854,8 +3992,8 @@ static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
sizeof(struct btrfs_key_ptr) *
(nritems - slot - 1));
} else if (level) {
- ret = tree_mod_log_insert_key(parent, slot, MOD_LOG_KEY_REMOVE,
- GFP_NOFS);
+ ret = btrfs_tree_mod_log_insert_key(parent, slot,
+ BTRFS_MOD_LOG_KEY_REMOVE, GFP_NOFS);
BUG_ON(ret < 0);
}
diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h
index 9ae776ab3967..9fb76829a281 100644
--- a/fs/btrfs/ctree.h
+++ b/fs/btrfs/ctree.h
@@ -342,6 +342,27 @@ struct btrfs_node {
struct btrfs_key_ptr ptrs[];
} __attribute__ ((__packed__));
+/* Read ahead values for struct btrfs_path.reada */
+enum {
+ READA_NONE,
+ READA_BACK,
+ READA_FORWARD,
+ /*
+ * Similar to READA_FORWARD but unlike it:
+ *
+ * 1) It will trigger readahead even for leaves that are not close to
+ * each other on disk;
+ * 2) It also triggers readahead for nodes;
+ * 3) During a search, even when a node or leaf is already in memory, it
+ * will still trigger readahead for other nodes and leaves that follow
+ * it.
+ *
+ * This is meant to be used only when we know we are iterating over the
+ * entire tree or a very large part of it.
+ */
+ READA_FORWARD_ALWAYS,
+};
+
/*
* btrfs_paths remember the path taken from the root down to the leaf.
* level 0 is always the leaf, and nodes[1...BTRFS_MAX_LEVEL] will point
@@ -350,7 +371,6 @@ struct btrfs_node {
* The slots array records the index of the item or block pointer
* used while walking the tree.
*/
-enum { READA_NONE, READA_BACK, READA_FORWARD };
struct btrfs_path {
struct extent_buffer *nodes[BTRFS_MAX_LEVEL];
int slots[BTRFS_MAX_LEVEL];
@@ -482,16 +502,6 @@ struct btrfs_discard_ctl {
atomic64_t discard_bytes_saved;
};
-/* delayed seq elem */
-struct seq_list {
- struct list_head list;
- u64 seq;
-};
-
-#define SEQ_LIST_INIT(name) { .list = LIST_HEAD_INIT((name).list), .seq = 0 }
-
-#define SEQ_LAST ((u64)-1)
-
enum btrfs_orphan_cleanup_state {
ORPHAN_CLEANUP_STARTED = 1,
ORPHAN_CLEANUP_DONE = 2,
@@ -572,6 +582,15 @@ enum {
/* Indicate that we can't trust the free space tree for caching yet */
BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED,
+
+ /* Indicate whether there are any tree modification log users */
+ BTRFS_FS_TREE_MOD_LOG_USERS,
+
+#if BITS_PER_LONG == 32
+ /* Indicate if we have error/warn message printed on 32bit systems */
+ BTRFS_FS_32BIT_ERROR,
+ BTRFS_FS_32BIT_WARN,
+#endif
};
/*
@@ -941,10 +960,16 @@ struct btrfs_fs_info {
struct work_struct async_data_reclaim_work;
struct work_struct preempt_reclaim_work;
+ /* Reclaim partially filled block groups in the background */
+ struct work_struct reclaim_bgs_work;
+ struct list_head reclaim_bgs;
+ int bg_reclaim_threshold;
+
spinlock_t unused_bgs_lock;
struct list_head unused_bgs;
struct mutex unused_bg_unpin_mutex;
- struct mutex delete_unused_bgs_mutex;
+ /* Protect block groups that are going to be deleted */
+ struct mutex reclaim_bgs_lock;
/* Cached block sizes */
u32 nodesize;
@@ -2691,7 +2716,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans);
int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
struct btrfs_ref *generic_ref);
-int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr);
void btrfs_clear_space_info_full(struct btrfs_fs_info *info);
/*
@@ -2929,13 +2953,6 @@ static inline void btrfs_clear_sb_rdonly(struct super_block *sb)
clear_bit(BTRFS_FS_STATE_RO, &btrfs_sb(sb)->fs_state);
}
-/* tree mod log functions from ctree.c */
-u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
- struct seq_list *elem);
-void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
- struct seq_list *elem);
-int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq);
-
/* root-item.c */
int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id,
u64 ref_id, u64 dirid, u64 sequence, const char *name,
@@ -3084,7 +3101,7 @@ u64 btrfs_file_extent_end(const struct btrfs_path *path);
blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
int mirror_num, unsigned long bio_flags);
int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset,
- struct page *page, u64 start, u64 end, int mirror);
+ struct page *page, u64 start, u64 end);
struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode,
u64 start, u64 len);
noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
@@ -3110,7 +3127,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode, u64 new_size,
u32 min_type);
-int btrfs_start_delalloc_snapshot(struct btrfs_root *root);
+int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context);
int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
bool in_reclaim_context);
int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
@@ -3179,6 +3196,7 @@ extern const struct iomap_dio_ops btrfs_dio_ops;
/* Inode locking type flags, by default the exclusive lock is taken */
#define BTRFS_ILOCK_SHARED (1U << 0)
#define BTRFS_ILOCK_TRY (1U << 1)
+#define BTRFS_ILOCK_MMAP (1U << 2)
int btrfs_inode_lock(struct inode *inode, unsigned int ilock_flags);
void btrfs_inode_unlock(struct inode *inode, unsigned int ilock_flags);
@@ -3189,6 +3207,9 @@ void btrfs_update_inode_bytes(struct btrfs_inode *inode,
/* ioctl.c */
long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
+int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa);
+int btrfs_fileattr_set(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct fileattr *fa);
int btrfs_ioctl_get_supported_features(void __user *arg);
void btrfs_sync_inode_flags_to_i_flags(struct inode *inode);
int __pure btrfs_is_empty_uuid(u8 *uuid);
@@ -3217,8 +3238,9 @@ extern const struct file_operations btrfs_file_operations;
int btrfs_drop_extents(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct btrfs_inode *inode,
struct btrfs_drop_extents_args *args);
-int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
- const u64 start, const u64 end,
+int btrfs_replace_file_extents(struct btrfs_inode *inode,
+ struct btrfs_path *path, const u64 start,
+ const u64 end,
struct btrfs_replace_extent_info *extent_info,
struct btrfs_trans_handle **trans_out);
int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
@@ -3405,6 +3427,19 @@ static inline void assertfail(const char *expr, const char* file, int line) { }
#define ASSERT(expr) (void)(expr)
#endif
+#if BITS_PER_LONG == 32
+#define BTRFS_32BIT_MAX_FILE_SIZE (((u64)ULONG_MAX + 1) << PAGE_SHIFT)
+/*
+ * The warning threshold is 5/8th of the MAX_LFS_FILESIZE that limits the logical
+ * addresses of extents.
+ *
+ * For 4K page size it's about 10T, for 64K it's 160T.
+ */
+#define BTRFS_32BIT_EARLY_WARN_THRESHOLD (BTRFS_32BIT_MAX_FILE_SIZE * 5 / 8)
+void btrfs_warn_32bit_limit(struct btrfs_fs_info *fs_info);
+void btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info);
+#endif
+
/*
* Get the correct offset inside the page of extent buffer.
*
@@ -3732,8 +3767,6 @@ static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info)
return signal_pending(current);
}
-#define in_range(b, first, len) ((b) >= (first) && (b) < (first) + (len))
-
/* Sanity test specific functions */
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
void btrfs_test_destroy_inode(struct inode *inode);
diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c
index bf25401c9768..1a88f6214ebc 100644
--- a/fs/btrfs/delayed-inode.c
+++ b/fs/btrfs/delayed-inode.c
@@ -602,7 +602,6 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root,
static int btrfs_delayed_inode_reserve_metadata(
struct btrfs_trans_handle *trans,
struct btrfs_root *root,
- struct btrfs_inode *inode,
struct btrfs_delayed_node *node)
{
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -633,32 +632,17 @@ static int btrfs_delayed_inode_reserve_metadata(
return ret;
ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes,
BTRFS_RESERVE_NO_FLUSH);
- /*
- * Since we're under a transaction reserve_metadata_bytes could
- * try to commit the transaction which will make it return
- * EAGAIN to make us stop the transaction we have, so return
- * ENOSPC instead so that btrfs_dirty_inode knows what to do.
- */
- if (ret == -EAGAIN) {
- ret = -ENOSPC;
- btrfs_qgroup_free_meta_prealloc(root, num_bytes);
- }
- if (!ret) {
- node->bytes_reserved = num_bytes;
- trace_btrfs_space_reservation(fs_info,
- "delayed_inode",
- btrfs_ino(inode),
- num_bytes, 1);
- } else {
+ /* NO_FLUSH could only fail with -ENOSPC */
+ ASSERT(ret == 0 || ret == -ENOSPC);
+ if (ret)
btrfs_qgroup_free_meta_prealloc(root, num_bytes);
- }
- return ret;
+ } else {
+ ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, true);
}
- ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, true);
if (!ret) {
trace_btrfs_space_reservation(fs_info, "delayed_inode",
- btrfs_ino(inode), num_bytes, 1);
+ node->inode_id, num_bytes, 1);
node->bytes_reserved = num_bytes;
}
@@ -1589,8 +1573,8 @@ bool btrfs_readdir_get_delayed_items(struct inode *inode,
* We can only do one readdir with delayed items at a time because of
* item->readdir_list.
*/
- inode_unlock_shared(inode);
- inode_lock(inode);
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
+ btrfs_inode_lock(inode, 0);
mutex_lock(&delayed_node->mutex);
item = __btrfs_first_delayed_insertion_item(delayed_node);
@@ -1833,8 +1817,7 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans,
goto release_node;
}
- ret = btrfs_delayed_inode_reserve_metadata(trans, root, inode,
- delayed_node);
+ ret = btrfs_delayed_inode_reserve_metadata(trans, root, delayed_node);
if (ret)
goto release_node;
diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c
index 63be7d01a9a3..c92d9d4f5f46 100644
--- a/fs/btrfs/delayed-ref.c
+++ b/fs/btrfs/delayed-ref.c
@@ -11,6 +11,7 @@
#include "transaction.h"
#include "qgroup.h"
#include "space-info.h"
+#include "tree-mod-log.h"
struct kmem_cache *btrfs_delayed_ref_head_cachep;
struct kmem_cache *btrfs_delayed_tree_ref_cachep;
@@ -494,16 +495,7 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
if (head->is_data)
return;
- read_lock(&fs_info->tree_mod_log_lock);
- if (!list_empty(&fs_info->tree_mod_seq_list)) {
- struct seq_list *elem;
-
- elem = list_first_entry(&fs_info->tree_mod_seq_list,
- struct seq_list, list);
- seq = elem->seq;
- }
- read_unlock(&fs_info->tree_mod_log_lock);
-
+ seq = btrfs_tree_mod_log_lowest_seq(fs_info);
again:
for (node = rb_first_cached(&head->ref_tree); node;
node = rb_next(node)) {
@@ -517,23 +509,16 @@ again:
int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq)
{
- struct seq_list *elem;
int ret = 0;
+ u64 min_seq = btrfs_tree_mod_log_lowest_seq(fs_info);
- read_lock(&fs_info->tree_mod_log_lock);
- if (!list_empty(&fs_info->tree_mod_seq_list)) {
- elem = list_first_entry(&fs_info->tree_mod_seq_list,
- struct seq_list, list);
- if (seq >= elem->seq) {
- btrfs_debug(fs_info,
- "holding back delayed_ref %#x.%x, lowest is %#x.%x",
- (u32)(seq >> 32), (u32)seq,
- (u32)(elem->seq >> 32), (u32)elem->seq);
- ret = 1;
- }
+ if (min_seq != 0 && seq >= min_seq) {
+ btrfs_debug(fs_info,
+ "holding back delayed_ref %llu, lowest is %llu",
+ seq, min_seq);
+ ret = 1;
}
- read_unlock(&fs_info->tree_mod_log_lock);
return ret;
}
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index 3a9c1e046ebe..d05f73530af7 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -81,6 +81,9 @@ int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info)
struct btrfs_dev_replace_item *ptr;
u64 src_devid;
+ if (!dev_root)
+ return 0;
+
path = btrfs_alloc_path();
if (!path) {
ret = -ENOMEM;
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 41b718cfea40..c9a3036c23bf 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -42,6 +42,7 @@
#include "discard.h"
#include "space-info.h"
#include "zoned.h"
+#include "subpage.h"
#define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\
BTRFS_HEADER_FLAG_RELOC |\
@@ -440,6 +441,74 @@ static int btree_read_extent_buffer_pages(struct extent_buffer *eb,
return ret;
}
+static int csum_one_extent_buffer(struct extent_buffer *eb)
+{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
+ u8 result[BTRFS_CSUM_SIZE];
+ int ret;
+
+ ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid,
+ offsetof(struct btrfs_header, fsid),
+ BTRFS_FSID_SIZE) == 0);
+ csum_tree_block(eb, result);
+
+ if (btrfs_header_level(eb))
+ ret = btrfs_check_node(eb);
+ else
+ ret = btrfs_check_leaf_full(eb);
+
+ if (ret < 0) {
+ btrfs_print_tree(eb, 0);
+ btrfs_err(fs_info,
+ "block=%llu write time tree block corruption detected",
+ eb->start);
+ WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
+ return ret;
+ }
+ write_extent_buffer(eb, result, 0, fs_info->csum_size);
+
+ return 0;
+}
+
+/* Checksum all dirty extent buffers in one bio_vec */
+static int csum_dirty_subpage_buffers(struct btrfs_fs_info *fs_info,
+ struct bio_vec *bvec)
+{
+ struct page *page = bvec->bv_page;
+ u64 bvec_start = page_offset(page) + bvec->bv_offset;
+ u64 cur;
+ int ret = 0;
+
+ for (cur = bvec_start; cur < bvec_start + bvec->bv_len;
+ cur += fs_info->nodesize) {
+ struct extent_buffer *eb;
+ bool uptodate;
+
+ eb = find_extent_buffer(fs_info, cur);
+ uptodate = btrfs_subpage_test_uptodate(fs_info, page, cur,
+ fs_info->nodesize);
+
+ /* A dirty eb shouldn't disappear from buffer_radix */
+ if (WARN_ON(!eb))
+ return -EUCLEAN;
+
+ if (WARN_ON(cur != btrfs_header_bytenr(eb))) {
+ free_extent_buffer(eb);
+ return -EUCLEAN;
+ }
+ if (WARN_ON(!uptodate)) {
+ free_extent_buffer(eb);
+ return -EUCLEAN;
+ }
+
+ ret = csum_one_extent_buffer(eb);
+ free_extent_buffer(eb);
+ if (ret < 0)
+ return ret;
+ }
+ return ret;
+}
+
/*
* Checksum a dirty tree block before IO. This has extra checks to make sure
* we only fill in the checksum field in the first page of a multi-page block.
@@ -450,9 +519,10 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct bio_vec *bvec
struct page *page = bvec->bv_page;
u64 start = page_offset(page);
u64 found_start;
- u8 result[BTRFS_CSUM_SIZE];
struct extent_buffer *eb;
- int ret;
+
+ if (fs_info->sectorsize < PAGE_SIZE)
+ return csum_dirty_subpage_buffers(fs_info, bvec);
eb = (struct extent_buffer *)page->private;
if (page != eb->pages[0])
@@ -474,28 +544,7 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct bio_vec *bvec
if (WARN_ON(!PageUptodate(page)))
return -EUCLEAN;
- ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid,
- offsetof(struct btrfs_header, fsid),
- BTRFS_FSID_SIZE) == 0);
-
- csum_tree_block(eb, result);
-
- if (btrfs_header_level(eb))
- ret = btrfs_check_node(eb);
- else
- ret = btrfs_check_leaf_full(eb);
-
- if (ret < 0) {
- btrfs_print_tree(eb, 0);
- btrfs_err(fs_info,
- "block=%llu write time tree block corruption detected",
- eb->start);
- WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
- return ret;
- }
- write_extent_buffer(eb, result, 0, fs_info->csum_size);
-
- return 0;
+ return csum_one_extent_buffer(eb);
}
static int check_tree_block_fsid(struct extent_buffer *eb)
@@ -992,14 +1041,48 @@ static void btree_invalidatepage(struct page *page, unsigned int offset,
static int btree_set_page_dirty(struct page *page)
{
#ifdef DEBUG
+ struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
+ struct btrfs_subpage *subpage;
struct extent_buffer *eb;
+ int cur_bit = 0;
+ u64 page_start = page_offset(page);
+
+ if (fs_info->sectorsize == PAGE_SIZE) {
+ BUG_ON(!PagePrivate(page));
+ eb = (struct extent_buffer *)page->private;
+ BUG_ON(!eb);
+ BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
+ BUG_ON(!atomic_read(&eb->refs));
+ btrfs_assert_tree_locked(eb);
+ return __set_page_dirty_nobuffers(page);
+ }
+ ASSERT(PagePrivate(page) && page->private);
+ subpage = (struct btrfs_subpage *)page->private;
+
+ ASSERT(subpage->dirty_bitmap);
+ while (cur_bit < BTRFS_SUBPAGE_BITMAP_SIZE) {
+ unsigned long flags;
+ u64 cur;
+ u16 tmp = (1 << cur_bit);
+
+ spin_lock_irqsave(&subpage->lock, flags);
+ if (!(tmp & subpage->dirty_bitmap)) {
+ spin_unlock_irqrestore(&subpage->lock, flags);
+ cur_bit++;
+ continue;
+ }
+ spin_unlock_irqrestore(&subpage->lock, flags);
+ cur = page_start + cur_bit * fs_info->sectorsize;
- BUG_ON(!PagePrivate(page));
- eb = (struct extent_buffer *)page->private;
- BUG_ON(!eb);
- BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
- BUG_ON(!atomic_read(&eb->refs));
- btrfs_assert_tree_locked(eb);
+ eb = find_extent_buffer(fs_info, cur);
+ ASSERT(eb);
+ ASSERT(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags));
+ ASSERT(atomic_read(&eb->refs));
+ btrfs_assert_tree_locked(eb);
+ free_extent_buffer(eb);
+
+ cur_bit += (fs_info->nodesize >> fs_info->sectorsize_bits);
+ }
#endif
return __set_page_dirty_nobuffers(page);
}
@@ -1807,14 +1890,21 @@ static int cleaner_kthread(void *arg)
btrfs_run_defrag_inodes(fs_info);
/*
- * Acquires fs_info->delete_unused_bgs_mutex to avoid racing
+ * Acquires fs_info->reclaim_bgs_lock to avoid racing
* with relocation (btrfs_relocate_chunk) and relocation
* acquires fs_info->cleaner_mutex (btrfs_relocate_block_group)
- * after acquiring fs_info->delete_unused_bgs_mutex. So we
+ * after acquiring fs_info->reclaim_bgs_lock. So we
* can't hold, nor need to, fs_info->cleaner_mutex when deleting
* unused block groups.
*/
btrfs_delete_unused_bgs(fs_info);
+
+ /*
+ * Reclaim block groups in the reclaim_bgs list after we deleted
+ * all unused block_groups. This possibly gives us some more free
+ * space.
+ */
+ btrfs_reclaim_bgs(fs_info);
sleep:
clear_and_wake_up_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags);
if (kthread_should_park())
@@ -2387,8 +2477,9 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info)
} else {
set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state);
fs_info->dev_root = root;
- btrfs_init_devices_late(fs_info);
}
+ /* Initialize fs_info for all devices in any case */
+ btrfs_init_devices_late(fs_info);
/* If IGNOREDATACSUMS is set don't bother reading the csum root. */
if (!btrfs_test_opt(fs_info, IGNOREDATACSUMS)) {
@@ -2792,7 +2883,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
spin_lock_init(&fs_info->treelog_bg_lock);
rwlock_init(&fs_info->tree_mod_log_lock);
mutex_init(&fs_info->unused_bg_unpin_mutex);
- mutex_init(&fs_info->delete_unused_bgs_mutex);
+ mutex_init(&fs_info->reclaim_bgs_lock);
mutex_init(&fs_info->reloc_mutex);
mutex_init(&fs_info->delalloc_root_mutex);
mutex_init(&fs_info->zoned_meta_io_lock);
@@ -2802,6 +2893,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
INIT_LIST_HEAD(&fs_info->space_info);
INIT_LIST_HEAD(&fs_info->tree_mod_seq_list);
INIT_LIST_HEAD(&fs_info->unused_bgs);
+ INIT_LIST_HEAD(&fs_info->reclaim_bgs);
#ifdef CONFIG_BTRFS_DEBUG
INIT_LIST_HEAD(&fs_info->allocated_roots);
INIT_LIST_HEAD(&fs_info->allocated_ebs);
@@ -2890,6 +2982,9 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
fs_info->swapfile_pins = RB_ROOT;
fs_info->send_in_progress = 0;
+
+ fs_info->bg_reclaim_threshold = BTRFS_DEFAULT_RECLAIM_THRESH;
+ INIT_WORK(&fs_info->reclaim_bgs_work, btrfs_reclaim_bgs_work);
}
static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block *sb)
@@ -3009,6 +3104,21 @@ int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info)
}
}
+ /*
+ * btrfs_find_orphan_roots() is responsible for finding all the dead
+ * roots (with 0 refs), flag them with BTRFS_ROOT_DEAD_TREE and load
+ * them into the fs_info->fs_roots_radix tree. This must be done before
+ * calling btrfs_orphan_cleanup() on the tree root. If we don't do it
+ * first, then btrfs_orphan_cleanup() will delete a dead root's orphan
+ * item before the root's tree is deleted - this means that if we unmount
+ * or crash before the deletion completes, on the next mount we will not
+ * delete what remains of the tree because the orphan item does not
+ * exists anymore, which is what tells us we have a pending deletion.
+ */
+ ret = btrfs_find_orphan_roots(fs_info);
+ if (ret)
+ goto out;
+
ret = btrfs_cleanup_fs_roots(fs_info);
if (ret)
goto out;
@@ -3068,7 +3178,6 @@ int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info)
}
}
- ret = btrfs_find_orphan_roots(fs_info);
out:
return ret;
}
@@ -4234,6 +4343,8 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info)
cancel_work_sync(&fs_info->async_data_reclaim_work);
cancel_work_sync(&fs_info->preempt_reclaim_work);
+ cancel_work_sync(&fs_info->reclaim_bgs_work);
+
/* Cancel or finish ongoing discard work */
btrfs_discard_cleanup(fs_info);
diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c
index 36a3c973fda1..3d5c35e4cb76 100644
--- a/fs/btrfs/extent-tree.c
+++ b/fs/btrfs/extent-tree.c
@@ -1340,12 +1340,16 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
stripe = bbio->stripes;
for (i = 0; i < bbio->num_stripes; i++, stripe++) {
u64 bytes;
+ struct btrfs_device *device = stripe->dev;
- if (!stripe->dev->bdev) {
+ if (!device->bdev) {
ASSERT(btrfs_test_opt(fs_info, DEGRADED));
continue;
}
+ if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
+ continue;
+
ret = do_discard_extent(stripe, &bytes);
if (!ret) {
discarded_bytes += bytes;
@@ -1864,7 +1868,7 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans,
trace_run_delayed_ref_head(fs_info, head, 0);
btrfs_delayed_ref_unlock(head);
btrfs_put_delayed_ref_head(head);
- return 0;
+ return ret;
}
static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head(
@@ -2490,19 +2494,6 @@ int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
}
-int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
-{
- struct btrfs_block_group *block_group;
- int readonly = 0;
-
- block_group = btrfs_lookup_block_group(fs_info, bytenr);
- if (!block_group || block_group->ro)
- readonly = 1;
- if (block_group)
- btrfs_put_block_group(block_group);
- return readonly;
-}
-
static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data)
{
struct btrfs_fs_info *fs_info = root->fs_info;
@@ -3355,11 +3346,9 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
* find a node pointing to this leaf and record operations that
* point to this leaf.
*/
- if (btrfs_header_level(buf) == 0) {
- read_lock(&fs_info->tree_mod_log_lock);
- must_pin = !list_empty(&fs_info->tree_mod_seq_list);
- read_unlock(&fs_info->tree_mod_log_lock);
- }
+ if (btrfs_header_level(buf) == 0 &&
+ test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
+ must_pin = true;
if (must_pin || btrfs_is_zoned(fs_info)) {
btrfs_redirty_list_add(trans->transaction, buf);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 910769d5fcdb..dee2dafbc872 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -13,6 +13,7 @@
#include <linux/pagevec.h>
#include <linux/prefetch.h>
#include <linux/cleancache.h>
+#include "misc.h"
#include "extent_io.h"
#include "extent-io-tree.h"
#include "extent_map.h"
@@ -2983,8 +2984,7 @@ static void end_bio_extent_readpage(struct bio *bio)
if (likely(uptodate)) {
if (is_data_inode(inode))
ret = btrfs_verify_data_csum(io_bio,
- bio_offset, page, start, end,
- mirror);
+ bio_offset, page, start, end);
else
ret = btrfs_validate_metadata_buffer(io_bio,
page, start, end, mirror);
@@ -3421,15 +3421,12 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
}
if (page->index == last_byte >> PAGE_SHIFT) {
- char *userpage;
size_t zero_offset = offset_in_page(last_byte);
if (zero_offset) {
iosize = PAGE_SIZE - zero_offset;
- userpage = kmap_atomic(page);
- memset(userpage + zero_offset, 0, iosize);
+ memzero_page(page, zero_offset, iosize);
flush_dcache_page(page);
- kunmap_atomic(userpage);
}
}
begin_page_read(fs_info, page);
@@ -3438,14 +3435,11 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
u64 disk_bytenr;
if (cur >= last_byte) {
- char *userpage;
struct extent_state *cached = NULL;
iosize = PAGE_SIZE - pg_offset;
- userpage = kmap_atomic(page);
- memset(userpage + pg_offset, 0, iosize);
+ memzero_page(page, pg_offset, iosize);
flush_dcache_page(page);
- kunmap_atomic(userpage);
set_extent_uptodate(tree, cur, cur + iosize - 1,
&cached, GFP_NOFS);
unlock_extent_cached(tree, cur,
@@ -3528,13 +3522,10 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
/* we've found a hole, just zero and go on */
if (block_start == EXTENT_MAP_HOLE) {
- char *userpage;
struct extent_state *cached = NULL;
- userpage = kmap_atomic(page);
- memset(userpage + pg_offset, 0, iosize);
+ memzero_page(page, pg_offset, iosize);
flush_dcache_page(page);
- kunmap_atomic(userpage);
set_extent_uptodate(tree, cur, cur + iosize - 1,
&cached, GFP_NOFS);
@@ -3762,7 +3753,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode,
/* Note that em_end from extent_map_end() is exclusive */
iosize = min(em_end, end + 1) - cur;
- if (btrfs_use_zone_append(inode, em))
+ if (btrfs_use_zone_append(inode, em->block_start))
opf = REQ_OP_ZONE_APPEND;
free_extent_map(em);
@@ -3845,12 +3836,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc,
}
if (page->index == end_index) {
- char *userpage;
-
- userpage = kmap_atomic(page);
- memset(userpage + pg_offset, 0,
- PAGE_SIZE - pg_offset);
- kunmap_atomic(userpage);
+ memzero_page(page, pg_offset, PAGE_SIZE - pg_offset);
flush_dcache_page(page);
}
@@ -3967,7 +3953,13 @@ static noinline_for_stack int lock_extent_buffer_for_io(struct extent_buffer *eb
btrfs_tree_unlock(eb);
- if (!ret)
+ /*
+ * Either we don't need to submit any tree block, or we're submitting
+ * subpage eb.
+ * Subpage metadata doesn't use page locking at all, so we can skip
+ * the page locking.
+ */
+ if (!ret || fs_info->sectorsize < PAGE_SIZE)
return ret;
num_pages = num_extent_pages(eb);
@@ -4012,12 +4004,11 @@ err_unlock:
return ret;
}
-static void set_btree_ioerr(struct page *page)
+static void set_btree_ioerr(struct page *page, struct extent_buffer *eb)
{
- struct extent_buffer *eb = (struct extent_buffer *)page->private;
- struct btrfs_fs_info *fs_info;
+ struct btrfs_fs_info *fs_info = eb->fs_info;
- SetPageError(page);
+ btrfs_page_set_error(fs_info, page, eb->start, eb->len);
if (test_and_set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags))
return;
@@ -4025,7 +4016,6 @@ static void set_btree_ioerr(struct page *page)
* If we error out, we should add back the dirty_metadata_bytes
* to make it consistent.
*/
- fs_info = eb->fs_info;
percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
eb->len, fs_info->dirty_metadata_batch);
@@ -4069,26 +4059,111 @@ static void set_btree_ioerr(struct page *page)
*/
switch (eb->log_index) {
case -1:
- set_bit(BTRFS_FS_BTREE_ERR, &eb->fs_info->flags);
+ set_bit(BTRFS_FS_BTREE_ERR, &fs_info->flags);
break;
case 0:
- set_bit(BTRFS_FS_LOG1_ERR, &eb->fs_info->flags);
+ set_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags);
break;
case 1:
- set_bit(BTRFS_FS_LOG2_ERR, &eb->fs_info->flags);
+ set_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags);
break;
default:
BUG(); /* unexpected, logic error */
}
}
+/*
+ * The endio specific version which won't touch any unsafe spinlock in endio
+ * context.
+ */
+static struct extent_buffer *find_extent_buffer_nolock(
+ struct btrfs_fs_info *fs_info, u64 start)
+{
+ struct extent_buffer *eb;
+
+ rcu_read_lock();
+ eb = radix_tree_lookup(&fs_info->buffer_radix,
+ start >> fs_info->sectorsize_bits);
+ if (eb && atomic_inc_not_zero(&eb->refs)) {
+ rcu_read_unlock();
+ return eb;
+ }
+ rcu_read_unlock();
+ return NULL;
+}
+
+/*
+ * The endio function for subpage extent buffer write.
+ *
+ * Unlike end_bio_extent_buffer_writepage(), we only call end_page_writeback()
+ * after all extent buffers in the page has finished their writeback.
+ */
+static void end_bio_subpage_eb_writepage(struct btrfs_fs_info *fs_info,
+ struct bio *bio)
+{
+ struct bio_vec *bvec;
+ struct bvec_iter_all iter_all;
+
+ ASSERT(!bio_flagged(bio, BIO_CLONED));
+ bio_for_each_segment_all(bvec, bio, iter_all) {
+ struct page *page = bvec->bv_page;
+ u64 bvec_start = page_offset(page) + bvec->bv_offset;
+ u64 bvec_end = bvec_start + bvec->bv_len - 1;
+ u64 cur_bytenr = bvec_start;
+
+ ASSERT(IS_ALIGNED(bvec->bv_len, fs_info->nodesize));
+
+ /* Iterate through all extent buffers in the range */
+ while (cur_bytenr <= bvec_end) {
+ struct extent_buffer *eb;
+ int done;
+
+ /*
+ * Here we can't use find_extent_buffer(), as it may
+ * try to lock eb->refs_lock, which is not safe in endio
+ * context.
+ */
+ eb = find_extent_buffer_nolock(fs_info, cur_bytenr);
+ ASSERT(eb);
+
+ cur_bytenr = eb->start + eb->len;
+
+ ASSERT(test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags));
+ done = atomic_dec_and_test(&eb->io_pages);
+ ASSERT(done);
+
+ if (bio->bi_status ||
+ test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) {
+ ClearPageUptodate(page);
+ set_btree_ioerr(page, eb);
+ }
+
+ btrfs_subpage_clear_writeback(fs_info, page, eb->start,
+ eb->len);
+ end_extent_buffer_writeback(eb);
+ /*
+ * free_extent_buffer() will grab spinlock which is not
+ * safe in endio context. Thus here we manually dec
+ * the ref.
+ */
+ atomic_dec(&eb->refs);
+ }
+ }
+ bio_put(bio);
+}
+
static void end_bio_extent_buffer_writepage(struct bio *bio)
{
+ struct btrfs_fs_info *fs_info;
struct bio_vec *bvec;
struct extent_buffer *eb;
int done;
struct bvec_iter_all iter_all;
+ fs_info = btrfs_sb(bio_first_page_all(bio)->mapping->host->i_sb);
+ if (fs_info->sectorsize < PAGE_SIZE)
+ return end_bio_subpage_eb_writepage(fs_info, bio);
+
ASSERT(!bio_flagged(bio, BIO_CLONED));
bio_for_each_segment_all(bvec, bio, iter_all) {
struct page *page = bvec->bv_page;
@@ -4100,7 +4175,7 @@ static void end_bio_extent_buffer_writepage(struct bio *bio)
if (bio->bi_status ||
test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) {
ClearPageUptodate(page);
- set_btree_ioerr(page);
+ set_btree_ioerr(page, eb);
}
end_page_writeback(page);
@@ -4114,6 +4189,56 @@ static void end_bio_extent_buffer_writepage(struct bio *bio)
bio_put(bio);
}
+/*
+ * Unlike the work in write_one_eb(), we rely completely on extent locking.
+ * Page locking is only utilized at minimum to keep the VMM code happy.
+ *
+ * Caller should still call write_one_eb() other than this function directly.
+ * As write_one_eb() has extra preparation before submitting the extent buffer.
+ */
+static int write_one_subpage_eb(struct extent_buffer *eb,
+ struct writeback_control *wbc,
+ struct extent_page_data *epd)
+{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
+ struct page *page = eb->pages[0];
+ unsigned int write_flags = wbc_to_write_flags(wbc) | REQ_META;
+ bool no_dirty_ebs = false;
+ int ret;
+
+ /* clear_page_dirty_for_io() in subpage helper needs page locked */
+ lock_page(page);
+ btrfs_subpage_set_writeback(fs_info, page, eb->start, eb->len);
+
+ /* Check if this is the last dirty bit to update nr_written */
+ no_dirty_ebs = btrfs_subpage_clear_and_test_dirty(fs_info, page,
+ eb->start, eb->len);
+ if (no_dirty_ebs)
+ clear_page_dirty_for_io(page);
+
+ ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc, page,
+ eb->start, eb->len, eb->start - page_offset(page),
+ &epd->bio, end_bio_extent_buffer_writepage, 0, 0, 0,
+ false);
+ if (ret) {
+ btrfs_subpage_clear_writeback(fs_info, page, eb->start, eb->len);
+ set_btree_ioerr(page, eb);
+ unlock_page(page);
+
+ if (atomic_dec_and_test(&eb->io_pages))
+ end_extent_buffer_writeback(eb);
+ return -EIO;
+ }
+ unlock_page(page);
+ /*
+ * Submission finished without problem, if no range of the page is
+ * dirty anymore, we have submitted a page. Update nr_written in wbc.
+ */
+ if (no_dirty_ebs)
+ update_nr_written(wbc, 1);
+ return ret;
+}
+
static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
struct writeback_control *wbc,
struct extent_page_data *epd)
@@ -4145,6 +4270,9 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
memzero_extent_buffer(eb, start, end - start);
}
+ if (eb->fs_info->sectorsize < PAGE_SIZE)
+ return write_one_subpage_eb(eb, wbc, epd);
+
for (i = 0; i < num_pages; i++) {
struct page *p = eb->pages[i];
@@ -4156,7 +4284,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
end_bio_extent_buffer_writepage,
0, 0, 0, false);
if (ret) {
- set_btree_ioerr(p);
+ set_btree_ioerr(p, eb);
if (PageWriteback(p))
end_page_writeback(p);
if (atomic_sub_and_test(num_pages - i, &eb->io_pages))
@@ -4181,6 +4309,98 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb,
}
/*
+ * Submit one subpage btree page.
+ *
+ * The main difference to submit_eb_page() is:
+ * - Page locking
+ * For subpage, we don't rely on page locking at all.
+ *
+ * - Flush write bio
+ * We only flush bio if we may be unable to fit current extent buffers into
+ * current bio.
+ *
+ * Return >=0 for the number of submitted extent buffers.
+ * Return <0 for fatal error.
+ */
+static int submit_eb_subpage(struct page *page,
+ struct writeback_control *wbc,
+ struct extent_page_data *epd)
+{
+ struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
+ int submitted = 0;
+ u64 page_start = page_offset(page);
+ int bit_start = 0;
+ const int nbits = BTRFS_SUBPAGE_BITMAP_SIZE;
+ int sectors_per_node = fs_info->nodesize >> fs_info->sectorsize_bits;
+ int ret;
+
+ /* Lock and write each dirty extent buffers in the range */
+ while (bit_start < nbits) {
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ struct extent_buffer *eb;
+ unsigned long flags;
+ u64 start;
+
+ /*
+ * Take private lock to ensure the subpage won't be detached
+ * in the meantime.
+ */
+ spin_lock(&page->mapping->private_lock);
+ if (!PagePrivate(page)) {
+ spin_unlock(&page->mapping->private_lock);
+ break;
+ }
+ spin_lock_irqsave(&subpage->lock, flags);
+ if (!((1 << bit_start) & subpage->dirty_bitmap)) {
+ spin_unlock_irqrestore(&subpage->lock, flags);
+ spin_unlock(&page->mapping->private_lock);
+ bit_start++;
+ continue;
+ }
+
+ start = page_start + bit_start * fs_info->sectorsize;
+ bit_start += sectors_per_node;
+
+ /*
+ * Here we just want to grab the eb without touching extra
+ * spin locks, so call find_extent_buffer_nolock().
+ */
+ eb = find_extent_buffer_nolock(fs_info, start);
+ spin_unlock_irqrestore(&subpage->lock, flags);
+ spin_unlock(&page->mapping->private_lock);
+
+ /*
+ * The eb has already reached 0 refs thus find_extent_buffer()
+ * doesn't return it. We don't need to write back such eb
+ * anyway.
+ */
+ if (!eb)
+ continue;
+
+ ret = lock_extent_buffer_for_io(eb, epd);
+ if (ret == 0) {
+ free_extent_buffer(eb);
+ continue;
+ }
+ if (ret < 0) {
+ free_extent_buffer(eb);
+ goto cleanup;
+ }
+ ret = write_one_eb(eb, wbc, epd);
+ free_extent_buffer(eb);
+ if (ret < 0)
+ goto cleanup;
+ submitted++;
+ }
+ return submitted;
+
+cleanup:
+ /* We hit error, end bio for the submitted extent buffers */
+ end_write_bio(epd, ret);
+ return ret;
+}
+
+/*
* Submit all page(s) of one extent buffer.
*
* @page: the page of one extent buffer
@@ -4212,6 +4432,9 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc,
if (!PagePrivate(page))
return 0;
+ if (btrfs_sb(page->mapping->host->i_sb)->sectorsize < PAGE_SIZE)
+ return submit_eb_subpage(page, wbc, epd);
+
spin_lock(&mapping->private_lock);
if (!PagePrivate(page)) {
spin_unlock(&mapping->private_lock);
@@ -4652,10 +4875,8 @@ void extent_readahead(struct readahead_control *rac)
int nr;
while ((nr = readahead_page_batch(rac, pagepool))) {
- u64 contig_start = page_offset(pagepool[0]);
- u64 contig_end = page_offset(pagepool[nr - 1]) + PAGE_SIZE - 1;
-
- ASSERT(contig_start + nr * PAGE_SIZE - 1 == contig_end);
+ u64 contig_start = readahead_pos(rac);
+ u64 contig_end = contig_start + readahead_batch_length(rac) - 1;
contiguous_readpages(pagepool, nr, contig_start, contig_end,
&em_cached, &bio, &bio_flags, &prev_em_start);
@@ -4975,7 +5196,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
u64 start, u64 len)
{
int ret = 0;
- u64 off = start;
+ u64 off;
u64 max = start + len;
u32 flags = 0;
u32 found_type;
@@ -5010,6 +5231,11 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo,
goto out_free_ulist;
}
+ /*
+ * We can't initialize that to 'start' as this could miss extents due
+ * to extent item merging
+ */
+ off = 0;
start = round_down(start, btrfs_inode_sectorsize(inode));
len = round_up(max, btrfs_inode_sectorsize(inode)) - start;
@@ -5469,36 +5695,28 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info,
{
struct extent_buffer *eb;
- rcu_read_lock();
- eb = radix_tree_lookup(&fs_info->buffer_radix,
- start >> fs_info->sectorsize_bits);
- if (eb && atomic_inc_not_zero(&eb->refs)) {
- rcu_read_unlock();
- /*
- * Lock our eb's refs_lock to avoid races with
- * free_extent_buffer. When we get our eb it might be flagged
- * with EXTENT_BUFFER_STALE and another task running
- * free_extent_buffer might have seen that flag set,
- * eb->refs == 2, that the buffer isn't under IO (dirty and
- * writeback flags not set) and it's still in the tree (flag
- * EXTENT_BUFFER_TREE_REF set), therefore being in the process
- * of decrementing the extent buffer's reference count twice.
- * So here we could race and increment the eb's reference count,
- * clear its stale flag, mark it as dirty and drop our reference
- * before the other task finishes executing free_extent_buffer,
- * which would later result in an attempt to free an extent
- * buffer that is dirty.
- */
- if (test_bit(EXTENT_BUFFER_STALE, &eb->bflags)) {
- spin_lock(&eb->refs_lock);
- spin_unlock(&eb->refs_lock);
- }
- mark_extent_buffer_accessed(eb, NULL);
- return eb;
+ eb = find_extent_buffer_nolock(fs_info, start);
+ if (!eb)
+ return NULL;
+ /*
+ * Lock our eb's refs_lock to avoid races with free_extent_buffer().
+ * When we get our eb it might be flagged with EXTENT_BUFFER_STALE and
+ * another task running free_extent_buffer() might have seen that flag
+ * set, eb->refs == 2, that the buffer isn't under IO (dirty and
+ * writeback flags not set) and it's still in the tree (flag
+ * EXTENT_BUFFER_TREE_REF set), therefore being in the process of
+ * decrementing the extent buffer's reference count twice. So here we
+ * could race and increment the eb's reference count, clear its stale
+ * flag, mark it as dirty and drop our reference before the other task
+ * finishes executing free_extent_buffer, which would later result in
+ * an attempt to free an extent buffer that is dirty.
+ */
+ if (test_bit(EXTENT_BUFFER_STALE, &eb->bflags)) {
+ spin_lock(&eb->refs_lock);
+ spin_unlock(&eb->refs_lock);
}
- rcu_read_unlock();
-
- return NULL;
+ mark_extent_buffer_accessed(eb, NULL);
+ return eb;
}
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
@@ -5594,6 +5812,17 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
return ERR_PTR(-EINVAL);
}
+#if BITS_PER_LONG == 32
+ if (start >= MAX_LFS_FILESIZE) {
+ btrfs_err_rl(fs_info,
+ "extent buffer %llu is beyond 32bit page cache limit", start);
+ btrfs_err_32bit_limit(fs_info);
+ return ERR_PTR(-EOVERFLOW);
+ }
+ if (start >= BTRFS_32BIT_EARLY_WARN_THRESHOLD)
+ btrfs_warn_32bit_limit(fs_info);
+#endif
+
if (fs_info->sectorsize < PAGE_SIZE &&
offset_in_page(start) + len > PAGE_SIZE) {
btrfs_err(fs_info,
@@ -5665,7 +5894,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info,
btrfs_page_inc_eb_refs(fs_info, p);
spin_unlock(&mapping->private_lock);
- WARN_ON(PageDirty(p));
+ WARN_ON(btrfs_page_test_dirty(fs_info, p, eb->start, eb->len));
eb->pages[i] = p;
if (!PageUptodate(p))
uptodate = 0;
@@ -5814,28 +6043,51 @@ void free_extent_buffer_stale(struct extent_buffer *eb)
release_extent_buffer(eb);
}
+static void btree_clear_page_dirty(struct page *page)
+{
+ ASSERT(PageDirty(page));
+ ASSERT(PageLocked(page));
+ clear_page_dirty_for_io(page);
+ xa_lock_irq(&page->mapping->i_pages);
+ if (!PageDirty(page))
+ __xa_clear_mark(&page->mapping->i_pages,
+ page_index(page), PAGECACHE_TAG_DIRTY);
+ xa_unlock_irq(&page->mapping->i_pages);
+}
+
+static void clear_subpage_extent_buffer_dirty(const struct extent_buffer *eb)
+{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
+ struct page *page = eb->pages[0];
+ bool last;
+
+ /* btree_clear_page_dirty() needs page locked */
+ lock_page(page);
+ last = btrfs_subpage_clear_and_test_dirty(fs_info, page, eb->start,
+ eb->len);
+ if (last)
+ btree_clear_page_dirty(page);
+ unlock_page(page);
+ WARN_ON(atomic_read(&eb->refs) == 0);
+}
+
void clear_extent_buffer_dirty(const struct extent_buffer *eb)
{
int i;
int num_pages;
struct page *page;
+ if (eb->fs_info->sectorsize < PAGE_SIZE)
+ return clear_subpage_extent_buffer_dirty(eb);
+
num_pages = num_extent_pages(eb);
for (i = 0; i < num_pages; i++) {
page = eb->pages[i];
if (!PageDirty(page))
continue;
-
lock_page(page);
- WARN_ON(!PagePrivate(page));
-
- clear_page_dirty_for_io(page);
- xa_lock_irq(&page->mapping->i_pages);
- if (!PageDirty(page))
- __xa_clear_mark(&page->mapping->i_pages,
- page_index(page), PAGECACHE_TAG_DIRTY);
- xa_unlock_irq(&page->mapping->i_pages);
+ btree_clear_page_dirty(page);
ClearPageError(page);
unlock_page(page);
}
@@ -5856,10 +6108,28 @@ bool set_extent_buffer_dirty(struct extent_buffer *eb)
WARN_ON(atomic_read(&eb->refs) == 0);
WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags));
- if (!was_dirty)
- for (i = 0; i < num_pages; i++)
- set_page_dirty(eb->pages[i]);
+ if (!was_dirty) {
+ bool subpage = eb->fs_info->sectorsize < PAGE_SIZE;
+ /*
+ * For subpage case, we can have other extent buffers in the
+ * same page, and in clear_subpage_extent_buffer_dirty() we
+ * have to clear page dirty without subpage lock held.
+ * This can cause race where our page gets dirty cleared after
+ * we just set it.
+ *
+ * Thankfully, clear_subpage_extent_buffer_dirty() has locked
+ * its page for other reasons, we can use page lock to prevent
+ * the above race.
+ */
+ if (subpage)
+ lock_page(eb->pages[0]);
+ for (i = 0; i < num_pages; i++)
+ btrfs_page_set_dirty(eb->fs_info, eb->pages[i],
+ eb->start, eb->len);
+ if (subpage)
+ unlock_page(eb->pages[0]);
+ }
#ifdef CONFIG_BTRFS_DEBUG
for (i = 0; i < num_pages; i++)
ASSERT(PageDirty(eb->pages[i]));
@@ -6217,12 +6487,34 @@ int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv,
return ret;
}
+/*
+ * Check that the extent buffer is uptodate.
+ *
+ * For regular sector size == PAGE_SIZE case, check if @page is uptodate.
+ * For subpage case, check if the range covered by the eb has EXTENT_UPTODATE.
+ */
+static void assert_eb_page_uptodate(const struct extent_buffer *eb,
+ struct page *page)
+{
+ struct btrfs_fs_info *fs_info = eb->fs_info;
+
+ if (fs_info->sectorsize < PAGE_SIZE) {
+ bool uptodate;
+
+ uptodate = btrfs_subpage_test_uptodate(fs_info, page,
+ eb->start, eb->len);
+ WARN_ON(!uptodate);
+ } else {
+ WARN_ON(!PageUptodate(page));
+ }
+}
+
void write_extent_buffer_chunk_tree_uuid(const struct extent_buffer *eb,
const void *srcv)
{
char *kaddr;
- WARN_ON(!PageUptodate(eb->pages[0]));
+ assert_eb_page_uptodate(eb, eb->pages[0]);
kaddr = page_address(eb->pages[0]) + get_eb_offset_in_page(eb, 0);
memcpy(kaddr + offsetof(struct btrfs_header, chunk_tree_uuid), srcv,
BTRFS_FSID_SIZE);
@@ -6232,7 +6524,7 @@ void write_extent_buffer_fsid(const struct extent_buffer *eb, const void *srcv)
{
char *kaddr;
- WARN_ON(!PageUptodate(eb->pages[0]));
+ assert_eb_page_uptodate(eb, eb->pages[0]);
kaddr = page_address(eb->pages[0]) + get_eb_offset_in_page(eb, 0);
memcpy(kaddr + offsetof(struct btrfs_header, fsid), srcv,
BTRFS_FSID_SIZE);
@@ -6257,7 +6549,7 @@ void write_extent_buffer(const struct extent_buffer *eb, const void *srcv,
while (len > 0) {
page = eb->pages[i];
- WARN_ON(!PageUptodate(page));
+ assert_eb_page_uptodate(eb, page);
cur = min(len, PAGE_SIZE - offset);
kaddr = page_address(page);
@@ -6286,7 +6578,7 @@ void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start,
while (len > 0) {
page = eb->pages[i];
- WARN_ON(!PageUptodate(page));
+ assert_eb_page_uptodate(eb, page);
cur = min(len, PAGE_SIZE - offset);
kaddr = page_address(page);
@@ -6344,7 +6636,7 @@ void copy_extent_buffer(const struct extent_buffer *dst,
while (len > 0) {
page = dst->pages[i];
- WARN_ON(!PageUptodate(page));
+ assert_eb_page_uptodate(dst, page);
cur = min(len, (unsigned long)(PAGE_SIZE - offset));
@@ -6406,7 +6698,7 @@ int extent_buffer_test_bit(const struct extent_buffer *eb, unsigned long start,
eb_bitmap_offset(eb, start, nr, &i, &offset);
page = eb->pages[i];
- WARN_ON(!PageUptodate(page));
+ assert_eb_page_uptodate(eb, page);
kaddr = page_address(page);
return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1)));
}
@@ -6431,7 +6723,7 @@ void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long star
eb_bitmap_offset(eb, start, pos, &i, &offset);
page = eb->pages[i];
- WARN_ON(!PageUptodate(page));
+ assert_eb_page_uptodate(eb, page);
kaddr = page_address(page);
while (len >= bits_to_set) {
@@ -6442,7 +6734,7 @@ void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long star
if (++offset >= PAGE_SIZE && len > 0) {
offset = 0;
page = eb->pages[++i];
- WARN_ON(!PageUptodate(page));
+ assert_eb_page_uptodate(eb, page);
kaddr = page_address(page);
}
}
@@ -6474,7 +6766,7 @@ void extent_buffer_bitmap_clear(const struct extent_buffer *eb,
eb_bitmap_offset(eb, start, pos, &i, &offset);
page = eb->pages[i];
- WARN_ON(!PageUptodate(page));
+ assert_eb_page_uptodate(eb, page);
kaddr = page_address(page);
while (len >= bits_to_clear) {
@@ -6485,7 +6777,7 @@ void extent_buffer_bitmap_clear(const struct extent_buffer *eb,
if (++offset >= PAGE_SIZE && len > 0) {
offset = 0;
page = eb->pages[++i];
- WARN_ON(!PageUptodate(page));
+ assert_eb_page_uptodate(eb, page);
kaddr = page_address(page);
}
}
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index 824640cb0ace..227215a5722c 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -66,6 +66,7 @@ enum {
struct btrfs_root;
struct btrfs_inode;
struct btrfs_io_bio;
+struct btrfs_fs_info;
struct io_failure_record;
struct extent_io_tree;
@@ -270,9 +271,6 @@ struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs);
struct bio *btrfs_bio_clone(struct bio *bio);
struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size);
-struct btrfs_fs_info;
-struct btrfs_inode;
-
int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
u64 length, u64 logical, struct page *page,
unsigned int pg_offset, int mirror_num);
diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c
index 47cd3a6dc635..441cee7fbb62 100644
--- a/fs/btrfs/file-item.c
+++ b/fs/btrfs/file-item.c
@@ -9,6 +9,7 @@
#include <linux/highmem.h>
#include <linux/sched/mm.h>
#include <crypto/hash.h>
+#include "misc.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
@@ -787,7 +788,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
u64 end_byte = bytenr + len;
u64 csum_end;
struct extent_buffer *leaf;
- int ret;
+ int ret = 0;
const u32 csum_size = fs_info->csum_size;
u32 blocksize_bits = fs_info->sectorsize_bits;
@@ -805,6 +806,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
if (ret > 0) {
+ ret = 0;
if (path->slots[0] == 0)
break;
path->slots[0]--;
@@ -861,7 +863,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
ret = btrfs_del_items(trans, root, path,
path->slots[0], del_nr);
if (ret)
- goto out;
+ break;
if (key.offset == bytenr)
break;
} else if (key.offset < bytenr && csum_end > end_byte) {
@@ -905,8 +907,9 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
ret = btrfs_split_item(trans, root, path, &key, offset);
if (ret && ret != -EAGAIN) {
btrfs_abort_transaction(trans, ret);
- goto out;
+ break;
}
+ ret = 0;
key.offset = end_byte - 1;
} else {
@@ -916,12 +919,41 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans,
}
btrfs_release_path(path);
}
- ret = 0;
-out:
btrfs_free_path(path);
return ret;
}
+static int find_next_csum_offset(struct btrfs_root *root,
+ struct btrfs_path *path,
+ u64 *next_offset)
+{
+ const u32 nritems = btrfs_header_nritems(path->nodes[0]);
+ struct btrfs_key found_key;
+ int slot = path->slots[0] + 1;
+ int ret;
+
+ if (nritems == 0 || slot >= nritems) {
+ ret = btrfs_next_leaf(root, path);
+ if (ret < 0) {
+ return ret;
+ } else if (ret > 0) {
+ *next_offset = (u64)-1;
+ return 0;
+ }
+ slot = path->slots[0];
+ }
+
+ btrfs_item_key_to_cpu(path->nodes[0], &found_key, slot);
+
+ if (found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
+ found_key.type != BTRFS_EXTENT_CSUM_KEY)
+ *next_offset = (u64)-1;
+ else
+ *next_offset = found_key.offset;
+
+ return 0;
+}
+
int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_ordered_sum *sums)
@@ -937,7 +969,6 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
u64 total_bytes = 0;
u64 csum_offset;
u64 bytenr;
- u32 nritems;
u32 ins_size;
int index = 0;
int found_next;
@@ -980,26 +1011,10 @@ again:
goto insert;
}
} else {
- int slot = path->slots[0] + 1;
- /* we didn't find a csum item, insert one */
- nritems = btrfs_header_nritems(path->nodes[0]);
- if (!nritems || (path->slots[0] >= nritems - 1)) {
- ret = btrfs_next_leaf(root, path);
- if (ret < 0) {
- goto out;
- } else if (ret > 0) {
- found_next = 1;
- goto insert;
- }
- slot = path->slots[0];
- }
- btrfs_item_key_to_cpu(path->nodes[0], &found_key, slot);
- if (found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
- found_key.type != BTRFS_EXTENT_CSUM_KEY) {
- found_next = 1;
- goto insert;
- }
- next_offset = found_key.offset;
+ /* We didn't find a csum item, insert one. */
+ ret = find_next_csum_offset(root, path, &next_offset);
+ if (ret < 0)
+ goto out;
found_next = 1;
goto insert;
}
@@ -1055,8 +1070,48 @@ extend_csum:
tmp = sums->len - total_bytes;
tmp >>= fs_info->sectorsize_bits;
WARN_ON(tmp < 1);
+ extend_nr = max_t(int, 1, tmp);
+
+ /*
+ * A log tree can already have checksum items with a subset of
+ * the checksums we are trying to log. This can happen after
+ * doing a sequence of partial writes into prealloc extents and
+ * fsyncs in between, with a full fsync logging a larger subrange
+ * of an extent for which a previous fast fsync logged a smaller
+ * subrange. And this happens in particular due to merging file
+ * extent items when we complete an ordered extent for a range
+ * covered by a prealloc extent - this is done at
+ * btrfs_mark_extent_written().
+ *
+ * So if we try to extend the previous checksum item, which has
+ * a range that ends at the start of the range we want to insert,
+ * make sure we don't extend beyond the start offset of the next
+ * checksum item. If we are at the last item in the leaf, then
+ * forget the optimization of extending and add a new checksum
+ * item - it is not worth the complexity of releasing the path,
+ * getting the first key for the next leaf, repeat the btree
+ * search, etc, because log trees are temporary anyway and it
+ * would only save a few bytes of leaf space.
+ */
+ if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
+ if (path->slots[0] + 1 >=
+ btrfs_header_nritems(path->nodes[0])) {
+ ret = find_next_csum_offset(root, path, &next_offset);
+ if (ret < 0)
+ goto out;
+ found_next = 1;
+ goto insert;
+ }
+
+ ret = find_next_csum_offset(root, path, &next_offset);
+ if (ret < 0)
+ goto out;
+
+ tmp = (next_offset - bytenr) >> fs_info->sectorsize_bits;
+ if (tmp <= INT_MAX)
+ extend_nr = min_t(int, extend_nr, tmp);
+ }
- extend_nr = max_t(int, 1, (int)tmp);
diff = (csum_offset + extend_nr) * csum_size;
diff = min(diff,
MAX_CSUM_ITEMS(fs_info, csum_size) * csum_size);
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index 0e155f013839..3b10d98b4ebb 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -2014,14 +2014,8 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb,
else
num_written = btrfs_buffered_write(iocb, from);
- /*
- * We also have to set last_sub_trans to the current log transid,
- * otherwise subsequent syncs to a file that's been synced in this
- * transaction will appear to have already occurred.
- */
- spin_lock(&inode->lock);
- inode->last_sub_trans = inode->root->log_transid;
- spin_unlock(&inode->lock);
+ btrfs_set_inode_last_sub_trans(inode);
+
if (num_written > 0)
num_written = generic_write_sync(iocb, num_written);
@@ -2073,6 +2067,30 @@ static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end)
return ret;
}
+static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx)
+{
+ struct btrfs_inode *inode = BTRFS_I(ctx->inode);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
+
+ if (btrfs_inode_in_log(inode, fs_info->generation) &&
+ list_empty(&ctx->ordered_extents))
+ return true;
+
+ /*
+ * If we are doing a fast fsync we can not bail out if the inode's
+ * last_trans is <= then the last committed transaction, because we only
+ * update the last_trans of the inode during ordered extent completion,
+ * and for a fast fsync we don't wait for that, we only wait for the
+ * writeback to complete.
+ */
+ if (inode->last_trans <= fs_info->last_trans_committed &&
+ (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) ||
+ list_empty(&ctx->ordered_extents)))
+ return true;
+
+ return false;
+}
+
/*
* fsync call for both files and directories. This logs the inode into
* the tree log instead of forcing full commits whenever possible.
@@ -2122,7 +2140,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
if (ret)
goto out;
- inode_lock(inode);
+ btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);
atomic_inc(&root->log_batch);
@@ -2135,11 +2153,11 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
&BTRFS_I(inode)->runtime_flags);
/*
- * Before we acquired the inode's lock, someone may have dirtied more
- * pages in the target range. We need to make sure that writeback for
- * any such pages does not start while we are logging the inode, because
- * if it does, any of the following might happen when we are not doing a
- * full inode sync:
+ * Before we acquired the inode's lock and the mmap lock, someone may
+ * have dirtied more pages in the target range. We need to make sure
+ * that writeback for any such pages does not start while we are logging
+ * the inode, because if it does, any of the following might happen when
+ * we are not doing a full inode sync:
*
* 1) We log an extent after its writeback finishes but before its
* checksums are added to the csum tree, leading to -EIO errors
@@ -2154,7 +2172,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
*/
ret = start_ordered_ops(inode, start, end);
if (ret) {
- inode_unlock(inode);
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
goto out;
}
@@ -2191,17 +2209,8 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
atomic_inc(&root->log_batch);
- /*
- * If we are doing a fast fsync we can not bail out if the inode's
- * last_trans is <= then the last committed transaction, because we only
- * update the last_trans of the inode during ordered extent completion,
- * and for a fast fsync we don't wait for that, we only wait for the
- * writeback to complete.
- */
smp_mb();
- if (btrfs_inode_in_log(BTRFS_I(inode), fs_info->generation) ||
- (BTRFS_I(inode)->last_trans <= fs_info->last_trans_committed &&
- (full_sync || list_empty(&ctx.ordered_extents)))) {
+ if (skip_inode_logging(&ctx)) {
/*
* We've had everything committed since the last time we were
* modified so clear this flag in case it was set for whatever
@@ -2255,7 +2264,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* file again, but that will end up using the synchronization
* inside btrfs_sync_log to keep things safe.
*/
- inode_unlock(inode);
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
if (ret != BTRFS_NO_LOG_SYNC) {
if (!ret) {
@@ -2285,7 +2294,7 @@ out:
out_release_extents:
btrfs_release_log_ctx_extents(&ctx);
- inode_unlock(inode);
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
goto out;
}
@@ -2605,16 +2614,17 @@ static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans,
* extents without inserting a new one, so we must abort the transaction to avoid
* a corruption.
*/
-int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
- const u64 start, const u64 end,
- struct btrfs_replace_extent_info *extent_info,
- struct btrfs_trans_handle **trans_out)
+int btrfs_replace_file_extents(struct btrfs_inode *inode,
+ struct btrfs_path *path, const u64 start,
+ const u64 end,
+ struct btrfs_replace_extent_info *extent_info,
+ struct btrfs_trans_handle **trans_out)
{
struct btrfs_drop_extents_args drop_args = { 0 };
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_root *root = inode->root;
+ struct btrfs_fs_info *fs_info = root->fs_info;
u64 min_size = btrfs_calc_insert_metadata_size(fs_info, 1);
- u64 ino_size = round_up(inode->i_size, fs_info->sectorsize);
- struct btrfs_root *root = BTRFS_I(inode)->root;
+ u64 ino_size = round_up(inode->vfs_inode.i_size, fs_info->sectorsize);
struct btrfs_trans_handle *trans = NULL;
struct btrfs_block_rsv *rsv;
unsigned int rsv_count;
@@ -2662,10 +2672,10 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
drop_args.drop_cache = true;
while (cur_offset < end) {
drop_args.start = cur_offset;
- ret = btrfs_drop_extents(trans, root, BTRFS_I(inode), &drop_args);
+ ret = btrfs_drop_extents(trans, root, inode, &drop_args);
/* If we are punching a hole decrement the inode's byte count */
if (!extent_info)
- btrfs_update_inode_bytes(BTRFS_I(inode), 0,
+ btrfs_update_inode_bytes(inode, 0,
drop_args.bytes_found);
if (ret != -ENOSPC) {
/*
@@ -2685,8 +2695,8 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
if (!extent_info && cur_offset < drop_args.drop_end &&
cur_offset < ino_size) {
- ret = fill_holes(trans, BTRFS_I(inode), path,
- cur_offset, drop_args.drop_end);
+ ret = fill_holes(trans, inode, path, cur_offset,
+ drop_args.drop_end);
if (ret) {
/*
* If we failed then we didn't insert our hole
@@ -2704,7 +2714,7 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
* know to not set disk_i_size in this area until a new
* file extent is inserted here.
*/
- ret = btrfs_inode_clear_file_extent_range(BTRFS_I(inode),
+ ret = btrfs_inode_clear_file_extent_range(inode,
cur_offset,
drop_args.drop_end - cur_offset);
if (ret) {
@@ -2723,8 +2733,8 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
u64 replace_len = drop_args.drop_end -
extent_info->file_offset;
- ret = btrfs_insert_replace_extent(trans, BTRFS_I(inode),
- path, extent_info, replace_len,
+ ret = btrfs_insert_replace_extent(trans, inode, path,
+ extent_info, replace_len,
drop_args.bytes_found);
if (ret) {
btrfs_abort_transaction(trans, ret);
@@ -2735,9 +2745,7 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
extent_info->file_offset += replace_len;
}
- cur_offset = drop_args.drop_end;
-
- ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ ret = btrfs_update_inode(trans, root, inode);
if (ret)
break;
@@ -2756,9 +2764,10 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
BUG_ON(ret); /* shouldn't happen */
trans->block_rsv = rsv;
- if (!extent_info) {
- ret = find_first_non_hole(BTRFS_I(inode), &cur_offset,
- &len);
+ cur_offset = drop_args.drop_end;
+ len = end - cur_offset;
+ if (!extent_info && len) {
+ ret = find_first_non_hole(inode, &cur_offset, &len);
if (unlikely(ret < 0))
break;
if (ret && !len) {
@@ -2771,14 +2780,11 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
/*
* If we were cloning, force the next fsync to be a full one since we
* we replaced (or just dropped in the case of cloning holes when
- * NO_HOLES is enabled) extents and extent maps.
- * This is for the sake of simplicity, and cloning into files larger
- * than 16Mb would force the full fsync any way (when
- * try_release_extent_mapping() is invoked during page cache truncation.
+ * NO_HOLES is enabled) file extent items and did not setup new extent
+ * maps for the replacement extents (or holes).
*/
if (extent_info && !extent_info->is_new_extent)
- set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
- &BTRFS_I(inode)->runtime_flags);
+ set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags);
if (ret)
goto out_trans;
@@ -2804,8 +2810,8 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
*/
if (!extent_info && cur_offset < ino_size &&
cur_offset < drop_args.drop_end) {
- ret = fill_holes(trans, BTRFS_I(inode), path,
- cur_offset, drop_args.drop_end);
+ ret = fill_holes(trans, inode, path, cur_offset,
+ drop_args.drop_end);
if (ret) {
/* Same comment as above. */
btrfs_abort_transaction(trans, ret);
@@ -2813,8 +2819,8 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
}
} else if (!extent_info && cur_offset < drop_args.drop_end) {
/* See the comment in the loop above for the reasoning here. */
- ret = btrfs_inode_clear_file_extent_range(BTRFS_I(inode),
- cur_offset, drop_args.drop_end - cur_offset);
+ ret = btrfs_inode_clear_file_extent_range(inode, cur_offset,
+ drop_args.drop_end - cur_offset);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out_trans;
@@ -2822,7 +2828,7 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path,
}
if (extent_info) {
- ret = btrfs_insert_replace_extent(trans, BTRFS_I(inode), path,
+ ret = btrfs_insert_replace_extent(trans, inode, path,
extent_info, extent_info->data_len,
drop_args.bytes_found);
if (ret) {
@@ -2868,7 +2874,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
if (ret)
return ret;
- inode_lock(inode);
+ btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);
ino_size = round_up(inode->i_size, fs_info->sectorsize);
ret = find_first_non_hole(BTRFS_I(inode), &offset, &len);
if (ret < 0)
@@ -2908,7 +2914,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
truncated_block = true;
ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0);
if (ret) {
- inode_unlock(inode);
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
return ret;
}
}
@@ -2967,8 +2973,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
goto out;
}
- ret = btrfs_replace_file_extents(inode, path, lockstart, lockend, NULL,
- &trans);
+ ret = btrfs_replace_file_extents(BTRFS_I(inode), path, lockstart,
+ lockend, NULL, &trans);
btrfs_free_path(path);
if (ret)
goto out;
@@ -3009,7 +3015,7 @@ out_only_mutex:
ret = ret2;
}
}
- inode_unlock(inode);
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
return ret;
}
@@ -3335,7 +3341,7 @@ static long btrfs_fallocate(struct file *file, int mode,
return ret;
}
- btrfs_inode_lock(inode, 0);
+ btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);
if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) {
ret = inode_newsize_ok(inode, offset + len);
@@ -3377,7 +3383,7 @@ static long btrfs_fallocate(struct file *file, int mode,
if (mode & FALLOC_FL_ZERO_RANGE) {
ret = btrfs_zero_range(inode, offset, len, mode);
- inode_unlock(inode);
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
return ret;
}
@@ -3487,7 +3493,7 @@ out_unlock:
unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
&cached_state);
out:
- inode_unlock(inode);
+ btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
/* Let go of our reservation. */
if (ret != 0 && !(mode & FALLOC_FL_ZERO_RANGE))
btrfs_free_reserved_data_space(BTRFS_I(inode), data_reserved,
@@ -3496,13 +3502,13 @@ out:
return ret;
}
-static loff_t find_desired_extent(struct inode *inode, loff_t offset,
+static loff_t find_desired_extent(struct btrfs_inode *inode, loff_t offset,
int whence)
{
- struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+ struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct extent_map *em = NULL;
struct extent_state *cached_state = NULL;
- loff_t i_size = inode->i_size;
+ loff_t i_size = inode->vfs_inode.i_size;
u64 lockstart;
u64 lockend;
u64 start;
@@ -3525,11 +3531,10 @@ static loff_t find_desired_extent(struct inode *inode, loff_t offset,
lockend--;
len = lockend - lockstart + 1;
- lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
- &cached_state);
+ lock_extent_bits(&inode->io_tree, lockstart, lockend, &cached_state);
while (start < i_size) {
- em = btrfs_get_extent_fiemap(BTRFS_I(inode), start, len);
+ em = btrfs_get_extent_fiemap(inode, start, len);
if (IS_ERR(em)) {
ret = PTR_ERR(em);
em = NULL;
@@ -3551,7 +3556,7 @@ static loff_t find_desired_extent(struct inode *inode, loff_t offset,
cond_resched();
}
free_extent_map(em);
- unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend,
+ unlock_extent_cached(&inode->io_tree, lockstart, lockend,
&cached_state);
if (ret) {
offset = ret;
@@ -3575,7 +3580,7 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
case SEEK_DATA:
case SEEK_HOLE:
btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
- offset = find_desired_extent(inode, offset, whence);
+ offset = find_desired_extent(BTRFS_I(inode), offset, whence);
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
break;
}
diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c
index 9988decd5717..4806295116d8 100644
--- a/fs/btrfs/free-space-cache.c
+++ b/fs/btrfs/free-space-cache.c
@@ -11,6 +11,7 @@
#include <linux/ratelimit.h>
#include <linux/error-injection.h>
#include <linux/sched/mm.h>
+#include "misc.h"
#include "ctree.h"
#include "free-space-cache.h"
#include "transaction.h"
@@ -2539,6 +2540,7 @@ out:
static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
u64 bytenr, u64 size, bool used)
{
+ struct btrfs_fs_info *fs_info = block_group->fs_info;
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
u64 offset = bytenr - block_group->start;
u64 to_free, to_unusable;
@@ -2569,8 +2571,13 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
}
/* All the region is now unusable. Mark it as unused and reclaim */
- if (block_group->zone_unusable == block_group->length)
+ if (block_group->zone_unusable == block_group->length) {
btrfs_mark_bg_unused(block_group);
+ } else if (block_group->zone_unusable >=
+ div_factor_fine(block_group->length,
+ fs_info->bg_reclaim_threshold)) {
+ btrfs_mark_bg_to_reclaim(block_group);
+ }
return 0;
}
@@ -3942,7 +3949,7 @@ static int cleanup_free_space_cache_v1(struct btrfs_fs_info *fs_info,
{
struct btrfs_block_group *block_group;
struct rb_node *node;
- int ret;
+ int ret = 0;
btrfs_info(fs_info, "cleaning free space cache v1");
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 7cdf65be3707..46f392943f4d 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -102,6 +102,7 @@ static void __endio_write_update_ordered(struct btrfs_inode *inode,
* BTRFS_ILOCK_SHARED - acquire a shared lock on the inode
* BTRFS_ILOCK_TRY - try to acquire the lock, if fails on first attempt
* return -EAGAIN
+ * BTRFS_ILOCK_MMAP - acquire a write lock on the i_mmap_lock
*/
int btrfs_inode_lock(struct inode *inode, unsigned int ilock_flags)
{
@@ -122,6 +123,8 @@ int btrfs_inode_lock(struct inode *inode, unsigned int ilock_flags)
}
inode_lock(inode);
}
+ if (ilock_flags & BTRFS_ILOCK_MMAP)
+ down_write(&BTRFS_I(inode)->i_mmap_lock);
return 0;
}
@@ -133,6 +136,8 @@ int btrfs_inode_lock(struct inode *inode, unsigned int ilock_flags)
*/
void btrfs_inode_unlock(struct inode *inode, unsigned int ilock_flags)
{
+ if (ilock_flags & BTRFS_ILOCK_MMAP)
+ up_write(&BTRFS_I(inode)->i_mmap_lock);
if (ilock_flags & BTRFS_ILOCK_SHARED)
inode_unlock_shared(inode);
else
@@ -641,17 +646,12 @@ again:
if (!ret) {
unsigned long offset = offset_in_page(total_compressed);
struct page *page = pages[nr_pages - 1];
- char *kaddr;
/* zero the tail end of the last page, we might be
* sending it down to disk
*/
- if (offset) {
- kaddr = kmap_atomic(page);
- memset(kaddr + offset, 0,
- PAGE_SIZE - offset);
- kunmap_atomic(kaddr);
- }
+ if (offset)
+ memzero_page(page, offset, PAGE_SIZE - offset);
will_compress = 1;
}
}
@@ -1516,7 +1516,7 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page,
static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
struct page *locked_page,
const u64 start, const u64 end,
- int *page_started, int force,
+ int *page_started,
unsigned long *nr_written)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
@@ -1530,6 +1530,7 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode,
u64 ino = btrfs_ino(inode);
bool nocow = false;
u64 disk_bytenr = 0;
+ const bool force = inode->flags & BTRFS_INODE_NODATACOW;
path = btrfs_alloc_path();
if (!path) {
@@ -1863,23 +1864,16 @@ error:
return ret;
}
-static inline int need_force_cow(struct btrfs_inode *inode, u64 start, u64 end)
+static bool should_nocow(struct btrfs_inode *inode, u64 start, u64 end)
{
-
- if (!(inode->flags & BTRFS_INODE_NODATACOW) &&
- !(inode->flags & BTRFS_INODE_PREALLOC))
- return 0;
-
- /*
- * @defrag_bytes is a hint value, no spinlock held here,
- * if is not zero, it means the file is defragging.
- * Force cow if given extent needs to be defragged.
- */
- if (inode->defrag_bytes &&
- test_range_bit(&inode->io_tree, start, end, EXTENT_DEFRAG, 0, NULL))
- return 1;
-
- return 0;
+ if (inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) {
+ if (inode->defrag_bytes &&
+ test_range_bit(&inode->io_tree, start, end, EXTENT_DEFRAG,
+ 0, NULL))
+ return false;
+ return true;
+ }
+ return false;
}
/*
@@ -1891,17 +1885,12 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page
struct writeback_control *wbc)
{
int ret;
- int force_cow = need_force_cow(inode, start, end);
const bool zoned = btrfs_is_zoned(inode->root->fs_info);
- if (inode->flags & BTRFS_INODE_NODATACOW && !force_cow) {
+ if (should_nocow(inode, start, end)) {
ASSERT(!zoned);
ret = run_delalloc_nocow(inode, locked_page, start, end,
- page_started, 1, nr_written);
- } else if (inode->flags & BTRFS_INODE_PREALLOC && !force_cow) {
- ASSERT(!zoned);
- ret = run_delalloc_nocow(inode, locked_page, start, end,
- page_started, 0, nr_written);
+ page_started, nr_written);
} else if (!inode_can_compress(inode) ||
!inode_need_compress(inode, start, end)) {
if (zoned)
@@ -3011,6 +3000,18 @@ out:
if (ret || truncated) {
u64 unwritten_start = start;
+ /*
+ * If we failed to finish this ordered extent for any reason we
+ * need to make sure BTRFS_ORDERED_IOERR is set on the ordered
+ * extent, and mark the inode with the error if it wasn't
+ * already set. Any error during writeback would have already
+ * set the mapping error, so we need to set it if we're the ones
+ * marking this ordered extent as failed.
+ */
+ if (ret && !test_and_set_bit(BTRFS_ORDERED_IOERR,
+ &ordered_extent->flags))
+ mapping_set_error(ordered_extent->inode->i_mapping, -EIO);
+
if (truncated)
unwritten_start += logical_len;
clear_extent_uptodate(io_tree, unwritten_start, end, NULL);
@@ -3099,11 +3100,13 @@ void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start,
* @bio_offset: offset to the beginning of the bio (in bytes)
* @page: page where is the data to be verified
* @pgoff: offset inside the page
+ * @start: logical offset in the file
*
* The length of such check is always one sector size.
*/
static int check_data_csum(struct inode *inode, struct btrfs_io_bio *io_bio,
- u32 bio_offset, struct page *page, u32 pgoff)
+ u32 bio_offset, struct page *page, u32 pgoff,
+ u64 start)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
@@ -3130,8 +3133,8 @@ static int check_data_csum(struct inode *inode, struct btrfs_io_bio *io_bio,
kunmap_atomic(kaddr);
return 0;
zeroit:
- btrfs_print_data_csum_error(BTRFS_I(inode), page_offset(page) + pgoff,
- csum, csum_expected, io_bio->mirror_num);
+ btrfs_print_data_csum_error(BTRFS_I(inode), start, csum, csum_expected,
+ io_bio->mirror_num);
if (io_bio->device)
btrfs_dev_stat_inc_and_print(io_bio->device,
BTRFS_DEV_STAT_CORRUPTION_ERRS);
@@ -3149,10 +3152,9 @@ zeroit:
* @bio_offset: offset to the beginning of the bio (in bytes)
* @start: file offset of the range start
* @end: file offset of the range end (inclusive)
- * @mirror: mirror number
*/
int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset,
- struct page *page, u64 start, u64 end, int mirror)
+ struct page *page, u64 start, u64 end)
{
struct inode *inode = page->mapping->host;
struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
@@ -3184,7 +3186,8 @@ int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset,
pg_off += sectorsize, bio_offset += sectorsize) {
int ret;
- ret = check_data_csum(inode, io_bio, bio_offset, page, pg_off);
+ ret = check_data_csum(inode, io_bio, bio_offset, page, pg_off,
+ page_offset(page) + pg_off);
if (ret < 0)
return -EIO;
}
@@ -3250,6 +3253,7 @@ void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info)
inode = list_first_entry(&fs_info->delayed_iputs,
struct btrfs_inode, delayed_iput);
run_delayed_iput_locked(fs_info, inode);
+ cond_resched_lock(&fs_info->delayed_iput_lock);
}
spin_unlock(&fs_info->delayed_iput_lock);
}
@@ -3390,15 +3394,19 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
int is_dead_root = 0;
/*
- * this is an orphan in the tree root. Currently these
+ * This is an orphan in the tree root. Currently these
* could come from 2 sources:
- * a) a snapshot deletion in progress
+ * a) a root (snapshot/subvolume) deletion in progress
* b) a free space cache inode
- * We need to distinguish those two, as the snapshot
- * orphan must not get deleted.
- * find_dead_roots already ran before us, so if this
- * is a snapshot deletion, we should find the root
- * in the fs_roots radix tree.
+ * We need to distinguish those two, as the orphan item
+ * for a root must not get deleted before the deletion
+ * of the snapshot/subvolume's tree completes.
+ *
+ * btrfs_find_orphan_roots() ran before us, which has
+ * found all deleted roots and loaded them into
+ * fs_info->fs_roots_radix. So here we can find if an
+ * orphan item corresponds to a deleted root by looking
+ * up the root from that radix tree.
*/
spin_lock(&fs_info->fs_roots_radix_lock);
@@ -4329,7 +4337,11 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry)
goto out_end_trans;
}
- btrfs_record_root_in_trans(trans, dest);
+ ret = btrfs_record_root_in_trans(trans, dest);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto out_end_trans;
+ }
memset(&dest->root_item.drop_progress, 0,
sizeof(dest->root_item.drop_progress));
@@ -4829,7 +4841,6 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len,
struct btrfs_ordered_extent *ordered;
struct extent_state *cached_state = NULL;
struct extent_changeset *data_reserved = NULL;
- char *kaddr;
bool only_release_metadata = false;
u32 blocksize = fs_info->sectorsize;
pgoff_t index = from >> PAGE_SHIFT;
@@ -4921,15 +4932,13 @@ again:
if (offset != blocksize) {
if (!len)
len = blocksize - offset;
- kaddr = kmap(page);
if (front)
- memset(kaddr + (block_start - page_offset(page)),
- 0, offset);
+ memzero_page(page, (block_start - page_offset(page)),
+ offset);
else
- memset(kaddr + (block_start - page_offset(page)) + offset,
- 0, len);
+ memzero_page(page, (block_start - page_offset(page)) + offset,
+ len);
flush_dcache_page(page);
- kunmap(page);
}
ClearPageChecked(page);
set_page_dirty(page);
@@ -6828,11 +6837,9 @@ static noinline int uncompress_inline(struct btrfs_path *path,
* cover that region here.
*/
- if (max_size + pg_offset < PAGE_SIZE) {
- char *map = kmap(page);
- memset(map + pg_offset + max_size, 0, PAGE_SIZE - max_size - pg_offset);
- kunmap(page);
- }
+ if (max_size + pg_offset < PAGE_SIZE)
+ memzero_page(page, pg_offset + max_size,
+ PAGE_SIZE - max_size - pg_offset);
kfree(tmp);
return ret;
}
@@ -7023,7 +7030,7 @@ next:
if (ret)
goto out;
} else {
- map = kmap(page);
+ map = kmap_local_page(page);
read_extent_buffer(leaf, map + pg_offset, ptr,
copy_size);
if (pg_offset + copy_size < PAGE_SIZE) {
@@ -7031,7 +7038,7 @@ next:
PAGE_SIZE - pg_offset -
copy_size);
}
- kunmap(page);
+ kunmap_local(map);
}
flush_dcache_page(page);
}
@@ -7259,6 +7266,19 @@ static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode,
return em;
}
+static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
+{
+ struct btrfs_block_group *block_group;
+ bool readonly = false;
+
+ block_group = btrfs_lookup_block_group(fs_info, bytenr);
+ if (!block_group || block_group->ro)
+ readonly = true;
+ if (block_group)
+ btrfs_put_block_group(block_group);
+ return readonly;
+}
+
/*
* Check if we can do nocow write into the range [@offset, @offset + @len)
*
@@ -7778,7 +7798,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
iomap->bdev = fs_info->fs_devices->latest_bdev;
iomap->length = len;
- if (write && btrfs_use_zone_append(BTRFS_I(inode), em))
+ if (write && btrfs_use_zone_append(BTRFS_I(inode), em->block_start))
iomap->flags |= IOMAP_F_ZONE_APPEND;
free_extent_map(em);
@@ -7910,7 +7930,8 @@ static blk_status_t btrfs_check_read_dio_bio(struct inode *inode,
ASSERT(pgoff < PAGE_SIZE);
if (uptodate &&
(!csum || !check_data_csum(inode, io_bio,
- bio_offset, bvec.bv_page, pgoff))) {
+ bio_offset, bvec.bv_page,
+ pgoff, start))) {
clean_io_failure(fs_info, failure_tree, io_tree,
start, bvec.bv_page,
btrfs_ino(BTRFS_I(inode)),
@@ -8169,10 +8190,6 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap,
bio->bi_end_io = btrfs_end_dio_bio;
btrfs_io_bio(bio)->logical = file_offset;
- WARN_ON_ONCE(write && btrfs_is_zoned(fs_info) &&
- fs_info->max_zone_append_size &&
- bio_op(bio) != REQ_OP_ZONE_APPEND);
-
if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
status = extract_ordered_extent(BTRFS_I(inode), bio,
file_offset);
@@ -8403,17 +8420,11 @@ again:
* for the finish_ordered_io
*/
if (TestClearPagePrivate2(page)) {
- struct btrfs_ordered_inode_tree *tree;
- u64 new_len;
-
- tree = &inode->ordered_tree;
-
- spin_lock_irq(&tree->lock);
+ spin_lock_irq(&inode->ordered_tree.lock);
set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags);
- new_len = start - ordered->file_offset;
- if (new_len < ordered->truncated_len)
- ordered->truncated_len = new_len;
- spin_unlock_irq(&tree->lock);
+ ordered->truncated_len = min(ordered->truncated_len,
+ start - ordered->file_offset);
+ spin_unlock_irq(&inode->ordered_tree.lock);
if (btrfs_dec_test_ordered_pending(inode, &ordered,
start,
@@ -8498,7 +8509,6 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
struct btrfs_ordered_extent *ordered;
struct extent_state *cached_state = NULL;
struct extent_changeset *data_reserved = NULL;
- char *kaddr;
unsigned long zero_start;
loff_t size;
vm_fault_t ret;
@@ -8539,6 +8549,7 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */
again:
+ down_read(&BTRFS_I(inode)->i_mmap_lock);
lock_page(page);
size = i_size_read(inode);
@@ -8567,6 +8578,7 @@ again:
unlock_extent_cached(io_tree, page_start, page_end,
&cached_state);
unlock_page(page);
+ up_read(&BTRFS_I(inode)->i_mmap_lock);
btrfs_start_ordered_extent(ordered, 1);
btrfs_put_ordered_extent(ordered);
goto again;
@@ -8610,20 +8622,17 @@ again:
zero_start = PAGE_SIZE;
if (zero_start != PAGE_SIZE) {
- kaddr = kmap(page);
- memset(kaddr + zero_start, 0, PAGE_SIZE - zero_start);
+ memzero_page(page, zero_start, PAGE_SIZE - zero_start);
flush_dcache_page(page);
- kunmap(page);
}
ClearPageChecked(page);
set_page_dirty(page);
SetPageUptodate(page);
- BTRFS_I(inode)->last_trans = fs_info->generation;
- BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid;
- BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit;
+ btrfs_set_inode_last_sub_trans(BTRFS_I(inode));
unlock_extent_cached(io_tree, page_start, page_end, &cached_state);
+ up_read(&BTRFS_I(inode)->i_mmap_lock);
btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
sb_end_pagefault(inode->i_sb);
@@ -8632,6 +8641,7 @@ again:
out_unlock:
unlock_page(page);
+ up_read(&BTRFS_I(inode)->i_mmap_lock);
out:
btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE);
btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start,
@@ -8883,6 +8893,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb)
INIT_LIST_HEAD(&ei->delalloc_inodes);
INIT_LIST_HEAD(&ei->delayed_iput);
RB_CLEAR_NODE(&ei->rb_node);
+ init_rwsem(&ei->i_mmap_lock);
return inode;
}
@@ -9077,6 +9088,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
int ret2;
bool root_log_pinned = false;
bool dest_log_pinned = false;
+ bool need_abort = false;
/* we only allow rename subvolume link between subvolumes */
if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest)
@@ -9101,8 +9113,11 @@ static int btrfs_rename_exchange(struct inode *old_dir,
goto out_notrans;
}
- if (dest != root)
- btrfs_record_root_in_trans(trans, dest);
+ if (dest != root) {
+ ret = btrfs_record_root_in_trans(trans, dest);
+ if (ret)
+ goto out_fail;
+ }
/*
* We need to find a free sequence number both in the source and
@@ -9133,6 +9148,7 @@ static int btrfs_rename_exchange(struct inode *old_dir,
old_idx);
if (ret)
goto out_fail;
+ need_abort = true;
}
/* And now for the dest. */
@@ -9148,8 +9164,11 @@ static int btrfs_rename_exchange(struct inode *old_dir,
new_ino,
btrfs_ino(BTRFS_I(old_dir)),
new_idx);
- if (ret)
+ if (ret) {
+ if (need_abort)
+ btrfs_abort_transaction(trans, ret);
goto out_fail;
+ }
}
/* Update inode version and ctime/mtime. */
@@ -9406,8 +9425,11 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry,
goto out_notrans;
}
- if (dest != root)
- btrfs_record_root_in_trans(trans, dest);
+ if (dest != root) {
+ ret = btrfs_record_root_in_trans(trans, dest);
+ if (ret)
+ goto out_fail;
+ }
ret = btrfs_set_inode_index(BTRFS_I(new_dir), &index);
if (ret)
@@ -9674,7 +9696,7 @@ out:
return ret;
}
-int btrfs_start_delalloc_snapshot(struct btrfs_root *root)
+int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context)
{
struct writeback_control wbc = {
.nr_to_write = LONG_MAX,
@@ -9687,7 +9709,7 @@ int btrfs_start_delalloc_snapshot(struct btrfs_root *root)
if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state))
return -EROFS;
- return start_delalloc_inodes(root, &wbc, true, false);
+ return start_delalloc_inodes(root, &wbc, true, in_reclaim_context);
}
int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
@@ -9919,7 +9941,7 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent(
goto free_qgroup;
}
- ret = btrfs_replace_file_extents(&inode->vfs_inode, path, file_offset,
+ ret = btrfs_replace_file_extents(inode, path, file_offset,
file_offset + len - 1, &extent_info,
&trans);
btrfs_free_path(path);
@@ -10603,6 +10625,8 @@ static const struct inode_operations btrfs_dir_inode_operations = {
.set_acl = btrfs_set_acl,
.update_time = btrfs_update_time,
.tmpfile = btrfs_tmpfile,
+ .fileattr_get = btrfs_fileattr_get,
+ .fileattr_set = btrfs_fileattr_set,
};
static const struct file_operations btrfs_dir_file_operations = {
@@ -10656,6 +10680,8 @@ static const struct inode_operations btrfs_file_inode_operations = {
.get_acl = btrfs_get_acl,
.set_acl = btrfs_set_acl,
.update_time = btrfs_update_time,
+ .fileattr_get = btrfs_fileattr_get,
+ .fileattr_set = btrfs_fileattr_set,
};
static const struct inode_operations btrfs_special_inode_operations = {
.getattr = btrfs_getattr,
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index e8d53fea4c61..5dc2fd843ae3 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -26,6 +26,7 @@
#include <linux/btrfs.h>
#include <linux/uaccess.h>
#include <linux/iversion.h>
+#include <linux/fileattr.h>
#include "ctree.h"
#include "disk-io.h"
#include "export.h"
@@ -153,16 +154,6 @@ void btrfs_sync_inode_flags_to_i_flags(struct inode *inode)
new_fl);
}
-static int btrfs_ioctl_getflags(struct file *file, void __user *arg)
-{
- struct btrfs_inode *binode = BTRFS_I(file_inode(file));
- unsigned int flags = btrfs_inode_flags_to_fsflags(binode->flags);
-
- if (copy_to_user(arg, &flags, sizeof(flags)))
- return -EFAULT;
- return 0;
-}
-
/*
* Check if @flags are a supported and valid set of FS_*_FL flags and that
* the old and new flags are not conflicting
@@ -201,9 +192,22 @@ static int check_fsflags_compatible(struct btrfs_fs_info *fs_info,
return 0;
}
-static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
+/*
+ * Set flags/xflags from the internal inode flags. The remaining items of
+ * fsxattr are zeroed.
+ */
+int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
+{
+ struct btrfs_inode *binode = BTRFS_I(d_inode(dentry));
+
+ fileattr_fill_flags(fa, btrfs_inode_flags_to_fsflags(binode->flags));
+ return 0;
+}
+
+int btrfs_fileattr_set(struct user_namespace *mnt_userns,
+ struct dentry *dentry, struct fileattr *fa)
{
- struct inode *inode = file_inode(file);
+ struct inode *inode = d_inode(dentry);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_inode *binode = BTRFS_I(inode);
struct btrfs_root *root = binode->root;
@@ -213,34 +217,21 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
const char *comp = NULL;
u32 binode_flags;
- if (!inode_owner_or_capable(&init_user_ns, inode))
- return -EPERM;
-
if (btrfs_root_readonly(root))
return -EROFS;
- if (copy_from_user(&fsflags, arg, sizeof(fsflags)))
- return -EFAULT;
-
- ret = mnt_want_write_file(file);
- if (ret)
- return ret;
+ if (fileattr_has_fsx(fa))
+ return -EOPNOTSUPP;
- inode_lock(inode);
- fsflags = btrfs_mask_fsflags_for_type(inode, fsflags);
+ fsflags = btrfs_mask_fsflags_for_type(inode, fa->flags);
old_fsflags = btrfs_inode_flags_to_fsflags(binode->flags);
-
- ret = vfs_ioc_setflags_prepare(inode, old_fsflags, fsflags);
- if (ret)
- goto out_unlock;
-
ret = check_fsflags(old_fsflags, fsflags);
if (ret)
- goto out_unlock;
+ return ret;
ret = check_fsflags_compatible(fs_info, fsflags);
if (ret)
- goto out_unlock;
+ return ret;
binode_flags = binode->flags;
if (fsflags & FS_SYNC_FL)
@@ -263,6 +254,16 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
binode_flags |= BTRFS_INODE_NOATIME;
else
binode_flags &= ~BTRFS_INODE_NOATIME;
+
+ /* If coming from FS_IOC_FSSETXATTR then skip unconverted flags */
+ if (!fa->flags_valid) {
+ /* 1 item for the inode */
+ trans = btrfs_start_transaction(root, 1);
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
+ goto update_flags;
+ }
+
if (fsflags & FS_DIRSYNC_FL)
binode_flags |= BTRFS_INODE_DIRSYNC;
else
@@ -303,10 +304,8 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
binode_flags |= BTRFS_INODE_NOCOMPRESS;
} else if (fsflags & FS_COMPR_FL) {
- if (IS_SWAPFILE(inode)) {
- ret = -ETXTBSY;
- goto out_unlock;
- }
+ if (IS_SWAPFILE(inode))
+ return -ETXTBSY;
binode_flags |= BTRFS_INODE_COMPRESS;
binode_flags &= ~BTRFS_INODE_NOCOMPRESS;
@@ -323,10 +322,8 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
* 2 for properties
*/
trans = btrfs_start_transaction(root, 3);
- if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- goto out_unlock;
- }
+ if (IS_ERR(trans))
+ return PTR_ERR(trans);
if (comp) {
ret = btrfs_set_prop(trans, inode, "btrfs.compression", comp,
@@ -344,6 +341,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
}
}
+update_flags:
binode->flags = binode_flags;
btrfs_sync_inode_flags_to_i_flags(inode);
inode_inc_iversion(inode);
@@ -352,44 +350,9 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg)
out_end_trans:
btrfs_end_transaction(trans);
- out_unlock:
- inode_unlock(inode);
- mnt_drop_write_file(file);
return ret;
}
-/*
- * Translate btrfs internal inode flags to xflags as expected by the
- * FS_IOC_FSGETXATT ioctl. Filter only the supported ones, unknown flags are
- * silently dropped.
- */
-static unsigned int btrfs_inode_flags_to_xflags(unsigned int flags)
-{
- unsigned int xflags = 0;
-
- if (flags & BTRFS_INODE_APPEND)
- xflags |= FS_XFLAG_APPEND;
- if (flags & BTRFS_INODE_IMMUTABLE)
- xflags |= FS_XFLAG_IMMUTABLE;
- if (flags & BTRFS_INODE_NOATIME)
- xflags |= FS_XFLAG_NOATIME;
- if (flags & BTRFS_INODE_NODUMP)
- xflags |= FS_XFLAG_NODUMP;
- if (flags & BTRFS_INODE_SYNC)
- xflags |= FS_XFLAG_SYNC;
-
- return xflags;
-}
-
-/* Check if @flags are a supported and valid set of FS_XFLAGS_* flags */
-static int check_xflags(unsigned int flags)
-{
- if (flags & ~(FS_XFLAG_APPEND | FS_XFLAG_IMMUTABLE | FS_XFLAG_NOATIME |
- FS_XFLAG_NODUMP | FS_XFLAG_SYNC))
- return -EOPNOTSUPP;
- return 0;
-}
-
bool btrfs_exclop_start(struct btrfs_fs_info *fs_info,
enum btrfs_exclusive_operation type)
{
@@ -402,111 +365,6 @@ void btrfs_exclop_finish(struct btrfs_fs_info *fs_info)
sysfs_notify(&fs_info->fs_devices->fsid_kobj, NULL, "exclusive_operation");
}
-/*
- * Set the xflags from the internal inode flags. The remaining items of fsxattr
- * are zeroed.
- */
-static int btrfs_ioctl_fsgetxattr(struct file *file, void __user *arg)
-{
- struct btrfs_inode *binode = BTRFS_I(file_inode(file));
- struct fsxattr fa;
-
- simple_fill_fsxattr(&fa, btrfs_inode_flags_to_xflags(binode->flags));
- if (copy_to_user(arg, &fa, sizeof(fa)))
- return -EFAULT;
-
- return 0;
-}
-
-static int btrfs_ioctl_fssetxattr(struct file *file, void __user *arg)
-{
- struct inode *inode = file_inode(file);
- struct btrfs_inode *binode = BTRFS_I(inode);
- struct btrfs_root *root = binode->root;
- struct btrfs_trans_handle *trans;
- struct fsxattr fa, old_fa;
- unsigned old_flags;
- unsigned old_i_flags;
- int ret = 0;
-
- if (!inode_owner_or_capable(&init_user_ns, inode))
- return -EPERM;
-
- if (btrfs_root_readonly(root))
- return -EROFS;
-
- if (copy_from_user(&fa, arg, sizeof(fa)))
- return -EFAULT;
-
- ret = check_xflags(fa.fsx_xflags);
- if (ret)
- return ret;
-
- if (fa.fsx_extsize != 0 || fa.fsx_projid != 0 || fa.fsx_cowextsize != 0)
- return -EOPNOTSUPP;
-
- ret = mnt_want_write_file(file);
- if (ret)
- return ret;
-
- inode_lock(inode);
-
- old_flags = binode->flags;
- old_i_flags = inode->i_flags;
-
- simple_fill_fsxattr(&old_fa,
- btrfs_inode_flags_to_xflags(binode->flags));
- ret = vfs_ioc_fssetxattr_check(inode, &old_fa, &fa);
- if (ret)
- goto out_unlock;
-
- if (fa.fsx_xflags & FS_XFLAG_SYNC)
- binode->flags |= BTRFS_INODE_SYNC;
- else
- binode->flags &= ~BTRFS_INODE_SYNC;
- if (fa.fsx_xflags & FS_XFLAG_IMMUTABLE)
- binode->flags |= BTRFS_INODE_IMMUTABLE;
- else
- binode->flags &= ~BTRFS_INODE_IMMUTABLE;
- if (fa.fsx_xflags & FS_XFLAG_APPEND)
- binode->flags |= BTRFS_INODE_APPEND;
- else
- binode->flags &= ~BTRFS_INODE_APPEND;
- if (fa.fsx_xflags & FS_XFLAG_NODUMP)
- binode->flags |= BTRFS_INODE_NODUMP;
- else
- binode->flags &= ~BTRFS_INODE_NODUMP;
- if (fa.fsx_xflags & FS_XFLAG_NOATIME)
- binode->flags |= BTRFS_INODE_NOATIME;
- else
- binode->flags &= ~BTRFS_INODE_NOATIME;
-
- /* 1 item for the inode */
- trans = btrfs_start_transaction(root, 1);
- if (IS_ERR(trans)) {
- ret = PTR_ERR(trans);
- goto out_unlock;
- }
-
- btrfs_sync_inode_flags_to_i_flags(inode);
- inode_inc_iversion(inode);
- inode->i_ctime = current_time(inode);
- ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
-
- btrfs_end_transaction(trans);
-
-out_unlock:
- if (ret) {
- binode->flags = old_flags;
- inode->i_flags = old_i_flags;
- }
-
- inode_unlock(inode);
- mnt_drop_write_file(file);
-
- return ret;
-}
-
static int btrfs_ioctl_getversion(struct file *file, int __user *arg)
{
struct inode *inode = file_inode(file);
@@ -697,8 +555,6 @@ static noinline int create_subvol(struct inode *dir,
btrfs_set_root_otransid(root_item, trans->transid);
btrfs_tree_unlock(leaf);
- free_extent_buffer(leaf);
- leaf = NULL;
btrfs_set_root_dirid(root_item, BTRFS_FIRST_FREE_OBJECTID);
@@ -707,8 +563,22 @@ static noinline int create_subvol(struct inode *dir,
key.type = BTRFS_ROOT_ITEM_KEY;
ret = btrfs_insert_root(trans, fs_info->tree_root, &key,
root_item);
- if (ret)
+ if (ret) {
+ /*
+ * Since we don't abort the transaction in this case, free the
+ * tree block so that we don't leak space and leave the
+ * filesystem in an inconsistent state (an extent item in the
+ * extent tree without backreferences). Also no need to have
+ * the tree block locked since it is not in any tree at this
+ * point, so no other task can find it and use it.
+ */
+ btrfs_free_tree_block(trans, root, leaf, 0, 1);
+ free_extent_buffer(leaf);
goto fail;
+ }
+
+ free_extent_buffer(leaf);
+ leaf = NULL;
key.offset = (u64)-1;
new_root = btrfs_get_new_fs_root(fs_info, objectid, anon_dev);
@@ -721,7 +591,12 @@ static noinline int create_subvol(struct inode *dir,
/* Freeing will be done in btrfs_put_root() of new_root */
anon_dev = 0;
- btrfs_record_root_in_trans(trans, new_root);
+ ret = btrfs_record_root_in_trans(trans, new_root);
+ if (ret) {
+ btrfs_put_root(new_root);
+ btrfs_abort_transaction(trans, ret);
+ goto fail;
+ }
ret = btrfs_create_subvol_root(trans, new_root, root);
btrfs_put_root(new_root);
@@ -1014,7 +889,7 @@ out_up_read:
out_dput:
dput(dentry);
out_unlock:
- inode_unlock(dir);
+ btrfs_inode_unlock(dir, 0);
return error;
}
@@ -1034,7 +909,7 @@ static noinline int btrfs_mksnapshot(const struct path *parent,
*/
btrfs_drew_read_lock(&root->snapshot_lock);
- ret = btrfs_start_delalloc_snapshot(root);
+ ret = btrfs_start_delalloc_snapshot(root, false);
if (ret)
goto out;
@@ -1612,7 +1487,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
ra_index += cluster;
}
- inode_lock(inode);
+ btrfs_inode_lock(inode, 0);
if (IS_SWAPFILE(inode)) {
ret = -ETXTBSY;
} else {
@@ -1621,13 +1496,13 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
ret = cluster_pages_for_defrag(inode, pages, i, cluster);
}
if (ret < 0) {
- inode_unlock(inode);
+ btrfs_inode_unlock(inode, 0);
goto out_ra;
}
defrag_count += ret;
balance_dirty_pages_ratelimited(inode->i_mapping);
- inode_unlock(inode);
+ btrfs_inode_unlock(inode, 0);
if (newer_than) {
if (newer_off == (u64)-1)
@@ -1675,9 +1550,9 @@ int btrfs_defrag_file(struct inode *inode, struct file *file,
out_ra:
if (do_compress) {
- inode_lock(inode);
+ btrfs_inode_lock(inode, 0);
BTRFS_I(inode)->defrag_compress = BTRFS_COMPRESS_NONE;
- inode_unlock(inode);
+ btrfs_inode_unlock(inode, 0);
}
if (!file)
kfree(ra);
@@ -3112,9 +2987,9 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
goto out_dput;
}
- inode_lock(inode);
+ btrfs_inode_lock(inode, 0);
err = btrfs_delete_subvolume(dir, dentry);
- inode_unlock(inode);
+ btrfs_inode_unlock(inode, 0);
if (!err) {
fsnotify_rmdir(dir, dentry);
d_delete(dentry);
@@ -3123,7 +2998,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file,
out_dput:
dput(dentry);
out_unlock_dir:
- inode_unlock(dir);
+ btrfs_inode_unlock(dir, 0);
free_subvol_name:
kfree(subvol_name_ptr);
free_parent:
@@ -4915,10 +4790,6 @@ long btrfs_ioctl(struct file *file, unsigned int
void __user *argp = (void __user *)arg;
switch (cmd) {
- case FS_IOC_GETFLAGS:
- return btrfs_ioctl_getflags(file, argp);
- case FS_IOC_SETFLAGS:
- return btrfs_ioctl_setflags(file, argp);
case FS_IOC_GETVERSION:
return btrfs_ioctl_getversion(file, argp);
case FS_IOC_GETFSLABEL:
@@ -5044,10 +4915,6 @@ long btrfs_ioctl(struct file *file, unsigned int
return btrfs_ioctl_get_features(fs_info, argp);
case BTRFS_IOC_SET_FEATURES:
return btrfs_ioctl_set_features(file, argp);
- case FS_IOC_FSGETXATTR:
- return btrfs_ioctl_fsgetxattr(file, argp);
- case FS_IOC_FSSETXATTR:
- return btrfs_ioctl_fssetxattr(file, argp);
case BTRFS_IOC_GET_SUBVOL_INFO:
return btrfs_ioctl_get_subvol_info(file, argp);
case BTRFS_IOC_GET_SUBVOL_ROOTREF:
@@ -5067,12 +4934,6 @@ long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
* handling is necessary.
*/
switch (cmd) {
- case FS_IOC32_GETFLAGS:
- cmd = FS_IOC_GETFLAGS;
- break;
- case FS_IOC32_SETFLAGS:
- cmd = FS_IOC_SETFLAGS;
- break;
case FS_IOC32_GETVERSION:
cmd = FS_IOC_GETVERSION;
break;
diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c
index 9084a950dc09..cd042c7567a4 100644
--- a/fs/btrfs/lzo.c
+++ b/fs/btrfs/lzo.c
@@ -118,7 +118,7 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
struct workspace *workspace = list_entry(ws, struct workspace, list);
int ret = 0;
char *data_in;
- char *cpage_out;
+ char *cpage_out, *sizes_ptr;
int nr_pages = 0;
struct page *in_page = NULL;
struct page *out_page = NULL;
@@ -258,10 +258,9 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
}
/* store the size of all chunks of compressed data */
- cpage_out = kmap(pages[0]);
- write_compress_length(cpage_out, tot_out);
-
- kunmap(pages[0]);
+ sizes_ptr = kmap_local_page(pages[0]);
+ write_compress_length(sizes_ptr, tot_out);
+ kunmap_local(sizes_ptr);
ret = 0;
*total_out = tot_out;
diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c
index 985a21558437..6c413bb451a3 100644
--- a/fs/btrfs/ordered-data.c
+++ b/fs/btrfs/ordered-data.c
@@ -107,17 +107,6 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset,
return NULL;
}
-/*
- * helper to check if a given offset is inside a given entry
- */
-static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset)
-{
- if (file_offset < entry->file_offset ||
- entry->file_offset + entry->num_bytes <= file_offset)
- return 0;
- return 1;
-}
-
static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset,
u64 len)
{
@@ -142,7 +131,7 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
if (tree->last) {
entry = rb_entry(tree->last, struct btrfs_ordered_extent,
rb_node);
- if (offset_in_entry(entry, file_offset))
+ if (in_range(file_offset, entry->file_offset, entry->num_bytes))
return tree->last;
}
ret = __tree_search(root, file_offset, &prev);
@@ -349,7 +338,7 @@ bool btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode,
goto out;
entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
- if (!offset_in_entry(entry, *file_offset))
+ if (!in_range(*file_offset, entry->file_offset, entry->num_bytes))
goto out;
dec_start = max(*file_offset, entry->file_offset);
@@ -428,7 +417,7 @@ bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode,
entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
have_entry:
- if (!offset_in_entry(entry, file_offset))
+ if (!in_range(file_offset, entry->file_offset, entry->num_bytes))
goto out;
if (io_size > entry->bytes_left)
@@ -779,7 +768,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *ino
goto out;
entry = rb_entry(node, struct btrfs_ordered_extent, rb_node);
- if (!offset_in_entry(entry, file_offset))
+ if (!in_range(file_offset, entry->file_offset, entry->num_bytes))
entry = NULL;
if (entry)
refcount_inc(&entry->refs);
@@ -995,7 +984,7 @@ int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre,
if (pre)
ret = clone_ordered_extent(ordered, 0, pre);
- if (post)
+ if (ret == 0 && post)
ret = clone_ordered_extent(ordered, pre + ordered->disk_num_bytes,
post);
diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h
index 99e0853e4d3b..e60c07f36427 100644
--- a/fs/btrfs/ordered-data.h
+++ b/fs/btrfs/ordered-data.h
@@ -39,8 +39,8 @@ struct btrfs_ordered_sum {
*/
enum {
/*
- * Different types for direct io, one and only one of the 4 type can
- * be set when creating ordered extent.
+ * Different types for ordered extents, one and only one of the 4 types
+ * need to be set when creating ordered extent.
*
* REGULAR: For regular non-compressed COW write
* NOCOW: For NOCOW write into existing non-hole extent
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index 14ff388fd3bd..3ded812f522c 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -23,6 +23,7 @@
#include "qgroup.h"
#include "block-group.h"
#include "sysfs.h"
+#include "tree-mod-log.h"
/* TODO XXX FIXME
* - subvol delete -> delete when ref goes to 0? delete limits also?
@@ -226,7 +227,6 @@ static void __del_qgroup_rb(struct btrfs_fs_info *fs_info,
{
struct btrfs_qgroup_list *list;
- btrfs_sysfs_del_one_qgroup(fs_info, qgroup);
list_del(&qgroup->dirty);
while (!list_empty(&qgroup->groups)) {
list = list_first_entry(&qgroup->groups,
@@ -243,7 +243,6 @@ static void __del_qgroup_rb(struct btrfs_fs_info *fs_info,
list_del(&list->next_member);
kfree(list);
}
- kfree(qgroup);
}
/* must be called with qgroup_lock held */
@@ -569,6 +568,8 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info)
qgroup = rb_entry(n, struct btrfs_qgroup, node);
rb_erase(n, &fs_info->qgroup_tree);
__del_qgroup_rb(fs_info, qgroup);
+ btrfs_sysfs_del_one_qgroup(fs_info, qgroup);
+ kfree(qgroup);
}
/*
* We call btrfs_free_qgroup_config() when unmounting
@@ -1578,6 +1579,14 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid)
spin_lock(&fs_info->qgroup_lock);
del_qgroup_rb(fs_info, qgroupid);
spin_unlock(&fs_info->qgroup_lock);
+
+ /*
+ * Remove the qgroup from sysfs now without holding the qgroup_lock
+ * spinlock, since the sysfs_remove_group() function needs to take
+ * the mutex kernfs_mutex through kernfs_remove_by_name_ns().
+ */
+ btrfs_sysfs_del_one_qgroup(fs_info, qgroup);
+ kfree(qgroup);
out:
mutex_unlock(&fs_info->qgroup_ioctl_lock);
return ret;
@@ -2631,12 +2640,12 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
record->data_rsv,
BTRFS_QGROUP_RSV_DATA);
/*
- * Use SEQ_LAST as time_seq to do special search, which
- * doesn't lock tree or delayed_refs and search current
- * root. It's safe inside commit_transaction().
+ * Use BTRFS_SEQ_LAST as time_seq to do special search,
+ * which doesn't lock tree or delayed_refs and search
+ * current root. It's safe inside commit_transaction().
*/
ret = btrfs_find_all_roots(trans, fs_info,
- record->bytenr, SEQ_LAST, &new_roots, false);
+ record->bytenr, BTRFS_SEQ_LAST, &new_roots, false);
if (ret < 0)
goto cleanup;
if (qgroup_to_skip) {
@@ -3535,43 +3544,29 @@ static int try_flush_qgroup(struct btrfs_root *root)
{
struct btrfs_trans_handle *trans;
int ret;
- bool can_commit = true;
/*
- * If current process holds a transaction, we shouldn't flush, as we
- * assume all space reservation happens before a transaction handle is
- * held.
- *
- * But there are cases like btrfs_delayed_item_reserve_metadata() where
- * we try to reserve space with one transction handle already held.
- * In that case we can't commit transaction, but at least try to end it
- * and hope the started data writes can free some space.
+ * Can't hold an open transaction or we run the risk of deadlocking,
+ * and can't either be under the context of a send operation (where
+ * current->journal_info is set to BTRFS_SEND_TRANS_STUB), as that
+ * would result in a crash when starting a transaction and does not
+ * make sense either (send is a read-only operation).
*/
- if (current->journal_info &&
- current->journal_info != BTRFS_SEND_TRANS_STUB)
- can_commit = false;
+ ASSERT(current->journal_info == NULL);
+ if (WARN_ON(current->journal_info))
+ return 0;
/*
* We don't want to run flush again and again, so if there is a running
* one, we won't try to start a new flush, but exit directly.
*/
if (test_and_set_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state)) {
- /*
- * We are already holding a transaction, thus we can block other
- * threads from flushing. So exit right now. This increases
- * the chance of EDQUOT for heavy load and near limit cases.
- * But we can argue that if we're already near limit, EDQUOT is
- * unavoidable anyway.
- */
- if (!can_commit)
- return 0;
-
wait_event(root->qgroup_flush_wait,
!test_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state));
return 0;
}
- ret = btrfs_start_delalloc_snapshot(root);
+ ret = btrfs_start_delalloc_snapshot(root, true);
if (ret < 0)
goto out;
btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1);
@@ -3582,10 +3577,7 @@ static int try_flush_qgroup(struct btrfs_root *root)
goto out;
}
- if (can_commit)
- ret = btrfs_commit_transaction(trans);
- else
- ret = btrfs_end_transaction(trans);
+ ret = btrfs_commit_transaction(trans);
out:
clear_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state);
wake_up(&root->qgroup_flush_wait);
@@ -3638,8 +3630,7 @@ cleanup:
qgroup_unreserve_range(inode, reserved, start, len);
out:
if (new_reserved) {
- extent_changeset_release(reserved);
- kfree(reserved);
+ extent_changeset_free(reserved);
*reserved_ret = NULL;
}
return ret;
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index 8c31357f08ed..244d499ebc72 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -13,6 +13,7 @@
#include <linux/list_sort.h>
#include <linux/raid/xor.h>
#include <linux/mm.h>
+#include "misc.h"
#include "ctree.h"
#include "disk-io.h"
#include "volumes.h"
@@ -1231,13 +1232,13 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
/* first collect one page from each data stripe */
for (stripe = 0; stripe < nr_data; stripe++) {
p = page_in_rbio(rbio, stripe, pagenr, 0);
- pointers[stripe] = kmap(p);
+ pointers[stripe] = kmap_local_page(p);
}
/* then add the parity stripe */
p = rbio_pstripe_page(rbio, pagenr);
SetPageUptodate(p);
- pointers[stripe++] = kmap(p);
+ pointers[stripe++] = kmap_local_page(p);
if (has_qstripe) {
@@ -1247,7 +1248,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
*/
p = rbio_qstripe_page(rbio, pagenr);
SetPageUptodate(p);
- pointers[stripe++] = kmap(p);
+ pointers[stripe++] = kmap_local_page(p);
raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
pointers);
@@ -1256,10 +1257,8 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
copy_page(pointers[nr_data], pointers[0]);
run_xor(pointers + 1, nr_data - 1, PAGE_SIZE);
}
-
-
- for (stripe = 0; stripe < rbio->real_stripes; stripe++)
- kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
+ for (stripe = stripe - 1; stripe >= 0; stripe--)
+ kunmap_local(pointers[stripe]);
}
/*
@@ -1634,7 +1633,8 @@ struct btrfs_plug_cb {
/*
* rbios on the plug list are sorted for easier merging.
*/
-static int plug_cmp(void *priv, struct list_head *a, struct list_head *b)
+static int plug_cmp(void *priv, const struct list_head *a,
+ const struct list_head *b)
{
struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
plug_list);
@@ -1776,6 +1776,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
{
int pagenr, stripe;
void **pointers;
+ void **unmap_array;
int faila = -1, failb = -1;
struct page *page;
blk_status_t err;
@@ -1787,6 +1788,16 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
goto cleanup_io;
}
+ /*
+ * Store copy of pointers that does not get reordered during
+ * reconstruction so that kunmap_local works.
+ */
+ unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
+ if (!unmap_array) {
+ err = BLK_STS_RESOURCE;
+ goto cleanup_pointers;
+ }
+
faila = rbio->faila;
failb = rbio->failb;
@@ -1808,8 +1819,11 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
!test_bit(pagenr, rbio->dbitmap))
continue;
- /* setup our array of pointers with pages
- * from each stripe
+ /*
+ * Setup our array of pointers with pages from each stripe
+ *
+ * NOTE: store a duplicate array of pointers to preserve the
+ * pointer order
*/
for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
/*
@@ -1823,7 +1837,8 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
} else {
page = rbio_stripe_page(rbio, stripe, pagenr);
}
- pointers[stripe] = kmap(page);
+ pointers[stripe] = kmap_local_page(page);
+ unmap_array[stripe] = pointers[stripe];
}
/* all raid6 handling here */
@@ -1916,24 +1931,14 @@ pstripe:
}
}
}
- for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
- /*
- * if we're rebuilding a read, we have to use
- * pages from the bio list
- */
- if ((rbio->operation == BTRFS_RBIO_READ_REBUILD ||
- rbio->operation == BTRFS_RBIO_REBUILD_MISSING) &&
- (stripe == faila || stripe == failb)) {
- page = page_in_rbio(rbio, stripe, pagenr, 0);
- } else {
- page = rbio_stripe_page(rbio, stripe, pagenr);
- }
- kunmap(page);
- }
+ for (stripe = rbio->real_stripes - 1; stripe >= 0; stripe--)
+ kunmap_local(unmap_array[stripe]);
}
err = BLK_STS_OK;
cleanup:
+ kfree(unmap_array);
+cleanup_pointers:
kfree(pointers);
cleanup_io:
@@ -2358,13 +2363,13 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
goto cleanup;
}
SetPageUptodate(q_page);
- pointers[rbio->real_stripes - 1] = kmap(q_page);
+ pointers[rbio->real_stripes - 1] = kmap_local_page(q_page);
}
atomic_set(&rbio->error, 0);
/* Map the parity stripe just once */
- pointers[nr_data] = kmap(p_page);
+ pointers[nr_data] = kmap_local_page(p_page);
for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
struct page *p;
@@ -2372,7 +2377,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
/* first collect one page from each data stripe */
for (stripe = 0; stripe < nr_data; stripe++) {
p = page_in_rbio(rbio, stripe, pagenr, 0);
- pointers[stripe] = kmap(p);
+ pointers[stripe] = kmap_local_page(p);
}
if (has_qstripe) {
@@ -2387,22 +2392,22 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
/* Check scrubbing parity and repair it */
p = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
- parity = kmap(p);
+ parity = kmap_local_page(p);
if (memcmp(parity, pointers[rbio->scrubp], PAGE_SIZE))
copy_page(parity, pointers[rbio->scrubp]);
else
/* Parity is right, needn't writeback */
bitmap_clear(rbio->dbitmap, pagenr, 1);
- kunmap(p);
+ kunmap_local(parity);
- for (stripe = 0; stripe < nr_data; stripe++)
- kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
+ for (stripe = nr_data - 1; stripe >= 0; stripe--)
+ kunmap_local(pointers[stripe]);
}
- kunmap(p_page);
+ kunmap_local(pointers[nr_data]);
__free_page(p_page);
if (q_page) {
- kunmap(q_page);
+ kunmap_local(pointers[rbio->real_stripes - 1]);
__free_page(q_page);
}
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c
index 762881b777b3..9178da07cc9c 100644
--- a/fs/btrfs/reflink.c
+++ b/fs/btrfs/reflink.c
@@ -129,12 +129,8 @@ static int copy_inline_to_page(struct btrfs_inode *inode,
* So what's in the range [500, 4095] corresponds to zeroes.
*/
if (datal < block_size) {
- char *map;
-
- map = kmap(page);
- memset(map + datal, 0, block_size - datal);
+ memzero_page(page, datal, block_size - datal);
flush_dcache_page(page);
- kunmap(page);
}
SetPageUptodate(page);
@@ -207,10 +203,7 @@ static int clone_copy_inline_extent(struct inode *dst,
* inline extent's data to the page.
*/
ASSERT(key.offset > 0);
- ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset,
- inline_data, size, datal,
- comp_type);
- goto out;
+ goto copy_to_page;
}
} else if (i_size_read(dst) <= datal) {
struct btrfs_file_extent_item *ei;
@@ -226,13 +219,10 @@ static int clone_copy_inline_extent(struct inode *dst,
BTRFS_FILE_EXTENT_INLINE)
goto copy_inline_extent;
- ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset,
- inline_data, size, datal, comp_type);
- goto out;
+ goto copy_to_page;
}
copy_inline_extent:
- ret = 0;
/*
* We have no extent items, or we have an extent at offset 0 which may
* or may not be inlined. All these cases are dealt the same way.
@@ -244,11 +234,13 @@ copy_inline_extent:
* clone. Deal with all these cases by copying the inline extent
* data into the respective page at the destination inode.
*/
- ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset,
- inline_data, size, datal, comp_type);
- goto out;
+ goto copy_to_page;
}
+ /*
+ * Release path before starting a new transaction so we don't hold locks
+ * that would confuse lockdep.
+ */
btrfs_release_path(path);
/*
* If we end up here it means were copy the inline extent into a leaf
@@ -305,6 +297,21 @@ out:
*trans_out = trans;
return ret;
+
+copy_to_page:
+ /*
+ * Release our path because we don't need it anymore and also because
+ * copy_inline_to_page() needs to reserve data and metadata, which may
+ * need to flush delalloc when we are low on available space and
+ * therefore cause a deadlock if writeback of an inline extent needs to
+ * write to the same leaf or an ordered extent completion needs to write
+ * to the same leaf.
+ */
+ btrfs_release_path(path);
+
+ ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset,
+ inline_data, size, datal, comp_type);
+ goto out;
}
/**
@@ -478,9 +485,9 @@ process_slot:
clone_info.file_offset = new_key.offset;
clone_info.extent_buf = buf;
clone_info.is_new_extent = false;
- ret = btrfs_replace_file_extents(inode, path, drop_start,
- new_key.offset + datal - 1, &clone_info,
- &trans);
+ ret = btrfs_replace_file_extents(BTRFS_I(inode), path,
+ drop_start, new_key.offset + datal - 1,
+ &clone_info, &trans);
if (ret)
goto out;
} else if (type == BTRFS_FILE_EXTENT_INLINE) {
@@ -567,8 +574,8 @@ process_slot:
set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
&BTRFS_I(inode)->runtime_flags);
- ret = btrfs_replace_file_extents(inode, path, last_dest_end,
- destoff + len - 1, NULL, &trans);
+ ret = btrfs_replace_file_extents(BTRFS_I(inode), path,
+ last_dest_end, destoff + len - 1, NULL, &trans);
if (ret)
goto out;
@@ -604,6 +611,20 @@ static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
lock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
}
+static void btrfs_double_mmap_lock(struct inode *inode1, struct inode *inode2)
+{
+ if (inode1 < inode2)
+ swap(inode1, inode2);
+ down_write(&BTRFS_I(inode1)->i_mmap_lock);
+ down_write_nested(&BTRFS_I(inode2)->i_mmap_lock, SINGLE_DEPTH_NESTING);
+}
+
+static void btrfs_double_mmap_unlock(struct inode *inode1, struct inode *inode2)
+{
+ up_write(&BTRFS_I(inode1)->i_mmap_lock);
+ up_write(&BTRFS_I(inode2)->i_mmap_lock);
+}
+
static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len,
struct inode *dst, u64 dst_loff)
{
@@ -820,6 +841,16 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
len, remap_flags);
}
+static bool file_sync_write(const struct file *file)
+{
+ if (file->f_flags & (__O_SYNC | O_DSYNC))
+ return true;
+ if (IS_SYNC(file_inode(file)))
+ return true;
+
+ return false;
+}
+
loff_t btrfs_remap_file_range(struct file *src_file, loff_t off,
struct file *dst_file, loff_t destoff, loff_t len,
unsigned int remap_flags)
@@ -832,10 +863,12 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off,
if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY))
return -EINVAL;
- if (same_inode)
- inode_lock(src_inode);
- else
+ if (same_inode) {
+ btrfs_inode_lock(src_inode, BTRFS_ILOCK_MMAP);
+ } else {
lock_two_nondirectories(src_inode, dst_inode);
+ btrfs_double_mmap_lock(src_inode, dst_inode);
+ }
ret = btrfs_remap_file_range_prep(src_file, off, dst_file, destoff,
&len, remap_flags);
@@ -848,10 +881,27 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off,
ret = btrfs_clone_files(dst_file, src_file, off, len, destoff);
out_unlock:
- if (same_inode)
- inode_unlock(src_inode);
- else
+ if (same_inode) {
+ btrfs_inode_unlock(src_inode, BTRFS_ILOCK_MMAP);
+ } else {
+ btrfs_double_mmap_unlock(src_inode, dst_inode);
unlock_two_nondirectories(src_inode, dst_inode);
+ }
+
+ /*
+ * If either the source or the destination file was opened with O_SYNC,
+ * O_DSYNC or has the S_SYNC attribute, fsync both the destination and
+ * source files/ranges, so that after a successful return (0) followed
+ * by a power failure results in the reflinked data to be readable from
+ * both files/ranges.
+ */
+ if (ret == 0 && len > 0 &&
+ (file_sync_write(src_file) || file_sync_write(dst_file))) {
+ ret = btrfs_sync_file(src_file, off, off + len - 1, 0);
+ if (ret == 0)
+ ret = btrfs_sync_file(dst_file, destoff,
+ destoff + len - 1, 0);
+ }
return ret < 0 ? ret : len;
}
diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c
index 232d5da7b7be..b70be2ac2e9e 100644
--- a/fs/btrfs/relocation.c
+++ b/fs/btrfs/relocation.c
@@ -638,9 +638,10 @@ static int __must_check __add_reloc_root(struct btrfs_root *root)
node->bytenr, &node->rb_node);
spin_unlock(&rc->reloc_root_tree.lock);
if (rb_node) {
- btrfs_panic(fs_info, -EEXIST,
+ btrfs_err(fs_info,
"Duplicate root found for start=%llu while inserting into relocation tree",
node->bytenr);
+ return -EEXIST;
}
list_add_tail(&root->root_list, &rc->reloc_roots);
@@ -733,10 +734,12 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
struct extent_buffer *eb;
struct btrfs_root_item *root_item;
struct btrfs_key root_key;
- int ret;
+ int ret = 0;
+ bool must_abort = false;
root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
- BUG_ON(!root_item);
+ if (!root_item)
+ return ERR_PTR(-ENOMEM);
root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
root_key.type = BTRFS_ROOT_ITEM_KEY;
@@ -748,7 +751,9 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
/* called by btrfs_init_reloc_root */
ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
BTRFS_TREE_RELOC_OBJECTID);
- BUG_ON(ret);
+ if (ret)
+ goto fail;
+
/*
* Set the last_snapshot field to the generation of the commit
* root - like this ctree.c:btrfs_block_can_be_shared() behaves
@@ -769,9 +774,16 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
*/
ret = btrfs_copy_root(trans, root, root->node, &eb,
BTRFS_TREE_RELOC_OBJECTID);
- BUG_ON(ret);
+ if (ret)
+ goto fail;
}
+ /*
+ * We have changed references at this point, we must abort the
+ * transaction if anything fails.
+ */
+ must_abort = true;
+
memcpy(root_item, &root->root_item, sizeof(*root_item));
btrfs_set_root_bytenr(root_item, eb->start);
btrfs_set_root_level(root_item, btrfs_header_level(eb));
@@ -789,14 +801,25 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
ret = btrfs_insert_root(trans, fs_info->tree_root,
&root_key, root_item);
- BUG_ON(ret);
+ if (ret)
+ goto fail;
+
kfree(root_item);
reloc_root = btrfs_read_tree_root(fs_info->tree_root, &root_key);
- BUG_ON(IS_ERR(reloc_root));
+ if (IS_ERR(reloc_root)) {
+ ret = PTR_ERR(reloc_root);
+ goto abort;
+ }
set_bit(BTRFS_ROOT_SHAREABLE, &reloc_root->state);
reloc_root->last_trans = trans->transid;
return reloc_root;
+fail:
+ kfree(root_item);
+abort:
+ if (must_abort)
+ btrfs_abort_transaction(trans, ret);
+ return ERR_PTR(ret);
}
/*
@@ -856,9 +879,16 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
reloc_root = create_reloc_root(trans, root, root->root_key.objectid);
if (clear_rsv)
trans->block_rsv = rsv;
+ if (IS_ERR(reloc_root))
+ return PTR_ERR(reloc_root);
ret = __add_reloc_root(reloc_root);
- BUG_ON(ret < 0);
+ ASSERT(ret != -EEXIST);
+ if (ret) {
+ /* Pairs with create_reloc_root */
+ btrfs_put_root(reloc_root);
+ return ret;
+ }
root->reloc_root = btrfs_grab_root(reloc_root);
return 0;
}
@@ -875,7 +905,7 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
int ret;
if (!have_reloc_root(root))
- goto out;
+ return 0;
reloc_root = root->reloc_root;
root_item = &reloc_root->root_item;
@@ -908,10 +938,8 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
ret = btrfs_update_root(trans, fs_info->tree_root,
&reloc_root->root_key, root_item);
- BUG_ON(ret);
btrfs_put_root(reloc_root);
-out:
- return 0;
+ return ret;
}
/*
@@ -1185,8 +1213,8 @@ int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc,
int ret;
int slot;
- BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
- BUG_ON(dest->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);
+ ASSERT(src->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);
+ ASSERT(dest->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
last_snapshot = btrfs_root_last_snapshot(&src->root_item);
again:
@@ -1205,7 +1233,11 @@ again:
if (cow) {
ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb,
BTRFS_NESTING_COW);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_tree_unlock(eb);
+ free_extent_buffer(eb);
+ return ret;
+ }
}
if (next_key) {
@@ -1217,7 +1249,7 @@ again:
parent = eb;
while (1) {
level = btrfs_header_level(parent);
- BUG_ON(level < lowest_level);
+ ASSERT(level >= lowest_level);
ret = btrfs_bin_search(parent, &key, &slot);
if (ret < 0)
@@ -1265,7 +1297,11 @@ again:
ret = btrfs_cow_block(trans, dest, eb, parent,
slot, &eb,
BTRFS_NESTING_COW);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_tree_unlock(eb);
+ free_extent_buffer(eb);
+ break;
+ }
}
btrfs_tree_unlock(parent);
@@ -1289,7 +1325,11 @@ again:
path->lowest_level = level;
ret = btrfs_search_slot(trans, src, &key, path, 0, 1);
path->lowest_level = 0;
- BUG_ON(ret);
+ if (ret) {
+ if (ret > 0)
+ ret = -ENOENT;
+ break;
+ }
/*
* Info qgroup to trace both subtrees.
@@ -1329,27 +1369,39 @@ again:
ref.skip_qgroup = true;
btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid);
ret = btrfs_inc_extent_ref(trans, &ref);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ break;
+ }
btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr,
blocksize, 0);
ref.skip_qgroup = true;
btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid);
ret = btrfs_inc_extent_ref(trans, &ref);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ break;
+ }
btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, new_bytenr,
blocksize, path->nodes[level]->start);
btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid);
ref.skip_qgroup = true;
ret = btrfs_free_extent(trans, &ref);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ break;
+ }
btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, old_bytenr,
blocksize, 0);
btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid);
ref.skip_qgroup = true;
ret = btrfs_free_extent(trans, &ref);
- BUG_ON(ret);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ break;
+ }
btrfs_unlock_up_safe(path, 0);
@@ -1537,12 +1589,13 @@ static int find_next_key(struct btrfs_path *path, int level,
/*
* Insert current subvolume into reloc_control::dirty_subvol_roots
*/
-static void insert_dirty_subvol(struct btrfs_trans_handle *trans,
- struct reloc_control *rc,
- struct btrfs_root *root)
+static int insert_dirty_subvol(struct btrfs_trans_handle *trans,
+ struct reloc_control *rc,
+ struct btrfs_root *root)
{
struct btrfs_root *reloc_root = root->reloc_root;
struct btrfs_root_item *reloc_root_item;
+ int ret;
/* @root must be a subvolume tree root with a valid reloc tree */
ASSERT(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
@@ -1553,12 +1606,16 @@ static void insert_dirty_subvol(struct btrfs_trans_handle *trans,
sizeof(reloc_root_item->drop_progress));
btrfs_set_root_drop_level(reloc_root_item, 0);
btrfs_set_root_refs(reloc_root_item, 0);
- btrfs_update_reloc_root(trans, root);
+ ret = btrfs_update_reloc_root(trans, root);
+ if (ret)
+ return ret;
if (list_empty(&root->reloc_dirty_list)) {
btrfs_grab_root(root);
list_add_tail(&root->reloc_dirty_list, &rc->dirty_subvol_roots);
}
+
+ return 0;
}
static int clean_dirty_subvols(struct reloc_control *rc)
@@ -1760,8 +1817,11 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
out:
btrfs_free_path(path);
- if (ret == 0)
- insert_dirty_subvol(trans, rc, root);
+ if (ret == 0) {
+ ret = insert_dirty_subvol(trans, rc, root);
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
+ }
if (trans)
btrfs_end_transaction_throttle(trans);
@@ -1825,8 +1885,18 @@ again:
root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
false);
- BUG_ON(IS_ERR(root));
- BUG_ON(root->reloc_root != reloc_root);
+ if (IS_ERR(root)) {
+ /*
+ * Even if we have an error we need this reloc root
+ * back on our list so we can clean up properly.
+ */
+ list_add(&reloc_root->root_list, &reloc_roots);
+ btrfs_abort_transaction(trans, (int)PTR_ERR(root));
+ if (!err)
+ err = PTR_ERR(root);
+ break;
+ }
+ ASSERT(root->reloc_root == reloc_root);
/*
* set reference count to 1, so btrfs_recover_relocation
@@ -1834,16 +1904,27 @@ again:
*/
if (!err)
btrfs_set_root_refs(&reloc_root->root_item, 1);
- btrfs_update_reloc_root(trans, root);
+ ret = btrfs_update_reloc_root(trans, root);
+ /*
+ * Even if we have an error we need this reloc root back on our
+ * list so we can clean up properly.
+ */
list_add(&reloc_root->root_list, &reloc_roots);
btrfs_put_root(root);
+
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ if (!err)
+ err = ret;
+ break;
+ }
}
list_splice(&reloc_roots, &rc->reloc_roots);
if (!err)
- btrfs_commit_transaction(trans);
+ err = btrfs_commit_transaction(trans);
else
btrfs_end_transaction(trans);
return err;
@@ -1888,8 +1969,29 @@ again:
root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
false);
if (btrfs_root_refs(&reloc_root->root_item) > 0) {
- BUG_ON(IS_ERR(root));
- BUG_ON(root->reloc_root != reloc_root);
+ if (IS_ERR(root)) {
+ /*
+ * For recovery we read the fs roots on mount,
+ * and if we didn't find the root then we marked
+ * the reloc root as a garbage root. For normal
+ * relocation obviously the root should exist in
+ * memory. However there's no reason we can't
+ * handle the error properly here just in case.
+ */
+ ASSERT(0);
+ ret = PTR_ERR(root);
+ goto out;
+ }
+ if (root->reloc_root != reloc_root) {
+ /*
+ * This is actually impossible without something
+ * going really wrong (like weird race condition
+ * or cosmic rays).
+ */
+ ASSERT(0);
+ ret = -EINVAL;
+ goto out;
+ }
ret = merge_reloc_root(rc, root);
btrfs_put_root(root);
if (ret) {
@@ -1971,8 +2073,27 @@ static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
return 0;
root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, false);
- BUG_ON(IS_ERR(root));
- BUG_ON(root->reloc_root != reloc_root);
+
+ /*
+ * This should succeed, since we can't have a reloc root without having
+ * already looked up the actual root and created the reloc root for this
+ * root.
+ *
+ * However if there's some sort of corruption where we have a ref to a
+ * reloc root without a corresponding root this could return ENOENT.
+ */
+ if (IS_ERR(root)) {
+ ASSERT(0);
+ return PTR_ERR(root);
+ }
+ if (root->reloc_root != reloc_root) {
+ ASSERT(0);
+ btrfs_err(fs_info,
+ "root %llu has two reloc roots associated with it",
+ reloc_root->root_key.offset);
+ btrfs_put_root(root);
+ return -EUCLEAN;
+ }
ret = btrfs_record_root_in_trans(trans, root);
btrfs_put_root(root);
@@ -1988,26 +2109,77 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
struct btrfs_backref_node *next;
struct btrfs_root *root;
int index = 0;
+ int ret;
next = node;
while (1) {
cond_resched();
next = walk_up_backref(next, edges, &index);
root = next->root;
- BUG_ON(!root);
- BUG_ON(!test_bit(BTRFS_ROOT_SHAREABLE, &root->state));
+
+ /*
+ * If there is no root, then our references for this block are
+ * incomplete, as we should be able to walk all the way up to a
+ * block that is owned by a root.
+ *
+ * This path is only for SHAREABLE roots, so if we come upon a
+ * non-SHAREABLE root then we have backrefs that resolve
+ * improperly.
+ *
+ * Both of these cases indicate file system corruption, or a bug
+ * in the backref walking code.
+ */
+ if (!root) {
+ ASSERT(0);
+ btrfs_err(trans->fs_info,
+ "bytenr %llu doesn't have a backref path ending in a root",
+ node->bytenr);
+ return ERR_PTR(-EUCLEAN);
+ }
+ if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
+ ASSERT(0);
+ btrfs_err(trans->fs_info,
+ "bytenr %llu has multiple refs with one ending in a non-shareable root",
+ node->bytenr);
+ return ERR_PTR(-EUCLEAN);
+ }
if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
- record_reloc_root_in_trans(trans, root);
+ ret = record_reloc_root_in_trans(trans, root);
+ if (ret)
+ return ERR_PTR(ret);
break;
}
- btrfs_record_root_in_trans(trans, root);
+ ret = btrfs_record_root_in_trans(trans, root);
+ if (ret)
+ return ERR_PTR(ret);
root = root->reloc_root;
+ /*
+ * We could have raced with another thread which failed, so
+ * root->reloc_root may not be set, return ENOENT in this case.
+ */
+ if (!root)
+ return ERR_PTR(-ENOENT);
+
if (next->new_bytenr != root->node->start) {
- BUG_ON(next->new_bytenr);
- BUG_ON(!list_empty(&next->list));
+ /*
+ * We just created the reloc root, so we shouldn't have
+ * ->new_bytenr set and this shouldn't be in the changed
+ * list. If it is then we have multiple roots pointing
+ * at the same bytenr which indicates corruption, or
+ * we've made a mistake in the backref walking code.
+ */
+ ASSERT(next->new_bytenr == 0);
+ ASSERT(list_empty(&next->list));
+ if (next->new_bytenr || !list_empty(&next->list)) {
+ btrfs_err(trans->fs_info,
+ "bytenr %llu possibly has multiple roots pointing at the same bytenr %llu",
+ node->bytenr, next->bytenr);
+ return ERR_PTR(-EUCLEAN);
+ }
+
next->new_bytenr = root->node->start;
btrfs_put_root(next->root);
next->root = btrfs_grab_root(root);
@@ -2024,8 +2196,14 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
if (!next || next->level <= node->level)
break;
}
- if (!root)
- return NULL;
+ if (!root) {
+ /*
+ * This can happen if there's fs corruption or if there's a bug
+ * in the backref lookup code.
+ */
+ ASSERT(0);
+ return ERR_PTR(-ENOENT);
+ }
next = node;
/* setup backref node path for btrfs_reloc_cow_block */
@@ -2061,7 +2239,13 @@ struct btrfs_root *select_one_root(struct btrfs_backref_node *node)
cond_resched();
next = walk_up_backref(next, edges, &index);
root = next->root;
- BUG_ON(!root);
+
+ /*
+ * This can occur if we have incomplete extent refs leading all
+ * the way up a particular path, in this case return -EUCLEAN.
+ */
+ if (!root)
+ return ERR_PTR(-EUCLEAN);
/* No other choice for non-shareable tree */
if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
@@ -2181,7 +2365,11 @@ static int do_relocation(struct btrfs_trans_handle *trans,
int slot;
int ret = 0;
- BUG_ON(lowest && node->eb);
+ /*
+ * If we are lowest then this is the first time we're processing this
+ * block, and thus shouldn't have an eb associated with it yet.
+ */
+ ASSERT(!lowest || !node->eb);
path->lowest_level = node->level + 1;
rc->backref_cache.path[node->level] = node;
@@ -2192,7 +2380,10 @@ static int do_relocation(struct btrfs_trans_handle *trans,
upper = edge->node[UPPER];
root = select_reloc_root(trans, rc, upper, edges);
- BUG_ON(!root);
+ if (IS_ERR(root)) {
+ ret = PTR_ERR(root);
+ goto next;
+ }
if (upper->eb && !upper->locked) {
if (!lowest) {
@@ -2266,7 +2457,11 @@ static int do_relocation(struct btrfs_trans_handle *trans,
free_extent_buffer(eb);
if (ret < 0)
goto next;
- BUG_ON(node->eb != eb);
+ /*
+ * We've just COWed this block, it should have updated
+ * the correct backref node entry.
+ */
+ ASSERT(node->eb == eb);
} else {
btrfs_set_node_blockptr(upper->eb, slot,
node->eb->start);
@@ -2281,10 +2476,11 @@ static int do_relocation(struct btrfs_trans_handle *trans,
btrfs_init_tree_ref(&ref, node->level,
btrfs_header_owner(upper->eb));
ret = btrfs_inc_extent_ref(trans, &ref);
- BUG_ON(ret);
-
- ret = btrfs_drop_subtree(trans, root, eb, upper->eb);
- BUG_ON(ret);
+ if (!ret)
+ ret = btrfs_drop_subtree(trans, root, eb,
+ upper->eb);
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
}
next:
if (!upper->pending)
@@ -2302,7 +2498,12 @@ next:
}
path->lowest_level = 0;
- BUG_ON(ret == -ENOSPC);
+
+ /*
+ * We should have allocated all of our space in the block rsv and thus
+ * shouldn't ENOSPC.
+ */
+ ASSERT(ret != -ENOSPC);
return ret;
}
@@ -2434,16 +2635,53 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans,
BUG_ON(node->processed);
root = select_one_root(node);
- if (root == ERR_PTR(-ENOENT)) {
- update_processed_blocks(rc, node);
+ if (IS_ERR(root)) {
+ ret = PTR_ERR(root);
+
+ /* See explanation in select_one_root for the -EUCLEAN case. */
+ ASSERT(ret == -ENOENT);
+ if (ret == -ENOENT) {
+ ret = 0;
+ update_processed_blocks(rc, node);
+ }
goto out;
}
if (root) {
if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
- BUG_ON(node->new_bytenr);
- BUG_ON(!list_empty(&node->list));
- btrfs_record_root_in_trans(trans, root);
+ /*
+ * This block was the root block of a root, and this is
+ * the first time we're processing the block and thus it
+ * should not have had the ->new_bytenr modified and
+ * should have not been included on the changed list.
+ *
+ * However in the case of corruption we could have
+ * multiple refs pointing to the same block improperly,
+ * and thus we would trip over these checks. ASSERT()
+ * for the developer case, because it could indicate a
+ * bug in the backref code, however error out for a
+ * normal user in the case of corruption.
+ */
+ ASSERT(node->new_bytenr == 0);
+ ASSERT(list_empty(&node->list));
+ if (node->new_bytenr || !list_empty(&node->list)) {
+ btrfs_err(root->fs_info,
+ "bytenr %llu has improper references to it",
+ node->bytenr);
+ ret = -EUCLEAN;
+ goto out;
+ }
+ ret = btrfs_record_root_in_trans(trans, root);
+ if (ret)
+ goto out;
+ /*
+ * Another thread could have failed, need to check if we
+ * have reloc_root actually set.
+ */
+ if (!root->reloc_root) {
+ ret = -ENOENT;
+ goto out;
+ }
root = root->reloc_root;
node->new_bytenr = root->node->start;
btrfs_put_root(node->root);
@@ -2578,7 +2816,7 @@ static noinline_for_stack int prealloc_file_extent_cluster(
return btrfs_end_transaction(trans);
}
- inode_lock(&inode->vfs_inode);
+ btrfs_inode_lock(&inode->vfs_inode, 0);
for (nr = 0; nr < cluster->nr; nr++) {
start = cluster->boundary[nr] - offset;
if (nr + 1 < cluster->nr)
@@ -2596,7 +2834,7 @@ static noinline_for_stack int prealloc_file_extent_cluster(
if (ret)
break;
}
- inode_unlock(&inode->vfs_inode);
+ btrfs_inode_unlock(&inode->vfs_inode, 0);
if (cur_offset < prealloc_end)
btrfs_free_reserved_data_space_noquota(inode->root->fs_info,
@@ -3220,20 +3458,6 @@ static void unset_reloc_control(struct reloc_control *rc)
mutex_unlock(&fs_info->reloc_mutex);
}
-static int check_extent_flags(u64 flags)
-{
- if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
- (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
- return 1;
- if (!(flags & BTRFS_EXTENT_FLAG_DATA) &&
- !(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
- return 1;
- if ((flags & BTRFS_EXTENT_FLAG_DATA) &&
- (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
- return 1;
- return 0;
-}
-
static noinline_for_stack
int prepare_to_relocate(struct reloc_control *rc)
{
@@ -3272,8 +3496,7 @@ int prepare_to_relocate(struct reloc_control *rc)
*/
return PTR_ERR(trans);
}
- btrfs_commit_transaction(trans);
- return 0;
+ return btrfs_commit_transaction(trans);
}
static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
@@ -3285,7 +3508,6 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
struct btrfs_path *path;
struct btrfs_extent_item *ei;
u64 flags;
- u32 item_size;
int ret;
int err = 0;
int progress = 0;
@@ -3334,19 +3556,7 @@ restart:
ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_extent_item);
- item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);
- if (item_size >= sizeof(*ei)) {
- flags = btrfs_extent_flags(path->nodes[0], ei);
- ret = check_extent_flags(flags);
- BUG_ON(ret);
- } else if (unlikely(item_size == sizeof(struct btrfs_extent_item_v0))) {
- err = -EINVAL;
- btrfs_print_v0_err(trans->fs_info);
- btrfs_abort_transaction(trans, err);
- break;
- } else {
- BUG();
- }
+ flags = btrfs_extent_flags(path->nodes[0], ei);
if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
ret = add_tree_block(rc, &key, path, &blocks);
@@ -3445,7 +3655,9 @@ restart:
err = PTR_ERR(trans);
goto out_free;
}
- btrfs_commit_transaction(trans);
+ ret = btrfs_commit_transaction(trans);
+ if (ret && !err)
+ err = ret;
out_free:
ret = clean_dirty_subvols(rc);
if (ret < 0 && !err)
@@ -3488,6 +3700,35 @@ out:
return ret;
}
+static void delete_orphan_inode(struct btrfs_trans_handle *trans,
+ struct btrfs_root *root, u64 objectid)
+{
+ struct btrfs_path *path;
+ struct btrfs_key key;
+ int ret = 0;
+
+ path = btrfs_alloc_path();
+ if (!path) {
+ ret = -ENOMEM;
+ goto out;
+ }
+
+ key.objectid = objectid;
+ key.type = BTRFS_INODE_ITEM_KEY;
+ key.offset = 0;
+ ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
+ if (ret) {
+ if (ret > 0)
+ ret = -ENOENT;
+ goto out;
+ }
+ ret = btrfs_del_item(trans, root, path);
+out:
+ if (ret)
+ btrfs_abort_transaction(trans, ret);
+ btrfs_free_path(path);
+}
+
/*
* helper to create inode for data relocation.
* the inode is in data relocation tree and its link count is 0
@@ -3514,10 +3755,16 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info,
goto out;
err = __insert_orphan_inode(trans, root, objectid);
- BUG_ON(err);
+ if (err)
+ goto out;
inode = btrfs_iget(fs_info->sb, objectid, root);
- BUG_ON(IS_ERR(inode));
+ if (IS_ERR(inode)) {
+ delete_orphan_inode(trans, root, objectid);
+ err = PTR_ERR(inode);
+ inode = NULL;
+ goto out;
+ }
BTRFS_I(inode)->index_cnt = group->start;
err = btrfs_orphan_add(trans, BTRFS_I(inode));
@@ -3859,7 +4106,13 @@ int btrfs_recover_relocation(struct btrfs_root *root)
}
err = __add_reloc_root(reloc_root);
- BUG_ON(err < 0); /* -ENOMEM or logic error */
+ ASSERT(err != -EEXIST);
+ if (err) {
+ list_add_tail(&reloc_root->root_list, &reloc_roots);
+ btrfs_put_root(fs_root);
+ btrfs_end_transaction(trans);
+ goto out_unset;
+ }
fs_root->reloc_root = btrfs_grab_root(reloc_root);
btrfs_put_root(fs_root);
}
@@ -4074,7 +4327,12 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
return PTR_ERR(reloc_root);
ret = __add_reloc_root(reloc_root);
- BUG_ON(ret < 0);
+ ASSERT(ret != -EEXIST);
+ if (ret) {
+ /* Pairs with create_reloc_root */
+ btrfs_put_root(reloc_root);
+ return ret;
+ }
new_root->reloc_root = btrfs_grab_root(reloc_root);
if (rc->create_reloc_tree)
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 3d9088eab2fc..485cda3eb8d7 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -206,9 +206,6 @@ struct full_stripe_lock {
struct mutex mutex;
};
-static void scrub_pending_bio_inc(struct scrub_ctx *sctx);
-static void scrub_pending_bio_dec(struct scrub_ctx *sctx);
-static int scrub_handle_errored_block(struct scrub_block *sblock_to_check);
static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
struct scrub_block *sblocks_for_recheck);
static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
@@ -226,14 +223,11 @@ static int scrub_write_page_to_dev_replace(struct scrub_block *sblock,
static int scrub_checksum_data(struct scrub_block *sblock);
static int scrub_checksum_tree_block(struct scrub_block *sblock);
static int scrub_checksum_super(struct scrub_block *sblock);
-static void scrub_block_get(struct scrub_block *sblock);
static void scrub_block_put(struct scrub_block *sblock);
static void scrub_page_get(struct scrub_page *spage);
static void scrub_page_put(struct scrub_page *spage);
static void scrub_parity_get(struct scrub_parity *sparity);
static void scrub_parity_put(struct scrub_parity *sparity);
-static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx,
- struct scrub_page *spage);
static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u32 len,
u64 physical, struct btrfs_device *dev, u64 flags,
u64 gen, int mirror_num, u8 *csum,
@@ -251,8 +245,6 @@ static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx,
static void scrub_wr_submit(struct scrub_ctx *sctx);
static void scrub_wr_bio_end_io(struct bio *bio);
static void scrub_wr_bio_end_io_worker(struct btrfs_work *work);
-static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
-static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info);
static void scrub_put_ctx(struct scrub_ctx *sctx);
static inline int scrub_is_page_on_raid56(struct scrub_page *spage)
@@ -3682,8 +3674,8 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
spin_lock(&cache->lock);
if (!cache->to_copy) {
spin_unlock(&cache->lock);
- ro_set = 0;
- goto done;
+ btrfs_put_block_group(cache);
+ goto skip;
}
spin_unlock(&cache->lock);
}
@@ -3841,7 +3833,6 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx,
cache, found_key.offset))
ro_set = 0;
-done:
down_write(&dev_replace->rwsem);
dev_replace->cursor_left = dev_replace->cursor_right;
dev_replace->item_needs_writeback = 1;
diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c
index 8f323859156b..bd69db72acc5 100644
--- a/fs/btrfs/send.c
+++ b/fs/btrfs/send.c
@@ -6650,6 +6650,7 @@ static int full_send_tree(struct send_ctx *sctx)
path = alloc_path_for_send();
if (!path)
return -ENOMEM;
+ path->reada = READA_FORWARD_ALWAYS;
key.objectid = BTRFS_FIRST_FREE_OBJECTID;
key.type = BTRFS_INODE_ITEM_KEY;
@@ -6688,15 +6689,35 @@ out:
return ret;
}
-static int tree_move_down(struct btrfs_path *path, int *level)
+static int tree_move_down(struct btrfs_path *path, int *level, u64 reada_min_gen)
{
struct extent_buffer *eb;
+ struct extent_buffer *parent = path->nodes[*level];
+ int slot = path->slots[*level];
+ const int nritems = btrfs_header_nritems(parent);
+ u64 reada_max;
+ u64 reada_done = 0;
BUG_ON(*level == 0);
- eb = btrfs_read_node_slot(path->nodes[*level], path->slots[*level]);
+ eb = btrfs_read_node_slot(parent, slot);
if (IS_ERR(eb))
return PTR_ERR(eb);
+ /*
+ * Trigger readahead for the next leaves we will process, so that it is
+ * very likely that when we need them they are already in memory and we
+ * will not block on disk IO. For nodes we only do readahead for one,
+ * since the time window between processing nodes is typically larger.
+ */
+ reada_max = (*level == 1 ? SZ_128K : eb->fs_info->nodesize);
+
+ for (slot++; slot < nritems && reada_done < reada_max; slot++) {
+ if (btrfs_node_ptr_generation(parent, slot) > reada_min_gen) {
+ btrfs_readahead_node_child(parent, slot);
+ reada_done += eb->fs_info->nodesize;
+ }
+ }
+
path->nodes[*level - 1] = eb;
path->slots[*level - 1] = 0;
(*level)--;
@@ -6736,14 +6757,15 @@ static int tree_move_next_or_upnext(struct btrfs_path *path,
static int tree_advance(struct btrfs_path *path,
int *level, int root_level,
int allow_down,
- struct btrfs_key *key)
+ struct btrfs_key *key,
+ u64 reada_min_gen)
{
int ret;
if (*level == 0 || !allow_down) {
ret = tree_move_next_or_upnext(path, level, root_level);
} else {
- ret = tree_move_down(path, level);
+ ret = tree_move_down(path, level, reada_min_gen);
}
if (ret >= 0) {
if (*level == 0)
@@ -6817,6 +6839,7 @@ static int btrfs_compare_trees(struct btrfs_root *left_root,
u64 right_blockptr;
u64 left_gen;
u64 right_gen;
+ u64 reada_min_gen;
left_path = btrfs_alloc_path();
if (!left_path) {
@@ -6896,6 +6919,14 @@ static int btrfs_compare_trees(struct btrfs_root *left_root,
ret = -ENOMEM;
goto out;
}
+ /*
+ * Our right root is the parent root, while the left root is the "send"
+ * root. We know that all new nodes/leaves in the left root must have
+ * a generation greater than the right root's generation, so we trigger
+ * readahead for those nodes and leaves of the left root, as we know we
+ * will need to read them at some point.
+ */
+ reada_min_gen = btrfs_header_generation(right_root->commit_root);
up_read(&fs_info->commit_root_sem);
if (left_level == 0)
@@ -6920,7 +6951,7 @@ static int btrfs_compare_trees(struct btrfs_root *left_root,
ret = tree_advance(left_path, &left_level,
left_root_level,
advance_left != ADVANCE_ONLY_NEXT,
- &left_key);
+ &left_key, reada_min_gen);
if (ret == -1)
left_end_reached = ADVANCE;
else if (ret < 0)
@@ -6931,7 +6962,7 @@ static int btrfs_compare_trees(struct btrfs_root *left_root,
ret = tree_advance(right_path, &right_level,
right_root_level,
advance_right != ADVANCE_ONLY_NEXT,
- &right_key);
+ &right_key, reada_min_gen);
if (ret == -1)
right_end_reached = ADVANCE;
else if (ret < 0)
@@ -7139,7 +7170,7 @@ static int flush_delalloc_roots(struct send_ctx *sctx)
int i;
if (root) {
- ret = btrfs_start_delalloc_snapshot(root);
+ ret = btrfs_start_delalloc_snapshot(root, false);
if (ret)
return ret;
btrfs_wait_ordered_extents(root, U64_MAX, 0, U64_MAX);
@@ -7147,7 +7178,7 @@ static int flush_delalloc_roots(struct send_ctx *sctx)
for (i = 0; i < sctx->clone_roots_cnt; i++) {
root = sctx->clone_roots[i].root;
- ret = btrfs_start_delalloc_snapshot(root);
+ ret = btrfs_start_delalloc_snapshot(root, false);
if (ret)
return ret;
btrfs_wait_ordered_extents(root, U64_MAX, 0, U64_MAX);
diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c
index 2da6177f4b0b..2dc674b7c3b1 100644
--- a/fs/btrfs/space-info.c
+++ b/fs/btrfs/space-info.c
@@ -861,8 +861,8 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
* of heavy DIO or ordered reservations, preemptive flushing will just
* waste time and cause us to slow down.
*/
- ordered = percpu_counter_sum_positive(&fs_info->ordered_bytes);
- delalloc = percpu_counter_sum_positive(&fs_info->delalloc_bytes);
+ ordered = percpu_counter_read_positive(&fs_info->ordered_bytes);
+ delalloc = percpu_counter_read_positive(&fs_info->delalloc_bytes);
if (ordered >= delalloc)
used += fs_info->delayed_refs_rsv.reserved +
fs_info->delayed_block_rsv.reserved;
diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c
index c69049e7daa9..2d19089ab625 100644
--- a/fs/btrfs/subpage.c
+++ b/fs/btrfs/subpage.c
@@ -4,6 +4,64 @@
#include "ctree.h"
#include "subpage.h"
+/*
+ * Subpage (sectorsize < PAGE_SIZE) support overview:
+ *
+ * Limitations:
+ *
+ * - Only support 64K page size for now
+ * This is to make metadata handling easier, as 64K page would ensure
+ * all nodesize would fit inside one page, thus we don't need to handle
+ * cases where a tree block crosses several pages.
+ *
+ * - Only metadata read-write for now
+ * The data read-write part is in development.
+ *
+ * - Metadata can't cross 64K page boundary
+ * btrfs-progs and kernel have done that for a while, thus only ancient
+ * filesystems could have such problem. For such case, do a graceful
+ * rejection.
+ *
+ * Special behavior:
+ *
+ * - Metadata
+ * Metadata read is fully supported.
+ * Meaning when reading one tree block will only trigger the read for the
+ * needed range, other unrelated range in the same page will not be touched.
+ *
+ * Metadata write support is partial.
+ * The writeback is still for the full page, but we will only submit
+ * the dirty extent buffers in the page.
+ *
+ * This means, if we have a metadata page like this:
+ *
+ * Page offset
+ * 0 16K 32K 48K 64K
+ * |/////////| |///////////|
+ * \- Tree block A \- Tree block B
+ *
+ * Even if we just want to writeback tree block A, we will also writeback
+ * tree block B if it's also dirty.
+ *
+ * This may cause extra metadata writeback which results more COW.
+ *
+ * Implementation:
+ *
+ * - Common
+ * Both metadata and data will use a new structure, btrfs_subpage, to
+ * record the status of each sector inside a page. This provides the extra
+ * granularity needed.
+ *
+ * - Metadata
+ * Since we have multiple tree blocks inside one page, we can't rely on page
+ * locking anymore, or we will have greatly reduced concurrency or even
+ * deadlocks (hold one tree lock while trying to lock another tree lock in
+ * the same page).
+ *
+ * Thus for metadata locking, subpage support relies on io_tree locking only.
+ * This means a slightly higher tree locking latency.
+ */
+
int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
struct page *page, enum btrfs_subpage_type type)
{
@@ -220,6 +278,82 @@ void btrfs_subpage_clear_error(const struct btrfs_fs_info *fs_info,
spin_unlock_irqrestore(&subpage->lock, flags);
}
+void btrfs_subpage_set_dirty(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
+ unsigned long flags;
+
+ spin_lock_irqsave(&subpage->lock, flags);
+ subpage->dirty_bitmap |= tmp;
+ spin_unlock_irqrestore(&subpage->lock, flags);
+ set_page_dirty(page);
+}
+
+/*
+ * Extra clear_and_test function for subpage dirty bitmap.
+ *
+ * Return true if we're the last bits in the dirty_bitmap and clear the
+ * dirty_bitmap.
+ * Return false otherwise.
+ *
+ * NOTE: Callers should manually clear page dirty for true case, as we have
+ * extra handling for tree blocks.
+ */
+bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
+ unsigned long flags;
+ bool last = false;
+
+ spin_lock_irqsave(&subpage->lock, flags);
+ subpage->dirty_bitmap &= ~tmp;
+ if (subpage->dirty_bitmap == 0)
+ last = true;
+ spin_unlock_irqrestore(&subpage->lock, flags);
+ return last;
+}
+
+void btrfs_subpage_clear_dirty(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ bool last;
+
+ last = btrfs_subpage_clear_and_test_dirty(fs_info, page, start, len);
+ if (last)
+ clear_page_dirty_for_io(page);
+}
+
+void btrfs_subpage_set_writeback(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
+ unsigned long flags;
+
+ spin_lock_irqsave(&subpage->lock, flags);
+ subpage->writeback_bitmap |= tmp;
+ set_page_writeback(page);
+ spin_unlock_irqrestore(&subpage->lock, flags);
+}
+
+void btrfs_subpage_clear_writeback(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len)
+{
+ struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private;
+ u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len);
+ unsigned long flags;
+
+ spin_lock_irqsave(&subpage->lock, flags);
+ subpage->writeback_bitmap &= ~tmp;
+ if (subpage->writeback_bitmap == 0)
+ end_page_writeback(page);
+ spin_unlock_irqrestore(&subpage->lock, flags);
+}
+
/*
* Unlike set/clear which is dependent on each page status, for test all bits
* are tested in the same way.
@@ -240,6 +374,8 @@ bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info, \
}
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(uptodate);
IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(error);
+IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(dirty);
+IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(writeback);
/*
* Note that, in selftests (extent-io-tests), we can have empty fs_info passed
@@ -276,3 +412,7 @@ bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info, \
IMPLEMENT_BTRFS_PAGE_OPS(uptodate, SetPageUptodate, ClearPageUptodate,
PageUptodate);
IMPLEMENT_BTRFS_PAGE_OPS(error, SetPageError, ClearPageError, PageError);
+IMPLEMENT_BTRFS_PAGE_OPS(dirty, set_page_dirty, clear_page_dirty_for_io,
+ PageDirty);
+IMPLEMENT_BTRFS_PAGE_OPS(writeback, set_page_writeback, end_page_writeback,
+ PageWriteback);
diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h
index b86a4881475d..bfd626e955be 100644
--- a/fs/btrfs/subpage.h
+++ b/fs/btrfs/subpage.h
@@ -20,6 +20,8 @@ struct btrfs_subpage {
spinlock_t lock;
u16 uptodate_bitmap;
u16 error_bitmap;
+ u16 dirty_bitmap;
+ u16 writeback_bitmap;
union {
/*
* Structures only used by metadata
@@ -87,5 +89,10 @@ bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info, \
DECLARE_BTRFS_SUBPAGE_OPS(uptodate);
DECLARE_BTRFS_SUBPAGE_OPS(error);
+DECLARE_BTRFS_SUBPAGE_OPS(dirty);
+DECLARE_BTRFS_SUBPAGE_OPS(writeback);
+
+bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info,
+ struct page *page, u64 start, u32 len);
#endif
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index f7a4ad86adee..4a396c1147f1 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -252,6 +252,32 @@ void __cold btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, .
}
#endif
+#if BITS_PER_LONG == 32
+void __cold btrfs_warn_32bit_limit(struct btrfs_fs_info *fs_info)
+{
+ if (!test_and_set_bit(BTRFS_FS_32BIT_WARN, &fs_info->flags)) {
+ btrfs_warn(fs_info, "reaching 32bit limit for logical addresses");
+ btrfs_warn(fs_info,
+"due to page cache limit on 32bit systems, btrfs can't access metadata at or beyond %lluT",
+ BTRFS_32BIT_MAX_FILE_SIZE >> 40);
+ btrfs_warn(fs_info,
+ "please consider upgrading to 64bit kernel/hardware");
+ }
+}
+
+void __cold btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info)
+{
+ if (!test_and_set_bit(BTRFS_FS_32BIT_ERROR, &fs_info->flags)) {
+ btrfs_err(fs_info, "reached 32bit limit for logical addresses");
+ btrfs_err(fs_info,
+"due to page cache limit on 32bit systems, metadata beyond %lluT can't be accessed",
+ BTRFS_32BIT_MAX_FILE_SIZE >> 40);
+ btrfs_err(fs_info,
+ "please consider upgrading to 64bit kernel/hardware");
+ }
+}
+#endif
+
/*
* We only mark the transaction aborted and then set the file system read-only.
* This will prevent new transactions from starting or trying to join this
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 6eb1c50fa98c..436ac7b4b334 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -360,11 +360,26 @@ static ssize_t supported_rescue_options_show(struct kobject *kobj,
BTRFS_ATTR(static_feature, supported_rescue_options,
supported_rescue_options_show);
+static ssize_t supported_sectorsizes_show(struct kobject *kobj,
+ struct kobj_attribute *a,
+ char *buf)
+{
+ ssize_t ret = 0;
+
+ /* Only sectorsize == PAGE_SIZE is now supported */
+ ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%lu\n", PAGE_SIZE);
+
+ return ret;
+}
+BTRFS_ATTR(static_feature, supported_sectorsizes,
+ supported_sectorsizes_show);
+
static struct attribute *btrfs_supported_static_feature_attrs[] = {
BTRFS_ATTR_PTR(static_feature, rmdir_subvol),
BTRFS_ATTR_PTR(static_feature, supported_checksums),
BTRFS_ATTR_PTR(static_feature, send_stream_version),
BTRFS_ATTR_PTR(static_feature, supported_rescue_options),
+ BTRFS_ATTR_PTR(static_feature, supported_sectorsizes),
NULL
};
@@ -965,6 +980,40 @@ static ssize_t btrfs_read_policy_store(struct kobject *kobj,
}
BTRFS_ATTR_RW(, read_policy, btrfs_read_policy_show, btrfs_read_policy_store);
+static ssize_t btrfs_bg_reclaim_threshold_show(struct kobject *kobj,
+ struct kobj_attribute *a,
+ char *buf)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+ ssize_t ret;
+
+ ret = scnprintf(buf, PAGE_SIZE, "%d\n", fs_info->bg_reclaim_threshold);
+
+ return ret;
+}
+
+static ssize_t btrfs_bg_reclaim_threshold_store(struct kobject *kobj,
+ struct kobj_attribute *a,
+ const char *buf, size_t len)
+{
+ struct btrfs_fs_info *fs_info = to_fs_info(kobj);
+ int thresh;
+ int ret;
+
+ ret = kstrtoint(buf, 10, &thresh);
+ if (ret)
+ return ret;
+
+ if (thresh <= 50 || thresh > 100)
+ return -EINVAL;
+
+ fs_info->bg_reclaim_threshold = thresh;
+
+ return len;
+}
+BTRFS_ATTR_RW(, bg_reclaim_threshold, btrfs_bg_reclaim_threshold_show,
+ btrfs_bg_reclaim_threshold_store);
+
static const struct attribute *btrfs_attrs[] = {
BTRFS_ATTR_PTR(, label),
BTRFS_ATTR_PTR(, nodesize),
@@ -976,6 +1025,7 @@ static const struct attribute *btrfs_attrs[] = {
BTRFS_ATTR_PTR(, exclusive_operation),
BTRFS_ATTR_PTR(, generation),
BTRFS_ATTR_PTR(, read_policy),
+ BTRFS_ATTR_PTR(, bg_reclaim_threshold),
NULL,
};
diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c
index acff6bb49a97..f75de9f6c0ad 100644
--- a/fs/btrfs/transaction.c
+++ b/fs/btrfs/transaction.c
@@ -260,6 +260,7 @@ static inline int extwriter_counter_read(struct btrfs_transaction *trans)
void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
+ struct btrfs_transaction *cur_trans = trans->transaction;
if (!trans->chunk_bytes_reserved)
return;
@@ -268,6 +269,8 @@ void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans)
btrfs_block_rsv_release(fs_info, &fs_info->chunk_block_rsv,
trans->chunk_bytes_reserved, NULL);
+ atomic64_sub(trans->chunk_bytes_reserved, &cur_trans->chunk_bytes_reserved);
+ cond_wake_up(&cur_trans->chunk_reserve_wait);
trans->chunk_bytes_reserved = 0;
}
@@ -383,6 +386,8 @@ loop:
spin_lock_init(&cur_trans->dropped_roots_lock);
INIT_LIST_HEAD(&cur_trans->releasing_ebs);
spin_lock_init(&cur_trans->releasing_ebs_lock);
+ atomic64_set(&cur_trans->chunk_bytes_reserved, 0);
+ init_waitqueue_head(&cur_trans->chunk_reserve_wait);
list_add_tail(&cur_trans->list, &fs_info->trans_list);
extent_io_tree_init(fs_info, &cur_trans->dirty_pages,
IO_TREE_TRANS_DIRTY_PAGES, fs_info->btree_inode);
@@ -408,6 +413,7 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans,
int force)
{
struct btrfs_fs_info *fs_info = root->fs_info;
+ int ret = 0;
if ((test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
root->last_trans < trans->transid) || force) {
@@ -456,11 +462,11 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans,
* lock. smp_wmb() makes sure that all the writes above are
* done before we pop in the zero below
*/
- btrfs_init_reloc_root(trans, root);
+ ret = btrfs_init_reloc_root(trans, root);
smp_mb__before_atomic();
clear_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state);
}
- return 0;
+ return ret;
}
@@ -487,6 +493,7 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
struct btrfs_root *root)
{
struct btrfs_fs_info *fs_info = root->fs_info;
+ int ret;
if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
return 0;
@@ -501,10 +508,10 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
return 0;
mutex_lock(&fs_info->reloc_mutex);
- record_root_in_trans(trans, root, 0);
+ ret = record_root_in_trans(trans, root, 0);
mutex_unlock(&fs_info->reloc_mutex);
- return 0;
+ return ret;
}
static inline int is_transaction_blocked(struct btrfs_transaction *trans)
@@ -741,7 +748,16 @@ got_it:
* Thus it need to be called after current->journal_info initialized,
* or we can deadlock.
*/
- btrfs_record_root_in_trans(h, root);
+ ret = btrfs_record_root_in_trans(h, root);
+ if (ret) {
+ /*
+ * The transaction handle is fully initialized and linked with
+ * other structures so it needs to be ended in case of errors,
+ * not just freed.
+ */
+ btrfs_end_transaction(h);
+ return ERR_PTR(ret);
+ }
return h;
@@ -1347,7 +1363,9 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans)
spin_unlock(&fs_info->fs_roots_radix_lock);
btrfs_free_log(trans, root);
- btrfs_update_reloc_root(trans, root);
+ ret2 = btrfs_update_reloc_root(trans, root);
+ if (ret2)
+ return ret2;
/* see comments in should_cow_block() */
clear_bit(BTRFS_ROOT_FORCE_COW, &root->state);
@@ -1440,7 +1458,9 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans,
* recorded root will never be updated again, causing an outdated root
* item.
*/
- record_root_in_trans(trans, src, 1);
+ ret = record_root_in_trans(trans, src, 1);
+ if (ret)
+ return ret;
/*
* btrfs_qgroup_inherit relies on a consistent view of the usage for the
@@ -1509,7 +1529,7 @@ out:
* insert_dir_item()
*/
if (!ret)
- record_root_in_trans(trans, parent, 1);
+ ret = record_root_in_trans(trans, parent, 1);
return ret;
}
@@ -1586,8 +1606,9 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
dentry = pending->dentry;
parent_inode = pending->dir;
parent_root = BTRFS_I(parent_inode)->root;
- record_root_in_trans(trans, parent_root, 0);
-
+ ret = record_root_in_trans(trans, parent_root, 0);
+ if (ret)
+ goto fail;
cur_time = current_time(parent_inode);
/*
@@ -1623,7 +1644,11 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
goto fail;
}
- record_root_in_trans(trans, root, 0);
+ ret = record_root_in_trans(trans, root, 0);
+ if (ret) {
+ btrfs_abort_transaction(trans, ret);
+ goto fail;
+ }
btrfs_set_root_last_snapshot(&root->root_item, trans->transid);
memcpy(new_root_item, &root->root_item, sizeof(*new_root_item));
btrfs_check_and_init_root_item(new_root_item);
@@ -1961,7 +1986,6 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, int err)
*/
BUG_ON(list_empty(&cur_trans->list));
- list_del_init(&cur_trans->list);
if (cur_trans == fs_info->running_transaction) {
cur_trans->state = TRANS_STATE_COMMIT_DOING;
spin_unlock(&fs_info->trans_lock);
@@ -1970,6 +1994,17 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, int err)
spin_lock(&fs_info->trans_lock);
}
+
+ /*
+ * Now that we know no one else is still using the transaction we can
+ * remove the transaction from the list of transactions. This avoids
+ * the transaction kthread from cleaning up the transaction while some
+ * other task is still using it, which could result in a use-after-free
+ * on things like log trees, as it forces the transaction kthread to
+ * wait for this transaction to be cleaned up by us.
+ */
+ list_del_init(&cur_trans->list);
+
spin_unlock(&fs_info->trans_lock);
btrfs_cleanup_one_transaction(trans->transaction, fs_info);
diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h
index 6335716e513f..364cfbb4c5c5 100644
--- a/fs/btrfs/transaction.h
+++ b/fs/btrfs/transaction.h
@@ -96,6 +96,13 @@ struct btrfs_transaction {
spinlock_t releasing_ebs_lock;
struct list_head releasing_ebs;
+
+ /*
+ * The number of bytes currently reserved, by all transaction handles
+ * attached to this transaction, for metadata extents of the chunk tree.
+ */
+ atomic64_t chunk_bytes_reserved;
+ wait_queue_head_t chunk_reserve_wait;
};
#define __TRANS_FREEZABLE (1U << 0)
@@ -175,7 +182,7 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans,
spin_lock(&inode->lock);
inode->last_trans = trans->transaction->transid;
inode->last_sub_trans = inode->root->log_transid;
- inode->last_log_commit = inode->root->last_log_commit;
+ inode->last_log_commit = inode->last_sub_trans - 1;
spin_unlock(&inode->lock);
}
diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c
index f4ade821307d..a8b2e0d2c025 100644
--- a/fs/btrfs/tree-checker.c
+++ b/fs/btrfs/tree-checker.c
@@ -1290,6 +1290,11 @@ static int check_extent_item(struct extent_buffer *leaf,
key->offset, fs_info->sectorsize);
return -EUCLEAN;
}
+ if (unlikely(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
+ extent_err(leaf, slot,
+ "invalid extent flag, data has full backref set");
+ return -EUCLEAN;
+ }
}
ptr = (unsigned long)(struct btrfs_extent_item *)(ei + 1);
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 92a368627791..362d14db1e38 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1574,7 +1574,9 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans,
if (ret)
goto out;
- btrfs_update_inode(trans, root, BTRFS_I(inode));
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ if (ret)
+ goto out;
}
ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen;
@@ -1749,7 +1751,9 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans,
if (nlink != inode->i_nlink) {
set_nlink(inode, nlink);
- btrfs_update_inode(trans, root, BTRFS_I(inode));
+ ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
+ if (ret)
+ goto out;
}
BTRFS_I(inode)->index_cnt = (u64)-1;
@@ -1787,6 +1791,7 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
break;
if (ret == 1) {
+ ret = 0;
if (path->slots[0] == 0)
break;
path->slots[0]--;
@@ -1799,17 +1804,19 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
ret = btrfs_del_item(trans, root, path);
if (ret)
- goto out;
+ break;
btrfs_release_path(path);
inode = read_one_inode(root, key.offset);
- if (!inode)
- return -EIO;
+ if (!inode) {
+ ret = -EIO;
+ break;
+ }
ret = fixup_inode_link_count(trans, root, inode);
iput(inode);
if (ret)
- goto out;
+ break;
/*
* fixup on a directory may create new entries,
@@ -1818,8 +1825,6 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans,
*/
key.offset = (u64)-1;
}
- ret = 0;
-out:
btrfs_release_path(path);
return ret;
}
@@ -1858,8 +1863,6 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans,
ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
} else if (ret == -EEXIST) {
ret = 0;
- } else {
- BUG(); /* Logic Error */
}
iput(inode);
@@ -3165,20 +3168,22 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans,
*/
mutex_unlock(&root->log_mutex);
- btrfs_init_log_ctx(&root_log_ctx, NULL);
-
- mutex_lock(&log_root_tree->log_mutex);
-
if (btrfs_is_zoned(fs_info)) {
+ mutex_lock(&fs_info->tree_root->log_mutex);
if (!log_root_tree->node) {
ret = btrfs_alloc_log_tree_node(trans, log_root_tree);
if (ret) {
- mutex_unlock(&log_root_tree->log_mutex);
+ mutex_unlock(&fs_info->tree_log_mutex);
goto out;
}
}
+ mutex_unlock(&fs_info->tree_root->log_mutex);
}
+ btrfs_init_log_ctx(&root_log_ctx, NULL);
+
+ mutex_lock(&log_root_tree->log_mutex);
+
index2 = log_root_tree->log_transid % 2;
list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]);
root_log_ctx.log_transid = log_root_tree->log_transid;
@@ -4136,7 +4141,8 @@ static noinline int copy_items(struct btrfs_trans_handle *trans,
return ret;
}
-static int extent_cmp(void *priv, struct list_head *a, struct list_head *b)
+static int extent_cmp(void *priv, const struct list_head *a,
+ const struct list_head *b)
{
struct extent_map *em1, *em2;
@@ -6058,7 +6064,8 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans,
* (since logging them is pointless, a link count of 0 means they
* will never be accessible).
*/
- if (btrfs_inode_in_log(inode, trans->transid) ||
+ if ((btrfs_inode_in_log(inode, trans->transid) &&
+ list_empty(&ctx->ordered_extents)) ||
inode->vfs_inode.i_nlink == 0) {
ret = BTRFS_NO_LOG_SYNC;
goto end_no_trans;
@@ -6278,8 +6285,13 @@ again:
}
wc.replay_dest->log_root = log;
- btrfs_record_root_in_trans(trans, wc.replay_dest);
- ret = walk_log_tree(trans, log, &wc);
+ ret = btrfs_record_root_in_trans(trans, wc.replay_dest);
+ if (ret)
+ /* The loop needs to continue due to the root refs */
+ btrfs_handle_fs_error(fs_info, ret,
+ "failed to record the log root in transaction");
+ else
+ ret = walk_log_tree(trans, log, &wc);
if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) {
ret = fixup_inode_link_counts(trans, wc.replay_dest,
@@ -6454,6 +6466,24 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans,
(!old_dir || old_dir->logged_trans < trans->transid))
return;
+ /*
+ * If we are doing a rename (old_dir is not NULL) from a directory that
+ * was previously logged, make sure the next log attempt on the directory
+ * is not skipped and logs the inode again. This is because the log may
+ * not currently be authoritative for a range including the old
+ * BTRFS_DIR_ITEM_KEY and BTRFS_DIR_INDEX_KEY keys, so we want to make
+ * sure after a log replay we do not end up with both the new and old
+ * dentries around (in case the inode is a directory we would have a
+ * directory with two hard links and 2 inode references for different
+ * parents). The next log attempt of old_dir will happen at
+ * btrfs_log_all_parents(), called through btrfs_log_inode_parent()
+ * below, because we have previously set inode->last_unlink_trans to the
+ * current transaction ID, either here or at btrfs_record_unlink_dir() in
+ * case inode is a directory.
+ */
+ if (old_dir)
+ old_dir->logged_trans = 0;
+
btrfs_init_log_ctx(&ctx, &inode->vfs_inode);
ctx.logging_new_name = true;
/*
diff --git a/fs/btrfs/tree-mod-log.c b/fs/btrfs/tree-mod-log.c
new file mode 100644
index 000000000000..8a3a14686d3e
--- /dev/null
+++ b/fs/btrfs/tree-mod-log.c
@@ -0,0 +1,929 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "tree-mod-log.h"
+#include "disk-io.h"
+
+struct tree_mod_root {
+ u64 logical;
+ u8 level;
+};
+
+struct tree_mod_elem {
+ struct rb_node node;
+ u64 logical;
+ u64 seq;
+ enum btrfs_mod_log_op op;
+
+ /*
+ * This is used for BTRFS_MOD_LOG_KEY_* and BTRFS_MOD_LOG_MOVE_KEYS
+ * operations.
+ */
+ int slot;
+
+ /* This is used for BTRFS_MOD_LOG_KEY* and BTRFS_MOD_LOG_ROOT_REPLACE. */
+ u64 generation;
+
+ /* Those are used for op == BTRFS_MOD_LOG_KEY_{REPLACE,REMOVE}. */
+ struct btrfs_disk_key key;
+ u64 blockptr;
+
+ /* This is used for op == BTRFS_MOD_LOG_MOVE_KEYS. */
+ struct {
+ int dst_slot;
+ int nr_items;
+ } move;
+
+ /* This is used for op == BTRFS_MOD_LOG_ROOT_REPLACE. */
+ struct tree_mod_root old_root;
+};
+
+/*
+ * Pull a new tree mod seq number for our operation.
+ */
+static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
+{
+ return atomic64_inc_return(&fs_info->tree_mod_seq);
+}
+
+/*
+ * This adds a new blocker to the tree mod log's blocker list if the @elem
+ * passed does not already have a sequence number set. So when a caller expects
+ * to record tree modifications, it should ensure to set elem->seq to zero
+ * before calling btrfs_get_tree_mod_seq.
+ * Returns a fresh, unused tree log modification sequence number, even if no new
+ * blocker was added.
+ */
+u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
+ struct btrfs_seq_list *elem)
+{
+ write_lock(&fs_info->tree_mod_log_lock);
+ if (!elem->seq) {
+ elem->seq = btrfs_inc_tree_mod_seq(fs_info);
+ list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
+ set_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags);
+ }
+ write_unlock(&fs_info->tree_mod_log_lock);
+
+ return elem->seq;
+}
+
+void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
+ struct btrfs_seq_list *elem)
+{
+ struct rb_root *tm_root;
+ struct rb_node *node;
+ struct rb_node *next;
+ struct tree_mod_elem *tm;
+ u64 min_seq = BTRFS_SEQ_LAST;
+ u64 seq_putting = elem->seq;
+
+ if (!seq_putting)
+ return;
+
+ write_lock(&fs_info->tree_mod_log_lock);
+ list_del(&elem->list);
+ elem->seq = 0;
+
+ if (list_empty(&fs_info->tree_mod_seq_list)) {
+ clear_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags);
+ } else {
+ struct btrfs_seq_list *first;
+
+ first = list_first_entry(&fs_info->tree_mod_seq_list,
+ struct btrfs_seq_list, list);
+ if (seq_putting > first->seq) {
+ /*
+ * Blocker with lower sequence number exists, we cannot
+ * remove anything from the log.
+ */
+ write_unlock(&fs_info->tree_mod_log_lock);
+ return;
+ }
+ min_seq = first->seq;
+ }
+
+ /*
+ * Anything that's lower than the lowest existing (read: blocked)
+ * sequence number can be removed from the tree.
+ */
+ tm_root = &fs_info->tree_mod_log;
+ for (node = rb_first(tm_root); node; node = next) {
+ next = rb_next(node);
+ tm = rb_entry(node, struct tree_mod_elem, node);
+ if (tm->seq >= min_seq)
+ continue;
+ rb_erase(node, tm_root);
+ kfree(tm);
+ }
+ write_unlock(&fs_info->tree_mod_log_lock);
+}
+
+/*
+ * Key order of the log:
+ * node/leaf start address -> sequence
+ *
+ * The 'start address' is the logical address of the *new* root node for root
+ * replace operations, or the logical address of the affected block for all
+ * other operations.
+ */
+static noinline int tree_mod_log_insert(struct btrfs_fs_info *fs_info,
+ struct tree_mod_elem *tm)
+{
+ struct rb_root *tm_root;
+ struct rb_node **new;
+ struct rb_node *parent = NULL;
+ struct tree_mod_elem *cur;
+
+ lockdep_assert_held_write(&fs_info->tree_mod_log_lock);
+
+ tm->seq = btrfs_inc_tree_mod_seq(fs_info);
+
+ tm_root = &fs_info->tree_mod_log;
+ new = &tm_root->rb_node;
+ while (*new) {
+ cur = rb_entry(*new, struct tree_mod_elem, node);
+ parent = *new;
+ if (cur->logical < tm->logical)
+ new = &((*new)->rb_left);
+ else if (cur->logical > tm->logical)
+ new = &((*new)->rb_right);
+ else if (cur->seq < tm->seq)
+ new = &((*new)->rb_left);
+ else if (cur->seq > tm->seq)
+ new = &((*new)->rb_right);
+ else
+ return -EEXIST;
+ }
+
+ rb_link_node(&tm->node, parent, new);
+ rb_insert_color(&tm->node, tm_root);
+ return 0;
+}
+
+/*
+ * Determines if logging can be omitted. Returns true if it can. Otherwise, it
+ * returns false with the tree_mod_log_lock acquired. The caller must hold
+ * this until all tree mod log insertions are recorded in the rb tree and then
+ * write unlock fs_info::tree_mod_log_lock.
+ */
+static inline bool tree_mod_dont_log(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *eb)
+{
+ if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
+ return true;
+ if (eb && btrfs_header_level(eb) == 0)
+ return true;
+
+ write_lock(&fs_info->tree_mod_log_lock);
+ if (list_empty(&(fs_info)->tree_mod_seq_list)) {
+ write_unlock(&fs_info->tree_mod_log_lock);
+ return true;
+ }
+
+ return false;
+}
+
+/* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
+static inline bool tree_mod_need_log(const struct btrfs_fs_info *fs_info,
+ struct extent_buffer *eb)
+{
+ if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
+ return false;
+ if (eb && btrfs_header_level(eb) == 0)
+ return false;
+
+ return true;
+}
+
+static struct tree_mod_elem *alloc_tree_mod_elem(struct extent_buffer *eb,
+ int slot,
+ enum btrfs_mod_log_op op,
+ gfp_t flags)
+{
+ struct tree_mod_elem *tm;
+
+ tm = kzalloc(sizeof(*tm), flags);
+ if (!tm)
+ return NULL;
+
+ tm->logical = eb->start;
+ if (op != BTRFS_MOD_LOG_KEY_ADD) {
+ btrfs_node_key(eb, &tm->key, slot);
+ tm->blockptr = btrfs_node_blockptr(eb, slot);
+ }
+ tm->op = op;
+ tm->slot = slot;
+ tm->generation = btrfs_node_ptr_generation(eb, slot);
+ RB_CLEAR_NODE(&tm->node);
+
+ return tm;
+}
+
+int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot,
+ enum btrfs_mod_log_op op, gfp_t flags)
+{
+ struct tree_mod_elem *tm;
+ int ret;
+
+ if (!tree_mod_need_log(eb->fs_info, eb))
+ return 0;
+
+ tm = alloc_tree_mod_elem(eb, slot, op, flags);
+ if (!tm)
+ return -ENOMEM;
+
+ if (tree_mod_dont_log(eb->fs_info, eb)) {
+ kfree(tm);
+ return 0;
+ }
+
+ ret = tree_mod_log_insert(eb->fs_info, tm);
+ write_unlock(&eb->fs_info->tree_mod_log_lock);
+ if (ret)
+ kfree(tm);
+
+ return ret;
+}
+
+int btrfs_tree_mod_log_insert_move(struct extent_buffer *eb,
+ int dst_slot, int src_slot,
+ int nr_items)
+{
+ struct tree_mod_elem *tm = NULL;
+ struct tree_mod_elem **tm_list = NULL;
+ int ret = 0;
+ int i;
+ bool locked = false;
+
+ if (!tree_mod_need_log(eb->fs_info, eb))
+ return 0;
+
+ tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS);
+ if (!tm_list)
+ return -ENOMEM;
+
+ tm = kzalloc(sizeof(*tm), GFP_NOFS);
+ if (!tm) {
+ ret = -ENOMEM;
+ goto free_tms;
+ }
+
+ tm->logical = eb->start;
+ tm->slot = src_slot;
+ tm->move.dst_slot = dst_slot;
+ tm->move.nr_items = nr_items;
+ tm->op = BTRFS_MOD_LOG_MOVE_KEYS;
+
+ for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
+ tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
+ BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS);
+ if (!tm_list[i]) {
+ ret = -ENOMEM;
+ goto free_tms;
+ }
+ }
+
+ if (tree_mod_dont_log(eb->fs_info, eb))
+ goto free_tms;
+ locked = true;
+
+ /*
+ * When we override something during the move, we log these removals.
+ * This can only happen when we move towards the beginning of the
+ * buffer, i.e. dst_slot < src_slot.
+ */
+ for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
+ ret = tree_mod_log_insert(eb->fs_info, tm_list[i]);
+ if (ret)
+ goto free_tms;
+ }
+
+ ret = tree_mod_log_insert(eb->fs_info, tm);
+ if (ret)
+ goto free_tms;
+ write_unlock(&eb->fs_info->tree_mod_log_lock);
+ kfree(tm_list);
+
+ return 0;
+
+free_tms:
+ for (i = 0; i < nr_items; i++) {
+ if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
+ rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log);
+ kfree(tm_list[i]);
+ }
+ if (locked)
+ write_unlock(&eb->fs_info->tree_mod_log_lock);
+ kfree(tm_list);
+ kfree(tm);
+
+ return ret;
+}
+
+static inline int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
+ struct tree_mod_elem **tm_list,
+ int nritems)
+{
+ int i, j;
+ int ret;
+
+ for (i = nritems - 1; i >= 0; i--) {
+ ret = tree_mod_log_insert(fs_info, tm_list[i]);
+ if (ret) {
+ for (j = nritems - 1; j > i; j--)
+ rb_erase(&tm_list[j]->node,
+ &fs_info->tree_mod_log);
+ return ret;
+ }
+ }
+
+ return 0;
+}
+
+int btrfs_tree_mod_log_insert_root(struct extent_buffer *old_root,
+ struct extent_buffer *new_root,
+ bool log_removal)
+{
+ struct btrfs_fs_info *fs_info = old_root->fs_info;
+ struct tree_mod_elem *tm = NULL;
+ struct tree_mod_elem **tm_list = NULL;
+ int nritems = 0;
+ int ret = 0;
+ int i;
+
+ if (!tree_mod_need_log(fs_info, NULL))
+ return 0;
+
+ if (log_removal && btrfs_header_level(old_root) > 0) {
+ nritems = btrfs_header_nritems(old_root);
+ tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
+ GFP_NOFS);
+ if (!tm_list) {
+ ret = -ENOMEM;
+ goto free_tms;
+ }
+ for (i = 0; i < nritems; i++) {
+ tm_list[i] = alloc_tree_mod_elem(old_root, i,
+ BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
+ if (!tm_list[i]) {
+ ret = -ENOMEM;
+ goto free_tms;
+ }
+ }
+ }
+
+ tm = kzalloc(sizeof(*tm), GFP_NOFS);
+ if (!tm) {
+ ret = -ENOMEM;
+ goto free_tms;
+ }
+
+ tm->logical = new_root->start;
+ tm->old_root.logical = old_root->start;
+ tm->old_root.level = btrfs_header_level(old_root);
+ tm->generation = btrfs_header_generation(old_root);
+ tm->op = BTRFS_MOD_LOG_ROOT_REPLACE;
+
+ if (tree_mod_dont_log(fs_info, NULL))
+ goto free_tms;
+
+ if (tm_list)
+ ret = tree_mod_log_free_eb(fs_info, tm_list, nritems);
+ if (!ret)
+ ret = tree_mod_log_insert(fs_info, tm);
+
+ write_unlock(&fs_info->tree_mod_log_lock);
+ if (ret)
+ goto free_tms;
+ kfree(tm_list);
+
+ return ret;
+
+free_tms:
+ if (tm_list) {
+ for (i = 0; i < nritems; i++)
+ kfree(tm_list[i]);
+ kfree(tm_list);
+ }
+ kfree(tm);
+
+ return ret;
+}
+
+static struct tree_mod_elem *__tree_mod_log_search(struct btrfs_fs_info *fs_info,
+ u64 start, u64 min_seq,
+ bool smallest)
+{
+ struct rb_root *tm_root;
+ struct rb_node *node;
+ struct tree_mod_elem *cur = NULL;
+ struct tree_mod_elem *found = NULL;
+
+ read_lock(&fs_info->tree_mod_log_lock);
+ tm_root = &fs_info->tree_mod_log;
+ node = tm_root->rb_node;
+ while (node) {
+ cur = rb_entry(node, struct tree_mod_elem, node);
+ if (cur->logical < start) {
+ node = node->rb_left;
+ } else if (cur->logical > start) {
+ node = node->rb_right;
+ } else if (cur->seq < min_seq) {
+ node = node->rb_left;
+ } else if (!smallest) {
+ /* We want the node with the highest seq */
+ if (found)
+ BUG_ON(found->seq > cur->seq);
+ found = cur;
+ node = node->rb_left;
+ } else if (cur->seq > min_seq) {
+ /* We want the node with the smallest seq */
+ if (found)
+ BUG_ON(found->seq < cur->seq);
+ found = cur;
+ node = node->rb_right;
+ } else {
+ found = cur;
+ break;
+ }
+ }
+ read_unlock(&fs_info->tree_mod_log_lock);
+
+ return found;
+}
+
+/*
+ * This returns the element from the log with the smallest time sequence
+ * value that's in the log (the oldest log item). Any element with a time
+ * sequence lower than min_seq will be ignored.
+ */
+static struct tree_mod_elem *tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info,
+ u64 start, u64 min_seq)
+{
+ return __tree_mod_log_search(fs_info, start, min_seq, true);
+}
+
+/*
+ * This returns the element from the log with the largest time sequence
+ * value that's in the log (the most recent log item). Any element with
+ * a time sequence lower than min_seq will be ignored.
+ */
+static struct tree_mod_elem *tree_mod_log_search(struct btrfs_fs_info *fs_info,
+ u64 start, u64 min_seq)
+{
+ return __tree_mod_log_search(fs_info, start, min_seq, false);
+}
+
+int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst,
+ struct extent_buffer *src,
+ unsigned long dst_offset,
+ unsigned long src_offset,
+ int nr_items)
+{
+ struct btrfs_fs_info *fs_info = dst->fs_info;
+ int ret = 0;
+ struct tree_mod_elem **tm_list = NULL;
+ struct tree_mod_elem **tm_list_add, **tm_list_rem;
+ int i;
+ bool locked = false;
+
+ if (!tree_mod_need_log(fs_info, NULL))
+ return 0;
+
+ if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
+ return 0;
+
+ tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *),
+ GFP_NOFS);
+ if (!tm_list)
+ return -ENOMEM;
+
+ tm_list_add = tm_list;
+ tm_list_rem = tm_list + nr_items;
+ for (i = 0; i < nr_items; i++) {
+ tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
+ BTRFS_MOD_LOG_KEY_REMOVE, GFP_NOFS);
+ if (!tm_list_rem[i]) {
+ ret = -ENOMEM;
+ goto free_tms;
+ }
+
+ tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
+ BTRFS_MOD_LOG_KEY_ADD, GFP_NOFS);
+ if (!tm_list_add[i]) {
+ ret = -ENOMEM;
+ goto free_tms;
+ }
+ }
+
+ if (tree_mod_dont_log(fs_info, NULL))
+ goto free_tms;
+ locked = true;
+
+ for (i = 0; i < nr_items; i++) {
+ ret = tree_mod_log_insert(fs_info, tm_list_rem[i]);
+ if (ret)
+ goto free_tms;
+ ret = tree_mod_log_insert(fs_info, tm_list_add[i]);
+ if (ret)
+ goto free_tms;
+ }
+
+ write_unlock(&fs_info->tree_mod_log_lock);
+ kfree(tm_list);
+
+ return 0;
+
+free_tms:
+ for (i = 0; i < nr_items * 2; i++) {
+ if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
+ rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
+ kfree(tm_list[i]);
+ }
+ if (locked)
+ write_unlock(&fs_info->tree_mod_log_lock);
+ kfree(tm_list);
+
+ return ret;
+}
+
+int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb)
+{
+ struct tree_mod_elem **tm_list = NULL;
+ int nritems = 0;
+ int i;
+ int ret = 0;
+
+ if (!tree_mod_need_log(eb->fs_info, eb))
+ return 0;
+
+ nritems = btrfs_header_nritems(eb);
+ tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS);
+ if (!tm_list)
+ return -ENOMEM;
+
+ for (i = 0; i < nritems; i++) {
+ tm_list[i] = alloc_tree_mod_elem(eb, i,
+ BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
+ if (!tm_list[i]) {
+ ret = -ENOMEM;
+ goto free_tms;
+ }
+ }
+
+ if (tree_mod_dont_log(eb->fs_info, eb))
+ goto free_tms;
+
+ ret = tree_mod_log_free_eb(eb->fs_info, tm_list, nritems);
+ write_unlock(&eb->fs_info->tree_mod_log_lock);
+ if (ret)
+ goto free_tms;
+ kfree(tm_list);
+
+ return 0;
+
+free_tms:
+ for (i = 0; i < nritems; i++)
+ kfree(tm_list[i]);
+ kfree(tm_list);
+
+ return ret;
+}
+
+/*
+ * Returns the logical address of the oldest predecessor of the given root.
+ * Entries older than time_seq are ignored.
+ */
+static struct tree_mod_elem *tree_mod_log_oldest_root(struct extent_buffer *eb_root,
+ u64 time_seq)
+{
+ struct tree_mod_elem *tm;
+ struct tree_mod_elem *found = NULL;
+ u64 root_logical = eb_root->start;
+ bool looped = false;
+
+ if (!time_seq)
+ return NULL;
+
+ /*
+ * The very last operation that's logged for a root is the replacement
+ * operation (if it is replaced at all). This has the logical address
+ * of the *new* root, making it the very first operation that's logged
+ * for this root.
+ */
+ while (1) {
+ tm = tree_mod_log_search_oldest(eb_root->fs_info, root_logical,
+ time_seq);
+ if (!looped && !tm)
+ return NULL;
+ /*
+ * If there are no tree operation for the oldest root, we simply
+ * return it. This should only happen if that (old) root is at
+ * level 0.
+ */
+ if (!tm)
+ break;
+
+ /*
+ * If there's an operation that's not a root replacement, we
+ * found the oldest version of our root. Normally, we'll find a
+ * BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
+ */
+ if (tm->op != BTRFS_MOD_LOG_ROOT_REPLACE)
+ break;
+
+ found = tm;
+ root_logical = tm->old_root.logical;
+ looped = true;
+ }
+
+ /* If there's no old root to return, return what we found instead */
+ if (!found)
+ found = tm;
+
+ return found;
+}
+
+
+/*
+ * tm is a pointer to the first operation to rewind within eb. Then, all
+ * previous operations will be rewound (until we reach something older than
+ * time_seq).
+ */
+static void tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
+ struct extent_buffer *eb,
+ u64 time_seq,
+ struct tree_mod_elem *first_tm)
+{
+ u32 n;
+ struct rb_node *next;
+ struct tree_mod_elem *tm = first_tm;
+ unsigned long o_dst;
+ unsigned long o_src;
+ unsigned long p_size = sizeof(struct btrfs_key_ptr);
+
+ n = btrfs_header_nritems(eb);
+ read_lock(&fs_info->tree_mod_log_lock);
+ while (tm && tm->seq >= time_seq) {
+ /*
+ * All the operations are recorded with the operator used for
+ * the modification. As we're going backwards, we do the
+ * opposite of each operation here.
+ */
+ switch (tm->op) {
+ case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING:
+ BUG_ON(tm->slot < n);
+ fallthrough;
+ case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING:
+ case BTRFS_MOD_LOG_KEY_REMOVE:
+ btrfs_set_node_key(eb, &tm->key, tm->slot);
+ btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
+ btrfs_set_node_ptr_generation(eb, tm->slot,
+ tm->generation);
+ n++;
+ break;
+ case BTRFS_MOD_LOG_KEY_REPLACE:
+ BUG_ON(tm->slot >= n);
+ btrfs_set_node_key(eb, &tm->key, tm->slot);
+ btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
+ btrfs_set_node_ptr_generation(eb, tm->slot,
+ tm->generation);
+ break;
+ case BTRFS_MOD_LOG_KEY_ADD:
+ /* if a move operation is needed it's in the log */
+ n--;
+ break;
+ case BTRFS_MOD_LOG_MOVE_KEYS:
+ o_dst = btrfs_node_key_ptr_offset(tm->slot);
+ o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
+ memmove_extent_buffer(eb, o_dst, o_src,
+ tm->move.nr_items * p_size);
+ break;
+ case BTRFS_MOD_LOG_ROOT_REPLACE:
+ /*
+ * This operation is special. For roots, this must be
+ * handled explicitly before rewinding.
+ * For non-roots, this operation may exist if the node
+ * was a root: root A -> child B; then A gets empty and
+ * B is promoted to the new root. In the mod log, we'll
+ * have a root-replace operation for B, a tree block
+ * that is no root. We simply ignore that operation.
+ */
+ break;
+ }
+ next = rb_next(&tm->node);
+ if (!next)
+ break;
+ tm = rb_entry(next, struct tree_mod_elem, node);
+ if (tm->logical != first_tm->logical)
+ break;
+ }
+ read_unlock(&fs_info->tree_mod_log_lock);
+ btrfs_set_header_nritems(eb, n);
+}
+
+/*
+ * Called with eb read locked. If the buffer cannot be rewound, the same buffer
+ * is returned. If rewind operations happen, a fresh buffer is returned. The
+ * returned buffer is always read-locked. If the returned buffer is not the
+ * input buffer, the lock on the input buffer is released and the input buffer
+ * is freed (its refcount is decremented).
+ */
+struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
+ struct btrfs_path *path,
+ struct extent_buffer *eb,
+ u64 time_seq)
+{
+ struct extent_buffer *eb_rewin;
+ struct tree_mod_elem *tm;
+
+ if (!time_seq)
+ return eb;
+
+ if (btrfs_header_level(eb) == 0)
+ return eb;
+
+ tm = tree_mod_log_search(fs_info, eb->start, time_seq);
+ if (!tm)
+ return eb;
+
+ if (tm->op == BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
+ BUG_ON(tm->slot != 0);
+ eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
+ if (!eb_rewin) {
+ btrfs_tree_read_unlock(eb);
+ free_extent_buffer(eb);
+ return NULL;
+ }
+ btrfs_set_header_bytenr(eb_rewin, eb->start);
+ btrfs_set_header_backref_rev(eb_rewin,
+ btrfs_header_backref_rev(eb));
+ btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
+ btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
+ } else {
+ eb_rewin = btrfs_clone_extent_buffer(eb);
+ if (!eb_rewin) {
+ btrfs_tree_read_unlock(eb);
+ free_extent_buffer(eb);
+ return NULL;
+ }
+ }
+
+ btrfs_tree_read_unlock(eb);
+ free_extent_buffer(eb);
+
+ btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb_rewin),
+ eb_rewin, btrfs_header_level(eb_rewin));
+ btrfs_tree_read_lock(eb_rewin);
+ tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
+ WARN_ON(btrfs_header_nritems(eb_rewin) >
+ BTRFS_NODEPTRS_PER_BLOCK(fs_info));
+
+ return eb_rewin;
+}
+
+/*
+ * Rewind the state of @root's root node to the given @time_seq value.
+ * If there are no changes, the current root->root_node is returned. If anything
+ * changed in between, there's a fresh buffer allocated on which the rewind
+ * operations are done. In any case, the returned buffer is read locked.
+ * Returns NULL on error (with no locks held).
+ */
+struct extent_buffer *btrfs_get_old_root(struct btrfs_root *root, u64 time_seq)
+{
+ struct btrfs_fs_info *fs_info = root->fs_info;
+ struct tree_mod_elem *tm;
+ struct extent_buffer *eb = NULL;
+ struct extent_buffer *eb_root;
+ u64 eb_root_owner = 0;
+ struct extent_buffer *old;
+ struct tree_mod_root *old_root = NULL;
+ u64 old_generation = 0;
+ u64 logical;
+ int level;
+
+ eb_root = btrfs_read_lock_root_node(root);
+ tm = tree_mod_log_oldest_root(eb_root, time_seq);
+ if (!tm)
+ return eb_root;
+
+ if (tm->op == BTRFS_MOD_LOG_ROOT_REPLACE) {
+ old_root = &tm->old_root;
+ old_generation = tm->generation;
+ logical = old_root->logical;
+ level = old_root->level;
+ } else {
+ logical = eb_root->start;
+ level = btrfs_header_level(eb_root);
+ }
+
+ tm = tree_mod_log_search(fs_info, logical, time_seq);
+ if (old_root && tm && tm->op != BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
+ btrfs_tree_read_unlock(eb_root);
+ free_extent_buffer(eb_root);
+ old = read_tree_block(fs_info, logical, root->root_key.objectid,
+ 0, level, NULL);
+ if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
+ if (!IS_ERR(old))
+ free_extent_buffer(old);
+ btrfs_warn(fs_info,
+ "failed to read tree block %llu from get_old_root",
+ logical);
+ } else {
+ struct tree_mod_elem *tm2;
+
+ btrfs_tree_read_lock(old);
+ eb = btrfs_clone_extent_buffer(old);
+ /*
+ * After the lookup for the most recent tree mod operation
+ * above and before we locked and cloned the extent buffer
+ * 'old', a new tree mod log operation may have been added.
+ * So lookup for a more recent one to make sure the number
+ * of mod log operations we replay is consistent with the
+ * number of items we have in the cloned extent buffer,
+ * otherwise we can hit a BUG_ON when rewinding the extent
+ * buffer.
+ */
+ tm2 = tree_mod_log_search(fs_info, logical, time_seq);
+ btrfs_tree_read_unlock(old);
+ free_extent_buffer(old);
+ ASSERT(tm2);
+ ASSERT(tm2 == tm || tm2->seq > tm->seq);
+ if (!tm2 || tm2->seq < tm->seq) {
+ free_extent_buffer(eb);
+ return NULL;
+ }
+ tm = tm2;
+ }
+ } else if (old_root) {
+ eb_root_owner = btrfs_header_owner(eb_root);
+ btrfs_tree_read_unlock(eb_root);
+ free_extent_buffer(eb_root);
+ eb = alloc_dummy_extent_buffer(fs_info, logical);
+ } else {
+ eb = btrfs_clone_extent_buffer(eb_root);
+ btrfs_tree_read_unlock(eb_root);
+ free_extent_buffer(eb_root);
+ }
+
+ if (!eb)
+ return NULL;
+ if (old_root) {
+ btrfs_set_header_bytenr(eb, eb->start);
+ btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
+ btrfs_set_header_owner(eb, eb_root_owner);
+ btrfs_set_header_level(eb, old_root->level);
+ btrfs_set_header_generation(eb, old_generation);
+ }
+ btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb,
+ btrfs_header_level(eb));
+ btrfs_tree_read_lock(eb);
+ if (tm)
+ tree_mod_log_rewind(fs_info, eb, time_seq, tm);
+ else
+ WARN_ON(btrfs_header_level(eb) != 0);
+ WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(fs_info));
+
+ return eb;
+}
+
+int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
+{
+ struct tree_mod_elem *tm;
+ int level;
+ struct extent_buffer *eb_root = btrfs_root_node(root);
+
+ tm = tree_mod_log_oldest_root(eb_root, time_seq);
+ if (tm && tm->op == BTRFS_MOD_LOG_ROOT_REPLACE)
+ level = tm->old_root.level;
+ else
+ level = btrfs_header_level(eb_root);
+
+ free_extent_buffer(eb_root);
+
+ return level;
+}
+
+/*
+ * Return the lowest sequence number in the tree modification log.
+ *
+ * Return the sequence number of the oldest tree modification log user, which
+ * corresponds to the lowest sequence number of all existing users. If there are
+ * no users it returns 0.
+ */
+u64 btrfs_tree_mod_log_lowest_seq(struct btrfs_fs_info *fs_info)
+{
+ u64 ret = 0;
+
+ read_lock(&fs_info->tree_mod_log_lock);
+ if (!list_empty(&fs_info->tree_mod_seq_list)) {
+ struct btrfs_seq_list *elem;
+
+ elem = list_first_entry(&fs_info->tree_mod_seq_list,
+ struct btrfs_seq_list, list);
+ ret = elem->seq;
+ }
+ read_unlock(&fs_info->tree_mod_log_lock);
+
+ return ret;
+}
diff --git a/fs/btrfs/tree-mod-log.h b/fs/btrfs/tree-mod-log.h
new file mode 100644
index 000000000000..12605d19621b
--- /dev/null
+++ b/fs/btrfs/tree-mod-log.h
@@ -0,0 +1,53 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#ifndef BTRFS_TREE_MOD_LOG_H
+#define BTRFS_TREE_MOD_LOG_H
+
+#include "ctree.h"
+
+/* Represents a tree mod log user. */
+struct btrfs_seq_list {
+ struct list_head list;
+ u64 seq;
+};
+
+#define BTRFS_SEQ_LIST_INIT(name) { .list = LIST_HEAD_INIT((name).list), .seq = 0 }
+#define BTRFS_SEQ_LAST ((u64)-1)
+
+enum btrfs_mod_log_op {
+ BTRFS_MOD_LOG_KEY_REPLACE,
+ BTRFS_MOD_LOG_KEY_ADD,
+ BTRFS_MOD_LOG_KEY_REMOVE,
+ BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING,
+ BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING,
+ BTRFS_MOD_LOG_MOVE_KEYS,
+ BTRFS_MOD_LOG_ROOT_REPLACE,
+};
+
+u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
+ struct btrfs_seq_list *elem);
+void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
+ struct btrfs_seq_list *elem);
+int btrfs_tree_mod_log_insert_root(struct extent_buffer *old_root,
+ struct extent_buffer *new_root,
+ bool log_removal);
+int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot,
+ enum btrfs_mod_log_op op, gfp_t flags);
+int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb);
+struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
+ struct btrfs_path *path,
+ struct extent_buffer *eb,
+ u64 time_seq);
+struct extent_buffer *btrfs_get_old_root(struct btrfs_root *root, u64 time_seq);
+int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq);
+int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst,
+ struct extent_buffer *src,
+ unsigned long dst_offset,
+ unsigned long src_offset,
+ int nr_items);
+int btrfs_tree_mod_log_insert_move(struct extent_buffer *eb,
+ int dst_slot, int src_slot,
+ int nr_items);
+u64 btrfs_tree_mod_log_lowest_seq(struct btrfs_fs_info *fs_info);
+
+#endif
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index bc3b33efddc5..47d27059d064 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -1224,7 +1224,8 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
return 0;
}
-static int devid_cmp(void *priv, struct list_head *a, struct list_head *b)
+static int devid_cmp(void *priv, const struct list_head *a,
+ const struct list_head *b)
{
struct btrfs_device *dev1, *dev2;
@@ -1459,7 +1460,7 @@ static bool dev_extent_hole_check_zoned(struct btrfs_device *device,
if (ret == -ERANGE) {
*hole_start += *hole_size;
*hole_size = 0;
- return 1;
+ return true;
}
*hole_start += zone_size;
@@ -3098,11 +3099,12 @@ out:
return ret;
}
-static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
+int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
{
struct btrfs_root *root = fs_info->chunk_root;
struct btrfs_trans_handle *trans;
struct btrfs_block_group *block_group;
+ u64 length;
int ret;
/*
@@ -3117,7 +3119,7 @@ static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
* we release the path used to search the chunk/dev tree and before
* the current task acquires this mutex and calls us.
*/
- lockdep_assert_held(&fs_info->delete_unused_bgs_mutex);
+ lockdep_assert_held(&fs_info->reclaim_bgs_lock);
/* step one, relocate all the extents inside this chunk */
btrfs_scrub_pause(fs_info);
@@ -3130,8 +3132,23 @@ static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
if (!block_group)
return -ENOENT;
btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);
+ length = block_group->length;
btrfs_put_block_group(block_group);
+ /*
+ * On a zoned file system, discard the whole block group, this will
+ * trigger a REQ_OP_ZONE_RESET operation on the device zone. If
+ * resetting the zone fails, don't treat it as a fatal problem from the
+ * filesystem's point of view.
+ */
+ if (btrfs_is_zoned(fs_info)) {
+ ret = btrfs_discard_extent(fs_info, chunk_offset, length, NULL);
+ if (ret)
+ btrfs_info(fs_info,
+ "failed to reset zone %llu after relocation",
+ chunk_offset);
+ }
+
trans = btrfs_start_trans_remove_block_group(root->fs_info,
chunk_offset);
if (IS_ERR(trans)) {
@@ -3172,10 +3189,10 @@ again:
key.type = BTRFS_CHUNK_ITEM_KEY;
while (1) {
- mutex_lock(&fs_info->delete_unused_bgs_mutex);
+ mutex_lock(&fs_info->reclaim_bgs_lock);
ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
if (ret < 0) {
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
goto error;
}
BUG_ON(ret == 0); /* Corruption */
@@ -3183,7 +3200,7 @@ again:
ret = btrfs_previous_item(chunk_root, path, key.objectid,
key.type);
if (ret)
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
if (ret < 0)
goto error;
if (ret > 0)
@@ -3204,7 +3221,7 @@ again:
else
BUG_ON(ret);
}
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
if (found_key.offset == 0)
break;
@@ -3744,10 +3761,10 @@ again:
goto error;
}
- mutex_lock(&fs_info->delete_unused_bgs_mutex);
+ mutex_lock(&fs_info->reclaim_bgs_lock);
ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
if (ret < 0) {
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
goto error;
}
@@ -3761,7 +3778,7 @@ again:
ret = btrfs_previous_item(chunk_root, path, 0,
BTRFS_CHUNK_ITEM_KEY);
if (ret) {
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
ret = 0;
break;
}
@@ -3771,7 +3788,7 @@ again:
btrfs_item_key_to_cpu(leaf, &found_key, slot);
if (found_key.objectid != key.objectid) {
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
break;
}
@@ -3788,12 +3805,12 @@ again:
btrfs_release_path(path);
if (!ret) {
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
goto loop;
}
if (counting) {
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
spin_lock(&fs_info->balance_lock);
bctl->stat.expected++;
spin_unlock(&fs_info->balance_lock);
@@ -3818,7 +3835,7 @@ again:
count_meta < bctl->meta.limit_min)
|| ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) &&
count_sys < bctl->sys.limit_min)) {
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
goto loop;
}
@@ -3832,7 +3849,7 @@ again:
ret = btrfs_may_alloc_data_chunk(fs_info,
found_key.offset);
if (ret < 0) {
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
goto error;
} else if (ret == 1) {
chunk_reserved = 1;
@@ -3840,7 +3857,7 @@ again:
}
ret = btrfs_relocate_chunk(fs_info, found_key.offset);
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
if (ret == -ENOSPC) {
enospc_errors++;
} else if (ret == -ETXTBSY) {
@@ -4725,16 +4742,16 @@ again:
key.type = BTRFS_DEV_EXTENT_KEY;
do {
- mutex_lock(&fs_info->delete_unused_bgs_mutex);
+ mutex_lock(&fs_info->reclaim_bgs_lock);
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0) {
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
goto done;
}
ret = btrfs_previous_item(root, path, 0, key.type);
if (ret) {
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
if (ret < 0)
goto done;
ret = 0;
@@ -4747,7 +4764,7 @@ again:
btrfs_item_key_to_cpu(l, &key, path->slots[0]);
if (key.objectid != device->devid) {
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
btrfs_release_path(path);
break;
}
@@ -4756,7 +4773,7 @@ again:
length = btrfs_dev_extent_length(l, dev_extent);
if (key.offset + length <= new_size) {
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
btrfs_release_path(path);
break;
}
@@ -4772,12 +4789,12 @@ again:
*/
ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset);
if (ret < 0) {
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
goto done;
}
ret = btrfs_relocate_chunk(fs_info, chunk_offset);
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
if (ret == -ENOSPC) {
failed++;
} else if (ret) {
@@ -4989,6 +5006,8 @@ static void init_alloc_chunk_ctl_policy_zoned(
ctl->max_chunk_size = 2 * ctl->max_stripe_size;
ctl->devs_max = min_t(int, ctl->devs_max,
BTRFS_MAX_DEVS_SYS_CHUNK);
+ } else {
+ BUG();
}
/* We don't want a chunk larger than 10% of writable space */
@@ -6787,6 +6806,46 @@ static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes)
return div_u64(chunk_len, data_stripes);
}
+#if BITS_PER_LONG == 32
+/*
+ * Due to page cache limit, metadata beyond BTRFS_32BIT_MAX_FILE_SIZE
+ * can't be accessed on 32bit systems.
+ *
+ * This function do mount time check to reject the fs if it already has
+ * metadata chunk beyond that limit.
+ */
+static int check_32bit_meta_chunk(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length, u64 type)
+{
+ if (!(type & BTRFS_BLOCK_GROUP_METADATA))
+ return 0;
+
+ if (logical + length < MAX_LFS_FILESIZE)
+ return 0;
+
+ btrfs_err_32bit_limit(fs_info);
+ return -EOVERFLOW;
+}
+
+/*
+ * This is to give early warning for any metadata chunk reaching
+ * BTRFS_32BIT_EARLY_WARN_THRESHOLD.
+ * Although we can still access the metadata, it's not going to be possible
+ * once the limit is reached.
+ */
+static void warn_32bit_meta_chunk(struct btrfs_fs_info *fs_info,
+ u64 logical, u64 length, u64 type)
+{
+ if (!(type & BTRFS_BLOCK_GROUP_METADATA))
+ return;
+
+ if (logical + length < BTRFS_32BIT_EARLY_WARN_THRESHOLD)
+ return;
+
+ btrfs_warn_32bit_limit(fs_info);
+}
+#endif
+
static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
struct btrfs_chunk *chunk)
{
@@ -6797,6 +6856,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
u64 logical;
u64 length;
u64 devid;
+ u64 type;
u8 uuid[BTRFS_UUID_SIZE];
int num_stripes;
int ret;
@@ -6804,8 +6864,16 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
logical = key->offset;
length = btrfs_chunk_length(leaf, chunk);
+ type = btrfs_chunk_type(leaf, chunk);
num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
+#if BITS_PER_LONG == 32
+ ret = check_32bit_meta_chunk(fs_info, logical, length, type);
+ if (ret < 0)
+ return ret;
+ warn_32bit_meta_chunk(fs_info, logical, length, type);
+#endif
+
/*
* Only need to verify chunk item if we're reading from sys chunk array,
* as chunk item in tree block is already verified by tree-checker.
@@ -6849,10 +6917,10 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
map->io_width = btrfs_chunk_io_width(leaf, chunk);
map->io_align = btrfs_chunk_io_align(leaf, chunk);
map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
- map->type = btrfs_chunk_type(leaf, chunk);
+ map->type = type;
map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
map->verified_stripes = 0;
- em->orig_block_len = calc_stripe_length(map->type, em->len,
+ em->orig_block_len = calc_stripe_length(type, em->len,
map->num_stripes);
for (i = 0; i < num_stripes; i++) {
map->stripes[i].physical =
@@ -7448,6 +7516,9 @@ static int btrfs_device_init_dev_stats(struct btrfs_device *device,
int item_size;
int i, ret, slot;
+ if (!device->fs_info->dev_root)
+ return 0;
+
key.objectid = BTRFS_DEV_STATS_OBJECTID;
key.type = BTRFS_PERSISTENT_ITEM_KEY;
key.offset = device->devid;
@@ -7998,7 +8069,7 @@ static int relocating_repair_kthread(void *data)
return -EBUSY;
}
- mutex_lock(&fs_info->delete_unused_bgs_mutex);
+ mutex_lock(&fs_info->reclaim_bgs_lock);
/* Ensure block group still exists */
cache = btrfs_lookup_block_group(fs_info, target);
@@ -8020,7 +8091,7 @@ static int relocating_repair_kthread(void *data)
out:
if (cache)
btrfs_put_block_group(cache);
- mutex_unlock(&fs_info->delete_unused_bgs_mutex);
+ mutex_unlock(&fs_info->reclaim_bgs_lock);
btrfs_exclop_finish(fs_info);
return ret;
diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h
index d4c3e0dd32b8..9c0d84e5ec06 100644
--- a/fs/btrfs/volumes.h
+++ b/fs/btrfs/volumes.h
@@ -484,6 +484,7 @@ void btrfs_describe_block_groups(u64 flags, char *buf, u32 size_buf);
int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info);
int btrfs_recover_balance(struct btrfs_fs_info *fs_info);
int btrfs_pause_balance(struct btrfs_fs_info *fs_info);
+int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset);
int btrfs_cancel_balance(struct btrfs_fs_info *fs_info);
int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info);
int btrfs_uuid_scan_kthread(void *data);
diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c
index d524acf7b3e5..c3fa7d3fa770 100644
--- a/fs/btrfs/zlib.c
+++ b/fs/btrfs/zlib.c
@@ -375,7 +375,6 @@ int zlib_decompress(struct list_head *ws, unsigned char *data_in,
unsigned long bytes_left;
unsigned long total_out = 0;
unsigned long pg_offset = 0;
- char *kaddr;
destlen = min_t(unsigned long, destlen, PAGE_SIZE);
bytes_left = destlen;
@@ -455,9 +454,7 @@ next:
* end of the inline extent (destlen) to the end of the page
*/
if (pg_offset < destlen) {
- kaddr = kmap_atomic(dest_page);
- memset(kaddr + pg_offset, 0, destlen - pg_offset);
- kunmap_atomic(kaddr);
+ memzero_page(dest_page, pg_offset, destlen - pg_offset);
}
return ret;
}
diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c
index 1f972b75a9ab..1bb8ee97aae0 100644
--- a/fs/btrfs/zoned.c
+++ b/fs/btrfs/zoned.c
@@ -21,9 +21,30 @@
/* Pseudo write pointer value for conventional zone */
#define WP_CONVENTIONAL ((u64)-2)
+/*
+ * Location of the first zone of superblock logging zone pairs.
+ *
+ * - primary superblock: 0B (zone 0)
+ * - first copy: 512G (zone starting at that offset)
+ * - second copy: 4T (zone starting at that offset)
+ */
+#define BTRFS_SB_LOG_PRIMARY_OFFSET (0ULL)
+#define BTRFS_SB_LOG_FIRST_OFFSET (512ULL * SZ_1G)
+#define BTRFS_SB_LOG_SECOND_OFFSET (4096ULL * SZ_1G)
+
+#define BTRFS_SB_LOG_FIRST_SHIFT const_ilog2(BTRFS_SB_LOG_FIRST_OFFSET)
+#define BTRFS_SB_LOG_SECOND_SHIFT const_ilog2(BTRFS_SB_LOG_SECOND_OFFSET)
+
/* Number of superblock log zones */
#define BTRFS_NR_SB_LOG_ZONES 2
+/*
+ * Maximum supported zone size. Currently, SMR disks have a zone size of
+ * 256MiB, and we are expecting ZNS drives to be in the 1-4GiB range. We do not
+ * expect the zone size to become larger than 8GiB in the near future.
+ */
+#define BTRFS_MAX_ZONE_SIZE SZ_8G
+
static int copy_zone_info_cb(struct blk_zone *zone, unsigned int idx, void *data)
{
struct blk_zone *zones = data;
@@ -111,23 +132,22 @@ static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones,
}
/*
- * The following zones are reserved as the circular buffer on ZONED btrfs.
- * - The primary superblock: zones 0 and 1
- * - The first copy: zones 16 and 17
- * - The second copy: zones 1024 or zone at 256GB which is minimum, and
- * the following one
+ * Get the first zone number of the superblock mirror
*/
static inline u32 sb_zone_number(int shift, int mirror)
{
- ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX);
+ u64 zone;
+ ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX);
switch (mirror) {
- case 0: return 0;
- case 1: return 16;
- case 2: return min_t(u64, btrfs_sb_offset(mirror) >> shift, 1024);
+ case 0: zone = 0; break;
+ case 1: zone = 1ULL << (BTRFS_SB_LOG_FIRST_SHIFT - shift); break;
+ case 2: zone = 1ULL << (BTRFS_SB_LOG_SECOND_SHIFT - shift); break;
}
- return 0;
+ ASSERT(zone <= U32_MAX);
+
+ return (u32)zone;
}
/*
@@ -300,10 +320,21 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device)
zone_sectors = bdev_zone_sectors(bdev);
}
- nr_sectors = bdev_nr_sectors(bdev);
/* Check if it's power of 2 (see is_power_of_2) */
ASSERT(zone_sectors != 0 && (zone_sectors & (zone_sectors - 1)) == 0);
zone_info->zone_size = zone_sectors << SECTOR_SHIFT;
+
+ /* We reject devices with a zone size larger than 8GB */
+ if (zone_info->zone_size > BTRFS_MAX_ZONE_SIZE) {
+ btrfs_err_in_rcu(fs_info,
+ "zoned: %s: zone size %llu larger than supported maximum %llu",
+ rcu_str_deref(device->name),
+ zone_info->zone_size, BTRFS_MAX_ZONE_SIZE);
+ ret = -EINVAL;
+ goto out;
+ }
+
+ nr_sectors = bdev_nr_sectors(bdev);
zone_info->zone_size_shift = ilog2(zone_info->zone_size);
zone_info->max_zone_append_size =
(u64)queue_max_zone_append_sectors(queue) << SECTOR_SHIFT;
@@ -311,6 +342,13 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device)
if (!IS_ALIGNED(nr_sectors, zone_sectors))
zone_info->nr_zones++;
+ if (bdev_is_zoned(bdev) && zone_info->max_zone_append_size == 0) {
+ btrfs_err(fs_info, "zoned: device %pg does not support zone append",
+ bdev);
+ ret = -EINVAL;
+ goto out;
+ }
+
zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL);
if (!zone_info->seq_zones) {
ret = -ENOMEM;
@@ -1088,6 +1126,11 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
goto out;
}
+ if (zone.type == BLK_ZONE_TYPE_CONVENTIONAL) {
+ ret = -EIO;
+ goto out;
+ }
+
switch (zone.cond) {
case BLK_ZONE_COND_OFFLINE:
case BLK_ZONE_COND_READONLY:
@@ -1235,7 +1278,7 @@ void btrfs_free_redirty_list(struct btrfs_transaction *trans)
spin_unlock(&trans->releasing_ebs_lock);
}
-bool btrfs_use_zone_append(struct btrfs_inode *inode, struct extent_map *em)
+bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_block_group *cache;
@@ -1250,7 +1293,7 @@ bool btrfs_use_zone_append(struct btrfs_inode *inode, struct extent_map *em)
if (!is_data_inode(&inode->vfs_inode))
return false;
- cache = btrfs_lookup_block_group(fs_info, em->block_start);
+ cache = btrfs_lookup_block_group(fs_info, start);
ASSERT(cache);
if (!cache)
return false;
diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h
index 61e969652fe1..e55d32595c2c 100644
--- a/fs/btrfs/zoned.h
+++ b/fs/btrfs/zoned.h
@@ -9,6 +9,12 @@
#include "disk-io.h"
#include "block-group.h"
+/*
+ * Block groups with more than this value (percents) of unusable space will be
+ * scheduled for background reclaim.
+ */
+#define BTRFS_DEFAULT_RECLAIM_THRESH 75
+
struct btrfs_zoned_device_info {
/*
* Number of zones, zone size and types of zones if bdev is a
@@ -47,7 +53,7 @@ void btrfs_calc_zone_unusable(struct btrfs_block_group *cache);
void btrfs_redirty_list_add(struct btrfs_transaction *trans,
struct extent_buffer *eb);
void btrfs_free_redirty_list(struct btrfs_transaction *trans);
-bool btrfs_use_zone_append(struct btrfs_inode *inode, struct extent_map *em);
+bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start);
void btrfs_record_physical_zoned(struct inode *inode, u64 file_offset,
struct bio *bio);
void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered);
@@ -146,8 +152,7 @@ static inline void btrfs_redirty_list_add(struct btrfs_transaction *trans,
struct extent_buffer *eb) { }
static inline void btrfs_free_redirty_list(struct btrfs_transaction *trans) { }
-static inline bool btrfs_use_zone_append(struct btrfs_inode *inode,
- struct extent_map *em)
+static inline bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start)
{
return false;
}
diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c
index 8e9626d63976..3e26b466476a 100644
--- a/fs/btrfs/zstd.c
+++ b/fs/btrfs/zstd.c
@@ -631,7 +631,6 @@ int zstd_decompress(struct list_head *ws, unsigned char *data_in,
size_t ret2;
unsigned long total_out = 0;
unsigned long pg_offset = 0;
- char *kaddr;
stream = ZSTD_initDStream(
ZSTD_BTRFS_MAX_INPUT, workspace->mem, workspace->size);
@@ -696,9 +695,7 @@ int zstd_decompress(struct list_head *ws, unsigned char *data_in,
ret = 0;
finish:
if (pg_offset < destlen) {
- kaddr = kmap_atomic(dest_page);
- memset(kaddr + pg_offset, 0, destlen - pg_offset);
- kunmap_atomic(kaddr);
+ memzero_page(dest_page, pg_offset, destlen - pg_offset);
}
return ret;
}
diff --git a/fs/buffer.c b/fs/buffer.c
index 0cb7ffd4977c..ea48c01fb76b 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -1020,11 +1020,7 @@ grow_buffers(struct block_device *bdev, sector_t block, int size, gfp_t gfp)
pgoff_t index;
int sizebits;
- sizebits = -1;
- do {
- sizebits++;
- } while ((size << sizebits) < PAGE_SIZE);
-
+ sizebits = PAGE_SHIFT - __ffs(size);
index = block >> sizebits;
/*
@@ -1264,6 +1260,15 @@ static void bh_lru_install(struct buffer_head *bh)
int i;
check_irqs_on();
+ /*
+ * the refcount of buffer_head in bh_lru prevents dropping the
+ * attached page(i.e., try_to_free_buffers) so it could cause
+ * failing page migration.
+ * Skip putting upcoming bh into bh_lru until migration is done.
+ */
+ if (lru_cache_disabled())
+ return;
+
bh_lru_lock();
b = this_cpu_ptr(&bh_lrus);
@@ -1404,6 +1409,15 @@ __bread_gfp(struct block_device *bdev, sector_t block,
}
EXPORT_SYMBOL(__bread_gfp);
+static void __invalidate_bh_lrus(struct bh_lru *b)
+{
+ int i;
+
+ for (i = 0; i < BH_LRU_SIZE; i++) {
+ brelse(b->bhs[i]);
+ b->bhs[i] = NULL;
+ }
+}
/*
* invalidate_bh_lrus() is called rarely - but not only at unmount.
* This doesn't race because it runs in each cpu either in irq
@@ -1412,16 +1426,12 @@ EXPORT_SYMBOL(__bread_gfp);
static void invalidate_bh_lru(void *arg)
{
struct bh_lru *b = &get_cpu_var(bh_lrus);
- int i;
- for (i = 0; i < BH_LRU_SIZE; i++) {
- brelse(b->bhs[i]);
- b->bhs[i] = NULL;
- }
+ __invalidate_bh_lrus(b);
put_cpu_var(bh_lrus);
}
-static bool has_bh_in_lru(int cpu, void *dummy)
+bool has_bh_in_lru(int cpu, void *dummy)
{
struct bh_lru *b = per_cpu_ptr(&bh_lrus, cpu);
int i;
@@ -1440,6 +1450,16 @@ void invalidate_bh_lrus(void)
}
EXPORT_SYMBOL_GPL(invalidate_bh_lrus);
+void invalidate_bh_lrus_cpu(int cpu)
+{
+ struct bh_lru *b;
+
+ bh_lru_lock();
+ b = per_cpu_ptr(&bh_lrus, cpu);
+ __invalidate_bh_lrus(b);
+ bh_lru_unlock();
+}
+
void set_bh_page(struct buffer_head *bh,
struct page *page, unsigned long offset)
{
diff --git a/fs/cachefiles/Makefile b/fs/cachefiles/Makefile
index 891dedda5905..2227dc2d5498 100644
--- a/fs/cachefiles/Makefile
+++ b/fs/cachefiles/Makefile
@@ -7,6 +7,7 @@ cachefiles-y := \
bind.o \
daemon.o \
interface.o \
+ io.o \
key.o \
main.o \
namei.o \
diff --git a/fs/cachefiles/bind.c b/fs/cachefiles/bind.c
index dfb14dbddf51..38bb7764b454 100644
--- a/fs/cachefiles/bind.c
+++ b/fs/cachefiles/bind.c
@@ -118,6 +118,12 @@ static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache)
cache->mnt = path.mnt;
root = path.dentry;
+ ret = -EINVAL;
+ if (mnt_user_ns(path.mnt) != &init_user_ns) {
+ pr_warn("File cache on idmapped mounts not supported");
+ goto error_unsupported;
+ }
+
/* check parameters */
ret = -EOPNOTSUPP;
if (d_is_negative(root) ||
diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c
index 5efa6a3702c0..da3948fdb615 100644
--- a/fs/cachefiles/interface.c
+++ b/fs/cachefiles/interface.c
@@ -319,8 +319,8 @@ static void cachefiles_drop_object(struct fscache_object *_object)
/*
* dispose of a reference to an object
*/
-static void cachefiles_put_object(struct fscache_object *_object,
- enum fscache_obj_ref_trace why)
+void cachefiles_put_object(struct fscache_object *_object,
+ enum fscache_obj_ref_trace why)
{
struct cachefiles_object *object;
struct fscache_cache *cache;
@@ -568,4 +568,5 @@ const struct fscache_cache_ops cachefiles_cache_ops = {
.uncache_page = cachefiles_uncache_page,
.dissociate_pages = cachefiles_dissociate_pages,
.check_consistency = cachefiles_check_consistency,
+ .begin_read_operation = cachefiles_begin_read_operation,
};
diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h
index cf9bd6401c2d..4ed83aa5253b 100644
--- a/fs/cachefiles/internal.h
+++ b/fs/cachefiles/internal.h
@@ -150,6 +150,9 @@ extern int cachefiles_has_space(struct cachefiles_cache *cache,
*/
extern const struct fscache_cache_ops cachefiles_cache_ops;
+void cachefiles_put_object(struct fscache_object *_object,
+ enum fscache_obj_ref_trace why);
+
/*
* key.c
*/
@@ -218,6 +221,12 @@ extern int cachefiles_write_page(struct fscache_storage *, struct page *);
extern void cachefiles_uncache_page(struct fscache_object *, struct page *);
/*
+ * rdwr2.c
+ */
+extern int cachefiles_begin_read_operation(struct netfs_read_request *,
+ struct fscache_retrieval *);
+
+/*
* security.c
*/
extern int cachefiles_get_security_ID(struct cachefiles_cache *cache);
diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c
new file mode 100644
index 000000000000..b13fb45fc3f3
--- /dev/null
+++ b/fs/cachefiles/io.c
@@ -0,0 +1,420 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* kiocb-using read/write
+ *
+ * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/mount.h>
+#include <linux/slab.h>
+#include <linux/file.h>
+#include <linux/uio.h>
+#include <linux/sched/mm.h>
+#include <linux/netfs.h>
+#include "internal.h"
+
+struct cachefiles_kiocb {
+ struct kiocb iocb;
+ refcount_t ki_refcnt;
+ loff_t start;
+ union {
+ size_t skipped;
+ size_t len;
+ };
+ netfs_io_terminated_t term_func;
+ void *term_func_priv;
+ bool was_async;
+};
+
+static inline void cachefiles_put_kiocb(struct cachefiles_kiocb *ki)
+{
+ if (refcount_dec_and_test(&ki->ki_refcnt)) {
+ fput(ki->iocb.ki_filp);
+ kfree(ki);
+ }
+}
+
+/*
+ * Handle completion of a read from the cache.
+ */
+static void cachefiles_read_complete(struct kiocb *iocb, long ret, long ret2)
+{
+ struct cachefiles_kiocb *ki = container_of(iocb, struct cachefiles_kiocb, iocb);
+
+ _enter("%ld,%ld", ret, ret2);
+
+ if (ki->term_func) {
+ if (ret >= 0)
+ ret += ki->skipped;
+ ki->term_func(ki->term_func_priv, ret, ki->was_async);
+ }
+
+ cachefiles_put_kiocb(ki);
+}
+
+/*
+ * Initiate a read from the cache.
+ */
+static int cachefiles_read(struct netfs_cache_resources *cres,
+ loff_t start_pos,
+ struct iov_iter *iter,
+ bool seek_data,
+ netfs_io_terminated_t term_func,
+ void *term_func_priv)
+{
+ struct cachefiles_kiocb *ki;
+ struct file *file = cres->cache_priv2;
+ unsigned int old_nofs;
+ ssize_t ret = -ENOBUFS;
+ size_t len = iov_iter_count(iter), skipped = 0;
+
+ _enter("%pD,%li,%llx,%zx/%llx",
+ file, file_inode(file)->i_ino, start_pos, len,
+ i_size_read(file->f_inode));
+
+ /* If the caller asked us to seek for data before doing the read, then
+ * we should do that now. If we find a gap, we fill it with zeros.
+ */
+ if (seek_data) {
+ loff_t off = start_pos, off2;
+
+ off2 = vfs_llseek(file, off, SEEK_DATA);
+ if (off2 < 0 && off2 >= (loff_t)-MAX_ERRNO && off2 != -ENXIO) {
+ skipped = 0;
+ ret = off2;
+ goto presubmission_error;
+ }
+
+ if (off2 == -ENXIO || off2 >= start_pos + len) {
+ /* The region is beyond the EOF or there's no more data
+ * in the region, so clear the rest of the buffer and
+ * return success.
+ */
+ iov_iter_zero(len, iter);
+ skipped = len;
+ ret = 0;
+ goto presubmission_error;
+ }
+
+ skipped = off2 - off;
+ iov_iter_zero(skipped, iter);
+ }
+
+ ret = -ENOBUFS;
+ ki = kzalloc(sizeof(struct cachefiles_kiocb), GFP_KERNEL);
+ if (!ki)
+ goto presubmission_error;
+
+ refcount_set(&ki->ki_refcnt, 2);
+ ki->iocb.ki_filp = file;
+ ki->iocb.ki_pos = start_pos + skipped;
+ ki->iocb.ki_flags = IOCB_DIRECT;
+ ki->iocb.ki_hint = ki_hint_validate(file_write_hint(file));
+ ki->iocb.ki_ioprio = get_current_ioprio();
+ ki->skipped = skipped;
+ ki->term_func = term_func;
+ ki->term_func_priv = term_func_priv;
+ ki->was_async = true;
+
+ if (ki->term_func)
+ ki->iocb.ki_complete = cachefiles_read_complete;
+
+ get_file(ki->iocb.ki_filp);
+
+ old_nofs = memalloc_nofs_save();
+ ret = vfs_iocb_iter_read(file, &ki->iocb, iter);
+ memalloc_nofs_restore(old_nofs);
+ switch (ret) {
+ case -EIOCBQUEUED:
+ goto in_progress;
+
+ case -ERESTARTSYS:
+ case -ERESTARTNOINTR:
+ case -ERESTARTNOHAND:
+ case -ERESTART_RESTARTBLOCK:
+ /* There's no easy way to restart the syscall since other AIO's
+ * may be already running. Just fail this IO with EINTR.
+ */
+ ret = -EINTR;
+ fallthrough;
+ default:
+ ki->was_async = false;
+ cachefiles_read_complete(&ki->iocb, ret, 0);
+ if (ret > 0)
+ ret = 0;
+ break;
+ }
+
+in_progress:
+ cachefiles_put_kiocb(ki);
+ _leave(" = %zd", ret);
+ return ret;
+
+presubmission_error:
+ if (term_func)
+ term_func(term_func_priv, ret < 0 ? ret : skipped, false);
+ return ret;
+}
+
+/*
+ * Handle completion of a write to the cache.
+ */
+static void cachefiles_write_complete(struct kiocb *iocb, long ret, long ret2)
+{
+ struct cachefiles_kiocb *ki = container_of(iocb, struct cachefiles_kiocb, iocb);
+ struct inode *inode = file_inode(ki->iocb.ki_filp);
+
+ _enter("%ld,%ld", ret, ret2);
+
+ /* Tell lockdep we inherited freeze protection from submission thread */
+ __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE);
+ __sb_end_write(inode->i_sb, SB_FREEZE_WRITE);
+
+ if (ki->term_func)
+ ki->term_func(ki->term_func_priv, ret, ki->was_async);
+
+ cachefiles_put_kiocb(ki);
+}
+
+/*
+ * Initiate a write to the cache.
+ */
+static int cachefiles_write(struct netfs_cache_resources *cres,
+ loff_t start_pos,
+ struct iov_iter *iter,
+ netfs_io_terminated_t term_func,
+ void *term_func_priv)
+{
+ struct cachefiles_kiocb *ki;
+ struct inode *inode;
+ struct file *file = cres->cache_priv2;
+ unsigned int old_nofs;
+ ssize_t ret = -ENOBUFS;
+ size_t len = iov_iter_count(iter);
+
+ _enter("%pD,%li,%llx,%zx/%llx",
+ file, file_inode(file)->i_ino, start_pos, len,
+ i_size_read(file->f_inode));
+
+ ki = kzalloc(sizeof(struct cachefiles_kiocb), GFP_KERNEL);
+ if (!ki)
+ goto presubmission_error;
+
+ refcount_set(&ki->ki_refcnt, 2);
+ ki->iocb.ki_filp = file;
+ ki->iocb.ki_pos = start_pos;
+ ki->iocb.ki_flags = IOCB_DIRECT | IOCB_WRITE;
+ ki->iocb.ki_hint = ki_hint_validate(file_write_hint(file));
+ ki->iocb.ki_ioprio = get_current_ioprio();
+ ki->start = start_pos;
+ ki->len = len;
+ ki->term_func = term_func;
+ ki->term_func_priv = term_func_priv;
+ ki->was_async = true;
+
+ if (ki->term_func)
+ ki->iocb.ki_complete = cachefiles_write_complete;
+
+ /* Open-code file_start_write here to grab freeze protection, which
+ * will be released by another thread in aio_complete_rw(). Fool
+ * lockdep by telling it the lock got released so that it doesn't
+ * complain about the held lock when we return to userspace.
+ */
+ inode = file_inode(file);
+ __sb_start_write(inode->i_sb, SB_FREEZE_WRITE);
+ __sb_writers_release(inode->i_sb, SB_FREEZE_WRITE);
+
+ get_file(ki->iocb.ki_filp);
+
+ old_nofs = memalloc_nofs_save();
+ ret = vfs_iocb_iter_write(file, &ki->iocb, iter);
+ memalloc_nofs_restore(old_nofs);
+ switch (ret) {
+ case -EIOCBQUEUED:
+ goto in_progress;
+
+ case -ERESTARTSYS:
+ case -ERESTARTNOINTR:
+ case -ERESTARTNOHAND:
+ case -ERESTART_RESTARTBLOCK:
+ /* There's no easy way to restart the syscall since other AIO's
+ * may be already running. Just fail this IO with EINTR.
+ */
+